feat: Implement robots.txt parser
gemini/robots.go (new file, 46 lines added)
@@ -0,0 +1,46 @@
package gemini

import (
	"bufio"
	"fmt"
	"strings"
)

// ParseRobotsTxt parses robots.txt content for the given host and returns
// the full Gemini URLs that should not be visited.
func ParseRobotsTxt(content string, host string) []string {
	scanner := bufio.NewScanner(strings.NewReader(content))
	var disallowedPaths []string

	// Skip everything until we find the "User-agent: *" line
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if strings.ToLower(line) == "user-agent: *" {
			break
		}
	}

	// Now collect all Disallow paths
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// Stop if we hit another User-agent section
		if strings.HasPrefix(strings.ToLower(line), "user-agent:") {
			break
		}

		// Parse Disallow lines
		if strings.HasPrefix(strings.ToLower(line), "disallow:") {
			parts := strings.SplitN(line, ":", 2)
			if len(parts) == 2 {
				path := strings.TrimSpace(parts[1])
				if path != "" {
					// Construct the full Gemini URL
					disallowedPaths = append(disallowedPaths,
						fmt.Sprintf("gemini://%s%s", host, path))
				}
			}
		}
	}

	return disallowedPaths
}
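
A minimal usage sketch, not part of this commit: since robots.txt Disallow rules are prefix rules, a caller would typically test candidate URLs against the returned list with a prefix match. The helper name isDisallowed is hypothetical.

// Hypothetical helper, for illustration only: report whether a URL is
// covered by the disallow list returned by ParseRobotsTxt.
func isDisallowed(url string, disallowed []string) bool {
	for _, prefix := range disallowed {
		// Disallow entries act as URL prefixes, not exact matches
		if strings.HasPrefix(url, prefix) {
			return true
		}
	}
	return false
}

With the test fixture below, isDisallowed("gemini://example.com/cgi-bin/wp.cgi/view", paths) would return true while "gemini://example.com/about" would not.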
gemini/robots_test.go (new file, 25 lines added)
@@ -0,0 +1,25 @@
package gemini

import (
	"reflect"
	"testing"
)

func TestParseRobotsTxt(t *testing.T) {
	input := `User-agent: *
Disallow: /cgi-bin/wp.cgi/view
Disallow: /cgi-bin/wp.cgi/media
User-agent: googlebot
Disallow: /admin/`

	expected := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
	}

	result := ParseRobotsTxt(input, "example.com")

	if !reflect.DeepEqual(result, expected) {
		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
	}
}
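
A sketch of one more case worth pinning down, not included in this commit: when the input has no "User-agent: *" block, the first scanner loop consumes the entire input, the collection loop never runs, and the parser returns nothing.

func TestParseRobotsTxtNoWildcard(t *testing.T) {
	input := "User-agent: googlebot\nDisallow: /admin/"
	// With no "User-agent: *" block, nothing should be collected.
	if got := ParseRobotsTxt(input, "example.com"); len(got) != 0 {
		t.Errorf("expected no disallowed URLs, got %v", got)
	}
}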