diff --git a/gemini/robots.go b/gemini/robots.go
new file mode 100644
index 0000000..fb5dceb
--- /dev/null
+++ b/gemini/robots.go
@@ -0,0 +1,46 @@
+package gemini
+
+import (
+	"bufio"
+	"fmt"
+	"strings"
+)
+
+// ParseRobotsTxt parses robots.txt content for the given host and returns the full Gemini URLs that should not be visited.
+func ParseRobotsTxt(content string, host string) []string {
+	scanner := bufio.NewScanner(strings.NewReader(content))
+	var disallowedPaths []string
+
+	// Skip everything until we find the "User-agent: *" line
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if strings.ToLower(line) == "user-agent: *" {
+			break
+		}
+	}
+
+	// Collect all Disallow paths for that section
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+
+		// Stop if we hit another User-agent section
+		if strings.HasPrefix(strings.ToLower(line), "user-agent:") {
+			break
+		}
+
+		// Parse Disallow lines
+		if strings.HasPrefix(strings.ToLower(line), "disallow:") {
+			parts := strings.SplitN(line, ":", 2)
+			if len(parts) == 2 {
+				path := strings.TrimSpace(parts[1])
+				if path != "" {
+					// Construct the full Gemini URL
+					disallowedPaths = append(disallowedPaths,
+						fmt.Sprintf("gemini://%s%s", host, path))
+				}
+			}
+		}
+	}
+
+	return disallowedPaths
+}
diff --git a/gemini/robots_test.go b/gemini/robots_test.go
new file mode 100644
index 0000000..2572f67
--- /dev/null
+++ b/gemini/robots_test.go
@@ -0,0 +1,25 @@
+package gemini
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestParseRobotsTxt(t *testing.T) {
+	input := `User-agent: *
+Disallow: /cgi-bin/wp.cgi/view
+Disallow: /cgi-bin/wp.cgi/media
+User-agent: googlebot
+Disallow: /admin/`
+
+	expected := []string{
+		"gemini://example.com/cgi-bin/wp.cgi/view",
+		"gemini://example.com/cgi-bin/wp.cgi/media",
+	}
+
+	result := ParseRobotsTxt(input, "example.com")
+
+	if !reflect.DeepEqual(result, expected) {
+		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
+	}
+}
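Not part of the diff above, but for context, here is a minimal Example-style sketch of how a crawler might filter its queue against the URLs returned by ParseRobotsTxt. The ExampleParseRobotsTxt function, the sample queue, and the prefix-matching policy are illustrative assumptions; this change only parses robots.txt and does not implement the matching itself.

```go
package gemini

import (
	"fmt"
	"strings"
)

// ExampleParseRobotsTxt is a hypothetical usage sketch: parse a small
// robots.txt body and check a crawl queue against the returned URLs.
func ExampleParseRobotsTxt() {
	// Assume this content was fetched from gemini://example.com/robots.txt
	// by transport code elsewhere in the crawler.
	robotsTxt := "User-agent: *\nDisallow: /cgi-bin/\n"

	disallowed := ParseRobotsTxt(robotsTxt, "example.com")

	queue := []string{
		"gemini://example.com/index.gmi",
		"gemini://example.com/cgi-bin/",
	}
	for _, u := range queue {
		blocked := false
		for _, d := range disallowed {
			// Assumed policy: treat each disallowed URL as a prefix.
			if strings.HasPrefix(u, d) {
				blocked = true
				break
			}
		}
		fmt.Println(u, "blocked:", blocked)
	}
	// Output:
	// gemini://example.com/index.gmi blocked: false
	// gemini://example.com/cgi-bin/ blocked: true
}
```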