refactor: Simplify robots.txt parsing logic

This commit is contained in:
2024-10-23 14:06:55 +03:00
committed by antanst (aider)
parent 8d9ea6cdec
commit 3e01cb1819

View File

@@ -1,46 +1,31 @@
package gemini package gemini
import ( import (
"bufio"
"fmt" "fmt"
"strings" "strings"
) )
// ParseRobotsTxt takes robots.txt content and a host, and returns a list
// of full gemini:// URLs that shouldn't be visited.
//
// Every line that starts with "Disallow:" contributes one URL, regardless
// of which User-agent section it appears in. The directive name is matched
// case-insensitively, but the path is kept exactly as written, because URL
// paths are case-sensitive. Lines with an empty Disallow value are skipped.
// The result is nil when no path is disallowed.
//
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
	const directive = "disallow:"

	var disallowedPaths []string
	for _, line := range strings.Split(content, "\n") {
		// TrimSpace also drops the trailing "\r" of CRLF line endings.
		line = strings.TrimSpace(line)

		// Compare only the directive prefix case-insensitively; lowercasing
		// the whole line would also lowercase the path.
		if len(line) < len(directive) ||
			!strings.EqualFold(line[:len(directive)], directive) {
			continue
		}

		path := strings.TrimSpace(line[len(directive):])
		if path == "" {
			continue
		}

		// Construct full Gemini URL
		disallowedPaths = append(disallowedPaths,
			fmt.Sprintf("gemini://%s%s", host, path))
	}
	return disallowedPaths
}