refactor: Improve robots.txt parsing and caching

This commit is contained in:
2024-10-23 14:06:56 +03:00
parent 3e01cb1819
commit 6a96fb26cc

View File

@@ -5,27 +5,49 @@ import (
"strings" "strings"
) )
// Takes robots.txt content and a host, and func ParseRobotsTxt(content string) *RobotsData {
// returns a list of full URLs that shouldn't data := &RobotsData{}
// be visited. var currentUserAgent string
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi for _, line := range strings.Split(content, "\n") {
func ParseRobotsTxt(content string, host string) []string { line = strings.TrimSpace(line)
var disallowedPaths []string if line == "" || strings.HasPrefix(line, "#") {
for _, line := range strings.Split(content, "\n") { continue
line = strings.TrimSpace(line) }
line = strings.ToLower(line)
if strings.HasPrefix(line, "disallow:") { parts := strings.SplitN(line, ":", 2)
parts := strings.SplitN(line, ":", 2) if len(parts) != 2 {
if len(parts) == 2 { continue
path := strings.TrimSpace(parts[1]) }
if path != "" {
// Construct full Gemini URL directive := strings.TrimSpace(strings.ToLower(parts[0]))
disallowedPaths = append(disallowedPaths, value := strings.TrimSpace(parts[1])
fmt.Sprintf("gemini://%s%s", host, path))
} switch directive {
} case "user-agent":
} currentUserAgent = value
} case "allow":
return disallowedPaths if value != "" {
data.Rules = append(data.Rules, RobotRule{
UserAgent: currentUserAgent,
Allow: true,
Path: value,
})
}
case "disallow":
if value != "" {
data.Rules = append(data.Rules, RobotRule{
UserAgent: currentUserAgent,
Allow: false,
Path: value,
})
}
case "crawl-delay":
if delay, err := strconv.Atoi(value); err == nil {
data.CrawlDelay = delay
}
}
}
return data
} }