refactor: Improve robots.txt parsing and caching

commit 6a96fb26cc
parent 3e01cb1819
date   2024-10-23 14:06:56 +03:00


@@ -5,27 +5,49 @@ import (
"strings" "strings"
) )
// Takes robots.txt content and a host, and func ParseRobotsTxt(content string) *RobotsData {
// returns a list of full URLs that shouldn't data := &RobotsData{}
// be visited. var currentUserAgent string
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
var disallowedPaths []string
for _, line := range strings.Split(content, "\n") { for _, line := range strings.Split(content, "\n") {
line = strings.TrimSpace(line) line = strings.TrimSpace(line)
line = strings.ToLower(line) if line == "" || strings.HasPrefix(line, "#") {
if strings.HasPrefix(line, "disallow:") { continue
}
parts := strings.SplitN(line, ":", 2) parts := strings.SplitN(line, ":", 2)
if len(parts) == 2 { if len(parts) != 2 {
path := strings.TrimSpace(parts[1]) continue
if path != "" { }
// Construct full Gemini URL
disallowedPaths = append(disallowedPaths, directive := strings.TrimSpace(strings.ToLower(parts[0]))
fmt.Sprintf("gemini://%s%s", host, path)) value := strings.TrimSpace(parts[1])
switch directive {
case "user-agent":
currentUserAgent = value
case "allow":
if value != "" {
data.Rules = append(data.Rules, RobotRule{
UserAgent: currentUserAgent,
Allow: true,
Path: value,
})
}
case "disallow":
if value != "" {
data.Rules = append(data.Rules, RobotRule{
UserAgent: currentUserAgent,
Allow: false,
Path: value,
})
}
case "crawl-delay":
if delay, err := strconv.Atoi(value); err == nil {
data.CrawlDelay = delay
} }
} }
} }
}
return disallowedPaths return data
} }
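The hunk relies on the RobotsData and RobotRule types, which are defined elsewhere in the repository and not shown here, and the call to strconv.Atoi implies the "strconv" import is added outside the lines above. For context, here is a minimal sketch of what those types and a rule-lookup helper might look like. The field layout is inferred from the diff; the IsAllowed method, its longest-prefix-wins behaviour, and the package name are illustrative assumptions, not the project's actual API.

// Hypothetical sketch, not part of the commit: RobotsData and RobotRule
// are reconstructed from the fields the diff uses, and IsAllowed is an
// illustrative helper showing how the parsed rules might be consulted.
package robots

import "strings"

type RobotRule struct {
    UserAgent string // agent the rule was declared under; "" or "*" means any
    Allow     bool   // true for "allow:" lines, false for "disallow:" lines
    Path      string // path prefix the rule applies to
}

type RobotsData struct {
    Rules      []RobotRule
    CrawlDelay int // seconds to wait between requests, from "crawl-delay:"
}

// IsAllowed reports whether a path may be fetched for the given user agent.
// Assumption: the most specific (longest) matching prefix wins, and a path
// with no matching rule is allowed by default.
func (d *RobotsData) IsAllowed(userAgent, path string) bool {
    allowed := true
    bestLen := -1
    for _, r := range d.Rules {
        if r.UserAgent != "" && r.UserAgent != "*" &&
            !strings.EqualFold(r.UserAgent, userAgent) {
            continue // rule targets a different agent
        }
        if strings.HasPrefix(path, r.Path) && len(r.Path) > bestLen {
            bestLen = len(r.Path)
            allowed = r.Allow
        }
    }
    return allowed
}

With something along these lines, the crawler could call data.IsAllowed(agent, u.Path) before fetching a URL and wait data.CrawlDelay seconds between requests to the same host; the caching mentioned in the commit message would presumably keep one parsed *RobotsData per host so robots.txt is not re-fetched for every request.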