Revert "refactor: Improve robots.txt parsing and caching"

This reverts commit 6a96fb26cc.
commit 1ac250ca6e
parent 6a96fb26cc
2024-10-23 14:07:14 +03:00


@@ -5,49 +5,27 @@ import (
"strings" "strings"
) )
func ParseRobotsTxt(content string) *RobotsData { // Takes robots.txt content and a host, and
data := &RobotsData{} // returns a list of full URLs that shouldn't
var currentUserAgent string // be visited.
// TODO Also take into account the user agent?
for _, line := range strings.Split(content, "\n") { // Check gemini://geminiprotocol.net/docs/companion/robots.gmi
line = strings.TrimSpace(line) func ParseRobotsTxt(content string, host string) []string {
if line == "" || strings.HasPrefix(line, "#") { var disallowedPaths []string
continue for _, line := range strings.Split(content, "\n") {
} line = strings.TrimSpace(line)
line = strings.ToLower(line)
parts := strings.SplitN(line, ":", 2) if strings.HasPrefix(line, "disallow:") {
if len(parts) != 2 { parts := strings.SplitN(line, ":", 2)
continue if len(parts) == 2 {
} path := strings.TrimSpace(parts[1])
if path != "" {
directive := strings.TrimSpace(strings.ToLower(parts[0])) // Construct full Gemini URL
value := strings.TrimSpace(parts[1]) disallowedPaths = append(disallowedPaths,
fmt.Sprintf("gemini://%s%s", host, path))
switch directive { }
case "user-agent": }
currentUserAgent = value }
case "allow": }
if value != "" { return disallowedPaths
data.Rules = append(data.Rules, RobotRule{
UserAgent: currentUserAgent,
Allow: true,
Path: value,
})
}
case "disallow":
if value != "" {
data.Rules = append(data.Rules, RobotRule{
UserAgent: currentUserAgent,
Allow: false,
Path: value,
})
}
case "crawl-delay":
if delay, err := strconv.Atoi(value); err == nil {
data.CrawlDelay = delay
}
}
}
return data
} }
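
For reference, below is a rough test sketch of the behaviour this revert restores. The package name, test name, sample robots.txt content, and host are invented for illustration; the sketch assumes it sits in the same package as ParseRobotsTxt.

package gemini // hypothetical package name; the diff does not show the real one

import (
	"reflect"
	"testing"
)

// Every Disallow path in the input should come back as a full
// gemini:// URL on the given host; other lines are ignored.
func TestParseRobotsTxtDisallow(t *testing.T) {
	content := "User-agent: *\n" +
		"# comment lines and blank lines are skipped\n" +
		"Disallow: /cgi-bin/\n" +
		"Disallow: /search\n"

	got := ParseRobotsTxt(content, "example.org")
	want := []string{
		"gemini://example.org/cgi-bin/",
		"gemini://example.org/search",
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("ParseRobotsTxt() = %v, want %v", got, want)
	}
}

Since the restored version keys only on the "disallow:" prefix, the allow and crawl-delay handling from the reverted parser goes away with this commit.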