Revert "refactor: Improve robots.txt parsing and caching"

This reverts commit 6a96fb26cc.
This commit is contained in:
2024-10-23 14:07:14 +03:00
parent 6a96fb26cc
commit 1ac250ca6e

View File

@@ -5,49 +5,27 @@ import (
"strings"
)
func ParseRobotsTxt(content string) *RobotsData {
data := &RobotsData{}
var currentUserAgent string
for _, line := range strings.Split(content, "\n") {
line = strings.TrimSpace(line)
if line == "" || strings.HasPrefix(line, "#") {
continue
}
parts := strings.SplitN(line, ":", 2)
if len(parts) != 2 {
continue
}
directive := strings.TrimSpace(strings.ToLower(parts[0]))
value := strings.TrimSpace(parts[1])
switch directive {
case "user-agent":
currentUserAgent = value
case "allow":
if value != "" {
data.Rules = append(data.Rules, RobotRule{
UserAgent: currentUserAgent,
Allow: true,
Path: value,
})
}
case "disallow":
if value != "" {
data.Rules = append(data.Rules, RobotRule{
UserAgent: currentUserAgent,
Allow: false,
Path: value,
})
}
case "crawl-delay":
if delay, err := strconv.Atoi(value); err == nil {
data.CrawlDelay = delay
}
}
}
return data
// ParseRobotsTxt takes robots.txt content and a host, and returns the list
// of full Gemini URLs ("gemini://" + host + path) that shouldn't be visited.
//
// Only "Disallow:" lines are considered. The directive name is matched
// case-insensitively, but the path is kept exactly as written, because URL
// paths are case-sensitive and a lowercased path would not match the real
// URL. Disallow lines with an empty path are ignored. The result is nil when
// no rule is found.
//
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
	const directive = "disallow:"

	var disallowedPaths []string
	for _, line := range strings.Split(content, "\n") {
		// TrimSpace also drops the "\r" left behind by CRLF line endings.
		line = strings.TrimSpace(line)

		// Compare only the directive prefix, ignoring case, so the rest of
		// the line (the path) keeps its original casing.
		if len(line) < len(directive) || !strings.EqualFold(line[:len(directive)], directive) {
			continue
		}

		path := strings.TrimSpace(line[len(directive):])
		if path == "" {
			continue
		}

		// Construct full Gemini URL
		disallowedPaths = append(disallowedPaths,
			fmt.Sprintf("gemini://%s%s", host, path))
	}
	return disallowedPaths
}