From 6a96fb26ccc2599703ec5a2705e589a23ed23207 Mon Sep 17 00:00:00 2001
From: "antanst (aider)"
Date: Wed, 23 Oct 2024 14:06:56 +0300
Subject: [PATCH] refactor: Improve robots.txt parsing and caching

---
 gemini/robots.go | 72 +++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 23 deletions(-)

diff --git a/gemini/robots.go b/gemini/robots.go
index be4477a..76021c3 100644
--- a/gemini/robots.go
+++ b/gemini/robots.go
@@ -5,27 +5,53 @@
 import (
+	"strconv"
 	"strings"
 )
-// Takes robots.txt content and a host, and
-// returns a list of full URLs that shouldn't
-// be visited.
-// TODO Also take into account the user agent?
-// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
-func ParseRobotsTxt(content string, host string) []string {
-	var disallowedPaths []string
-	for _, line := range strings.Split(content, "\n") {
-		line = strings.TrimSpace(line)
-		line = strings.ToLower(line)
-		if strings.HasPrefix(line, "disallow:") {
-			parts := strings.SplitN(line, ":", 2)
-			if len(parts) == 2 {
-				path := strings.TrimSpace(parts[1])
-				if path != "" {
-					// Construct full Gemini URL
-					disallowedPaths = append(disallowedPaths,
-						fmt.Sprintf("gemini://%s%s", host, path))
-				}
-			}
-		}
-	}
-	return disallowedPaths
+// ParseRobotsTxt parses robots.txt content into per-user-agent
+// allow/disallow rules and an optional crawl delay.
+// See gemini://geminiprotocol.net/docs/companion/robots.gmi
+func ParseRobotsTxt(content string) *RobotsData {
+	data := &RobotsData{}
+	var currentUserAgent string
+
+	for _, line := range strings.Split(content, "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+
+		parts := strings.SplitN(line, ":", 2)
+		if len(parts) != 2 {
+			continue
+		}
+
+		directive := strings.TrimSpace(strings.ToLower(parts[0]))
+		value := strings.TrimSpace(parts[1])
+
+		switch directive {
+		case "user-agent":
+			currentUserAgent = value
+		case "allow":
+			if value != "" {
+				data.Rules = append(data.Rules, RobotRule{
+					UserAgent: currentUserAgent,
+					Allow:     true,
+					Path:      value,
+				})
+			}
+		case "disallow":
+			if value != "" {
+				data.Rules = append(data.Rules, RobotRule{
+					UserAgent: currentUserAgent,
+					Allow:     false,
+					Path:      value,
+				})
+			}
+		case "crawl-delay":
+			if delay, err := strconv.Atoi(value); err == nil {
+				data.CrawlDelay = delay
+			}
+		}
+	}
+
+	return data
 }
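
Note (not part of the patch): the hunk above builds RobotRule values and a *RobotsData
but neither type is defined in this diff, so they presumably live elsewhere in the
gemini package. Below is a minimal sketch of what those types could look like, derived
only from the fields the patch actually uses (UserAgent, Allow, Path, CrawlDelay). The
IsAllowed helper and its longest-prefix-match behaviour are assumptions for illustration,
not something this patch provides.

package gemini

import "strings"

// RobotRule is a single allow/disallow entry scoped to a user-agent,
// with field names mirroring how the patch constructs RobotRule values.
type RobotRule struct {
	UserAgent string
	Allow     bool
	Path      string
}

// RobotsData holds the parsed rules plus an optional crawl delay in seconds.
type RobotsData struct {
	Rules      []RobotRule
	CrawlDelay int
}

// IsAllowed is a hypothetical helper (an assumption, not part of the patch).
// It considers rules whose user-agent is "*" or matches the given agent and
// lets the longest matching path prefix decide, which is the usual robots.txt
// convention; with no matching rule, fetching is allowed.
func (d *RobotsData) IsAllowed(userAgent, path string) bool {
	allowed := true
	longest := -1
	for _, r := range d.Rules {
		if r.UserAgent != "*" && !strings.EqualFold(r.UserAgent, userAgent) {
			continue
		}
		if strings.HasPrefix(path, r.Path) && len(r.Path) > longest {
			longest = len(r.Path)
			allowed = r.Allow
		}
	}
	return allowed
}

A caller would then combine this with the patched parser roughly as follows, where
"mycrawler" and the path are placeholder values:

	data := ParseRobotsTxt(content)
	if data.IsAllowed("mycrawler", "/search") {
		// safe to fetch gemini://host/search
	}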