From 3e01cb181972ee5ad179a7ba36de46dd4e747bf2 Mon Sep 17 00:00:00 2001
From: antanst
Date: Wed, 23 Oct 2024 14:06:55 +0300
Subject: [PATCH] refactor: Simplify robots.txt parsing logic

---
 gemini/robots.go | 35 ++++++++++-------------------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/gemini/robots.go b/gemini/robots.go
index fb5dceb..be4477a 100644
--- a/gemini/robots.go
+++ b/gemini/robots.go
@@ -1,46 +1,31 @@
 package gemini
 
 import (
-	"bufio"
 	"fmt"
 	"strings"
 )
 
-// ParseRobotsTxt takes robots.txt content and a host, returns list of full URLs that shouldn't be visited
+// Takes robots.txt content and a host, and
+// returns a list of full URLs that shouldn't
+// be visited.
+// TODO Also take into account the user agent?
+// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
 func ParseRobotsTxt(content string, host string) []string {
-	scanner := bufio.NewScanner(strings.NewReader(content))
 	var disallowedPaths []string
-
-	// Skip everything until we find "User-agent: *" line
-	for scanner.Scan() {
-		line := strings.TrimSpace(scanner.Text())
-		if strings.ToLower(line) == "user-agent: *" {
-			break
-		}
-	}
-
-	// Now collect all Disallow paths
-	for scanner.Scan() {
-		line := strings.TrimSpace(scanner.Text())
-
-		// Stop if we hit another User-agent section
-		if strings.HasPrefix(strings.ToLower(line), "user-agent:") {
-			break
-		}
-
-		// Parse Disallow lines
-		if strings.HasPrefix(strings.ToLower(line), "disallow:") {
+	for _, line := range strings.Split(content, "\n") {
+		line = strings.TrimSpace(line)
+		line = strings.ToLower(line)
+		if strings.HasPrefix(line, "disallow:") {
 			parts := strings.SplitN(line, ":", 2)
 			if len(parts) == 2 {
 				path := strings.TrimSpace(parts[1])
 				if path != "" {
 					// Construct full Gemini URL
-					disallowedPaths = append(disallowedPaths, 
+					disallowedPaths = append(disallowedPaths,
 						fmt.Sprintf("gemini://%s%s", host, path))
 				}
 			}
 		}
 	}
-
 	return disallowedPaths
 }
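
Note on the simplified loop: it applies strings.ToLower to the whole line, so the path
portion of each rule is lowercased as well. A rule such as "Disallow: /My/Dir" therefore
produces "gemini://host/my/dir", whereas the pre-patch code lowercased the line only when
matching the field name. If path case should be preserved, a minimal sketch is below;
parseRobotsTxtPreservingCase is a hypothetical name, and the function is otherwise
identical to the patched ParseRobotsTxt.

package gemini

import (
	"fmt"
	"strings"
)

// parseRobotsTxtPreservingCase matches the "disallow:" field name
// case-insensitively but keeps the original casing of the path value.
func parseRobotsTxtPreservingCase(content string, host string) []string {
	var disallowedPaths []string
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		// Lowercase only for the prefix check, not for the stored path.
		if strings.HasPrefix(strings.ToLower(line), "disallow:") {
			parts := strings.SplitN(line, ":", 2)
			if len(parts) == 2 {
				path := strings.TrimSpace(parts[1])
				if path != "" {
					// Construct full Gemini URL, keeping the path's case.
					disallowedPaths = append(disallowedPaths,
						fmt.Sprintf("gemini://%s%s", host, path))
				}
			}
		}
	}
	return disallowedPaths
}

For example, parseRobotsTxtPreservingCase("User-agent: *\nDisallow: /Private/", "example.org")
returns []string{"gemini://example.org/Private/"}.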