diff --git a/gemini/robots.go b/gemini/robots.go
index 76021c3..be4477a 100644
--- a/gemini/robots.go
+++ b/gemini/robots.go
@@ -4,50 +4,28 @@ import (
-	"strconv"
+	"fmt"
 	"strings"
 )
 
-func ParseRobotsTxt(content string) *RobotsData {
-	data := &RobotsData{}
-	var currentUserAgent string
-
-	for _, line := range strings.Split(content, "\n") {
-		line = strings.TrimSpace(line)
-		if line == "" || strings.HasPrefix(line, "#") {
-			continue
-		}
-
-		parts := strings.SplitN(line, ":", 2)
-		if len(parts) != 2 {
-			continue
-		}
-
-		directive := strings.TrimSpace(strings.ToLower(parts[0]))
-		value := strings.TrimSpace(parts[1])
-
-		switch directive {
-		case "user-agent":
-			currentUserAgent = value
-		case "allow":
-			if value != "" {
-				data.Rules = append(data.Rules, RobotRule{
-					UserAgent: currentUserAgent,
-					Allow:     true,
-					Path:      value,
-				})
-			}
-		case "disallow":
-			if value != "" {
-				data.Rules = append(data.Rules, RobotRule{
-					UserAgent: currentUserAgent,
-					Allow:     false,
-					Path:      value,
-				})
-			}
-		case "crawl-delay":
-			if delay, err := strconv.Atoi(value); err == nil {
-				data.CrawlDelay = delay
-			}
-		}
-	}
-
-	return data
+// ParseRobotsTxt takes robots.txt content and a host, and
+// returns a list of full URLs that shouldn't
+// be visited.
+// TODO Also take into account the user agent?
+// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
+func ParseRobotsTxt(content string, host string) []string {
+	var disallowedPaths []string
+	for _, line := range strings.Split(content, "\n") {
+		line = strings.TrimSpace(line)
+		line = strings.ToLower(line)
+		if strings.HasPrefix(line, "disallow:") {
+			parts := strings.SplitN(line, ":", 2)
+			if len(parts) == 2 {
+				path := strings.TrimSpace(parts[1])
+				if path != "" {
+					// Construct full Gemini URL
+					disallowedPaths = append(disallowedPaths,
+						fmt.Sprintf("gemini://%s%s", host, path))
+				}
+			}
+		}
+	}
+	return disallowedPaths
 }
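
For context, a rough sketch (not part of this change) of how the new return value could be exercised, e.g. as an Example test in the same package. The sample robots.txt content and the example.org host are made up for illustration.

package gemini

import "fmt"

// ExampleParseRobotsTxt feeds a small, made-up robots.txt into the new
// function and prints the resulting disallowed URLs. This would live in a
// _test.go file; illustrative only.
func ExampleParseRobotsTxt() {
	content := "User-agent: *\nDisallow: /cgi-bin/\nDisallow: /private/"
	for _, u := range ParseRobotsTxt(content, "example.org") {
		fmt.Println(u)
	}
	// Output:
	// gemini://example.org/cgi-bin/
	// gemini://example.org/private/
}

A crawler would then treat these entries as URL prefixes and skip any candidate URL that starts with one of them (e.g. via strings.HasPrefix).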