package robotsMatch

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"sync"

	"gemini-grc/common/contextlog"
	"gemini-grc/common/snapshot"
	geminiUrl "gemini-grc/common/url"
	"gemini-grc/config"
	"gemini-grc/contextutil"
	"gemini-grc/gemini"
	"git.antanst.com/antanst/logging"
)

// RobotsCache caches robots.txt results per host.
// key: lowercased "hostname:port" string
// value: []string of disallowed URL prefixes
// If a host has no blocked URLs (or robots.txt could not
// be fetched), an empty list is stored so we don't re-fetch.
var RobotsCache sync.Map //nolint:gochecknoglobals

func populateRobotsCache(ctx context.Context, key string) (entries []string, _err error) {
	// Create a context for robots cache population
	cacheCtx := contextutil.ContextWithComponent(ctx, "robotsCache")

	// We store either an empty list when there are no rules,
	// or the list of disallowed URLs. This applies even when
	// we fail to find or download robots.txt, so that we don't
	// keep hitting the host. Timeouts and cancellations are the
	// exception: they are not cached, so the fetch can be retried later.
	cacheResult := true
	defer func() {
		if cacheResult {
			RobotsCache.Store(key, entries)
		}
	}()

	url := fmt.Sprintf("gemini://%s/robots.txt", key)

	contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Fetching robots.txt from %s", url)

	// Use the context-aware version to honor timeout and cancellation
	robotsContent, err := gemini.ConnectAndGetData(cacheCtx, url)
	if err != nil {
		// Check for context timeout or cancellation specifically
		if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
			contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Timeout or cancellation while fetching robots.txt: %v", err)
			// Don't cache the result on timeout, to allow retrying later
			cacheResult = false
			return []string{}, err
		}
		// For other errors, the deferred Store caches an empty
		// list for this host to avoid continually hitting it.
		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Failed to get robots.txt: %v", err)
		return []string{}, err
	}

	s, err := snapshot.SnapshotFromURL(url, true)
	if err != nil {
		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Failed to create snapshot from URL: %v", err)
		return []string{}, nil
	}

	s = gemini.UpdateSnapshotWithData(*s, robotsContent)

	if s.ResponseCode.ValueOrZero() != 20 {
		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
		return []string{}, nil
	}

	// Some hosts return text/plain, others text/gemini.
	// According to the spec, the first is correct, but be lenient.
	var data string
	switch mime := s.MimeType.ValueOrZero(); mime {
	case "text/plain":
		data = string(s.Data.ValueOrZero())
	case "text/gemini":
		data = s.GemText.ValueOrZero()
	default:
		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Unsupported mime type: %s", mime)
		return []string{}, nil
	}

	entries = ParseRobotsTxtWithContext(ctx, data, key)
	return entries, nil
}
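
// The sketch below is illustrative only and is not called by the package:
// it shows how the host key, populateRobotsCache, and RobotsCache relate.
// "example.org:1965" is a made-up key; real keys are built by RobotMatch
// from the lowercased hostname and port of the URL being checked.
//
//	ctx := context.Background()
//	key := "example.org:1965"
//	disallowed, err := populateRobotsCache(ctx, key) // fetches gemini://example.org:1965/robots.txt
//	if err == nil {
//		// The same slice is now cached; later calls for this key skip the network.
//		cached, _ := RobotsCache.Load(key)
//		_ = cached.([]string) // equal to disallowed
//	}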
// RobotMatch checks whether the given URL matches a
// robots.txt disallow rule for its host. It returns
// true if the URL is blocked from crawling.
func RobotMatch(ctx context.Context, u string) bool {
	// Create a context for robots operations
	robotsCtx := contextutil.ContextWithComponent(ctx, "robotsMatch")

	// TODO Missing Gopher functionality
	if config.CONFIG.GopherEnable {
		return false
	}

	url, err := geminiUrl.ParseURL(u, "", true)
	if err != nil {
		return false
	}

	key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))

	contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Checking robots.txt for URL: %s with host key: %s", u, key)

	var disallowedURLs []string
	cacheEntries, ok := RobotsCache.Load(key)
	if !ok {
		// First-time check, populate the robots cache
		contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "No robots.txt cache for %s, fetching...", key)
		var fetchErr error
		disallowedURLs, fetchErr = populateRobotsCache(ctx, key)
		if fetchErr != nil {
			return false
		}
		if len(disallowedURLs) > 0 {
			contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Added to robots.txt cache: %v => %v", key, disallowedURLs)
		} else {
			contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "No disallowed paths found in robots.txt for %s", key)
		}
	} else {
		var ok bool
		disallowedURLs, ok = cacheEntries.([]string)
		if !ok {
			contextlog.LogErrorWithContext(robotsCtx, logging.GetSlogger(), "Invalid type in robots.txt cache for %s", key)
			disallowedURLs = []string{} // Use empty list as fallback
		}
		contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Found %d disallowed paths in robots.txt cache for %s", len(disallowedURLs), key)
	}

	return isURLblocked(ctx, disallowedURLs, url.Full)
}

// Initialize initializes the robots.txt match package.
func Initialize() error {
	logging.LogDebug("Initializing robotsMatch package")
	return nil
}

// Shutdown cleans up the robots.txt match package.
func Shutdown() error {
	logging.LogDebug("Shutting down robotsMatch package")
	return nil
}

// isURLblocked reports whether input starts with any of the
// disallowed URL prefixes (case-insensitive).
func isURLblocked(ctx context.Context, disallowedURLs []string, input string) bool {
	// Create a context for URL blocking checks
	blockCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.isURLblocked")

	inputLower := strings.ToLower(input)
	for _, url := range disallowedURLs {
		urlLower := strings.ToLower(url)
		if strings.HasPrefix(inputLower, urlLower) {
			contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "MATCH! robots.txt rule: %s blocks URL: %s", url, input)
			return true
		}
	}
	contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "No robots.txt rules matched URL: %s", input)
	return false
}
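
// Usage sketch (illustrative; the URL below is made up): a crawler would
// typically consult RobotMatch before fetching a capsule URL. A return
// value of false also covers the case where robots.txt could not be
// fetched or parsed.
//
//	ctx := context.Background()
//	if robotsMatch.RobotMatch(ctx, "gemini://example.org/private/page.gmi") {
//		// Disallowed by the host's robots.txt; skip this URL.
//	} else {
//		// No matching disallow rule; proceed with the fetch.
//	}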