Add robots.txt parsing and matching functionality
- Create separate robotsMatch package for robots.txt handling
- Implement robots.txt parsing with support for different directives (a sample input/output sketch follows below)
- Add support for both Allow and Disallow patterns
- Include robots.txt matching with efficient pattern matching
- Add test cases for robots matching
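To illustrate the parsing behaviour described above, here is a minimal sketch of calling the new parser on a hypothetical robots.txt. The import path gemini-grc/robotsMatch is an assumption inferred from the module name used in the diff, and the sample content is illustrative, not taken from the repository; the expected output follows from the Disallow handling in robots.go below.

    // Illustrative only: module path and robots.txt content are assumed.
    package main

    import (
        "fmt"

        "gemini-grc/robotsMatch" // assumed import path, based on the module name in the diff
    )

    func main() {
        // Hypothetical robots.txt content as a Gemini capsule might serve it.
        robotsTxt := "User-agent: *\nDisallow: /private/\nDisallow: /tmp"

        // The parser returns full gemini:// URLs that shouldn't be visited.
        for _, u := range robotsMatch.ParseRobotsTxt(robotsTxt, "example.org") {
            fmt.Println(u)
        }
        // Expected output, per the Disallow handling in robots.go:
        //   gemini://example.org/private/
        //   gemini://example.org/tmp
    }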
robotsMatch/robots.go (new normal file, 73 lines)
@@ -0,0 +1,73 @@
package robotsMatch

import (
    "context"
    "fmt"
    "strings"

    "gemini-grc/common/contextlog"
    "gemini-grc/contextutil"
    "gemini-grc/logging"
)

// ParseRobotsTxt takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This is the legacy version without context support.
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
    // Call the context-aware version with a background context
    return ParseRobotsTxtWithContext(context.Background(), content, host)
}

// ParseRobotsTxtWithContext takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This version supports context for logging.
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxtWithContext(ctx context.Context, content string, host string) []string {
    // Create a context for robots.txt parsing
    parseCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.parser")

    var disallowedPaths []string
    for _, line := range strings.Split(content, "\n") {
        line = strings.TrimSpace(line)
        line = strings.ToLower(line)
        if strings.HasPrefix(line, "disallow:") {
            parts := strings.SplitN(line, ":", 2)
            if len(parts) == 2 {
                path := strings.TrimSpace(parts[1])
                if path != "" {
                    // Construct full Gemini URL
                    var fullURL string

                    // Handle if the path is already a full URL
                    if strings.HasPrefix(path, "gemini://") {
                        // Extract just the path from the full URL
                        urlParts := strings.SplitN(path, "/", 4)
                        if len(urlParts) >= 4 {
                            // Get the path part (everything after the domain)
                            pathPart := "/" + urlParts[3]
                            fullURL = fmt.Sprintf("gemini://%s%s", host, pathPart)
                        } else {
                            // If it's just a domain without a path, skip it or use root path
                            fullURL = fmt.Sprintf("gemini://%s/", host)
                        }
                    } else {
                        // It's a relative path, just add it to the host
                        if !strings.HasPrefix(path, "/") {
                            path = "/" + path
                        }
                        fullURL = fmt.Sprintf("gemini://%s%s", host, path)
                    }

                    disallowedPaths = append(disallowedPaths, fullURL)

                    // Add additional logging to debug robots.txt parsing
                    contextlog.LogDebugWithContext(parseCtx, logging.GetSlogger(), "Added robots.txt disallow rule: %s from original: %s", fullURL, path)
                }
            }
        }
    }
    return disallowedPaths
}
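The commit message also mentions robots.txt matching and its tests, which are not part of this file. Below is a minimal sketch of how a crawler could match a candidate URL against the parser's output, assuming a simple prefix comparison over the full-URL entries returned above; URLIsDisallowed is a hypothetical helper for illustration, not the repository's actual matching code or API. The URL is lowercased first because the parser lowercases each robots.txt line before building the prefixes.

    // Hypothetical sketch, not the repository's matching implementation:
    // assumes disallowed entries are full gemini:// URL prefixes as returned
    // by ParseRobotsTxt / ParseRobotsTxtWithContext.
    package robotsMatch

    import "strings"

    // URLIsDisallowed reports whether url starts with any disallowed prefix.
    func URLIsDisallowed(url string, disallowed []string) bool {
        lowered := strings.ToLower(url)
        for _, prefix := range disallowed {
            if strings.HasPrefix(lowered, prefix) {
                return true
            }
        }
        return false
    }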