Add robots.txt parsing and matching functionality

- Create separate robotsMatch package for robots.txt handling
- Implement robots.txt parsing with support for different directives
- Add support for both Allow and Disallow patterns
- Match URLs against parsed robots.txt rules with efficient pattern matching
- Add test cases for robots matching
2025-05-22 12:46:21 +03:00
parent a7aa5cd410
commit fe40874844
4 changed files with 352 additions and 0 deletions

robotsMatch/robots.go (new file, 73 lines)

@@ -0,0 +1,73 @@
package robotsMatch

import (
	"context"
	"fmt"
	"strings"

	"gemini-grc/common/contextlog"
	"gemini-grc/contextutil"
	"gemini-grc/logging"
)

// ParseRobotsTxt takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This is the legacy version without context support.
// TODO: Also take the user agent into account?
// See gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
	// Call the context-aware version with a background context.
	return ParseRobotsTxtWithContext(context.Background(), content, host)
}

// ParseRobotsTxtWithContext takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This version supports context for logging.
// TODO: Also take the user agent into account?
// See gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxtWithContext(ctx context.Context, content string, host string) []string {
	// Create a context for robots.txt parsing.
	parseCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.parser")
	var disallowedPaths []string
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		// Match the directive name case-insensitively, but keep the
		// original line so the path's case is preserved.
		if strings.HasPrefix(strings.ToLower(line), "disallow:") {
			parts := strings.SplitN(line, ":", 2)
			if len(parts) == 2 {
				path := strings.TrimSpace(parts[1])
				if path != "" {
					// Construct the full Gemini URL.
					var fullURL string
					if strings.HasPrefix(path, "gemini://") {
						// The rule is already a full URL: extract just the
						// path part (everything after the domain).
						urlParts := strings.SplitN(path, "/", 4)
						if len(urlParts) >= 4 {
							pathPart := "/" + urlParts[3]
							fullURL = fmt.Sprintf("gemini://%s%s", host, pathPart)
						} else {
							// Just a domain without a path: fall back to the root path.
							fullURL = fmt.Sprintf("gemini://%s/", host)
						}
					} else {
						// It's a relative path: prepend the host, adding a
						// leading slash if missing.
						if !strings.HasPrefix(path, "/") {
							path = "/" + path
						}
						fullURL = fmt.Sprintf("gemini://%s%s", host, path)
					}
					disallowedPaths = append(disallowedPaths, fullURL)
					// Log each parsed rule to help debug robots.txt handling.
					contextlog.LogDebugWithContext(parseCtx, logging.GetSlogger(), "Added robots.txt disallow rule: %s from original: %s", fullURL, path)
				}
			}
		}
	}
	return disallowedPaths
}
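
A minimal usage sketch (not part of the commit), assuming the gemini-grc module path seen in the imports above and the hypothetical host example.org; it exercises only the exported ParseRobotsTxt shown in this file:

package main

import (
	"fmt"

	"gemini-grc/robotsMatch" // assumed import path, matching the module's internal imports
)

func main() {
	// A robots.txt body as it might be served from gemini://example.org/robots.txt.
	robotsTxt := "User-agent: *\n" +
		"Disallow: /private/\n" + // absolute path
		"Disallow: cgi-bin\n" + // relative path, a leading slash is added
		"Disallow: gemini://example.org/secret/\n" // full-URL form

	// Prints each rule expanded into a full Gemini URL on the given host:
	//   gemini://example.org/private/
	//   gemini://example.org/cgi-bin
	//   gemini://example.org/secret/
	for _, url := range robotsMatch.ParseRobotsTxt(robotsTxt, "example.org") {
		fmt.Println(url)
	}
}

Note that rules written as full URLs are re-anchored to the host that served the robots.txt, so a capsule cannot disallow paths on a different host.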