Add robots.txt parsing and matching functionality
- Create separate robotsMatch package for robots.txt handling
- Implement robots.txt parsing with support for different directives (a sample input/output sketch follows below)
- Add support for both Allow and Disallow patterns
- Include robots.txt matching with efficient pattern matching
- Add test cases for robots matching
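To illustrate the parsing behaviour described above, here is a minimal sketch of calling the new parser on a hypothetical robots.txt. The import path gemini-grc/robotsMatch is an assumption inferred from the module name used in the diff, and the sample content is illustrative, not taken from the repository; the expected output follows from the Disallow handling in robots.go below.

    // Illustrative only: module path and robots.txt content are assumed.
    package main

    import (
        "fmt"

        "gemini-grc/robotsMatch" // assumed import path, based on the module name in the diff
    )

    func main() {
        // Hypothetical robots.txt content as a Gemini capsule might serve it.
        robotsTxt := "User-agent: *\nDisallow: /private/\nDisallow: /tmp"

        // The parser returns full gemini:// URLs that shouldn't be visited.
        for _, u := range robotsMatch.ParseRobotsTxt(robotsTxt, "example.org") {
            fmt.Println(u)
        }
        // Expected output, per the Disallow handling in robots.go:
        //   gemini://example.org/private/
        //   gemini://example.org/tmp
    }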
robotsMatch/robots.go (new normal file, 73 lines)
@@ -0,0 +1,73 @@
package robotsMatch

import (
    "context"
    "fmt"
    "strings"

    "gemini-grc/common/contextlog"
    "gemini-grc/contextutil"
    "gemini-grc/logging"
)

// ParseRobotsTxt takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This is the legacy version without context support.
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
    // Call the context-aware version with a background context
    return ParseRobotsTxtWithContext(context.Background(), content, host)
}

// ParseRobotsTxtWithContext takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This version supports context for logging.
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxtWithContext(ctx context.Context, content string, host string) []string {
    // Create a context for robots.txt parsing
    parseCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.parser")

    var disallowedPaths []string
    for _, line := range strings.Split(content, "\n") {
        line = strings.TrimSpace(line)
        line = strings.ToLower(line)
        if strings.HasPrefix(line, "disallow:") {
            parts := strings.SplitN(line, ":", 2)
            if len(parts) == 2 {
                path := strings.TrimSpace(parts[1])
                if path != "" {
                    // Construct full Gemini URL
                    var fullURL string

                    // Handle if the path is already a full URL
                    if strings.HasPrefix(path, "gemini://") {
                        // Extract just the path from the full URL
                        urlParts := strings.SplitN(path, "/", 4)
                        if len(urlParts) >= 4 {
                            // Get the path part (everything after the domain)
                            pathPart := "/" + urlParts[3]
                            fullURL = fmt.Sprintf("gemini://%s%s", host, pathPart)
                        } else {
                            // If it's just a domain without a path, skip it or use root path
                            fullURL = fmt.Sprintf("gemini://%s/", host)
                        }
                    } else {
                        // It's a relative path, just add it to the host
                        if !strings.HasPrefix(path, "/") {
                            path = "/" + path
                        }
                        fullURL = fmt.Sprintf("gemini://%s%s", host, path)
                    }

                    disallowedPaths = append(disallowedPaths, fullURL)

                    // Add additional logging to debug robots.txt parsing
                    contextlog.LogDebugWithContext(parseCtx, logging.GetSlogger(), "Added robots.txt disallow rule: %s from original: %s", fullURL, path)
                }
            }
        }
    }
    return disallowedPaths
}
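The commit message also mentions robots.txt matching and its tests, which are not part of this file. Below is a minimal sketch of how a crawler could match a candidate URL against the parser's output, assuming a simple prefix comparison over the full-URL entries returned above; URLIsDisallowed is a hypothetical helper for illustration, not the repository's actual matching code or API. The URL is lowercased first because the parser lowercases each robots.txt line before building the prefixes.

    // Hypothetical sketch, not the repository's matching implementation:
    // assumes disallowed entries are full gemini:// URL prefixes as returned
    // by ParseRobotsTxt / ParseRobotsTxtWithContext.
    package robotsMatch

    import "strings"

    // URLIsDisallowed reports whether url starts with any disallowed prefix.
    func URLIsDisallowed(url string, disallowed []string) bool {
        lowered := strings.ToLower(url)
        for _, prefix := range disallowed {
            if strings.HasPrefix(lowered, prefix) {
                return true
            }
        }
        return false
    }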