Add robots.txt parsing and matching functionality
- Create separate robotsMatch package for robots.txt handling
- Implement robots.txt parsing with support for different directives
- Add support for Disallow patterns (Allow handling is not yet implemented)
- Include robots.txt matching with efficient prefix matching
- Add test cases for robots matching
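For context, a minimal usage sketch of the new package from a crawler's point of view. The function names come from the diff below; the import path gemini-grc/robotsMatch is inferred from the repository's other import paths, and error handling is simplified.

package main

import (
	"context"
	"fmt"

	"gemini-grc/robotsMatch" // assumed import path, matching the module paths seen in the diff
)

func main() {
	// Initialize/Shutdown currently only log, but keep the lifecycle explicit.
	_ = robotsMatch.Initialize()
	defer robotsMatch.Shutdown()

	ctx := context.Background()
	target := "gemini://example.com/admin/"

	// RobotMatch fetches and caches robots.txt for the host on first use,
	// then reports whether the URL is disallowed.
	blocked, err := robotsMatch.RobotMatch(ctx, target)
	if err != nil {
		fmt.Println("robots.txt check failed:", err)
		return
	}
	if blocked {
		fmt.Println("skipping URL disallowed by robots.txt:", target)
		return
	}
	// ...fetch target here...
}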
robotsMatch/robots.go (new normal file, 73 lines)
@@ -0,0 +1,73 @@
package robotsMatch

import (
	"context"
	"fmt"
	"strings"

	"gemini-grc/common/contextlog"
	"gemini-grc/contextutil"
	"gemini-grc/logging"
)

// ParseRobotsTxt takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This is the legacy version without context support.
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
	// Call the context-aware version with a background context
	return ParseRobotsTxtWithContext(context.Background(), content, host)
}

// ParseRobotsTxtWithContext takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This version supports context for logging.
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxtWithContext(ctx context.Context, content string, host string) []string {
	// Create a context for robots.txt parsing
	parseCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.parser")

	var disallowedPaths []string
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		line = strings.ToLower(line)
		if strings.HasPrefix(line, "disallow:") {
			parts := strings.SplitN(line, ":", 2)
			if len(parts) == 2 {
				path := strings.TrimSpace(parts[1])
				if path != "" {
					// Construct full Gemini URL
					var fullURL string

					// Handle if the path is already a full URL
					if strings.HasPrefix(path, "gemini://") {
						// Extract just the path from the full URL
						urlParts := strings.SplitN(path, "/", 4)
						if len(urlParts) >= 4 {
							// Get the path part (everything after the domain)
							pathPart := "/" + urlParts[3]
							fullURL = fmt.Sprintf("gemini://%s%s", host, pathPart)
						} else {
							// If it's just a domain without a path, skip it or use root path
							fullURL = fmt.Sprintf("gemini://%s/", host)
						}
					} else {
						// It's a relative path, just add it to the host
						if !strings.HasPrefix(path, "/") {
							path = "/" + path
						}
						fullURL = fmt.Sprintf("gemini://%s%s", host, path)
					}

					disallowedPaths = append(disallowedPaths, fullURL)

					// Add additional logging to debug robots.txt parsing
					contextlog.LogDebugWithContext(parseCtx, logging.GetSlogger(), "Added robots.txt disallow rule: %s from original: %s", fullURL, path)
				}
			}
		}
	}
	return disallowedPaths
}
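A possible direction for the user-agent TODO above, sketched as a hypothetical helper in the same package (not part of this commit; it relies only on the fmt and strings imports already present): Disallow rules are collected only while the current User-agent group applies to "*" or to the crawler's own agent name.

// parseRobotsTxtForAgent is a hypothetical variant of ParseRobotsTxt that
// honors User-agent groups: Disallow rules are collected only while the
// current group applies to "*" or to the given agent name.
func parseRobotsTxtForAgent(content string, host string, agent string) []string {
	var disallowed []string
	applies := true // rules before any User-agent line apply to everyone
	prevWasAgent := false
	agent = strings.ToLower(agent)
	for _, raw := range strings.Split(content, "\n") {
		line := strings.ToLower(strings.TrimSpace(raw))
		if strings.HasPrefix(line, "user-agent:") {
			name := strings.TrimSpace(strings.TrimPrefix(line, "user-agent:"))
			match := name == "*" || name == agent
			if prevWasAgent {
				// Consecutive User-agent lines form a single group.
				applies = applies || match
			} else {
				applies = match
			}
			prevWasAgent = true
			continue
		}
		prevWasAgent = false
		if !applies || !strings.HasPrefix(line, "disallow:") {
			continue
		}
		path := strings.TrimSpace(strings.TrimPrefix(line, "disallow:"))
		if path == "" {
			continue
		}
		if !strings.HasPrefix(path, "/") {
			path = "/" + path
		}
		disallowed = append(disallowed, fmt.Sprintf("gemini://%s%s", host, path))
	}
	return disallowed
}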
robotsMatch/robotsMatch.go (new normal file, 173 lines)
@@ -0,0 +1,173 @@
package robotsMatch

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"sync"

	"gemini-grc/common/contextlog"
	"gemini-grc/common/snapshot"
	geminiUrl "gemini-grc/common/url"
	"gemini-grc/contextutil"
	"gemini-grc/gemini"
	"gemini-grc/logging"
)

// RobotsCache caches robots.txt disallow rules per host.
// key: "host:port" (lowercased)
// value: []string of disallowed URL prefixes
// If a host has no blocked URLs, an empty
// list is stored so the lookup is still cached.
var RobotsCache sync.Map //nolint:gochecknoglobals

func populateRobotsCache(ctx context.Context, key string) (entries []string, _err error) {
	// Create a context for robots cache population
	cacheCtx := contextutil.ContextWithComponent(ctx, "robotsCache")

	// We either store an empty list when there are no rules,
	// or a list of disallowed URLs. This applies even if we have
	// an error finding/downloading robots.txt, except on context
	// timeout or cancellation, where caching is skipped so the
	// fetch can be retried later.
	cacheResult := true
	defer func() {
		if cacheResult {
			RobotsCache.Store(key, entries)
		}
	}()

	url := fmt.Sprintf("gemini://%s/robots.txt", key)
	contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Fetching robots.txt from %s", url)

	// Use the context-aware version to honor timeout and cancellation
	robotsContent, err := gemini.ConnectAndGetDataWithContext(cacheCtx, url)
	if err != nil {
		// Check for context timeout or cancellation specifically
		if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
			contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Timeout or cancellation while fetching robots.txt: %v", err)
			// Don't cache the result on timeout, to allow retrying later
			cacheResult = false
			return []string{}, err
		}
		// For other errors, the deferred Store caches an empty list
		// for this host to avoid continually hitting it
		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Failed to get robots.txt: %v", err)
		return []string{}, err
	}

	s, err := snapshot.SnapshotFromURL(url, true)
	if err != nil {
		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Failed to create snapshot from URL: %v", err)
		return []string{}, nil
	}

	// TODO: Update gemini.ProcessData to accept context
	s, err = gemini.ProcessData(*s, robotsContent)
	if err != nil {
		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "robots.txt error: %s", err)
		return []string{}, nil
	}

	if s.ResponseCode.ValueOrZero() != 20 {
		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
		return []string{}, nil
	}

	// Some hosts return text/plain, others text/gemini.
	// According to the spec, the first is correct,
	// however let's be lenient.
	var data string
	switch {
	case s.MimeType.ValueOrZero() == "text/plain":
		data = string(s.Data.ValueOrZero())
	case s.MimeType.ValueOrZero() == "text/gemini":
		data = s.GemText.ValueOrZero()
	default:
		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Unsupported mime type: %s", s.MimeType.ValueOrZero())
		return []string{}, nil
	}

	entries = ParseRobotsTxtWithContext(ctx, data, key)
	return entries, nil
}

// RobotMatch checks whether the given URL matches
// a robots.txt disallow rule.
func RobotMatch(ctx context.Context, u string) (bool, error) {
	// Create a context for robots operations
	robotsCtx := contextutil.ContextWithComponent(ctx, "robotsMatch")

	url, err := geminiUrl.ParseURL(u, "", true)
	if err != nil {
		contextlog.LogErrorWithContext(robotsCtx, logging.GetSlogger(), "Failed to parse URL: %v", err)
		return false, err
	}

	key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
	contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Checking robots.txt for URL: %s with host key: %s", u, key)

	var disallowedURLs []string
	cacheEntries, ok := RobotsCache.Load(key)
	if !ok {
		// First-time check, populate the robots cache
		contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "No robots.txt cache for %s, fetching...", key)
		var fetchErr error
		disallowedURLs, fetchErr = populateRobotsCache(ctx, key)
		if fetchErr != nil {
			contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Error populating robots.txt cache for %s: %v", key, fetchErr)

			// Handle context timeouts by propagating the error
			if errors.Is(fetchErr, context.DeadlineExceeded) || errors.Is(fetchErr, context.Canceled) {
				contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Timeout or cancellation while checking robots.txt")
				return false, fetchErr
			}

			// For other errors, assume we can proceed without robots.txt
			return false, nil
		}
		if len(disallowedURLs) > 0 {
			contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Added to robots.txt cache: %v => %v", key, disallowedURLs)
		} else {
			contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "No disallowed paths found in robots.txt for %s", key)
		}
	} else {
		var ok bool
		disallowedURLs, ok = cacheEntries.([]string)
		if !ok {
			contextlog.LogErrorWithContext(robotsCtx, logging.GetSlogger(), "Invalid type in robots.txt cache for %s", key)
			disallowedURLs = []string{} // Use empty list as fallback
		}
		contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Found %d disallowed paths in robots.txt cache for %s", len(disallowedURLs), key)
	}
	return isURLblocked(ctx, disallowedURLs, url.Full), nil
}

// Initialize initializes the robots.txt match package.
func Initialize() error {
	logging.LogDebug("Initializing robotsMatch package")
	return nil
}

// Shutdown cleans up the robots.txt match package.
func Shutdown() error {
	logging.LogDebug("Shutting down robotsMatch package")
	return nil
}

func isURLblocked(ctx context.Context, disallowedURLs []string, input string) bool {
	// Create a context for URL blocking checks
	blockCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.isURLblocked")

	inputLower := strings.ToLower(input)
	contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "Checking URL against robots.txt rules: %s", input)

	for _, url := range disallowedURLs {
		urlLower := strings.ToLower(url)
		contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "Comparing against rule: %s (lower: %s vs %s)", url, inputLower, urlLower)

		if strings.HasPrefix(inputLower, urlLower) {
			contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "MATCH! robots.txt rule: %s blocks URL: %s", url, input)
			return true
		}
	}
	contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "No robots.txt rules matched URL: %s", input)
	return false
}
robotsMatch/robotsMatch_test.go (new normal file, 49 lines)
@@ -0,0 +1,49 @@
package robotsMatch

import (
	"context"
	"errors"
	"sync"
	"testing"

	"gemini-grc/config"
)

func TestInitializeShutdown(t *testing.T) {
	err := Initialize()
	if err != nil {
		t.Errorf("Initialize() failed: %v", err)
	}

	err = Shutdown()
	if err != nil {
		t.Errorf("Shutdown() failed: %v", err)
	}
}

func TestRobotMatch_EmptyCache(t *testing.T) {
	// This test doesn't actually connect to gemini URLs due to the complexity
	// of mocking the gemini client, but tests the caching behavior when no
	// robots.txt is found (empty cache case)
	config.CONFIG.ResponseTimeout = 5

	// Clear the cache before testing
	RobotsCache = sync.Map{}

	// For an empty cache or DNS errors, RobotMatch should return false (allow the URL) without an error
	ctx := context.Background()
	blocked, err := RobotMatch(ctx, "gemini://nonexistent.example.com/")
	// We expect no error for a non-existent host because the error handling
	// is tolerant of DNS/connectivity issues
	if err != nil {
		// The only errors we should get are context-related (timeout, cancellation)
		if !errors.Is(err, context.DeadlineExceeded) && !errors.Is(err, context.Canceled) {
			t.Errorf("Expected nil error for non-existent host, got: %v", err)
		}
	}

	// The URL should be allowed (not blocked) when robots.txt can't be fetched
	if blocked {
		t.Errorf("Expected URL to be allowed when robots.txt can't be fetched")
	}
}
robotsMatch/robots_test.go (new normal file, 57 lines)
@@ -0,0 +1,57 @@
package robotsMatch

import (
	"context"
	"reflect"
	"testing"
)

func TestParseRobotsTxt(t *testing.T) {
	t.Parallel()
	input := `User-agent: *
Disallow: /cgi-bin/wp.cgi/view
Disallow: /cgi-bin/wp.cgi/media
User-agent: googlebot
Disallow: /admin/`

	expected := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}

	result := ParseRobotsTxt(input, "example.com")

	if !reflect.DeepEqual(result, expected) {
		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
	}
}

func TestParseRobotsTxtEmpty(t *testing.T) {
	t.Parallel()
	input := ``

	result := ParseRobotsTxt(input, "example.com")

	if len(result) != 0 {
		t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
	}
}

func TestIsURLblocked(t *testing.T) {
	t.Parallel()
	disallowedURLs := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}
	ctx := context.Background()
	url := "gemini://example.com/admin/index.html"
	if !isURLblocked(ctx, disallowedURLs, url) {
		t.Errorf("Expected %s to be blocked", url)
	}
	url = "gemini://example1.com/admin/index.html"
	if isURLblocked(ctx, disallowedURLs, url) {
		t.Errorf("expected %s to not be blocked", url)
	}
}
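The absolute-URL branch of the parser (Disallow lines that already contain a full gemini:// URL) isn't covered by the tests above. A small additional case in the same style, as a sketch; the test name TestParseRobotsTxtAbsoluteURL is hypothetical and the expected output follows from the parser logic in robots.go.

func TestParseRobotsTxtAbsoluteURL(t *testing.T) {
	t.Parallel()
	// Disallow value given as a full URL; the parser should reduce it to a
	// path and rebuild it against the supplied host.
	input := `User-agent: *
Disallow: gemini://example.com/secret/`

	expected := []string{
		"gemini://example.com/secret/",
	}

	result := ParseRobotsTxt(input, "example.com")

	if !reflect.DeepEqual(result, expected) {
		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
	}
}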