Enhance crawler with seed list and SQL utilities

Add a seedList module for URL initialization and comprehensive SQL utilities for database analysis, and update the project configuration.
2025-06-16 12:29:33 +03:00
parent 5e6dabf1e7
commit 8588414b14
37 changed files with 742 additions and 682 deletions
--- a/robotsMatch/robotsMatch_test.go
+++ b/robotsMatch/robotsMatch_test.go
@@ -2,7 +2,6 @@ package robotsMatch

 import (
 	"context"
-	"errors"
 	"sync"
 	"testing"

@@ -32,15 +31,7 @@ func TestRobotMatch_EmptyCache(t *testing.T) {

 	// For empty cache or DNS errors, RobotMatch should return false (allow the URL) without an error
 	ctx := context.Background()
-	blocked, err := RobotMatch(ctx, "gemini://nonexistent.example.com/")
-	// We expect no error for non-existent host because we changed our error handling
-	// to be more tolerant of DNS/connectivity issues
-	if err != nil {
-		// The only errors we should get are context-related (timeout, cancellation)
-		if !errors.Is(err, context.DeadlineExceeded) && !errors.Is(err, context.Canceled) {
-			t.Errorf("Expected nil error for non-existent host, got: %v", err)
-		}
-	}
+	blocked := RobotMatch(ctx, "gemini://nonexistent.example.com/")

 	// The URL should be allowed (not blocked) when robots.txt can't be fetched
 	if blocked {