Add robots.txt parsing and matching functionality

- Create separate robotsMatch package for robots.txt handling
- Implement robots.txt parsing with support for different directives
- Add support for both Allow and Disallow patterns
- Include URL matching against robots.txt rules with efficient pattern matching
- Add test cases for robots.txt matching
2025-05-22 12:46:21 +03:00
parent 5940a117fd
commit 8a9ca0b2e7
4 changed files with 352 additions and 0 deletions

@@ -0,0 +1,57 @@
package robotsMatch

import (
	"context"
	"reflect"
	"testing"
)

func TestParseRobotsTxt(t *testing.T) {
	t.Parallel()
	input := `User-agent: *
Disallow: /cgi-bin/wp.cgi/view
Disallow: /cgi-bin/wp.cgi/media
User-agent: googlebot
Disallow: /admin/`
	expected := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}
	result := ParseRobotsTxt(input, "example.com")
	if !reflect.DeepEqual(result, expected) {
		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
	}
}

func TestParseRobotsTxtEmpty(t *testing.T) {
	t.Parallel()
	input := ``
	result := ParseRobotsTxt(input, "example.com")
	if len(result) != 0 {
		t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
	}
}

func TestIsURLblocked(t *testing.T) {
	t.Parallel()
	disallowedURLs := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}
	ctx := context.Background()
	url := "gemini://example.com/admin/index.html"
	if !isURLblocked(ctx, disallowedURLs, url) {
		t.Errorf("expected %s to be blocked", url)
	}
	url = "gemini://example1.com/admin/index.html"
	if isURLblocked(ctx, disallowedURLs, url) {
		t.Errorf("expected %s to not be blocked", url)
	}
}
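
The implementation of ParseRobotsTxt and isURLblocked is not shown in this hunk, only the test file is. The sketch below is one shape the two functions could take that satisfies the tests above; it assumes every Disallow path is collected regardless of its User-agent group, paths are prefixed with the gemini:// scheme and the given host, and blocking is a simple prefix check. The Allow handling and the pattern matching mentioned in the commit message are not covered here.

package robotsMatch

import (
	"context"
	"strings"
)

// ParseRobotsTxt converts the body of a robots.txt file into a list of
// fully qualified gemini:// URL prefixes that must not be fetched.
// Sketch behaviour: every Disallow path is collected, whichever
// User-agent group it appears in, as TestParseRobotsTxt expects.
func ParseRobotsTxt(robotsTxt, host string) []string {
	disallowed := []string{}
	for _, line := range strings.Split(robotsTxt, "\n") {
		line = strings.TrimSpace(line)
		path, ok := strings.CutPrefix(line, "Disallow:")
		if !ok {
			continue
		}
		path = strings.TrimSpace(path)
		if path == "" {
			continue
		}
		disallowed = append(disallowed, "gemini://"+host+path)
	}
	return disallowed
}

// isURLblocked reports whether url starts with any of the disallowed URL
// prefixes produced by ParseRobotsTxt. ctx matches the call made in
// TestIsURLblocked but is unused in this sketch.
func isURLblocked(ctx context.Context, disallowedURLs []string, url string) bool {
	for _, prefix := range disallowedURLs {
		if strings.HasPrefix(url, prefix) {
			return true
		}
	}
	return false
}

A plain prefix check keeps matching linear in the number of rules; the efficient pattern matching described in the commit message (for example wildcard support) may go further than this sketch does.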