Add robots.txt checking
Still needs periodic cache refresh
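
The pending cache refresh could be handled by storing a fetch timestamp next to the cached rules and re-fetching after a TTL. A minimal sketch, assuming the "host:port" keyed sync.Map layout used by RobotsCache in gemini/robotmatch.go below; the robotsEntry wrapper, the helper names, and the 24h TTL are illustrative and not part of this commit:

package gemini

import (
	"sync"
	"time"
)

// robotsEntry is a hypothetical wrapper pairing the cached
// disallow prefixes with the time they were fetched.
type robotsEntry struct {
	prefixes  []string
	fetchedAt time.Time
}

// robotsTTL is an assumed refresh interval.
const robotsTTL = 24 * time.Hour

var robotsCacheWithTTL sync.Map // key: "host:port", value: robotsEntry

// cachedPrefixes returns the cached prefixes for key, or ok=false
// when there is no entry or the entry is stale and should be
// re-fetched (e.g. via populateBlacklist).
func cachedPrefixes(key string) (prefixes []string, ok bool) {
	v, found := robotsCacheWithTTL.Load(key)
	if !found {
		return nil, false
	}
	entry := v.(robotsEntry)
	if time.Since(entry.fetchedAt) > robotsTTL {
		// Stale entry: drop it so the caller re-populates.
		robotsCacheWithTTL.Delete(key)
		return nil, false
	}
	return entry.prefixes, true
}

// storePrefixes records freshly fetched prefixes with a timestamp.
func storePrefixes(key string, prefixes []string) {
	robotsCacheWithTTL.Store(key, robotsEntry{prefixes: prefixes, fetchedAt: time.Now()})
}
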
@@ -10,10 +10,10 @@ A Gemini crawler.
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
- [x] Follow robots.txt

## TODO

- [ ] Follow robots.txt gemini://geminiprotocol.net/docs/companion/
- [ ] Test with gemini://alexey.shpakovsky.ru/maze
- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi
- [ ] Proper handling of all response codes
- [ ] Handle 3X redirects properly
- [ ] Handle URLs that need presentation of a TLS cert, like astrobotany
@@ -1,5 +0,0 @@
gemi.dev
kennedy.gemi.dev
alexey.shpakovsky.ru
musicbrainz.uploadedlobster.com
gemini.bunburya.eu
@@ -42,4 +42,5 @@ CREATE INDEX idx_lang ON snapshots (lang);
CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host);
-- Add the unprocessed snapshots index here! check db
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;
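
For context, the partial index above targets the crawler's "fetch unprocessed snapshots" lookup. A rough sqlx sketch of that kind of query; the function name, the selected column, and the LIMIT are illustrative, only the snapshots table and the response_code/error columns come from the schema above:

package gemini

import (
	"github.com/jmoiron/sqlx"
)

// UnprocessedHosts illustrates the query that
// idx_response_code_error_nulls serves: snapshots with
// neither a response code nor an error recorded yet.
func UnprocessedHosts(db *sqlx.DB, limit int) ([]string, error) {
	var hosts []string
	err := db.Select(&hosts,
		`SELECT host
		   FROM snapshots
		  WHERE response_code IS NULL
		    AND error IS NULL
		  LIMIT $1`, limit)
	if err != nil {
		return nil, err
	}
	return hosts, nil
}
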
@@ -1,22 +0,0 @@
package gemini

import "gemini-grc/logging"

var Blacklist *[]string

func InBlacklist(s *Snapshot) bool {
	if Blacklist == nil {
		data := ReadLines("blacklists/domains.txt")
		Blacklist = &data
		logging.LogInfo("Loaded %d blacklisted domains", len(*Blacklist))
	}
	for _, l := range *Blacklist {
		if s.Host == l {
			return true
		}
		// if strings.HasPrefix(s.URL.String(), l) {
		// 	return true
		// }
	}
	return false
}
gemini/robotmatch.go (new file)
@@ -0,0 +1,83 @@
package gemini

import (
	"fmt"
	"gemini-grc/logging"
	"strings"
	"sync"
)

// RobotsCache maps "host:port" (string) to either
// an empty []string (no robots.txt data) or a list
// of disallowed URL prefixes ([]string).
var RobotsCache sync.Map

func populateBlacklist(key string) (entries []string) {
	// We store either an empty list (no rules) or a list
	// of disallowed URL prefixes. This also applies when
	// we fail to find or download robots.txt.
	defer func() {
		RobotsCache.Store(key, entries)
	}()
	url := fmt.Sprintf("gemini://%s/robots.txt", key)
	robotsContent, err := ConnectAndGetData(url)
	if err != nil {
		logging.LogDebug("robots.txt error %s", err)
		return []string{}
	}
	robotsData, err := processData(robotsContent)
	if err != nil {
		logging.LogDebug("robots.txt error %s", err)
		return []string{}
	}
	if robotsData.ResponseCode != 20 {
		logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
		return []string{}
	}
	// Some servers return text/plain, others text/gemini.
	// According to the spec the former is correct,
	// but let's be lenient.
	var data string
	if robotsData.MimeType == "text/plain" {
		data = string(robotsData.Data)
	} else if robotsData.MimeType == "text/gemini" {
		data = robotsData.GemText
	} else {
		return []string{}
	}
	entries = ParseRobotsTxt(data, key)
	return entries
}

// RobotMatch reports whether the snapshot URL matches
// a robots.txt disallow rule.
func RobotMatch(s *Snapshot) bool {
	logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
	key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
	v, ok := RobotsCache.Load(key)
	if !ok {
		// First time we see this host: populate the robots cache.
		logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
		disallowedURLs := populateBlacklist(key)
		for _, url := range disallowedURLs {
			if strings.HasPrefix(s.URL.String(), url) {
				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
				return true
			}
		}
	} else {
		if len(v.([]string)) == 0 {
			logging.LogDebug("No robots.txt or no rules, allowed")
			return false
		}
		for _, url := range v.([]string) {
			if strings.HasPrefix(s.URL.String(), url) {
				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
				return true
			}
		}
	}
	return false
}
@@ -1,8 +1,8 @@
package gemini

import (
	"reflect"
	"testing"
)

func TestParseRobotsTxt(t *testing.T) {
@@ -15,6 +15,7 @@ Disallow: /admin/`
	expected := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}

	result := ParseRobotsTxt(input, "example.com")
@@ -23,3 +24,13 @@ Disallow: /admin/`
		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
	}
}

func TestParseRobotsTxtEmpty(t *testing.T) {
	input := ``

	result := ParseRobotsTxt(input, "example.com")

	if len(result) != 0 {
		t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
	}
}
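
The ParseRobotsTxt implementation itself is not part of this diff. Below is a minimal sketch consistent with the tests above: each "Disallow:" path becomes a full URL prefix under gemini://<key>. User-agent sections are deliberately ignored here, which is an assumption about the real code, and the function name is changed to make clear it is only an illustration:

package gemini

import "strings"

// parseRobotsTxtSketch mirrors the behaviour the tests expect from
// ParseRobotsTxt: every "Disallow:" path is turned into a full URL
// prefix under gemini://<key>. Blank paths and non-Disallow lines
// are skipped; User-agent handling is omitted in this sketch.
func parseRobotsTxtSketch(input string, key string) []string {
	var entries []string
	for _, line := range strings.Split(input, "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "Disallow:") {
			continue
		}
		path := strings.TrimSpace(strings.TrimPrefix(line, "Disallow:"))
		if path == "" {
			continue
		}
		entries = append(entries, "gemini://"+key+path)
	}
	return entries
}
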
@@ -30,12 +30,17 @@ func printPoolIPs() {
}

func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	// Wrap errors with more info.
	defer func() {
		if err != nil {
			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
		}
	}()
	// If the URL matches a robots.txt disallow line,
	// save it with an error so next time it won't be crawled.
	if RobotMatch(s) {
		s.Error = null.StringFrom("robots.txt disallow match")
		err = SaveSnapshotToDB(tx, s)
		if err != nil {
			return fmt.Errorf("[%d] DB Error: %w", id, err)
		}
		return nil
	}

	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
@@ -88,19 +93,19 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	if s.Links != nil {
		var batchSnapshots []*Snapshot
		timestamp := null.TimeFrom(time.Now())

		for _, link := range *s.Links {
			if shouldPersistURL(tx, link) {
				newSnapshot := &Snapshot{
					UID:       uid.UID(),
					URL:       link,
					Host:      link.Hostname,
					Timestamp: timestamp,
				}
				batchSnapshots = append(batchSnapshots, newSnapshot)
			}
		}

		if len(batchSnapshots) > 0 {
			logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
			err = SaveLinksToDB(tx, batchSnapshots)
@@ -228,9 +233,6 @@ func runWorker(id int, db *sqlx.DB) {
	}
	total := len(snapshots)
	for i, s := range snapshots {
		if InBlacklist(&s) {
			logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
		}
		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
		err = workOnSnapshot(id, tx, &s)
		if err != nil {