Compare commits

...

2 Commits

SHA1       Message                             Date
02015faa81 Add robots.txt checking             2024-10-23 14:28:49 +03:00
           Still needs periodic cache refresh
c49a69728a Simplify robots.txt parsing logic   2024-10-23 14:28:49 +03:00
7 changed files with 118 additions and 67 deletions

View File

@@ -10,10 +10,10 @@ A Gemini crawler.
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
- [x] Follow robots.txt
## TODO
- [ ] Follow robots.txt gemini://geminiprotocol.net/docs/companion/
- [ ] Test with gemini://alexey.shpakovsky.ru/maze
- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi
- [ ] Proper handling of all response codes
- [ ] Handle 3X redirects properly
- [ ] Handle URLs that need presentation of a TLS cert, like astrobotany

View File

@@ -1,5 +0,0 @@
gemi.dev
kennedy.gemi.dev
alexey.shpakovsky.ru
musicbrainz.uploadedlobster.com
gemini.bunburya.eu

View File

@@ -1,22 +0,0 @@
package gemini

import "gemini-grc/logging"

var Blacklist *[]string

func InBlacklist(s *Snapshot) bool {
	if Blacklist == nil {
		data := ReadLines("blacklists/domains.txt")
		Blacklist = &data
		logging.LogInfo("Loaded %d blacklisted domains", len(*Blacklist))
	}
	for _, l := range *Blacklist {
		if s.Host == l {
			return true
		}
		// if strings.HasPrefix(s.URL.String(), l) {
		// 	return true
		// }
	}
	return false
}
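The removed helper lazily initialized a package-level pointer from the workers and scanned a slice on every call. If a static domain blacklist is ever reintroduced, a minimal sketch along these lines, assuming the existing ReadLines helper and Snapshot type (the name InDomainBlacklist and the sync.Once wiring are illustrative, not part of this change), would give synchronized initialization and O(1) lookups:

package gemini

import "sync"

var (
	domainBlacklistOnce sync.Once
	domainBlacklist     map[string]struct{}
)

// InDomainBlacklist reports whether the snapshot's host appears in
// blacklists/domains.txt. The list is loaded once, guarded by a
// sync.Once, and kept in a set instead of a slice.
func InDomainBlacklist(s *Snapshot) bool {
	domainBlacklistOnce.Do(func() {
		domainBlacklist = make(map[string]struct{})
		for _, d := range ReadLines("blacklists/domains.txt") {
			domainBlacklist[d] = struct{}{}
		}
	})
	_, found := domainBlacklist[s.Host]
	return found
}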

gemini/robotmatch.go Normal file
View File

@@ -0,0 +1,80 @@
package gemini

import (
	"fmt"
	"gemini-grc/logging"
	"strings"
	"sync"
)

// RobotsCache maps "host:port" keys to either an empty []string
// (no robots.txt data for that host) or a list of disallowed
// URL prefixes taken from its robots.txt.
var RobotsCache sync.Map

func populateBlacklist(key string) (entries []string) {
	// Store either an empty list (no rules) or a list of disallowed
	// URL prefixes. This also applies when finding or downloading
	// robots.txt fails.
	defer func() {
		RobotsCache.Store(key, entries)
	}()
	url := fmt.Sprintf("gemini://%s/robots.txt", key)
	robotsContent, err := ConnectAndGetData(url)
	if err != nil {
		logging.LogDebug("robots.txt error %s", err)
		return []string{}
	}
	robotsData, err := processData(robotsContent)
	if err != nil {
		logging.LogDebug("robots.txt error %s", err)
		return []string{}
	}
	if robotsData.ResponseCode != 20 {
		logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
		return []string{}
	}
	// Some servers return text/plain, others text/gemini.
	// According to the spec, the former is correct, but be lenient.
	var data string
	if robotsData.MimeType == "text/plain" {
		data = string(robotsData.Data)
	} else if robotsData.MimeType == "text/gemini" {
		data = robotsData.GemText
	} else {
		return []string{}
	}
	entries = ParseRobotsTxt(data, key)
	return entries
}

func RobotMatch(s *Snapshot) bool {
	logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
	key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
	v, ok := RobotsCache.Load(key)
	if !ok {
		logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
		disallowedURLs := populateBlacklist(key)
		for _, url := range disallowedURLs {
			if strings.HasPrefix(s.URL.String(), url) {
				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
				return true
			}
		}
	} else {
		if len(v.([]string)) == 0 {
			logging.LogDebug("No robots.txt or no rules, allowed")
			return false
		}
		for _, url := range v.([]string) {
			if strings.HasPrefix(s.URL.String(), url) {
				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
				return true
			}
		}
	}
	return false
}
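The first commit's message notes that a periodic cache refresh is still missing. One simple approach is to clear the sync.Map on a timer so robots.txt rules are eventually re-fetched. A minimal sketch, building on RobotsCache and logging above and assuming the crawler calls it once at startup (the function name and interval handling are illustrative, not part of this change):

package gemini

import (
	"time"

	"gemini-grc/logging"
)

// StartRobotsCacheRefresh clears the robots.txt cache every interval,
// forcing the next RobotMatch for each host to re-fetch robots.txt.
func StartRobotsCacheRefresh(interval time.Duration) {
	go func() {
		for range time.Tick(interval) {
			RobotsCache.Range(func(key, _ any) bool {
				RobotsCache.Delete(key)
				return true
			})
			logging.LogDebug("robots.txt cache cleared")
		}
	}()
}

A per-entry TTL stored alongside the prefixes would avoid re-fetching every host at once, at the cost of a slightly larger cache value.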

View File

@@ -1,35 +1,21 @@
package gemini

import (
	"bufio"
	"fmt"
	"strings"
)

// ParseRobotsTxt takes robots.txt content and a host, returns list of full URLs that shouldn't be visited
// Takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't
// be visited.
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
	scanner := bufio.NewScanner(strings.NewReader(content))
	var disallowedPaths []string
	// Skip everything until we find "User-agent: *" line
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if strings.ToLower(line) == "user-agent: *" {
			break
		}
	}
	// Now collect all Disallow paths
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		// Stop if we hit another User-agent section
		if strings.HasPrefix(strings.ToLower(line), "user-agent:") {
			break
		}
		// Parse Disallow lines
		if strings.HasPrefix(strings.ToLower(line), "disallow:") {
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		line = strings.ToLower(line)
		if strings.HasPrefix(line, "disallow:") {
			parts := strings.SplitN(line, ":", 2)
			if len(parts) == 2 {
				path := strings.TrimSpace(parts[1])
@@ -41,6 +27,5 @@ func ParseRobotsTxt(content string, host string) []string {
			}
		}
	}
	return disallowedPaths
}
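The TODO above asks about honouring user agents. A sketch of section-aware parsing in the same split-based style, close to what the scanner-based version did; the function name and the agent parameter are illustrative, and groups declared with several consecutive User-agent lines are not handled:

package gemini

import (
	"fmt"
	"strings"
)

// parseRobotsTxtForAgent collects Disallow paths only from sections
// whose User-agent line is "*" or matches agent, and prefixes them
// with gemini://<host> as the existing parser does.
func parseRobotsTxtForAgent(content, host, agent string) []string {
	var disallowed []string
	applies := false
	for _, raw := range strings.Split(content, "\n") {
		line := strings.ToLower(strings.TrimSpace(raw))
		switch {
		case strings.HasPrefix(line, "user-agent:"):
			ua := strings.TrimSpace(strings.TrimPrefix(line, "user-agent:"))
			applies = ua == "*" || ua == strings.ToLower(agent)
		case applies && strings.HasPrefix(line, "disallow:"):
			path := strings.TrimSpace(strings.TrimPrefix(line, "disallow:"))
			if path != "" {
				disallowed = append(disallowed, fmt.Sprintf("gemini://%s%s", host, path))
			}
		}
	}
	return disallowed
}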

View File

@@ -1,8 +1,8 @@
package gemini

import (
	"testing"
	"reflect"
	"testing"
)

func TestParseRobotsTxt(t *testing.T) {
@@ -15,6 +15,7 @@ Disallow: /admin/`
	expected := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}
	result := ParseRobotsTxt(input, "example.com")
@@ -23,3 +24,13 @@ Disallow: /admin/`
		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
	}
}

func TestParseRobotsTxtEmpty(t *testing.T) {
	input := ``
	result := ParseRobotsTxt(input, "example.com")
	if len(result) != 0 {
		t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
	}
}
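As more cases accumulate, a table-driven layout keeps them in one place. A sketch that only reuses expectations already asserted above (the test name and case labels are illustrative):

package gemini

import (
	"reflect"
	"testing"
)

func TestParseRobotsTxtTable(t *testing.T) {
	cases := []struct {
		name    string
		content string
		want    []string
	}{
		{name: "empty input", content: "", want: nil},
		{
			name:    "single disallow",
			content: "User-agent: *\nDisallow: /admin/",
			want:    []string{"gemini://example.com/admin/"},
		},
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got := ParseRobotsTxt(c.content, "example.com")
			if len(got) == 0 && len(c.want) == 0 {
				return
			}
			if !reflect.DeepEqual(got, c.want) {
				t.Errorf("ParseRobotsTxt(%q) = %v, want %v", c.content, got, c.want)
			}
		})
	}
}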

View File

@@ -30,12 +30,17 @@ func printPoolIPs() {
}

func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	// Wrap errors with more info.
	defer func() {
	// If URL matches a robots.txt disallow line,
	// add it as an error so next time it won't be
	// crawled.
	if RobotMatch(s) {
		s.Error = null.StringFrom("robots.txt disallow match")
		err = SaveSnapshotToDB(tx, s)
		if err != nil {
			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
			return fmt.Errorf("[%d] DB Error: %w", id, err)
		}
	}()
		return nil
	}
	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
@@ -92,9 +97,9 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	for _, link := range *s.Links {
		if shouldPersistURL(tx, link) {
			newSnapshot := &Snapshot{
				UID: uid.UID(),
				URL: link,
				Host: link.Hostname,
				UID:       uid.UID(),
				URL:       link,
				Host:      link.Hostname,
				Timestamp: timestamp,
			}
			batchSnapshots = append(batchSnapshots, newSnapshot)
@@ -228,9 +233,6 @@ func runWorker(id int, db *sqlx.DB) {
	}
	total := len(snapshots)
	for i, s := range snapshots {
		if InBlacklist(&s) {
			logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
		}
		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
		err = workOnSnapshot(id, tx, &s)
		if err != nil {