From ef3f0097098293b504b8d7468c5736d301fa94ed Mon Sep 17 00:00:00 2001
From: antanst <antanst@antanst.com>
Date: Wed, 23 Oct 2024 14:24:10 +0300
Subject: [PATCH] Add robots.txt checking

Still needs periodic cache refresh
---
 .gitignore             |   1 +
 README.md              |   4 +-
 blacklists/domains.txt |   5 --
 db/initdb.sql          |   3 +
 db/stats.sql           |   2 +-
 gemini/blacklist.go    |  22 ------
 gemini/gemini.go       |  50 ++++++-------
 gemini/persistence.go  |  28 ++++++++
 gemini/robotmatch.go   |  83 +++++++++++++++++++++
 gemini/robots_test.go  |  13 +++-
 gemini/worker.go       | 159 +++++++++++++++--------------------------
 util/util.go           |  11 +++
 12 files changed, 225 insertions(+), 156 deletions(-)
 delete mode 100644 blacklists/domains.txt
 delete mode 100644 gemini/blacklist.go
 create mode 100644 gemini/robotmatch.go
 create mode 100644 util/util.go

diff --git a/.gitignore b/.gitignore
index 7ef970b..160e6bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 **/*~
 /cmd
 /db/initdb.sql
+/db/*sh
 /run*.sh
 /gemini-grc
 /snaps
diff --git a/README.md b/README.md
index 397613b..9b17907 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,10 @@ A Gemini crawler.
 - [x] Configuration via environment variables
 - [x] Storing snapshots in PostgreSQL
 - [x] Proper response header & body UTF-8 and format validation
+- [x] Follow robots.txt
 
 ## TODO
-- [ ] Follow robots.txt gemini://geminiprotocol.net/docs/companion/
-  - [ ] Test with gemini://alexey.shpakovsky.ru/maze
+- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi
 - [ ] Proper handling of all response codes
   - [ ] Handle 3X redirects properly
 - [ ] Handle URLs that need presentation of a TLS cert, like astrobotany
diff --git a/blacklists/domains.txt b/blacklists/domains.txt
deleted file mode 100644
index 9f956ff..0000000
--- a/blacklists/domains.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-gemi.dev
-kennedy.gemi.dev
-alexey.shpakovsky.ru
-musicbrainz.uploadedlobster.com
-gemini.bunburya.eu
diff --git a/db/initdb.sql b/db/initdb.sql
index 424fd51..9071604 100644
--- a/db/initdb.sql
+++ b/db/initdb.sql
@@ -42,4 +42,7 @@ CREATE INDEX idx_lang ON snapshots (lang);
 CREATE INDEX idx_response_code ON snapshots (response_code);
 CREATE INDEX idx_error ON snapshots (error);
 CREATE INDEX idx_host ON snapshots (host);
+CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host)
+INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang) -- PostgreSQL requires INCLUDE before WHERE
+WHERE response_code IS NULL AND error IS NULL;
 CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;
diff --git a/db/stats.sql b/db/stats.sql
index afbba51..a509ee7 100644
--- a/db/stats.sql
+++ b/db/stats.sql
@@ -1,5 +1,5 @@
 SELECT
     COUNT(CASE WHEN response_code IS NOT NULL AND error IS NULL THEN 1 END) AS "Visited",
-    COUNT(CASE WHEN response_code IS NULL THEN 1 END) AS "Pending",
+    COUNT(CASE WHEN response_code IS NULL     AND error IS NULL THEN 1 END) AS "Pending",
     COUNT(CASE WHEN error IS NOT NULL THEN 1 END) AS "Errors"
 FROM snapshots;
diff --git a/gemini/blacklist.go b/gemini/blacklist.go
deleted file mode 100644
index 2f2afee..0000000
--- a/gemini/blacklist.go
+++ /dev/null
@@ -1,22 +0,0 @@
-package gemini
-
-import "gemini-grc/logging"
-
-var Blacklist *[]string
-
-func InBlacklist(s *Snapshot) bool {
-	if Blacklist == nil {
-		data := ReadLines("blacklists/domains.txt")
-		Blacklist = &data
-		logging.LogInfo("Loaded %d blacklisted domains", len(*Blacklist))
-	}
-	for _, l := range *Blacklist {
-		if s.Host == l {
-			return true
-		}
-		// if strings.HasPrefix(s.URL.String(), l) {
-		// 	return true
-		// }
-	}
-	return false
-}
diff --git a/gemini/gemini.go b/gemini/gemini.go
index f7341f6..eb24d4e 100644
--- a/gemini/gemini.go
+++ b/gemini/gemini.go
@@ -5,14 +5,14 @@ import (
 	"fmt"
 	"gemini-grc/logging"
 	"net/url"
-	go_url "net/url"
+	gourl "net/url"
 	"regexp"
 	"strconv"
 	"strings"
 )
 
 func isGeminiURL(url string) bool {
-	_, err := go_url.Parse(url)
+	_, err := gourl.Parse(url)
 	if err != nil {
 		logging.LogWarn("[%s] Invalid URL: %v", url, err)
 		return false
@@ -36,17 +36,17 @@ func checkGeminiStatusCode(code int) error {
 	case code == 20:
 		return nil
 	case code >= 10 && code < 20:
-		return fmt.Errorf("Gemini response %d needs data input", code)
+		return fmt.Errorf("gemini response %d needs data input", code)
 	case code >= 30 && code < 40:
-		return fmt.Errorf("Gemini response %d redirect", code)
+		return fmt.Errorf("gemini response %d redirect", code)
 	case code >= 40 && code < 50:
-		return fmt.Errorf("Gemini response %d server error", code)
+		return fmt.Errorf("gemini response %d server error", code)
 	case code >= 50 && code < 60:
-		return fmt.Errorf("Gemini response %d server permanent error", code)
+		return fmt.Errorf("gemini response %d server permanent error", code)
 	case code >= 60 && code < 70:
-		return fmt.Errorf("Gemini response %d certificate error", code)
+		return fmt.Errorf("gemini response %d certificate error", code)
 	default:
-		return fmt.Errorf("Unexpected/unhandled Gemini response %d", code)
+		return fmt.Errorf("unexpected/unhandled Gemini response %d", code)
 	}
 }
 
@@ -57,14 +57,14 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot {
 
 	// Normalize URLs in links, and store them in snapshot
 	for _, line := range linkLines {
-		normalizedLink, descr, error := NormalizeLink(line, snapshot.URL.String())
-		if error != nil {
-			logging.LogWarn("Cannot normalize URL in line '%s': %v", line, error)
+		normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String())
+		if err != nil {
+			logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
 			continue
 		}
-		geminiUrl, error := ParseUrl(normalizedLink, descr)
-		if error != nil {
-			logging.LogWarn("Cannot parse URL in link '%s': %v", line, error)
+		geminiUrl, err := ParseUrl(normalizedLink, descr)
+		if err != nil {
+			logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
 			continue
 		}
 		if snapshot.Links == nil {
@@ -79,18 +79,18 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot {
 func ParseUrl(input string, descr string) (*GeminiUrl, error) {
 	u, err := url.Parse(input)
 	if err != nil {
-		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
+		return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
 	}
 	protocol := u.Scheme
 	hostname := u.Hostname()
-	str_port := u.Port()
+	strPort := u.Port()
 	path := u.Path
-	if str_port == "" {
-		str_port = "1965"
+	if strPort == "" {
+		strPort = "1965"
 	}
-	port, err := strconv.Atoi(str_port)
+	port, err := strconv.Atoi(strPort)
 	if err != nil {
-		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
+		return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
 	}
 	return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
 }
@@ -106,14 +106,14 @@ func ExtractLinkLines(gemtext string) []string {
 	return matches
 }
 
-// Take a single link line and the current URL,
+// NormalizeLink takes a single link line and the current URL,
 // return the URL converted to an absolute URL
 // and its description.
 func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
 	// Parse the current URL
 	baseURL, err := url.Parse(currentURL)
 	if err != nil {
-		return "", "", fmt.Errorf("Invalid current URL: %v", err)
+		return "", "", fmt.Errorf("invalid current URL: %v", err)
 	}
 
 	// Regular expression to extract the URL part from a link line
@@ -123,13 +123,13 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
 	matches := re.FindStringSubmatch(linkLine)
 	if len(matches) == 0 {
 		// If the line doesn't match the expected format, return it unchanged
-		return "", "", fmt.Errorf("Not a link line: %v", linkLine)
+		return "", "", fmt.Errorf("not a link line: %v", linkLine)
 	}
 
 	originalURLStr := matches[1]
 	_, err = url.QueryUnescape(originalURLStr)
 	if err != nil {
-		return "", "", fmt.Errorf("Error decoding URL: %w", err)
+		return "", "", fmt.Errorf("error decoding URL: %w", err)
 	}
 
 	restOfLine := ""
@@ -141,7 +141,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
 	parsedURL, err := url.Parse(originalURLStr)
 	if err != nil {
 		// If URL parsing fails, return an error
-		return "", "", fmt.Errorf("Invalid URL '%s': %v", originalURLStr, err)
+		return "", "", fmt.Errorf("invalid URL '%s': %v", originalURLStr, err)
 	}
 
 	// Resolve relative URLs against the base URL
diff --git a/gemini/persistence.go b/gemini/persistence.go
index d10abd5..73809ad 100644
--- a/gemini/persistence.go
+++ b/gemini/persistence.go
@@ -57,6 +57,34 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
 	return nil
 }
 
+// SaveLinksToDBinBatches inserts snapshots through tx in chunks of at most
+// batchSize rows; rows whose uid already exists are skipped (ON CONFLICT).
+func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
+	// Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe
+	const batchSize = 5000
+
+	query := `
+        INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
+        VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
+        ON CONFLICT (uid) DO NOTHING
+    `
+
+	for i := 0; i < len(snapshots); i += batchSize {
+		end := i + batchSize
+		if end > len(snapshots) {
+			end = len(snapshots)
+		}
+		batch := snapshots[i:end]
+		_, err := tx.NamedExec(query, batch)
+		if err != nil {
+			logging.LogError("Error batch inserting snapshots: %v", err)
+			return fmt.Errorf("DB error: %w", err)
+		}
+	}
+
+	return nil
+}
+
 func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
 	query := `
         INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
diff --git a/gemini/robotmatch.go b/gemini/robotmatch.go
new file mode 100644
index 0000000..ddce449
--- /dev/null
+++ b/gemini/robotmatch.go
@@ -0,0 +1,83 @@
+package gemini
+
+import (
+	"fmt"
+	"gemini-grc/logging"
+	"strings"
+	"sync"
+)
+
+// RobotsCache maps "host:port" (string) to the list of
+// disallowed URL prefixes ([]string) parsed from that host's
+// robots.txt; the list is empty when there are no rules or
+// robots.txt could not be fetched or parsed.
+var RobotsCache sync.Map
+
+// populateBlacklist downloads and parses robots.txt for key ("host:port")
+// and returns the disallowed URL prefixes. Whatever is returned, including
+// the empty list used for every failure, is also stored in RobotsCache.
+func populateBlacklist(key string) (entries []string) {
+	// Store the result on every return path, failures included.
+	defer func() { RobotsCache.Store(key, entries) }()
+
+	noRules := []string{}
+	robotsURL := fmt.Sprintf("gemini://%s/robots.txt", key)
+	raw, err := ConnectAndGetData(robotsURL)
+	if err != nil {
+		logging.LogDebug("robots.txt error %s", err)
+		return noRules
+	}
+	resp, err := processData(raw)
+	if err != nil {
+		logging.LogDebug("robots.txt error %s", err)
+		return noRules
+	}
+	if code := resp.ResponseCode; code != 20 {
+		logging.LogDebug("robots.txt error code %d, ignoring", code)
+		return noRules
+	}
+
+	// The spec asks for text/plain, but some servers answer with
+	// text/gemini; accept both and ignore anything else.
+	var body string
+	switch resp.MimeType {
+	case "text/plain":
+		body = string(resp.Data)
+	case "text/gemini":
+		body = resp.GemText
+	default:
+		return noRules
+	}
+	return ParseRobotsTxt(body, key)
+}
+
+// RobotMatch reports whether the snapshot URL is covered by a
+// robots.txt disallow rule of its host (true means: do not crawl).
+// Rules come from RobotsCache, which is populated on first use.
+func RobotMatch(s *Snapshot) bool {
+	target := s.URL.String()
+	logging.LogDebug("Checking robots.txt cache for %s", target)
+	key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
+
+	var disallowed []string
+	if v, ok := RobotsCache.Load(key); ok {
+		// populateBlacklist stores []string; anything else is treated as no rules.
+		disallowed, _ = v.([]string)
+	} else {
+		// First time check, populate robot cache
+		logging.LogDebug("No robots.txt entry, populating cache for %s", target)
+		disallowed = populateBlacklist(key)
+	}
+
+	if len(disallowed) == 0 {
+		logging.LogDebug("No robots.txt or no rules, allowed")
+		return false
+	}
+	for _, prefix := range disallowed {
+		if strings.HasPrefix(target, prefix) {
+			logging.LogDebug("robots.txt match: %s %s", target, prefix)
+			return true
+		}
+	}
+	return false
+}
diff --git a/gemini/robots_test.go b/gemini/robots_test.go
index 2572f67..4a00d12 100644
--- a/gemini/robots_test.go
+++ b/gemini/robots_test.go
@@ -1,8 +1,8 @@
 package gemini
 
 import (
-	"testing"
 	"reflect"
+	"testing"
 )
 
 func TestParseRobotsTxt(t *testing.T) {
@@ -15,6 +15,7 @@ Disallow: /admin/`
 	expected := []string{
 		"gemini://example.com/cgi-bin/wp.cgi/view",
 		"gemini://example.com/cgi-bin/wp.cgi/media",
+		"gemini://example.com/admin/",
 	}
 
 	result := ParseRobotsTxt(input, "example.com")
@@ -23,3 +24,13 @@ Disallow: /admin/`
 		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
 	}
 }
+
+// TestParseRobotsTxtEmpty checks that empty robots.txt content yields no rules.
+func TestParseRobotsTxtEmpty(t *testing.T) {
+	const emptyRobots = ""
+
+	rules := ParseRobotsTxt(emptyRobots, "example.com")
+	if len(rules) > 0 {
+		t.Errorf("ParseRobotsTxt() = %v, want empty []string", rules)
+	}
+}
diff --git a/gemini/worker.go b/gemini/worker.go
index 20125c9..8bf6acf 100644
--- a/gemini/worker.go
+++ b/gemini/worker.go
@@ -1,12 +1,11 @@
 package gemini
 
 import (
-	"database/sql"
 	"fmt"
 	"gemini-grc/config"
 	"gemini-grc/logging"
 	"gemini-grc/uid"
-	"runtime/debug"
+	"gemini-grc/util"
 	"strings"
 	"time"
 
@@ -25,17 +24,63 @@ func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
 	}
 }
 
-func printPoolIPs() {
-	fmt.Printf("%v", IpPool.IPs)
+// runWorker visits one batch of snapshots inside a single DB transaction, committed when it returns.
+func runWorker(id int, db *sqlx.DB) {
+	tx, err := db.Beginx()
+	if err != nil {
+		// tx is nil here, so back off and return instead of dereferencing it below.
+		logging.LogError("Failed to begin transaction: %v", err)
+		time.Sleep(10 * time.Second)
+		return
+	}
+	defer func() {
+		err = tx.Commit()
+		if err != nil {
+			logging.LogError("[%d] Failed to commit transaction: %v", id, err)
+			rbErr := tx.Rollback()
+			if rbErr != nil {
+				panic(fmt.Sprintf("[%d] Failed to roll back transaction: %v", id, rbErr))
+			}
+		}
+	}()
+	snapshots, err := GetRandomSnapshotsDistinctHosts(tx)
+	if err != nil {
+		logging.LogError("[%d] Error retrieving snapshot: %v", id, err)
+		time.Sleep(10 * time.Second)
+		return
+	} else if len(snapshots) == 0 {
+		logging.LogInfo("[%d] No remaining snapshots to visit.", id)
+		time.Sleep(1 * time.Minute)
+		return
+	}
+	total := len(snapshots)
+	for i, s := range snapshots {
+		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
+		err = workOnSnapshot(id, tx, &s)
+		if err != nil {
+			logging.LogError("[%d] [%s] Error %v", id, s.URL, err)
+			util.PrintStackAndPanic(err)
+		}
+		if s.Error.Valid {
+			logging.LogWarn("[%d] [%s] Error: %v", id, s.URL, s.Error.String)
+		}
+		logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
+	}
+	logging.LogInfo("[%d] Worker done.", id)
 }
 
 func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
-	// Wrap errors with more info.
-	defer func() {
+	// If URL matches a robots.txt disallow line,
+	// add it as an error so next time it won't be
+	// crawled.
+	if RobotMatch(s) {
+		s.Error = null.StringFrom("robots.txt disallow match")
+		err = SaveSnapshotToDB(tx, s)
 		if err != nil {
-			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
+			return fmt.Errorf("[%d] DB Error: %w", id, err)
 		}
-	}()
+		return nil
+	}
 
 	IPs, err := getHostIPAddresses(s.Host)
 	if err != nil {
@@ -88,22 +133,22 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
 	if s.Links != nil {
 		var batchSnapshots []*Snapshot
 		timestamp := null.TimeFrom(time.Now())
-		
+
 		for _, link := range *s.Links {
 			if shouldPersistURL(tx, link) {
 				newSnapshot := &Snapshot{
-					UID: uid.UID(),
-					URL: link,
-					Host: link.Hostname,
+					UID:       uid.UID(),
+					URL:       link,
+					Host:      link.Hostname,
 					Timestamp: timestamp,
 				}
 				batchSnapshots = append(batchSnapshots, newSnapshot)
 			}
 		}
-		
+
 		if len(batchSnapshots) > 0 {
 			logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
-			err = SaveLinksToDB(tx, batchSnapshots)
+			err = SaveLinksToDBinBatches(tx, batchSnapshots)
 			if err != nil {
 				return fmt.Errorf("[%d] DB Error: %w", id, err)
 			}
@@ -127,45 +172,6 @@ func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool {
 	return !exists
 }
 
-// Select a random snapshot.
-func GetRandomSnapshot(tx *sqlx.Tx) (*Snapshot, error) {
-	query := `SELECT * FROM snapshots
-              WHERE response_code IS NULL
-              AND error IS NULL
-	      ORDER BY RANDOM()
-              LIMIT 1
-              FOR UPDATE SKIP LOCKED`
-	// AND (timestamp < NOW() - INTERVAL '1 day' OR timestamp IS NULL)
-	var snapshot Snapshot
-	err := tx.Get(&snapshot, query)
-	if err != nil {
-		if err == sql.ErrNoRows {
-			// Handle the case where no rows were found
-			return nil, nil
-		}
-		// Handle other potential errors
-		return nil, err
-	}
-	return &snapshot, nil
-}
-
-func GetRandomSnapshots(tx *sqlx.Tx) ([]Snapshot, error) {
-	query := `
-        SELECT * FROM snapshots
-        WHERE response_code IS NULL
-          AND error IS NULL
-        ORDER BY RANDOM()
-        LIMIT $1
-        FOR UPDATE SKIP LOCKED
-    `
-	var snapshots []Snapshot
-	err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
-	if err != nil {
-		return nil, err
-	}
-	return snapshots, nil
-}
-
 func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
 	// Old, unoptimized query
 	//
@@ -199,50 +205,3 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
 	}
 	return snapshots, nil
 }
-
-func runWorker(id int, db *sqlx.DB) {
-	// Start the transaction
-	tx, err := db.Beginx()
-	if err != nil {
-		logging.LogError("Failed to begin transaction: %w", err)
-	}
-
-	defer func() {
-		err = tx.Commit()
-		if err != nil {
-			logging.LogError("[%d] Failed to commit transaction: %w", id, err)
-			tx.Rollback()
-		}
-	}()
-
-	snapshots, err := GetRandomSnapshotsDistinctHosts(tx)
-
-	if err != nil {
-		logging.LogError("[%d] Error retrieving snapshot: %w", id, err)
-		time.Sleep(10 * time.Second)
-		return
-	} else if len(snapshots) == 0 {
-		logging.LogInfo("[%d] No remaining snapshots to visit.", id)
-		time.Sleep(1 * time.Minute)
-		return
-	}
-	total := len(snapshots)
-	for i, s := range snapshots {
-		if InBlacklist(&s) {
-			logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
-		}
-		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
-		err = workOnSnapshot(id, tx, &s)
-		if err != nil {
-			logging.LogError("[%d] [%s] Error %w", id, s.URL, err)
-			// TODO Remove panic and gracefully handle/log error
-			fmt.Printf("Error %s Stack trace:\n%s", err, debug.Stack())
-			panic("ERROR encountered")
-		}
-		if s.Error.Valid {
-			logging.LogWarn("[%d] [%s] Error: %v", id, s.URL, fmt.Errorf(s.Error.String))
-		}
-		logging.LogDebug("[%d] Done %d/%d.", id, i, total)
-	}
-	logging.LogInfo("[%d] Worker done.", id)
-}
diff --git a/util/util.go b/util/util.go
new file mode 100644
index 0000000..ddf2dfe
--- /dev/null
+++ b/util/util.go
@@ -0,0 +1,11 @@
+package util
+
+import (
+	"fmt"
+	"runtime/debug"
+)
+
+func PrintStackAndPanic(err error) {
+	fmt.Printf("Error %s Stack trace:\n%s", err, debug.Stack())
+	panic("PANIC")
+}