From ef3f0097098293b504b8d7468c5736d301fa94ed Mon Sep 17 00:00:00 2001 From: antanst Date: Wed, 23 Oct 2024 14:24:10 +0300 Subject: [PATCH] Add robots.txt checking Still needs periodic cache refresh --- .gitignore | 1 + README.md | 4 +- blacklists/domains.txt | 5 -- db/initdb.sql | 3 + db/stats.sql | 2 +- gemini/blacklist.go | 22 ------ gemini/gemini.go | 50 ++++++------- gemini/persistence.go | 28 ++++++++ gemini/robotmatch.go | 83 +++++++++++++++++++++ gemini/robots_test.go | 13 +++- gemini/worker.go | 159 +++++++++++++++-------------------------- util/util.go | 11 +++ 12 files changed, 225 insertions(+), 156 deletions(-) delete mode 100644 blacklists/domains.txt delete mode 100644 gemini/blacklist.go create mode 100644 gemini/robotmatch.go create mode 100644 util/util.go diff --git a/.gitignore b/.gitignore index 7ef970b..160e6bc 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ **/*~ /cmd /db/initdb.sql +/db/*sh /run*.sh /gemini-grc /snaps diff --git a/README.md b/README.md index 397613b..9b17907 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,10 @@ A Gemini crawler. - [x] Configuration via environment variables - [x] Storing snapshots in PostgreSQL - [x] Proper response header & body UTF-8 and format validation +- [x] Follow robots.txt ## TODO -- [ ] Follow robots.txt gemini://geminiprotocol.net/docs/companion/ - - [ ] Test with gemini://alexey.shpakovsky.ru/maze +- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi - [ ] Proper handling of all response codes - [ ] Handle 3X redirects properly - [ ] Handle URLs that need presentation of a TLS cert, like astrobotany diff --git a/blacklists/domains.txt b/blacklists/domains.txt deleted file mode 100644 index 9f956ff..0000000 --- a/blacklists/domains.txt +++ /dev/null @@ -1,5 +0,0 @@ -gemi.dev -kennedy.gemi.dev -alexey.shpakovsky.ru -musicbrainz.uploadedlobster.com -gemini.bunburya.eu diff --git a/db/initdb.sql b/db/initdb.sql index 424fd51..9071604 100644 --- a/db/initdb.sql +++ b/db/initdb.sql @@ -42,4 +42,7 @@ CREATE INDEX idx_lang ON snapshots (lang); CREATE INDEX idx_response_code ON snapshots (response_code); CREATE INDEX idx_error ON snapshots (error); CREATE INDEX idx_host ON snapshots (host); +CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host) +WHERE response_code IS NULL AND error IS NULL +INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang); CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL; diff --git a/db/stats.sql b/db/stats.sql index afbba51..a509ee7 100644 --- a/db/stats.sql +++ b/db/stats.sql @@ -1,5 +1,5 @@ SELECT COUNT(CASE WHEN response_code IS NOT NULL AND error IS NULL THEN 1 END) AS "Visited", - COUNT(CASE WHEN response_code IS NULL THEN 1 END) AS "Pending", + COUNT(CASE WHEN response_code IS NULL AND error IS NULL THEN 1 END) AS "Pending", COUNT(CASE WHEN error IS NOT NULL THEN 1 END) AS "Errors" FROM snapshots; diff --git a/gemini/blacklist.go b/gemini/blacklist.go deleted file mode 100644 index 2f2afee..0000000 --- a/gemini/blacklist.go +++ /dev/null @@ -1,22 +0,0 @@ -package gemini - -import "gemini-grc/logging" - -var Blacklist *[]string - -func InBlacklist(s *Snapshot) bool { - if Blacklist == nil { - data := ReadLines("blacklists/domains.txt") - Blacklist = &data - logging.LogInfo("Loaded %d blacklisted domains", len(*Blacklist)) - } - for _, l := range *Blacklist { - if s.Host == l { - return true - } - // if strings.HasPrefix(s.URL.String(), l) { - // return true - // } - } - return false -} diff --git a/gemini/gemini.go b/gemini/gemini.go index f7341f6..eb24d4e 100644 --- a/gemini/gemini.go +++ b/gemini/gemini.go @@ -5,14 +5,14 @@ import ( "fmt" "gemini-grc/logging" "net/url" - go_url "net/url" + gourl "net/url" "regexp" "strconv" "strings" ) func isGeminiURL(url string) bool { - _, err := go_url.Parse(url) + _, err := gourl.Parse(url) if err != nil { logging.LogWarn("[%s] Invalid URL: %v", url, err) return false @@ -36,17 +36,17 @@ func checkGeminiStatusCode(code int) error { case code == 20: return nil case code >= 10 && code < 20: - return fmt.Errorf("Gemini response %d needs data input", code) + return fmt.Errorf("gemini response %d needs data input", code) case code >= 30 && code < 40: - return fmt.Errorf("Gemini response %d redirect", code) + return fmt.Errorf("gemini response %d redirect", code) case code >= 40 && code < 50: - return fmt.Errorf("Gemini response %d server error", code) + return fmt.Errorf("gemini response %d server error", code) case code >= 50 && code < 60: - return fmt.Errorf("Gemini response %d server permanent error", code) + return fmt.Errorf("gemini response %d server permanent error", code) case code >= 60 && code < 70: - return fmt.Errorf("Gemini response %d certificate error", code) + return fmt.Errorf("gemini response %d certificate error", code) default: - return fmt.Errorf("Unexpected/unhandled Gemini response %d", code) + return fmt.Errorf("unexpected/unhandled Gemini response %d", code) } } @@ -57,14 +57,14 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot { // Normalize URLs in links, and store them in snapshot for _, line := range linkLines { - normalizedLink, descr, error := NormalizeLink(line, snapshot.URL.String()) - if error != nil { - logging.LogWarn("Cannot normalize URL in line '%s': %v", line, error) + normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String()) + if err != nil { + logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err) continue } - geminiUrl, error := ParseUrl(normalizedLink, descr) - if error != nil { - logging.LogWarn("Cannot parse URL in link '%s': %v", line, error) + geminiUrl, err := ParseUrl(normalizedLink, descr) + if err != nil { + logging.LogDebug("Cannot parse URL in link '%s': %v", line, err) continue } if snapshot.Links == nil { @@ -79,18 +79,18 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot { func ParseUrl(input string, descr string) (*GeminiUrl, error) { u, err := url.Parse(input) if err != nil { - return nil, fmt.Errorf("Error parsing URL %s: %w", input, err) + return nil, fmt.Errorf("error parsing URL %s: %w", input, err) } protocol := u.Scheme hostname := u.Hostname() - str_port := u.Port() + strPort := u.Port() path := u.Path - if str_port == "" { - str_port = "1965" + if strPort == "" { + strPort = "1965" } - port, err := strconv.Atoi(str_port) + port, err := strconv.Atoi(strPort) if err != nil { - return nil, fmt.Errorf("Error parsing URL %s: %w", input, err) + return nil, fmt.Errorf("error parsing URL %s: %w", input, err) } return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil } @@ -106,14 +106,14 @@ func ExtractLinkLines(gemtext string) []string { return matches } -// Take a single link line and the current URL, +// NormalizeLink takes a single link line and the current URL, // return the URL converted to an absolute URL // and its description. func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) { // Parse the current URL baseURL, err := url.Parse(currentURL) if err != nil { - return "", "", fmt.Errorf("Invalid current URL: %v", err) + return "", "", fmt.Errorf("invalid current URL: %v", err) } // Regular expression to extract the URL part from a link line @@ -123,13 +123,13 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin matches := re.FindStringSubmatch(linkLine) if len(matches) == 0 { // If the line doesn't match the expected format, return it unchanged - return "", "", fmt.Errorf("Not a link line: %v", linkLine) + return "", "", fmt.Errorf("not a link line: %v", linkLine) } originalURLStr := matches[1] _, err = url.QueryUnescape(originalURLStr) if err != nil { - return "", "", fmt.Errorf("Error decoding URL: %w", err) + return "", "", fmt.Errorf("error decoding URL: %w", err) } restOfLine := "" @@ -141,7 +141,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin parsedURL, err := url.Parse(originalURLStr) if err != nil { // If URL parsing fails, return an error - return "", "", fmt.Errorf("Invalid URL '%s': %v", originalURLStr, err) + return "", "", fmt.Errorf("invalid URL '%s': %v", originalURLStr, err) } // Resolve relative URLs against the base URL diff --git a/gemini/persistence.go b/gemini/persistence.go index d10abd5..73809ad 100644 --- a/gemini/persistence.go +++ b/gemini/persistence.go @@ -57,6 +57,34 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error { return nil } +func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error { + // Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe + const batchSize = 5000 + + query := ` + INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) + VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) + ON CONFLICT (uid) DO NOTHING + ` + + for i := 0; i < len(snapshots); i += batchSize { + end := i + batchSize + if end > len(snapshots) { + end = len(snapshots) + } + + batch := snapshots[i:end] + + _, err := tx.NamedExec(query, batch) + if err != nil { + logging.LogError("Error batch inserting snapshots: %w", err) + return fmt.Errorf("DB error: %w", err) + } + } + + return nil +} + func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error { query := ` INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) diff --git a/gemini/robotmatch.go b/gemini/robotmatch.go new file mode 100644 index 0000000..ddce449 --- /dev/null +++ b/gemini/robotmatch.go @@ -0,0 +1,83 @@ +package gemini + +import ( + "fmt" + "gemini-grc/logging" + "strings" + "sync" +) + +// key: "host:port" (string) +// value: +// empty []string if no robots data, or +// list of URL prefixes ([]string) in robots +var RobotsCache sync.Map + +func populateBlacklist(key string) (entries []string) { + // We either store an empty list when + // no rules, or a list of disallowed URLs. + // This applies even if we have an error + // finding/downloading robots.txt + defer func() { + RobotsCache.Store(key, entries) + }() + url := fmt.Sprintf("gemini://%s/robots.txt", key) + robotsContent, err := ConnectAndGetData(url) + if err != nil { + logging.LogDebug("robots.txt error %s", err) + return []string{} + } + robotsData, err := processData(robotsContent) + if err != nil { + logging.LogDebug("robots.txt error %s", err) + return []string{} + } + if robotsData.ResponseCode != 20 { + logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode) + return []string{} + } + // Some return text/plain, others text/gemini. + // According to spec, the first is correct, + // however let's be lenient + var data string + if robotsData.MimeType == "text/plain" { + data = string(robotsData.Data) + } else if robotsData.MimeType == "text/gemini" { + data = robotsData.GemText + } else { + return []string{} + } + entries = ParseRobotsTxt(string(data), key) + return entries +} + +// Check if the snapshot URL matches +// a robots.txt allow rule. +func RobotMatch(s *Snapshot) bool { + logging.LogDebug("Checking robots.txt cache for %s", s.URL.String()) + key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port) + v, ok := RobotsCache.Load(key) + if ok == false { + // First time check, populate robot cache + logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String()) + disallowedURLs := populateBlacklist(key) + for _, url := range disallowedURLs { + if strings.HasPrefix(s.URL.String(), url) { + logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url) + return true + } + } + } else { + if len(v.([]string)) == 0 { + logging.LogDebug("No robots.txt or no rules, allowed") + return false + } + for _, url := range v.([]string) { + if strings.HasPrefix(s.URL.String(), url) { + logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url) + return true + } + } + } + return false +} diff --git a/gemini/robots_test.go b/gemini/robots_test.go index 2572f67..4a00d12 100644 --- a/gemini/robots_test.go +++ b/gemini/robots_test.go @@ -1,8 +1,8 @@ package gemini import ( - "testing" "reflect" + "testing" ) func TestParseRobotsTxt(t *testing.T) { @@ -15,6 +15,7 @@ Disallow: /admin/` expected := []string{ "gemini://example.com/cgi-bin/wp.cgi/view", "gemini://example.com/cgi-bin/wp.cgi/media", + "gemini://example.com/admin/", } result := ParseRobotsTxt(input, "example.com") @@ -23,3 +24,13 @@ Disallow: /admin/` t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected) } } + +func TestParseRobotsTxtEmpty(t *testing.T) { + input := `` + + result := ParseRobotsTxt(input, "example.com") + + if len(result) != 0 { + t.Errorf("ParseRobotsTxt() = %v, want empty []string", result) + } +} diff --git a/gemini/worker.go b/gemini/worker.go index 20125c9..8bf6acf 100644 --- a/gemini/worker.go +++ b/gemini/worker.go @@ -1,12 +1,11 @@ package gemini import ( - "database/sql" "fmt" "gemini-grc/config" "gemini-grc/logging" "gemini-grc/uid" - "runtime/debug" + "gemini-grc/util" "strings" "time" @@ -25,17 +24,63 @@ func SpawnWorkers(numOfWorkers int, db *sqlx.DB) { } } -func printPoolIPs() { - fmt.Printf("%v", IpPool.IPs) +func runWorker(id int, db *sqlx.DB) { + // Start the DB transaction + tx, err := db.Beginx() + if err != nil { + logging.LogError("Failed to begin transaction: %w", err) + } + + defer func() { + err = tx.Commit() + if err != nil { + logging.LogError("[%d] Failed to commit transaction: %w", id, err) + err := tx.Rollback() + if err != nil { + panic(fmt.Sprintf("[%d] Failed to roll back transaction: %v", id, err)) + } + } + }() + + snapshots, err := GetRandomSnapshotsDistinctHosts(tx) + + if err != nil { + logging.LogError("[%d] Error retrieving snapshot: %w", id, err) + time.Sleep(10 * time.Second) + return + } else if len(snapshots) == 0 { + logging.LogInfo("[%d] No remaining snapshots to visit.", id) + time.Sleep(1 * time.Minute) + return + } + total := len(snapshots) + for i, s := range snapshots { + logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL) + err = workOnSnapshot(id, tx, &s) + if err != nil { + logging.LogError("[%d] [%s] Error %w", id, s.URL, err) + util.PrintStackAndPanic(err) + } + if s.Error.Valid { + logging.LogWarn("[%d] [%s] Error: %v", id, s.URL, fmt.Errorf(s.Error.String)) + } + logging.LogDebug("[%d] Done %d/%d.", id, i, total) + } + logging.LogInfo("[%d] Worker done.", id) } func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { - // Wrap errors with more info. - defer func() { + // If URL matches a robots.txt disallow line, + // add it as an error so next time it won't be + // crawled. + if RobotMatch(s) { + s.Error = null.StringFrom("robots.txt disallow match") + err = SaveSnapshotToDB(tx, s) if err != nil { - err = fmt.Errorf("[%d] Worker Error: %w", id, err) + return fmt.Errorf("[%d] DB Error: %w", id, err) } - }() + return nil + } IPs, err := getHostIPAddresses(s.Host) if err != nil { @@ -88,22 +133,22 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { if s.Links != nil { var batchSnapshots []*Snapshot timestamp := null.TimeFrom(time.Now()) - + for _, link := range *s.Links { if shouldPersistURL(tx, link) { newSnapshot := &Snapshot{ - UID: uid.UID(), - URL: link, - Host: link.Hostname, + UID: uid.UID(), + URL: link, + Host: link.Hostname, Timestamp: timestamp, } batchSnapshots = append(batchSnapshots, newSnapshot) } } - + if len(batchSnapshots) > 0 { logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots)) - err = SaveLinksToDB(tx, batchSnapshots) + err = SaveLinksToDBinBatches(tx, batchSnapshots) if err != nil { return fmt.Errorf("[%d] DB Error: %w", id, err) } @@ -127,45 +172,6 @@ func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool { return !exists } -// Select a random snapshot. -func GetRandomSnapshot(tx *sqlx.Tx) (*Snapshot, error) { - query := `SELECT * FROM snapshots - WHERE response_code IS NULL - AND error IS NULL - ORDER BY RANDOM() - LIMIT 1 - FOR UPDATE SKIP LOCKED` - // AND (timestamp < NOW() - INTERVAL '1 day' OR timestamp IS NULL) - var snapshot Snapshot - err := tx.Get(&snapshot, query) - if err != nil { - if err == sql.ErrNoRows { - // Handle the case where no rows were found - return nil, nil - } - // Handle other potential errors - return nil, err - } - return &snapshot, nil -} - -func GetRandomSnapshots(tx *sqlx.Tx) ([]Snapshot, error) { - query := ` - SELECT * FROM snapshots - WHERE response_code IS NULL - AND error IS NULL - ORDER BY RANDOM() - LIMIT $1 - FOR UPDATE SKIP LOCKED - ` - var snapshots []Snapshot - err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize) - if err != nil { - return nil, err - } - return snapshots, nil -} - func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) { // Old, unoptimized query // @@ -199,50 +205,3 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) { } return snapshots, nil } - -func runWorker(id int, db *sqlx.DB) { - // Start the transaction - tx, err := db.Beginx() - if err != nil { - logging.LogError("Failed to begin transaction: %w", err) - } - - defer func() { - err = tx.Commit() - if err != nil { - logging.LogError("[%d] Failed to commit transaction: %w", id, err) - tx.Rollback() - } - }() - - snapshots, err := GetRandomSnapshotsDistinctHosts(tx) - - if err != nil { - logging.LogError("[%d] Error retrieving snapshot: %w", id, err) - time.Sleep(10 * time.Second) - return - } else if len(snapshots) == 0 { - logging.LogInfo("[%d] No remaining snapshots to visit.", id) - time.Sleep(1 * time.Minute) - return - } - total := len(snapshots) - for i, s := range snapshots { - if InBlacklist(&s) { - logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL) - } - logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL) - err = workOnSnapshot(id, tx, &s) - if err != nil { - logging.LogError("[%d] [%s] Error %w", id, s.URL, err) - // TODO Remove panic and gracefully handle/log error - fmt.Printf("Error %s Stack trace:\n%s", err, debug.Stack()) - panic("ERROR encountered") - } - if s.Error.Valid { - logging.LogWarn("[%d] [%s] Error: %v", id, s.URL, fmt.Errorf(s.Error.String)) - } - logging.LogDebug("[%d] Done %d/%d.", id, i, total) - } - logging.LogInfo("[%d] Worker done.", id) -} diff --git a/util/util.go b/util/util.go new file mode 100644 index 0000000..ddf2dfe --- /dev/null +++ b/util/util.go @@ -0,0 +1,11 @@ +package util + +import ( + "fmt" + "runtime/debug" +) + +func PrintStackAndPanic(err error) { + fmt.Printf("Error %s Stack trace:\n%s", err, debug.Stack()) + panic("PANIC") +}