diff --git a/README.md b/README.md
index 397613b..9b17907 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,10 @@ A Gemini crawler.
 - [x] Configuration via environment variables
 - [x] Storing snapshots in PostgreSQL
 - [x] Proper response header & body UTF-8 and format validation
+- [x] Follow robots.txt
 
 ## TODO
 
-- [ ] Follow robots.txt gemini://geminiprotocol.net/docs/companion/
-  - [ ] Test with gemini://alexey.shpakovsky.ru/maze
+- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi
 - [ ] Proper handling of all response codes
 - [ ] Handle 3X redirects properly
 - [ ] Handle URLs that need presentation of a TLS cert, like astrobotany
diff --git a/blacklists/domains.txt b/blacklists/domains.txt
deleted file mode 100644
index 9f956ff..0000000
--- a/blacklists/domains.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-gemi.dev
-kennedy.gemi.dev
-alexey.shpakovsky.ru
-musicbrainz.uploadedlobster.com
-gemini.bunburya.eu
diff --git a/db/initdb.sql b/db/initdb.sql
index 424fd51..9071604 100644
--- a/db/initdb.sql
+++ b/db/initdb.sql
@@ -42,4 +42,7 @@ CREATE INDEX idx_lang ON snapshots (lang);
 CREATE INDEX idx_response_code ON snapshots (response_code);
 CREATE INDEX idx_error ON snapshots (error);
 CREATE INDEX idx_host ON snapshots (host);
+CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host)
+INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang)
+WHERE response_code IS NULL AND error IS NULL;
 CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error)
 WHERE response_code IS NULL AND error IS NULL;
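(Note: PostgreSQL requires the INCLUDE clause before the WHERE clause, so the new index is written in that order above. It is a partial covering index aimed at the crawler's "fetch unprocessed snapshots" queue query. A minimal sketch of the query shape it serves follows; the schema is the one from initdb.sql, while the DSN, host value, and the Go wrapper itself are illustrative, not the repo's actual code.)

```go
// Illustrative only: the query shape served by
// idx_snapshots_unprocessed_no_data. The WHERE clause matches the
// index predicate and the selected column is in the INCLUDE list,
// so PostgreSQL can answer this with an index-only scan.
package main

import (
	"log"

	"github.com/jmoiron/sqlx"
	_ "github.com/lib/pq"
)

func main() {
	db, err := sqlx.Connect("postgres", "dbname=gemini sslmode=disable") // placeholder DSN
	if err != nil {
		log.Fatal(err)
	}
	var urls []string
	err = db.Select(&urls, `
		SELECT url FROM snapshots
		WHERE response_code IS NULL AND error IS NULL
		  AND host = $1`, "example.org") // placeholder host
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("%d unprocessed URLs", len(urls))
}
```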
diff --git a/gemini/blacklist.go b/gemini/blacklist.go
deleted file mode 100644
index 2f2afee..0000000
--- a/gemini/blacklist.go
+++ /dev/null
@@ -1,22 +0,0 @@
-package gemini
-
-import "gemini-grc/logging"
-
-var Blacklist *[]string
-
-func InBlacklist(s *Snapshot) bool {
-	if Blacklist == nil {
-		data := ReadLines("blacklists/domains.txt")
-		Blacklist = &data
-		logging.LogInfo("Loaded %d blacklisted domains", len(*Blacklist))
-	}
-	for _, l := range *Blacklist {
-		if s.Host == l {
-			return true
-		}
-		// if strings.HasPrefix(s.URL.String(), l) {
-		// 	return true
-		// }
-	}
-	return false
-}
diff --git a/gemini/robotmatch.go b/gemini/robotmatch.go
new file mode 100644
index 0000000..ddce449
--- /dev/null
+++ b/gemini/robotmatch.go
@@ -0,0 +1,76 @@
+package gemini
+
+import (
+	"fmt"
+	"gemini-grc/logging"
+	"strings"
+	"sync"
+)
+
+// key: "host:port" (string)
+// value: an empty []string when there are no
+// robots.txt rules, or a list of disallowed
+// URL prefixes ([]string).
+var RobotsCache sync.Map
+
+func populateBlacklist(key string) (entries []string) {
+	// We store either an empty list (no rules)
+	// or a list of disallowed URL prefixes.
+	// This also applies when we hit an error
+	// finding or downloading robots.txt.
+	defer func() {
+		RobotsCache.Store(key, entries)
+	}()
+	url := fmt.Sprintf("gemini://%s/robots.txt", key)
+	robotsContent, err := ConnectAndGetData(url)
+	if err != nil {
+		logging.LogDebug("robots.txt error %s", err)
+		return []string{}
+	}
+	robotsData, err := processData(robotsContent)
+	if err != nil {
+		logging.LogDebug("robots.txt error %s", err)
+		return []string{}
+	}
+	if robotsData.ResponseCode != 20 {
+		logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
+		return []string{}
+	}
+	// Some servers return text/plain, others text/gemini.
+	// According to the spec the former is correct,
+	// but be lenient and accept both.
+	var data string
+	switch robotsData.MimeType {
+	case "text/plain":
+		data = string(robotsData.Data)
+	case "text/gemini":
+		data = robotsData.GemText
+	default:
+		return []string{}
+	}
+	entries = ParseRobotsTxt(data, key)
+	return entries
+}
+
+// RobotMatch reports whether the snapshot URL
+// matches a robots.txt disallow rule.
+func RobotMatch(s *Snapshot) bool {
+	logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
+	key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
+	var entries []string
+	v, ok := RobotsCache.Load(key)
+	if ok {
+		entries = v.([]string)
+	} else {
+		// First-time check: fetch robots.txt and populate the cache.
+		logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
+		entries = populateBlacklist(key)
+	}
+	for _, url := range entries {
+		if strings.HasPrefix(s.URL.String(), url) {
+			logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
+			return true
+		}
+	}
+	return false
+}
diff --git a/gemini/robots_test.go b/gemini/robots_test.go
index 2572f67..4a00d12 100644
--- a/gemini/robots_test.go
+++ b/gemini/robots_test.go
@@ -1,8 +1,8 @@
 package gemini
 
 import (
-	"testing"
 	"reflect"
+	"testing"
 )
 
 func TestParseRobotsTxt(t *testing.T) {
@@ -15,6 +15,7 @@ Disallow: /admin/`
 	expected := []string{
 		"gemini://example.com/cgi-bin/wp.cgi/view",
 		"gemini://example.com/cgi-bin/wp.cgi/media",
+		"gemini://example.com/admin/",
 	}
 
 	result := ParseRobotsTxt(input, "example.com")
@@ -23,3 +24,13 @@ Disallow: /admin/`
 	if !reflect.DeepEqual(result, expected) {
 		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
 	}
 }
+
+func TestParseRobotsTxtEmpty(t *testing.T) {
+	input := ``
+
+	result := ParseRobotsTxt(input, "example.com")
+
+	if len(result) != 0 {
+		t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
+	}
+}
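(ParseRobotsTxt itself is not part of this patch. For readers, here is a sketch of a parser consistent with the tests above; the name ParseRobotsTxtSketch is hypothetical, and the real implementation may also honor User-agent sections, which this sketch deliberately skips.)

```go
package gemini

import (
	"fmt"
	"strings"
)

// ParseRobotsTxtSketch (hypothetical) expands each "Disallow: /path"
// line into a full URL prefix "gemini://<host>/path", which is the
// shape the tests above expect from ParseRobotsTxt. User-agent lines
// are skipped rather than interpreted.
func ParseRobotsTxtSketch(content string, host string) []string {
	prefixes := []string{}
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "Disallow:") {
			continue
		}
		path := strings.TrimSpace(strings.TrimPrefix(line, "Disallow:"))
		if path == "" {
			continue // a bare "Disallow:" disallows nothing
		}
		prefixes = append(prefixes, fmt.Sprintf("gemini://%s%s", host, path))
	}
	return prefixes
}
```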
diff --git a/gemini/worker.go b/gemini/worker.go
index 20125c9..592e5ad 100644
--- a/gemini/worker.go
+++ b/gemini/worker.go
@@ -30,12 +30,17 @@ func printPoolIPs() {
 }
 
 func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
-	// Wrap errors with more info.
-	defer func() {
+	// If the URL matches a robots.txt disallow rule,
+	// record that as an error so the URL won't be
+	// crawled again.
+	if RobotMatch(s) {
+		s.Error = null.StringFrom("robots.txt disallow match")
+		err = SaveSnapshotToDB(tx, s)
 		if err != nil {
-			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
+			return fmt.Errorf("[%d] DB Error: %w", id, err)
 		}
-	}()
+		return nil
+	}
 
 	IPs, err := getHostIPAddresses(s.Host)
 	if err != nil {
@@ -88,19 +93,19 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
 	if s.Links != nil {
 		var batchSnapshots []*Snapshot
 		timestamp := null.TimeFrom(time.Now())
-
+
 		for _, link := range *s.Links {
 			if shouldPersistURL(tx, link) {
 				newSnapshot := &Snapshot{
-					UID: uid.UID(),
-					URL: link,
-					Host: link.Hostname,
+					UID:       uid.UID(),
+					URL:       link,
+					Host:      link.Hostname,
 					Timestamp: timestamp,
 				}
 				batchSnapshots = append(batchSnapshots, newSnapshot)
 			}
 		}
-
+
 		if len(batchSnapshots) > 0 {
 			logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
 			err = SaveLinksToDB(tx, batchSnapshots)
@@ -228,9 +233,6 @@ func runWorker(id int, db *sqlx.DB) {
 		}
 		total := len(snapshots)
 		for i, s := range snapshots {
-			if InBlacklist(&s) {
-				logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
-			}
 			logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
 			err = workOnSnapshot(id, tx, &s)
 			if err != nil {
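(One caveat on the cache: RobotsCache is a sync.Map populated via a separate Load/Store pair, so two workers hitting the same new host concurrently can both fetch robots.txt. That is harmless but wasteful. A possible follow-up, sketched below with hypothetical names while reusing the existing populateBlacklist helper, de-duplicates the fetch with LoadOrStore and sync.Once.)

```go
package gemini

import "sync"

// robotsEntry pairs the parsed rules with a sync.Once so that only
// one goroutine performs the robots.txt fetch for a given key.
type robotsEntry struct {
	once  sync.Once
	rules []string
}

// robotsOnceCache is a hypothetical replacement for RobotsCache;
// the key is "host:port", as in robotmatch.go.
var robotsOnceCache sync.Map

// robotsRules returns the disallowed URL prefixes for key, fetching
// robots.txt at most once per key even under concurrent callers.
func robotsRules(key string) []string {
	v, _ := robotsOnceCache.LoadOrStore(key, &robotsEntry{})
	entry := v.(*robotsEntry)
	entry.once.Do(func() {
		entry.rules = populateBlacklist(key) // existing fetch+parse helper
	})
	return entry.rules
}
```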