Add robots.txt checking
Still needs periodic cache refresh
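
The pending cache refresh could be handled by storing a fetch timestamp next to the cached rules and re-fetching after a TTL. A minimal sketch, assuming the "host:port" keyed sync.Map layout used by RobotsCache in gemini/robotmatch.go below; the robotsEntry wrapper, the helper names, and the 24h TTL are illustrative and not part of this commit:

package gemini

import (
	"sync"
	"time"
)

// robotsEntry is a hypothetical wrapper pairing the cached
// disallow prefixes with the time they were fetched.
type robotsEntry struct {
	prefixes  []string
	fetchedAt time.Time
}

// robotsTTL is an assumed refresh interval.
const robotsTTL = 24 * time.Hour

var robotsCacheWithTTL sync.Map // key: "host:port", value: robotsEntry

// cachedPrefixes returns the cached prefixes for key, or ok=false
// when there is no entry or the entry is stale and should be
// re-fetched (e.g. via populateBlacklist).
func cachedPrefixes(key string) (prefixes []string, ok bool) {
	v, found := robotsCacheWithTTL.Load(key)
	if !found {
		return nil, false
	}
	entry := v.(robotsEntry)
	if time.Since(entry.fetchedAt) > robotsTTL {
		// Stale entry: drop it so the caller re-populates.
		robotsCacheWithTTL.Delete(key)
		return nil, false
	}
	return entry.prefixes, true
}

// storePrefixes records freshly fetched prefixes with a timestamp.
func storePrefixes(key string, prefixes []string) {
	robotsCacheWithTTL.Store(key, robotsEntry{prefixes: prefixes, fetchedAt: time.Now()})
}
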
@@ -10,10 +10,10 @@ A Gemini crawler.
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
- [x] Follow robots.txt

## TODO

- [ ] Follow robots.txt gemini://geminiprotocol.net/docs/companion/
- [ ] Test with gemini://alexey.shpakovsky.ru/maze
- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi
- [ ] Proper handling of all response codes
- [ ] Handle 3X redirects properly
- [ ] Handle URLs that need presentation of a TLS cert, like astrobotany
@@ -1,5 +0,0 @@
gemi.dev
kennedy.gemi.dev
alexey.shpakovsky.ru
musicbrainz.uploadedlobster.com
gemini.bunburya.eu
@@ -42,4 +42,5 @@ CREATE INDEX idx_lang ON snapshots (lang);
CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host);
-- Add the unprocessed snapshots index here! check db
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;
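
For context, the partial index above targets the crawler's "fetch unprocessed snapshots" lookup. A rough sqlx sketch of that kind of query; the function name, the selected column, and the LIMIT are illustrative, only the snapshots table and the response_code/error columns come from the schema above:

package gemini

import (
	"github.com/jmoiron/sqlx"
)

// UnprocessedHosts illustrates the query that
// idx_response_code_error_nulls serves: snapshots with
// neither a response code nor an error recorded yet.
func UnprocessedHosts(db *sqlx.DB, limit int) ([]string, error) {
	var hosts []string
	err := db.Select(&hosts,
		`SELECT host
		   FROM snapshots
		  WHERE response_code IS NULL
		    AND error IS NULL
		  LIMIT $1`, limit)
	if err != nil {
		return nil, err
	}
	return hosts, nil
}
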
@@ -1,22 +0,0 @@
package gemini

import "gemini-grc/logging"

var Blacklist *[]string

func InBlacklist(s *Snapshot) bool {
	if Blacklist == nil {
		data := ReadLines("blacklists/domains.txt")
		Blacklist = &data
		logging.LogInfo("Loaded %d blacklisted domains", len(*Blacklist))
	}
	for _, l := range *Blacklist {
		if s.Host == l {
			return true
		}
		// if strings.HasPrefix(s.URL.String(), l) {
		// 	return true
		// }
	}
	return false
}
gemini/robotmatch.go (new file)
@@ -0,0 +1,83 @@
package gemini

import (
	"fmt"
	"gemini-grc/logging"
	"strings"
	"sync"
)

// RobotsCache maps "host:port" (string) to either
// an empty []string (no robots.txt data) or a list
// of disallowed URL prefixes ([]string).
var RobotsCache sync.Map

func populateBlacklist(key string) (entries []string) {
	// We store either an empty list (no rules) or a list
	// of disallowed URL prefixes. This also applies when
	// we fail to find or download robots.txt.
	defer func() {
		RobotsCache.Store(key, entries)
	}()
	url := fmt.Sprintf("gemini://%s/robots.txt", key)
	robotsContent, err := ConnectAndGetData(url)
	if err != nil {
		logging.LogDebug("robots.txt error %s", err)
		return []string{}
	}
	robotsData, err := processData(robotsContent)
	if err != nil {
		logging.LogDebug("robots.txt error %s", err)
		return []string{}
	}
	if robotsData.ResponseCode != 20 {
		logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
		return []string{}
	}
	// Some servers return text/plain, others text/gemini.
	// According to the spec the former is correct,
	// but let's be lenient.
	var data string
	if robotsData.MimeType == "text/plain" {
		data = string(robotsData.Data)
	} else if robotsData.MimeType == "text/gemini" {
		data = robotsData.GemText
	} else {
		return []string{}
	}
	entries = ParseRobotsTxt(data, key)
	return entries
}

// RobotMatch reports whether the snapshot URL matches
// a robots.txt disallow rule.
func RobotMatch(s *Snapshot) bool {
	logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
	key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
	v, ok := RobotsCache.Load(key)
	if !ok {
		// First time we see this host: populate the robots cache.
		logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
		disallowedURLs := populateBlacklist(key)
		for _, url := range disallowedURLs {
			if strings.HasPrefix(s.URL.String(), url) {
				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
				return true
			}
		}
	} else {
		if len(v.([]string)) == 0 {
			logging.LogDebug("No robots.txt or no rules, allowed")
			return false
		}
		for _, url := range v.([]string) {
			if strings.HasPrefix(s.URL.String(), url) {
				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
				return true
			}
		}
	}
	return false
}
@@ -1,8 +1,8 @@
package gemini

import (
	"reflect"
	"testing"
)

func TestParseRobotsTxt(t *testing.T) {
@@ -15,6 +15,7 @@ Disallow: /admin/`
	expected := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}

	result := ParseRobotsTxt(input, "example.com")
@@ -23,3 +24,13 @@ Disallow: /admin/`
		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
	}
}

func TestParseRobotsTxtEmpty(t *testing.T) {
	input := ``

	result := ParseRobotsTxt(input, "example.com")

	if len(result) != 0 {
		t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
	}
}
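
The ParseRobotsTxt implementation itself is not part of this diff. Below is a minimal sketch consistent with the tests above: each "Disallow:" path becomes a full URL prefix under gemini://<key>. User-agent sections are deliberately ignored here, which is an assumption about the real code, and the function name is changed to make clear it is only an illustration:

package gemini

import "strings"

// parseRobotsTxtSketch mirrors the behaviour the tests expect from
// ParseRobotsTxt: every "Disallow:" path is turned into a full URL
// prefix under gemini://<key>. Blank paths and non-Disallow lines
// are skipped; User-agent handling is omitted in this sketch.
func parseRobotsTxtSketch(input string, key string) []string {
	var entries []string
	for _, line := range strings.Split(input, "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "Disallow:") {
			continue
		}
		path := strings.TrimSpace(strings.TrimPrefix(line, "Disallow:"))
		if path == "" {
			continue
		}
		entries = append(entries, "gemini://"+key+path)
	}
	return entries
}
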
@@ -30,12 +30,17 @@ func printPoolIPs() {
}

func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	// Wrap errors with more info.
	defer func() {
		if err != nil {
			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
		}
	}()
	// If the URL matches a robots.txt disallow line,
	// save it with an error so next time it won't be crawled.
	if RobotMatch(s) {
		s.Error = null.StringFrom("robots.txt disallow match")
		err = SaveSnapshotToDB(tx, s)
		if err != nil {
			return fmt.Errorf("[%d] DB Error: %w", id, err)
		}
		return nil
	}

	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
@@ -88,19 +93,19 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	if s.Links != nil {
		var batchSnapshots []*Snapshot
		timestamp := null.TimeFrom(time.Now())

		for _, link := range *s.Links {
			if shouldPersistURL(tx, link) {
				newSnapshot := &Snapshot{
					UID:       uid.UID(),
					URL:       link,
					Host:      link.Hostname,
					Timestamp: timestamp,
				}
				batchSnapshots = append(batchSnapshots, newSnapshot)
			}
		}

		if len(batchSnapshots) > 0 {
			logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
			err = SaveLinksToDB(tx, batchSnapshots)
@@ -228,9 +233,6 @@ func runWorker(id int, db *sqlx.DB) {
	}
	total := len(snapshots)
	for i, s := range snapshots {
		if InBlacklist(&s) {
			logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
		}
		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
		err = workOnSnapshot(id, tx, &s)
		if err != nil {