Compare commits

...

2 Commits

SHA1       Message                             Date
02015faa81 Add robots.txt checking             2024-10-23 14:28:49 +03:00
           Still needs periodic cache refresh
c49a69728a Simplify robots.txt parsing logic   2024-10-23 14:28:49 +03:00
7 changed files with 118 additions and 67 deletions

View File

@@ -10,10 +10,10 @@ A Gemini crawler.
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
- [x] Follow robots.txt
## TODO
- [ ] Follow robots.txt gemini://geminiprotocol.net/docs/companion/
- [ ] Test with gemini://alexey.shpakovsky.ru/maze
- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi
- [ ] Proper handling of all response codes
- [ ] Handle 3X redirects properly
- [ ] Handle URLs that need presentation of a TLS cert, like astrobotany

View File

@@ -1,5 +0,0 @@
gemi.dev
kennedy.gemi.dev
alexey.shpakovsky.ru
musicbrainz.uploadedlobster.com
gemini.bunburya.eu

View File

@@ -1,22 +0,0 @@
package gemini

import "gemini-grc/logging"

var Blacklist *[]string

func InBlacklist(s *Snapshot) bool {
	if Blacklist == nil {
		data := ReadLines("blacklists/domains.txt")
		Blacklist = &data
		logging.LogInfo("Loaded %d blacklisted domains", len(*Blacklist))
	}
	for _, l := range *Blacklist {
		if s.Host == l {
			return true
		}
		// if strings.HasPrefix(s.URL.String(), l) {
		// 	return true
		// }
	}
	return false
}
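The removed helper lazily initialized a package-level pointer from the workers and scanned a slice on every call. If a static domain blacklist is ever reintroduced, a minimal sketch along these lines, assuming the existing ReadLines helper and Snapshot type (the name InDomainBlacklist and the sync.Once wiring are illustrative, not part of this change), would give synchronized initialization and O(1) lookups:

package gemini

import "sync"

var (
	domainBlacklistOnce sync.Once
	domainBlacklist     map[string]struct{}
)

// InDomainBlacklist reports whether the snapshot's host appears in
// blacklists/domains.txt. The list is loaded once, guarded by a
// sync.Once, and kept in a set instead of a slice.
func InDomainBlacklist(s *Snapshot) bool {
	domainBlacklistOnce.Do(func() {
		domainBlacklist = make(map[string]struct{})
		for _, d := range ReadLines("blacklists/domains.txt") {
			domainBlacklist[d] = struct{}{}
		}
	})
	_, found := domainBlacklist[s.Host]
	return found
}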

gemini/robotmatch.go Normal file
View File

@@ -0,0 +1,80 @@
package gemini

import (
	"fmt"
	"gemini-grc/logging"
	"strings"
	"sync"
)

// RobotsCache maps "host:port" keys to either an empty []string
// (no robots.txt data for that host) or a list of disallowed
// URL prefixes taken from its robots.txt.
var RobotsCache sync.Map

func populateBlacklist(key string) (entries []string) {
	// Store either an empty list (no rules) or a list of disallowed
	// URL prefixes. This also applies when finding or downloading
	// robots.txt fails.
	defer func() {
		RobotsCache.Store(key, entries)
	}()
	url := fmt.Sprintf("gemini://%s/robots.txt", key)
	robotsContent, err := ConnectAndGetData(url)
	if err != nil {
		logging.LogDebug("robots.txt error %s", err)
		return []string{}
	}
	robotsData, err := processData(robotsContent)
	if err != nil {
		logging.LogDebug("robots.txt error %s", err)
		return []string{}
	}
	if robotsData.ResponseCode != 20 {
		logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
		return []string{}
	}
	// Some servers return text/plain, others text/gemini.
	// According to the spec, the former is correct, but be lenient.
	var data string
	if robotsData.MimeType == "text/plain" {
		data = string(robotsData.Data)
	} else if robotsData.MimeType == "text/gemini" {
		data = robotsData.GemText
	} else {
		return []string{}
	}
	entries = ParseRobotsTxt(data, key)
	return entries
}

func RobotMatch(s *Snapshot) bool {
	logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
	key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
	v, ok := RobotsCache.Load(key)
	if !ok {
		logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
		disallowedURLs := populateBlacklist(key)
		for _, url := range disallowedURLs {
			if strings.HasPrefix(s.URL.String(), url) {
				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
				return true
			}
		}
	} else {
		if len(v.([]string)) == 0 {
			logging.LogDebug("No robots.txt or no rules, allowed")
			return false
		}
		for _, url := range v.([]string) {
			if strings.HasPrefix(s.URL.String(), url) {
				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
				return true
			}
		}
	}
	return false
}
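The first commit's message notes that a periodic cache refresh is still missing. One simple approach is to clear the sync.Map on a timer so robots.txt rules are eventually re-fetched. A minimal sketch, building on RobotsCache and logging above and assuming the crawler calls it once at startup (the function name and interval handling are illustrative, not part of this change):

package gemini

import (
	"time"

	"gemini-grc/logging"
)

// StartRobotsCacheRefresh clears the robots.txt cache every interval,
// forcing the next RobotMatch for each host to re-fetch robots.txt.
func StartRobotsCacheRefresh(interval time.Duration) {
	go func() {
		for range time.Tick(interval) {
			RobotsCache.Range(func(key, _ any) bool {
				RobotsCache.Delete(key)
				return true
			})
			logging.LogDebug("robots.txt cache cleared")
		}
	}()
}

A per-entry TTL stored alongside the prefixes would avoid re-fetching every host at once, at the cost of a slightly larger cache value.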

View File

@@ -1,35 +1,21 @@
package gemini

import (
	"bufio"
	"fmt"
	"strings"
)

// ParseRobotsTxt takes robots.txt content and a host, returns list of full URLs that shouldn't be visited
// Takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't
// be visited.
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
	scanner := bufio.NewScanner(strings.NewReader(content))
	var disallowedPaths []string
	// Skip everything until we find "User-agent: *" line
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if strings.ToLower(line) == "user-agent: *" {
			break
		}
	}
	// Now collect all Disallow paths
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		// Stop if we hit another User-agent section
		if strings.HasPrefix(strings.ToLower(line), "user-agent:") {
			break
		}
		// Parse Disallow lines
		if strings.HasPrefix(strings.ToLower(line), "disallow:") {
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		line = strings.ToLower(line)
		if strings.HasPrefix(line, "disallow:") {
			parts := strings.SplitN(line, ":", 2)
			if len(parts) == 2 {
				path := strings.TrimSpace(parts[1])
@@ -41,6 +27,5 @@ func ParseRobotsTxt(content string, host string) []string {
			}
		}
	}
	return disallowedPaths
}
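The TODO above asks about honouring user agents. A sketch of section-aware parsing in the same split-based style, close to what the scanner-based version did; the function name and the agent parameter are illustrative, and groups declared with several consecutive User-agent lines are not handled:

package gemini

import (
	"fmt"
	"strings"
)

// parseRobotsTxtForAgent collects Disallow paths only from sections
// whose User-agent line is "*" or matches agent, and prefixes them
// with gemini://<host> as the existing parser does.
func parseRobotsTxtForAgent(content, host, agent string) []string {
	var disallowed []string
	applies := false
	for _, raw := range strings.Split(content, "\n") {
		line := strings.ToLower(strings.TrimSpace(raw))
		switch {
		case strings.HasPrefix(line, "user-agent:"):
			ua := strings.TrimSpace(strings.TrimPrefix(line, "user-agent:"))
			applies = ua == "*" || ua == strings.ToLower(agent)
		case applies && strings.HasPrefix(line, "disallow:"):
			path := strings.TrimSpace(strings.TrimPrefix(line, "disallow:"))
			if path != "" {
				disallowed = append(disallowed, fmt.Sprintf("gemini://%s%s", host, path))
			}
		}
	}
	return disallowed
}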

View File

@@ -1,8 +1,8 @@
package gemini

import (
	"testing"
	"reflect"
	"testing"
)

func TestParseRobotsTxt(t *testing.T) {
@@ -15,6 +15,7 @@ Disallow: /admin/`
	expected := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}
	result := ParseRobotsTxt(input, "example.com")
@@ -23,3 +24,13 @@ Disallow: /admin/`
		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
	}
}

func TestParseRobotsTxtEmpty(t *testing.T) {
	input := ``
	result := ParseRobotsTxt(input, "example.com")
	if len(result) != 0 {
		t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
	}
}
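As more cases accumulate, a table-driven layout keeps them in one place. A sketch that only reuses expectations already asserted above (the test name and case labels are illustrative):

package gemini

import (
	"reflect"
	"testing"
)

func TestParseRobotsTxtTable(t *testing.T) {
	cases := []struct {
		name    string
		content string
		want    []string
	}{
		{name: "empty input", content: "", want: nil},
		{
			name:    "single disallow",
			content: "User-agent: *\nDisallow: /admin/",
			want:    []string{"gemini://example.com/admin/"},
		},
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got := ParseRobotsTxt(c.content, "example.com")
			if len(got) == 0 && len(c.want) == 0 {
				return
			}
			if !reflect.DeepEqual(got, c.want) {
				t.Errorf("ParseRobotsTxt(%q) = %v, want %v", c.content, got, c.want)
			}
		})
	}
}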

View File

@@ -30,12 +30,17 @@ func printPoolIPs() {
}

func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	// Wrap errors with more info.
	defer func() {
	// If URL matches a robots.txt disallow line,
	// add it as an error so next time it won't be
	// crawled.
	if RobotMatch(s) {
		s.Error = null.StringFrom("robots.txt disallow match")
		err = SaveSnapshotToDB(tx, s)
		if err != nil {
			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
			return fmt.Errorf("[%d] DB Error: %w", id, err)
		}
	}()
		return nil
	}
	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
@@ -92,9 +97,9 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	for _, link := range *s.Links {
		if shouldPersistURL(tx, link) {
			newSnapshot := &Snapshot{
				UID: uid.UID(),
				URL: link,
				Host: link.Hostname,
				UID:       uid.UID(),
				URL:       link,
				Host:      link.Hostname,
				Timestamp: timestamp,
			}
			batchSnapshots = append(batchSnapshots, newSnapshot)
@@ -228,9 +233,6 @@ func runWorker(id int, db *sqlx.DB) {
	}
	total := len(snapshots)
	for i, s := range snapshots {
		if InBlacklist(&s) {
			logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
		}
		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
		err = workOnSnapshot(id, tx, &s)
		if err != nil {