Compare commits: daaa61c884 ... 02015faa81

2 commits: 02015faa81, c49a69728a
@@ -10,10 +10,10 @@ A Gemini crawler.
 - [x] Configuration via environment variables
 - [x] Storing snapshots in PostgreSQL
 - [x] Proper response header & body UTF-8 and format validation
+- [x] Follow robots.txt

 ## TODO
-- [ ] Follow robots.txt gemini://geminiprotocol.net/docs/companion/
+- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi
-- [ ] Test with gemini://alexey.shpakovsky.ru/maze
 - [ ] Proper handling of all response codes
 - [ ] Handle 3X redirects properly
 - [ ] Handle URLs that need presentation of a TLS cert, like astrobotany
@@ -1,5 +0,0 @@
-gemi.dev
-kennedy.gemi.dev
-alexey.shpakovsky.ru
-musicbrainz.uploadedlobster.com
-gemini.bunburya.eu
@@ -1,22 +0,0 @@
-package gemini
-
-import "gemini-grc/logging"
-
-var Blacklist *[]string
-
-func InBlacklist(s *Snapshot) bool {
-	if Blacklist == nil {
-		data := ReadLines("blacklists/domains.txt")
-		Blacklist = &data
-		logging.LogInfo("Loaded %d blacklisted domains", len(*Blacklist))
-	}
-	for _, l := range *Blacklist {
-		if s.Host == l {
-			return true
-		}
-		// if strings.HasPrefix(s.URL.String(), l) {
-		// 	return true
-		// }
-	}
-	return false
-}
gemini/robotmatch.go (new file, 80 lines)
@@ -0,0 +1,80 @@
+package gemini
+
+import (
+	"fmt"
+	"gemini-grc/logging"
+	"strings"
+	"sync"
+)
+
+// key: "host:port" (string)
+// value:
+// empty []string if no robots data, or
+// list of URL prefixes ([]string) in robots
+var RobotsCache sync.Map
+
+func populateBlacklist(key string) (entries []string) {
+	// We either store an empty list when
+	// no rules, or a list of disallowed URLs.
+	// This applies even if we have an error
+	// finding/downloading robots.txt
+	defer func() {
+		RobotsCache.Store(key, entries)
+	}()
+	url := fmt.Sprintf("gemini://%s/robots.txt", key)
+	robotsContent, err := ConnectAndGetData(url)
+	if err != nil {
+		logging.LogDebug("robots.txt error %s", err)
+		return []string{}
+	}
+	robotsData, err := processData(robotsContent)
+	if err != nil {
+		logging.LogDebug("robots.txt error %s", err)
+		return []string{}
+	}
+	if robotsData.ResponseCode != 20 {
+		logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
+		return []string{}
+	}
+	// Some return text/plain, others text/gemini.
+	// According to spec, the first is correct,
+	// however let's be lenient
+	var data string
+	if robotsData.MimeType == "text/plain" {
+		data = string(robotsData.Data)
+	} else if robotsData.MimeType == "text/gemini" {
+		data = robotsData.GemText
+	} else {
+		return []string{}
+	}
+	entries = ParseRobotsTxt(string(data), key)
+	return entries
+}
+
+func RobotMatch(s *Snapshot) bool {
+	logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
+	key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
+	v, ok := RobotsCache.Load(key)
+	if ok == false {
+		logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
+		disallowedURLs := populateBlacklist(key)
+		for _, url := range disallowedURLs {
+			if strings.HasPrefix(s.URL.String(), url) {
+				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
+				return true
+			}
+		}
+	} else {
+		if len(v.([]string)) == 0 {
+			logging.LogDebug("No robots.txt or no rules, allowed")
+			return false
+		}
+		for _, url := range v.([]string) {
+			if strings.HasPrefix(s.URL.String(), url) {
+				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
+				return true
+			}
+		}
+	}
+	return false
+}
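The cache scheme used above (a sync.Map keyed by "host:port", holding either an empty slice or a list of disallowed URL prefixes, populated on first miss) can be shown as a small standalone sketch. The fetchRobots stub and the sample data below are hypothetical stand-ins for the crawler's ConnectAndGetData/ParseRobotsTxt path, not its actual API.

```go
package main

import (
	"fmt"
	"strings"
	"sync"
)

// robotsCache maps "host:port" to a []string of disallowed URL prefixes.
// An empty slice means "no robots.txt or no rules".
var robotsCache sync.Map

// fetchRobots stands in for downloading and parsing gemini://host:port/robots.txt.
func fetchRobots(key string) []string {
	if key == "example.com:1965" {
		return []string{"gemini://example.com/cgi-bin/"}
	}
	return []string{}
}

// disallowed reports whether url matches a cached (or freshly fetched) prefix.
func disallowed(key, url string) bool {
	v, ok := robotsCache.Load(key)
	if !ok {
		entries := fetchRobots(key)
		robotsCache.Store(key, entries)
		v = entries
	}
	for _, prefix := range v.([]string) {
		if strings.HasPrefix(url, prefix) {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(disallowed("example.com:1965", "gemini://example.com/cgi-bin/wp.cgi/view")) // true
	fmt.Println(disallowed("example.com:1965", "gemini://example.com/index.gmi"))           // false
}
```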
@@ -1,35 +1,21 @@
 package gemini
 
 import (
-	"bufio"
 	"fmt"
 	"strings"
 )
 
-// ParseRobotsTxt takes robots.txt content and a host, returns list of full URLs that shouldn't be visited
+// Takes robots.txt content and a host, and
+// returns a list of full URLs that shouldn't
+// be visited.
+// TODO Also take into account the user agent?
+// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
 func ParseRobotsTxt(content string, host string) []string {
-	scanner := bufio.NewScanner(strings.NewReader(content))
 	var disallowedPaths []string
-	// Skip everything until we find "User-agent: *" line
-	for scanner.Scan() {
-		line := strings.TrimSpace(scanner.Text())
-		if strings.ToLower(line) == "user-agent: *" {
-			break
-		}
-	}
-
-	// Now collect all Disallow paths
-	for scanner.Scan() {
-		line := strings.TrimSpace(scanner.Text())
-
-		// Stop if we hit another User-agent section
-		if strings.HasPrefix(strings.ToLower(line), "user-agent:") {
-			break
-		}
-
-		// Parse Disallow lines
-		if strings.HasPrefix(strings.ToLower(line), "disallow:") {
+	for _, line := range strings.Split(content, "\n") {
+		line = strings.TrimSpace(line)
+		line = strings.ToLower(line)
+		if strings.HasPrefix(line, "disallow:") {
 			parts := strings.SplitN(line, ":", 2)
 			if len(parts) == 2 {
 				path := strings.TrimSpace(parts[1])
@@ -41,6 +27,5 @@ func ParseRobotsTxt(content string, host string) []string {
 			}
 		}
 	}
-
 	return disallowedPaths
 }
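For reference, a self-contained sketch of what the simplified parser does with a sample robots.txt: split into lines, trim and lowercase, keep the Disallow entries regardless of User-agent section, and turn each path into a full gemini:// URL for the host. The function name disallowedURLs and the empty-path check are illustrative assumptions, not the crawler's own code.

```go
package main

import (
	"fmt"
	"strings"
)

// disallowedURLs mirrors the simplified parsing approach: every
// "disallow:" line becomes a gemini://<host><path> prefix.
func disallowedURLs(content, host string) []string {
	var urls []string
	for _, line := range strings.Split(content, "\n") {
		line = strings.ToLower(strings.TrimSpace(line))
		if !strings.HasPrefix(line, "disallow:") {
			continue
		}
		parts := strings.SplitN(line, ":", 2)
		if len(parts) == 2 {
			path := strings.TrimSpace(parts[1])
			if path != "" {
				urls = append(urls, fmt.Sprintf("gemini://%s%s", host, path))
			}
		}
	}
	return urls
}

func main() {
	robots := "User-agent: *\nDisallow: /cgi-bin/wp.cgi/view\nDisallow: /admin/"
	fmt.Println(disallowedURLs(robots, "example.com"))
	// [gemini://example.com/cgi-bin/wp.cgi/view gemini://example.com/admin/]
}
```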
@@ -1,8 +1,8 @@
 package gemini
 
 import (
-	"testing"
 	"reflect"
+	"testing"
 )
 
 func TestParseRobotsTxt(t *testing.T) {
@@ -15,6 +15,7 @@ Disallow: /admin/`
 	expected := []string{
 		"gemini://example.com/cgi-bin/wp.cgi/view",
 		"gemini://example.com/cgi-bin/wp.cgi/media",
+		"gemini://example.com/admin/",
 	}
 
 	result := ParseRobotsTxt(input, "example.com")
@@ -23,3 +24,13 @@ Disallow: /admin/`
 		t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
 	}
 }
+
+func TestParseRobotsTxtEmpty(t *testing.T) {
+	input := ``
+
+	result := ParseRobotsTxt(input, "example.com")
+
+	if len(result) != 0 {
+		t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
+	}
+}
@@ -30,12 +30,17 @@ func printPoolIPs() {
 }
 
 func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
-	// Wrap errors with more info.
-	defer func() {
-		if err != nil {
-			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
-		}
-	}()
+	// If URL matches a robots.txt disallow line,
+	// add it as an error so next time it won't be
+	// crawled.
+	if RobotMatch(s) {
+		s.Error = null.StringFrom("robots.txt disallow match")
+		err = SaveSnapshotToDB(tx, s)
+		if err != nil {
+			return fmt.Errorf("[%d] DB Error: %w", id, err)
+		}
+		return nil
+	}
 
 	IPs, err := getHostIPAddresses(s.Host)
 	if err != nil {
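The intent of the change above — treat a robots.txt disallow match as a recorded error so the URL is skipped now and not re-queued later — can be sketched with stand-in types. Snapshot, saveSnapshotToStore, and robotMatch below are simplified placeholders under that assumption, not the project's actual functions or storage layer.

```go
package main

import "fmt"

// Snapshot is a stand-in for the crawler's Snapshot type.
type Snapshot struct {
	URL   string
	Error string
}

// saved stands in for the PostgreSQL snapshots table.
var saved = map[string]Snapshot{}

func saveSnapshotToStore(s Snapshot) error { saved[s.URL] = s; return nil }

// robotMatch stands in for RobotMatch(s) from robotmatch.go.
func robotMatch(s Snapshot) bool { return s.URL == "gemini://example.com/admin/" }

func workOnSnapshot(s Snapshot) error {
	// A disallowed URL is persisted with an error string and skipped,
	// so it will not be picked up again on the next pass.
	if robotMatch(s) {
		s.Error = "robots.txt disallow match"
		if err := saveSnapshotToStore(s); err != nil {
			return fmt.Errorf("DB error: %w", err)
		}
		return nil
	}
	// ...the rest of the crawl (DNS lookup, fetch, link extraction) would go here.
	return nil
}

func main() {
	_ = workOnSnapshot(Snapshot{URL: "gemini://example.com/admin/"})
	fmt.Println(saved["gemini://example.com/admin/"].Error) // robots.txt disallow match
}
```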
@@ -92,9 +97,9 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
 	for _, link := range *s.Links {
 		if shouldPersistURL(tx, link) {
 			newSnapshot := &Snapshot{
 				UID:       uid.UID(),
 				URL:       link,
 				Host:      link.Hostname,
 				Timestamp: timestamp,
 			}
 			batchSnapshots = append(batchSnapshots, newSnapshot)
@@ -228,9 +233,6 @@ func runWorker(id int, db *sqlx.DB) {
 	}
 	total := len(snapshots)
 	for i, s := range snapshots {
-		if InBlacklist(&s) {
-			logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
-		}
 		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
 		err = workOnSnapshot(id, tx, &s)
 		if err != nil {