This commit is contained in:
2024-11-18 16:28:45 +02:00
parent f0452ff9f7
commit 825c7e3391
34 changed files with 624 additions and 426 deletions

View File

@@ -1,30 +1,32 @@
package gemini
import (
"errors"
"fmt"
"regexp"
"strings"
"time"
"gemini-grc/config"
"gemini-grc/logging"
"gemini-grc/uid"
"gemini-grc/util"
"strings"
"time"
"github.com/guregu/null/v5"
"github.com/jmoiron/sqlx"
)
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
logging.LogInfo("Spawning %d workers", numOfWorkers)
for i := 0; i < numOfWorkers; i++ {
for i := range numOfWorkers {
go func(i int) {
for {
runWorker(i, db)
RunWorker(i, db, nil)
}
}(i)
}
}
func runWorker(id int, db *sqlx.DB) {
func RunWorker(id int, db *sqlx.DB, url *string) {
// Start the DB transaction
tx, err := db.Beginx()
if err != nil {
@@ -42,38 +44,85 @@ func runWorker(id int, db *sqlx.DB) {
}
}()
snapshots, err := GetRandomSnapshotsDistinctHosts(tx)
var snapshots []Snapshot
if url == nil {
snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
} else {
snapshots, err = GetSnapshotFromURL(tx, *url)
if len(snapshots) == 0 {
snapshotURL, err := ParseURL(*url, "")
if err != nil {
panic("Invalid URL: " + *url)
}
snapshots = []Snapshot{{
UID: uid.UID(),
URL: *snapshotURL,
Host: snapshotURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}}
}
}
if err != nil {
logging.LogError("[%d] Error retrieving snapshot: %w", id, err)
time.Sleep(10 * time.Second)
return
} else if len(snapshots) == 0 {
logging.LogInfo("[%d] No remaining snapshots to visit.", id)
logging.LogInfo("[%d] No snapshots to visit.", id)
time.Sleep(1 * time.Minute)
return
}
total := len(snapshots)
for i, s := range snapshots {
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
err = workOnSnapshot(id, tx, &s)
if err != nil {
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL, err)
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL.String(), err)
util.PrintStackAndPanic(err)
}
if s.Error.Valid {
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL, s.Error.String)
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL.String(), s.Error.String)
}
logging.LogDebug("[%d] Done %d/%d.", id, i, total)
logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
}
logging.LogInfo("[%d] Worker done.", id)
}
func handleRedirection(tx *sqlx.Tx, s *Snapshot) error {
re := regexp.MustCompile(`gemini://\S+`)
matches := re.FindStringSubmatch(s.Error.ValueOrZero())
if len(matches) == 1 {
newURL := matches[0]
logging.LogDebug("Page redirects to %s", newURL)
_url, err := ParseURL(newURL, "")
// Insert fresh snapshot with new URL
if err == nil {
snapshot := &Snapshot{
UID: uid.UID(),
URL: *_url,
Host: _url.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}
err := SaveSnapshotToDBIfNotExists(tx, snapshot)
if err != nil {
return err
}
}
}
return nil
}
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
if IsBlacklisted(s.URL) {
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
return nil
}
// If URL matches a robots.txt disallow line,
// add it as an error so next time it won't be
// crawled.
if RobotMatch(s) {
if RobotMatch(s.URL) {
s.Error = null.StringFrom("robots.txt disallow match")
err = SaveSnapshotToDB(tx, s)
if err != nil {
@@ -92,14 +141,19 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
return nil
}
defer func() {
time.Sleep(5 * time.Second)
RemoveIPsFromPool(IPs)
}()
// If the host's ip is in the connections pool,
// stop and add the url in the queue later.
IpPool.Lock.RLock()
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL)
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
for _, ip := range IPs {
_, ok := IpPool.IPs[ip]
if ok {
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL)
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
IpPool.Lock.RUnlock()
time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
return nil
@@ -111,15 +165,26 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
url := s.URL.String()
logging.LogDebug("[%d] Dialing %s", id, url)
Visit(s)
err = Visit(s)
if err != nil {
if !IsKnownError(err) {
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
if config.CONFIG.PanicOnUnexpectedError {
util.PrintStackAndPanic(err)
}
} else {
s.Error = null.StringFrom(err.Error())
}
if errors.As(err, new(*ErrGeminiStatusCode)) {
err = handleRedirection(tx, s)
if err != nil {
return err
}
}
}
logging.LogDebug("[%d] Finished dialing.", id)
go func() {
time.Sleep(5 * time.Second)
RemoveIPsFromPool(IPs)
}()
if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
logging.LogDebug("[%d] [%s] Processing", id, url)
s = ProcessGemini(s)
}
@@ -158,7 +223,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
}
// Should we save the given URL for crawling?
func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool {
func shouldPersistURL(tx *sqlx.Tx, u URL) bool {
if !strings.HasPrefix(u.String(), "gemini://") {
return false
}
@@ -205,3 +270,18 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
}
return snapshots, nil
}
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) {
query := `
SELECT *
FROM snapshots
WHERE url=$1
LIMIT 1
`
var snapshots []Snapshot
err := tx.Select(&snapshots, query, url)
if err != nil {
return nil, err
}
return snapshots, nil
}