Better error handling, many fixes all around
This commit is contained in:
250
gemini/worker.go
250
gemini/worker.go
@@ -3,14 +3,13 @@ package gemini
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/logging"
|
||||
"gemini-grc/uid"
|
||||
"gemini-grc/util"
|
||||
|
||||
"github.com/guregu/null/v5"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
@@ -27,12 +26,13 @@ func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
|
||||
}
|
||||
|
||||
func RunWorker(id int, db *sqlx.DB, url *string) {
|
||||
// Start the DB transaction
|
||||
// Each worker runs within a DB transaction.
|
||||
tx, err := db.Beginx()
|
||||
if err != nil {
|
||||
logging.LogError("Failed to begin transaction: %w", err)
|
||||
}
|
||||
|
||||
// Commit/rollback at the end
|
||||
defer func() {
|
||||
err = tx.Commit()
|
||||
if err != nil {
|
||||
@@ -46,73 +46,57 @@ func RunWorker(id int, db *sqlx.DB, url *string) {
|
||||
|
||||
var snapshots []Snapshot
|
||||
|
||||
// If not given a specific URL,
|
||||
// get some random ones to visit from DB.
|
||||
if url == nil {
|
||||
snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
|
||||
} else {
|
||||
snapshots, err = GetSnapshotFromURL(tx, *url)
|
||||
if len(snapshots) == 0 {
|
||||
snapshotURL, err := ParseURL(*url, "")
|
||||
if err != nil {
|
||||
panic("Invalid URL: " + *url)
|
||||
}
|
||||
snapshots = []Snapshot{{
|
||||
UID: uid.UID(),
|
||||
URL: *snapshotURL,
|
||||
Host: snapshotURL.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}}
|
||||
if err != nil {
|
||||
logging.LogError("[%d] GeminiError retrieving snapshot: %w", id, err)
|
||||
panic("This should never happen")
|
||||
} else if len(snapshots) == 0 {
|
||||
logging.LogInfo("[%d] No snapshots to visit.", id)
|
||||
time.Sleep(1 * time.Minute)
|
||||
return
|
||||
}
|
||||
} else {
|
||||
snapshotURL, err := ParseURL(*url, "")
|
||||
if err != nil {
|
||||
logging.LogError("Invalid URL given: " + *url)
|
||||
return
|
||||
|
||||
}
|
||||
snapshots = []Snapshot{{
|
||||
//UID: uid.UID(),
|
||||
URL: *snapshotURL,
|
||||
Host: snapshotURL.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logging.LogError("[%d] Error retrieving snapshot: %w", id, err)
|
||||
time.Sleep(10 * time.Second)
|
||||
return
|
||||
} else if len(snapshots) == 0 {
|
||||
logging.LogInfo("[%d] No snapshots to visit.", id)
|
||||
time.Sleep(1 * time.Minute)
|
||||
return
|
||||
}
|
||||
// Start visiting URLs.
|
||||
total := len(snapshots)
|
||||
for i, s := range snapshots {
|
||||
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
|
||||
// We differentiate between errors:
|
||||
// Unexpected errors are the ones returned from the following function.
|
||||
// If an error is unexpected (which should never happen) we panic.
|
||||
// Expected errors are stored as strings within the snapshot,
|
||||
// so that they can also be stored in DB.
|
||||
err = workOnSnapshot(id, tx, &s)
|
||||
if err != nil {
|
||||
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL.String(), err)
|
||||
logging.LogError("[%d] [%s] Unexpected GeminiError %w", id, s.URL.String(), err)
|
||||
util.PrintStackAndPanic(err)
|
||||
}
|
||||
if s.Error.Valid {
|
||||
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL.String(), s.Error.String)
|
||||
logging.LogWarn("[%d] Error: %v", id, s.Error.String)
|
||||
}
|
||||
logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
|
||||
}
|
||||
logging.LogInfo("[%d] Worker done.", id)
|
||||
}
|
||||
|
||||
func handleRedirection(tx *sqlx.Tx, s *Snapshot) error {
|
||||
re := regexp.MustCompile(`gemini://\S+`)
|
||||
matches := re.FindStringSubmatch(s.Error.ValueOrZero())
|
||||
if len(matches) == 1 {
|
||||
newURL := matches[0]
|
||||
logging.LogDebug("Page redirects to %s", newURL)
|
||||
_url, err := ParseURL(newURL, "")
|
||||
// Insert fresh snapshot with new URL
|
||||
if err == nil {
|
||||
snapshot := &Snapshot{
|
||||
UID: uid.UID(),
|
||||
URL: *_url,
|
||||
Host: _url.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}
|
||||
err := SaveSnapshotToDBIfNotExists(tx, snapshot)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// workOnSnapshot visits a URL and stores the result.
|
||||
// errors should be returned only if they are unexpected.
|
||||
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
if IsBlacklisted(s.URL) {
|
||||
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
|
||||
@@ -123,31 +107,26 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
// add it as an error so next time it won't be
|
||||
// crawled.
|
||||
if RobotMatch(s.URL) {
|
||||
s.Error = null.StringFrom("robots.txt disallow match")
|
||||
err = SaveSnapshotToDB(tx, s)
|
||||
s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
|
||||
err = UpsertSnapshot(id, tx, s)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
return fmt.Errorf("[%d] %w", id, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Resolve IP address via DNS
|
||||
IPs, err := getHostIPAddresses(s.Host)
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom("DNS Resolve error")
|
||||
err = SaveSnapshotToDB(tx, s)
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
err = UpsertSnapshot(id, tx, s)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
return fmt.Errorf("[%d] %w", id, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
defer func() {
|
||||
time.Sleep(5 * time.Second)
|
||||
RemoveIPsFromPool(IPs)
|
||||
}()
|
||||
|
||||
// If the host's ip is in the connections pool,
|
||||
// stop and add the url in the queue later.
|
||||
// If the host's ip is in the connections pool we stop
|
||||
IpPool.Lock.RLock()
|
||||
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
|
||||
for _, ip := range IPs {
|
||||
@@ -155,7 +134,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
if ok {
|
||||
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
|
||||
IpPool.Lock.RUnlock()
|
||||
time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
|
||||
time.Sleep(1 * time.Second) // Avoid flood-retrying
|
||||
return nil
|
||||
}
|
||||
}
|
||||
@@ -163,84 +142,115 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
|
||||
AddIPsToPool(IPs)
|
||||
|
||||
// After finishing, remove the host IPs from
|
||||
// the connections pool, with a small delay
|
||||
// to avoid potentially hitting the same IP quickly.
|
||||
defer func() {
|
||||
time.Sleep(5 * time.Second)
|
||||
RemoveIPsFromPool(IPs)
|
||||
}()
|
||||
|
||||
url := s.URL.String()
|
||||
logging.LogDebug("[%d] Dialing %s", id, url)
|
||||
|
||||
err = Visit(s)
|
||||
if err != nil {
|
||||
if !IsKnownError(err) {
|
||||
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
|
||||
if config.CONFIG.PanicOnUnexpectedError {
|
||||
util.PrintStackAndPanic(err)
|
||||
}
|
||||
} else {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
return err
|
||||
}
|
||||
if errors.As(err, new(*ErrGeminiStatusCode)) {
|
||||
err = handleRedirection(tx, s)
|
||||
// Check if error is redirection, and handle it
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
if errors.As(err, new(*GeminiError)) &&
|
||||
err.(*GeminiError).Msg == "redirect" {
|
||||
err = handleRedirection(id, tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
logging.LogDebug("[%d] Finished dialing.", id)
|
||||
logging.LogInfo("[%d] Done, response code %d.", id, s.ResponseCode.ValueOrZero())
|
||||
|
||||
// If this is a gemini page, parse possible links inside
|
||||
if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
|
||||
logging.LogDebug("[%d] [%s] Processing", id, url)
|
||||
s = ProcessGemini(s)
|
||||
links := GetPageLinks(s.URL, s.GemText.String)
|
||||
logging.LogDebug("[%d] Found %d links", id, len(links))
|
||||
if len(links) > 0 {
|
||||
s.Links = null.ValueFrom(links)
|
||||
}
|
||||
} else {
|
||||
logging.LogDebug("[%d] Not looking for page links", id)
|
||||
}
|
||||
logging.LogDebug("[%d] Saving", id)
|
||||
err = SaveSnapshotToDB(tx, s)
|
||||
|
||||
err = UpsertSnapshot(id, tx, s)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
return err
|
||||
}
|
||||
|
||||
// Store links in batch
|
||||
if s.Links != nil {
|
||||
var batchSnapshots []*Snapshot
|
||||
timestamp := null.TimeFrom(time.Now())
|
||||
err = storeLinks(tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, link := range *s.Links {
|
||||
if shouldPersistURL(tx, link) {
|
||||
func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
|
||||
if s.Links.Valid {
|
||||
var batchSnapshots []*Snapshot
|
||||
for _, link := range s.Links.ValueOrZero() {
|
||||
if shouldPersistURL(link) {
|
||||
newSnapshot := &Snapshot{
|
||||
UID: uid.UID(),
|
||||
//UID: uid.UID(),
|
||||
URL: link,
|
||||
Host: link.Hostname,
|
||||
Timestamp: timestamp,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}
|
||||
batchSnapshots = append(batchSnapshots, newSnapshot)
|
||||
}
|
||||
}
|
||||
|
||||
if len(batchSnapshots) > 0 {
|
||||
logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
|
||||
err = SaveLinksToDBinBatches(tx, batchSnapshots)
|
||||
err := SaveLinksToDBinBatches(tx, batchSnapshots)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Should we save the given URL for crawling?
|
||||
func shouldPersistURL(tx *sqlx.Tx, u URL) bool {
|
||||
if !strings.HasPrefix(u.String(), "gemini://") {
|
||||
return false
|
||||
}
|
||||
query := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)`
|
||||
var exists bool
|
||||
err := tx.Get(&exists, query, u.String())
|
||||
// shouldPersistURL returns true if we
|
||||
// should save the URL in the DB.
|
||||
// Only gemini:// urls are saved.
|
||||
func shouldPersistURL(u URL) bool {
|
||||
return strings.HasPrefix(u.String(), "gemini://")
|
||||
}
|
||||
|
||||
func handleRedirection(id int, tx *sqlx.Tx, s *Snapshot) error {
|
||||
newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
|
||||
if err != nil {
|
||||
fmt.Println("Error executing query:", err)
|
||||
return false
|
||||
return err
|
||||
}
|
||||
return !exists
|
||||
logging.LogDebug("[%d] Page redirects to %s", id, newURL)
|
||||
// Insert fresh snapshot with new URL
|
||||
snapshot := &Snapshot{
|
||||
//UID: uid.UID(),
|
||||
URL: *newURL,
|
||||
Host: newURL.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}
|
||||
logging.LogDebug("[%d] Saving empty snapshot for %s", id, snapshot.URL.String())
|
||||
err = SaveSnapshotIfNew(tx, snapshot)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
||||
// Old, unoptimized query
|
||||
//
|
||||
// query := `
|
||||
|
||||
// query := `
|
||||
// SELECT DISTINCT ON (host) *
|
||||
// FROM snapshots
|
||||
// WHERE response_code IS NULL
|
||||
@@ -249,20 +259,28 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
||||
// LIMIT $1
|
||||
// `
|
||||
query := `
|
||||
WITH RankedSnapshots AS (
|
||||
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
|
||||
links, lang, response_code, error,
|
||||
ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
|
||||
FROM snapshots
|
||||
WHERE response_code IS NULL
|
||||
AND error IS NULL
|
||||
)
|
||||
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
|
||||
links, lang, response_code, error
|
||||
FROM RankedSnapshots
|
||||
WHERE rn = 1
|
||||
LIMIT $1
|
||||
`
|
||||
SELECT *
|
||||
FROM snapshots
|
||||
WHERE response_code IS NULL
|
||||
AND error IS NULL
|
||||
ORDER BY RANDOM()
|
||||
LIMIT $1
|
||||
`
|
||||
//query := `
|
||||
// WITH RankedSnapshots AS (
|
||||
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
|
||||
// links, lang, response_code, error,
|
||||
// ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
|
||||
// FROM snapshots
|
||||
// WHERE response_code IS NULL
|
||||
// AND error IS NULL
|
||||
// )
|
||||
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
|
||||
// links, lang, response_code, error
|
||||
// FROM RankedSnapshots
|
||||
// WHERE rn = 1
|
||||
// LIMIT $1
|
||||
//`
|
||||
var snapshots []Snapshot
|
||||
err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user