Improve crawler performance and logging
- Optimize job scheduler to use NumOfWorkers for URL limits
- Clean up verbose logging in worker processing
- Update log messages for better clarity
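Taken together, the scheduler edits below replace fixed batch sizes (`10`, `NumOfWorkers*10`) with the worker-pool size itself. A minimal sketch of that sizing idea, assuming only the `NumOfWorkers` config field visible in the diff; everything else here is illustrative:

```go
// Sketch: deriving the scheduler's per-pass URL limit from the worker
// pool size. Only NumOfWorkers is taken from the diff; the rest is
// hypothetical scaffolding.
package main

import "fmt"

type Config struct {
	NumOfWorkers int // size of the crawl worker pool
}

// batchLimit returns how many URLs to queue per scheduling pass.
// Before this commit the limits were hardcoded (10) or inflated
// (NumOfWorkers*10); matching the pool size avoids queueing more
// work than the pool can pick up at once.
func batchLimit(cfg Config) int {
	return cfg.NumOfWorkers
}

func main() {
	cfg := Config{NumOfWorkers: 8}
	fmt.Printf("queue at most %d URLs per pass\n", batchLimit(cfg))
}
```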
@@ -107,6 +107,9 @@ You can access the snapshot history using the included `snapshot_history.sh` scr
 Good starting points:
 
 gemini://warmedal.se/~antenna/
 
 gemini://tlgs.one/
 
+gopher://i-logout.cz:70/1/bongusta/
+
+gopher://gopher.quux.org:70/
@@ -215,7 +215,7 @@ func runJobScheduler() {
 			common.FatalErrorsChan <- err
 			return
 		}
-		// Commit this tx here so the loop sees the changes.
+		// Commit this tx here so the loop below sees the changes.
 		err := tx.Commit()
 		if err != nil {
 			common.FatalErrorsChan <- err
@@ -251,7 +251,7 @@ func runJobScheduler() {
 		// When out of pending URLs, add some random ones.
 		if len(distinctHosts) == 0 {
 			// Queue random old URLs from history.
-			count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers*10, config.CONFIG.SkipIfUpdatedDays)
+			count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers, config.CONFIG.SkipIfUpdatedDays)
 			if err != nil {
 				common.FatalErrorsChan <- err
 				return
@@ -269,7 +269,7 @@ func runJobScheduler() {
 		}
 
 		// Get some URLs from each host, up to a limit
-		urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, 10, tx)
+		urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, config.CONFIG.NumOfWorkers, tx)
 		if err != nil {
 			common.FatalErrorsChan <- err
 			return
@@ -287,7 +287,7 @@ func runJobScheduler() {
 			continue
 		}
 
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Queueing %d distinct hosts -> %d urls to crawl", len(distinctHosts), len(urls))
+		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%d urls to crawl", len(urls))
 
 		// Add jobs to WaitGroup before queuing
 		common.WorkerWG.Add(len(urls))
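The scheduler adds all queued jobs to `common.WorkerWG` before handing them to the workers. A minimal, self-contained sketch of that pattern with a hypothetical channel-based pool; only the Add-before-queue ordering and the per-job Done are taken from the diff:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	urls := []string{"gemini://tlgs.one/", "gopher://gopher.quux.org:70/"}
	jobs := make(chan string, len(urls)) // hypothetical job queue
	var wg sync.WaitGroup

	// Each worker calls Done exactly once per job it finishes.
	for w := 0; w < 2; w++ {
		go func() {
			for url := range jobs {
				fmt.Println("crawling", url)
				wg.Done()
			}
		}()
	}

	// Add jobs to the WaitGroup before queuing, as in the diff: the
	// counter is already at len(urls) before any worker can call Done,
	// so Wait cannot return while jobs are still in flight.
	wg.Add(len(urls))
	for _, u := range urls {
		jobs <- u
	}
	wg.Wait()
	close(jobs)
}
```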
@@ -140,7 +140,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
|
|||||||
|
|
||||||
// Only check blacklist if URL is not whitelisted
|
// Only check blacklist if URL is not whitelisted
|
||||||
if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) {
|
if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) {
|
||||||
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "URL matches blacklist, skipped")
|
|
||||||
s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error())
|
s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error())
|
||||||
return saveSnapshotAndRemoveURL(ctx, tx, s)
|
return saveSnapshotAndRemoveURL(ctx, tx, s)
|
||||||
}
|
}
|
||||||
@@ -152,7 +151,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
 	// add it as an error and remove url
 	robotMatch = robotsMatch.RobotMatch(ctx, s.URL.String())
 	if robotMatch {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "URL matches robots.txt, skipped")
 		s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error())
 		return saveSnapshotAndRemoveURL(ctx, tx, s)
 	}
@@ -291,7 +289,7 @@ func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snap
 		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero())
 		return removeURL(ctx, tx, s.URL.String())
 	} else {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d (but old snapshot exists, updating crawl date)", s.ResponseCode.ValueOrZero())
+		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s (updating crawl date)", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
 		err = gemdb.Database.UpdateLastCrawled(ctx, tx, s.URL.String())
 		if err != nil {
 			return err
misc/sql/fetch-snapshot-history.sql (new file, 6 lines)
@@ -0,0 +1,6 @@
+select count(*) from snapshots
+where last_crawled < now() - interval '30 days'
+and error IS NULL
+and gemtext IS NOT NULL
+and mimetype='text/gemini'
+and url ~ '^gemini://[^/]+/?$';
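The new query counts capsule root pages — `url ~ '^gemini://[^/]+/?$'` matches a gemini URL with a host part and at most a trailing slash — whose healthy text/gemini snapshots were last crawled over 30 days ago. A sketch of running the file against Postgres with sqlx (the package behind the `*sqlx.Tx` handles in the diff); the connection setup, env var, and `main` wrapper are assumptions:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/jmoiron/sqlx"
	_ "github.com/lib/pq" // Postgres driver; the interval/regex syntax is Postgres-specific
)

func main() {
	// Hypothetical connection; the project's actual DB setup is not shown in the diff.
	db, err := sqlx.Connect("postgres", os.Getenv("DATABASE_URL"))
	if err != nil {
		log.Fatal(err)
	}
	query, err := os.ReadFile("misc/sql/fetch-snapshot-history.sql")
	if err != nil {
		log.Fatal(err)
	}
	var count int
	// GetContext scans the single count(*) column into count.
	if err := db.GetContext(context.Background(), &count, string(query)); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%d stale capsule roots eligible for re-crawl\n", count)
}
```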