diff --git a/README.md b/README.md
index 87109cf..18696f7 100644
--- a/README.md
+++ b/README.md
@@ -107,6 +107,9 @@ You can access the snapshot history using the included `snapshot_history.sh` scr
 Good starting points:
 
 gemini://warmedal.se/~antenna/
+gemini://tlgs.one/
+gopher://i-logout.cz:70/1/bongusta/
+gopher://gopher.quux.org:70/
\ No newline at end of file
diff --git a/cmd/crawler/crawler.go b/cmd/crawler/crawler.go
index f0dbd97..2f7870b 100644
--- a/cmd/crawler/crawler.go
+++ b/cmd/crawler/crawler.go
@@ -215,7 +215,7 @@ func runJobScheduler() {
 			common.FatalErrorsChan <- err
 			return
 		}
-		// Commit this tx here so the loop sees the changes.
+		// Commit this tx here so the loop below sees the changes.
 		err := tx.Commit()
 		if err != nil {
 			common.FatalErrorsChan <- err
@@ -251,7 +251,7 @@ func runJobScheduler() {
 		// When out of pending URLs, add some random ones.
 		if len(distinctHosts) == 0 {
 			// Queue random old URLs from history.
-			count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers*10, config.CONFIG.SkipIfUpdatedDays)
+			count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers, config.CONFIG.SkipIfUpdatedDays)
 			if err != nil {
 				common.FatalErrorsChan <- err
 				return
@@ -269,7 +269,7 @@ func runJobScheduler() {
 		}
 
 		// Get some URLs from each host, up to a limit
-		urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, 10, tx)
+		urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, config.CONFIG.NumOfWorkers, tx)
 		if err != nil {
 			common.FatalErrorsChan <- err
 			return
@@ -287,7 +287,7 @@ func runJobScheduler() {
 			continue
 		}
 
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Queueing %d distinct hosts -> %d urls to crawl", len(distinctHosts), len(urls))
+		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%d urls to crawl", len(urls))
 
 		// Add jobs to WaitGroup before queuing
 		common.WorkerWG.Add(len(urls))
diff --git a/common/worker.go b/common/worker.go
index 3bbc498..2b5eaee 100644
--- a/common/worker.go
+++ b/common/worker.go
@@ -140,7 +140,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
 
 	// Only check blacklist if URL is not whitelisted
 	if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "URL matches blacklist, skipped")
 		s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error())
 		return saveSnapshotAndRemoveURL(ctx, tx, s)
 	}
@@ -152,7 +151,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
 	// add it as an error and remove url
 	robotMatch = robotsMatch.RobotMatch(ctx, s.URL.String())
 	if robotMatch {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "URL matches robots.txt, skipped")
 		s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error())
 		return saveSnapshotAndRemoveURL(ctx, tx, s)
 	}
@@ -291,7 +289,7 @@ func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snap
 		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero())
 		return removeURL(ctx, tx, s.URL.String())
 	} else {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d (but old snapshot exists, updating crawl date)", s.ResponseCode.ValueOrZero())
+		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s (updating crawl date)", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
 		err = gemdb.Database.UpdateLastCrawled(ctx, tx, s.URL.String())
 		if err != nil {
 			return err
diff --git a/misc/sql/fetch-snapshot-history.sql b/misc/sql/fetch-snapshot-history.sql
new file mode 100644
index 0000000..94efc83
--- /dev/null
+++ b/misc/sql/fetch-snapshot-history.sql
@@ -0,0 +1,6 @@
+select count(*) from snapshots
+  where last_crawled < now() - interval '30 days'
+  and error IS NULL
+  and gemtext IS NOT NULL
+  and mimetype='text/gemini'
+  and url ~ '^gemini://[^/]+/?$';