Improve crawler performance and logging

- Optimize job scheduler to use NumOfWorkers for URL limits
- Clean up verbose logging in worker processing
- Update log messages for better clarity
commit 1ba432c127
parent 21b8769bc5
Author: antanst
Date:   2025-06-29 22:27:20 +03:00

4 changed files with 14 additions and 7 deletions
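
The core scheduling change is easiest to see in miniature. Below is a minimal, self-contained Go sketch of the idea the diffs implement: cap each batch of queued URLs at the worker count instead of a fixed multiple. Everything here except the NumOfWorkers name (taken from the diff) is a hypothetical stand-in, not the crawler's actual API.

package main

import "fmt"

// pickBatch returns at most limit URLs from the pending list.
func pickBatch(pending []string, limit int) []string {
	if len(pending) <= limit {
		return pending
	}
	return pending[:limit]
}

func main() {
	numOfWorkers := 4 // stands in for config.CONFIG.NumOfWorkers
	pending := []string{
		"gemini://example.org/a",
		"gemini://example.org/b",
		"gemini://example.org/c",
		"gemini://example.org/d",
		"gemini://example.org/e",
	}

	// Before this commit the scheduler pulled numOfWorkers*10 URLs from
	// history and a fixed 10 per host; both limits are now numOfWorkers.
	batch := pickBatch(pending, numOfWorkers)
	fmt.Printf("%d urls to crawl\n", len(batch))
}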

@@ -107,6 +107,9 @@ You can access the snapshot history using the included `snapshot_history.sh` scr
 Good starting points:
 gemini://warmedal.se/~antenna/
 gemini://tlgs.one/
+gopher://i-logout.cz:70/1/bongusta/
+gopher://gopher.quux.org:70/

@@ -215,7 +215,7 @@ func runJobScheduler() {
 		common.FatalErrorsChan <- err
 		return
 	}
-	// Commit this tx here so the loop sees the changes.
+	// Commit this tx here so the loop below sees the changes.
 	err := tx.Commit()
 	if err != nil {
 		common.FatalErrorsChan <- err
@@ -251,7 +251,7 @@ func runJobScheduler() {
 	// When out of pending URLs, add some random ones.
 	if len(distinctHosts) == 0 {
 		// Queue random old URLs from history.
-		count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers*10, config.CONFIG.SkipIfUpdatedDays)
+		count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers, config.CONFIG.SkipIfUpdatedDays)
 		if err != nil {
 			common.FatalErrorsChan <- err
 			return
@@ -269,7 +269,7 @@ func runJobScheduler() {
 	}
 	// Get some URLs from each host, up to a limit
-	urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, 10, tx)
+	urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, config.CONFIG.NumOfWorkers, tx)
 	if err != nil {
 		common.FatalErrorsChan <- err
 		return
@@ -287,7 +287,7 @@ func runJobScheduler() {
 		continue
 	}
-	contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Queueing %d distinct hosts -> %d urls to crawl", len(distinctHosts), len(urls))
+	contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%d urls to crawl", len(urls))
 	// Add jobs to WaitGroup before queuing
 	common.WorkerWG.Add(len(urls))
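
Concretely, with NumOfWorkers set to, say, 8, the history fallback above previously requested up to 80 snapshots (NumOfWorkers*10) and the per-host query used a hard-coded limit of 10; both are now capped at 8, so each scheduling round queues work in rough proportion to the pool that can actually service it. (How GetRandomUrlsFromHosts applies its limit argument is not visible in the hunk; this reading follows the inline comment above the call.)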

@@ -140,7 +140,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
 	// Only check blacklist if URL is not whitelisted
 	if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "URL matches blacklist, skipped")
 		s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error())
 		return saveSnapshotAndRemoveURL(ctx, tx, s)
 	}
@@ -152,7 +151,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
 	// add it as an error and remove url
 	robotMatch = robotsMatch.RobotMatch(ctx, s.URL.String())
 	if robotMatch {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "URL matches robots.txt, skipped")
 		s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error())
 		return saveSnapshotAndRemoveURL(ctx, tx, s)
 	}
@@ -291,7 +289,7 @@ func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snap
 		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero())
 		return removeURL(ctx, tx, s.URL.String())
 	} else {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d (but old snapshot exists, updating crawl date)", s.ResponseCode.ValueOrZero())
+		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s (updating crawl date)", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
 		err = gemdb.Database.UpdateLastCrawled(ctx, tx, s.URL.String())
 		if err != nil {
 			return err
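
The two deleted info lines in WorkOnUrl are not lost outright: the skip reason is still recorded in s.Error, and the reworked log line in saveSnapshotAndRemoveURL now prints it via %s. Here is a small sketch of why that format stays clean when no error is set, assuming the null package is gopkg.in/guregu/null (an assumption; only null.StringFrom appears in the diff). The values below are made up:

package main

import (
	"fmt"

	"gopkg.in/guregu/null.v4"
)

func main() {
	withErr := null.StringFrom("blacklist match") // snapshot that was skipped
	noErr := null.String{}                        // unset: ValueOrZero yields ""

	fmt.Printf("%2d %s (updating crawl date)\n", 20, withErr.ValueOrZero())
	fmt.Printf("%2d %s (updating crawl date)\n", 20, noErr.ValueOrZero())
}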

@@ -0,0 +1,6 @@
+select count(*) from snapshots
+where last_crawled < now() - interval '30 days'
+and error IS NULL
+and gemtext IS NOT NULL
+and mimetype='text/gemini'
+and url ~ '^gemini://[^/]+/?$';
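
The new file is a standalone diagnostic query: it counts gemini root pages (url ~ '^gemini://[^/]+/?$' is a Postgres regex match) whose last successful gemtext snapshot is more than 30 days old. Below is a hypothetical sketch of running it from Go with sqlx, which the crawler already uses for transactions; the driver choice, DSN, and names here are assumptions for illustration:

package main

import (
	"fmt"
	"log"

	_ "github.com/jackc/pgx/v5/stdlib" // registers the "pgx" database/sql driver
	"github.com/jmoiron/sqlx"
)

const staleRootsQuery = `
select count(*) from snapshots
where last_crawled < now() - interval '30 days'
and error IS NULL
and gemtext IS NOT NULL
and mimetype='text/gemini'
and url ~ '^gemini://[^/]+/?$'`

func main() {
	// Placeholder DSN; point it at the crawler's Postgres database.
	db, err := sqlx.Connect("pgx", "postgres://localhost:5432/crawler?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	var count int
	if err := db.Get(&count, staleRootsQuery); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("stale gemini root pages: %d\n", count)
}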