Improve crawler performance and worker coordination

- Add WaitGroup synchronization for workers to prevent overlapping scheduler runs
- Increase history fetch multiplier and sleep intervals for better resource usage
- Simplify error handling and logging in worker processing
- Update SQL query to exclude error snapshots from history selection
- Fix worker ID variable reference in spawning loop
- Streamline snapshot update logic and error reporting
2025-06-19 09:59:50 +03:00
parent 59893efc3d
commit af42383513
4 changed files with 66 additions and 62 deletions
--- a/db/db_queries.go
+++ b/db/db_queries.go
@@ -115,12 +115,7 @@ LIMIT $1
 	SQL_UPDATE_LAST_CRAWLED = `
        UPDATE snapshots 
        SET last_crawled = CURRENT_TIMESTAMP 
-        WHERE id = (
-            SELECT id FROM snapshots 
-            WHERE url = $1 
-            ORDER BY timestamp DESC 
-            LIMIT 1
-        )
+        WHERE url = $1
    `
 	// SQL_FETCH_SNAPSHOTS_FROM_HISTORY Fetches URLs from snapshots for re-crawling based on last_crawled timestamp
 	// This query finds root domain URLs that haven't been crawled recently and selects
@@ -137,7 +132,7 @@ LIMIT $1
 				host,
 				COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt
 			FROM snapshots
-			WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini'
+			WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini' AND error IS NULL
 			GROUP BY url, host
 		),
 		root_urls_with_content AS (