Fix snapshot overwrite logic to preserve successful responses

- Prevent overwriting snapshots that already hold a successful (2x) response code (guard sketched below)
- Ensure URL is removed from queue when snapshot update is skipped
- Add last_crawled timestamp tracking for better crawl scheduling
- Remove SkipIdenticalContent flag, simplify content deduplication logic
- Update database schema with last_crawled column and indexes (migration sketched below)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
antanst
2025-06-18 11:23:56 +03:00
parent e9d7fa85ff
commit ada6cda4ac
8 changed files with 284 additions and 242 deletions
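
A minimal sketch of the overwrite guard described in the first two bullets, under stated assumptions: the helper name, the single-row-per-URL layout, and the exact UPDATE/DELETE statements are guesses; only the snapshots and urls tables, the response_code column, the 2x success range, and the last_crawled column appear in the actual diff.

// Sketch only: illustrates the guard from the commit message.
// Helper name and SQL shapes are assumptions, not code from this diff.
package main

import (
	"context"
	"database/sql"

	"github.com/jmoiron/sqlx"
)

func maybeUpdateSnapshot(ctx context.Context, tx *sqlx.Tx, url string, newCode int, gemtext []byte) error {
	var existing sql.NullInt64
	err := tx.GetContext(ctx, &existing,
		`SELECT response_code FROM snapshots WHERE url = $1`, url)
	if err != nil && err != sql.ErrNoRows {
		return err
	}
	success := func(c int64) bool { return c >= 20 && c <= 29 }
	if existing.Valid && success(existing.Int64) && !success(int64(newCode)) {
		// A successful snapshot already exists and the new attempt failed:
		// keep the old response, only record that a crawl was attempted.
		if _, err := tx.ExecContext(ctx,
			`UPDATE snapshots SET last_crawled = NOW() WHERE url = $1`, url); err != nil {
			return err
		}
	} else if _, err := tx.ExecContext(ctx,
		`UPDATE snapshots SET response_code = $2, gemtext = $3, last_crawled = NOW() WHERE url = $1`,
		url, newCode, gemtext); err != nil {
		return err
	}
	// Whether or not the snapshot was rewritten, dequeue the URL so a
	// skipped update cannot leave it stuck in the crawl queue.
	_, err = tx.ExecContext(ctx, `DELETE FROM urls WHERE url = $1`, url)
	return err
}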

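The schema bullet presumably corresponds to a migration along these lines; the last_crawled column name comes from the diff, while the index definitions are a guess:

// Sketch only: a plausible shape for the schema change; index names assumed.
const sqlAddLastCrawled = `
ALTER TABLE snapshots ADD COLUMN IF NOT EXISTS last_crawled TIMESTAMP;
CREATE INDEX IF NOT EXISTS idx_snapshots_last_crawled ON snapshots (last_crawled);
CREATE INDEX IF NOT EXISTS idx_snapshots_host_last_crawled ON snapshots (host, last_crawled);
`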

@@ -307,139 +307,17 @@ func enqueueSeedURLs(ctx context.Context, tx *sqlx.Tx) error {
return nil
}
//func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age int) (int, error) {
// // Select <num> snapshots from snapshots table for recrawling
// // They should be at least <age> days old, random order,
// // one snapshot per host if possible.
// historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory")
// contextlog.LogDebugWithContext(historyCtx, logging.GetSlogger(), "Looking for %d snapshots at least %d days old to recrawl", num, age)
//
// // Calculate the cutoff date
// cutoffDate := time.Now().AddDate(0, 0, -age)
//
// // SQL to select one random URL per host from snapshots older than the cutoff date
// // TODO implement when gopher enabled
// query := `
// WITH hosts AS (
// SELECT DISTINCT host
// FROM snapshots
// WHERE url ~ '^gemini://[^/]+/?$'
// AND gemtext IS NOT NULL
// AND timestamp < $1
// AND error IS NULL
// ),
// ranked_snapshots AS (
// SELECT
// s.url,
// s.host,
// s.timestamp,
// ROW_NUMBER() OVER (PARTITION BY s.host ORDER BY RANDOM()) as rank
// FROM snapshots s
// JOIN hosts h ON s.host = h.host
// WHERE s.timestamp < $1
// )
// SELECT url, host
// FROM ranked_snapshots
// WHERE rank = 1
// ORDER BY RANDOM()
// LIMIT $2
// `
//
// type SnapshotURL struct {
// URL string `db:"url"`
// Host string `db:"host"`
// }
//
// // Execute the query
// var snapshotURLs []SnapshotURL
// err := tx.Select(&snapshotURLs, query, cutoffDate, num)
// if err != nil {
// return 0, err
// }
//
// if len(snapshotURLs) == 0 {
// historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory")
// contextlog.LogInfoWithContext(historyCtx, logging.GetSlogger(), "No old snapshots found to recrawl")
// return 0, nil
// }
//
// // For each selected snapshot, add the URL to the urls table
// insertCount := 0
// for _, snapshot := range snapshotURLs {
// err := gemdb.Database.InsertURL(ctx, tx, snapshot.URL)
// if err != nil {
// logging.LogError("Error inserting URL %s from old snapshot to queue: %v", snapshot.URL, err)
// return 0, err
// }
// insertCount++
// }
//
// // Note: The transaction is committed by the caller (runJobScheduler),
// // not here. This function is called as part of a larger transaction.
// if insertCount > 0 {
// historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory")
// contextlog.LogInfoWithContext(historyCtx, logging.GetSlogger(), "Added %d old URLs to recrawl queue", insertCount)
// }
//
// return insertCount, nil
//}
func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age int) (int, error) {
// Select <num> URLs from the snapshots table for recrawling.
// A URL is eligible when its latest crawl attempt, tracked via the
// last_crawled timestamp, is at least <age> days old. Tracking actual
// crawl attempts rather than content changes fixes the infinite recrawl
// loop that occurred when identical content was skipped.
historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory")
contextlog.LogDebugWithContext(historyCtx, logging.GetSlogger(), "Looking for %d URLs whose latest crawl attempt is at least %d days old to recrawl", num, age)
// Calculate the cutoff date
cutoffDate := time.Now().AddDate(0, 0, -age)
// Select one root URL per host whose latest crawl attempt predates the
// cutoff, limited to hosts with successfully crawled content.
query := `
query := `
WITH latest_attempts AS (
SELECT
url,
host,
COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt
FROM snapshots
WHERE url ~ '^gemini://[^/]+/?$'
GROUP BY url, host
),
root_urls_with_content AS (
SELECT DISTINCT
la.url,
la.host,
la.latest_attempt
FROM latest_attempts la
JOIN snapshots s ON s.url = la.url
WHERE (s.gemtext IS NOT NULL OR s.data IS NOT NULL)
AND s.response_code BETWEEN 20 AND 29
),
eligible_urls AS (
SELECT
url,
host,
latest_attempt
FROM root_urls_with_content
WHERE latest_attempt < $1
),
ranked_urls AS (
SELECT
url,
host,
latest_attempt,
ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rank
FROM eligible_urls
)
SELECT url, host
FROM ranked_urls
WHERE rank = 1
ORDER BY RANDOM()
LIMIT $2
`
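// How the CTE chain narrows candidates:
// latest_attempts: per root URL (and host), the most recent last_crawled,
// defaulting to the epoch via COALESCE so never-crawled rows sort oldest.
// root_urls_with_content: keep only URLs that have at least one snapshot
// with content (gemtext or data) and a 2x Gemini response code.
// eligible_urls: keep URLs whose latest attempt predates the cutoff date.
// ranked_urls: pick one random eligible URL per host; the final SELECT
// shuffles the per-host winners and applies the LIMIT.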
// Use the query from db_queries.go to find URLs that need re-crawling
type SnapshotURL struct {
URL string `db:"url"`
@@ -448,7 +326,7 @@ func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age in
// Execute the query
var snapshotURLs []SnapshotURL
- err := tx.Select(&snapshotURLs, query, cutoffDate, num)
+ err := tx.Select(&snapshotURLs, gemdb.SQL_FETCH_SNAPSHOTS_FROM_HISTORY, cutoffDate, num)
if err != nil {
return 0, err
}
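
For context, a minimal sketch of how a caller might drive this function. Per the comments in the removed version, the transaction is committed by the caller (runJobScheduler), so the wiring below is an assumption, not code from this commit; it assumes the same package and imports as this file.

// Sketch only: hypothetical caller; the actual runJobScheduler differs.
func recrawlOldSnapshots(ctx context.Context, db *sqlx.DB) error {
	tx, err := db.BeginTxx(ctx, nil)
	if err != nil {
		return err
	}
	defer tx.Rollback() // safe no-op after a successful Commit

	// Queue up to 100 root URLs whose last crawl attempt is 30+ days old.
	if _, err := fetchSnapshotsFromHistory(ctx, tx, 100, 30); err != nil {
		return err
	}
	// fetchSnapshotsFromHistory never commits; the caller owns the transaction.
	return tx.Commit()
}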