Fix snapshot overwrite logic to preserve successful responses

- Prevent overwriting snapshots that already hold a successful (2x) response code (guard sketched below)
- Ensure URL is removed from queue when snapshot update is skipped
- Add last_crawled timestamp tracking for better crawl scheduling
- Remove SkipIdenticalContent flag, simplify content deduplication logic
- Update database schema with last_crawled column and indexes (migration sketched below)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
antanst
2025-06-18 11:23:56 +03:00
parent e9d7fa85ff
commit ada6cda4ac
8 changed files with 284 additions and 242 deletions
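
A minimal sketch of the overwrite guard described in the first two bullets, under stated assumptions: the helper name, the single-row-per-URL layout, and the exact UPDATE/DELETE statements are guesses; only the snapshots and urls tables, the response_code column, the 2x success range, and the last_crawled column appear in the actual diff.

// Sketch only: illustrates the guard from the commit message.
// Helper name and SQL shapes are assumptions, not code from this diff.
package main

import (
	"context"
	"database/sql"

	"github.com/jmoiron/sqlx"
)

func maybeUpdateSnapshot(ctx context.Context, tx *sqlx.Tx, url string, newCode int, gemtext []byte) error {
	var existing sql.NullInt64
	err := tx.GetContext(ctx, &existing,
		`SELECT response_code FROM snapshots WHERE url = $1`, url)
	if err != nil && err != sql.ErrNoRows {
		return err
	}
	success := func(c int64) bool { return c >= 20 && c <= 29 }
	if existing.Valid && success(existing.Int64) && !success(int64(newCode)) {
		// A successful snapshot already exists and the new attempt failed:
		// keep the old response, only record that a crawl was attempted.
		if _, err := tx.ExecContext(ctx,
			`UPDATE snapshots SET last_crawled = NOW() WHERE url = $1`, url); err != nil {
			return err
		}
	} else if _, err := tx.ExecContext(ctx,
		`UPDATE snapshots SET response_code = $2, gemtext = $3, last_crawled = NOW() WHERE url = $1`,
		url, newCode, gemtext); err != nil {
		return err
	}
	// Whether or not the snapshot was rewritten, dequeue the URL so a
	// skipped update cannot leave it stuck in the crawl queue.
	_, err = tx.ExecContext(ctx, `DELETE FROM urls WHERE url = $1`, url)
	return err
}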

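The schema bullet presumably corresponds to a migration along these lines; the last_crawled column name comes from the diff, while the index definitions are a guess:

// Sketch only: a plausible shape for the schema change; index names assumed.
const sqlAddLastCrawled = `
ALTER TABLE snapshots ADD COLUMN IF NOT EXISTS last_crawled TIMESTAMP;
CREATE INDEX IF NOT EXISTS idx_snapshots_last_crawled ON snapshots (last_crawled);
CREATE INDEX IF NOT EXISTS idx_snapshots_host_last_crawled ON snapshots (host, last_crawled);
`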

@@ -307,139 +307,17 @@ func enqueueSeedURLs(ctx context.Context, tx *sqlx.Tx) error {
return nil
}
//func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age int) (int, error) {
// // Select <num> snapshots from snapshots table for recrawling
// // They should be at least <age> days old, random order,
// // one snapshot per host if possible.
// historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory")
// contextlog.LogDebugWithContext(historyCtx, logging.GetSlogger(), "Looking for %d snapshots at least %d days old to recrawl", num, age)
//
// // Calculate the cutoff date
// cutoffDate := time.Now().AddDate(0, 0, -age)
//
// // SQL to select one random URL per host from snapshots older than the cutoff date
// // TODO implement when gopher enabled
// query := `
// WITH hosts AS (
// SELECT DISTINCT host
// FROM snapshots
// WHERE url ~ '^gemini://[^/]+/?$'
// AND gemtext IS NOT NULL
// AND timestamp < $1
// AND error IS NULL
// ),
// ranked_snapshots AS (
// SELECT
// s.url,
// s.host,
// s.timestamp,
// ROW_NUMBER() OVER (PARTITION BY s.host ORDER BY RANDOM()) as rank
// FROM snapshots s
// JOIN hosts h ON s.host = h.host
// WHERE s.timestamp < $1
// )
// SELECT url, host
// FROM ranked_snapshots
// WHERE rank = 1
// ORDER BY RANDOM()
// LIMIT $2
// `
//
// type SnapshotURL struct {
// URL string `db:"url"`
// Host string `db:"host"`
// }
//
// // Execute the query
// var snapshotURLs []SnapshotURL
// err := tx.Select(&snapshotURLs, query, cutoffDate, num)
// if err != nil {
// return 0, err
// }
//
// if len(snapshotURLs) == 0 {
// historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory")
// contextlog.LogInfoWithContext(historyCtx, logging.GetSlogger(), "No old snapshots found to recrawl")
// return 0, nil
// }
//
// // For each selected snapshot, add the URL to the urls table
// insertCount := 0
// for _, snapshot := range snapshotURLs {
// err := gemdb.Database.InsertURL(ctx, tx, snapshot.URL)
// if err != nil {
// logging.LogError("Error inserting URL %s from old snapshot to queue: %v", snapshot.URL, err)
// return 0, err
// }
// insertCount++
// }
//
// // Note: The transaction is committed by the caller (runJobScheduler),
// // not here. This function is called as part of a larger transaction.
// if insertCount > 0 {
// historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory")
// contextlog.LogInfoWithContext(historyCtx, logging.GetSlogger(), "Added %d old URLs to recrawl queue", insertCount)
// }
//
// return insertCount, nil
//}
func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age int) (int, error) {
// Select <num> URLs from the snapshots table for recrawling.
// A URL is eligible when its latest crawl attempt, tracked via the
// last_crawled timestamp, is at least <age> days old. Tracking actual
// crawl attempts rather than content changes fixes the infinite recrawl
// loop that occurred when identical content was skipped.
historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory")
contextlog.LogDebugWithContext(historyCtx, logging.GetSlogger(), "Looking for %d URLs whose latest crawl attempt is at least %d days old to recrawl", num, age)
// Calculate the cutoff date
cutoffDate := time.Now().AddDate(0, 0, -age)
// Select one root URL per host whose latest crawl attempt predates the
// cutoff, limited to hosts with successfully crawled content.
query := `
query := `
WITH latest_attempts AS (
SELECT
url,
host,
COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt
FROM snapshots
WHERE url ~ '^gemini://[^/]+/?$'
GROUP BY url, host
),
root_urls_with_content AS (
SELECT DISTINCT
la.url,
la.host,
la.latest_attempt
FROM latest_attempts la
JOIN snapshots s ON s.url = la.url
WHERE (s.gemtext IS NOT NULL OR s.data IS NOT NULL)
AND s.response_code BETWEEN 20 AND 29
),
eligible_urls AS (
SELECT
url,
host,
latest_attempt
FROM root_urls_with_content
WHERE latest_attempt < $1
),
ranked_urls AS (
SELECT
url,
host,
latest_attempt,
ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rank
FROM eligible_urls
)
SELECT url, host
FROM ranked_urls
WHERE rank = 1
ORDER BY RANDOM()
LIMIT $2
`
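// How the CTE chain narrows candidates:
// latest_attempts: per root URL (and host), the most recent last_crawled,
// defaulting to the epoch via COALESCE so never-crawled rows sort oldest.
// root_urls_with_content: keep only URLs that have at least one snapshot
// with content (gemtext or data) and a 2x Gemini response code.
// eligible_urls: keep URLs whose latest attempt predates the cutoff date.
// ranked_urls: pick one random eligible URL per host; the final SELECT
// shuffles the per-host winners and applies the LIMIT.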
// Use the query from db_queries.go to find URLs that need re-crawling
type SnapshotURL struct {
URL string `db:"url"`
@@ -448,7 +326,7 @@ func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age in
// Execute the query
var snapshotURLs []SnapshotURL
- err := tx.Select(&snapshotURLs, query, cutoffDate, num)
+ err := tx.Select(&snapshotURLs, gemdb.SQL_FETCH_SNAPSHOTS_FROM_HISTORY, cutoffDate, num)
if err != nil {
return 0, err
}
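
For context, a minimal sketch of how a caller might drive this function. Per the comments in the removed version, the transaction is committed by the caller (runJobScheduler), so the wiring below is an assumption, not code from this commit; it assumes the same package and imports as this file.

// Sketch only: hypothetical caller; the actual runJobScheduler differs.
func recrawlOldSnapshots(ctx context.Context, db *sqlx.DB) error {
	tx, err := db.BeginTxx(ctx, nil)
	if err != nil {
		return err
	}
	defer tx.Rollback() // safe no-op after a successful Commit

	// Queue up to 100 root URLs whose last crawl attempt is 30+ days old.
	if _, err := fetchSnapshotsFromHistory(ctx, tx, 100, 30); err != nil {
		return err
	}
	// fetchSnapshotsFromHistory never commits; the caller owns the transaction.
	return tx.Commit()
}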