Fix infinite recrawl loop with skip-identical-content

Add last_crawled timestamp tracking to fix an infinite loop in
fetchSnapshotsFromHistory() when SkipIdenticalContent=true. Actual crawl
attempts are now tracked separately from content changes via database
DEFAULT timestamps.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
antanst
2025-06-17 10:41:17 +03:00
parent f9024d15aa
commit 4e225ee866
5 changed files with 555 additions and 4 deletions

View File

@@ -202,7 +202,12 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
return err
}
if skipIdentical {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, skipping")
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, recording crawl attempt")
// Record the crawl attempt to track that we processed this URL
err = gemdb.Database.RecordCrawlAttempt(ctx, tx, s)
if err != nil {
return err
}
return removeURL(ctx, tx, s.URL.String())
}