Fix infinite recrawl loop with skip-identical-content
Add last_crawled timestamp tracking to fix an infinite loop in fetchSnapshotsFromHistory() when SkipIdenticalContent=true. Crawl attempts are now tracked separately from content changes, via database DEFAULT timestamps.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
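The commit message leans on database DEFAULT timestamps, but the migration itself is not part of this diff. Below is a minimal sketch of the kind of schema change implied, assuming a PostgreSQL-style dialect and a table named `snapshots` (the table name and helper function are assumptions; only the `last_crawled` column appears in the diff):

```go
// Sketch only: the actual migration is not included in this commit.
// Assumes a PostgreSQL-style dialect and a table named "snapshots".
package migrations

import "database/sql"

// addLastCrawled adds the last_crawled column with a database-side DEFAULT,
// so every freshly inserted snapshot row is stamped even when application
// code never sets the field explicitly.
func addLastCrawled(db *sql.DB) error {
	_, err := db.Exec(`
		ALTER TABLE snapshots
		ADD COLUMN last_crawled TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP`)
	return err
}
```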
@@ -20,8 +20,9 @@ type Snapshot struct {
 	Header       null.String                   `db:"header" json:"header,omitempty"` // Response header.
 	Links        null.Value[linkList.LinkList] `db:"links" json:"links,omitempty"`
 	Lang         null.String                   `db:"lang" json:"lang,omitempty"`
 	ResponseCode null.Int                      `db:"response_code" json:"code,omitempty"` // Gemini response status code.
 	Error        null.String                   `db:"error" json:"error,omitempty"` // On network errors only.
+	LastCrawled  null.Time                     `db:"last_crawled" json:"last_crawled,omitempty"` // When the URL was last processed (regardless of content changes).
 }
 
 func SnapshotFromURL(u string, normalize bool) (*Snapshot, error) {
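With the new field in place, the history scan can key off `last_crawled` instead of a content-change timestamp, which is what breaks the loop: identical content no longer leaves a URL permanently "stale". The real fetchSnapshotsFromHistory() is not shown in this commit; a hedged sketch under the same schema assumptions as above:

```go
// Sketch of a loop-free history query; the actual fetchSnapshotsFromHistory
// implementation is not part of this commit. Assumes the snapshots table
// and PostgreSQL dialect from the migration sketch above.
package gemdb

import (
	"context"

	"github.com/jmoiron/sqlx"
)

// fetchSnapshotsFromHistory selects URLs by last_crawled rather than by a
// content timestamp. Because RecordCrawlAttempt bumps last_crawled even when
// the fetched content is identical, a skipped URL falls to the back of the
// queue instead of being reselected forever.
func fetchSnapshotsFromHistory(ctx context.Context, db *sqlx.DB, limit int) ([]string, error) {
	var urls []string
	err := db.SelectContext(ctx, &urls, `
		SELECT url FROM snapshots
		ORDER BY last_crawled ASC NULLS FIRST
		LIMIT $1`, limit)
	return urls, err
}
```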
@@ -202,7 +202,12 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
 		return err
 	}
 	if skipIdentical {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, skipping")
+		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, recording crawl attempt")
+		// Record the crawl attempt to track that we processed this URL.
+		err = gemdb.Database.RecordCrawlAttempt(ctx, tx, s)
+		if err != nil {
+			return err
+		}
 		return removeURL(ctx, tx, s.URL.String())
 	}
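RecordCrawlAttempt itself is defined elsewhere in the database layer and is not shown here; the diff calls it as a method on gemdb.Database with the full snapshot. A plausible minimal version, simplified to take the URL directly and using the same assumed schema, would refresh the timestamp without touching any content columns:

```go
// Sketch of RecordCrawlAttempt; the real method lives on gemdb.Database,
// takes the snapshot itself, and is not part of this diff. Table and column
// names follow the assumptions in the sketches above.
package gemdb

import (
	"context"

	"github.com/jmoiron/sqlx"
)

// RecordCrawlAttempt stamps last_crawled for a URL that was just processed,
// so that skipping identical content still registers a completed crawl.
func RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, url string) error {
	_, err := tx.ExecContext(ctx, `
		UPDATE snapshots
		SET last_crawled = CURRENT_TIMESTAMP
		WHERE url = $1`, url)
	return err
}
```

Updating only the timestamp keeps the existing snapshot row authoritative for content, which is what lets crawl bookkeeping and content history evolve independently.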