Fix infinite recrawl loop with skip-identical-content

Add last_crawled timestamp tracking to fix the infinite loop in
fetchSnapshotsFromHistory() when SkipIdenticalContent=true. Crawl
attempts are now tracked separately from content changes, using
database DEFAULT timestamps.
antanst
2025-06-17 10:41:17 +03:00
parent 8b498a2603
commit 98d3ed6707
5 changed files with 555 additions and 4 deletions
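
Only two of the five changed files appear below; the schema change itself is not among them. As a rough sketch of the column addition the commit message describes, assuming a Postgres-style snapshots table (the table name and this helper are assumptions; only the last_crawled column and its database-side DEFAULT come from the commit message):

package gemdb // assumed package name, matching the gemdb.Database call further down

import (
	"context"

	"github.com/jmoiron/sqlx"
)

// addLastCrawledColumn is a hypothetical migration sketch, not code from this
// commit. The database-side DEFAULT stamps last_crawled on insert even when
// the application never sets the field, which is what lets crawl attempts be
// tracked independently of content changes.
func addLastCrawledColumn(ctx context.Context, tx *sqlx.Tx) error {
	_, err := tx.ExecContext(ctx, `
		ALTER TABLE snapshots
		ADD COLUMN last_crawled TIMESTAMP DEFAULT CURRENT_TIMESTAMP`)
	return err
}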


@@ -20,8 +20,9 @@ type Snapshot struct {
 	Header       null.String                   `db:"header" json:"header,omitempty"` // Response header.
 	Links        null.Value[linkList.LinkList] `db:"links" json:"links,omitempty"`
 	Lang         null.String                   `db:"lang" json:"lang,omitempty"`
-	ResponseCode null.Int                      `db:"response_code" json:"code,omitempty"` // Gemini response Status code.
-	Error        null.String                   `db:"error" json:"error,omitempty"`        // On network errors only
+	ResponseCode null.Int                      `db:"response_code" json:"code,omitempty"`        // Gemini response Status code.
+	Error        null.String                   `db:"error" json:"error,omitempty"`               // On network errors only
+	LastCrawled  null.Time                     `db:"last_crawled" json:"last_crawled,omitempty"` // When URL was last processed (regardless of content changes)
 }
 
 func SnapshotFromURL(u string, normalize bool) (*Snapshot, error) {
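
The fetchSnapshotsFromHistory() change named in the commit message is also in a file not shown here. Given the new field, the likely shape of the fix is to select work by when a URL was last processed rather than by whether its content changed, so pages that never change stop being re-selected forever. A hedged sketch under those assumptions (the package, query, cutoff parameter, and Postgres-style placeholders are guesses, not code from this commit):

package gemcrawler // assumed package; reuses the Snapshot struct shown above

import (
	"context"
	"time"

	"github.com/jmoiron/sqlx"
)

// fetchSnapshotsFromHistory, sketched: pick URLs that have never been crawled
// or whose last crawl is older than the cutoff. Because RecordCrawlAttempt
// (see the second hunk below) bumps last_crawled even for identical content,
// such URLs eventually fall out of the result set instead of looping forever.
func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, olderThan time.Time, limit int) ([]Snapshot, error) {
	var snaps []Snapshot
	err := tx.SelectContext(ctx, &snaps, `
		SELECT * FROM snapshots
		WHERE last_crawled IS NULL OR last_crawled < $1
		ORDER BY last_crawled ASC NULLS FIRST
		LIMIT $2`,
		olderThan, limit)
	return snaps, err
}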


@@ -202,7 +202,12 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
 		return err
 	}
 	if skipIdentical {
-		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, skipping")
+		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, recording crawl attempt")
+		// Record the crawl attempt to track that we processed this URL
+		err = gemdb.Database.RecordCrawlAttempt(ctx, tx, s)
+		if err != nil {
+			return err
+		}
 		return removeURL(ctx, tx, s.URL.String())
 	}
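
RecordCrawlAttempt itself lives in the gemdb package, outside the hunks shown above; only its call site is part of this diff. A minimal sketch of what it plausibly does, assuming the same snapshots table keyed by URL (the receiver type, table, and statement are guesses):

package gemdb // assumed; Snapshot is the struct from the first hunk

import (
	"context"

	"github.com/jmoiron/sqlx"
)

// DB is a stand-in for whatever type gemdb.Database actually is.
type DB struct{}

// RecordCrawlAttempt, sketched: bump last_crawled for the URL so the history
// query sees that this crawl attempt happened, even though the identical
// content was not stored again.
func (d *DB) RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *Snapshot) error {
	_, err := tx.ExecContext(ctx,
		`UPDATE snapshots SET last_crawled = CURRENT_TIMESTAMP WHERE url = $1`,
		s.URL.String())
	return err
}

Because the column also has a database DEFAULT, rows written by the normal snapshot-insert path get stamped without any extra application code; this explicit update is only needed on the skip-identical path, where nothing new is inserted.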