Fix infinite recrawl loop with skip-identical-content

Add last_crawled timestamp tracking to fix the infinite loop in
fetchSnapshotsFromHistory() when SkipIdenticalContent=true. Crawl attempts
are now tracked separately from content changes via database DEFAULT
timestamps.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
antanst
2025-06-17 10:41:17 +03:00
parent 9938dc542b
commit e9d7fa85ff
5 changed files with 555 additions and 4 deletions
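Only two of the five changed files appear below; the crawler-side code that decides between a full snapshot write and a bare crawl-attempt record is not among them. The following is a minimal sketch of how that decision presumably looks when SkipIdenticalContent is enabled: if the freshly fetched body matches the latest stored snapshot, only a lightweight crawl-attempt row is written, so last_crawled still advances. The Snapshot stand-in, the store interface, and the body comparison are illustrative assumptions; only the three method names come from this commit's DbService interface.

package crawl

import (
	"bytes"
	"context"

	"github.com/jmoiron/sqlx"
)

// Snapshot is a stand-in for the repo's snapshot.Snapshot type (assumed fields).
type Snapshot struct {
	URL  string
	Body []byte
}

// store is a stand-in for the DbService methods shown in the diffs below.
type store interface {
	GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url string) (*Snapshot, error)
	RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *Snapshot) error
	SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *Snapshot) error
}

// storeResult sketches the caller-side branch assumed by this commit.
func storeResult(ctx context.Context, db store, tx *sqlx.Tx, s *Snapshot, skipIdentical bool) error {
	if skipIdentical {
		prev, err := db.GetLatestSnapshot(ctx, tx, s.URL)
		if err == nil && prev != nil && bytes.Equal(prev.Body, s.Body) {
			// Content unchanged: record the attempt only, so last_crawled
			// moves forward without persisting the full body again.
			return db.RecordCrawlAttempt(ctx, tx, s)
		}
	}
	return db.SaveSnapshot(ctx, tx, s) // new or changed content
}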


@@ -41,6 +41,7 @@ type DbService interface {
 	// Snapshot methods
 	SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
 	OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
+	RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
 	GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url string) (*snapshot.Snapshot, error)
 	GetSnapshotAtTimestamp(ctx context.Context, tx *sqlx.Tx, url string, timestamp time.Time) (*snapshot.Snapshot, error)
 	GetAllSnapshotsForURL(ctx context.Context, tx *sqlx.Tx, url string) ([]*snapshot.Snapshot, error)
@@ -387,6 +388,7 @@ func (d *DbServiceImpl) SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapsh
 	// Always ensure we have a current timestamp
 	s.Timestamp = null.TimeFrom(time.Now())
+	// last_crawled will be set automatically by database DEFAULT
 	// For PostgreSQL, use the global sqlx.NamedQueryContext function
 	// The SQL_INSERT_SNAPSHOT already has a RETURNING id clause
@@ -421,6 +423,31 @@ func (d *DbServiceImpl) OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *s
 	return d.SaveSnapshot(ctx, tx, s)
 }
+
+// RecordCrawlAttempt records a crawl attempt without saving full content (when content is identical)
+func (d *DbServiceImpl) RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
+	dbCtx := contextutil.ContextWithComponent(ctx, "database")
+	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Recording crawl attempt for URL %s", s.URL.String())
+	// Check if the context is cancelled before proceeding
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+	// Record the crawl attempt with minimal data
+	// timestamp and last_crawled will be set automatically by database DEFAULT
+	_, err := tx.ExecContext(ctx, SQL_RECORD_CRAWL_ATTEMPT,
+		s.URL.String(),
+		s.Host,
+		s.MimeType.String,
+		s.ResponseCode.ValueOrZero(),
+		s.Error.String)
+	if err != nil {
+		return xerrors.NewError(fmt.Errorf("cannot record crawl attempt for URL %s: %w", s.URL.String(), err), 0, "", true)
+	}
+	return nil
+}
 
 // GetLatestSnapshot gets the latest snapshot with context
 func (d *DbServiceImpl) GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url string) (*snapshot.Snapshot, error) {
 	dbCtx := contextutil.ContextWithComponent(ctx, "database")
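The corresponding change to fetchSnapshotsFromHistory() lives in one of the files not shown here. The likely shape of that fix, sketched below under that assumption, is to decide whether a URL is due for a recrawl from MAX(last_crawled) rather than from the timestamp of the newest content snapshot, which stopped advancing once identical content was being skipped. The function name, interval parameter, and query are hypothetical; only the snapshots table and the last_crawled column come from the diffs.

package db

import (
	"context"
	"database/sql"
	"time"

	"github.com/jmoiron/sqlx"
)

// sqlLastCrawled is an assumed query: the newest crawl attempt for a URL,
// whether or not that attempt produced a new content snapshot.
const sqlLastCrawled = `
SELECT MAX(last_crawled) FROM snapshots WHERE url = $1
`

// isDueForRecrawl reports whether the URL has not been crawled within the
// given interval. Because RecordCrawlAttempt rows also carry last_crawled,
// an unchanged page no longer looks permanently overdue.
func isDueForRecrawl(ctx context.Context, tx *sqlx.Tx, url string, interval time.Duration) (bool, error) {
	var last sql.NullTime
	if err := tx.GetContext(ctx, &last, sqlLastCrawled, url); err != nil {
		return false, err
	}
	if !last.Valid {
		return true, nil // never crawled before
	}
	return time.Since(last.Time) >= interval, nil
}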


@@ -94,7 +94,8 @@ links = :links,
 lang = :lang,
 response_code = :response_code,
 error = :error,
-header = :header
+header = :header,
+last_crawled = CURRENT_TIMESTAMP
 WHERE id = :id
 RETURNING id
 `
@@ -139,4 +140,9 @@ RETURNING id
 AND timestamp BETWEEN $2 AND $3
 ORDER BY timestamp DESC
 `
+	// New query to record crawl attempt when content is identical (no new snapshot needed)
+	SQL_RECORD_CRAWL_ATTEMPT = `
+INSERT INTO snapshots (url, host, mimetype, response_code, error)
+VALUES ($1, $2, $3, $4, $5)
+`
 )
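The INSERT above omits both timestamp and last_crawled, so it relies on column DEFAULTs; the migration that adds the new column is among the files not shown here. A rough sketch of what it presumably looks like, written in the same const style as the queries above, follows; the column type, the IF NOT EXISTS guard, and the constant name are assumptions.

package db

// Hypothetical migration (not part of the shown diffs). The DEFAULT is what
// lets SaveSnapshot and RecordCrawlAttempt omit last_crawled entirely.
const SQL_ADD_LAST_CRAWLED = `
ALTER TABLE snapshots
ADD COLUMN IF NOT EXISTS last_crawled TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP
`

On PostgreSQL, ADD COLUMN with a DEFAULT fills existing rows with that default, which makes every old snapshot look freshly crawled at migration time; depending on the desired recrawl behaviour, an explicit backfill from the existing timestamp column might be preferable.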