Fix snapshot overwrite logic to preserve successful responses
- Prevent overwriting snapshots that have valid response codes
- Ensure URL is removed from queue when snapshot update is skipped
- Add last_crawled timestamp tracking for better crawl scheduling
- Remove SkipIdenticalContent flag, simplify content deduplication logic
- Update database schema with last_crawled column and indexes

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
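Note: the response-code guard described in the first bullet lives in the overwrite path, which the hunks below only partially show. As a rough illustration, a minimal Go sketch of the policy, assuming Gemini-style success codes (2x, i.e. 20-29, as used elsewhere in this commit) and a hypothetical helper name:

// Sketch only: shouldPreserveExisting is a hypothetical helper, not part of
// this commit's diff. It captures the rule "don't overwrite a snapshot that
// already has a valid response code".
func shouldPreserveExisting(existing *snapshot.Snapshot) bool {
    if existing == nil {
        return false
    }
    // Gemini success codes are 2x (20-29).
    code := existing.ResponseCode.ValueOrZero()
    return code >= 20 && code <= 29
}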
db/db.go (47 lines changed)
@@ -41,7 +41,7 @@ type DbService interface {
 	// Snapshot methods
 	SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
 	OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
-	RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
+	UpdateLastCrawled(ctx context.Context, tx *sqlx.Tx, url string) error
 	GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url string) (*snapshot.Snapshot, error)
 	GetSnapshotAtTimestamp(ctx context.Context, tx *sqlx.Tx, url string, timestamp time.Time) (*snapshot.Snapshot, error)
 	GetAllSnapshotsForURL(ctx context.Context, tx *sqlx.Tx, url string) ([]*snapshot.Snapshot, error)
@@ -374,21 +374,10 @@ func (d *DbServiceImpl) SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapsh
 		return err
 	}
 
-	// Check if we should skip storing identical content
-	if config.CONFIG.SkipIdenticalContent {
-		// Use the context-aware version to check for identical content
-		identical, err := d.IsContentIdentical(ctx, tx, s)
-		if err != nil {
-			return err
-		} else if identical {
-			contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Skipping URL with identical content to existing snapshot: %s", s.URL.String())
-			return nil
-		}
-	}
-
-	// Always ensure we have a current timestamp
-	s.Timestamp = null.TimeFrom(time.Now())
-	// last_crawled will be set automatically by database DEFAULT
+	// Always ensure we have current timestamps
+	currentTime := time.Now()
+	s.Timestamp = null.TimeFrom(currentTime)
+	s.LastCrawled = null.TimeFrom(currentTime)
 
 	// For PostgreSQL, use the global sqlx.NamedQueryContext function
 	// The SQL_INSERT_SNAPSHOT already has a RETURNING id clause
@@ -423,26 +412,20 @@ func (d *DbServiceImpl) OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *s
 	return d.SaveSnapshot(ctx, tx, s)
 }
 
-// RecordCrawlAttempt records a crawl attempt without saving full content (when content is identical)
-func (d *DbServiceImpl) RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
+// UpdateLastCrawled updates the last_crawled timestamp for the most recent snapshot of a URL
+func (d *DbServiceImpl) UpdateLastCrawled(ctx context.Context, tx *sqlx.Tx, url string) error {
 	dbCtx := contextutil.ContextWithComponent(ctx, "database")
-	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Recording crawl attempt for URL %s", s.URL.String())
+	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Updating last_crawled timestamp for URL %s", url)
 
 	// Check if the context is cancelled before proceeding
 	if err := ctx.Err(); err != nil {
 		return err
 	}
 
-	// Record the crawl attempt with minimal data
-	// timestamp and last_crawled will be set automatically by database DEFAULT
-	_, err := tx.ExecContext(ctx, SQL_RECORD_CRAWL_ATTEMPT,
-		s.URL.String(),
-		s.Host,
-		s.MimeType.String,
-		s.ResponseCode.ValueOrZero(),
-		s.Error.String)
+	// Update the last_crawled timestamp for the most recent snapshot
+	_, err := tx.ExecContext(ctx, SQL_UPDATE_LAST_CRAWLED, url)
 	if err != nil {
-		return xerrors.NewError(fmt.Errorf("cannot record crawl attempt for URL %s: %w", s.URL.String(), err), 0, "", true)
+		return xerrors.NewError(fmt.Errorf("cannot update last_crawled for URL %s: %w", url, err), 0, "", true)
 	}
 
 	return nil
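Callers that previously used RecordCrawlAttempt would now call UpdateLastCrawled instead. A hedged usage sketch; the transaction plumbing and the db/svc variable names are assumptions, not part of this diff:

// Hedged sketch, not from the diff: record a crawl attempt by bumping
// last_crawled on the newest snapshot instead of inserting a stub row.
tx, err := db.Beginx() // db: assumed *sqlx.DB handle
if err != nil {
    return err
}
defer tx.Rollback()
if err := svc.UpdateLastCrawled(ctx, tx, "gemini://example.org/"); err != nil {
    return err
}
return tx.Commit()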
@@ -541,14 +524,6 @@ func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *
 		return false, err
 	}
 
-	// Update: Skipped this because empty pages can be valid
-	// ex. pages with redirect headers
-	// Only check for identical content if we have gemtext or data
-	//if (!s.GemText.Valid || s.GemText.String == "") &&
-	//	(!s.Data.Valid || len(s.Data.V) == 0) {
-	//	return false, nil
-	//}
-
 	// Try to get the latest snapshot for this URL
 	latestSnapshot := &snapshot.Snapshot{}
 	err := tx.GetContext(ctx, latestSnapshot, SQL_GET_LATEST_SNAPSHOT, s.URL.String())
@@ -67,38 +67,10 @@ LIMIT $1
 `
 	// New query - always insert a new snapshot without conflict handling
 	SQL_INSERT_SNAPSHOT = `
-INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header)
-VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header)
+INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header, last_crawled)
+VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header, :last_crawled)
 RETURNING id
 `
-	// Keep for backward compatibility, but should be phased out
-	SQL_INSERT_SNAPSHOT_IF_NEW = `
-INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header)
-VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header)
-ON CONFLICT DO NOTHING
-`
-	// Update to match the SQL_INSERT_SNAPSHOT - we no longer want to upsert, just insert new versions
-	SQL_UPSERT_SNAPSHOT = `
-INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header)
-VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header)
-RETURNING id
-`
-	SQL_UPDATE_SNAPSHOT = `UPDATE snapshots
-SET url = :url,
-	host = :host,
-	timestamp = :timestamp,
-	mimetype = :mimetype,
-	data = :data,
-	gemtext = :gemtext,
-	links = :links,
-	lang = :lang,
-	response_code = :response_code,
-	error = :error,
-	header = :header,
-	last_crawled = CURRENT_TIMESTAMP
-WHERE id = :id
-RETURNING id
-`
 	SQL_INSERT_URL = `
 INSERT INTO urls (url, host, timestamp)
 VALUES (:url, :host, :timestamp)
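The "last_crawled column and indexes" schema change mentioned in the commit message is not part of the hunks shown here. A speculative sketch in the document's own const style; the DEFAULT and the index names are assumptions (the removed SQL_UPDATE_SNAPSHOT and the old comment "last_crawled will be set automatically by database DEFAULT" suggest a CURRENT_TIMESTAMP default):

	// Hypothetical migration, not shown in this diff; names and defaults are assumed.
	SQL_MIGRATE_ADD_LAST_CRAWLED = `
ALTER TABLE snapshots ADD COLUMN IF NOT EXISTS last_crawled TIMESTAMP DEFAULT CURRENT_TIMESTAMP;
CREATE INDEX IF NOT EXISTS idx_snapshots_last_crawled ON snapshots (last_crawled);
CREATE INDEX IF NOT EXISTS idx_snapshots_url_timestamp ON snapshots (url, timestamp DESC);
`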
@@ -115,7 +87,6 @@ RETURNING id
 	SQL_DELETE_URL = `
 DELETE FROM urls WHERE url=$1
 `
-	// New queries for retrieving snapshots
 	SQL_GET_LATEST_SNAPSHOT = `
 SELECT * FROM snapshots
 WHERE url = $1
@@ -140,9 +111,65 @@ RETURNING id
 AND timestamp BETWEEN $2 AND $3
 ORDER BY timestamp DESC
 `
-	// New query to record crawl attempt when content is identical (no new snapshot needed)
-	SQL_RECORD_CRAWL_ATTEMPT = `
-INSERT INTO snapshots (url, host, mimetype, response_code, error)
-VALUES ($1, $2, $3, $4, $5)
+	// Update last_crawled timestamp for the most recent snapshot of a URL
+	SQL_UPDATE_LAST_CRAWLED = `
+UPDATE snapshots
+SET last_crawled = CURRENT_TIMESTAMP
+WHERE id = (
+	SELECT id FROM snapshots
+	WHERE url = $1
+	ORDER BY timestamp DESC
+	LIMIT 1
+)
 `
+	// SQL_FETCH_SNAPSHOTS_FROM_HISTORY fetches URLs from snapshots for re-crawling based on last_crawled timestamp.
+	// This query finds root domain URLs that haven't been crawled recently and selects
+	// one URL per host for diversity. Uses CTEs to:
+	// 1. Find the latest crawl attempt per URL (via MAX(last_crawled))
+	// 2. Filter to URLs with actual content and successful responses (20-29)
+	// 3. Select URLs where the latest crawl is older than the cutoff date
+	// 4. Rank randomly within each host and pick one URL per host
+	// Parameters: $1 = cutoff_date, $2 = limit
+	SQL_FETCH_SNAPSHOTS_FROM_HISTORY = `
+WITH latest_attempts AS (
+	SELECT
+		url,
+		host,
+		COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt
+	FROM snapshots
+	WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini'
+	GROUP BY url, host
+),
+root_urls_with_content AS (
+	SELECT DISTINCT
+		la.url,
+		la.host,
+		la.latest_attempt
+	FROM latest_attempts la
+	JOIN snapshots s ON s.url = la.url
+	WHERE (s.gemtext IS NOT NULL OR s.data IS NOT NULL)
+		AND s.response_code BETWEEN 20 AND 29
+),
+eligible_urls AS (
+	SELECT
+		url,
+		host,
+		latest_attempt
+	FROM root_urls_with_content
+	WHERE latest_attempt < $1
+),
+ranked_urls AS (
+	SELECT
+		url,
+		host,
+		latest_attempt,
+		ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rank
+	FROM eligible_urls
+)
+SELECT url, host
+FROM ranked_urls
+WHERE rank = 1
+ORDER BY RANDOM()
+LIMIT $2
+`
 )
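For completeness, a hedged sketch of how a scheduler might consume SQL_FETCH_SNAPSHOTS_FROM_HISTORY; the 30-day cutoff, the limit of 100, and enqueueForCrawl are assumptions, not part of this commit:

// Hedged sketch, not from the diff: re-queue root URLs whose latest crawl
// attempt is older than the cutoff, at most one per host.
cutoff := time.Now().AddDate(0, 0, -30) // assumed 30-day re-crawl window
rows, err := tx.QueryxContext(ctx, SQL_FETCH_SNAPSHOTS_FROM_HISTORY, cutoff, 100)
if err != nil {
    return err
}
defer rows.Close()
for rows.Next() {
    var url, host string
    if err := rows.Scan(&url, &host); err != nil {
        return err
    }
    enqueueForCrawl(url, host) // hypothetical queueing helper
}
return rows.Err()

Targeting the row to update via a `WHERE id = (SELECT ... ORDER BY timestamp DESC LIMIT 1)` subquery keeps SQL_UPDATE_LAST_CRAWLED to a single statement while guaranteeing only the newest snapshot for the URL is touched.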