Fix snapshot overwrite logic to preserve successful responses

- Prevent overwriting snapshots that have valid response codes
- Ensure URL is removed from queue when snapshot update is skipped
- Add last_crawled timestamp tracking for better crawl scheduling
- Remove SkipIdenticalContent flag, simplify content deduplication logic
- Update database schema with last_crawled column and indexes

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
antanst
2025-06-18 11:23:56 +03:00
parent e9d7fa85ff
commit ada6cda4ac
8 changed files with 284 additions and 242 deletions

View File

@@ -26,7 +26,8 @@ CREATE TABLE snapshots (
lang TEXT,
response_code INTEGER,
error TEXT,
header TEXT
header TEXT,
last_crawled TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE UNIQUE INDEX idx_url_timestamp ON snapshots (url, timestamp);
@@ -40,4 +41,6 @@ CREATE INDEX idx_host ON snapshots (host);
-- Secondary indexes on snapshots.

-- Composite index for lookups that filter on response_code and error together.
CREATE INDEX idx_response_code_error ON snapshots (response_code, error);
-- Partial index covering only rows where both response_code and error are NULL.
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;
-- Partial index on host over the same NULL/NULL rows.
-- NOTE(review): the name implies these rows are snapshots not yet processed; confirm against the crawler code.
CREATE INDEX idx_snapshots_unprocessed ON snapshots (host) WHERE response_code IS NULL AND error IS NULL;
-- Per-url index ordered newest-first by timestamp.
-- Must be declared exactly once: a second CREATE INDEX with the same name aborts the script.
CREATE INDEX idx_url_latest ON snapshots (url, timestamp DESC);
-- Index on last_crawled alone, for queries that range or sort on that column across all urls.
CREATE INDEX idx_last_crawled ON snapshots (last_crawled);
-- Per-url index ordered by most recent last_crawled first.
CREATE INDEX idx_url_last_crawled ON snapshots (url, last_crawled DESC);