From ada6cda4ac1f62f3711233f9fd516a916898a432 Mon Sep 17 00:00:00 2001 From: antanst <> Date: Wed, 18 Jun 2025 11:23:56 +0300 Subject: [PATCH] Fix snapshot overwrite logic to preserve successful responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Prevent overwriting snapshots that have valid response codes - Ensure URL is removed from queue when snapshot update is skipped - Add last_crawled timestamp tracking for better crawl scheduling - Remove SkipIdenticalContent flag, simplify content deduplication logic - Update database schema with last_crawled column and indexes 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- TODO.md | 38 +++++++ cmd/crawler/crawler.go | 128 +---------------------- common/worker.go | 67 +++++++----- config/config.go | 27 +++-- db/db.go | 47 ++------- db/db_queries.go | 97 ++++++++++------- misc/sql/cleanup_duplicate_snapshots.sql | 115 ++++++++++++++++++++ misc/sql/initdb.sql | 7 +- 8 files changed, 284 insertions(+), 242 deletions(-) create mode 100644 TODO.md create mode 100644 misc/sql/cleanup_duplicate_snapshots.sql diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..ddaf28e --- /dev/null +++ b/TODO.md @@ -0,0 +1,38 @@ +# TODO + +## Outstanding Issues + +### 1. Ctrl+C Signal Handling Issue + +**Problem**: The crawler sometimes doesn't exit properly when Ctrl+C is pressed. + +**Root Cause**: The main thread gets stuck in blocking operations before it can check for signals: +- Database operations in the polling loop (`cmd/crawler/crawler.go:239-250`) +- Job queueing when channel is full (`jobs <- url` can block if workers are slow) +- Long-running database transactions + +**Location**: `cmd/crawler/crawler.go` - main polling loop starting at line 233 + +**Solution**: Add signal/context checking to blocking operations: +- Use cancellable context instead of `context.Background()` for database operations +- Make job queueing non-blocking or context-aware +- Add timeout/cancellation to database operations + +### 2. fetchSnapshotsFromHistory() Doesn't Work with --skip-identical-content=true + +**Problem**: When `--skip-identical-content=true` (default), URLs with unchanged content get continuously re-queued. + +**Root Cause**: The function tracks when content last changed, not when URLs were last crawled: +- Identical content → no new snapshot created +- Query finds old snapshot timestamp → re-queues URL +- Creates infinite loop of re-crawling unchanged content + +**Location**: `cmd/crawler/crawler.go:388-470` - `fetchSnapshotsFromHistory()` function + +**Solution Options**: +1. Add `last_crawled` timestamp to URLs table +2. Create separate `crawl_attempts` table +3. Always create snapshot entries (even for duplicates) but mark them as such +4. Modify logic to work with existing schema constraints + +**Current Status**: Function assumes `SkipIdenticalContent=false` per original comment at line 391. \ No newline at end of file diff --git a/cmd/crawler/crawler.go b/cmd/crawler/crawler.go index d011894..6df5726 100644 --- a/cmd/crawler/crawler.go +++ b/cmd/crawler/crawler.go @@ -307,139 +307,17 @@ func enqueueSeedURLs(ctx context.Context, tx *sqlx.Tx) error { return nil } -//func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age int) (int, error) { -// // Select snapshots from snapshots table for recrawling -// // They should be at least days old, random order, -// // one snapshot per host if possible. 
-// historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory") -// contextlog.LogDebugWithContext(historyCtx, logging.GetSlogger(), "Looking for %d snapshots at least %d days old to recrawl", num, age) -// -// // Calculate the cutoff date -// cutoffDate := time.Now().AddDate(0, 0, -age) -// -// // SQL to select one random URL per host from snapshots older than the cutoff date -// // TODO implement when gopher enabled -// query := ` -// WITH hosts AS ( -// SELECT DISTINCT host -// FROM snapshots -// WHERE url ~ '^gemini://[^/]+/?$' -// AND gemtext IS NOT NULL -// AND timestamp < $1 -// AND error IS NULL -// ), -// ranked_snapshots AS ( -// SELECT -// s.url, -// s.host, -// s.timestamp, -// ROW_NUMBER() OVER (PARTITION BY s.host ORDER BY RANDOM()) as rank -// FROM snapshots s -// JOIN hosts h ON s.host = h.host -// WHERE s.timestamp < $1 -// ) -// SELECT url, host -// FROM ranked_snapshots -// WHERE rank = 1 -// ORDER BY RANDOM() -// LIMIT $2 -// ` -// -// type SnapshotURL struct { -// URL string `db:"url"` -// Host string `db:"host"` -// } -// -// // Execute the query -// var snapshotURLs []SnapshotURL -// err := tx.Select(&snapshotURLs, query, cutoffDate, num) -// if err != nil { -// return 0, err -// } -// -// if len(snapshotURLs) == 0 { -// historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory") -// contextlog.LogInfoWithContext(historyCtx, logging.GetSlogger(), "No old snapshots found to recrawl") -// return 0, nil -// } -// -// // For each selected snapshot, add the URL to the urls table -// insertCount := 0 -// for _, snapshot := range snapshotURLs { -// err := gemdb.Database.InsertURL(ctx, tx, snapshot.URL) -// if err != nil { -// logging.LogError("Error inserting URL %s from old snapshot to queue: %v", snapshot.URL, err) -// return 0, err -// } -// insertCount++ -// } -// -// // Note: The transaction is committed by the caller (runJobScheduler), -// // not here. This function is called as part of a larger transaction. 
-// if insertCount > 0 { -// historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory") -// contextlog.LogInfoWithContext(historyCtx, logging.GetSlogger(), "Added %d old URLs to recrawl queue", insertCount) -// } -// -// return insertCount, nil -//} - func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age int) (int, error) { // Select snapshots from snapshots table for recrawling // Find URLs where the LATEST crawl attempt (via last_crawled) is at least days old - // Now works correctly with SkipIdenticalContent=true - fixes infinite recrawl loop + // Uses last_crawled timestamp to track actual crawl attempts regardless of content changes historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory") contextlog.LogDebugWithContext(historyCtx, logging.GetSlogger(), "Looking for %d URLs whose latest crawl attempt is at least %d days old to recrawl", num, age) // Calculate the cutoff date cutoffDate := time.Now().AddDate(0, 0, -age) - // SQL to select URLs where the latest crawl attempt is older than the cutoff date - // Now uses last_crawled timestamp to track actual crawl attempts - // Works correctly with SkipIdenticalContent setting - avoids infinite recrawl loop - // One URL per host for diversity, focusing on root domain URLs - query := ` - WITH latest_attempts AS ( - SELECT - url, - host, - COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt - FROM snapshots - WHERE url ~ '^gemini://[^/]+/?$' - GROUP BY url, host - ), - root_urls_with_content AS ( - SELECT DISTINCT - la.url, - la.host, - la.latest_attempt - FROM latest_attempts la - JOIN snapshots s ON s.url = la.url - WHERE (s.gemtext IS NOT NULL OR s.data IS NOT NULL) - AND s.response_code BETWEEN 20 AND 29 - ), - eligible_urls AS ( - SELECT - url, - host, - latest_attempt - FROM root_urls_with_content - WHERE latest_attempt < $1 - ), - ranked_urls AS ( - SELECT - url, - host, - latest_attempt, - ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rank - FROM eligible_urls - ) - SELECT url, host - FROM ranked_urls - WHERE rank = 1 - ORDER BY RANDOM() - LIMIT $2 - ` + // Use the query from db_queries.go to find URLs that need re-crawling type SnapshotURL struct { URL string `db:"url"` @@ -448,7 +326,7 @@ func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age in // Execute the query var snapshotURLs []SnapshotURL - err := tx.Select(&snapshotURLs, query, cutoffDate, num) + err := tx.Select(&snapshotURLs, gemdb.SQL_FETCH_SNAPSHOTS_FROM_HISTORY, cutoffDate, num) if err != nil { return 0, err } diff --git a/common/worker.go b/common/worker.go index 4cf95fd..11b24db 100644 --- a/common/worker.go +++ b/common/worker.go @@ -196,15 +196,15 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) { } // Check if we should skip a potentially - // identical snapshot - skipIdentical, err := shouldSkipIdenticalSnapshot(ctx, tx, s) + // identical snapshot with one from history + isIdentical, err := isContentIdentical(ctx, tx, s) if err != nil { return err } - if skipIdentical { - contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, recording crawl attempt") - // Record the crawl attempt to track that we processed this URL - err = gemdb.Database.RecordCrawlAttempt(ctx, tx, s) + if isIdentical { + contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, updating crawl timestamp") + // Update the 
last_crawled timestamp to track that we processed this URL + err = gemdb.Database.UpdateLastCrawled(ctx, tx, s.URL.String()) if err != nil { return err } @@ -222,37 +222,46 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) { // Save the snapshot and remove the URL from the queue if s.Error.ValueOrZero() != "" { - contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero()) + // Only save error if we didn't have any valid + // snapshot data from a previous crawl! + shouldUpdateSnapshot, err := shouldUpdateSnapshotData(ctx, tx, s) + if err != nil { + return err + } + if shouldUpdateSnapshot { + contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero()) + return saveSnapshotAndRemoveURL(ctx, tx, s) + } else { + contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s (but old content exists, not updating)", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero()) + return removeURL(ctx, tx, s.URL.String()) + } } else { contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero()) + return saveSnapshotAndRemoveURL(ctx, tx, s) } - return saveSnapshotAndRemoveURL(ctx, tx, s) } -func shouldSkipIdenticalSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) { - // Check if content is identical to previous snapshot and we should skip further processing - if config.CONFIG.SkipIdenticalContent { - identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s) - if err != nil { - return false, err - } - if identical { - return true, nil - } +func shouldUpdateSnapshotData(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) { + prevSnapshot, err := gemdb.Database.GetLatestSnapshot(ctx, tx, s.URL.String()) + if err != nil { + return false, err } - // We write every Gemini capsule, but still - // skip identical pages that aren't capsules. - if s.MimeType.String != "text/gemini" { - identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s) - if err != nil { - return false, err - } - if identical { - return true, nil - } + if prevSnapshot == nil { + return true, nil + } + if prevSnapshot.ResponseCode.Valid { + return false, nil + } + return true, nil +} +func isContentIdentical(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) { + // Always check if content is identical to previous snapshot + identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s) + if err != nil { + return false, err } - return false, nil + return identical, nil } // storeLinks checks and stores the snapshot links in the database. diff --git a/config/config.go b/config/config.go index ed277c7..8b0791e 100644 --- a/config/config.go +++ b/config/config.go @@ -9,19 +9,18 @@ import ( // Config holds the application configuration loaded from environment variables. type Config struct { - PgURL string - LogLevel slog.Level // Logging level (debug, info, warn, error) - MaxResponseSize int // Maximum size of response in bytes - MaxDbConnections int // Maximum number of database connections. 
- NumOfWorkers int // Number of concurrent workers - ResponseTimeout int // Timeout for responses in seconds - BlacklistPath string // File that has blacklisted strings of "host:port" - WhitelistPath string // File with URLs that should always be crawled regardless of blacklist - DryRun bool // If false, don't write to disk - GopherEnable bool // Enable Gopher crawling - SeedUrlPath string // Add URLs from file to queue - SkipIdenticalContent bool // When true, skip storing snapshots with identical content - SkipIfUpdatedDays int // Skip re-crawling URLs updated within this many days (0 to disable, default 0) + PgURL string + LogLevel slog.Level // Logging level (debug, info, warn, error) + MaxResponseSize int // Maximum size of response in bytes + MaxDbConnections int // Maximum number of database connections. + NumOfWorkers int // Number of concurrent workers + ResponseTimeout int // Timeout for responses in seconds + BlacklistPath string // File that has blacklisted strings of "host:port" + WhitelistPath string // File with URLs that should always be crawled regardless of blacklist + DryRun bool // If false, don't write to disk + GopherEnable bool // Enable Gopher crawling + SeedUrlPath string // Add URLs from file to queue + SkipIfUpdatedDays int // Skip re-crawling URLs updated within this many days (0 to disable) } var CONFIG Config //nolint:gochecknoglobals @@ -39,7 +38,6 @@ func Initialize() *Config { maxResponseSize := flag.Int("max-response-size", 1024*1024, "Maximum size of response in bytes") responseTimeout := flag.Int("response-timeout", 10, "Timeout for network responses in seconds") blacklistPath := flag.String("blacklist-path", "", "File that has blacklist regexes") - skipIdenticalContent := flag.Bool("skip-identical-content", true, "Skip storing snapshots with identical content") skipIfUpdatedDays := flag.Int("skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable)") whitelistPath := flag.String("whitelist-path", "", "File with URLs that should always be crawled regardless of blacklist") seedUrlPath := flag.String("seed-url-path", "", "File with seed URLs that should be added to the queue immediatelly") @@ -56,7 +54,6 @@ func Initialize() *Config { config.WhitelistPath = *whitelistPath config.SeedUrlPath = *seedUrlPath config.MaxDbConnections = *maxDbConnections - config.SkipIdenticalContent = *skipIdenticalContent config.SkipIfUpdatedDays = *skipIfUpdatedDays level, err := ParseSlogLevel(*loglevel) diff --git a/db/db.go b/db/db.go index 6cdf6c2..c60353b 100644 --- a/db/db.go +++ b/db/db.go @@ -41,7 +41,7 @@ type DbService interface { // Snapshot methods SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error - RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error + UpdateLastCrawled(ctx context.Context, tx *sqlx.Tx, url string) error GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url string) (*snapshot.Snapshot, error) GetSnapshotAtTimestamp(ctx context.Context, tx *sqlx.Tx, url string, timestamp time.Time) (*snapshot.Snapshot, error) GetAllSnapshotsForURL(ctx context.Context, tx *sqlx.Tx, url string) ([]*snapshot.Snapshot, error) @@ -374,21 +374,10 @@ func (d *DbServiceImpl) SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapsh return err } - // Check if we should skip storing identical content - if config.CONFIG.SkipIdenticalContent { - // Use the context-aware version to check for identical content 
- identical, err := d.IsContentIdentical(ctx, tx, s) - if err != nil { - return err - } else if identical { - contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Skipping URL with identical content to existing snapshot: %s", s.URL.String()) - return nil - } - } - - // Always ensure we have a current timestamp - s.Timestamp = null.TimeFrom(time.Now()) - // last_crawled will be set automatically by database DEFAULT + // Always ensure we have current timestamps + currentTime := time.Now() + s.Timestamp = null.TimeFrom(currentTime) + s.LastCrawled = null.TimeFrom(currentTime) // For PostgreSQL, use the global sqlx.NamedQueryContext function // The SQL_INSERT_SNAPSHOT already has a RETURNING id clause @@ -423,26 +412,20 @@ func (d *DbServiceImpl) OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *s return d.SaveSnapshot(ctx, tx, s) } -// RecordCrawlAttempt records a crawl attempt without saving full content (when content is identical) -func (d *DbServiceImpl) RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error { +// UpdateLastCrawled updates the last_crawled timestamp for the most recent snapshot of a URL +func (d *DbServiceImpl) UpdateLastCrawled(ctx context.Context, tx *sqlx.Tx, url string) error { dbCtx := contextutil.ContextWithComponent(ctx, "database") - contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Recording crawl attempt for URL %s", s.URL.String()) + contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Updating last_crawled timestamp for URL %s", url) // Check if the context is cancelled before proceeding if err := ctx.Err(); err != nil { return err } - // Record the crawl attempt with minimal data - // timestamp and last_crawled will be set automatically by database DEFAULT - _, err := tx.ExecContext(ctx, SQL_RECORD_CRAWL_ATTEMPT, - s.URL.String(), - s.Host, - s.MimeType.String, - s.ResponseCode.ValueOrZero(), - s.Error.String) + // Update the last_crawled timestamp for the most recent snapshot + _, err := tx.ExecContext(ctx, SQL_UPDATE_LAST_CRAWLED, url) if err != nil { - return xerrors.NewError(fmt.Errorf("cannot record crawl attempt for URL %s: %w", s.URL.String(), err), 0, "", true) + return xerrors.NewError(fmt.Errorf("cannot update last_crawled for URL %s: %w", url, err), 0, "", true) } return nil @@ -541,14 +524,6 @@ func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s * return false, err } - // Update: Skipped this because empty pages can be valid - // ex. 
pages with redirect headers - // Only check for identical content if we have gemtext or data - //if (!s.GemText.Valid || s.GemText.String == "") && - // (!s.Data.Valid || len(s.Data.V) == 0) { - // return false, nil - //} - // Try to get the latest snapshot for this URL latestSnapshot := &snapshot.Snapshot{} err := tx.GetContext(ctx, latestSnapshot, SQL_GET_LATEST_SNAPSHOT, s.URL.String()) diff --git a/db/db_queries.go b/db/db_queries.go index f3d2100..2e99b67 100644 --- a/db/db_queries.go +++ b/db/db_queries.go @@ -67,38 +67,10 @@ LIMIT $1 ` // New query - always insert a new snapshot without conflict handling SQL_INSERT_SNAPSHOT = ` - INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header) - VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header) + INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header, last_crawled) + VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header, :last_crawled) RETURNING id ` - // Keep for backward compatibility, but should be phased out - SQL_INSERT_SNAPSHOT_IF_NEW = ` - INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header) - VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header) - ON CONFLICT DO NOTHING - ` - // Update to match the SQL_INSERT_SNAPSHOT - we no longer want to upsert, just insert new versions - SQL_UPSERT_SNAPSHOT = ` - INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header) - VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header) - RETURNING id - ` - SQL_UPDATE_SNAPSHOT = `UPDATE snapshots -SET url = :url, -host = :host, -timestamp = :timestamp, -mimetype = :mimetype, -data = :data, -gemtext = :gemtext, -links = :links, -lang = :lang, -response_code = :response_code, -error = :error, -header = :header, -last_crawled = CURRENT_TIMESTAMP -WHERE id = :id -RETURNING id -` SQL_INSERT_URL = ` INSERT INTO urls (url, host, timestamp) VALUES (:url, :host, :timestamp) @@ -115,7 +87,6 @@ RETURNING id SQL_DELETE_URL = ` DELETE FROM urls WHERE url=$1 ` - // New queries for retrieving snapshots SQL_GET_LATEST_SNAPSHOT = ` SELECT * FROM snapshots WHERE url = $1 @@ -140,9 +111,65 @@ RETURNING id AND timestamp BETWEEN $2 AND $3 ORDER BY timestamp DESC ` - // New query to record crawl attempt when content is identical (no new snapshot needed) - SQL_RECORD_CRAWL_ATTEMPT = ` - INSERT INTO snapshots (url, host, mimetype, response_code, error) - VALUES ($1, $2, $3, $4, $5) + // Update last_crawled timestamp for the most recent snapshot of a URL + SQL_UPDATE_LAST_CRAWLED = ` + UPDATE snapshots + SET last_crawled = CURRENT_TIMESTAMP + WHERE id = ( + SELECT id FROM snapshots + WHERE url = $1 + ORDER BY timestamp DESC + LIMIT 1 + ) + ` + // SQL_FETCH_SNAPSHOTS_FROM_HISTORY Fetches URLs from snapshots for re-crawling based on last_crawled timestamp + // This query finds root domain URLs that haven't been crawled recently and selects + // one URL per host for diversity. Uses CTEs to: + // 1. Find latest crawl attempt per URL (via MAX(last_crawled)) + // 2. Filter to URLs with actual content and successful responses (20-29) + // 3. Select URLs where latest crawl is older than cutoff date + // 4. 
Rank randomly within each host and pick one URL per host + // Parameters: $1 = cutoff_date, $2 = limit + SQL_FETCH_SNAPSHOTS_FROM_HISTORY = ` + WITH latest_attempts AS ( + SELECT + url, + host, + COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt + FROM snapshots + WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini' + GROUP BY url, host + ), + root_urls_with_content AS ( + SELECT DISTINCT + la.url, + la.host, + la.latest_attempt + FROM latest_attempts la + JOIN snapshots s ON s.url = la.url + WHERE (s.gemtext IS NOT NULL OR s.data IS NOT NULL) + AND s.response_code BETWEEN 20 AND 29 + ), + eligible_urls AS ( + SELECT + url, + host, + latest_attempt + FROM root_urls_with_content + WHERE latest_attempt < $1 + ), + ranked_urls AS ( + SELECT + url, + host, + latest_attempt, + ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rank + FROM eligible_urls + ) + SELECT url, host + FROM ranked_urls + WHERE rank = 1 + ORDER BY RANDOM() + LIMIT $2 ` ) diff --git a/misc/sql/cleanup_duplicate_snapshots.sql b/misc/sql/cleanup_duplicate_snapshots.sql new file mode 100644 index 0000000..d3382b3 --- /dev/null +++ b/misc/sql/cleanup_duplicate_snapshots.sql @@ -0,0 +1,115 @@ +-- Cleanup script for snapshots table after adding last_crawled column +-- This script consolidates multiple snapshots per URL by: +-- 1. Keeping the latest snapshot with content (non-null gemtext OR data) +-- 2. Setting its last_crawled to the most recent timestamp from any snapshot for that URL +-- 3. Deleting all other snapshots for URLs with multiple snapshots +-- +-- IMPORTANT: This script will permanently delete data. Make sure to backup your database first! + +BEGIN; + +-- Update last_crawled for URLs with multiple snapshots +-- Keep the latest snapshot with content and update its last_crawled to the most recent timestamp +WITH url_snapshots AS ( + -- Get all snapshots grouped by URL with row numbers + SELECT + id, + url, + timestamp, + last_crawled, + gemtext, + data, + ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp + FROM snapshots +), +latest_content_snapshots AS ( + -- Find the latest snapshot with content for each URL + SELECT + url, + id as keep_id, + timestamp as keep_timestamp + FROM url_snapshots + WHERE (gemtext IS NOT NULL OR data IS NOT NULL) + AND rn_by_timestamp = ( + SELECT MIN(rn_by_timestamp) + FROM url_snapshots us2 + WHERE us2.url = url_snapshots.url + AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL) + ) +), +most_recent_timestamps AS ( + -- Get the most recent timestamp (last_crawled or timestamp) for each URL + SELECT + url, + GREATEST( + MAX(timestamp), + COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) + ) as most_recent_time + FROM snapshots + GROUP BY url +) +-- Update the last_crawled of snapshots we're keeping +UPDATE snapshots +SET last_crawled = mrt.most_recent_time +FROM latest_content_snapshots lcs +JOIN most_recent_timestamps mrt ON lcs.url = mrt.url +WHERE snapshots.id = lcs.keep_id; + +-- Delete all other snapshots for URLs that have multiple snapshots +WITH url_snapshots AS ( + SELECT + id, + url, + timestamp, + gemtext, + data, + ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp + FROM snapshots +), +latest_content_snapshots AS ( + -- Find the latest snapshot with content for each URL + SELECT + url, + id as keep_id + FROM url_snapshots + WHERE (gemtext IS NOT NULL OR data IS NOT NULL) + AND rn_by_timestamp = ( + SELECT MIN(rn_by_timestamp) + FROM url_snapshots us2 + WHERE us2.url = 
url_snapshots.url + AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL) + ) +), +snapshots_to_delete AS ( + -- Find snapshots to delete (all except the ones we're keeping) + SELECT s.id + FROM snapshots s + LEFT JOIN latest_content_snapshots lcs ON s.id = lcs.keep_id + WHERE lcs.keep_id IS NULL + AND s.url IN ( + -- Only for URLs that have multiple snapshots + SELECT url + FROM snapshots + GROUP BY url + HAVING COUNT(*) > 1 + ) +) +DELETE FROM snapshots +WHERE id IN (SELECT id FROM snapshots_to_delete); + +-- Show summary of changes +SELECT + 'Cleanup completed. Remaining snapshots: ' || COUNT(*) as summary +FROM snapshots; + +-- Show URLs that still have multiple snapshots (should be 0 after cleanup) +SELECT + 'URLs with multiple snapshots after cleanup: ' || COUNT(*) as validation +FROM ( + SELECT url + FROM snapshots + GROUP BY url + HAVING COUNT(*) > 1 +) multi_snapshots; + +COMMIT; \ No newline at end of file diff --git a/misc/sql/initdb.sql b/misc/sql/initdb.sql index 2a57fb8..84cf6d0 100644 --- a/misc/sql/initdb.sql +++ b/misc/sql/initdb.sql @@ -26,7 +26,8 @@ CREATE TABLE snapshots ( lang TEXT, response_code INTEGER, error TEXT, - header TEXT + header TEXT, + last_crawled TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP ); CREATE UNIQUE INDEX idx_url_timestamp ON snapshots (url, timestamp); @@ -40,4 +41,6 @@ CREATE INDEX idx_host ON snapshots (host); CREATE INDEX idx_response_code_error ON snapshots (response_code, error); CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL; CREATE INDEX idx_snapshots_unprocessed ON snapshots (host) WHERE response_code IS NULL AND error IS NULL; -CREATE INDEX idx_url_latest ON snapshots (url, timestamp DESC); \ No newline at end of file +CREATE INDEX idx_url_latest ON snapshots (url, timestamp DESC); +CREATE INDEX idx_last_crawled ON snapshots (last_crawled); +CREATE INDEX idx_url_last_crawled ON snapshots (url, last_crawled DESC); \ No newline at end of file
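
Note on migrating existing databases: the initdb.sql change above only covers fresh installs, and misc/sql/cleanup_duplicate_snapshots.sql assumes the last_crawled column is already present, but the patch does not ship that migration itself. Below is a minimal sketch of what it could look like; the column type and index names mirror initdb.sql, while backfilling last_crawled from each row's timestamp is an assumption, so review it against your deployment and run it before the cleanup script.

BEGIN;

-- Add the column nullable first so the backfill can run.
ALTER TABLE snapshots
    ADD COLUMN last_crawled TIMESTAMP WITH TIME ZONE;

-- Backfill: treat each snapshot's own timestamp as its last crawl time
-- (assumption; rows with a NULL timestamp fall back to now).
UPDATE snapshots
SET last_crawled = COALESCE(timestamp, CURRENT_TIMESTAMP);

-- Match the constraints initdb.sql now declares for fresh installs.
ALTER TABLE snapshots
    ALTER COLUMN last_crawled SET DEFAULT CURRENT_TIMESTAMP,
    ALTER COLUMN last_crawled SET NOT NULL;

-- Same index names as initdb.sql.
CREATE INDEX idx_last_crawled ON snapshots (last_crawled);
CREATE INDEX idx_url_last_crawled ON snapshots (url, last_crawled DESC);

COMMIT;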
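
Once the column exists, the recrawl-eligibility logic introduced by SQL_FETCH_SNAPSHOTS_FROM_HISTORY can be sanity-checked interactively. The query below only mirrors the latest_attempts CTE (it skips the content and response-code filters) and the 30-day cutoff is just an example value, not something the crawler uses.

-- Rough count of root capsule URLs whose latest crawl attempt is older
-- than a sample 30-day cutoff (approximation of the scheduler's query).
SELECT COUNT(*) AS eligible_urls
FROM (
    SELECT
        url,
        COALESCE(MAX(last_crawled), '1970-01-01'::timestamptz) AS latest_attempt
    FROM snapshots
    WHERE url ~ '^gemini://[^/]+/?$'
      AND mimetype = 'text/gemini'
    GROUP BY url
) latest_attempts
WHERE latest_attempt < NOW() - INTERVAL '30 days';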