From 349968d01929c935bf795f300e69bfeb69ee3adb Mon Sep 17 00:00:00 2001 From: antanst <> Date: Wed, 18 Jun 2025 11:56:26 +0300 Subject: [PATCH] Improve error handling and add duplicate snapshot cleanup --- common/worker.go | 13 ++- db/db.go | 2 +- misc/sql/cleanup_duplicate_snapshots.sql | 134 ++++------------------- 3 files changed, 28 insertions(+), 121 deletions(-) diff --git a/common/worker.go b/common/worker.go index 11b24db..4f6a331 100644 --- a/common/worker.go +++ b/common/worker.go @@ -76,7 +76,13 @@ func RunWorkerWithTx(workerID int, job string) { return } - panic(err) // We shouldn't reach this point! + contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err) + rollbackErr := gemdb.SafeRollback(ctx, tx) + if rollbackErr != nil { + FatalErrorsChan <- rollbackErr + return + } + return } err = tx.Commit() @@ -94,10 +100,7 @@ func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error { for _, u := range urls { err := WorkOnUrl(ctx, tx, u) if err != nil { - if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) || xerrors.IsFatal(err) { - return err - } - contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err) + return err } } return nil diff --git a/db/db.go b/db/db.go index c60353b..9607184 100644 --- a/db/db.go +++ b/db/db.go @@ -446,7 +446,7 @@ func (d *DbServiceImpl) GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url err := tx.GetContext(ctx, s, SQL_GET_LATEST_SNAPSHOT, url) if err != nil { if errors.Is(err, sql.ErrNoRows) { - return nil, xerrors.NewError(fmt.Errorf("no snapshot found for URL %s", url), 0, "", false) + return nil, nil } return nil, xerrors.NewError(fmt.Errorf("cannot get latest snapshot for URL %s: %w", url, err), 0, "", false) } diff --git a/misc/sql/cleanup_duplicate_snapshots.sql b/misc/sql/cleanup_duplicate_snapshots.sql index d3382b3..7aec493 100644 --- a/misc/sql/cleanup_duplicate_snapshots.sql +++ b/misc/sql/cleanup_duplicate_snapshots.sql @@ -1,115 +1,19 @@ --- Cleanup script for snapshots table after adding last_crawled column --- This script consolidates multiple snapshots per URL by: --- 1. Keeping the latest snapshot with content (non-null gemtext OR data) --- 2. Setting its last_crawled to the most recent timestamp from any snapshot for that URL --- 3. Deleting all other snapshots for URLs with multiple snapshots --- --- IMPORTANT: This script will permanently delete data. Make sure to backup your database first! - -BEGIN; - --- Update last_crawled for URLs with multiple snapshots --- Keep the latest snapshot with content and update its last_crawled to the most recent timestamp -WITH url_snapshots AS ( - -- Get all snapshots grouped by URL with row numbers - SELECT - id, - url, - timestamp, - last_crawled, - gemtext, - data, - ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp - FROM snapshots -), -latest_content_snapshots AS ( - -- Find the latest snapshot with content for each URL - SELECT - url, - id as keep_id, - timestamp as keep_timestamp - FROM url_snapshots - WHERE (gemtext IS NOT NULL OR data IS NOT NULL) - AND rn_by_timestamp = ( - SELECT MIN(rn_by_timestamp) - FROM url_snapshots us2 - WHERE us2.url = url_snapshots.url - AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL) - ) -), -most_recent_timestamps AS ( - -- Get the most recent timestamp (last_crawled or timestamp) for each URL - SELECT - url, - GREATEST( - MAX(timestamp), - COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) - ) as most_recent_time - FROM snapshots - GROUP BY url -) --- Update the last_crawled of snapshots we're keeping -UPDATE snapshots -SET last_crawled = mrt.most_recent_time -FROM latest_content_snapshots lcs -JOIN most_recent_timestamps mrt ON lcs.url = mrt.url -WHERE snapshots.id = lcs.keep_id; - --- Delete all other snapshots for URLs that have multiple snapshots -WITH url_snapshots AS ( - SELECT - id, - url, - timestamp, - gemtext, - data, - ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp - FROM snapshots -), -latest_content_snapshots AS ( - -- Find the latest snapshot with content for each URL - SELECT - url, - id as keep_id - FROM url_snapshots - WHERE (gemtext IS NOT NULL OR data IS NOT NULL) - AND rn_by_timestamp = ( - SELECT MIN(rn_by_timestamp) - FROM url_snapshots us2 - WHERE us2.url = url_snapshots.url - AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL) - ) -), -snapshots_to_delete AS ( - -- Find snapshots to delete (all except the ones we're keeping) - SELECT s.id - FROM snapshots s - LEFT JOIN latest_content_snapshots lcs ON s.id = lcs.keep_id - WHERE lcs.keep_id IS NULL - AND s.url IN ( - -- Only for URLs that have multiple snapshots - SELECT url - FROM snapshots - GROUP BY url - HAVING COUNT(*) > 1 - ) -) -DELETE FROM snapshots -WHERE id IN (SELECT id FROM snapshots_to_delete); - --- Show summary of changes -SELECT - 'Cleanup completed. Remaining snapshots: ' || COUNT(*) as summary -FROM snapshots; - --- Show URLs that still have multiple snapshots (should be 0 after cleanup) -SELECT - 'URLs with multiple snapshots after cleanup: ' || COUNT(*) as validation -FROM ( - SELECT url - FROM snapshots - GROUP BY url - HAVING COUNT(*) > 1 -) multi_snapshots; - -COMMIT; \ No newline at end of file +WITH snapshot_rankings AS ( + SELECT + id, + url, + ROW_NUMBER() OVER ( + PARTITION BY url + ORDER BY + CASE WHEN (gemtext IS NOT NULL AND gemtext != '') OR data IS NOT NULL + THEN 0 ELSE 1 END, + timestamp DESC + ) as rn + FROM snapshots + ) + DELETE FROM snapshots + WHERE id IN ( + SELECT id + FROM snapshot_rankings + WHERE rn > 1 + );