Improve error handling and add duplicate snapshot cleanup

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
antanst
2025-06-18 11:56:26 +03:00
parent ada6cda4ac
commit 967f371777
3 changed files with 28 additions and 121 deletions

View File

@@ -76,7 +76,13 @@ func RunWorkerWithTx(workerID int, job string) {
 			return
 		}
-		panic(err) // We shouldn't reach this point!
+		contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
+		rollbackErr := gemdb.SafeRollback(ctx, tx)
+		if rollbackErr != nil {
+			FatalErrorsChan <- rollbackErr
+			return
+		}
+		return
 	}
 	err = tx.Commit()
@@ -94,11 +100,8 @@ func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error {
 	for _, u := range urls {
 		err := WorkOnUrl(ctx, tx, u)
 		if err != nil {
-			if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) || xerrors.IsFatal(err) {
-				return err
-			}
-			contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
+			return err
 		}
 	}
 	return nil
 }

View File

@@ -446,7 +446,7 @@ func (d *DbServiceImpl) GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url string) (*Snapshot, error) {
 	err := tx.GetContext(ctx, s, SQL_GET_LATEST_SNAPSHOT, url)
 	if err != nil {
 		if errors.Is(err, sql.ErrNoRows) {
-			return nil, xerrors.NewError(fmt.Errorf("no snapshot found for URL %s", url), 0, "", false)
+			return nil, nil
 		}
 		return nil, xerrors.NewError(fmt.Errorf("cannot get latest snapshot for URL %s: %w", url, err), 0, "", false)
 	}

View File

@@ -1,115 +1,19 @@
--- Cleanup script for snapshots table after adding last_crawled column
--- This script consolidates multiple snapshots per URL by:
--- 1. Keeping the latest snapshot with content (non-null gemtext OR data)
--- 2. Setting its last_crawled to the most recent timestamp from any snapshot for that URL
--- 3. Deleting all other snapshots for URLs with multiple snapshots
---
--- IMPORTANT: This script will permanently delete data. Make sure to backup your database first!
-BEGIN;
--- Update last_crawled for URLs with multiple snapshots
--- Keep the latest snapshot with content and update its last_crawled to the most recent timestamp
-WITH url_snapshots AS (
-    -- Get all snapshots grouped by URL with row numbers
+WITH snapshot_rankings AS (
     SELECT
         id,
         url,
-        timestamp,
-        last_crawled,
-        gemtext,
-        data,
-        ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp
+        ROW_NUMBER() OVER (
+            PARTITION BY url
+            ORDER BY
+                CASE WHEN (gemtext IS NOT NULL AND gemtext != '') OR data IS NOT NULL
+                     THEN 0 ELSE 1 END,
+                timestamp DESC
+        ) as rn
     FROM snapshots
-),
-latest_content_snapshots AS (
-    -- Find the latest snapshot with content for each URL
-    SELECT
-        url,
-        id as keep_id,
-        timestamp as keep_timestamp
-    FROM url_snapshots
-    WHERE (gemtext IS NOT NULL OR data IS NOT NULL)
-    AND rn_by_timestamp = (
-        SELECT MIN(rn_by_timestamp)
-        FROM url_snapshots us2
-        WHERE us2.url = url_snapshots.url
-        AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL)
     )
-),
-most_recent_timestamps AS (
-    -- Get the most recent timestamp (last_crawled or timestamp) for each URL
-    SELECT
-        url,
-        GREATEST(
-            MAX(timestamp),
-            COALESCE(MAX(last_crawled), '1970-01-01'::timestamp)
-        ) as most_recent_time
-    FROM snapshots
-    GROUP BY url
-)
--- Update the last_crawled of snapshots we're keeping
-UPDATE snapshots
-SET last_crawled = mrt.most_recent_time
-FROM latest_content_snapshots lcs
-JOIN most_recent_timestamps mrt ON lcs.url = mrt.url
-WHERE snapshots.id = lcs.keep_id;
--- Delete all other snapshots for URLs that have multiple snapshots
-WITH url_snapshots AS (
-    SELECT
-        id,
-        url,
-        timestamp,
-        gemtext,
-        data,
-        ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp
-    FROM snapshots
-),
-latest_content_snapshots AS (
-    -- Find the latest snapshot with content for each URL
-    SELECT
-        url,
-        id as keep_id
-    FROM url_snapshots
-    WHERE (gemtext IS NOT NULL OR data IS NOT NULL)
-    AND rn_by_timestamp = (
-        SELECT MIN(rn_by_timestamp)
-        FROM url_snapshots us2
-        WHERE us2.url = url_snapshots.url
-        AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL)
-    )
-),
-snapshots_to_delete AS (
-    -- Find snapshots to delete (all except the ones we're keeping)
-    SELECT s.id
-    FROM snapshots s
-    LEFT JOIN latest_content_snapshots lcs ON s.id = lcs.keep_id
-    WHERE lcs.keep_id IS NULL
-    AND s.url IN (
-        -- Only for URLs that have multiple snapshots
-        SELECT url
-        FROM snapshots
-        GROUP BY url
-        HAVING COUNT(*) > 1
-    )
-)
 DELETE FROM snapshots
-WHERE id IN (SELECT id FROM snapshots_to_delete);
--- Show summary of changes
-SELECT
-    'Cleanup completed. Remaining snapshots: ' || COUNT(*) as summary
-FROM snapshots;
--- Show URLs that still have multiple snapshots (should be 0 after cleanup)
-SELECT
-    'URLs with multiple snapshots after cleanup: ' || COUNT(*) as validation
-FROM (
-    SELECT url
-    FROM snapshots
-    GROUP BY url
-    HAVING COUNT(*) > 1
-) multi_snapshots;
-COMMIT;
+WHERE id IN (
+    SELECT id
+    FROM snapshot_rankings
+    WHERE rn > 1
+);