Improve error handling and add duplicate snapshot cleanup
This commit is contained in:
@@ -76,7 +76,13 @@ func RunWorkerWithTx(workerID int, job string) {
|
|||||||
return
|
return
|
||||||
|
|
||||||
}
|
}
|
||||||
panic(err) // We shouldn't reach this point!
|
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
|
||||||
|
rollbackErr := gemdb.SafeRollback(ctx, tx)
|
||||||
|
if rollbackErr != nil {
|
||||||
|
FatalErrorsChan <- rollbackErr
|
||||||
|
return
|
||||||
|
}
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
err = tx.Commit()
|
err = tx.Commit()
|
||||||
@@ -94,10 +100,7 @@ func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error {
|
|||||||
for _, u := range urls {
|
for _, u := range urls {
|
||||||
err := WorkOnUrl(ctx, tx, u)
|
err := WorkOnUrl(ctx, tx, u)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) || xerrors.IsFatal(err) {
|
return err
|
||||||
return err
|
|
||||||
}
|
|
||||||
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
2
db/db.go
2
db/db.go
@@ -446,7 +446,7 @@ func (d *DbServiceImpl) GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url
|
|||||||
err := tx.GetContext(ctx, s, SQL_GET_LATEST_SNAPSHOT, url)
|
err := tx.GetContext(ctx, s, SQL_GET_LATEST_SNAPSHOT, url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if errors.Is(err, sql.ErrNoRows) {
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
return nil, xerrors.NewError(fmt.Errorf("no snapshot found for URL %s", url), 0, "", false)
|
return nil, nil
|
||||||
}
|
}
|
||||||
return nil, xerrors.NewError(fmt.Errorf("cannot get latest snapshot for URL %s: %w", url, err), 0, "", false)
|
return nil, xerrors.NewError(fmt.Errorf("cannot get latest snapshot for URL %s: %w", url, err), 0, "", false)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,115 +1,19 @@
|
|||||||
-- Cleanup script for snapshots table after adding last_crawled column
|
WITH snapshot_rankings AS (
|
||||||
-- This script consolidates multiple snapshots per URL by:
|
SELECT
|
||||||
-- 1. Keeping the latest snapshot with content (non-null gemtext OR data)
|
id,
|
||||||
-- 2. Setting its last_crawled to the most recent timestamp from any snapshot for that URL
|
url,
|
||||||
-- 3. Deleting all other snapshots for URLs with multiple snapshots
|
ROW_NUMBER() OVER (
|
||||||
--
|
PARTITION BY url
|
||||||
-- IMPORTANT: This script will permanently delete data. Make sure to backup your database first!
|
ORDER BY
|
||||||
|
CASE WHEN (gemtext IS NOT NULL AND gemtext != '') OR data IS NOT NULL
|
||||||
BEGIN;
|
THEN 0 ELSE 1 END,
|
||||||
|
timestamp DESC
|
||||||
-- Update last_crawled for URLs with multiple snapshots
|
) as rn
|
||||||
-- Keep the latest snapshot with content and update its last_crawled to the most recent timestamp
|
FROM snapshots
|
||||||
WITH url_snapshots AS (
|
)
|
||||||
-- Get all snapshots grouped by URL with row numbers
|
DELETE FROM snapshots
|
||||||
SELECT
|
WHERE id IN (
|
||||||
id,
|
SELECT id
|
||||||
url,
|
FROM snapshot_rankings
|
||||||
timestamp,
|
WHERE rn > 1
|
||||||
last_crawled,
|
);
|
||||||
gemtext,
|
|
||||||
data,
|
|
||||||
ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp
|
|
||||||
FROM snapshots
|
|
||||||
),
|
|
||||||
latest_content_snapshots AS (
|
|
||||||
-- Find the latest snapshot with content for each URL
|
|
||||||
SELECT
|
|
||||||
url,
|
|
||||||
id as keep_id,
|
|
||||||
timestamp as keep_timestamp
|
|
||||||
FROM url_snapshots
|
|
||||||
WHERE (gemtext IS NOT NULL OR data IS NOT NULL)
|
|
||||||
AND rn_by_timestamp = (
|
|
||||||
SELECT MIN(rn_by_timestamp)
|
|
||||||
FROM url_snapshots us2
|
|
||||||
WHERE us2.url = url_snapshots.url
|
|
||||||
AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL)
|
|
||||||
)
|
|
||||||
),
|
|
||||||
most_recent_timestamps AS (
|
|
||||||
-- Get the most recent timestamp (last_crawled or timestamp) for each URL
|
|
||||||
SELECT
|
|
||||||
url,
|
|
||||||
GREATEST(
|
|
||||||
MAX(timestamp),
|
|
||||||
COALESCE(MAX(last_crawled), '1970-01-01'::timestamp)
|
|
||||||
) as most_recent_time
|
|
||||||
FROM snapshots
|
|
||||||
GROUP BY url
|
|
||||||
)
|
|
||||||
-- Update the last_crawled of snapshots we're keeping
|
|
||||||
UPDATE snapshots
|
|
||||||
SET last_crawled = mrt.most_recent_time
|
|
||||||
FROM latest_content_snapshots lcs
|
|
||||||
JOIN most_recent_timestamps mrt ON lcs.url = mrt.url
|
|
||||||
WHERE snapshots.id = lcs.keep_id;
|
|
||||||
|
|
||||||
-- Delete all other snapshots for URLs that have multiple snapshots
|
|
||||||
WITH url_snapshots AS (
|
|
||||||
SELECT
|
|
||||||
id,
|
|
||||||
url,
|
|
||||||
timestamp,
|
|
||||||
gemtext,
|
|
||||||
data,
|
|
||||||
ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp
|
|
||||||
FROM snapshots
|
|
||||||
),
|
|
||||||
latest_content_snapshots AS (
|
|
||||||
-- Find the latest snapshot with content for each URL
|
|
||||||
SELECT
|
|
||||||
url,
|
|
||||||
id as keep_id
|
|
||||||
FROM url_snapshots
|
|
||||||
WHERE (gemtext IS NOT NULL OR data IS NOT NULL)
|
|
||||||
AND rn_by_timestamp = (
|
|
||||||
SELECT MIN(rn_by_timestamp)
|
|
||||||
FROM url_snapshots us2
|
|
||||||
WHERE us2.url = url_snapshots.url
|
|
||||||
AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL)
|
|
||||||
)
|
|
||||||
),
|
|
||||||
snapshots_to_delete AS (
|
|
||||||
-- Find snapshots to delete (all except the ones we're keeping)
|
|
||||||
SELECT s.id
|
|
||||||
FROM snapshots s
|
|
||||||
LEFT JOIN latest_content_snapshots lcs ON s.id = lcs.keep_id
|
|
||||||
WHERE lcs.keep_id IS NULL
|
|
||||||
AND s.url IN (
|
|
||||||
-- Only for URLs that have multiple snapshots
|
|
||||||
SELECT url
|
|
||||||
FROM snapshots
|
|
||||||
GROUP BY url
|
|
||||||
HAVING COUNT(*) > 1
|
|
||||||
)
|
|
||||||
)
|
|
||||||
DELETE FROM snapshots
|
|
||||||
WHERE id IN (SELECT id FROM snapshots_to_delete);
|
|
||||||
|
|
||||||
-- Show summary of changes
|
|
||||||
SELECT
|
|
||||||
'Cleanup completed. Remaining snapshots: ' || COUNT(*) as summary
|
|
||||||
FROM snapshots;
|
|
||||||
|
|
||||||
-- Show URLs that still have multiple snapshots (should be 0 after cleanup)
|
|
||||||
SELECT
|
|
||||||
'URLs with multiple snapshots after cleanup: ' || COUNT(*) as validation
|
|
||||||
FROM (
|
|
||||||
SELECT url
|
|
||||||
FROM snapshots
|
|
||||||
GROUP BY url
|
|
||||||
HAVING COUNT(*) > 1
|
|
||||||
) multi_snapshots;
|
|
||||||
|
|
||||||
COMMIT;
|
|
||||||
|
|||||||
Reference in New Issue
Block a user