- Prevent overwriting snapshots that have valid response codes - Ensure URL is removed from queue when snapshot update is skipped - Add last_crawled timestamp tracking for better crawl scheduling - Remove SkipIdenticalContent flag, simplify content deduplication logic - Update database schema with last_crawled column and indexes
115 lines
3.5 KiB
PL/PgSQL
115 lines
3.5 KiB
PL/PgSQL
-- Cleanup script for snapshots table after adding last_crawled column
|
|
-- This script consolidates multiple snapshots per URL by:
|
|
-- 1. Keeping the latest snapshot with content (non-null gemtext OR data)
|
|
-- 2. Setting its last_crawled to the most recent timestamp from any snapshot for that URL
|
|
-- 3. Deleting all other snapshots for URLs with multiple snapshots
|
|
--
|
|
-- IMPORTANT: This script will permanently delete data. Make sure to backup your database first!
|
|
|
|
BEGIN;
|
|
|
|
-- Update last_crawled for URLs with multiple snapshots
|
|
-- Keep the latest snapshot with content and update its last_crawled to the most recent timestamp
|
|
WITH url_snapshots AS (
|
|
-- Get all snapshots grouped by URL with row numbers
|
|
SELECT
|
|
id,
|
|
url,
|
|
timestamp,
|
|
last_crawled,
|
|
gemtext,
|
|
data,
|
|
ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp
|
|
FROM snapshots
|
|
),
|
|
latest_content_snapshots AS (
|
|
-- Find the latest snapshot with content for each URL
|
|
SELECT
|
|
url,
|
|
id as keep_id,
|
|
timestamp as keep_timestamp
|
|
FROM url_snapshots
|
|
WHERE (gemtext IS NOT NULL OR data IS NOT NULL)
|
|
AND rn_by_timestamp = (
|
|
SELECT MIN(rn_by_timestamp)
|
|
FROM url_snapshots us2
|
|
WHERE us2.url = url_snapshots.url
|
|
AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL)
|
|
)
|
|
),
|
|
most_recent_timestamps AS (
|
|
-- Get the most recent timestamp (last_crawled or timestamp) for each URL
|
|
SELECT
|
|
url,
|
|
GREATEST(
|
|
MAX(timestamp),
|
|
COALESCE(MAX(last_crawled), '1970-01-01'::timestamp)
|
|
) as most_recent_time
|
|
FROM snapshots
|
|
GROUP BY url
|
|
)
|
|
-- Update the last_crawled of snapshots we're keeping
|
|
UPDATE snapshots
|
|
SET last_crawled = mrt.most_recent_time
|
|
FROM latest_content_snapshots lcs
|
|
JOIN most_recent_timestamps mrt ON lcs.url = mrt.url
|
|
WHERE snapshots.id = lcs.keep_id;
|
|
|
|
-- Delete all other snapshots for URLs that have multiple snapshots
|
|
WITH url_snapshots AS (
|
|
SELECT
|
|
id,
|
|
url,
|
|
timestamp,
|
|
gemtext,
|
|
data,
|
|
ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn_by_timestamp
|
|
FROM snapshots
|
|
),
|
|
latest_content_snapshots AS (
|
|
-- Find the latest snapshot with content for each URL
|
|
SELECT
|
|
url,
|
|
id as keep_id
|
|
FROM url_snapshots
|
|
WHERE (gemtext IS NOT NULL OR data IS NOT NULL)
|
|
AND rn_by_timestamp = (
|
|
SELECT MIN(rn_by_timestamp)
|
|
FROM url_snapshots us2
|
|
WHERE us2.url = url_snapshots.url
|
|
AND (us2.gemtext IS NOT NULL OR us2.data IS NOT NULL)
|
|
)
|
|
),
|
|
snapshots_to_delete AS (
|
|
-- Find snapshots to delete (all except the ones we're keeping)
|
|
SELECT s.id
|
|
FROM snapshots s
|
|
LEFT JOIN latest_content_snapshots lcs ON s.id = lcs.keep_id
|
|
WHERE lcs.keep_id IS NULL
|
|
AND s.url IN (
|
|
-- Only for URLs that have multiple snapshots
|
|
SELECT url
|
|
FROM snapshots
|
|
GROUP BY url
|
|
HAVING COUNT(*) > 1
|
|
)
|
|
)
|
|
DELETE FROM snapshots
|
|
WHERE id IN (SELECT id FROM snapshots_to_delete);
|
|
|
|
-- Show summary of changes
|
|
SELECT
|
|
'Cleanup completed. Remaining snapshots: ' || COUNT(*) as summary
|
|
FROM snapshots;
|
|
|
|
-- Show URLs that still have multiple snapshots (should be 0 after cleanup)
|
|
SELECT
|
|
'URLs with multiple snapshots after cleanup: ' || COUNT(*) as validation
|
|
FROM (
|
|
SELECT url
|
|
FROM snapshots
|
|
GROUP BY url
|
|
HAVING COUNT(*) > 1
|
|
) multi_snapshots;
|
|
|
|
COMMIT; |