Files
gemini-grc/misc/sql/crawl_top_level.sql
antanst 37d5e7cd78 Enhance crawler with seed list and SQL utilities
Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
2025-06-16 12:29:33 +03:00

31 lines
600 B
PL/PgSQL

-- Re-queue a random sample of top-level Gemini URLs.
--
-- Picks up to 500 rows from snapshots whose url is exactly
-- gemini://<host>/ (nothing after the first slash) and whose timestamp
-- is more than one week old, then inserts their (url, host) pairs into
-- urls. Rows that conflict with existing urls rows are skipped.
-- The whole script runs inside a single transaction.
BEGIN;

WITH stale_roots AS (
    -- ORDER BY random() + LIMIT yields a different sample on every run.
    SELECT
        s.url,
        s.host
    FROM snapshots AS s
    WHERE s.timestamp < (NOW() - INTERVAL '1 week')
        AND s.url ~ '^gemini://[^/]+/$'
    ORDER BY random()
    LIMIT 500
)
INSERT INTO urls (url, host)
SELECT
    stale_roots.url,
    stale_roots.host
FROM stale_roots
ON CONFLICT DO NOTHING;

-- Disabled follow-up step, kept for reference: delete the sampled rows
-- from snapshots.
-- NOTE(review): as written it builds its own ORDER BY random() sample,
-- so it would not necessarily pick the same rows as the INSERT above.
-- Confirm the intent before re-enabling.
--
-- WITH stale_roots AS (
--     SELECT
--         s.url,
--         s.host
--     FROM snapshots AS s
--     WHERE s.timestamp < (NOW() - INTERVAL '1 week')
--         AND s.url ~ '^gemini://[^/]+/$'
--     ORDER BY random()
--     LIMIT 500
-- )
-- DELETE FROM snapshots
-- WHERE url IN (
--     SELECT stale_roots.url
--     FROM stale_roots
-- );

COMMIT;