Enhance crawler with seed list and SQL utilities
Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
This commit is contained in:
30
misc/sql/crawl_top_level.sql
Normal file
30
misc/sql/crawl_top_level.sql
Normal file
@@ -0,0 +1,30 @@
|
||||
-- Queue a random sample of top-level Gemini URLs for crawling.
--
-- Reads rows from snapshots whose url has the form "gemini://<host>/" and
-- whose timestamp lies more than one week in the past, picks at most 500 of
-- them in random order, and inserts their (url, host) pairs into urls.
-- Rows that would violate a constraint on urls are silently skipped.
BEGIN;

WITH sampled_roots AS (
    SELECT
        s.url,
        s.host
    FROM snapshots AS s
    WHERE
        -- Scheme, one or more non-slash characters, then a single trailing
        -- slash and nothing after it.
        s.url ~ '^gemini://[^/]+/$'
        -- NOTE(review): this is a per-row filter, so a URL that also has a
        -- more recent snapshot row can still be selected -- confirm that is
        -- the intended behavior.
        AND s.timestamp < NOW() - INTERVAL '1 week'
    ORDER BY random()
    LIMIT 500
)
INSERT INTO urls (url, host)
SELECT
    r.url,
    r.host
FROM sampled_roots AS r
ON CONFLICT DO NOTHING;

-- Disabled alternative: instead of re-queueing the sampled URLs, delete their
-- rows from snapshots. Kept for reference; not executed.
--
-- WITH sampled_roots AS (
--     SELECT
--         s.url,
--         s.host
--     FROM snapshots AS s
--     WHERE
--         s.url ~ '^gemini://[^/]+/$'
--         AND s.timestamp < NOW() - INTERVAL '1 week'
--     ORDER BY random()
--     LIMIT 500
-- )
-- DELETE FROM snapshots
-- WHERE url IN (
--     SELECT r.url
--     FROM sampled_roots AS r
-- );

COMMIT;
|
||||
Reference in New Issue
Block a user