Enhance crawler with seed list and SQL utilities

Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
This commit is contained in:
antanst
2025-06-16 12:29:33 +03:00
parent 51f94c90b2
commit 330b596497
37 changed files with 742 additions and 682 deletions

View File

@@ -0,0 +1,20 @@
-- File: host_snapshot_stats.sql
-- Per-host snapshot statistics, hosts with the most snapshots first.
-- Usage: \i misc/sql/host_snapshot_stats.sql
--
-- Output columns:
--   host                          value of snapshots.host
--   unique_urls                   number of distinct non-NULL URLs stored for the host
--   urls_with_multiple_snapshots  number of the host's URLs that have more than one snapshot row
--   total_snapshots               total number of snapshot rows for the host
WITH url_snapshot_counts AS (
    -- One row per (host, url) carrying how many snapshot rows that URL has.
    SELECT
        host,
        url,
        COUNT(*) AS snapshot_count
    FROM snapshots
    GROUP BY host, url
)
SELECT
    host,
    -- Each URL appears at most once per host in the CTE, so no DISTINCT is needed.
    COUNT(url) AS unique_urls,
    -- Test the per-URL snapshot count directly. A window COUNT(*) partitioned by
    -- url would be evaluated after the GROUP BY and would count grouped rows
    -- (hosts per URL) instead of snapshots.
    COUNT(*) FILTER (WHERE snapshot_count > 1) AS urls_with_multiple_snapshots,
    SUM(snapshot_count) AS total_snapshots
FROM url_snapshot_counts
GROUP BY host
-- host is a tie-breaker so hosts with equal totals come back in a stable order.
ORDER BY total_snapshots DESC, host;