Enhance crawler with seed list and SQL utilities

Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
2025-06-16 12:29:33 +03:00
parent dfb050588c
commit 37d5e7cd78
37 changed files with 742 additions and 682 deletions
--- a/misc/sql/README.md
+++ b/misc/sql/README.md
@@ -0,0 +1,28 @@
+# SQL Queries for Snapshot Analysis
+
+This directory contains SQL queries to analyze snapshot data in the gemini-grc database.
+
+## Usage
+
+You can run these queries directly from psql using the `\i` directive:
+
+```
+\i misc/sql/snapshots_per_url.sql
+```
+
+## Available Queries
+
+- **snapshots_per_url.sql** - Basic count of snapshots per URL
+- **snapshots_date_range.sql** - Shows snapshot count with date range information for each URL
+- **host_snapshot_stats.sql** - Groups snapshots by hosts and shows URLs with multiple snapshots
+- **content_changes.sql** - Finds URLs with the most content changes between consecutive snapshots
+- **snapshot_distribution.sql** - Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots)
+- **recent_snapshot_activity.sql** - Shows URLs with most snapshots in the last 7 days
+- **storage_efficiency.sql** - Shows potential storage savings from deduplication
+- **snapshots_by_timeframe.sql** - Shows snapshot count by timeframe (day, week, month)
+
+## Notes
+
+- These queries are designed to work with PostgreSQL and the gemini-grc database schema
+- Some queries may be resource-intensive on large databases
+- The results can help optimize storage and understand the effectiveness of the versioned snapshot feature
--- a/misc/sql/content_changes.sql
+++ b/misc/sql/content_changes.sql
@@ -0,0 +1,26 @@
+-- File: content_changes.sql
+-- Finds URLs with the most content changes between consecutive snapshots
+-- Usage: \i misc/sql/content_changes.sql
+
+WITH snapshot_changes AS (
+    SELECT 
+        s1.url,
+        s1.timestamp as prev_timestamp,
+        s2.timestamp as next_timestamp,
+        s1.gemtext IS DISTINCT FROM s2.gemtext as gemtext_changed,
+        s1.data IS DISTINCT FROM s2.data as data_changed
+    FROM snapshots s1
+    JOIN snapshots s2 ON s1.url = s2.url AND s1.timestamp < s2.timestamp
+    WHERE NOT EXISTS (
+        SELECT 1 FROM snapshots s3
+        WHERE s3.url = s1.url AND s1.timestamp < s3.timestamp AND s3.timestamp < s2.timestamp
+    )
+)
+SELECT 
+    url,
+    COUNT(*) + 1 as snapshot_count,
+    SUM(CASE WHEN gemtext_changed OR data_changed THEN 1 ELSE 0 END) as content_changes
+FROM snapshot_changes
+GROUP BY url
+HAVING COUNT(*) + 1 > 1
+ORDER BY content_changes DESC, snapshot_count DESC;
--- a/misc/sql/crawl_top_level.sql
+++ b/misc/sql/crawl_top_level.sql
@@ -0,0 +1,30 @@
+BEGIN;
+
+WITH matching_urls AS (
+    SELECT url, host
+    FROM snapshots
+    WHERE url ~ '^gemini://[^/]+/$'
+      AND timestamp < (NOW() - INTERVAL '1 week')
+    ORDER BY random()
+    LIMIT 500
+)
+INSERT INTO urls (url, host)
+SELECT url, host
+FROM matching_urls
+ON CONFLICT DO NOTHING;
+
+-- WITH matching_urls AS (
+--     SELECT url, host
+--     FROM snapshots
+--     WHERE url ~ '^gemini://[^/]+/$'
+--       AND timestamp < (NOW() - INTERVAL '1 week')
+--     ORDER BY random()
+--     LIMIT 500
+-- )
+-- DELETE FROM snapshots
+-- WHERE url IN (
+--     SELECT url
+--     FROM matching_urls
+-- );
+
+COMMIT;
--- a/misc/sql/host_snapshot_stats.sql
+++ b/misc/sql/host_snapshot_stats.sql
@@ -0,0 +1,20 @@
+-- File: host_snapshot_stats.sql
+-- Groups snapshots by hosts and shows URLs with multiple snapshots
+-- Usage: \i misc/sql/host_snapshot_stats.sql
+
+SELECT 
+    host,
+    COUNT(DISTINCT url) as unique_urls,
+    SUM(CASE WHEN url_count > 1 THEN 1 ELSE 0 END) as urls_with_multiple_snapshots,
+    SUM(snapshot_count) as total_snapshots
+FROM (
+    SELECT 
+        host, 
+        url, 
+        COUNT(*) as snapshot_count,
+        COUNT(*) OVER (PARTITION BY url) as url_count
+    FROM snapshots
+    GROUP BY host, url
+) subquery
+GROUP BY host
+ORDER BY total_snapshots DESC;
--- a/misc/sql/mark_urls_processed_false.sql
+++ b/misc/sql/mark_urls_processed_false.sql
@@ -0,0 +1 @@
+update urls set being_processed=false where being_processed is true;
--- a/misc/sql/recent_snapshot_activity.sql
+++ b/misc/sql/recent_snapshot_activity.sql
@@ -0,0 +1,13 @@
+-- File: recent_snapshot_activity.sql
+-- Shows URLs with most snapshots in the last 7 days
+-- Usage: \i misc/sql/recent_snapshot_activity.sql
+
+SELECT 
+    url, 
+    COUNT(*) as snapshot_count
+FROM snapshots
+WHERE timestamp > NOW() - INTERVAL '7 days'
+GROUP BY url
+HAVING COUNT(*) > 1
+ORDER BY snapshot_count DESC
+LIMIT 20;
--- a/misc/sql/snapshot_distribution.sql
+++ b/misc/sql/snapshot_distribution.sql
@@ -0,0 +1,16 @@
+-- File: snapshot_distribution.sql
+-- Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots)
+-- Usage: \i misc/sql/snapshot_distribution.sql
+
+WITH counts AS (
+    SELECT url, COUNT(*) as snapshot_count
+    FROM snapshots
+    GROUP BY url
+)
+SELECT 
+    snapshot_count,
+    COUNT(*) as url_count,
+    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
+FROM counts
+GROUP BY snapshot_count
+ORDER BY snapshot_count;
--- a/misc/sql/snapshots_by_timeframe.sql
+++ b/misc/sql/snapshots_by_timeframe.sql
@@ -0,0 +1,37 @@
+-- File: snapshots_by_timeframe.sql
+-- Shows snapshot count by timeframe (day, week, month)
+-- Usage: \i misc/sql/snapshots_by_timeframe.sql
+
+WITH daily_snapshots AS (
+    SELECT 
+        date_trunc('day', timestamp) as day,
+        COUNT(*) as snapshot_count,
+        COUNT(DISTINCT url) as unique_urls
+    FROM snapshots
+    GROUP BY day
+    ORDER BY day
+),
+weekly_snapshots AS (
+    SELECT 
+        date_trunc('week', timestamp) as week,
+        COUNT(*) as snapshot_count,
+        COUNT(DISTINCT url) as unique_urls
+    FROM snapshots
+    GROUP BY week
+    ORDER BY week
+),
+monthly_snapshots AS (
+    SELECT 
+        date_trunc('month', timestamp) as month,
+        COUNT(*) as snapshot_count,
+        COUNT(DISTINCT url) as unique_urls
+    FROM snapshots
+    GROUP BY month
+    ORDER BY month
+)
+SELECT 'Daily' as timeframe, * FROM daily_snapshots
+UNION ALL
+SELECT 'Weekly' as timeframe, * FROM weekly_snapshots
+UNION ALL
+SELECT 'Monthly' as timeframe, * FROM monthly_snapshots
+ORDER BY timeframe, day;
--- a/misc/sql/snapshots_date_range.sql
+++ b/misc/sql/snapshots_date_range.sql
@@ -0,0 +1,14 @@
+-- File: snapshots_date_range.sql
+-- Shows snapshot count with date range information for each URL
+-- Usage: \i misc/sql/snapshots_date_range.sql
+
+SELECT 
+    url, 
+    COUNT(*) as snapshot_count,
+    MIN(timestamp) as first_snapshot,
+    MAX(timestamp) as last_snapshot,
+    MAX(timestamp) - MIN(timestamp) as time_span
+FROM snapshots
+GROUP BY url
+HAVING COUNT(*) > 1
+ORDER BY snapshot_count DESC;
--- a/misc/sql/snapshots_per_url.sql
+++ b/misc/sql/snapshots_per_url.sql
@@ -0,0 +1,8 @@
+-- File: snapshots_per_url.sql
+-- Basic count of snapshots per URL
+-- Usage: \i misc/sql/snapshots_per_url.sql
+
+SELECT url, COUNT(*) as snapshot_count
+FROM snapshots
+GROUP BY url
+ORDER BY snapshot_count DESC;
--- a/misc/sql/storage_efficiency.sql
+++ b/misc/sql/storage_efficiency.sql
@@ -0,0 +1,20 @@
+-- File: storage_efficiency.sql
+-- Shows potential storage savings from deduplication
+-- Usage: \i misc/sql/storage_efficiency.sql
+
+WITH duplicate_stats AS (
+    SELECT 
+        url,
+        COUNT(*) as snapshot_count,
+        COUNT(DISTINCT gemtext) as unique_gemtexts,
+        COUNT(DISTINCT data) as unique_datas
+    FROM snapshots
+    GROUP BY url
+    HAVING COUNT(*) > 1
+)
+SELECT 
+    SUM(snapshot_count) as total_snapshots,
+    SUM(unique_gemtexts + unique_datas) as unique_contents,
+    SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas) as duplicate_content_count,
+    ROUND((SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas)) * 100.0 / SUM(snapshot_count), 2) as duplicate_percentage
+FROM duplicate_stats;
				`@@ -0,0 +1 @@`
				`update urls set being_processed=false where being_processed is true;`