Enhance crawler with seed list and SQL utilities
Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
This commit is contained in:
28
misc/sql/README.md
Normal file
28
misc/sql/README.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# SQL Queries for Snapshot Analysis
|
||||
|
||||
This directory contains SQL queries to analyze snapshot data in the gemini-grc database.
|
||||
|
||||
## Usage
|
||||
|
||||
You can run these queries directly from psql using the `\i` directive:
|
||||
|
||||
```
|
||||
\i misc/sql/snapshots_per_url.sql
|
||||
```
|
||||
|
||||
## Available Queries
|
||||
|
||||
- **snapshots_per_url.sql** - Basic count of snapshots per URL
|
||||
- **snapshots_date_range.sql** - Shows snapshot count with date range information for each URL
|
||||
- **host_snapshot_stats.sql** - Groups snapshots by hosts and shows URLs with multiple snapshots
|
||||
- **content_changes.sql** - Finds URLs with the most content changes between consecutive snapshots
|
||||
- **snapshot_distribution.sql** - Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots)
|
||||
- **recent_snapshot_activity.sql** - Shows URLs with most snapshots in the last 7 days
|
||||
- **storage_efficiency.sql** - Shows potential storage savings from deduplication
|
||||
- **snapshots_by_timeframe.sql** - Shows snapshot count by timeframe (day, week, month)
|
||||
|
||||
## Notes
|
||||
|
||||
- These queries are designed to work with PostgreSQL and the gemini-grc database schema
|
||||
- Some queries may be resource-intensive on large databases
|
||||
- The results can help optimize storage and understand the effectiveness of the versioned snapshot feature
|
||||
26
misc/sql/content_changes.sql
Normal file
26
misc/sql/content_changes.sql
Normal file
@@ -0,0 +1,26 @@
|
||||
-- File: content_changes.sql
|
||||
-- Finds URLs with the most content changes between consecutive snapshots
|
||||
-- Usage: \i misc/sql/content_changes.sql
|
||||
|
||||
WITH snapshot_changes AS (
|
||||
SELECT
|
||||
s1.url,
|
||||
s1.timestamp as prev_timestamp,
|
||||
s2.timestamp as next_timestamp,
|
||||
s1.gemtext IS DISTINCT FROM s2.gemtext as gemtext_changed,
|
||||
s1.data IS DISTINCT FROM s2.data as data_changed
|
||||
FROM snapshots s1
|
||||
JOIN snapshots s2 ON s1.url = s2.url AND s1.timestamp < s2.timestamp
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM snapshots s3
|
||||
WHERE s3.url = s1.url AND s1.timestamp < s3.timestamp AND s3.timestamp < s2.timestamp
|
||||
)
|
||||
)
|
||||
SELECT
|
||||
url,
|
||||
COUNT(*) + 1 as snapshot_count,
|
||||
SUM(CASE WHEN gemtext_changed OR data_changed THEN 1 ELSE 0 END) as content_changes
|
||||
FROM snapshot_changes
|
||||
GROUP BY url
|
||||
HAVING COUNT(*) + 1 > 1
|
||||
ORDER BY content_changes DESC, snapshot_count DESC;
|
||||
30
misc/sql/crawl_top_level.sql
Normal file
30
misc/sql/crawl_top_level.sql
Normal file
@@ -0,0 +1,30 @@
|
||||
BEGIN;
|
||||
|
||||
WITH matching_urls AS (
|
||||
SELECT url, host
|
||||
FROM snapshots
|
||||
WHERE url ~ '^gemini://[^/]+/$'
|
||||
AND timestamp < (NOW() - INTERVAL '1 week')
|
||||
ORDER BY random()
|
||||
LIMIT 500
|
||||
)
|
||||
INSERT INTO urls (url, host)
|
||||
SELECT url, host
|
||||
FROM matching_urls
|
||||
ON CONFLICT DO NOTHING;
|
||||
|
||||
-- WITH matching_urls AS (
|
||||
-- SELECT url, host
|
||||
-- FROM snapshots
|
||||
-- WHERE url ~ '^gemini://[^/]+/$'
|
||||
-- AND timestamp < (NOW() - INTERVAL '1 week')
|
||||
-- ORDER BY random()
|
||||
-- LIMIT 500
|
||||
-- )
|
||||
-- DELETE FROM snapshots
|
||||
-- WHERE url IN (
|
||||
-- SELECT url
|
||||
-- FROM matching_urls
|
||||
-- );
|
||||
|
||||
COMMIT;
|
||||
20
misc/sql/host_snapshot_stats.sql
Normal file
20
misc/sql/host_snapshot_stats.sql
Normal file
@@ -0,0 +1,20 @@
|
||||
-- File: host_snapshot_stats.sql
|
||||
-- Groups snapshots by hosts and shows URLs with multiple snapshots
|
||||
-- Usage: \i misc/sql/host_snapshot_stats.sql
|
||||
|
||||
SELECT
|
||||
host,
|
||||
COUNT(DISTINCT url) as unique_urls,
|
||||
SUM(CASE WHEN url_count > 1 THEN 1 ELSE 0 END) as urls_with_multiple_snapshots,
|
||||
SUM(snapshot_count) as total_snapshots
|
||||
FROM (
|
||||
SELECT
|
||||
host,
|
||||
url,
|
||||
COUNT(*) as snapshot_count,
|
||||
COUNT(*) OVER (PARTITION BY url) as url_count
|
||||
FROM snapshots
|
||||
GROUP BY host, url
|
||||
) subquery
|
||||
GROUP BY host
|
||||
ORDER BY total_snapshots DESC;
|
||||
1
misc/sql/mark_urls_processed_false.sql
Normal file
1
misc/sql/mark_urls_processed_false.sql
Normal file
@@ -0,0 +1 @@
|
||||
update urls set being_processed=false where being_processed is true;
|
||||
13
misc/sql/recent_snapshot_activity.sql
Normal file
13
misc/sql/recent_snapshot_activity.sql
Normal file
@@ -0,0 +1,13 @@
|
||||
-- File: recent_snapshot_activity.sql
|
||||
-- Shows URLs with most snapshots in the last 7 days
|
||||
-- Usage: \i misc/sql/recent_snapshot_activity.sql
|
||||
|
||||
SELECT
|
||||
url,
|
||||
COUNT(*) as snapshot_count
|
||||
FROM snapshots
|
||||
WHERE timestamp > NOW() - INTERVAL '7 days'
|
||||
GROUP BY url
|
||||
HAVING COUNT(*) > 1
|
||||
ORDER BY snapshot_count DESC
|
||||
LIMIT 20;
|
||||
16
misc/sql/snapshot_distribution.sql
Normal file
16
misc/sql/snapshot_distribution.sql
Normal file
@@ -0,0 +1,16 @@
|
||||
-- File: snapshot_distribution.sql
|
||||
-- Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots)
|
||||
-- Usage: \i misc/sql/snapshot_distribution.sql
|
||||
|
||||
WITH counts AS (
|
||||
SELECT url, COUNT(*) as snapshot_count
|
||||
FROM snapshots
|
||||
GROUP BY url
|
||||
)
|
||||
SELECT
|
||||
snapshot_count,
|
||||
COUNT(*) as url_count,
|
||||
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
|
||||
FROM counts
|
||||
GROUP BY snapshot_count
|
||||
ORDER BY snapshot_count;
|
||||
37
misc/sql/snapshots_by_timeframe.sql
Normal file
37
misc/sql/snapshots_by_timeframe.sql
Normal file
@@ -0,0 +1,37 @@
|
||||
-- File: snapshots_by_timeframe.sql
|
||||
-- Shows snapshot count by timeframe (day, week, month)
|
||||
-- Usage: \i misc/sql/snapshots_by_timeframe.sql
|
||||
|
||||
WITH daily_snapshots AS (
|
||||
SELECT
|
||||
date_trunc('day', timestamp) as day,
|
||||
COUNT(*) as snapshot_count,
|
||||
COUNT(DISTINCT url) as unique_urls
|
||||
FROM snapshots
|
||||
GROUP BY day
|
||||
ORDER BY day
|
||||
),
|
||||
weekly_snapshots AS (
|
||||
SELECT
|
||||
date_trunc('week', timestamp) as week,
|
||||
COUNT(*) as snapshot_count,
|
||||
COUNT(DISTINCT url) as unique_urls
|
||||
FROM snapshots
|
||||
GROUP BY week
|
||||
ORDER BY week
|
||||
),
|
||||
monthly_snapshots AS (
|
||||
SELECT
|
||||
date_trunc('month', timestamp) as month,
|
||||
COUNT(*) as snapshot_count,
|
||||
COUNT(DISTINCT url) as unique_urls
|
||||
FROM snapshots
|
||||
GROUP BY month
|
||||
ORDER BY month
|
||||
)
|
||||
SELECT 'Daily' as timeframe, * FROM daily_snapshots
|
||||
UNION ALL
|
||||
SELECT 'Weekly' as timeframe, * FROM weekly_snapshots
|
||||
UNION ALL
|
||||
SELECT 'Monthly' as timeframe, * FROM monthly_snapshots
|
||||
ORDER BY timeframe, day;
|
||||
14
misc/sql/snapshots_date_range.sql
Normal file
14
misc/sql/snapshots_date_range.sql
Normal file
@@ -0,0 +1,14 @@
|
||||
-- File: snapshots_date_range.sql
|
||||
-- Shows snapshot count with date range information for each URL
|
||||
-- Usage: \i misc/sql/snapshots_date_range.sql
|
||||
|
||||
SELECT
|
||||
url,
|
||||
COUNT(*) as snapshot_count,
|
||||
MIN(timestamp) as first_snapshot,
|
||||
MAX(timestamp) as last_snapshot,
|
||||
MAX(timestamp) - MIN(timestamp) as time_span
|
||||
FROM snapshots
|
||||
GROUP BY url
|
||||
HAVING COUNT(*) > 1
|
||||
ORDER BY snapshot_count DESC;
|
||||
8
misc/sql/snapshots_per_url.sql
Normal file
8
misc/sql/snapshots_per_url.sql
Normal file
@@ -0,0 +1,8 @@
|
||||
-- File: snapshots_per_url.sql
|
||||
-- Basic count of snapshots per URL
|
||||
-- Usage: \i misc/sql/snapshots_per_url.sql
|
||||
|
||||
SELECT url, COUNT(*) as snapshot_count
|
||||
FROM snapshots
|
||||
GROUP BY url
|
||||
ORDER BY snapshot_count DESC;
|
||||
20
misc/sql/storage_efficiency.sql
Normal file
20
misc/sql/storage_efficiency.sql
Normal file
@@ -0,0 +1,20 @@
|
||||
-- File: storage_efficiency.sql
|
||||
-- Shows potential storage savings from deduplication
|
||||
-- Usage: \i misc/sql/storage_efficiency.sql
|
||||
|
||||
WITH duplicate_stats AS (
|
||||
SELECT
|
||||
url,
|
||||
COUNT(*) as snapshot_count,
|
||||
COUNT(DISTINCT gemtext) as unique_gemtexts,
|
||||
COUNT(DISTINCT data) as unique_datas
|
||||
FROM snapshots
|
||||
GROUP BY url
|
||||
HAVING COUNT(*) > 1
|
||||
)
|
||||
SELECT
|
||||
SUM(snapshot_count) as total_snapshots,
|
||||
SUM(unique_gemtexts + unique_datas) as unique_contents,
|
||||
SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas) as duplicate_content_count,
|
||||
ROUND((SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas)) * 100.0 / SUM(snapshot_count), 2) as duplicate_percentage
|
||||
FROM duplicate_stats;
|
||||
Reference in New Issue
Block a user