Files
gemini-grc/misc/sql/storage_efficiency.sql
antanst 37d5e7cd78 Enhance crawler with seed list and SQL utilities
Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
2025-06-16 12:29:33 +03:00

20 lines
729 B
SQL

-- File: storage_efficiency.sql
-- Shows potential storage savings from deduplicating snapshot content,
-- measured only over URLs that have more than one snapshot
-- Usage: \i misc/sql/storage_efficiency.sql
-- Estimates how much snapshot content is duplicated, looking only at URLs
-- that have more than one content-bearing snapshot.
--
-- Output (always a single row):
--   total_snapshots          content-bearing snapshots belonging to those URLs
--   unique_contents          distinct gemtext values plus distinct data values,
--                            counted per URL and then summed
--   duplicate_content_count  total_snapshots - unique_contents
--   duplicate_percentage     duplicate_content_count as a percentage of
--                            total_snapshots, rounded to 2 decimals
--
-- NOTE(review): assumes a snapshot stores its content in either gemtext or
-- data, not both -- confirm against the snapshots schema. A row with both
-- columns set counts once as a snapshot but up to twice as content.
WITH duplicate_stats AS (
    SELECT
        url,
        COUNT(*) AS snapshot_count,
        COUNT(DISTINCT gemtext) AS unique_gemtexts,
        COUNT(DISTINCT data) AS unique_datas
    FROM snapshots
    -- COUNT(DISTINCT ...) skips NULLs while COUNT(*) does not, so rows with
    -- no content at all would otherwise be reported as duplicates.
    WHERE gemtext IS NOT NULL
        OR data IS NOT NULL
    GROUP BY url
    HAVING COUNT(*) > 1
),
-- Collapse the per-URL figures into one row; SUM over zero rows is NULL,
-- so fall back to 0 when no URL qualifies.
totals AS (
    SELECT
        COALESCE(SUM(snapshot_count), 0) AS total_snapshots,
        COALESCE(SUM(unique_gemtexts + unique_datas), 0) AS unique_contents
    FROM duplicate_stats
)
SELECT
    total_snapshots,
    unique_contents,
    total_snapshots - unique_contents AS duplicate_content_count,
    -- NULLIF guards the division when nothing qualifies; report 0% then.
    COALESCE(
        ROUND((total_snapshots - unique_contents) * 100.0 / NULLIF(total_snapshots, 0), 2),
        0
    ) AS duplicate_percentage
FROM totals;