Enhance crawler with seed list and SQL utilities
Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
This commit is contained in:
20
misc/sql/storage_efficiency.sql
Normal file
20
misc/sql/storage_efficiency.sql
Normal file
@@ -0,0 +1,20 @@
|
||||
-- File: storage_efficiency.sql
|
||||
-- Shows potential storage savings from deduplication
|
||||
-- Usage: \i misc/sql/storage_efficiency.sql
|
||||
|
||||
WITH duplicate_stats AS (
|
||||
SELECT
|
||||
url,
|
||||
COUNT(*) as snapshot_count,
|
||||
COUNT(DISTINCT gemtext) as unique_gemtexts,
|
||||
COUNT(DISTINCT data) as unique_datas
|
||||
FROM snapshots
|
||||
GROUP BY url
|
||||
HAVING COUNT(*) > 1
|
||||
)
|
||||
SELECT
|
||||
SUM(snapshot_count) as total_snapshots,
|
||||
SUM(unique_gemtexts + unique_datas) as unique_contents,
|
||||
SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas) as duplicate_content_count,
|
||||
ROUND((SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas)) * 100.0 / SUM(snapshot_count), 2) as duplicate_percentage
|
||||
FROM duplicate_stats;
|
||||
Reference in New Issue
Block a user