This commit is contained in:
2024-11-18 16:28:45 +02:00
parent f0452ff9f7
commit 825c7e3391
34 changed files with 624 additions and 426 deletions

View File

@@ -0,0 +1,7 @@
-- Purge snapshots of every host that has at least one snapshot whose
-- error text starts with 'robots.txt'. Only rows whose url lies under
-- gemini://<host>/ are removed.
DELETE FROM snapshots
WHERE url LIKE ('gemini://' || host || '/%')
  AND host IN (
      SELECT host
      FROM snapshots
      WHERE error LIKE 'robots.txt%'
  );

5
db/error_stats.sql Normal file
View File

@@ -0,0 +1,5 @@
-- The 20 most frequent error values in snapshots.
-- COUNT(error) ignores NULLs, so the group of rows without an error
-- reports a count of 0.
SELECT
    error,
    COUNT(error) AS count
FROM snapshots
GROUP BY error
ORDER BY count DESC
LIMIT 20;

View File

@@ -0,0 +1,7 @@
-- The 10 hosts with the most snapshots that have a response code
-- recorded and no error.
SELECT
    host,
    COUNT(*) AS row_count
FROM snapshots
WHERE error IS NULL
  AND response_code IS NOT NULL
GROUP BY host
ORDER BY row_count DESC
LIMIT 10;

View File

@@ -20,7 +20,7 @@ DROP TABLE IF EXISTS snapshots;
CREATE TABLE snapshots (
id SERIAL PRIMARY KEY,
uid TEXT NOT NULL UNIQUE,
url TEXT NOT NULL,
url TEXT NOT NULL UNIQUE,
host TEXT NOT NULL,
timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
mimetype TEXT,
@@ -42,7 +42,10 @@ CREATE INDEX idx_lang ON snapshots (lang);
-- Single-column lookup indexes on response_code, error and host.
CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host);
-- NOTE(review): despite the "unique_" prefix this is a plain (non-UNIQUE)
-- composite index on (uid, url); it does not enforce uniqueness — confirm intent.
CREATE INDEX unique_uid_url ON snapshots (uid, url);
-- Partial covering index, keyed by host, over snapshots that have neither
-- a response code nor an error recorded.
-- PostgreSQL's CREATE INDEX grammar requires INCLUDE to precede WHERE;
-- with the clauses in the opposite order the statement is rejected as a
-- syntax error.
CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host)
INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang)
WHERE response_code IS NULL AND error IS NULL;
-- Partial index limited to rows where both response_code and error are NULL.
-- NOTE(review): every indexed row holds NULL in both key columns, so the key
-- adds no selectivity beyond the predicate itself — confirm this is intended.
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;

View File

@@ -2,9 +2,9 @@ package main
import (
"fmt"
"gemini-grc/gemini"
"os"
"gemini-grc/gemini"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx"
)
@@ -71,7 +71,6 @@ func main() {
}
}
}
func connectToDB() *sqlx.DB {

View File

@@ -0,0 +1,18 @@
-- Step 1: Remove duplicate snapshots so that each url keeps a single row,
-- the one with the latest timestamp.
-- Rows are ranked per url, newest first. id is used as a tie-breaker so the
-- surviving row is deterministic when duplicates share the same timestamp
-- (the highest id wins; presumably the most recently inserted row).
WITH ranked_snapshots AS (
    SELECT
        id,
        ROW_NUMBER() OVER (
            PARTITION BY url
            ORDER BY timestamp DESC, id DESC
        ) AS row_num
    FROM snapshots
)
DELETE FROM snapshots
USING ranked_snapshots
WHERE snapshots.id = ranked_snapshots.id
  AND ranked_snapshots.row_num > 1;

-- Step 2: Add a unique constraint on the url column to prevent future
-- duplicates. This fails if any duplicate url is still present.
ALTER TABLE snapshots
    ADD CONSTRAINT unique_url UNIQUE (url);

View File

@@ -1,9 +1,9 @@
package main
import (
"gemini-grc/uid"
"time"
"gemini-grc/uid"
"github.com/jmoiron/sqlx"
)