.
This commit is contained in:
7
db/delete_robots_hosts.sql
Normal file
7
db/delete_robots_hosts.sql
Normal file
@@ -0,0 +1,7 @@
|
||||
-- Remove snapshots belonging to hosts that have at least one snapshot whose
-- error text starts with 'robots.txt'. Only rows whose url begins with
-- gemini://<host>/ are deleted.
DELETE FROM snapshots AS target
WHERE target.url LIKE 'gemini://' || target.host || '/%'
  AND EXISTS (
      SELECT 1
      FROM snapshots AS blocked
      WHERE blocked.host = target.host
        AND blocked.error LIKE 'robots.txt%'
  );
|
||||
5
db/error_stats.sql
Normal file
5
db/error_stats.sql
Normal file
@@ -0,0 +1,5 @@
|
||||
-- Top 20 most frequent error values recorded in snapshots.
-- Rows without an error are filtered out up front: grouping them would yield
-- a NULL group that count(error) reports as 0, which is noise in this report.
-- The secondary sort on error makes the order of equally frequent errors
-- stable between runs.
SELECT
    error,
    COUNT(error) AS count
FROM snapshots
WHERE error IS NOT NULL
GROUP BY error
ORDER BY count DESC, error
LIMIT 20;
|
||||
7
db/host_stats_visited.sql
Normal file
7
db/host_stats_visited.sql
Normal file
@@ -0,0 +1,7 @@
|
||||
-- Ten hosts with the most snapshots that have a response code and no error.
WITH visited AS (
    -- Keep only rows that received a response and recorded no error.
    SELECT host
    FROM snapshots
    WHERE error IS NULL
      AND response_code IS NOT NULL
)
SELECT
    host,
    COUNT(*) AS row_count
FROM visited
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 10;
|
||||
@@ -20,7 +20,7 @@ DROP TABLE IF EXISTS snapshots;
|
||||
CREATE TABLE snapshots (
|
||||
id SERIAL PRIMARY KEY,
|
||||
uid TEXT NOT NULL UNIQUE,
|
||||
url TEXT NOT NULL,
|
||||
url TEXT NOT NULL UNIQUE,
|
||||
host TEXT NOT NULL,
|
||||
timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
mimetype TEXT,
|
||||
@@ -42,7 +42,10 @@ CREATE INDEX idx_lang ON snapshots (lang);
|
||||
-- Single-column lookup indexes on snapshots.
CREATE INDEX idx_response_code
    ON snapshots (response_code);
CREATE INDEX idx_error
    ON snapshots (error);
CREATE INDEX idx_host
    ON snapshots (host);

-- Composite index over (uid, url).
-- NOTE(review): despite its name this is a plain, non-unique index; confirm
-- whether it is still wanted.
CREATE INDEX unique_uid_url
    ON snapshots (uid, url);
|
||||
|
||||
-- Partial covering index keyed on host, restricted to rows where both
-- response_code and error are NULL (presumably the not-yet-processed rows,
-- going by the index name; confirm against the crawler).
-- PostgreSQL requires INCLUDE to come before the WHERE predicate; the reverse
-- order is rejected as a syntax error.
-- NOTE(review): gemtext and links are carried as payload columns; a B-tree
-- index entry has a size limit, so inserts may fail if these are ever large
-- on rows matching the predicate. Verify they are empty/NULL for such rows.
CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host)
    INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang)
    WHERE response_code IS NULL AND error IS NULL;
|
||||
|
||||
-- Partial index limited to rows where both response_code and error are NULL.
CREATE INDEX idx_response_code_error_nulls
    ON snapshots (response_code, error)
    WHERE response_code IS NULL
      AND error IS NULL;
|
||||
|
||||
@@ -2,9 +2,9 @@ package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gemini-grc/gemini"
|
||||
"os"
|
||||
|
||||
"gemini-grc/gemini"
|
||||
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
@@ -71,7 +71,6 @@ func main() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func connectToDB() *sqlx.DB {
|
||||
|
||||
18
db/migrate2_unique_urls.sql
Normal file
18
db/migrate2_unique_urls.sql
Normal file
@@ -0,0 +1,18 @@
|
||||
-- Migration 2: leave exactly one snapshot row per url, then enforce it.
-- Both steps run in a single transaction so that a failure to add the
-- constraint also rolls back the deletions.
BEGIN;

-- Step 1: Delete duplicate entries, keeping the last one based on timestamp.
-- A CTE ranks the rows of each url newest-first; id DESC breaks ties between
-- rows sharing a timestamp so the surviving row is deterministic.
WITH ranked_snapshots AS (
    SELECT
        id,
        ROW_NUMBER() OVER (
            PARTITION BY url
            ORDER BY timestamp DESC, id DESC
        ) AS row_num
    FROM snapshots
)
DELETE FROM snapshots
USING ranked_snapshots
WHERE snapshots.id = ranked_snapshots.id
  AND ranked_snapshots.row_num > 1;

-- Step 2: Add a unique constraint on the url column to prevent future duplicates
ALTER TABLE snapshots
    ADD CONSTRAINT unique_url UNIQUE (url);

COMMIT;
|
||||
@@ -1,9 +1,9 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"gemini-grc/uid"
|
||||
"time"
|
||||
|
||||
"gemini-grc/uid"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user