Lots of new features; this is the first version that reliably crawls Geminispace.

- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host (see the sketch below)
- [x] URL Blacklist
- [x] Save image/* and text/* files
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
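
The per-host connection limit deserves a sketch. Below is a minimal illustration of the idea, using a buffered channel per host as a semaphore; the names (maxPerHost, acquire, release) are assumptions for illustration, not the crawler's actual identifiers.

package main

import (
	"fmt"
	"sync"
)

const maxPerHost = 2 // assumed limit, not the crawler's real value

var (
	mu    sync.Mutex
	slots = map[string]chan struct{}{}
)

// acquire blocks until a connection slot for host is free.
func acquire(host string) {
	mu.Lock()
	c, ok := slots[host]
	if !ok {
		c = make(chan struct{}, maxPerHost)
		slots[host] = c
	}
	mu.Unlock()
	c <- struct{}{} // blocks while maxPerHost requests are in flight
}

// release frees the slot once the request finishes.
func release(host string) {
	mu.Lock()
	c := slots[host]
	mu.Unlock()
	<-c
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			acquire("example.org")
			defer release("example.org")
			fmt.Println("worker", n, "fetching") // at most maxPerHost at once
		}(i)
	}
	wg.Wait()
}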
2024-10-21 20:03:28 +03:00
parent 212345764b
commit fee7d3e01c
37 changed files with 1231 additions and 319 deletions

db/backup-table.sql (new file)

@@ -0,0 +1,16 @@
BEGIN;
-- Increase statement timeout
SET statement_timeout = '10min';
-- Step 1: Create a new table with the same schema
CREATE TABLE backup (LIKE snapshots INCLUDING ALL);
-- Step 2: Copy data from the old table to the new one
INSERT INTO backup SELECT * FROM snapshots;
-- (Optional) Step 3: Truncate the original table if you are moving the data
-- TRUNCATE TABLE snapshots;
-- Commit the transaction if everything went well
COMMIT;
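To use it, run the script through psql, e.g. psql -U gemini -d gemini -f db/backup-table.sql (an assumed invocation; adjust user and database to your setup). The TRUNCATE in step 3 is left commented out on purpose: uncomment it only when moving, rather than copying, the data.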

db/delete-dups.sql (new file)

@@ -0,0 +1,26 @@
-- Explanation:
--
-- WITH DuplicateSnapshots AS: a Common Table Expression (CTE) that
-- selects all rows from the snapshots table.
--
-- ROW_NUMBER() OVER (PARTITION BY url ORDER BY id): assigns a row
-- number to each row sharing the same url. PARTITION BY url groups
-- the rows by url, and ORDER BY id gives the row with the smallest
-- id row_num = 1.
--
-- DELETE FROM snapshots WHERE id IN: deletes the rows whose id
-- appears in the subquery result.
--
-- WHERE row_num > 1: the subquery keeps only the duplicates, since
-- row_num = 1 marks the one row per url we want to retain.
--
-- Result: all duplicate rows are deleted from snapshots, keeping
-- only the row with the smallest id for each url.
WITH DuplicateSnapshots AS (
SELECT id,
ROW_NUMBER() OVER (PARTITION BY url ORDER BY id) AS row_num
FROM snapshots
)
DELETE FROM snapshots
WHERE id IN (
SELECT id
FROM DuplicateSnapshots
WHERE row_num > 1
);
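Once the table is clean, a unique index on url (CREATE UNIQUE INDEX ON snapshots (url)) would stop duplicates from reappearing; whether that fits the crawler's model (one snapshot per URL versus a history of snapshots) is an open design question, so it is not included here.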

db/host_stats.sql (new file)

@@ -0,0 +1,5 @@
SELECT host, COUNT(*) AS row_count
FROM snapshots
GROUP BY host
ORDER BY row_count DESC
LIMIT 10;

db/initdb.sql (new file)

@@ -0,0 +1,45 @@
-- DB creation and users
CREATE USER gemini;
ALTER USER gemini WITH PASSWORD 'gemini';
CREATE DATABASE gemini;
GRANT ALL PRIVILEGES ON DATABASE gemini TO gemini;
ALTER DATABASE gemini OWNER TO gemini;
-- Connect to the new database before granting schema privileges and
-- creating objects; otherwise they land in the wrong database.
\c gemini
GRANT ALL PRIVILEGES ON SCHEMA public TO gemini;
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO gemini;
-- Extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- Tables
DROP TABLE IF EXISTS snapshots;
CREATE TABLE snapshots (
id SERIAL PRIMARY KEY,
uid TEXT NOT NULL UNIQUE,
url TEXT NOT NULL,
host TEXT NOT NULL,
timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
mimetype TEXT,
data BYTEA,
gemtext TEXT,
links JSONB,
lang TEXT,
response_code INTEGER,
error TEXT
);
ALTER TABLE snapshots OWNER TO gemini;
CREATE INDEX idx_uid ON snapshots (uid);
CREATE INDEX idx_url ON snapshots (url);
CREATE INDEX idx_timestamp ON snapshots (timestamp);
CREATE INDEX idx_mimetype ON snapshots (mimetype);
CREATE INDEX idx_lang ON snapshots (lang);
CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host);
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;
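The script is meant to be run once by a superuser through psql, e.g. psql -U postgres -f db/initdb.sql (an assumed invocation); note that \c gemini is a psql meta-command, so the script will not work over a plain SQL connection.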

db/migrate1_host.go (new file)

@@ -0,0 +1,99 @@
package main

import (
	"fmt"
	"os"

	"gemini-grc/gemini"

	_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
	"github.com/jmoiron/sqlx"
)

// checkIfDone reports whether the migration has already been applied.
// Currently a hardcoded flag: flip it to false before running.
func checkIfDone() bool { return true }

// Populates the `host` field of existing snapshots, in batches of 5000.
func main() {
	db := connectToDB()
	if checkIfDone() {
		fmt.Println("Migration already applied")
		return
	}
	count := 0
	for {
		// Start a transaction for this batch.
		tx, err := db.Beginx()
		if err != nil {
			fmt.Println(err)
			return
		}
		query := `
			SELECT * FROM snapshots
			WHERE host IS NULL
			LIMIT 5000
		`
		var snapshots []gemini.Snapshot
		// Check the query error before inspecting the results.
		err = tx.Select(&snapshots, query)
		if err != nil {
			fmt.Println(err)
			if err := tx.Rollback(); err != nil {
				panic(err)
			}
			return
		}
		if len(snapshots) == 0 {
			// Nothing left to migrate; release the open transaction.
			if err := tx.Rollback(); err != nil {
				panic(err)
			}
			fmt.Println("Done!")
			return
		}
		for _, s := range snapshots {
			s.Host = s.URL.Hostname
			fmt.Println(count, s.UID, s.URL.Hostname)
			if err := gemini.SaveSnapshotToDB(tx, &s); err != nil {
				// Abort the whole batch on the first failed save.
				fmt.Println(err)
				if err := tx.Rollback(); err != nil {
					panic(err)
				}
				return
			}
			count++
		}
		if err := tx.Commit(); err != nil {
			fmt.Println(err)
			return
		}
	}
}

func connectToDB() *sqlx.DB {
	connStr := fmt.Sprintf("postgres://%s:%s@%s:%s/%s",
		os.Getenv("PG_USER"),
		os.Getenv("PG_PASSWORD"),
		os.Getenv("PG_HOST"),
		os.Getenv("PG_PORT"),
		os.Getenv("PG_DATABASE"),
	)
	// Create a connection pool
	db, err := sqlx.Open("pgx", connStr)
	if err != nil {
		panic(fmt.Sprintf("Unable to connect to database with URL %s: %v", connStr, err))
	}
	db.SetMaxOpenConns(20)
	if err := db.Ping(); err != nil {
		panic(fmt.Sprintf("Unable to ping database: %v", err))
	}
	fmt.Println("Connected to database")
	return db
}

db/migrate1_host.sh (new executable file)

@@ -0,0 +1,14 @@
#!/bin/sh
set -eu
MAX_RESPONSE_SIZE=10485760 \
LOG_LEVEL=info \
ROOT_PATH=./snaps \
RESPONSE_TIMEOUT=10 \
NUM_OF_WORKERS=5 \
PG_DATABASE=gemini \
PG_HOST=127.0.0.1 \
PG_PORT=5433 \
PG_USER=gemini \
PG_PASSWORD=gemini \
go run ./migrate1_host.go
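
The same variables also drive the crawler itself ("Configuration via environment variables" above). A minimal sketch of the reading pattern, with a hypothetical envInt helper; the crawler's real config code may differ.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// envInt reads an integer environment variable, falling back to a
// default when the variable is unset.
func envInt(key string, def int) int {
	v := os.Getenv(key)
	if v == "" {
		return def
	}
	n, err := strconv.Atoi(v)
	if err != nil {
		panic(fmt.Sprintf("%s must be an integer, got %q", key, v))
	}
	return n
}

func main() {
	workers := envInt("NUM_OF_WORKERS", 5)
	timeout := envInt("RESPONSE_TIMEOUT", 10)
	fmt.Println("workers:", workers, "timeout:", timeout)
}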

db/pg_stats.sql (new file)

@@ -0,0 +1,12 @@
SELECT
query,
total_exec_time AS total_time, -- total time spent on the query execution
calls, -- number of times the query has been called
mean_exec_time AS mean_time -- average time per execution
-- max_exec_time AS max_time -- maximum time taken for any single execution
FROM
pg_stat_statements
ORDER BY
total_exec_time DESC -- order by total execution time
LIMIT 5;
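pg_stat_statements is not enabled by default: it has to be added to shared_preload_libraries in postgresql.conf (followed by a server restart) and installed per database with CREATE EXTENSION pg_stat_statements;. Note that total_exec_time and mean_exec_time are the PostgreSQL 13+ column names; older versions call them total_time and mean_time.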

db/pg_stats_reset.sql (new file)

@@ -0,0 +1 @@
SELECT pg_stat_statements_reset();

db/populateDB.go (new file)

@@ -0,0 +1,20 @@
// Seed helper, kept commented out for reference. If re-enabled it
// needs "time", "github.com/jmoiron/sqlx", and the project's uid
// helper that provides the uid.UID() calls below.
package main

// func PopulateDB(db *sqlx.DB) {
// 	// Delete all rows in the snapshots table
// 	db.MustExec("TRUNCATE snapshots;")
// 	// Prepare the query for inserting a snapshot with uid, url, and timestamp
// 	query := `INSERT INTO snapshots(uid, url, timestamp)
// 	          VALUES ($1, $2, $3)`
// 	// Calculate the timestamp for 2 days ago
// 	timestamp := time.Now().Add(-48 * time.Hour)
// 	db.MustExec(query, uid.UID(), "gemini://geminiprotocol.net/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://warmedal.se/~antenna", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://skyjake.fi/~Cosmos/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://gemini.circumlunar.space/capcom/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://auragem.letz.dev/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://gemplex.space/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://kennedy.gemi.dev/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://tlgs.one/", timestamp)
// }

db/restore-table.sql (new file)

@@ -0,0 +1,9 @@
BEGIN;
SET statement_timeout = '10min';
TRUNCATE TABLE snapshots;
INSERT INTO snapshots SELECT * FROM backup;
COMMIT;

db/show-dups.sql (new file)

@@ -0,0 +1,9 @@
WITH DuplicateSnapshots AS (
SELECT id,
url,
ROW_NUMBER() OVER (PARTITION BY url ORDER BY id) AS row_num
FROM snapshots
)
SELECT *
FROM DuplicateSnapshots
WHERE row_num > 1;

db/stats.sql (new file)

@@ -0,0 +1,5 @@
SELECT
COUNT(CASE WHEN response_code IS NOT NULL AND error IS NULL THEN 1 END) AS "Visited",
COUNT(CASE WHEN response_code IS NULL THEN 1 END) AS "Pending",
COUNT(CASE WHEN error IS NOT NULL THEN 1 END) AS "Errors"
FROM snapshots;
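Handy for watching a crawl in progress: paste the query into psql and follow it with \watch 5 to re-run it every five seconds.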