Lots of new features; this is the first version that reliably crawls Geminispace.

- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host (see the sketch below)
- [x] URL Blacklist
- [x] Save image/* and text/* files
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
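
The per-host connection limit deserves a sketch. Below is a minimal illustration of the idea, using a buffered channel per host as a semaphore; the names (maxPerHost, acquire, release) are assumptions for illustration, not the crawler's actual identifiers.

package main

import (
	"fmt"
	"sync"
)

const maxPerHost = 2 // assumed limit, not the crawler's real value

var (
	mu    sync.Mutex
	slots = map[string]chan struct{}{}
)

// acquire blocks until a connection slot for host is free.
func acquire(host string) {
	mu.Lock()
	c, ok := slots[host]
	if !ok {
		c = make(chan struct{}, maxPerHost)
		slots[host] = c
	}
	mu.Unlock()
	c <- struct{}{} // blocks while maxPerHost requests are in flight
}

// release frees the slot once the request finishes.
func release(host string) {
	mu.Lock()
	c := slots[host]
	mu.Unlock()
	<-c
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			acquire("example.org")
			defer release("example.org")
			fmt.Println("worker", n, "fetching") // at most maxPerHost at once
		}(i)
	}
	wg.Wait()
}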
2024-10-21 20:03:28 +03:00
parent 212345764b
commit fee7d3e01c
37 changed files with 1231 additions and 319 deletions

db/backup-table.sql (new file)

@@ -0,0 +1,16 @@
BEGIN;
-- Increase statement timeout
SET statement_timeout = '10min';
-- Step 1: Create a new table with the same schema
CREATE TABLE backup (LIKE snapshots INCLUDING ALL);
-- Step 2: Copy data from the old table to the new one
INSERT INTO backup SELECT * FROM snapshots;
-- (Optional) Step 3: Truncate the original table if you are moving the data
-- TRUNCATE TABLE snapshots;
-- Commit the transaction if everything went well
COMMIT;
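To use it, run the script through psql, e.g. psql -U gemini -d gemini -f db/backup-table.sql (an assumed invocation; adjust user and database to your setup). The TRUNCATE in step 3 is left commented out on purpose: uncomment it only when moving, rather than copying, the data.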

db/delete-dups.sql (new file)

@@ -0,0 +1,26 @@
-- Explanation:
--
-- WITH DuplicateSnapshots AS: a Common Table Expression (CTE) that
-- selects all rows from the snapshots table.
--
-- ROW_NUMBER() OVER (PARTITION BY url ORDER BY id): assigns a row
-- number to each row sharing the same url. PARTITION BY url groups
-- the rows by url, and ORDER BY id gives the row with the smallest
-- id row_num = 1.
--
-- DELETE FROM snapshots WHERE id IN: deletes the rows whose id
-- appears in the subquery result.
--
-- WHERE row_num > 1: the subquery keeps only the duplicates, since
-- row_num = 1 marks the one row per url we want to retain.
--
-- Result: all duplicate rows are deleted from snapshots, keeping
-- only the row with the smallest id for each url.
WITH DuplicateSnapshots AS (
SELECT id,
ROW_NUMBER() OVER (PARTITION BY url ORDER BY id) AS row_num
FROM snapshots
)
DELETE FROM snapshots
WHERE id IN (
SELECT id
FROM DuplicateSnapshots
WHERE row_num > 1
);
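Once the table is clean, a unique index on url (CREATE UNIQUE INDEX ON snapshots (url)) would stop duplicates from reappearing; whether that fits the crawler's model (one snapshot per URL versus a history of snapshots) is an open design question, so it is not included here.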

db/host_stats.sql (new file)

@@ -0,0 +1,5 @@
SELECT host, COUNT(*) AS row_count
FROM snapshots
GROUP BY host
ORDER BY row_count DESC
LIMIT 10;

db/initdb.sql (new file)

@@ -0,0 +1,45 @@
-- DB creation and users
CREATE USER gemini;
ALTER USER gemini WITH PASSWORD 'gemini';
CREATE DATABASE gemini;
GRANT ALL PRIVILEGES ON DATABASE gemini TO gemini;
ALTER DATABASE gemini OWNER TO gemini;
-- Connect to the new database before granting schema privileges and
-- creating objects; otherwise they land in the wrong database.
\c gemini
GRANT ALL PRIVILEGES ON SCHEMA public TO gemini;
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO gemini;
-- Extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- Tables
DROP TABLE IF EXISTS snapshots;
CREATE TABLE snapshots (
id SERIAL PRIMARY KEY,
uid TEXT NOT NULL UNIQUE,
url TEXT NOT NULL,
host TEXT NOT NULL,
timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
mimetype TEXT,
data BYTEA,
gemtext TEXT,
links JSONB,
lang TEXT,
response_code INTEGER,
error TEXT
);
ALTER TABLE snapshots OWNER TO gemini;
CREATE INDEX idx_uid ON snapshots (uid);
CREATE INDEX idx_url ON snapshots (url);
CREATE INDEX idx_timestamp ON snapshots (timestamp);
CREATE INDEX idx_mimetype ON snapshots (mimetype);
CREATE INDEX idx_lang ON snapshots (lang);
CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host);
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;
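The script is meant to be run once by a superuser through psql, e.g. psql -U postgres -f db/initdb.sql (an assumed invocation); note that \c gemini is a psql meta-command, so the script will not work over a plain SQL connection.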

db/migrate1_host.go (new file)

@@ -0,0 +1,99 @@
package main

import (
	"fmt"
	"os"

	"gemini-grc/gemini"

	_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
	"github.com/jmoiron/sqlx"
)

// checkIfDone reports whether the migration has already been applied.
// Currently a hardcoded flag: flip it to false before running.
func checkIfDone() bool { return true }

// Populates the `host` field of existing snapshots, in batches of 5000.
func main() {
	db := connectToDB()
	if checkIfDone() {
		fmt.Println("Migration already applied")
		return
	}
	count := 0
	for {
		// Start a transaction for this batch.
		tx, err := db.Beginx()
		if err != nil {
			fmt.Println(err)
			return
		}
		query := `
			SELECT * FROM snapshots
			WHERE host IS NULL
			LIMIT 5000
		`
		var snapshots []gemini.Snapshot
		// Check the query error before inspecting the results.
		err = tx.Select(&snapshots, query)
		if err != nil {
			fmt.Println(err)
			if err := tx.Rollback(); err != nil {
				panic(err)
			}
			return
		}
		if len(snapshots) == 0 {
			// Nothing left to migrate; release the open transaction.
			if err := tx.Rollback(); err != nil {
				panic(err)
			}
			fmt.Println("Done!")
			return
		}
		for _, s := range snapshots {
			s.Host = s.URL.Hostname
			fmt.Println(count, s.UID, s.URL.Hostname)
			if err := gemini.SaveSnapshotToDB(tx, &s); err != nil {
				// Abort the whole batch on the first failed save.
				fmt.Println(err)
				if err := tx.Rollback(); err != nil {
					panic(err)
				}
				return
			}
			count++
		}
		if err := tx.Commit(); err != nil {
			fmt.Println(err)
			return
		}
	}
}

func connectToDB() *sqlx.DB {
	connStr := fmt.Sprintf("postgres://%s:%s@%s:%s/%s",
		os.Getenv("PG_USER"),
		os.Getenv("PG_PASSWORD"),
		os.Getenv("PG_HOST"),
		os.Getenv("PG_PORT"),
		os.Getenv("PG_DATABASE"),
	)
	// Create a connection pool
	db, err := sqlx.Open("pgx", connStr)
	if err != nil {
		panic(fmt.Sprintf("Unable to connect to database with URL %s: %v", connStr, err))
	}
	db.SetMaxOpenConns(20)
	if err := db.Ping(); err != nil {
		panic(fmt.Sprintf("Unable to ping database: %v", err))
	}
	fmt.Println("Connected to database")
	return db
}

db/migrate1_host.sh (new executable file)

@@ -0,0 +1,14 @@
#!/bin/sh
set -eu
MAX_RESPONSE_SIZE=10485760 \
LOG_LEVEL=info \
ROOT_PATH=./snaps \
RESPONSE_TIMEOUT=10 \
NUM_OF_WORKERS=5 \
PG_DATABASE=gemini \
PG_HOST=127.0.0.1 \
PG_PORT=5433 \
PG_USER=gemini \
PG_PASSWORD=gemini \
go run ./migrate1_host.go
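
The same variables also drive the crawler itself ("Configuration via environment variables" above). A minimal sketch of the reading pattern, with a hypothetical envInt helper; the crawler's real config code may differ.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// envInt reads an integer environment variable, falling back to a
// default when the variable is unset.
func envInt(key string, def int) int {
	v := os.Getenv(key)
	if v == "" {
		return def
	}
	n, err := strconv.Atoi(v)
	if err != nil {
		panic(fmt.Sprintf("%s must be an integer, got %q", key, v))
	}
	return n
}

func main() {
	workers := envInt("NUM_OF_WORKERS", 5)
	timeout := envInt("RESPONSE_TIMEOUT", 10)
	fmt.Println("workers:", workers, "timeout:", timeout)
}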

db/pg_stats.sql (new file)

@@ -0,0 +1,12 @@
SELECT
query,
total_exec_time AS total_time, -- total time spent on the query execution
calls, -- number of times the query has been called
mean_exec_time AS mean_time -- average time per execution
-- max_exec_time AS max_time -- maximum time taken for any single execution
FROM
pg_stat_statements
ORDER BY
total_exec_time DESC -- order by total execution time
LIMIT 5;
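pg_stat_statements is not enabled by default: it has to be added to shared_preload_libraries in postgresql.conf (followed by a server restart) and installed per database with CREATE EXTENSION pg_stat_statements;. Note that total_exec_time and mean_exec_time are the PostgreSQL 13+ column names; older versions call them total_time and mean_time.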

db/pg_stats_reset.sql (new file)

@@ -0,0 +1 @@
SELECT pg_stat_statements_reset();

db/populateDB.go (new file)

@@ -0,0 +1,20 @@
// Seed helper, kept commented out for reference. If re-enabled it
// needs "time", "github.com/jmoiron/sqlx", and the project's uid
// helper that provides the uid.UID() calls below.
package main

// func PopulateDB(db *sqlx.DB) {
// 	// Delete all rows in the snapshots table
// 	db.MustExec("TRUNCATE snapshots;")
// 	// Prepare the query for inserting a snapshot with uid, url, and timestamp
// 	query := `INSERT INTO snapshots(uid, url, timestamp)
// 	          VALUES ($1, $2, $3)`
// 	// Calculate the timestamp for 2 days ago
// 	timestamp := time.Now().Add(-48 * time.Hour)
// 	db.MustExec(query, uid.UID(), "gemini://geminiprotocol.net/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://warmedal.se/~antenna", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://skyjake.fi/~Cosmos/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://gemini.circumlunar.space/capcom/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://auragem.letz.dev/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://gemplex.space/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://kennedy.gemi.dev/", timestamp)
// 	db.MustExec(query, uid.UID(), "gemini://tlgs.one/", timestamp)
// }

db/restore-table.sql (new file)

@@ -0,0 +1,9 @@
BEGIN;
SET statement_timeout = '10min';
TRUNCATE TABLE snapshots;
INSERT INTO snapshots SELECT * FROM backup;
COMMIT;

db/show-dups.sql (new file)

@@ -0,0 +1,9 @@
WITH DuplicateSnapshots AS (
SELECT id,
url,
ROW_NUMBER() OVER (PARTITION BY url ORDER BY id) AS row_num
FROM snapshots
)
SELECT *
FROM DuplicateSnapshots
WHERE row_num > 1;

db/stats.sql (new file)

@@ -0,0 +1,5 @@
SELECT
COUNT(CASE WHEN response_code IS NOT NULL AND error IS NULL THEN 1 END) AS "Visited",
COUNT(CASE WHEN response_code IS NULL THEN 1 END) AS "Pending",
COUNT(CASE WHEN error IS NOT NULL THEN 1 END) AS "Errors"
FROM snapshots;
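Handy for watching a crawl in progress: paste the query into psql and follow it with \watch 5 to re-run it every five seconds.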