DB scripts and migrations

This commit is contained in:
2024-12-09 19:54:00 +02:00
parent 7a36614232
commit 6cf507bdc9
5 changed files with 66 additions and 54 deletions

View File

@@ -17,6 +17,7 @@ const (
EnvResponseTimeout = "RESPONSE_TIMEOUT"
EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR"
EnvBlacklistPath = "BLACKLIST_PATH"
EnvDryRun = "DRY_RUN"
)
// Config holds the application configuration loaded from environment variables.
@@ -28,6 +29,7 @@ type Config struct {
WorkerBatchSize int // Batch size for worker processing
PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
BlacklistPath string // File that has blacklisted strings of "host:port"
DryRun bool // If true, don't write to disk (polarity inferred from the field name; TODO confirm against usage)
}
var CONFIG Config //nolint:gochecknoglobals
@@ -126,6 +128,14 @@ func GetConfig() *Config {
config.BlacklistPath = v
return nil
},
// DRY_RUN: parse the raw value as a boolean via parseBool and store the
// result in config.DryRun. A parse error is returned to the caller unchanged.
EnvDryRun: func(v string) error {
val, err := parseBool(EnvDryRun, v)
if err != nil {
return err
}
config.DryRun = val
return nil
},
}
// Process each environment variable

22
db/fix-url-ports.sql Normal file
View File

@@ -0,0 +1,22 @@
-- Removes snapshots whose URL has no explicit port when another snapshot
-- exists for the same URL with the default Gemini port 1965 spelled out.
--
-- The port belongs directly after the host, before any path, query or
-- fragment: the port-qualified twin of gemini://host/path is
-- gemini://host:1965/path. Appending ':1965' to the end of the whole URL
-- would only ever match URLs that have no path at all.
--
-- The WHERE clause keeps only rows where the host is immediately followed by
-- '/', '?', '#' or end-of-string, i.e. rows for which regexp_replace really
-- inserts the port. Without it, a URL the pattern does not match would be
-- returned unchanged and could be paired with (and deleted alongside) itself.
--
-- Step 1: run this SELECT on its own first and review the matched pairs.
WITH duplicates AS (
    SELECT s1.id  AS id_without_port,
           s2.id  AS id_with_port,
           s1.url AS url_without_port,
           s2.url AS url_with_port
    FROM snapshots s1
    JOIN snapshots s2
      ON s2.url = regexp_replace(s1.url, '^([^:/]+://[^/:?#]+)([/?#]|$)', '\1:1965\2')
    WHERE s1.url ~ '^[^:/]+://[^/:?#]+([/?#]|$)'
)
SELECT * FROM duplicates;

-- Step 2: delete the port-less rows that have a port-qualified twin.
WITH duplicates AS (
    SELECT s1.id  AS id_without_port,
           s2.id  AS id_with_port,
           s1.url AS url_without_port,
           s2.url AS url_with_port
    FROM snapshots s1
    JOIN snapshots s2
      ON s2.url = regexp_replace(s1.url, '^([^:/]+://[^/:?#]+)([/?#]|$)', '\1:1965\2')
    WHERE s1.url ~ '^[^:/]+://[^/:?#]+([/?#]|$)'
)
DELETE FROM snapshots
WHERE id IN (SELECT id_without_port FROM duplicates);

View File

@@ -1,14 +0,0 @@
#!/bin/sh
# Launches the migrate1_host.go migration with a fixed set of crawler and
# PostgreSQL settings passed through the environment.
set -eu

# Crawler settings.
export MAX_RESPONSE_SIZE=10485760
export LOG_LEVEL=info
export ROOT_PATH=./snaps
export RESPONSE_TIMEOUT=10
export NUM_OF_WORKERS=5

# PostgreSQL connection settings.
export PG_DATABASE=gemini
export PG_HOST=127.0.0.1
export PG_PORT=5433
export PG_USER=gemini
export PG_PASSWORD=gemini

go run ./migrate1_host.go

View File

@@ -9,37 +9,49 @@ import (
"github.com/jmoiron/sqlx"
)
// checkIfDone reports whether this migration has already been applied.
// Nothing is detected automatically: the returned value is edited by hand
// and must be set to true once the migration has been applied.
func checkIfDone() bool {
	const applied = true
	return applied
}
// MustSelect runs query with args inside tx via tx.Select, scanning the
// result into dest. It panics with the underlying error if the select fails.
func MustSelect(tx *sqlx.Tx, dest interface{}, query string, args ...interface{}) {
	err := tx.Select(dest, query, args...)
	if err == nil {
		return
	}
	panic(err)
}
// Populates the `host` field
func main() {
db := connectToDB()
if checkIfDone() {
fmt.Println("Migration already applied")
return
}
count := 0
db := connectToDB()
defer db.Close()
batchSize := 1000
for {
// Start the transaction
tx, err := db.Beginx()
if err != nil {
fmt.Println(err)
return
}
query := `
SELECT * FROM snapshots
WHERE host IS NULL
LIMIT 5000
`
tx := db.MustBegin()
query := `SELECT * FROM snapshots WHERE url NOT LIKE '%1965%' LIMIT $1`
var snapshots []gemini.Snapshot
err = tx.Select(&snapshots, query)
MustSelect(tx, &snapshots, query, batchSize)
if len(snapshots) == 0 {
fmt.Println("Done!")
return
fmt.Println("No snapshots remaining, done")
break
}
for i, s := range snapshots {
_, err := gemini.ParseURL(s.URL.String(), "")
if err != nil {
panic(fmt.Sprintf("Error parsing URL. ID %d URL %s\n", s.ID, s.URL))
}
fmt.Printf("Saving %d %d %s\n", i+1, s.ID, s.URL)
err = gemini.UpsertSnapshot(0, tx, &s)
if err != nil {
panic(fmt.Sprintf("Error saving %s: %s", s.URL, err))
}
tx.MustExec(`DELETE FROM snapshots WHERE id=$1`, s.ID)
}
err := tx.Commit()
if err != nil {
fmt.Println(err)
err := tx.Rollback()
@@ -47,29 +59,6 @@ func main() {
panic(err)
}
}
for _, s := range snapshots {
s.Host = s.URL.Hostname
fmt.Println(count, s.UID, s.URL.Hostname)
err := gemini.SaveSnapshotToDB(tx, &s)
if err != nil {
fmt.Println(err)
err := tx.Rollback()
if err != nil {
panic(err)
}
}
count += 1
}
err = tx.Commit()
if err != nil {
fmt.Println(err)
err := tx.Rollback()
if err != nil {
panic(err)
}
}
}
}

5
db/url_port_stats.sql Normal file
View File

@@ -0,0 +1,5 @@
-- Counts snapshots by whether their URL carries an explicit port.
-- The host part is matched with [^/:?#]+ and the pattern is anchored at the
-- scheme, so a colon followed by digits later in the path or query
-- (e.g. gemini://host/page:2) is not mistaken for a port.
SELECT
    COUNT(*) AS "All",
    COUNT(CASE WHEN URL ~ '^[^:/]+://[^/:?#]+:[0-9]+' THEN 1 END) AS "With port",
    COUNT(CASE WHEN URL !~ '^[^:/]+://[^/:?#]+:[0-9]+' THEN 1 END) AS "Without port"
FROM snapshots;