DB scripts and migrations
This commit is contained in:
@@ -17,6 +17,7 @@ const (
|
||||
EnvResponseTimeout = "RESPONSE_TIMEOUT"
|
||||
EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR"
|
||||
EnvBlacklistPath = "BLACKLIST_PATH"
|
||||
EnvDryRun = "DRY_RUN"
|
||||
)
|
||||
|
||||
// Config holds the application configuration loaded from environment variables.
|
||||
@@ -28,6 +29,7 @@ type Config struct {
|
||||
WorkerBatchSize int // Batch size for worker processing
|
||||
PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
|
||||
BlacklistPath string // File that has blacklisted strings of "host:port"
|
||||
DryRun bool // If true, don't write to disk
|
||||
}
|
||||
|
||||
var CONFIG Config //nolint:gochecknoglobals
|
||||
@@ -126,6 +128,14 @@ func GetConfig() *Config {
|
||||
config.BlacklistPath = v
|
||||
return nil
|
||||
},
|
||||
EnvDryRun: func(v string) error {
|
||||
val, err := parseBool(EnvDryRun, v)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
config.DryRun = val
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
// Process each environment variable
|
||||
|
||||
22
db/fix-url-ports.sql
Normal file
22
db/fix-url-ports.sql
Normal file
@@ -0,0 +1,22 @@
|
||||
-- Remove snapshots whose URL has no explicit port when a duplicate snapshot
-- exists for the same URL with the default port 1965.
--
-- The port is inserted directly after the host rather than appended to the
-- end of the URL, so URLs that carry a path, query or fragment are paired
-- too (gemini://example.org/foo <-> gemini://example.org:1965/foo).
-- The WHERE guard restricts the left side to gemini:// URLs whose authority
-- has no port; without it, a URL the rewrite does not change would join to
-- itself and be deleted.
--
-- Step 1: review the matches before deleting anything.
WITH duplicates AS (
    SELECT s1.id  AS id_without_port,
           s2.id  AS id_with_port,
           s1.url AS url_without_port,
           s2.url AS url_with_port
    FROM snapshots s1
    JOIN snapshots s2
      ON s2.url = regexp_replace(s1.url, '^(gemini://[^/:?#]+)', '\1:1965')
    WHERE s1.url ~ '^gemini://[^/:?#]+([/?#]|$)'
)
SELECT * FROM duplicates;

-- Step 2: delete the port-less rows. The CTE must stay identical to the one
-- in step 1 so that exactly the reviewed rows are removed.
WITH duplicates AS (
    SELECT s1.id  AS id_without_port,
           s2.id  AS id_with_port,
           s1.url AS url_without_port,
           s2.url AS url_with_port
    FROM snapshots s1
    JOIN snapshots s2
      ON s2.url = regexp_replace(s1.url, '^(gemini://[^/:?#]+)', '\1:1965')
    WHERE s1.url ~ '^gemini://[^/:?#]+([/?#]|$)'
)
DELETE FROM snapshots
WHERE id IN (SELECT id_without_port FROM duplicates);
|
||||
@@ -1,14 +0,0 @@
|
||||
#!/bin/sh
# Runs the migrate1_host.go migration with a fixed, hard-coded environment.
# All settings are prefix assignments on the single `go run` command below.
set -eu

PG_HOST=127.0.0.1 \
PG_PORT=5433 \
PG_DATABASE=gemini \
PG_USER=gemini \
PG_PASSWORD=gemini \
ROOT_PATH=./snaps \
LOG_LEVEL=info \
NUM_OF_WORKERS=5 \
RESPONSE_TIMEOUT=10 \
MAX_RESPONSE_SIZE=10485760 \
go run ./migrate1_host.go
|
||||
@@ -9,37 +9,49 @@ import (
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
// MANUALLY SET TO TRUE WHEN MIGRATION HAS BEEN APPLIED
|
||||
// checkIfDone reports whether the migration has already been applied.
// The result is hard-coded and must be edited by hand; it currently
// returns true, which makes main print a notice and exit early.
func checkIfDone() bool {
	return true
}
|
||||
|
||||
func MustSelect(tx *sqlx.Tx, dest interface{}, query string, args ...interface{}) {
|
||||
if err := tx.Select(dest, query, args...); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Populates the `host` field
|
||||
func main() {
|
||||
db := connectToDB()
|
||||
|
||||
if checkIfDone() {
|
||||
fmt.Println("Migration already applied")
|
||||
return
|
||||
}
|
||||
|
||||
count := 0
|
||||
db := connectToDB()
|
||||
defer db.Close()
|
||||
|
||||
batchSize := 1000
|
||||
for {
|
||||
// Start the transaction
|
||||
tx, err := db.Beginx()
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
query := `
|
||||
SELECT * FROM snapshots
|
||||
WHERE host IS NULL
|
||||
LIMIT 5000
|
||||
`
|
||||
tx := db.MustBegin()
|
||||
query := `SELECT * FROM snapshots WHERE url NOT LIKE '%1965%' LIMIT $1`
|
||||
var snapshots []gemini.Snapshot
|
||||
err = tx.Select(&snapshots, query)
|
||||
MustSelect(tx, &snapshots, query, batchSize)
|
||||
if len(snapshots) == 0 {
|
||||
fmt.Println("Done!")
|
||||
return
|
||||
fmt.Println("No snapshots remaining, done")
|
||||
break
|
||||
}
|
||||
for i, s := range snapshots {
|
||||
_, err := gemini.ParseURL(s.URL.String(), "")
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Error parsing URL. ID %d URL %s\n", s.ID, s.URL))
|
||||
}
|
||||
fmt.Printf("Saving %d %d %s\n", i+1, s.ID, s.URL)
|
||||
err = gemini.UpsertSnapshot(0, tx, &s)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Error saving %s: %s", s.URL, err))
|
||||
}
|
||||
tx.MustExec(`DELETE FROM snapshots WHERE id=$1`, s.ID)
|
||||
}
|
||||
err := tx.Commit()
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
err := tx.Rollback()
|
||||
@@ -47,29 +59,6 @@ func main() {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
for _, s := range snapshots {
|
||||
s.Host = s.URL.Hostname
|
||||
fmt.Println(count, s.UID, s.URL.Hostname)
|
||||
err := gemini.SaveSnapshotToDB(tx, &s)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
err := tx.Rollback()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
count += 1
|
||||
}
|
||||
|
||||
err = tx.Commit()
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
err := tx.Rollback()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
5
db/url_port_stats.sql
Normal file
5
db/url_port_stats.sql
Normal file
@@ -0,0 +1,5 @@
|
||||
-- Count all snapshots, and how many have / lack an explicit port in the
-- authority part of their URL.
--
-- The pattern is anchored at the scheme and the host may not contain '/',
-- '?' or '#', so a colon followed by digits later in the URL (for example
-- in a path such as /file:1.gmi) is not mistaken for a port.
-- Rows with a NULL url are counted only in "All".
SELECT
    COUNT(*) AS "All",
    COUNT(CASE WHEN url ~ '^[^:/?#]+://[^/:?#]+:[0-9]+([/?#]|$)' THEN 1 END) AS "With port",
    COUNT(CASE WHEN url !~ '^[^:/?#]+://[^/:?#]+:[0-9]+([/?#]|$)' THEN 1 END) AS "Without port"
FROM snapshots;
|
||||
Reference in New Issue
Block a user