DB scripts and migrations
This commit is contained in:
@@ -17,6 +17,7 @@ const (
|
|||||||
EnvResponseTimeout = "RESPONSE_TIMEOUT"
|
EnvResponseTimeout = "RESPONSE_TIMEOUT"
|
||||||
EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR"
|
EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR"
|
||||||
EnvBlacklistPath = "BLACKLIST_PATH"
|
EnvBlacklistPath = "BLACKLIST_PATH"
|
||||||
|
EnvDryRun = "DRY_RUN"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config holds the application configuration loaded from environment variables.
|
// Config holds the application configuration loaded from environment variables.
|
||||||
@@ -28,6 +29,7 @@ type Config struct {
|
|||||||
WorkerBatchSize int // Batch size for worker processing
|
WorkerBatchSize int // Batch size for worker processing
|
||||||
PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
|
PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
|
||||||
BlacklistPath string // File that has blacklisted strings of "host:port"
|
BlacklistPath string // File that has blacklisted strings of "host:port"
|
||||||
|
DryRun bool // If true, don't write to disk (NOTE(review): confirm polarity against the code that reads DryRun)
|
||||||
}
|
}
|
||||||
|
|
||||||
var CONFIG Config //nolint:gochecknoglobals
|
var CONFIG Config //nolint:gochecknoglobals
|
||||||
@@ -126,6 +128,14 @@ func GetConfig() *Config {
|
|||||||
config.BlacklistPath = v
|
config.BlacklistPath = v
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
|
EnvDryRun: func(v string) error {
|
||||||
|
val, err := parseBool(EnvDryRun, v)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
config.DryRun = val
|
||||||
|
return nil
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process each environment variable
|
// Process each environment variable
|
||||||
|
|||||||
22
db/fix-url-ports.sql
Normal file
22
db/fix-url-ports.sql
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
-- Finds and removes snapshots whose URL has no port number when a duplicate
-- snapshot exists for the same URL with the default port 1965.
--
-- The port sits right after the host (scheme://host:1965/path), so the URL of
-- the expected duplicate is built by inserting ':1965' after the host part.
-- Simply appending ':1965' to the whole URL would only ever match URLs that
-- have no path at all.
--
-- The WHERE guard keeps only URLs of the form scheme://host followed by
-- '/', '?', '#' or end of string, i.e. URLs without a port. Rows that do not
-- match the guard are left untouched, so a URL that regexp_replace cannot
-- rewrite can never be joined with itself and deleted by mistake.

-- Before running the DELETE, verify the matches first with this SELECT:
WITH duplicates AS (
    SELECT s1.id AS id_without_port, s2.id AS id_with_port, s1.url AS url_without_port, s2.url AS url_with_port
    FROM snapshots s1
    JOIN snapshots s2
    ON s2.url = regexp_replace(s1.url, '^([^:/?#]+://[^/:?#]+)', '\1:1965')
    AND s2.id <> s1.id
    WHERE s1.url ~ '^[^:/?#]+://[^/:?#]+([/?#]|$)'
)
SELECT * FROM duplicates;

-- Now delete them for real:
WITH duplicates AS (
    SELECT s1.id AS id_without_port, s2.id AS id_with_port, s1.url AS url_without_port, s2.url AS url_with_port
    FROM snapshots s1
    JOIN snapshots s2
    ON s2.url = regexp_replace(s1.url, '^([^:/?#]+://[^/:?#]+)', '\1:1965')
    AND s2.id <> s1.id
    WHERE s1.url ~ '^[^:/?#]+://[^/:?#]+([/?#]|$)'
)
DELETE FROM snapshots
WHERE id IN (SELECT id_without_port FROM duplicates);
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
set -eu
|
|
||||||
|
|
||||||
MAX_RESPONSE_SIZE=10485760 \
|
|
||||||
LOG_LEVEL=info \
|
|
||||||
ROOT_PATH=./snaps \
|
|
||||||
RESPONSE_TIMEOUT=10 \
|
|
||||||
NUM_OF_WORKERS=5 \
|
|
||||||
PG_DATABASE=gemini \
|
|
||||||
PG_HOST=127.0.0.1 \
|
|
||||||
PG_PORT=5433 \
|
|
||||||
PG_USER=gemini \
|
|
||||||
PG_PASSWORD=gemini \
|
|
||||||
go run ./migrate1_host.go
|
|
||||||
@@ -9,37 +9,49 @@ import (
|
|||||||
"github.com/jmoiron/sqlx"
|
"github.com/jmoiron/sqlx"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// checkIfDone reports whether this migration has already been applied.
// It is a manual switch: set the returned value to true once the migration
// has been applied.
func checkIfDone() bool {
	return true
}
|
||||||
|
|
||||||
|
func MustSelect(tx *sqlx.Tx, dest interface{}, query string, args ...interface{}) {
|
||||||
|
if err := tx.Select(dest, query, args...); err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Populates the `host` field
|
// Populates the `host` field
|
||||||
func main() {
|
func main() {
|
||||||
db := connectToDB()
|
|
||||||
|
|
||||||
if checkIfDone() {
|
if checkIfDone() {
|
||||||
fmt.Println("Migration already applied")
|
fmt.Println("Migration already applied")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
count := 0
|
db := connectToDB()
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
batchSize := 1000
|
||||||
for {
|
for {
|
||||||
// Start the transaction
|
// Start the transaction
|
||||||
tx, err := db.Beginx()
|
tx := db.MustBegin()
|
||||||
if err != nil {
|
query := `SELECT * FROM snapshots WHERE url NOT LIKE '%1965%' LIMIT $1`
|
||||||
fmt.Println(err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
query := `
|
|
||||||
SELECT * FROM snapshots
|
|
||||||
WHERE host IS NULL
|
|
||||||
LIMIT 5000
|
|
||||||
`
|
|
||||||
var snapshots []gemini.Snapshot
|
var snapshots []gemini.Snapshot
|
||||||
err = tx.Select(&snapshots, query)
|
MustSelect(tx, &snapshots, query, batchSize)
|
||||||
if len(snapshots) == 0 {
|
if len(snapshots) == 0 {
|
||||||
fmt.Println("Done!")
|
fmt.Println("No snapshots remaining, done")
|
||||||
return
|
break
|
||||||
}
|
}
|
||||||
|
for i, s := range snapshots {
|
||||||
|
_, err := gemini.ParseURL(s.URL.String(), "")
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Sprintf("Error parsing URL. ID %d URL %s\n", s.ID, s.URL))
|
||||||
|
}
|
||||||
|
fmt.Printf("Saving %d %d %s\n", i+1, s.ID, s.URL)
|
||||||
|
err = gemini.UpsertSnapshot(0, tx, &s)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Sprintf("Error saving %s: %s", s.URL, err))
|
||||||
|
}
|
||||||
|
tx.MustExec(`DELETE FROM snapshots WHERE id=$1`, s.ID)
|
||||||
|
}
|
||||||
|
err := tx.Commit()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Println(err)
|
fmt.Println(err)
|
||||||
err := tx.Rollback()
|
err := tx.Rollback()
|
||||||
@@ -47,29 +59,6 @@ func main() {
|
|||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, s := range snapshots {
|
|
||||||
s.Host = s.URL.Hostname
|
|
||||||
fmt.Println(count, s.UID, s.URL.Hostname)
|
|
||||||
err := gemini.SaveSnapshotToDB(tx, &s)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Println(err)
|
|
||||||
err := tx.Rollback()
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
count += 1
|
|
||||||
}
|
|
||||||
|
|
||||||
err = tx.Commit()
|
|
||||||
if err != nil {
|
|
||||||
fmt.Println(err)
|
|
||||||
err := tx.Rollback()
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
5
db/url_port_stats.sql
Normal file
5
db/url_port_stats.sql
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
-- Counts all snapshots, and how many of their URLs do / do not carry an
-- explicit port right after the host.
--
-- The pattern is anchored at the scheme and the host part is limited to
-- [^/:?#]+ so that a ':<digits>' appearing later in the path or query
-- (for example gemini://host/time/12:30) is not mistaken for a port.
SELECT
    COUNT(*) AS "All",
    COUNT(CASE WHEN URL ~ '^[^:/?#]+://[^/:?#]+:[0-9]+' THEN 1 END) AS "With port",
    COUNT(CASE WHEN URL !~ '^[^:/?#]+://[^/:?#]+:[0-9]+' THEN 1 END) AS "Without port"
FROM snapshots;
|
||||||
Reference in New Issue
Block a user