diff --git a/config/config.go b/config/config.go index 670e200..cc2d430 100644 --- a/config/config.go +++ b/config/config.go @@ -17,6 +17,7 @@ const ( EnvResponseTimeout = "RESPONSE_TIMEOUT" EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR" EnvBlacklistPath = "BLACKLIST_PATH" + EnvDryRun = "DRY_RUN" ) // Config holds the application configuration loaded from environment variables. @@ -28,6 +29,7 @@ type Config struct { WorkerBatchSize int // Batch size for worker processing PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL BlacklistPath string // File that has blacklisted strings of "host:port" + DryRun bool // If true, don't write to disk } var CONFIG Config //nolint:gochecknoglobals @@ -126,6 +128,14 @@ func GetConfig() *Config { config.BlacklistPath = v return nil }, + EnvDryRun: func(v string) error { + val, err := parseBool(EnvDryRun, v) + if err != nil { + return err + } + config.DryRun = val + return nil + }, } // Process each environment variable diff --git a/db/fix-url-ports.sql b/db/fix-url-ports.sql new file mode 100644 index 0000000..5128bc2 --- /dev/null +++ b/db/fix-url-ports.sql @@ -0,0 +1,22 @@ +-- Here's an SQL script that will find and remove snapshots without port numbers +-- when there exists a duplicate with the default port 1965. 
+ +-- Before running this DELETE though, you might want to +-- verify the matches first with this SELECT: +WITH duplicates AS ( + SELECT s1.id as id_without_port, s2.id as id_with_port, s1.url as url_without_port, s2.url as url_with_port + FROM snapshots s1 + JOIN snapshots s2 + ON s2.url = s1.url || ':1965' +) +SELECT * FROM duplicates; + +-- Now delete them for real: +WITH duplicates AS ( + SELECT s1.id as id_without_port, s2.id as id_with_port, s1.url as url_without_port, s2.url as url_with_port + FROM snapshots s1 + JOIN snapshots s2 + ON s2.url = s1.url || ':1965' +) +DELETE FROM snapshots +WHERE id IN (SELECT id_without_port FROM duplicates); diff --git a/db/migrate1_host.sh b/db/migrate1_host.sh deleted file mode 100755 index 8463b89..0000000 --- a/db/migrate1_host.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -set -eu - -MAX_RESPONSE_SIZE=10485760 \ -LOG_LEVEL=info \ -ROOT_PATH=./snaps \ -RESPONSE_TIMEOUT=10 \ -NUM_OF_WORKERS=5 \ -PG_DATABASE=gemini \ -PG_HOST=127.0.0.1 \ -PG_PORT=5433 \ -PG_USER=gemini \ -PG_PASSWORD=gemini \ -go run ./migrate1_host.go diff --git a/db/migrate1_host.go b/db/migration_3/migrate3_url_port.go similarity index 58% rename from db/migrate1_host.go rename to db/migration_3/migrate3_url_port.go index ab790ea..b889e36 100644 --- a/db/migrate1_host.go +++ b/db/migration_3/migrate3_url_port.go @@ -9,37 +9,49 @@ import ( "github.com/jmoiron/sqlx" ) +// MANUALLY SET TO TRUE WHEN MIGRATION HAS BEEN APPLIED func checkIfDone() bool { return true } +func MustSelect(tx *sqlx.Tx, dest interface{}, query string, args ...interface{}) { + if err := tx.Select(dest, query, args...); err != nil { + panic(err) + } +} + // Populates the `host` field func main() { - db := connectToDB() - if checkIfDone() { fmt.Println("Migration already applied") return } - count := 0 + db := connectToDB() + defer db.Close() + + batchSize := 1000 for { // Start the transaction - tx, err := db.Beginx() - if err != nil { - fmt.Println(err) - return - } - - query := ` - 
SELECT * FROM snapshots - WHERE host IS NULL - LIMIT 5000 - ` + tx := db.MustBegin() + query := `SELECT * FROM snapshots WHERE url NOT LIKE '%1965%' LIMIT $1` var snapshots []gemini.Snapshot - err = tx.Select(&snapshots, query) + MustSelect(tx, &snapshots, query, batchSize) if len(snapshots) == 0 { - fmt.Println("Done!") - return + fmt.Println("No snapshots remaining, done") + break } + for i, s := range snapshots { + _, err := gemini.ParseURL(s.URL.String(), "") + if err != nil { + panic(fmt.Sprintf("Error parsing URL. ID %d URL %s\n", s.ID, s.URL)) + } + fmt.Printf("Saving %d %d %s\n", i+1, s.ID, s.URL) + err = gemini.UpsertSnapshot(0, tx, &s) + if err != nil { + panic(fmt.Sprintf("Error saving %s: %s", s.URL, err)) + } + tx.MustExec(`DELETE FROM snapshots WHERE id=$1`, s.ID) + } + err := tx.Commit() if err != nil { fmt.Println(err) err := tx.Rollback() @@ -47,29 +59,6 @@ func main() { panic(err) } } - for _, s := range snapshots { - s.Host = s.URL.Hostname - fmt.Println(count, s.UID, s.URL.Hostname) - err := gemini.SaveSnapshotToDB(tx, &s) - if err != nil { - fmt.Println(err) - err := tx.Rollback() - if err != nil { - panic(err) - } - } - count += 1 - } - - err = tx.Commit() - if err != nil { - fmt.Println(err) - err := tx.Rollback() - if err != nil { - panic(err) - } - } - } } diff --git a/db/url_port_stats.sql b/db/url_port_stats.sql new file mode 100644 index 0000000..ab3a3ca --- /dev/null +++ b/db/url_port_stats.sql @@ -0,0 +1,5 @@ +SELECT + COUNT(*) AS "All", + COUNT(CASE WHEN URL ~ '://[^:]+:[0-9]+' THEN 1 END) AS "With port", + COUNT(CASE WHEN URL !~ '://[^:]+:[0-9]+' THEN 1 END) AS "Without port" +FROM snapshots; \ No newline at end of file