Lots of features, first version that reliably crawls Geminispace.

- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host
- [x] URL Blacklist
- [x] Save image/* and text/* files
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
.

.
This commit is contained in:
2024-10-21 20:03:28 +03:00
parent 212345764b
commit fee7d3e01c
37 changed files with 1231 additions and 319 deletions

20
db/populateDB.go Normal file
View File

@@ -0,0 +1,20 @@
// NOTE: PopulateDB below is entirely commented out, so nothing here is
// compiled or executed. It is kept as a reference for seeding the snapshots
// table with a fixed list of Gemini URLs.
//
// As written, the disabled function would:
//   - run TRUNCATE on the snapshots table, which is destructive: every
//     existing row is removed before seeding;
//   - insert one row per seed URL into snapshots(uid, url, timestamp), using
//     uid.UID() for the uid column and a single shared timestamp of 48 hours
//     before the time of the call.
//
// NOTE(review): db.MustExec presumably panics on any SQL error (sqlx
// convention) — confirm before re-enabling, and never point this at a
// database whose snapshots need to be preserved.
//
// func PopulateDB(db *sqlx.DB) {
// // Delete all rows in the snapshots table
// db.MustExec("TRUNCATE snapshots;")
// // Prepare the query for inserting a snapshot with uid, url, and timestamp
// query := `INSERT INTO snapshots(uid, url, timestamp)
// VALUES ($1, $2, $3)`
// // Calculate the timestamp for 2 days ago
// timestamp := time.Now().Add(-48 * time.Hour)
// db.MustExec(query, uid.UID(), "gemini://geminiprotocol.net/", timestamp)
// db.MustExec(query, uid.UID(), "gemini://warmedal.se/~antenna", timestamp)
// db.MustExec(query, uid.UID(), "gemini://skyjake.fi/~Cosmos/", timestamp)
// db.MustExec(query, uid.UID(), "gemini://gemini.circumlunar.space/capcom/", timestamp)
// db.MustExec(query, uid.UID(), "gemini://auragem.letz.dev/", timestamp)
// db.MustExec(query, uid.UID(), "gemini://gemplex.space/", timestamp)
// db.MustExec(query, uid.UID(), "gemini://kennedy.gemi.dev/", timestamp)
// db.MustExec(query, uid.UID(), "gemini://tlgs.one/", timestamp)
// }