Update DB init instructions & README

2025-01-04 15:39:21 +02:00
parent 4e6fad873b
commit ccb8f6838e
2 changed files with 47 additions and 15 deletions
--- a/README.md
+++ b/README.md
@@ -1,28 +1,25 @@
 # gemini-grc

-A Gemini crawler.
+A crawler for the [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) network. Easily extendable as a "wayback machine" of Gemini.

-URLs to visit as well as data from visited URLs are stored as "snapshots" in the database.
-This makes it easily extendable as a "wayback machine" of Gemini.
-
-## Done
- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host
- [x] URL Blacklist
+## Features done
+- [x] URL normalization
+- [x] Handle redirects (3X status codes)
+- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
 - [x] Save image/* and text/* files
+- [x] Concurrent downloading with workers
+- [x] Connection limit per host
+- [x] URL Blacklist
 - [x] Configuration via environment variables
 - [x] Storing snapshots in PostgreSQL
 - [x] Proper response header & body UTF-8 and format validation
- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
- [x] Handle redirects (3X status codes)
- [x] Better URL normalization

 ## TODO
- [ ] Add snapshot hash and support snapshot history
- [ ] Add web interface
- [ ] Provide a TLS cert for sites that require it, like Astrobotany
+- [ ] Add snapshot history
+- [ ] Add a web interface
+- [ ] Provide to servers a TLS cert for sites that require it, like Astrobotany

-## TODO with lower priority
+## TODO (lower priority)
 - [ ] Gopher
 - [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi
 - [ ] Spartan
--- a/db/sql/initdb.sql
+++ b/db/sql/initdb.sql
@@ -0,0 +1,35 @@
+-- (Re)create the snapshots table and its indexes (PostgreSQL).
+-- WARNING: an existing snapshots table is dropped together with all its rows.
+DROP TABLE IF EXISTS snapshots;
+
+-- At most one row per URL: url carries a UNIQUE constraint.
+CREATE TABLE snapshots (
+    id SERIAL PRIMARY KEY,
+    url TEXT NOT NULL UNIQUE,
+    host TEXT NOT NULL,
+    -- Filled in automatically at insert time when the writer omits it.
+    timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    -- All remaining columns are nullable.
+    -- NOTE(review): presumably they stay NULL until the URL has been fetched;
+    -- confirm against the code that writes this table.
+    mimetype TEXT,
+    data BYTEA,
+    gemtext TEXT,
+    links JSONB,
+    lang TEXT,
+    response_code INTEGER,
+    error TEXT
+);
+
+-- NOTE(review): PostgreSQL already builds a unique index for the UNIQUE
+-- constraint on url, so idx_url is likely redundant; it is kept here only so
+-- that the index name continues to exist.
+CREATE INDEX idx_url ON snapshots (url);
+CREATE INDEX idx_timestamp ON snapshots (timestamp);
+CREATE INDEX idx_mimetype ON snapshots (mimetype);
+CREATE INDEX idx_lang ON snapshots (lang);
+CREATE INDEX idx_response_code ON snapshots (response_code);
+CREATE INDEX idx_error ON snapshots (error);
+CREATE INDEX idx_host ON snapshots (host);
+-- The former index unique_uid_url on (uid, url) was removed: snapshots has no
+-- uid column, so that statement always failed with "column does not exist".
+-- Partial index that contains only rows where both response_code and error
+-- are NULL (presumably URLs that have not been visited yet -- TODO confirm).
+CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;
+
+-- (Re)create the urls table and its indexes (PostgreSQL).
+-- The table is dropped first so this script can be run more than once, the
+-- same way snapshots is handled; without the DROP a second run fails with
+-- "relation already exists".
+-- WARNING: existing urls rows are discarded.
+DROP TABLE IF EXISTS urls;
+
+-- At most one row per URL: url carries a UNIQUE constraint.
+CREATE TABLE urls (
+    id SERIAL PRIMARY KEY,
+    url TEXT NOT NULL UNIQUE,
+    host TEXT NOT NULL,
+    -- Filled in automatically at insert time when the writer omits it.
+    timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+-- NOTE(review): the UNIQUE constraint on url already provides an index in
+-- PostgreSQL, so idx_urls_url is likely redundant; kept to preserve the name.
+CREATE INDEX idx_urls_url ON urls (url);
+CREATE INDEX idx_urls_timestamp ON urls (timestamp);
+