From ccb8f6838e0f4d70030dcf013aeb0e75880ea86c Mon Sep 17 00:00:00 2001 From: antanst Date: Sat, 4 Jan 2025 15:39:21 +0200 Subject: [PATCH] Update DB init instructions & README --- README.md | 27 ++++++++++++--------------- db/sql/initdb.sql | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 db/sql/initdb.sql diff --git a/README.md b/README.md index eab5653..800cab5 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,25 @@ # gemini-grc -A Gemini crawler. +A crawler for the [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) network. Easily extendable as a "wayback machine" of Gemini. -URLs to visit as well as data from visited URLs are stored as "snapshots" in the database. -This makes it easily extendable as a "wayback machine" of Gemini. - -## Done -- [x] Concurrent downloading with workers -- [x] Concurrent connection limit per host -- [x] URL Blacklist +## Features done +- [x] URL normalization +- [x] Handle redirects (3X status codes) +- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi - [x] Save image/* and text/* files +- [x] Concurrent downloading with workers +- [x] Connection limit per host +- [x] URL Blacklist - [x] Configuration via environment variables - [x] Storing snapshots in PostgreSQL - [x] Proper response header & body UTF-8 and format validation -- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi -- [x] Handle redirects (3X status codes) -- [x] Better URL normalization ## TODO -- [ ] Add snapshot hash and support snapshot history -- [ ] Add web interface -- [ ] Provide a TLS cert for sites that require it, like Astrobotany +- [ ] Add snapshot history +- [ ] Add a web interface +- [ ] Provide a TLS client certificate to servers that require one, like Astrobotany -## TODO with lower priority +## TODO (lower priority) - [ ] Gopher - [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi - [ ] Spartan diff --git
a/db/sql/initdb.sql b/db/sql/initdb.sql
new file mode 100644
index 0000000..c4160a5
--- /dev/null
+++ b/db/sql/initdb.sql
@@ -0,0 +1,48 @@
+-- Schema bootstrap for the gemini-grc crawler database (PostgreSQL).
+-- WARNING: running this script drops and recreates "snapshots", discarding
+-- all rows stored in it. "urls" is only created when it does not exist yet.
+
+DROP TABLE IF EXISTS snapshots;
+
+-- One row per URL: url carries a UNIQUE constraint.
+CREATE TABLE snapshots (
+    id SERIAL PRIMARY KEY,
+    url TEXT NOT NULL UNIQUE,
+    host TEXT NOT NULL,
+    timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    mimetype TEXT,
+    data BYTEA,
+    gemtext TEXT,
+    links JSONB,
+    lang TEXT,
+    response_code INTEGER,
+    error TEXT
+);
+
+-- NOTE(review): the UNIQUE constraint on url already creates an index, so idx_url
+-- is redundant; it is kept so that existing index names stay available.
+CREATE INDEX idx_url ON snapshots (url);
+CREATE INDEX idx_timestamp ON snapshots (timestamp);
+CREATE INDEX idx_mimetype ON snapshots (mimetype);
+CREATE INDEX idx_lang ON snapshots (lang);
+CREATE INDEX idx_response_code ON snapshots (response_code);
+CREATE INDEX idx_error ON snapshots (error);
+CREATE INDEX idx_host ON snapshots (host);
+-- The former (uid, url) index was removed: snapshots has no "uid" column, so
+-- that statement failed with: column "uid" does not exist.
+-- Partial index covering only rows where both response_code and error are NULL
+-- (presumably URLs not fetched yet -- confirm against the crawler code).
+CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error)
+    WHERE response_code IS NULL AND error IS NULL;
+
+-- IF NOT EXISTS keeps a re-run from failing on the already-present table and
+-- indexes; existing urls rows are left untouched.
+CREATE TABLE IF NOT EXISTS urls (
+    id SERIAL PRIMARY KEY,
+    url TEXT NOT NULL UNIQUE,
+    host TEXT NOT NULL,
+    timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+CREATE INDEX IF NOT EXISTS idx_urls_url ON urls (url);
+CREATE INDEX IF NOT EXISTS idx_urls_timestamp ON urls (timestamp);
+