Update DB init instructions & README

This commit is contained in:
2025-01-04 15:39:21 +02:00
parent 4e6fad873b
commit ccb8f6838e
2 changed files with 47 additions and 15 deletions

View File

@@ -1,28 +1,25 @@
# gemini-grc
A Gemini crawler.
A crawler for the [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) network. It can easily be extended into a "wayback machine" for Gemini.
URLs to visit as well as data from visited URLs are stored as "snapshots" in the database.
This makes it easily extendable as a "wayback machine" of Gemini.
## Done
- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host
- [x] URL Blacklist
## Features done
- [x] URL normalization
- [x] Handle redirects (3X status codes)
- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
- [x] Save image/* and text/* files
- [x] Concurrent downloading with workers
- [x] Connection limit per host
- [x] URL Blacklist
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
- [x] Handle redirects (3X status codes)
- [x] Better URL normalization
## TODO
- [ ] Add snapshot hash and support snapshot history
- [ ] Add web interface
- [ ] Provide a TLS cert for sites that require it, like Astrobotany
- [ ] Add snapshot history
- [ ] Add a web interface
- [ ] Provide a TLS cert to servers that require one, like Astrobotany
## TODO with lower priority
## TODO (lower priority)
- [ ] Gopher
- [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi
- [ ] Spartan

35
db/sql/initdb.sql Normal file
View File

@@ -0,0 +1,35 @@
-- snapshots: one row per URL (url is UNIQUE), holding the response data or
-- the error recorded for that URL.
-- WARNING: this is an initialization script; it drops any existing snapshots
-- table together with all of its data before recreating it.
DROP TABLE IF EXISTS snapshots;

CREATE TABLE snapshots (
    id            SERIAL PRIMARY KEY,
    url           TEXT NOT NULL UNIQUE,  -- UNIQUE also creates an implicit unique index
    host          TEXT NOT NULL,
    timestamp     TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    mimetype      TEXT,
    data          BYTEA,    -- presumably the raw body of non-gemtext responses; confirm against the crawler
    gemtext       TEXT,     -- presumably the body of text/gemini responses; confirm against the crawler
    links         JSONB,
    lang          TEXT,
    response_code INTEGER,
    error         TEXT
);

-- NOTE(review): the UNIQUE constraint on url already provides an index on that
-- column, so idx_url duplicates it. It is kept so that anything referring to
-- the index by name keeps working.
CREATE INDEX idx_url ON snapshots (url);
CREATE INDEX idx_timestamp ON snapshots (timestamp);
CREATE INDEX idx_mimetype ON snapshots (mimetype);
CREATE INDEX idx_lang ON snapshots (lang);
CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host);

-- No composite (uid, url) index is created: snapshots has no uid column, so
-- such a statement fails and aborts the script. Lookups by url are already
-- served by the unique index on url.

-- Partial index containing only the rows where both response_code and error
-- are NULL (presumably URLs that have not been processed yet; confirm against
-- the crawler's queries).
CREATE INDEX idx_response_code_error_nulls
    ON snapshots (response_code, error)
    WHERE response_code IS NULL AND error IS NULL;
-- urls: one row per URL (url is UNIQUE) with its host and an insertion
-- timestamp that defaults to the current time.
-- WARNING: as with snapshots in this initialization script, any existing urls
-- table is dropped with all of its data, so the script can be re-run without
-- failing on "relation already exists".
DROP TABLE IF EXISTS urls;

CREATE TABLE urls (
    id        SERIAL PRIMARY KEY,
    url       TEXT NOT NULL UNIQUE,  -- UNIQUE also creates an implicit unique index
    host      TEXT NOT NULL,
    timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- NOTE(review): the UNIQUE constraint on url already provides an index on that
-- column, so idx_urls_url duplicates it. It is kept so that anything referring
-- to the index by name keeps working.
CREATE INDEX idx_urls_url ON urls (url);
CREATE INDEX idx_urls_timestamp ON urls (timestamp);