Update DB init instructions & README
This commit is contained in:
27
README.md
27
README.md
@@ -1,28 +1,25 @@
|
||||
# gemini-grc
|
||||
|
||||
A Gemini crawler.
|
||||
A crawler for the [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) network. Easily extendable as a "wayback machine" of Gemini.
|
||||
|
||||
URLs to visit as well as data from visited URLs are stored as "snapshots" in the database.
|
||||
This makes it easily extendable as a "wayback machine" of Gemini.
|
||||
|
||||
## Done
|
||||
- [x] Concurrent downloading with workers
|
||||
- [x] Concurrent connection limit per host
|
||||
- [x] URL Blacklist
|
||||
## Features done
|
||||
- [x] URL normalization
|
||||
- [x] Handle redirects (3X status codes)
|
||||
- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
|
||||
- [x] Save image/* and text/* files
|
||||
- [x] Concurrent downloading with workers
|
||||
- [x] Connection limit per host
|
||||
- [x] URL Blacklist
|
||||
- [x] Configuration via environment variables
|
||||
- [x] Storing snapshots in PostgreSQL
|
||||
- [x] Proper response header & body UTF-8 and format validation
|
||||
- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
|
||||
- [x] Handle redirects (3X status codes)
|
||||
- [x] Better URL normalization
|
||||
|
||||
## TODO
|
||||
- [ ] Add snapshot hash and support snapshot history
|
||||
- [ ] Add web interface
|
||||
- [ ] Provide a TLS cert for sites that require it, like Astrobotany
|
||||
- [ ] Add snapshot history
|
||||
- [ ] Add a web interface
|
||||
- [ ] Provide a TLS client certificate to servers that require one, like Astrobotany
|
||||
|
||||
## TODO with lower priority
|
||||
## TODO (lower priority)
|
||||
- [ ] Gopher
|
||||
- [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi
|
||||
- [ ] Spartan
|
||||
|
||||
35
db/sql/initdb.sql
Normal file
35
db/sql/initdb.sql
Normal file
@@ -0,0 +1,35 @@
|
||||
-- snapshots: one row per URL (url is UNIQUE).
-- The table is dropped and rebuilt from scratch every time this script runs,
-- so any existing snapshot rows are discarded.
DROP TABLE IF EXISTS snapshots;

CREATE TABLE snapshots (
    id            SERIAL,
    url           TEXT        NOT NULL,
    host          TEXT        NOT NULL,
    -- Defaults to the insertion time when the writer does not supply a value.
    timestamp     TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    mimetype      TEXT,
    -- NOTE(review): presumably raw bytes for non-gemtext responses and the
    -- parsed text for gemtext ones; confirm against the crawler code.
    data          BYTEA,
    gemtext       TEXT,
    -- NOTE(review): presumably the links extracted from the page; confirm.
    links         JSONB,
    lang          TEXT,
    -- response_code and error are both nullable; a row can exist with
    -- neither set.
    response_code INTEGER,
    error         TEXT,
    -- Constraint names are the ones PostgreSQL would generate for the
    -- equivalent inline PRIMARY KEY / UNIQUE column constraints.
    CONSTRAINT snapshots_pkey    PRIMARY KEY (id),
    CONSTRAINT snapshots_url_key UNIQUE (url)
);
|
||||
|
||||
-- Secondary indexes on snapshots, one per commonly filtered column.
--
-- NOTE(review): idx_url duplicates the index PostgreSQL already builds for
-- the UNIQUE constraint on snapshots.url. It is kept so the index name stays
-- available; consider dropping it to save write cost.
CREATE INDEX idx_url ON snapshots (url);
CREATE INDEX idx_timestamp ON snapshots (timestamp);
CREATE INDEX idx_mimetype ON snapshots (mimetype);
CREATE INDEX idx_lang ON snapshots (lang);
CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host);

-- The former index unique_uid_url ON snapshots (uid, url) was removed:
-- snapshots has no uid column, so the statement failed with
-- "column "uid" does not exist". Uniqueness of url is enforced by the
-- UNIQUE constraint on the column itself.

-- Partial index containing only rows where both response_code and error are
-- NULL. NOTE(review): presumably these are rows not yet crawled; confirm.
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error)
    WHERE response_code IS NULL AND error IS NULL;
|
||||
|
||||
-- urls: one row per URL (url is UNIQUE), with its host and an insertion
-- timestamp.
--
-- Dropped first, like snapshots, so the script can be re-run: without the
-- DROP a second run fails with "relation "urls" already exists" on the table
-- and on both indexes. As with snapshots, re-running discards existing rows.
DROP TABLE IF EXISTS urls;

CREATE TABLE urls (
    id SERIAL PRIMARY KEY,
    url TEXT NOT NULL UNIQUE,
    host TEXT NOT NULL,
    -- Defaults to the insertion time when the writer does not supply a value.
    timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- NOTE(review): idx_urls_url duplicates the index PostgreSQL already builds
-- for the UNIQUE constraint on urls.url; kept for name compatibility.
CREATE INDEX idx_urls_url ON urls (url);
CREATE INDEX idx_urls_timestamp ON urls (timestamp);
|
||||
|
||||
Reference in New Issue
Block a user