diff --git a/.gitignore b/.gitignore index e6ef110..91610c8 100644 --- a/.gitignore +++ b/.gitignore @@ -14,5 +14,7 @@ run*.sh /main /db/migration*/** -/db/populate/** +/cmd/populate/** /db/sql/** + +**/.claude/settings.local.json diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..6043c47 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,128 @@ +# gemini-grc Architectural Notes + +## 20250513 - Versioned Snapshots + +The crawler now supports saving multiple versions of the same URL over time, similar to the Internet Archive's Wayback Machine. This document outlines the architecture and changes made to support this feature. + +### Database Schema Changes + +The following changes to the database schema are required: + +```sql +-- Remove UNIQUE constraint from url in snapshots table +ALTER TABLE snapshots DROP CONSTRAINT unique_url; + +-- Enforce uniqueness of (url, timestamp) pairs with a composite unique index +CREATE UNIQUE INDEX idx_url_timestamp ON snapshots (url, timestamp); + +-- Add a new index to efficiently find the latest snapshot +CREATE INDEX idx_url_latest ON snapshots (url, timestamp DESC); +``` + +### Code Changes + +1. **Updated SQL Queries**: + - Changed queries to insert new snapshots without conflict handling + - Added queries to retrieve snapshots by timestamp + - Added queries to retrieve all snapshots for a URL + - Added queries to retrieve snapshots in a date range + +2. **Context-Aware Database Methods**: + - `SaveSnapshot`: Saves a new snapshot with the current timestamp using a context + - `GetLatestSnapshot`: Retrieves the most recent snapshot for a URL using a context + - `GetSnapshotAtTimestamp`: Retrieves the nearest snapshot at or before a given timestamp using a context + - `GetAllSnapshotsForURL`: Retrieves all snapshots for a URL using a context + - `GetSnapshotsByDateRange`: Retrieves snapshots within a date range using a context + +3. 
**Backward Compatibility**: + - The `OverwriteSnapshot` method has been maintained for backward compatibility + - It now delegates to `SaveSnapshot`, effectively creating a new version instead of overwriting + +### Utility Scripts + +A new utility script `snapshot_history.sh` has been created to demonstrate the versioned snapshot functionality: + +- Retrieve the latest snapshot for a URL +- Retrieve a snapshot at a specific point in time +- Retrieve all snapshots for a URL +- Retrieve snapshots within a date range + +### Usage Examples + +```bash +# Get the latest snapshot +./snapshot_history.sh -u gemini://example.com/ + +# Get a snapshot from a specific point in time +./snapshot_history.sh -u gemini://example.com/ -t 2023-05-01T12:00:00Z + +# Get all snapshots for a URL +./snapshot_history.sh -u gemini://example.com/ -a + +# Get snapshots in a date range +./snapshot_history.sh -u gemini://example.com/ -r 2023-01-01T00:00:00Z 2023-12-31T23:59:59Z +``` + +### API Usage Examples + +```go +// Save a new snapshot +ctx := context.Background() +snapshot, _ := snapshot.SnapshotFromURL("gemini://example.com", true) +tx, _ := Database.NewTx(ctx) +err := Database.SaveSnapshot(ctx, tx, snapshot) +tx.Commit() + +// Get the latest snapshot +ctx := context.Background() +tx, _ := Database.NewTx(ctx) +latestSnapshot, err := Database.GetLatestSnapshot(ctx, tx, "gemini://example.com") +tx.Commit() + +// Get a snapshot at a specific time +ctx := context.Background() +timestamp := time.Date(2023, 5, 1, 12, 0, 0, 0, time.UTC) +tx, _ := Database.NewTx(ctx) +historicalSnapshot, err := Database.GetSnapshotAtTimestamp(ctx, tx, "gemini://example.com", timestamp) +tx.Commit() + +// Get all snapshots for a URL +ctx := context.Background() +tx, _ := Database.NewTx(ctx) +allSnapshots, err := Database.GetAllSnapshotsForURL(ctx, tx, "gemini://example.com") +tx.Commit() + +// Using a timeout context to limit database operations +ctx, cancel := context.WithTimeout(context.Background(), 
5*time.Second) +defer cancel() +tx, _ := Database.NewTx(ctx) +latestSnapshot, err := Database.GetLatestSnapshot(ctx, tx, "gemini://example.com") +tx.Commit() +``` + +### Optimizations + +1. **Content Deduplication**: The crawler can avoid storing duplicate content in the database. When content deduplication is enabled (by setting `--skip-identical-content=true`), the system implements the following behavior: + + * **Content-based deduplication**: It compares the content with the latest existing snapshot for the same URL before saving. If the content is identical, the new snapshot is skipped, saving storage space. However, if the content has changed, a new snapshot is still created, preserving the version history. + + When deduplication is disabled (default, `--skip-identical-content=false`), the system stores every snapshot regardless of content similarity and may re-queue URLs that already have snapshots, leading to more frequent re-crawling of all content. + + This approach ensures that the version history for URLs with changing content is always preserved, regardless of the flag setting. The flag only controls whether to store snapshots when content hasn't changed. + +2. **Time-based Crawl Frequency Control**: The crawler can be configured to skip re-crawling URLs that have been recently updated, using the `--skip-if-updated-days=N` parameter: + + * When set to a positive integer N, URLs that have a snapshot less than N days old will not be added to the crawl queue, even if they're found as links in other pages. + + * This feature helps control crawl frequency, ensuring that resources aren't wasted on frequently checking content that rarely changes. + + * Setting `--skip-if-updated-days=0` (the default) disables this feature, meaning all discovered URLs will be queued for crawling regardless of when they were last updated. + + * For example, `--skip-if-updated-days=7` will skip re-crawling any URL that has been crawled within the last week. 
+ +### Future Improvements + +1. Add a web interface to browse snapshot history +2. Implement comparison features to highlight changes between snapshots +3. Add metadata to track crawl batches +4. Implement retention policies to manage storage \ No newline at end of file diff --git a/Makefile b/Makefile index b816083..345c2c6 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,10 @@ -SHELL := /bin/env oksh +SHELL := /bin/sh export PATH := $(PATH) -all: fmt lintfix tidy test clean build +all: fmt lintfix vet tidy test clean build clean: - rm -f ./dist && mkdir ./dist + mkdir -p ./dist && rm -rf ./dist/* debug: @echo "PATH: $(PATH)" @@ -30,12 +30,17 @@ fmt: lint: fmt golangci-lint run +vet: fmt + go vet ./.../ + # Run linter and fix lintfix: fmt golangci-lint run --fix build: - CGO_ENABLED=0 go build -o ./dist/gemini-grc ./main.go + CGO_ENABLED=0 go build -o ./dist/get ./cmd/get/get.go + CGO_ENABLED=0 go build -o ./dist/crawl ./cmd/crawl/crawl.go + CGO_ENABLED=0 go build -o ./dist/crawler ./cmd/crawler/crawler.go show-updates: go list -m -u all diff --git a/NOTES.md b/NOTES.md new file mode 100644 index 0000000..1e09522 --- /dev/null +++ b/NOTES.md @@ -0,0 +1,13 @@ +# Notes + +Avoiding endless loops while crawling + +- Make sure we follow robots.txt +- Announce our own agent so people can block us in their robots.txt +- Put a limit on the number of pages per host, and notify when the limit is reached. +- Put a limit on the number of redirects (not needed?) + +Heuristics: + +- Do _not_ parse links from pages that have '/git/' or '/cgi/' or '/cgi-bin/' in their URLs. +- Have a list of "whitelisted" hosts/urls that we visit at regular intervals. diff --git a/README.md b/README.md index 34d07ae..87109cf 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Easily extendable as a "wayback machine" of Gemini. 
- [x] Concurrent downloading with configurable number of workers - [x] Connection limit per host - [x] URL Blacklist +- [x] URL Whitelist (overrides blacklist and robots.txt) - [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi - [x] Configuration via environment variables - [x] Storing capsule snapshots in PostgreSQL @@ -16,6 +17,9 @@ Easily extendable as a "wayback machine" of Gemini. - [x] Handle redirects (3X status codes) - [x] Crawl Gopher holes +## Security Note +This crawler uses `InsecureSkipVerify: true` in TLS configuration to accept all certificates. This is a common approach for crawlers but makes the application vulnerable to MITM attacks. This trade-off is made to enable crawling self-signed certificates widely used in the Gemini ecosystem. + ## How to run Spin up a PostgreSQL, check `db/sql/initdb.sql` to create the tables and start the crawler. @@ -30,11 +34,12 @@ Bool can be `true`,`false` or `0`,`1`. MaxResponseSize int // Maximum size of response in bytes NumOfWorkers int // Number of concurrent workers ResponseTimeout int // Timeout for responses in seconds - WorkerBatchSize int // Batch size for worker processing PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL BlacklistPath string // File that has blacklisted strings of "host:port" + WhitelistPath string // File with URLs that should always be crawled regardless of blacklist or robots.txt DryRun bool // If false, don't write to disk - PrintWorkerStatus bool // If false, print logs and not worker status table + SkipIdenticalContent bool // When true, skip storing snapshots with identical content + SkipIfUpdatedDays int // Skip re-crawling URLs updated within this many days (0 to disable) ``` Example: @@ -42,8 +47,8 @@ Example: ```shell LOG_LEVEL=info \ NUM_OF_WORKERS=10 \ -WORKER_BATCH_SIZE=10 \ BLACKLIST_PATH="./blacklist.txt" \ # one url per line, can be empty +WHITELIST_PATH="./whitelist.txt" \ # URLs that override blacklist and 
robots.txt MAX_RESPONSE_SIZE=10485760 \ RESPONSE_TIMEOUT=10 \ PANIC_ON_UNEXPECTED_ERROR=true \ @@ -54,6 +59,8 @@ PG_PORT=5434 \ PG_USER=test \ PG_PASSWORD=test \ DRY_RUN=false \ +SKIP_IDENTICAL_CONTENT=false \ +SKIP_IF_UPDATED_DAYS=7 \ ./gemini-grc ``` @@ -65,8 +72,30 @@ go install mvdan.cc/gofumpt@v0.7.0 go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.63.4 ``` +## Snapshot History + +The crawler now supports versioned snapshots, storing multiple snapshots of the same URL over time. This allows you to view how content changes over time, similar to the Internet Archive's Wayback Machine. + +### Accessing Snapshot History + +You can access the snapshot history using the included `snapshot_history.sh` script: + +```bash +# Get the latest snapshot +./snapshot_history.sh -u gemini://example.com/ + +# Get a snapshot from a specific point in time +./snapshot_history.sh -u gemini://example.com/ -t 2023-05-01T12:00:00Z + +# Get all snapshots for a URL +./snapshot_history.sh -u gemini://example.com/ -a + +# Get snapshots in a date range +./snapshot_history.sh -u gemini://example.com/ -r 2023-01-01T00:00:00Z 2023-12-31T23:59:59Z +``` + ## TODO -- [ ] Add snapshot history +- [x] Add snapshot history - [ ] Add a web interface - [ ] Provide to servers a TLS cert for sites that require it, like Astrobotany - [ ] Use pledge/unveil in OpenBSD hosts diff --git a/go.mod b/go.mod index 2550fed..b93a867 100644 --- a/go.mod +++ b/go.mod @@ -1,15 +1,14 @@ module gemini-grc -go 1.23.1 +go 1.24.3 require ( - github.com/antanst/go_errors v0.0.1 + git.antanst.com/antanst/uid v0.0.1 + git.antanst.com/antanst/xerrors v0.0.1 github.com/guregu/null/v5 v5.0.0 github.com/jackc/pgx/v5 v5.7.2 github.com/jmoiron/sqlx v1.4.0 github.com/lib/pq v1.10.9 - github.com/matoous/go-nanoid/v2 v2.1.0 - github.com/rs/zerolog v1.33.0 github.com/stretchr/testify v1.9.0 golang.org/x/text v0.21.0 ) @@ -20,12 +19,13 @@ require ( github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // 
indirect github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/kr/text v0.2.0 // indirect - github.com/mattn/go-colorable v0.1.14 // indirect - github.com/mattn/go-isatty v0.0.20 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rogpeppe/go-internal v1.13.1 // indirect golang.org/x/crypto v0.32.0 // indirect golang.org/x/sync v0.10.0 // indirect - golang.org/x/sys v0.29.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) + +replace git.antanst.com/antanst/xerrors => ../xerrors + +replace git.antanst.com/antanst/uid => ../uid diff --git a/go.sum b/go.sum index a2ea9c1..a837054 100644 --- a/go.sum +++ b/go.sum @@ -1,15 +1,11 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= -github.com/antanst/go_errors v0.0.1 h1:55BJ8I3u9IeLJxVslbI8Hv8fM0+fWyIE2VQXuwuYg9Y= -github.com/antanst/go_errors v0.0.1/go.mod h1:VDiDlRB7JfRhr6GMqdChBGT1XTBIfzELhg3Yq7sVwhM= -github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y= github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= -github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/guregu/null/v5 v5.0.0 h1:PRxjqyOekS11W+w/7Vfz6jgJE/BCwELWtgvOJzddimw= github.com/guregu/null/v5 v5.0.0/go.mod h1:SjupzNy+sCPtwQTKWhUCqjhVCO69hpsl2QsZrWHjlwU= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= @@ -28,25 +24,12 @@ 
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= -github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE= -github.com/matoous/go-nanoid/v2 v2.1.0/go.mod h1:KlbGNQ+FhrUNIHUxZdL63t7tl4LaPkZNpUULS8H4uVM= -github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= -github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= -github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= -github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= -github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= -github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= -github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= -github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= 
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -56,11 +39,6 @@ golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc= golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= -golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=