Compare commits
49 commits: 4e6fad873b ... main
| SHA1 |
|---|
| f362a1d2da |
| 7b3ad38f03 |
| 8e30a6a365 |
| 26311a6d2b |
| 57eb2555c5 |
| 453cf2294a |
| ee2076f337 |
| acbac15c20 |
| ddbe6b461b |
| 55bb0d96d0 |
| 349968d019 |
| 2357135d5a |
| 98d3ed6707 |
| 8b498a2603 |
| 8588414b14 |
| 5e6dabf1e7 |
| a8173544e7 |
| 3d07b56e8c |
| c54c093a10 |
| 57f5c0e865 |
| dc6eb610a2 |
| 39e9ead982 |
| 5f4da4f806 |
| 4ef3f70f1f |
| b8ea6fab4a |
| 5fe1490f1e |
| a41490f834 |
| 701a5df44f |
| 5b84960c5a |
| be38104f05 |
| d70d6c35a3 |
| 8399225046 |
| e8e26ec76a |
| f6ac5003b0 |
| e626aabecb |
| ebf59c50b8 |
| 2a041fec7c |
| ca008b0796 |
| 8350e106d6 |
| 9c7502b2a8 |
| dda21e833c |
| b0e7052c10 |
| 43b207c9ab |
| 285f2955e7 |
| 998b0e74ec |
| 766ee26f68 |
| 5357ceb04d |
| 03e1849191 |
| ccb8f6838e |
.gitignore (vendored, 23 changes)

@@ -1,10 +1,29 @@
-.idea
-.goroot
 **/.#*
 **/*~
+**/.DS_Store
+/.idea
+/.goroot
+/dist/**
+/blacklist.txt
+/check.sh
+/debug.sh
+/run.sh
 /.go
 /cmd
 /db/initdb.sql
 /gemini-grc
 run*.sh
 /main
+/db/migration*/**
+/cmd/populate/**
+/db/sql/**
+
+**/.claude/settings.local.json
+
+/crawl.sh
+/crawler.sh
+/get.sh
+/snapshot_history.sh
+/whitelist.txt
+
+/CLAUDE.md
ARCHITECTURE.md (new file, 169 lines)

@@ -0,0 +1,169 @@

# gemini-grc Architectural Notes

## 20250513 - Versioned Snapshots

The crawler now supports saving multiple versions of the same URL over time, similar to the Internet Archive's Wayback Machine. This document outlines the architecture and changes made to support this feature.

### Database Schema Changes

The following changes to the database schema are required:

```sql
-- Remove the UNIQUE constraint from url in the snapshots table
ALTER TABLE snapshots DROP CONSTRAINT unique_url;

-- Create a composite unique index on url and timestamp
CREATE UNIQUE INDEX idx_url_timestamp ON snapshots (url, timestamp);

-- Add a new index to efficiently find the latest snapshot
CREATE INDEX idx_url_latest ON snapshots (url, timestamp DESC);
```
## Error handling

- The `xerrors` library is used for error creation and wrapping (see the sketch below).
- The "Fatal" field is not used; we _always_ panic on fatal errors.
- _All_ internal functions _must_ return `xerror` errors.
- _All_ external errors are wrapped within `xerror` errors.
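A minimal sketch of this wrapping convention. It reuses the `xerrors.NewError` call shape that appears in `common/blackList/blacklist.go` later in this diff; `readSeedFile` is a hypothetical function, and the semantics of the `0`, `""`, and `true` arguments are copied from that call rather than documented here.

```go
package seedList

import (
	"fmt"
	"os"

	"git.antanst.com/antanst/xerrors"
)

// readSeedFile illustrates the rule above: the external error from
// os.ReadFile is wrapped into an xerror before it crosses an internal
// boundary, so callers only ever see xerror values.
func readSeedFile(path string) ([]byte, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, xerrors.NewError(fmt.Errorf("could not read seed file %s: %w", path, err), 0, "", true)
	}
	return data, nil
}
```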
### Code Changes

1. **Updated SQL Queries**:
   - Changed queries to insert new snapshots without conflict handling
   - Added queries to retrieve snapshots by timestamp
   - Added queries to retrieve all snapshots for a URL
   - Added queries to retrieve snapshots in a date range

2. **Context-Aware Database Methods**:
   - `SaveSnapshot`: Saves a new snapshot with the current timestamp using a context
   - `GetLatestSnapshot`: Retrieves the most recent snapshot for a URL using a context
   - `GetSnapshotAtTimestamp`: Retrieves the nearest snapshot at or before a given timestamp using a context
   - `GetAllSnapshotsForURL`: Retrieves all snapshots for a URL using a context
   - `GetSnapshotsByDateRange`: Retrieves snapshots within a date range using a context

3. **Backward Compatibility**:
   - The `OverwriteSnapshot` method has been maintained for backward compatibility
   - It now delegates to `SaveSnapshot`, effectively creating a new version instead of overwriting
### Utility Scripts

A new utility script, `snapshot_history.sh`, has been created to demonstrate the versioned snapshot functionality:

- Retrieve the latest snapshot for a URL
- Retrieve a snapshot at a specific point in time
- Retrieve all snapshots for a URL
- Retrieve snapshots within a date range
### Usage Examples

```bash
# Get the latest snapshot
./snapshot_history.sh -u gemini://example.com/

# Get a snapshot from a specific point in time
./snapshot_history.sh -u gemini://example.com/ -t 2023-05-01T12:00:00Z

# Get all snapshots for a URL
./snapshot_history.sh -u gemini://example.com/ -a

# Get snapshots in a date range
./snapshot_history.sh -u gemini://example.com/ -r 2023-01-01T00:00:00Z 2023-12-31T23:59:59Z
```
### API Usage Examples

```go
// Save a new snapshot
ctx := context.Background()
snapshot, _ := snapshot.SnapshotFromURL("gemini://example.com", true)
tx, _ := Database.NewTx(ctx)
err := Database.SaveSnapshot(ctx, tx, snapshot)
tx.Commit()

// Get the latest snapshot
ctx := context.Background()
tx, _ := Database.NewTx(ctx)
latestSnapshot, err := Database.GetLatestSnapshot(ctx, tx, "gemini://example.com")
tx.Commit()

// Get a snapshot at a specific time
ctx := context.Background()
timestamp := time.Date(2023, 5, 1, 12, 0, 0, 0, time.UTC)
tx, _ := Database.NewTx(ctx)
historicalSnapshot, err := Database.GetSnapshotAtTimestamp(ctx, tx, "gemini://example.com", timestamp)
tx.Commit()

// Get all snapshots for a URL
ctx := context.Background()
tx, _ := Database.NewTx(ctx)
allSnapshots, err := Database.GetAllSnapshotsForURL(ctx, tx, "gemini://example.com")
tx.Commit()

// Using a timeout context to limit database operations
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
tx, _ := Database.NewTx(ctx)
latestSnapshot, err := Database.GetLatestSnapshot(ctx, tx, "gemini://example.com")
tx.Commit()
```
### Content Deduplication Strategy

The crawler implements a content deduplication strategy that balances storage efficiency with comprehensive historical tracking:

#### `--skip-identical-content` Flag Behavior

**When `--skip-identical-content=true` (default)**:

- All content types are checked for duplicates before storing
- Identical content is skipped entirely to save storage space
- Only changed content results in new snapshots
- Applies to both Gemini and non-Gemini content uniformly

**When `--skip-identical-content=false`**:

- **Gemini content (`text/gemini` MIME type)**: Full historical tracking: every crawl creates a new snapshot regardless of content changes
- **Non-Gemini content**: Still deduplicated: identical content is skipped even when the flag is false
- Enables comprehensive version history for Gemini capsules while avoiding unnecessary storage of duplicate static assets

#### Implementation Details

The deduplication logic is implemented in the `shouldSkipIdenticalSnapshot()` function in `common/worker.go` (sketched after this list):

1. **Primary Check**: When `--skip-identical-content=true`, all content is checked for duplicates
2. **MIME-Type Specific Check**: When the flag is false, only non-`text/gemini` content is checked for duplicates
3. **Content Comparison**: Uses `IsContentIdentical()`, which compares either GemText fields or binary Data fields
4. **Dual Safety Checks**: Content is checked in both the worker layer and the database layer for robustness

This approach ensures that Gemini capsules get complete version history when desired, while preventing storage bloat from duplicate images, binaries, and other static content.
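A self-contained sketch of that decision table. The field names `MimeType`, `GemText`, and `Data` are assumptions drawn from the description above; the real `shouldSkipIdenticalSnapshot()` and `IsContentIdentical()` live in `common/worker.go` and may differ in detail.

```go
package main

import (
	"bytes"
	"fmt"
)

// Snapshot is a pared-down stand-in for the crawler's snapshot struct.
type Snapshot struct {
	MimeType string
	GemText  string
	Data     []byte
}

// isContentIdentical compares GemText for gemtext, raw Data otherwise.
func isContentIdentical(a, b *Snapshot) bool {
	if a.MimeType == "text/gemini" {
		return a.GemText == b.GemText
	}
	return bytes.Equal(a.Data, b.Data)
}

// shouldSkip mirrors the decision table above.
func shouldSkip(newSnap, prev *Snapshot, skipIdenticalContent bool) bool {
	if prev == nil {
		return false // first crawl of this URL: always store
	}
	if skipIdenticalContent {
		return isContentIdentical(newSnap, prev) // dedup everything
	}
	// Flag off: text/gemini always gets a new version; other MIME
	// types are still deduplicated.
	if newSnap.MimeType == "text/gemini" {
		return false
	}
	return isContentIdentical(newSnap, prev)
}

func main() {
	a := &Snapshot{MimeType: "text/gemini", GemText: "# hello"}
	b := &Snapshot{MimeType: "text/gemini", GemText: "# hello"}
	fmt.Println(shouldSkip(a, b, true))  // true: identical content, flag on
	fmt.Println(shouldSkip(a, b, false)) // false: gemtext keeps full history
}
```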
### Time-based Crawl Frequency Control

The crawler can be configured to skip re-crawling URLs that have been recently updated, using the `--skip-if-updated-days=N` parameter:

* When set to a positive integer N, URLs that have a snapshot newer than N days ago will not be added to the crawl queue, even if they're found as links in other pages.
* This feature helps control crawl frequency, ensuring that resources aren't wasted on frequently checking content that rarely changes.
* Setting `--skip-if-updated-days=0` disables this feature, meaning all discovered URLs will be queued for crawling regardless of when they were last updated.
* The default value is 60 days.
* For example, `--skip-if-updated-days=7` will skip re-crawling any URL that has been crawled within the last week (see the sketch below).
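A runnable sketch of the rule. `shouldQueue` is a hypothetical helper; in the crawler the check is actually done in SQL against the `last_crawled` column (see `fetchSnapshotsFromHistory` in `cmd/crawler/crawler.go` later in this diff), but the cutoff arithmetic is the same.

```go
package main

import (
	"fmt"
	"time"
)

// shouldQueue reports whether a URL whose latest crawl happened at
// lastCrawled should be queued again under --skip-if-updated-days=N.
func shouldQueue(lastCrawled time.Time, skipIfUpdatedDays int) bool {
	if skipIfUpdatedDays == 0 {
		return true // 0 disables the feature: always queue
	}
	cutoff := time.Now().AddDate(0, 0, -skipIfUpdatedDays)
	return lastCrawled.Before(cutoff)
}

func main() {
	tenDaysAgo := time.Now().AddDate(0, 0, -10)
	fmt.Println(shouldQueue(tenDaysAgo, 7))  // true: older than a week, recrawl
	fmt.Println(shouldQueue(tenDaysAgo, 60)) // false: crawled within 60 days
	fmt.Println(shouldQueue(tenDaysAgo, 0))  // true: feature disabled
}
```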
### Worker Pool Architecture

The crawler uses a worker pool system with backpressure control (see the sketch below):

* **Buffered Channel**: Job queue size equals the number of workers (`NumOfWorkers`)
* **Self-Regulating**: Channel backpressure naturally rate-limits the scheduler
* **Context-Aware**: Each URL gets its own context with timeout (default 120s)
* **Transaction Per Job**: Each worker operates within its own database transaction
* **SafeRollback**: Uses `gemdb.SafeRollback()` for graceful transaction cleanup on errors
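A minimal, runnable sketch of the buffered-channel backpressure pattern, mirroring the `jobs` channel and `spawnWorkers`/`WorkerWG` wiring in `cmd/crawler/crawler.go` later in this diff; `time.Sleep` stands in for the real crawl work.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	const numOfWorkers = 3

	// Buffer size equals worker count: once every worker is busy and the
	// buffer is full, sends block, which naturally throttles the scheduler.
	jobs := make(chan string, numOfWorkers)
	var wg sync.WaitGroup

	for id := 0; id < numOfWorkers; id++ {
		go func(id int) {
			for url := range jobs {
				time.Sleep(10 * time.Millisecond) // stand-in for the real crawl
				fmt.Printf("worker %d crawled %s\n", id, url)
				wg.Done()
			}
		}(id)
	}

	urls := []string{"gemini://a/", "gemini://b/", "gemini://c/", "gemini://d/"}
	wg.Add(len(urls)) // add before queuing, as the scheduler does
	for _, u := range urls {
		jobs <- u // blocks when the buffer is full: backpressure
	}
	wg.Wait()
}
```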
### Database Transaction Patterns

* **Context Separation**: The scheduler uses a long-lived context, while database operations use fresh contexts (illustrated below)
* **Timeout Prevention**: A fresh `dbCtx := context.Background()` prevents scheduler timeouts from affecting DB operations
* **Error Handling**: Distinguishes between context cancellation, fatal errors, and recoverable errors
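A runnable demonstration of why the fresh context matters. `newTx` is a stand-in for `gemdb.Database.NewTx`; it only checks whether its context is already dead, which is exactly the failure mode the pattern avoids.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// newTx stands in for gemdb.Database.NewTx: a transaction opened on an
// expired context fails immediately.
func newTx(ctx context.Context) error {
	return ctx.Err()
}

func main() {
	// Long-lived scheduler context with a timeout, as in the crawler.
	schedCtx, cancel := context.WithTimeout(context.Background(), time.Millisecond)
	defer cancel()
	time.Sleep(2 * time.Millisecond) // the scheduler context has now expired

	fmt.Println("tx on scheduler ctx:", newTx(schedCtx))            // context deadline exceeded
	fmt.Println("tx on fresh ctx:", newTx(context.Background()))    // <nil>
}
```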
### Future Improvements

1. Add a web interface to browse snapshot history
2. Implement comparison features to highlight changes between snapshots
3. Add metadata to track crawl batches
4. Implement retention policies to manage storage
COPYING (deleted, 14 lines)

@@ -1,14 +0,0 @@
-
-Copyright (c) Antanst
-
-Permission to use, copy, modify, and distribute this software for any
-purpose with or without fee is hereby granted, provided that the above
-copyright notice and this permission notice appear in all copies.
-
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
LICENSE (new file, 15 lines)

@@ -0,0 +1,15 @@

ISC License

Copyright (c) Antanst 2014-2015

Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Makefile (27 changes)

@@ -1,10 +1,10 @@
-SHELL := /usr/local/bin/oksh
+SHELL := /bin/sh
 export PATH := $(PATH)
 
-all: fmt lintfix test clean build
+all: fmt lintfix vet tidy test clean build
 
 clean:
-	rm ./main
+	mkdir -p ./dist && rm -rf ./dist/*
 
 debug:
 	@echo "PATH: $(PATH)"
@@ -16,7 +16,10 @@ debug:
 
 # Test
 test:
-	go test ./...
+	go test -race ./...
 
+tidy:
+	go mod tidy
+
 # Format code
 fmt:
@@ -27,9 +30,23 @@ fmt:
 lint: fmt
 	golangci-lint run
 
+vet: fmt
+	go vet ./.../
+
 # Run linter and fix
 lintfix: fmt
 	golangci-lint run --fix
 
 build:
-	go build ./main.go
+	CGO_ENABLED=0 go build -o ./dist/get ./cmd/get/get.go
+	CGO_ENABLED=0 go build -o ./dist/crawl ./cmd/crawl/crawl.go
+	CGO_ENABLED=0 go build -o ./dist/crawler ./cmd/crawler/crawler.go
+
+show-updates:
+	go list -m -u all
+
+update:
+	go get -u all
+
+update-patch:
+	go get -u=patch all
README.md (140 changes)

@@ -1,30 +1,128 @@
 # gemini-grc
 
-A Gemini crawler.
+A crawler for the [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) network.
+Easily extendable as a "wayback machine" of Gemini.
 
-URLs to visit as well as data from visited URLs are stored as "snapshots" in the database.
-This makes it easily extendable as a "wayback machine" of Gemini.
-
-## Done
-- [x] Concurrent downloading with workers
-- [x] Concurrent connection limit per host
-- [x] URL Blacklist
+## Features
+- [x] Concurrent downloading with configurable number of workers
 - [x] Save image/* and text/* files
-- [x] Configuration via environment variables
-- [x] Storing snapshots in PostgreSQL
-- [x] Proper response header & body UTF-8 and format validation
+- [x] Connection limit per host
+- [x] URL Blacklist
+- [x] URL Whitelist (overrides blacklist and robots.txt)
 - [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
+- [x] Configuration via command-line flags
+- [x] Storing capsule snapshots in PostgreSQL
+- [x] Proper response header & body UTF-8 and format validation
+- [x] Proper URL normalization
 - [x] Handle redirects (3X status codes)
-- [x] Better URL normalization
+- [x] Crawl Gopher holes
+
+## Security Note
+This crawler uses `InsecureSkipVerify: true` in TLS configuration to accept all certificates. This is a common approach for crawlers but makes the application vulnerable to MITM attacks. This trade-off is made to enable crawling self-signed certificates widely used in the Gemini ecosystem.
+
+## How to run
+
+```shell
+make build
+./dist/crawler --help
+```
+
+Check `misc/sql/initdb.sql` to create the PostgreSQL tables.
+
+## Configuration
+
+Available command-line flags:
+
+```text
+  -blacklist-path string
+        File that has blacklist regexes
+  -dry-run
+        Dry run mode
+  -gopher
+        Enable crawling of Gopher holes
+  -log-level string
+        Logging level (debug, info, warn, error) (default "info")
+  -max-db-connections int
+        Maximum number of database connections (default 100)
+  -max-response-size int
+        Maximum size of response in bytes (default 1048576)
+  -pgurl string
+        Postgres URL
+  -response-timeout int
+        Timeout for network responses in seconds (default 10)
+  -seed-url-path string
+        File with seed URLs that should be added to the queue immediately
+  -skip-if-updated-days int
+        Skip re-crawling URLs updated within this many days (0 to disable) (default 60)
+  -whitelist-path string
+        File with URLs that should always be crawled regardless of blacklist
+  -workers int
+        Number of concurrent workers (default 1)
+```
+
+Example:
+
+```shell
+./dist/crawler \
+  -pgurl="postgres://test:test@127.0.0.1:5434/test?sslmode=disable" \
+  -log-level=info \
+  -workers=10 \
+  -blacklist-path="./blacklist.txt" \
+  -whitelist-path="./whitelist.txt" \
+  -max-response-size=10485760 \
+  -response-timeout=10 \
+  -max-db-connections=100 \
+  -skip-if-updated-days=7 \
+  -gopher \
+  -seed-url-path="./seed_urls.txt"
+```
+
+## Development
+
+Install linters. Check the versions first.
+```shell
+go install mvdan.cc/gofumpt@v0.7.0
+go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.63.4
+```
+
+## Snapshot History
+
+The crawler now supports versioned snapshots, storing multiple snapshots of the same URL over time. This allows you to view how content changes over time, similar to the Internet Archive's Wayback Machine.
+
+### Accessing Snapshot History
+
+You can access the snapshot history using the included `snapshot_history.sh` script:
+
+```bash
+# Get the latest snapshot
+./snapshot_history.sh -u gemini://example.com/
+
+# Get a snapshot from a specific point in time
+./snapshot_history.sh -u gemini://example.com/ -t 2023-05-01T12:00:00Z
+
+# Get all snapshots for a URL
+./snapshot_history.sh -u gemini://example.com/ -a
+
+# Get snapshots in a date range
+./snapshot_history.sh -u gemini://example.com/ -r 2023-01-01T00:00:00Z 2023-12-31T23:59:59Z
+```
+
 ## TODO
-- [ ] Add snapshot hash and support snapshot history
-- [ ] Add web interface
-- [ ] Provide a TLS cert for sites that require it, like Astrobotany
+- [x] Add snapshot history
+- [ ] Add a web interface
+- [ ] Provide to servers a TLS cert for sites that require it, like Astrobotany
+- [ ] Use pledge/unveil in OpenBSD hosts
 
-## TODO with lower priority
-- [ ] Gopher
-- [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi
-- [ ] Spartan
-- [ ] Nex
-- [ ] SuperTXT https://supertxt.net/00-intro.html
+## TODO (lower priority)
+- [ ] More protocols? http://dbohdan.sdf.org/smolnet/
+
+## Notes
+Good starting points:
+
+gemini://warmedal.se/~antenna/
+
+gemini://tlgs.one/
+
+gopher://i-logout.cz:70/1/bongusta/
+
+gopher://gopher.quux.org:70/
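The README's Security Note above describes the `InsecureSkipVerify` trade-off. A minimal, self-contained sketch of a Gemini request under that TLS policy; this is illustrative only, since the crawler's actual network code is not part of this diff.

```go
package main

import (
	"bufio"
	"crypto/tls"
	"fmt"
)

func main() {
	// Accept any certificate, including the self-signed ones common in
	// Geminispace; this is the MITM trade-off the README describes.
	conf := &tls.Config{InsecureSkipVerify: true}

	conn, err := tls.Dial("tcp", "geminiprotocol.net:1965", conf)
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	// A Gemini request is the absolute URL followed by CRLF.
	fmt.Fprint(conn, "gemini://geminiprotocol.net/\r\n")

	// The first response line is "<status> <meta>".
	status, _ := bufio.NewReader(conn).ReadString('\n')
	fmt.Print(status)
}
```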
Deleted file (116 lines)

@@ -1,116 +0,0 @@

```go
package main

import (
	"fmt"
	"os"

	"gemini-grc/gemini"
	_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
	"github.com/jmoiron/sqlx"
)

// Populates the `host` field
func main() {
	db := connectToDB()
	count := 0

	for {
		tx := db.MustBegin()
		query := `
			SELECT * FROM snapshots
			ORDER BY id
			LIMIT 10000 OFFSET $1
		`
		var snapshots []gemini.Snapshot
		err := tx.Select(&snapshots, query, count)
		if err != nil {
			printErrorAndExit(tx, err)
		}
		if len(snapshots) == 0 {
			fmt.Println("Done!")
			return
		}
		for _, s := range snapshots {
			count++
			escaped := gemini.EscapeURL(s.URL.String())
			normalizedGeminiURL, err := gemini.ParseURL(escaped, "")
			if err != nil {
				fmt.Println(s.URL.String())
				fmt.Println(escaped)
				printErrorAndExit(tx, err)
			}
			normalizedURLString := normalizedGeminiURL.String()
			// If URL is already normalized, skip snapshot
			if normalizedURLString == s.URL.String() {
				// fmt.Printf("[%5d] Skipping %d %s\n", count, s.ID, s.URL.String())
				continue
			}
			// If a snapshot already exists with the normalized
			// URL, delete the current snapshot and leave the other.
			var ss []gemini.Snapshot
			err = tx.Select(&ss, "SELECT * FROM snapshots WHERE URL=$1", normalizedURLString)
			if err != nil {
				printErrorAndExit(tx, err)
			}
			if len(ss) > 0 {
				tx.MustExec("DELETE FROM snapshots WHERE id=$1", s.ID)
				fmt.Printf("%d Deleting %d %s\n", count, s.ID, s.URL.String())
				//err = tx.Commit()
				//if err != nil {
				//	printErrorAndExit(tx, err)
				//}
				//return
				continue
			}
			// fmt.Printf("%s =>\n%s\n", s.URL.String(), normalizedURLString)
			// At this point we just update the snapshot,
			// and the normalized URL will be saved.
			fmt.Printf("%d Updating %d %s => %s\n", count, s.ID, s.URL.String(), normalizedURLString)
			// Saves the snapshot with the normalized URL
			tx.MustExec("DELETE FROM snapshots WHERE id=$1", s.ID)
			s.URL = *normalizedGeminiURL
			err = gemini.UpsertSnapshot(0, tx, &s)
			if err != nil {
				printErrorAndExit(tx, err)
			}
			//err = tx.Commit()
			//if err != nil {
			//	printErrorAndExit(tx, err)
			//}
			//return
		}
		err = tx.Commit()
		if err != nil {
			printErrorAndExit(tx, err)
		}
	}
}

func printErrorAndExit(tx *sqlx.Tx, err error) {
	_ = tx.Rollback()
	panic(err)
}

func connectToDB() *sqlx.DB {
	connStr := fmt.Sprintf("postgres://%s:%s@%s:%s/%s",
		os.Getenv("PG_USER"),
		os.Getenv("PG_PASSWORD"),
		os.Getenv("PG_HOST"),
		os.Getenv("PG_PORT"),
		os.Getenv("PG_DATABASE"),
	)

	// Create a connection pool
	db, err := sqlx.Open("pgx", connStr)
	if err != nil {
		panic(fmt.Sprintf("Unable to connect to database with URL %s: %v\n", connStr, err))
	}
	db.SetMaxOpenConns(20)
	err = db.Ping()
	if err != nil {
		panic(fmt.Sprintf("Unable to ping database: %v\n", err))
	}

	fmt.Println("Connected to database")
	return db
}
```
cmd/crawler/crawler.go (new file, 400 lines)

@@ -0,0 +1,400 @@

```go
package main

import (
	"context"
	"os"
	"os/signal"
	"strings"
	"syscall"
	"time"

	"gemini-grc/common"
	"gemini-grc/common/blackList"
	"gemini-grc/common/contextlog"
	"gemini-grc/common/seedList"
	"gemini-grc/common/whiteList"
	"gemini-grc/config"
	"gemini-grc/contextutil"
	gemdb "gemini-grc/db"
	"gemini-grc/robotsMatch"
	"gemini-grc/util"
	"git.antanst.com/antanst/logging"
	"github.com/jmoiron/sqlx"
)

var jobs chan string

func main() {
	var err error

	err = initializeApp()
	if err != nil {
		handleUnexpectedError(err)
	}

	err = runApp()
	if err != nil {
		handleUnexpectedError(err)
	}

	err = shutdownApp()
	if err != nil {
		handleUnexpectedError(err)
	}
}

func handleUnexpectedError(err error) {
	logging.LogError("Unexpected error: %v", err)
	_ = shutdownApp()
	os.Exit(1)
}

func initializeApp() error {
	config.CONFIG = *config.Initialize()
	logging.InitSlogger(config.CONFIG.LogLevel)

	logging.LogInfo("Starting up. Press Ctrl+C to exit")
	common.SignalsChan = make(chan os.Signal, 1)
	signal.Notify(common.SignalsChan, syscall.SIGINT, syscall.SIGTERM)
	common.FatalErrorsChan = make(chan error)
	jobs = make(chan string, config.CONFIG.NumOfWorkers)

	var err error

	err = blackList.Initialize()
	if err != nil {
		return err
	}

	err = whiteList.Initialize()
	if err != nil {
		return err
	}

	err = seedList.Initialize()
	if err != nil {
		return err
	}

	err = robotsMatch.Initialize()
	if err != nil {
		return err
	}

	ctx := context.Background()
	err = gemdb.Database.Initialize(ctx)
	if err != nil {
		return err
	}

	if config.CONFIG.SeedUrlPath != "" {
		err := AddURLsFromFile(ctx, config.CONFIG.SeedUrlPath)
		if err != nil {
			return err
		}
	}

	return nil
}

func shutdownApp() error {
	var err error

	err = blackList.Shutdown()
	if err != nil {
		return err
	}

	err = whiteList.Shutdown()
	if err != nil {
		return err
	}

	err = seedList.Shutdown()
	if err != nil {
		return err
	}

	err = robotsMatch.Shutdown()
	if err != nil {
		return err
	}

	ctx := context.Background()
	err = gemdb.Database.Shutdown(ctx)
	if err != nil {
		return err
	}

	return nil
}

func runApp() (err error) {
	go spawnWorkers(config.CONFIG.NumOfWorkers)
	go runJobScheduler()
	for {
		select {
		case <-common.SignalsChan:
			logging.LogWarn("Received SIGINT or SIGTERM signal, exiting")
			return nil
		case err := <-common.FatalErrorsChan:
			return err
		}
	}
}

func spawnWorkers(total int) {
	for id := 0; id < total; id++ {
		go func(a int) {
			for {
				job := <-jobs
				common.RunWorkerWithTx(a, job)
			}
		}(id)
	}
}

// Current Logic Flow:
//
// 1. Create transaction
// 2. Get distinct hosts
// 3. If no hosts → fetch snapshots from history (adds URLs to queue)
// 4. Re-query for hosts (should now have some)
// 5. Get URLs from hosts
// 6. Commit transaction
// 7. Queue URLs for workers
func runJobScheduler() {
	var tx *sqlx.Tx
	var err error

	ctx := contextutil.ContextWithComponent(context.Background(), "crawler")
	tx, err = gemdb.Database.NewTx(ctx)
	if err != nil {
		common.FatalErrorsChan <- err
		return
	}

	defer func(tx *sqlx.Tx) {
		if tx != nil {
			if err := gemdb.SafeRollback(ctx, tx); err != nil {
				common.FatalErrorsChan <- err
			}
		}
	}(tx)

	// First, check if the URLs table is empty.
	var urlCount int

	if config.CONFIG.GopherEnable {
		err = tx.Get(&urlCount, "SELECT COUNT(*) FROM urls")
	} else {
		err = tx.Get(&urlCount, "SELECT COUNT(*) FROM urls WHERE url LIKE 'gemini://%'")
	}
	if err != nil {
		common.FatalErrorsChan <- err
		return
	}

	err = tx.Commit()
	if err != nil {
		common.FatalErrorsChan <- err
		return
	}

	// If no pending URLs, add the ones from the standard crawl set.
	tx, err = gemdb.Database.NewTx(ctx)
	if err != nil {
		common.FatalErrorsChan <- err
		return
	}

	if urlCount == 0 {
		logging.LogInfo("URLs table is empty, enqueueing standard crawl set")
		err = enqueueSeedURLs(ctx, tx)
		if err != nil {
			common.FatalErrorsChan <- err
			return
		}
		// Commit this tx here so the loop below sees the changes.
		err := tx.Commit()
		if err != nil {
			common.FatalErrorsChan <- err
			return
		}
	} else {
		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Found %d pending URLs to crawl.", urlCount)
	}

	// Main job loop.
	// We get URLs from the pending URLs table,
	// add crawling jobs for those,
	// and sleep a bit after each run.
	for {
		contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Polling DB for jobs")

		// Use fresh context for DB operations to avoid timeouts/cancellation
		// from the long-lived scheduler context affecting database transactions
		dbCtx := context.Background()
		tx, err = gemdb.Database.NewTx(dbCtx)
		if err != nil {
			common.FatalErrorsChan <- err
			return
		}

		// Get all distinct hosts from pending URLs
		distinctHosts, err := gemdb.Database.GetUrlHosts(dbCtx, tx)
		if err != nil {
			common.FatalErrorsChan <- err
			return
		}

		// When out of pending URLs, add some random ones.
		if len(distinctHosts) == 0 {
			// Queue random old URLs from history.
			count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers, config.CONFIG.SkipIfUpdatedDays)
			if err != nil {
				common.FatalErrorsChan <- err
				return
			}
			if count == 0 {
				contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
				time.Sleep(120 * time.Second)
				continue
			}
			distinctHosts, err = gemdb.Database.GetUrlHosts(dbCtx, tx)
			if err != nil {
				common.FatalErrorsChan <- err
				return
			}
		}

		// Get some URLs from each host, up to a limit
		urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, config.CONFIG.NumOfWorkers, tx)
		if err != nil {
			common.FatalErrorsChan <- err
			return
		}

		err = tx.Commit()
		if err != nil {
			common.FatalErrorsChan <- err
			return
		}

		if len(urls) == 0 {
			contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
			time.Sleep(120 * time.Second)
			continue
		}

		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%d urls to crawl", len(urls))

		// Add jobs to WaitGroup before queuing
		common.WorkerWG.Add(len(urls))

		for _, url := range urls {
			jobs <- url
		}

		// Wait for all workers to complete their jobs
		common.WorkerWG.Wait()

		contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "All workers done. New scheduler run starts")
		logging.LogInfo("")
		logging.LogInfo("")
	}
}

func enqueueSeedURLs(ctx context.Context, tx *sqlx.Tx) error {
	// Get seed URLs from seedList module
	//urls := seedList.GetSeedURLs()
	//
	//for _, url := range urls {
	//	err := gemdb.Database.InsertURL(ctx, tx, url)
	//	if err != nil {
	//		return err
	//	}
	//}
	return nil
}

func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age int) (int, error) {
	// Select <num> snapshots from snapshots table for recrawling.
	// Find URLs where the LATEST crawl attempt (via last_crawled) is at least <age> days old.
	// Uses last_crawled timestamp to track actual crawl attempts regardless of content changes.
	historyCtx := contextutil.ContextWithComponent(context.Background(), "fetchSnapshotsFromHistory")
	contextlog.LogDebugWithContext(historyCtx, logging.GetSlogger(), "Looking for %d URLs whose latest crawl attempt is at least %d days old to recrawl", num, age)

	// Calculate the cutoff date
	cutoffDate := time.Now().AddDate(0, 0, -age)

	// Use the query from db_queries.go to find URLs that need re-crawling

	type SnapshotURL struct {
		URL  string `db:"url"`
		Host string `db:"host"`
	}

	// Execute the query
	var snapshotURLs []SnapshotURL
	err := tx.Select(&snapshotURLs, gemdb.SQL_FETCH_SNAPSHOTS_FROM_HISTORY, cutoffDate, num)
	if err != nil {
		return 0, err
	}

	if len(snapshotURLs) == 0 {
		return 0, nil
	}

	// For each selected snapshot, add the URL to the urls table
	insertCount := 0
	for _, snapshot := range snapshotURLs {
		err := gemdb.Database.InsertURL(ctx, tx, snapshot.URL)
		if err != nil {
			logging.LogError("Error inserting URL %s from old snapshot to queue: %v", snapshot.URL, err)
			return 0, err
		}
		insertCount++
	}

	// Note: The transaction is committed by the caller (runJobScheduler),
	// not here. This function is called as part of a larger transaction.
	if insertCount > 0 {
		contextlog.LogInfoWithContext(historyCtx, logging.GetSlogger(), "Added %d old URLs to recrawl queue", insertCount)
	}

	return insertCount, nil
}

func AddURLsFromFile(ctx context.Context, filepath string) error {
	data, err := os.ReadFile(filepath)
	if err != nil {
		return err
	}
	lines := strings.Split(string(data), "\n")
	urls := util.Filter(lines, func(url string) bool {
		return strings.TrimSpace(url) != ""
	})

	// Create a context for database operations
	tx, err := gemdb.Database.NewTx(ctx)
	if err != nil {
		return err
	}

	// Insert all the URLs
	for _, url := range urls {
		fileCtx := contextutil.ContextWithComponent(context.Background(), "AddURLsFromFile")
		contextlog.LogInfoWithContext(fileCtx, logging.GetSlogger(), "Adding %s to queue", url)
		err := gemdb.Database.InsertURL(ctx, tx, url)
		if err != nil {
			return err
		}
	}

	err = tx.Commit()
	if err != nil {
		return err
	}
	return nil
}
```
common/blackList/blacklist.go (new file, 73 lines)

@@ -0,0 +1,73 @@

```go
package blackList

import (
	"fmt"
	"os"
	"regexp"
	"strings"

	"gemini-grc/config"
	"git.antanst.com/antanst/logging"
	"git.antanst.com/antanst/xerrors"
)

var blacklist []regexp.Regexp //nolint:gochecknoglobals

func Initialize() error {
	var err error

	// Initialize blacklist
	if config.CONFIG.BlacklistPath != "" {
		if err = loadBlacklist(config.CONFIG.BlacklistPath); err != nil {
			return err
		}
	}

	return nil
}

func loadBlacklist(filePath string) error {
	if blacklist != nil {
		return nil
	}

	data, err := os.ReadFile(filePath)
	if err != nil {
		blacklist = []regexp.Regexp{}
		return xerrors.NewError(fmt.Errorf("could not load blacklist file: %w", err), 0, "", true)
	}

	lines := strings.Split(string(data), "\n")
	blacklist = []regexp.Regexp{}

	for _, line := range lines {
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		regex, err := regexp.Compile(line)
		if err != nil {
			return xerrors.NewError(fmt.Errorf("could not compile blacklist line %s: %w", line, err), 0, "", true)
		}
		blacklist = append(blacklist, *regex)
	}

	if len(blacklist) > 0 {
		logging.LogInfo("Loaded %d blacklist entries", len(blacklist))
	}

	return nil
}

func Shutdown() error {
	return nil
}

// IsBlacklisted checks if the URL matches any blacklist pattern
func IsBlacklisted(u string) bool {
	for _, v := range blacklist {
		if v.MatchString(u) {
			return true
		}
	}
	return false
}
```
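A hypothetical caller's view of the module above: load the patterns once at startup (as `initializeApp` in `cmd/crawler/crawler.go` does) and then consult `IsBlacklisted` per candidate URL. Setting `config.CONFIG.BlacklistPath` directly, as done here, is an assumption borrowed from the tests below; the real crawler fills it via `config.Initialize()` from command-line flags.

```go
package main

import (
	"fmt"

	"gemini-grc/common/blackList"
	"gemini-grc/config"
)

func main() {
	// Point the module at a pattern file, then compile the patterns once.
	config.CONFIG.BlacklistPath = "./blacklist.txt"
	if err := blackList.Initialize(); err != nil {
		panic(err)
	}

	// Consult the compiled patterns for every candidate URL.
	for _, u := range []string{
		"gemini://git.example.com/",
		"gemini://example.com/blog/",
	} {
		fmt.Println(u, "blacklisted:", blackList.IsBlacklisted(u))
	}
}
```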
529
common/blackList/blacklist_test.go
Normal file
529
common/blackList/blacklist_test.go
Normal file
@@ -0,0 +1,529 @@
|
|||||||
|
package blackList
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"gemini-grc/config"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestIsBlacklisted(t *testing.T) {
|
||||||
|
// Save original blacklist and whitelist to restore after test
|
||||||
|
originalBlacklist := blacklist
|
||||||
|
defer func() {
|
||||||
|
blacklist = originalBlacklist
|
||||||
|
}()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
setup func()
|
||||||
|
url string
|
||||||
|
expected bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "empty blacklist",
|
||||||
|
setup: func() {
|
||||||
|
blacklist = []regexp.Regexp{}
|
||||||
|
},
|
||||||
|
url: "https://example.com",
|
||||||
|
expected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "exact hostname match",
|
||||||
|
setup: func() {
|
||||||
|
regex, _ := regexp.Compile(`example\.com`)
|
||||||
|
blacklist = []regexp.Regexp{*regex}
|
||||||
|
},
|
||||||
|
url: "example.com",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "hostname in URL match",
|
||||||
|
setup: func() {
|
||||||
|
regex, _ := regexp.Compile(`example\.com`)
|
||||||
|
blacklist = []regexp.Regexp{*regex}
|
||||||
|
},
|
||||||
|
url: "https://example.com/path",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "partial hostname match",
|
||||||
|
setup: func() {
|
||||||
|
regex, _ := regexp.Compile(`example\.com`)
|
||||||
|
blacklist = []regexp.Regexp{*regex}
|
||||||
|
},
|
||||||
|
url: "https://safe-example.com",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "full URL match",
|
||||||
|
setup: func() {
|
||||||
|
regex, _ := regexp.Compile(`https://example\.com/bad-path`)
|
||||||
|
blacklist = []regexp.Regexp{*regex}
|
||||||
|
},
|
||||||
|
url: "https://example.com/bad-path",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "path match",
|
||||||
|
setup: func() {
|
||||||
|
regex, _ := regexp.Compile("/malicious-path")
|
||||||
|
blacklist = []regexp.Regexp{*regex}
|
||||||
|
},
|
||||||
|
url: "https://example.com/malicious-path",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "subdomain match with word boundary",
|
||||||
|
setup: func() {
|
||||||
|
regex, _ := regexp.Compile(`bad\.example\.com`)
|
||||||
|
blacklist = []regexp.Regexp{*regex}
|
||||||
|
},
|
||||||
|
url: "https://bad.example.com/path",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "multiple patterns, one match",
|
||||||
|
setup: func() {
|
||||||
|
regex1, _ := regexp.Compile(`badsite\.com`)
|
||||||
|
regex2, _ := regexp.Compile(`malicious\.org`)
|
||||||
|
regex3, _ := regexp.Compile(`example\.com/sensitive`)
|
||||||
|
blacklist = []regexp.Regexp{*regex1, *regex2, *regex3}
|
||||||
|
},
|
||||||
|
url: "https://example.com/sensitive/data",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "multiple patterns, no match",
|
||||||
|
setup: func() {
|
||||||
|
regex1, _ := regexp.Compile(`badsite\.com`)
|
||||||
|
regex2, _ := regexp.Compile(`malicious\.org`)
|
||||||
|
regex3, _ := regexp.Compile(`example\.com/sensitive`)
|
||||||
|
blacklist = []regexp.Regexp{*regex1, *regex2, *regex3}
|
||||||
|
},
|
||||||
|
url: "https://example.com/safe/data",
|
||||||
|
expected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "pattern with wildcard",
|
||||||
|
setup: func() {
|
||||||
|
regex, _ := regexp.Compile(`.*\.evil\.com`)
|
||||||
|
blacklist = []regexp.Regexp{*regex}
|
||||||
|
},
|
||||||
|
url: "https://subdomain.evil.com/path",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "pattern with special characters",
|
||||||
|
setup: func() {
|
||||||
|
regex, _ := regexp.Compile(`example\.com/path\?id=[0-9]+`)
|
||||||
|
blacklist = []regexp.Regexp{*regex}
|
||||||
|
},
|
||||||
|
url: "https://example.com/path?id=12345",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "unicode character support",
|
||||||
|
setup: func() {
|
||||||
|
regex, _ := regexp.Compile(`example\.com/[\p{L}]+`)
|
||||||
|
blacklist = []regexp.Regexp{*regex}
|
||||||
|
},
|
||||||
|
url: "https://example.com/café",
|
||||||
|
expected: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
tt.setup()
|
||||||
|
result := IsBlacklisted(tt.url)
|
||||||
|
if result != tt.expected {
|
||||||
|
t.Errorf("IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBlacklistLoading tests that the blacklist loading logic works with a mock blacklist file
|
||||||
|
func TestBlacklistLoading(t *testing.T) {
|
||||||
|
// Save original blacklist and config
|
||||||
|
originalBlacklist := blacklist
|
||||||
|
originalConfigPath := config.CONFIG.BlacklistPath
|
||||||
|
defer func() {
|
||||||
|
blacklist = originalBlacklist
|
||||||
|
config.CONFIG.BlacklistPath = originalConfigPath
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Create a temporary blacklist file with known patterns
|
||||||
|
tmpFile, err := os.CreateTemp("", "mock-blacklist-*.txt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to create temporary file: %v", err)
|
||||||
|
}
|
||||||
|
defer os.Remove(tmpFile.Name())
|
||||||
|
|
||||||
|
// Write some test patterns to the mock blacklist file
|
||||||
|
mockBlacklistContent := `# Mock blacklist file for testing
|
||||||
|
/git/
|
||||||
|
/.git/
|
||||||
|
/cgit/
|
||||||
|
gemini://git\..*$
|
||||||
|
gemini://.*/git/.*
|
||||||
|
gopher://.*/git/.*
|
||||||
|
.*/(commit|blob|tree)/.*
|
||||||
|
.*/[0-9a-f]{7,40}$
|
||||||
|
`
|
||||||
|
if err := os.WriteFile(tmpFile.Name(), []byte(mockBlacklistContent), 0o644); err != nil {
|
||||||
|
t.Fatalf("Failed to write to temporary file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Configure and load the mock blacklist
|
||||||
|
blacklist = nil
|
||||||
|
config.CONFIG.BlacklistPath = tmpFile.Name()
|
||||||
|
err = Initialize()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to load mock blacklist: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count the number of non-comment, non-empty lines to verify loading
|
||||||
|
lineCount := 0
|
||||||
|
for _, line := range strings.Split(mockBlacklistContent, "\n") {
|
||||||
|
if line != "" && !strings.HasPrefix(line, "#") {
|
||||||
|
lineCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(blacklist) != lineCount {
|
||||||
|
t.Errorf("Expected %d patterns to be loaded, got %d", lineCount, len(blacklist))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify some sample URLs against our known patterns
|
||||||
|
testURLs := []struct {
|
||||||
|
url string
|
||||||
|
expected bool
|
||||||
|
desc string
|
||||||
|
}{
|
||||||
|
{"gemini://example.com/git/repo", true, "git repository"},
|
||||||
|
{"gemini://git.example.com", true, "git subdomain"},
|
||||||
|
{"gemini://example.com/cgit/repo", true, "cgit repository"},
|
||||||
|
{"gemini://example.com/repo/commit/abc123", true, "git commit"},
|
||||||
|
{"gemini://example.com/123abc7", true, "commit hash at path end"},
|
||||||
|
{"gopher://example.com/1/git/repo", true, "gopher git repository"},
|
||||||
|
{"gemini://example.com/normal/page.gmi", false, "normal gemini page"},
|
||||||
|
{"gemini://example.com/project/123abc", false, "hash not at path end"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range testURLs {
|
||||||
|
result := IsBlacklisted(tt.url)
|
||||||
|
if result != tt.expected {
|
||||||
|
t.Errorf("With mock blacklist, IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadBlacklist(t *testing.T) {
|
||||||
|
// Save original blacklist to restore after test
|
||||||
|
originalBlacklist := blacklist
|
||||||
|
originalConfigPath := config.CONFIG.BlacklistPath
|
||||||
|
defer func() {
|
||||||
|
blacklist = originalBlacklist
|
||||||
|
config.CONFIG.BlacklistPath = originalConfigPath
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Create a temporary blacklist file for testing
|
||||||
|
tmpFile, err := os.CreateTemp("", "blacklist-*.txt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to create temporary file: %v", err)
|
||||||
|
}
|
||||||
|
defer os.Remove(tmpFile.Name())
|
||||||
|
|
||||||
|
// Test cases for Initialize
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
blacklistLines []string
|
||||||
|
configPath string
|
||||||
|
wantErr bool
|
||||||
|
expectedLen int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "empty path",
|
||||||
|
blacklistLines: []string{},
|
||||||
|
configPath: "",
|
||||||
|
wantErr: false,
|
||||||
|
expectedLen: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "valid blacklist with comments",
|
||||||
|
blacklistLines: []string{"example\\.com", "# This is a comment", "malicious\\.org"},
|
||||||
|
configPath: tmpFile.Name(),
|
||||||
|
wantErr: false,
|
||||||
|
expectedLen: 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "invalid regex",
|
||||||
|
blacklistLines: []string{"example\\.com", "[invalid regex"},
|
||||||
|
configPath: tmpFile.Name(),
|
||||||
|
wantErr: true,
|
||||||
|
expectedLen: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "nonexistent file",
|
||||||
|
blacklistLines: []string{},
|
||||||
|
configPath: "nonexistent-file.txt",
|
||||||
|
wantErr: true,
|
||||||
|
expectedLen: 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
// Reset blacklist
|
||||||
|
blacklist = nil
|
||||||
|
|
||||||
|
// Set config path
|
||||||
|
config.CONFIG.BlacklistPath = tt.configPath
|
||||||
|
|
||||||
|
// Write test data to file if needed
|
||||||
|
if tt.configPath == tmpFile.Name() {
|
||||||
|
content := ""
|
||||||
|
for _, line := range tt.blacklistLines {
|
||||||
|
content += line + "\n"
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(tmpFile.Name(), []byte(content), 0o644); err != nil {
|
||||||
|
t.Fatalf("Failed to write to temporary file: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call the function
|
||||||
|
err := Initialize()
|
||||||
|
|
||||||
|
// Check results
|
||||||
|
if (err != nil) != tt.wantErr {
|
||||||
|
t.Errorf("Initialize() error = %v, wantErr %v", err, tt.wantErr)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if !tt.wantErr && len(blacklist) != tt.expectedLen {
|
||||||
|
t.Errorf("Initialize() loaded %d entries, want %d", len(blacklist), tt.expectedLen)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGitPatterns tests the blacklist patterns specifically for Git repositories
|
||||||
|
func TestGitPatterns(t *testing.T) {
|
||||||
|
// Save original blacklist to restore after test
|
||||||
|
originalBlacklist := blacklist
|
||||||
|
defer func() {
|
||||||
|
blacklist = originalBlacklist
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Create patterns similar to those in the blacklist.txt file
|
||||||
|
patterns := []string{
|
||||||
|
"/git/",
|
||||||
|
"/.git/",
|
||||||
|
"/cgit/",
|
||||||
|
"/gitweb/",
|
||||||
|
"/gitea/",
|
||||||
|
"/scm/",
|
||||||
|
".*/(commit|blob|tree|tag|diff|blame|log|raw)/.*",
|
||||||
|
".*/(commits|objects|refs|branches|tags)/.*",
|
||||||
|
".*/[0-9a-f]{7,40}$",
|
||||||
|
"gemini://git\\..*$",
|
||||||
|
"gemini://.*/git/.*",
|
||||||
|
"gemini://.*\\.git/.*",
|
||||||
|
"gopher://.*/git/.*",
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compile and set up the patterns
|
||||||
|
blacklist = []regexp.Regexp{}
|
||||||
|
for _, pattern := range patterns {
|
||||||
|
regex, err := regexp.Compile(pattern)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to compile pattern %q: %v", pattern, err)
|
||||||
|
}
|
||||||
|
blacklist = append(blacklist, *regex)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test URLs against git-related patterns
|
||||||
|
tests := []struct {
|
||||||
|
url string
|
||||||
|
expected bool
|
||||||
|
desc string
|
||||||
|
}{
|
||||||
|
// Git paths
|
||||||
|
{"gemini://example.com/git/", true, "basic git path"},
|
||||||
|
{"gemini://example.com/.git/", true, "hidden git path"},
|
||||||
|
{"gemini://example.com/cgit/", true, "cgit path"},
|
||||||
|
{"gemini://example.com/gitweb/", true, "gitweb path"},
|
||||||
|
{"gemini://example.com/gitea/", true, "gitea path"},
|
||||||
|
{"gemini://example.com/scm/", true, "scm path"},
|
||||||
|
|
||||||
|
// Git operations
|
||||||
|
{"gemini://example.com/repo/commit/abc123", true, "commit path"},
|
||||||
|
{"gemini://example.com/repo/blob/main/README.md", true, "blob path"},
|
||||||
|
{"gemini://example.com/repo/tree/master", true, "tree path"},
|
||||||
|
{"gemini://example.com/repo/tag/v1.0", true, "tag path"},
|
||||||
|
|
||||||
|
// Git internals
|
||||||
|
{"gemini://example.com/repo/commits/", true, "commits path"},
|
||||||
|
{"gemini://example.com/repo/objects/", true, "objects path"},
|
||||||
|
{"gemini://example.com/repo/refs/heads/main", true, "refs path"},
|
||||||
|
|
||||||
|
		// Git hashes
		{"gemini://example.com/commit/a1b2c3d", true, "short hash"},
		{"gemini://example.com/commit/a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0", true, "long hash"},

		// Git domains
		{"gemini://git.example.com/", true, "git subdomain"},
		{"gemini://example.com/git/repo", true, "git directory"},
		{"gemini://example.com/project.git/", true, "git extension"},

		// Gopher protocol
		{"gopher://example.com/1/git/repo", true, "gopher git path"},

		// Non-matching URLs
		{"gemini://example.com/project/", false, "regular project path"},
		{"gemini://example.com/blog/", false, "blog path"},
		{"gemini://example.com/git-guide.gmi", false, "hyphenated word with git"},
		{"gemini://example.com/digital/", false, "word containing 'git'"},
		{"gemini://example.com/ab12cd3", true, "short hex string matches commit hash pattern"},
		{"gemini://example.com/ab12cdz", false, "alphanumeric string with non-hex chars won't match commit hash"},
	}

	for _, tt := range tests {
		t.Run(tt.desc, func(t *testing.T) {
			result := IsBlacklisted(tt.url)
			if result != tt.expected {
				t.Errorf("IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
			}
		})
	}
}

// TestGeminiGopherPatterns tests the blacklist patterns specific to Gemini and Gopher protocols
func TestGeminiGopherPatterns(t *testing.T) {
	// Save original blacklist to restore after test
	originalBlacklist := blacklist
	defer func() {
		blacklist = originalBlacklist
	}()

	// Create patterns for Gemini and Gopher
	patterns := []string{
		"gemini://badhost\\.com",
		"gemini://.*/cgi-bin/",
		"gemini://.*/private/",
		"gemini://.*\\.evil\\..*",
		"gopher://badhost\\.org",
		"gopher://.*/I/onlyfans/",
		"gopher://.*/[0-9]/(cgi|bin)/",
	}

	// Compile and set up the patterns
	blacklist = []regexp.Regexp{}
	for _, pattern := range patterns {
		regex, err := regexp.Compile(pattern)
		if err != nil {
			t.Fatalf("Failed to compile pattern %q: %v", pattern, err)
		}
		blacklist = append(blacklist, *regex)
	}

	// Test URLs against Gemini and Gopher patterns
	tests := []struct {
		url      string
		expected bool
		desc     string
	}{
		// Gemini URLs
		{"gemini://badhost.com/", true, "blacklisted gemini host"},
		{"gemini://badhost.com/page.gmi", true, "blacklisted gemini host with path"},
		{"gemini://example.com/cgi-bin/script.cgi", true, "gemini cgi-bin path"},
		{"gemini://example.com/private/docs", true, "gemini private path"},
		{"gemini://subdomain.evil.org", true, "gemini evil domain pattern"},
		{"gemini://example.com/public/docs", false, "safe gemini path"},
		{"gemini://goodhost.com/", false, "safe gemini host"},

		// Gopher URLs
		{"gopher://badhost.org/1/menu", true, "blacklisted gopher host"},
		{"gopher://example.org/I/onlyfans/image", true, "gopher onlyfans path"},
		{"gopher://example.org/1/cgi/script", true, "gopher cgi path"},
		{"gopher://example.org/1/bin/executable", true, "gopher bin path"},
		{"gopher://example.org/0/text", false, "safe gopher text"},
		{"gopher://goodhost.org/1/menu", false, "safe gopher host"},

		// Protocol distinction
		{"https://badhost.com/", false, "blacklisted host but wrong protocol"},
		{"http://example.com/cgi-bin/script.cgi", false, "bad path but wrong protocol"},
	}

	for _, tt := range tests {
		t.Run(tt.desc, func(t *testing.T) {
			result := IsBlacklisted(tt.url)
			if result != tt.expected {
				t.Errorf("IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
			}
		})
	}
}

func TestIsBlacklistedIntegration(t *testing.T) {
	// Save original blacklist to restore after test
	originalBlacklist := blacklist
	originalBlacklistPath := config.CONFIG.BlacklistPath
	defer func() {
		blacklist = originalBlacklist
		config.CONFIG.BlacklistPath = originalBlacklistPath
	}()

	// Create a temporary blacklist file for testing
	tmpFile, err := os.CreateTemp("", "blacklist-*.txt")
	if err != nil {
		t.Fatalf("Failed to create temporary file: %v", err)
	}
	defer os.Remove(tmpFile.Name())

	// Write test patterns to the blacklist file
	blacklistContent := `# Test blacklist file
example\.com
malicious\.org
/phishing
.*\.evil\.com
\w+@spam\.com
`
	if err := os.WriteFile(tmpFile.Name(), []byte(blacklistContent), 0o644); err != nil {
		t.Fatalf("Failed to write to temporary file: %v", err)
	}

	// Set up the test
	blacklist = nil
	config.CONFIG.BlacklistPath = tmpFile.Name()

	// Load the blacklist
	if err := Initialize(); err != nil {
		t.Fatalf("Initialize() failed: %v", err)
	}

	// Test URLs against the loaded blacklist
	tests := []struct {
		url      string
		expected bool
	}{
		{"https://example.com", true},
		{"https://safe-site.com", false},
		{"https://malicious.org/path", true},
		{"https://example.org/phishing", true},
		{"https://subdomain.evil.com", true},
		{"https://safe-site.com/safe-path", false},
		{"mailto:user@spam.com", true},
	}

	for _, tt := range tests {
		result := IsBlacklisted(tt.url)
		if result != tt.expected {
			t.Errorf("IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
		}
	}
}
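The integration test above also documents the blacklist file format: one Go regular expression per line, `#` starting a comment, each pattern matched against the full URL string. A minimal sketch of such a file, with placeholder hosts:

```
# Hosts and paths the crawler must never fetch
badhost\.example
/cgi-bin/
.*\.evil\.example
```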
112 common/contextlog/contextlog.go Normal file
@@ -0,0 +1,112 @@
package contextlog

import (
	"context"
	"fmt"
	"log/slog"

	"gemini-grc/contextutil"
)

// SlogEventWithContext adds context information as structured fields to the log event.
func SlogEventWithContext(ctx context.Context, logger *slog.Logger) *slog.Logger {
	// Start with the provided logger
	if logger == nil {
		// If logger isn't initialized, use the default logger
		return slog.Default()
	}

	// Get context values - will be added directly to log records
	host := contextutil.GetHostFromContext(ctx)
	requestID := contextutil.GetRequestIDFromContext(ctx)
	component := contextutil.GetComponentFromContext(ctx)
	workerID := contextutil.GetWorkerIDFromContext(ctx)
	url := contextutil.GetURLFromContext(ctx)

	// Add all context fields to the logger
	if host != "" {
		logger = logger.With("host", host)
	}

	if requestID != "" {
		logger = logger.With("request_id", requestID)
	}

	if workerID >= 0 {
		logger = logger.With("worker_id", workerID)
	}

	if component != "" {
		logger = logger.With("component", component)
	}

	if url != "" {
		logger = logger.With("url", url)
	}

	return logger
}

// LogDebugWithContext logs a debug message with context information.
func LogDebugWithContext(ctx context.Context, logger *slog.Logger, format string, args ...interface{}) {
	if logger == nil {
		return
	}

	// Create logger with context fields
	contextLogger := SlogEventWithContext(ctx, logger)

	// Format the message
	message := fmt.Sprintf(format, args...)

	// Log with context data in the record attributes
	contextLogger.Debug(message)
}

// LogInfoWithContext logs an info message with context information.
func LogInfoWithContext(ctx context.Context, logger *slog.Logger, format string, args ...interface{}) {
	if logger == nil {
		return
	}

	// Create logger with context fields
	contextLogger := SlogEventWithContext(ctx, logger)

	// Format the message
	message := fmt.Sprintf(format, args...)

	// Log with context data in the record attributes
	contextLogger.Info(message)
}

// LogWarnWithContext logs a warning message with context information.
func LogWarnWithContext(ctx context.Context, logger *slog.Logger, format string, args ...interface{}) {
	if logger == nil {
		return
	}

	// Create logger with context fields
	contextLogger := SlogEventWithContext(ctx, logger)

	// Format the message
	message := fmt.Sprintf(format, args...)

	// Log with context data in the record attributes
	contextLogger.Warn(message)
}

// LogErrorWithContext logs an error message with context information.
func LogErrorWithContext(ctx context.Context, logger *slog.Logger, format string, args ...interface{}) {
	if logger == nil {
		return
	}

	// Create logger with context fields
	contextLogger := SlogEventWithContext(ctx, logger)

	// Format the message
	msg := fmt.Sprintf(format, args...)

	// Log with context data in the record attributes
	contextLogger.Error(msg, slog.String("error", msg))
}
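A minimal caller-side sketch of the helpers above. The context fields are read through the repo's `contextutil` getters; how they are set is not shown in this diff, so the bare `context.Background()` here is an assumption and would normally be enriched by the crawler before logging:

```go
package main

import (
	"context"
	"log/slog"
	"os"

	"gemini-grc/common/contextlog"
)

func main() {
	logger := slog.New(slog.NewTextHandler(os.Stderr, nil))
	// In the crawler, ctx would carry host/request_id/worker_id values
	// set via the contextutil package (setters not shown in this diff).
	ctx := context.Background()
	contextlog.LogInfoWithContext(ctx, logger, "fetched %d bytes from %s", 1024, "gemini://example.com/")
}
```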
100 common/errors.go
@@ -1,100 +0,0 @@
package common

import (
	"errors"
	"fmt"
)

type GeminiError struct {
	Msg    string
	Code   int
	Header string
}

func (e *GeminiError) Error() string {
	return fmt.Sprintf("%s: %s", e.Msg, e.Header)
}

func NewErrGeminiStatusCode(code int, header string) error {
	var msg string
	switch {
	case code >= 10 && code < 20:
		msg = "needs input"
	case code >= 30 && code < 40:
		msg = "redirect"
	case code >= 40 && code < 50:
		msg = "bad request"
	case code >= 50 && code < 60:
		msg = "server error"
	case code >= 60 && code < 70:
		msg = "TLS error"
	default:
		msg = "unexpected status code"
	}
	return &GeminiError{
		Msg:    msg,
		Code:   code,
		Header: header,
	}
}

var (
	ErrGeminiRobotsParse      = errors.New("gemini robots.txt parse error")
	ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
	ErrGeminiResponseHeader   = errors.New("gemini response header error")
	ErrGeminiRedirect         = errors.New("gemini redirection error")
	ErrGeminiLinkLineParse    = errors.New("gemini link line parse error")

	ErrURLParse     = errors.New("URL parse error")
	ErrURLNotGemini = errors.New("not a Gemini URL")
	ErrURLDecode    = errors.New("URL decode error")
	ErrUTF8Parse    = errors.New("UTF-8 parse error")
	ErrTextParse    = errors.New("text parse error")

	ErrNetwork                        = errors.New("network error")
	ErrNetworkDNS                     = errors.New("network DNS error")
	ErrNetworkTLS                     = errors.New("network TLS error")
	ErrNetworkSetConnectionDeadline   = errors.New("network error - cannot set connection deadline")
	ErrNetworkCannotWrite             = errors.New("network error - cannot write")
	ErrNetworkResponseSizeExceededMax = errors.New("network error - response size exceeded maximum size")

	ErrDatabase = errors.New("database error")
)

// We could have used a map for speed, but
// we would lose ability to check wrapped
// errors via errors.Is().

var errGemini *GeminiError

var knownErrors = []error{ //nolint:gochecknoglobals
	errGemini,
	ErrGeminiLinkLineParse,
	ErrGeminiRobotsParse,
	ErrGeminiRobotsDisallowed,
	ErrGeminiResponseHeader,
	ErrGeminiRedirect,

	ErrURLParse,
	ErrURLDecode,
	ErrUTF8Parse,
	ErrTextParse,

	ErrNetwork,
	ErrNetworkDNS,
	ErrNetworkTLS,
	ErrNetworkSetConnectionDeadline,
	ErrNetworkCannotWrite,
	ErrNetworkResponseSizeExceededMax,

	ErrDatabase,
}

func IsKnownError(err error) bool {
	for _, known := range knownErrors {
		if errors.Is(err, known) {
			return true
		}
	}
	return errors.As(err, new(*GeminiError))
}
29 common/errors/hostError.go Normal file
@@ -0,0 +1,29 @@
package commonErrors

import (
	"errors"

	"git.antanst.com/antanst/xerrors"
)

type HostError struct {
	xerrors.XError
}

func IsHostError(err error) bool {
	var temp *HostError
	return errors.As(err, &temp)
}

func NewHostError(err error) error {
	xerr := xerrors.XError{
		UserMsg: "",
		Code:    0,
		Err:     err,
		IsFatal: false,
	}

	return &HostError{
		xerr,
	}
}
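A hedged sketch of how a worker might use this type: wrap a connection-level failure so the scheduler can later recognize it as host-scoped rather than URL-scoped. The `fetchHost` function and the stand-in error are illustrations, not part of this diff; only `NewHostError` and `IsHostError` come from the file above:

```go
package main

import (
	"errors"
	"fmt"

	commonErrors "gemini-grc/common/errors"
)

// fetchHost is a hypothetical call site: any dial-level failure is
// wrapped as a HostError so callers can back off the whole host.
func fetchHost(host string) error {
	err := errors.New("connection refused") // stand-in for a real dial error
	return commonErrors.NewHostError(err)
}

func main() {
	err := fetchHost("example.com")
	if commonErrors.IsHostError(err) {
		fmt.Println("host-level failure, backing off:", err)
	}
}
```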
8 common/errors/sentinelErrors.go Normal file
@@ -0,0 +1,8 @@
package commonErrors

import "fmt"

var (
	ErrBlacklistMatch = fmt.Errorf("black list match")
	ErrRobotsMatch    = fmt.Errorf("robots match")
)
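These sentinels are meant for `errors.Is` checks after wrapping with `%w`. A small sketch; the `crawl` function and its message are assumptions:

```go
package main

import (
	"errors"
	"fmt"

	commonErrors "gemini-grc/common/errors"
)

// crawl is a hypothetical function that refuses blacklisted URLs.
func crawl(url string) error {
	return fmt.Errorf("skipping %s: %w", url, commonErrors.ErrBlacklistMatch)
}

func main() {
	err := crawl("gemini://badhost.com/")
	if errors.Is(err, commonErrors.ErrBlacklistMatch) {
		fmt.Println("URL was blacklisted; not retrying")
	}
}
```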
@@ -1,24 +0,0 @@
package common

import (
	"errors"
	"fmt"
	"testing"
)

func TestErrGemini(t *testing.T) {
	t.Parallel()
	err := NewErrGeminiStatusCode(50, "50 server error")
	if !errors.As(err, new(*GeminiError)) {
		t.Errorf("TestErrGemini fail")
	}
}

func TestErrGeminiWrapped(t *testing.T) {
	t.Parallel()
	err := NewErrGeminiStatusCode(50, "50 server error")
	errWrapped := fmt.Errorf("%w wrapped", err)
	if !errors.As(errWrapped, new(*GeminiError)) {
		t.Errorf("TestErrGeminiWrapped fail")
	}
}
@@ -1,223 +0,0 @@
package common

import (
	"reflect"
	"testing"
)

func TestParseURL(t *testing.T) {
	t.Parallel()
	input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
	parsed, err := ParseURL(input, "")
	value, _ := parsed.Value()
	if err != nil || !(value == "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162") {
		t.Errorf("fail: %s", parsed)
	}
}

func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
	t.Parallel()
	currentURL := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	input := "gemini://a.b/c"
	output, err := DeriveAbsoluteURL(currentURL, input)
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	expected := &URL{
		Protocol: "gemini",
		Hostname: "a.b",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://a.b:1965/c",
	}
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
	t.Parallel()
	currentURL := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	input := "/c"
	output, err := DeriveAbsoluteURL(currentURL, input)
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	expected := &URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/c",
	}
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
	t.Parallel()
	currentURL := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	input := "c/d"
	output, err := DeriveAbsoluteURL(currentURL, input)
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	expected := &URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b/c/d",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/a/b/c/d",
	}
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizeURLSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/retro-computing/magazines/"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := input
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizeURLNoSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/retro-computing/magazines"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := input
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizeMultiSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/retro-computing/////////a///magazines"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := "gemini://uscoffings.net/retro-computing/a/magazines"
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizeTrailingSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := "gemini://uscoffings.net/"
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizeNoTrailingSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := "gemini://uscoffings.net"
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizeTrailingSlashPath(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/a/"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := "gemini://uscoffings.net/a/"
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizeNoTrailingSlashPath(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/a"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := "gemini://uscoffings.net/a"
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizeDot(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/retro-computing/./././////a///magazines"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := "gemini://uscoffings.net/retro-computing/a/magazines"
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizePort(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net:1965/a"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := "gemini://uscoffings.net/a"
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}

func TestNormalizeURL(t *testing.T) {
	t.Parallel()
	input := "gemini://chat.gemini.lehmann.cx:11965/"
	normalized, _ := NormalizeURL(input)
	output := normalized.String()
	expected := "gemini://chat.gemini.lehmann.cx:11965/"
	pass := reflect.DeepEqual(output, expected)
	if !pass {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
34 common/linkList/linkList.go Normal file
@@ -0,0 +1,34 @@
package linkList

import (
	"database/sql/driver"
	"encoding/json"
	"fmt"

	"gemini-grc/common/url"
)

type LinkList []url.URL

func (l LinkList) Value() (driver.Value, error) {
	if len(l) == 0 {
		return nil, nil
	}
	data, err := json.Marshal(l)
	if err != nil {
		return nil, err
	}
	return data, nil
}

func (l *LinkList) Scan(value interface{}) error {
	if value == nil {
		*l = nil
		return nil
	}
	b, ok := value.([]byte)
	if !ok {
		return fmt.Errorf("failed to scan LinkList: expected []byte, got %T", value)
	}
	return json.Unmarshal(b, l)
}
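A round-trip sketch of the Value/Scan pair above, outside of any database: `Value` produces the JSON bytes that would land in the column, `Scan` reads them back. The URL literal is a placeholder:

```go
package main

import (
	"fmt"

	"gemini-grc/common/linkList"
	commonUrl "gemini-grc/common/url"
)

func main() {
	links := linkList.LinkList{
		commonUrl.URL{Protocol: "gemini", Hostname: "example.com", Port: 1965, Path: "/", Full: "gemini://example.com:1965/"},
	}

	// Value marshals to JSON; this is what the driver would store.
	v, err := links.Value()
	if err != nil {
		panic(err)
	}

	// Scan unmarshals the same bytes back into a LinkList.
	var decoded linkList.LinkList
	if err := decoded.Scan(v); err != nil {
		panic(err)
	}
	fmt.Println(decoded[0].Full) // gemini://example.com:1965/
}
```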
67 common/seedList/seedlist.go Normal file
@@ -0,0 +1,67 @@
package seedList

import (
	"fmt"
	"os"
	"strings"

	"git.antanst.com/antanst/logging"
	"git.antanst.com/antanst/xerrors"
)

var seedlist []string //nolint:gochecknoglobals

func Initialize() error {
	var err error

	// Initialize seedlist from fixed path
	if err = loadSeedlist("seed_urls.txt"); err != nil {
		return err
	}

	return nil
}

func loadSeedlist(filePath string) error {
	if seedlist != nil {
		return nil
	}

	data, err := os.ReadFile(filePath)
	if err != nil {
		seedlist = []string{}
		return xerrors.NewError(fmt.Errorf("could not load seedlist file: %w", err), 0, "", true)
	}

	lines := strings.Split(string(data), "\n")
	seedlist = []string{}

	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		seedlist = append(seedlist, line)
	}

	if len(seedlist) > 0 {
		logging.LogInfo("Loaded %d seed URLs", len(seedlist))
	}

	return nil
}

func Shutdown() error {
	return nil
}

// GetSeedURLs returns the list of seed URLs
func GetSeedURLs() []string {
	if seedlist == nil {
		return []string{}
	}
	// Return a copy to prevent external modification
	result := make([]string, len(seedlist))
	copy(result, seedlist)
	return result
}
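The seed file format follows directly from `loadSeedlist`: one URL per line, blank lines and `#` comments skipped, read from a fixed `seed_urls.txt` in the working directory. A usage sketch:

```go
package main

import (
	"fmt"

	"gemini-grc/common/seedList"
)

func main() {
	// Reads ./seed_urls.txt; a missing file returns a fatal xerror.
	if err := seedList.Initialize(); err != nil {
		panic(err)
	}
	for _, u := range seedList.GetSeedURLs() {
		fmt.Println("seed:", u)
	}
}
```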
67 common/seedList/seedlist_test.go Normal file
@@ -0,0 +1,67 @@
package seedList

import (
	"os"
	"testing"
)

func TestLoadSeedlist(t *testing.T) {
	// Create a temporary test file
	content := `# Test seed URLs
gemini://example.com/
gemini://test.com/

# Another comment
gemini://demo.org/`

	tmpFile, err := os.CreateTemp("", "seed_urls_test_*.txt")
	if err != nil {
		t.Fatalf("Failed to create temp file: %v", err)
	}
	defer os.Remove(tmpFile.Name())

	if _, err := tmpFile.WriteString(content); err != nil {
		t.Fatalf("Failed to write to temp file: %v", err)
	}
	tmpFile.Close()

	// Reset global variable for test
	seedlist = nil

	// Test loading
	err = loadSeedlist(tmpFile.Name())
	if err != nil {
		t.Fatalf("Failed to load seedlist: %v", err)
	}

	// Verify content
	expected := []string{
		"gemini://example.com/",
		"gemini://test.com/",
		"gemini://demo.org/",
	}

	urls := GetSeedURLs()
	if len(urls) != len(expected) {
		t.Errorf("Expected %d URLs, got %d", len(expected), len(urls))
	}

	for i, url := range urls {
		if url != expected[i] {
			t.Errorf("Expected URL %d to be %s, got %s", i, expected[i], url)
		}
	}
}

func TestGetSeedURLsEmptyList(t *testing.T) {
	// Reset global variable
	originalSeedlist := seedlist
	defer func() { seedlist = originalSeedlist }()

	seedlist = nil

	urls := GetSeedURLs()
	if len(urls) != 0 {
		t.Errorf("Expected empty list, got %d URLs", len(urls))
	}
}
19 common/shared.go Normal file
@@ -0,0 +1,19 @@
package common

import (
	"os"
	"sync"
)

// FatalErrorsChan accepts errors from workers.
// In case of fatal error, gracefully
// exits the application.
var (
	FatalErrorsChan chan error
	SignalsChan     chan os.Signal
	WorkerWG        sync.WaitGroup
)

const VERSION string = "0.0.1"

const CtxKeyLogger string = "CtxKeyLogger"
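The diff only declares these globals; the sketch below is one plausible wiring for them, not the repo's actual main loop, which is not part of this comparison:

```go
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"

	"gemini-grc/common"
)

func main() {
	common.FatalErrorsChan = make(chan error, 1)
	common.SignalsChan = make(chan os.Signal, 1)
	signal.Notify(common.SignalsChan, syscall.SIGINT, syscall.SIGTERM)

	// Workers would be started here and registered on common.WorkerWG.

	select {
	case err := <-common.FatalErrorsChan:
		fmt.Fprintln(os.Stderr, "fatal worker error:", err)
	case sig := <-common.SignalsChan:
		fmt.Fprintln(os.Stderr, "received signal:", sig)
	}
	common.WorkerWG.Wait() // drain workers before exiting
}
```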
@@ -1,56 +0,0 @@
package common

import (
	"database/sql/driver"
	"encoding/json"
	"fmt"
	"time"

	"github.com/guregu/null/v5"
)

type LinkList []URL

func (l *LinkList) Value() (driver.Value, error) {
	return json.Marshal(l)
}

func (l *LinkList) Scan(value interface{}) error {
	if value == nil {
		*l = nil
		return nil
	}
	b, ok := value.([]byte) // Type assertion! Converts to []byte
	if !ok {
		return fmt.Errorf("failed to scan LinkList: expected []byte, got %T", value)
	}
	return json.Unmarshal(b, l)
}

type Snapshot struct {
	ID           int                  `db:"id" json:"id,omitempty"`
	URL          URL                  `db:"url" json:"url,omitempty"`
	Host         string               `db:"host" json:"host,omitempty"`
	Timestamp    null.Time            `db:"timestamp" json:"timestamp,omitempty"`
	MimeType     null.String          `db:"mimetype" json:"mimetype,omitempty"`
	Data         null.Value[[]byte]   `db:"data" json:"data,omitempty"`    // For non text/gemini files.
	GemText      null.String          `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
	Header       null.String          `db:"header" json:"header,omitempty"`   // Response header.
	Links        null.Value[LinkList] `db:"links" json:"links,omitempty"`
	Lang         null.String          `db:"lang" json:"lang,omitempty"`
	ResponseCode null.Int             `db:"response_code" json:"code,omitempty"` // Gemini response status code.
	Error        null.String          `db:"error" json:"error,omitempty"`        // On network errors only
}

func SnapshotFromURL(u string) *Snapshot {
	url, err := ParseURL(u, "")
	if err != nil {
		return nil
	}
	newSnapshot := Snapshot{
		URL:       *url,
		Host:      url.Hostname,
		Timestamp: null.TimeFrom(time.Now()),
	}
	return &newSnapshot
}
39 common/snapshot/snapshot.go Normal file
@@ -0,0 +1,39 @@
package snapshot

import (
	"time"

	"gemini-grc/common/linkList"
	commonUrl "gemini-grc/common/url"
	"git.antanst.com/antanst/xerrors"
	"github.com/guregu/null/v5"
)

type Snapshot struct {
	ID           int                           `db:"id" json:"ID,omitempty"`
	URL          commonUrl.URL                 `db:"url" json:"url,omitempty"`
	Host         string                        `db:"host" json:"host,omitempty"`
	Timestamp    null.Time                     `db:"timestamp" json:"timestamp,omitempty"`
	MimeType     null.String                   `db:"mimetype" json:"mimetype,omitempty"`
	Data         null.Value[[]byte]            `db:"data" json:"data,omitempty"`    // For non text/gemini files.
	GemText      null.String                   `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
	Header       null.String                   `db:"header" json:"header,omitempty"`   // Response header.
	Links        null.Value[linkList.LinkList] `db:"links" json:"links,omitempty"`
	Lang         null.String                   `db:"lang" json:"lang,omitempty"`
	ResponseCode null.Int                      `db:"response_code" json:"code,omitempty"` // Gemini response status code.
	Error        null.String                   `db:"error" json:"error,omitempty"`        // On network errors only
	LastCrawled  null.Time                     `db:"last_crawled" json:"last_crawled,omitempty"` // When URL was last processed (regardless of content changes)
}

func SnapshotFromURL(u string, normalize bool) (*Snapshot, error) {
	url, err := commonUrl.ParseURL(u, "", normalize)
	if err != nil {
		return nil, xerrors.NewSimpleError(err)
	}
	newSnapshot := Snapshot{
		URL:       *url,
		Host:      url.Hostname,
		Timestamp: null.TimeFrom(time.Now()),
	}
	return &newSnapshot, nil
}
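A hedged sketch of building a snapshot with the new two-argument constructor; `null.StringFrom` is the guregu/null helper the file already relies on for timestamps, and the URL and content here are placeholders:

```go
package main

import (
	"fmt"

	"gemini-grc/common/snapshot"
	"github.com/guregu/null/v5"
)

func main() {
	// normalize=true runs the input through NormalizeURL before parsing.
	snap, err := snapshot.SnapshotFromURL("gemini://example.com/index.gmi", true)
	if err != nil {
		panic(err)
	}
	snap.MimeType = null.StringFrom("text/gemini")
	snap.GemText = null.StringFrom("# Hello\n=> /about About")
	fmt.Println(snap.URL.Full, snap.Timestamp.Time)
}
```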
9 common/text/text.go Normal file
@@ -0,0 +1,9 @@
package text

import "strings"

// RemoveNullChars removes all null characters from the input string.
func RemoveNullChars(input string) string {
	// Replace all null characters with an empty string
	return strings.ReplaceAll(input, "\u0000", "")
}
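For illustration, the helper in use; the motivation (PostgreSQL text columns reject NUL bytes) is an assumption, not stated in the diff:

```go
package main

import (
	"fmt"

	"gemini-grc/common/text"
)

func main() {
	fmt.Println(text.RemoveNullChars("abc\u0000def")) // prints "abcdef"
}
```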
@@ -1,12 +1,15 @@
-package common
+package url

 import (
 	"database/sql/driver"
 	"fmt"
 	"net/url"
 	"path"
+	"regexp"
 	"strconv"
 	"strings"
+
+	"git.antanst.com/antanst/xerrors"
 )

 type URL struct {
@@ -26,11 +29,10 @@ func (u *URL) Scan(value interface{}) error {
 	}
 	b, ok := value.(string)
 	if !ok {
-		return fmt.Errorf("failed to scan GeminiUrl: expected string, got %T", value)
+		return xerrors.NewError(fmt.Errorf("database scan error: expected string, got %T", value), 0, "", true)
 	}
-	parsedURL, err := ParseURLNoNormalize(b, "")
+	parsedURL, err := ParseURL(b, "", false)
 	if err != nil {
-		err = fmt.Errorf("failed to scan GeminiUrl %s: %v", b, err)
 		return err
 	}
 	*u = *parsedURL
@@ -42,9 +44,15 @@ func (u URL) String() string {
 }

 func (u URL) StringNoDefaultPort() string {
+	if IsGeminiUrl(u.String()) {
 		if u.Port == 1965 {
 			return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
 		}
+	} else {
+		if u.Port == 70 {
+			return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
+		}
+	}
 	return u.Full
 }

@@ -55,49 +63,52 @@ func (u URL) Value() (driver.Value, error) {
 	return u.Full, nil
 }

-func ParseURLNoNormalize(input string, descr string) (*URL, error) {
-	u, err := url.Parse(input)
-	if err != nil {
-		return nil, fmt.Errorf("%w: Input %s URL Parse Error: %w", ErrURLParse, input, err)
-	}
-	if u.Scheme != "gemini" {
-		return nil, fmt.Errorf("%w: URL scheme '%s' is not supported", ErrURLNotGemini, u.Scheme)
-	}
-	protocol := u.Scheme
-	hostname := u.Hostname()
-	strPort := u.Port()
-	urlPath := u.Path
-	if strPort == "" {
-		strPort = "1965"
-	}
-	port, err := strconv.Atoi(strPort)
-	if err != nil {
-		return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err)
-	}
-	full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, urlPath)
-	return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: urlPath, Descr: descr, Full: full}, nil
-}
+func IsGeminiUrl(url string) bool {
+	return strings.HasPrefix(url, "gemini://")
+}
+
+func IsGopherURL(s string) bool {
+	return strings.HasPrefix(s, "gopher://")
+}

-func ParseURL(input string, descr string) (*URL, error) {
-	u, err := NormalizeURL(input)
-	if err != nil {
-		return nil, fmt.Errorf("%w: Input %s URL Parse Error: %w", ErrURLParse, input, err)
-	}
-	if u.Scheme != "gemini" {
-		return nil, fmt.Errorf("%w: URL scheme '%s' is not supported", ErrURLNotGemini, u.Scheme)
-	}
+func ParseURL(input string, descr string, normalize bool) (*URL, error) {
+	var u *url.URL
+	var err error
+	if normalize {
+		u, err = NormalizeURL(input)
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		u, err = url.Parse(input)
+		if err != nil {
+			return nil, xerrors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input), 0, "", false)
+		}
+	}
 	protocol := u.Scheme
 	hostname := u.Hostname()
 	strPort := u.Port()
+	// urlPath := u.EscapedPath()
 	urlPath := u.Path
 	if strPort == "" {
-		strPort = "1965"
+		if u.Scheme == "gemini" {
+			strPort = "1965" // default Gemini port
+		} else {
+			strPort = "70" // default Gopher port
+		}
 	}
 	port, err := strconv.Atoi(strPort)
 	if err != nil {
-		return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err)
+		return nil, xerrors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input), 0, "", false)
 	}
 	full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, urlPath)
+	// full field should also contain query params and url fragments
+	if u.RawQuery != "" {
+		full += "?" + u.RawQuery
+	}
+	if u.Fragment != "" {
+		full += "#" + u.Fragment
+	}
 	return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: urlPath, Descr: descr, Full: full}, nil
 }

@@ -107,7 +118,7 @@ func ParseURL(input string, descr string) (*URL, error) {
 func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
 	// If target URL is absolute, return just it
 	if strings.Contains(input, "://") {
-		return ParseURL(input, "")
+		return ParseURL(input, "", true)
 	}
 	// input is a relative path. Clean it and construct absolute.
 	var newPath string
@@ -120,10 +131,10 @@ func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
 		newPath = path.Join(currentURL.Path, "/", path.Clean(input))
 	}
 	strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath)
-	return ParseURL(strURL, "")
+	return ParseURL(strURL, "", true)
 }

-// NormalizeURL takes a URL string and returns a normalized version.
+// NormalizeURL takes a URL string and returns a normalized version
 // Normalized meaning:
 // - Path normalization (removing redundant slashes, . and .. segments)
 // - Proper escaping of special characters
@@ -134,7 +145,13 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
 	// Parse the URL
 	u, err := url.Parse(rawURL)
 	if err != nil {
-		return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
+		return nil, xerrors.NewError(fmt.Errorf("error normalizing URL: %w: %s", err, rawURL), 0, "", false)
+	}
+	if u.Scheme == "" {
+		return nil, xerrors.NewError(fmt.Errorf("error normalizing URL: No scheme: %s", rawURL), 0, "", false)
+	}
+	if u.Host == "" {
+		return nil, xerrors.NewError(fmt.Errorf("error normalizing URL: No host: %s", rawURL), 0, "", false)
 	}

 	// Convert scheme to lowercase
@@ -145,7 +162,7 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
 		u.Host = strings.ToLower(u.Host)
 	}

-	// Remove default ports
+	// remove default ports
 	if u.Port() != "" {
 		switch {
 		case u.Scheme == "http" && u.Port() == "80":
@@ -154,6 +171,8 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
 			u.Host = u.Hostname()
 		case u.Scheme == "gemini" && u.Port() == "1965":
 			u.Host = u.Hostname()
+		case u.Scheme == "gopher" && u.Port() == "70":
+			u.Host = u.Hostname()
 		}
 	}

@@ -162,7 +181,7 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
 	// Check if there was a trailing slash before cleaning
 	hadTrailingSlash := strings.HasSuffix(u.Path, "/")

-	u.Path = path.Clean(u.Path)
+	u.Path = path.Clean(u.EscapedPath())
 	// If path was "/", path.Clean() will return "."
 	if u.Path == "." {
 		u.Path = "/"
@@ -172,20 +191,25 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
 		}
 	}

-	// Properly escape the path
-	// First split on '/' to avoid escaping them
+	// Properly escape the path, but only for unescaped parts
 	parts := strings.Split(u.Path, "/")
 	for i, part := range parts {
-		parts[i] = url.PathEscape(part)
+		// Try to unescape to check if it's already escaped
+		unescaped, err := url.PathUnescape(part)
+		if err != nil || unescaped == part {
+			// Part is not escaped, so escape it
+			parts[i] = url.PathEscape(part)
+		}
+		// If already escaped, leave as is
 	}
 	u.Path = strings.Join(parts, "/")

-	// Remove trailing fragment if empty
+	// remove trailing fragment if empty
 	if u.Fragment == "" {
 		u.Fragment = ""
 	}

-	// Remove trailing query if empty
+	// remove trailing query if empty
 	if u.RawQuery == "" {
 		u.RawQuery = ""
 	}
@@ -198,7 +222,7 @@ func EscapeURL(input string) string {
 	if strings.Contains(input, "%") && !strings.Contains(input, "% ") {
 		return input
 	}
-	// Split URL into parts (protocol, host, path)
+	// Split URL into parts (protocol, host, p)
 	parts := strings.SplitN(input, "://", 2)
 	if len(parts) != 2 {
 		return input
@@ -212,18 +236,50 @@ func EscapeURL(input string) string {
 		return input
 	}

-	// Split host and path
+	// Split host and p
 	parts = strings.SplitN(remainder, "/", 2)
 	host := parts[0]
 	if len(parts) == 1 {
 		return protocol + "://" + host
 	}

-	path := parts[1]
-
 	// Escape the path portion
-	escapedPath := url.PathEscape(path)
+	escapedPath := url.PathEscape(parts[1])

 	// Reconstruct the URL
 	return protocol + "://" + host + "/" + escapedPath
 }
+
+// TrimTrailingPathSlash trims trailing slash and handles empty path
+func TrimTrailingPathSlash(path string) string {
+	// Handle empty path (e.g., "http://example.com" -> treat as root)
+	if path == "" {
+		return "/"
+	}
+
+	// Trim trailing slash while preserving root slash
+	path = strings.TrimSuffix(path, "/")
+	if path == "" { // This happens if path was just "/"
+		return "/"
+	}
+	return path
+}
+
+// ExtractRedirectTargetFromHeader returns the redirection
+// URL by parsing the header (or error message)
+func ExtractRedirectTargetFromHeader(currentURL URL, input string) (*URL, error) {
+	// \d+ - matches one or more digits
+	// \s+ - matches one or more whitespace
+	// ([^\r]+) - captures everything until it hits a \r (or end of string)
+	pattern := `\d+\s+([^\r]+)`
+	re := regexp.MustCompile(pattern)
+	matches := re.FindStringSubmatch(input)
+	if len(matches) < 2 {
+		return nil, xerrors.NewError(fmt.Errorf("error extracting redirect target from string %s", input), 0, "", false)
+	}
+	newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
+	if err != nil {
+		return nil, err
+	}
+	return newURL, nil
+}
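A small sketch of the new three-argument ParseURL and the Gopher default-port handling it introduces; the `commonUrl` alias mirrors how snapshot.go imports this package, and the URL is a placeholder:

```go
package main

import (
	"fmt"

	commonUrl "gemini-grc/common/url"
)

func main() {
	// No explicit port: ParseURL now falls back to 70 for gopher://
	// (and 1965 for gemini://), so Full always carries a port.
	u, err := commonUrl.ParseURL("gopher://example.org/1/docs", "", true)
	if err != nil {
		panic(err)
	}
	fmt.Println(u.Full)                  // gopher://example.org:70/1/docs
	fmt.Println(u.StringNoDefaultPort()) // gopher://example.org/1/docs
}
```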
420 common/url/url_test.go Normal file
@@ -0,0 +1,420 @@
package url

import (
	"reflect"
	"testing"
)

func TestURLOperations(t *testing.T) {
	t.Parallel()

	t.Run("ParseURL", func(t *testing.T) {
		t.Parallel()
		tests := []struct {
			name     string
			input    string
			base     string
			absolute bool
			want     string
			wantErr  bool
		}{
			{
				name:     "parse CGI URL",
				input:    "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162",
				base:     "",
				absolute: true,
				want:     "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162",
			},
		}

		for _, tt := range tests {
			tt := tt
			t.Run(tt.name, func(t *testing.T) {
				t.Parallel()
				parsed, err := ParseURL(tt.input, tt.base, tt.absolute)
				if (err != nil) != tt.wantErr {
					t.Errorf("ParseURL() error = %v, wantErr %v", err, tt.wantErr)
					return
				}
				if !tt.wantErr {
					value, _ := parsed.Value()
					if value != tt.want {
						t.Errorf("ParseURL() = %v, want %v", value, tt.want)
					}
				}
			})
		}
	})

	t.Run("DeriveAbsoluteURL", func(t *testing.T) {
		t.Parallel()

		baseURL := URL{
			Protocol: "gemini",
			Hostname: "smol.gr",
			Port:     1965,
			Path:     "/a/b",
			Descr:    "Nothing",
			Full:     "gemini://smol.gr:1965/a/b",
		}

		tests := []struct {
			name     string
			current  URL
			input    string
			expected *URL
		}{
			{
				name:    "absolute URL input",
				current: baseURL,
				input:   "gemini://a.b/c",
				expected: &URL{
					Protocol: "gemini",
					Hostname: "a.b",
					Port:     1965,
					Path:     "/c",
					Full:     "gemini://a.b:1965/c",
				},
			},
			{
				name:    "absolute path input",
				current: baseURL,
				input:   "/c",
				expected: &URL{
					Protocol: "gemini",
					Hostname: "smol.gr",
					Port:     1965,
					Path:     "/c",
					Full:     "gemini://smol.gr:1965/c",
				},
			},
			{
				name:    "relative path input",
				current: baseURL,
				input:   "c/d",
				expected: &URL{
					Protocol: "gemini",
					Hostname: "smol.gr",
					Port:     1965,
					Path:     "/a/b/c/d",
					Full:     "gemini://smol.gr:1965/a/b/c/d",
				},
			},
		}

		for _, tt := range tests {
			tt := tt
			t.Run(tt.name, func(t *testing.T) {
				t.Parallel()
				output, err := DeriveAbsoluteURL(tt.current, tt.input)
				if err != nil {
					t.Fatalf("unexpected error: %v", err)
				}
				if !reflect.DeepEqual(output, tt.expected) {
					t.Errorf("got %#v, want %#v", output, tt.expected)
				}
			})
		}
	})

	t.Run("CheckAndUpdateNormalizedURL", func(t *testing.T) {
		t.Parallel()

		tests := []struct {
			name     string
			input    string
			expected string
		}{
			{
				name:     "with trailing slash",
				input:    "gemini://uscoffings.net/retro-computing/magazines/",
				expected: "gemini://uscoffings.net/retro-computing/magazines/",
			},
			{
				name:     "without trailing slash",
				input:    "gemini://uscoffings.net/retro-computing/magazines",
				expected: "gemini://uscoffings.net/retro-computing/magazines",
			},
			{
				name:     "multiple slashes",
				input:    "gemini://uscoffings.net/retro-computing/////////a///magazines",
				expected: "gemini://uscoffings.net/retro-computing/a/magazines",
			},
			{
				name:     "root with trailing slash",
				input:    "gemini://uscoffings.net/",
				expected: "gemini://uscoffings.net/",
			},
			{
				name:     "root without trailing slash",
				input:    "gemini://uscoffings.net",
				expected: "gemini://uscoffings.net",
			},
			{
				name:     "path with trailing slash",
				input:    "gemini://uscoffings.net/a/",
				expected: "gemini://uscoffings.net/a/",
			},
			{
				name:     "path without trailing slash",
				input:    "gemini://uscoffings.net/a",
				expected: "gemini://uscoffings.net/a",
			},
			{
				name:     "with dot segments",
				input:    "gemini://uscoffings.net/retro-computing/./././////a///magazines",
				expected: "gemini://uscoffings.net/retro-computing/a/magazines",
			},
			{
				name:     "with default port",
				input:    "gemini://uscoffings.net:1965/a",
				expected: "gemini://uscoffings.net/a",
			},
		}

		for _, tt := range tests {
			tt := tt
			t.Run(tt.name, func(t *testing.T) {
				t.Parallel()
				normalized, err := NormalizeURL(tt.input)
				if err != nil {
					t.Fatalf("unexpected error: %v", err)
				}
				output := normalized.String()
				if output != tt.expected {
					t.Errorf("got %#v, want %#v", output, tt.expected)
				}
			})
		}
	})
}

func TestNormalizeURL(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "URL with non-default port",
			input:    "gemini://chat.gemini.lehmann.cx:11965/",
			expected: "gemini://chat.gemini.lehmann.cx:11965/",
		},
		{
			name:     "URL with query parameters",
			input:    "gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c",
			expected: "gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c",
		},
		{
			name:     "URL with fragment",
			input:    "gemini://chat.gemini.lehmann.cx:11965/index#1",
			expected: "gemini://chat.gemini.lehmann.cx:11965/index#1",
		},
		{
			name:     "URL with CGI script and query",
			input:    "gemini://gemi.dev/cgi-bin/xkcd.cgi?1494",
			expected: "gemini://gemi.dev/cgi-bin/xkcd.cgi?1494",
		},
	}

	for _, tt := range tests {
		tt := tt // capture range variable for parallel testing
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()
			normalized, err := NormalizeURL(tt.input)
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			output := normalized.String()
			if output != tt.expected {
				t.Errorf("got %#v, want %#v", output, tt.expected)
			}
		})
	}
}

func TestNormalizePath(t *testing.T) {
	t.Parallel()
	tests := []struct {
		name     string
		input    string // URL string to parse
		expected string // Expected normalized path
	}{
		// Basic cases
		{
			name:     "empty_path",
			input:    "http://example.com",
			expected: "",
		},
		{
			name:     "root_path",
			input:    "http://example.com/",
			expected: "/",
		},
		{
			name:     "single_trailing_slash",
			input:    "http://example.com/test/",
			expected: "/test/",
		},
		{
			name:     "no_trailing_slash",
			input:    "http://example.com/test",
			expected: "/test",
		},

		// Edge cases with slashes
		{
			name:     "multiple_trailing_slashes",
			input:    "http://example.com/test//",
			expected: "/test/",
		},
		{
			name:     "multiple_consecutive_slashes",
			input:    "http://example.com//test//",
			expected: "/test/",
		},
		{
			name:     "only_slashes",
			input:    "http://example.com////",
			expected: "/",
		},

		// Encoded characters
		{
			name:     "encoded_spaces",
			input:    "http://example.com/foo%20bar/",
			expected: "/foo%20bar/",
		},
		{
			name:     "encoded_special_chars",
			input:    "http://example.com/foo%2Fbar/",
			expected: "/foo%2Fbar/",
		},

		// Query parameters and fragments
		{
			name:     "with_query_parameters",
			input:    "http://example.com/path?query=param",
			expected: "/path",
		},
		{
			name:     "with_fragment",
			input:    "http://example.com/path#fragment",
			expected: "/path",
		},
		{
			name:     "with_both_query_and_fragment",
			input:    "http://example.com/path?query=param#fragment",
			expected: "/path",
		},

		// Unicode paths
		{
			name:     "unicode_characters",
			input:    "http://example.com/über/path/",
			expected: "/%C3%BCber/path/",
		},
		{
			name:     "unicode_encoded",
			input:    "http://example.com/%C3%BCber/path/",
			expected: "/%C3%BCber/path/",
		},

		// Weird but valid cases
		{
			name:     "dot_in_path",
			input:    "http://example.com/./path/",
			expected: "/path/",
		},
		{
			name:     "double_dot_in_path",
			input:    "http://example.com/../path/",
			expected: "/path/",
		},
		{
			name:     "mixed_case",
			input:    "http://example.com/PaTh/",
			expected: "/PaTh/",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()
			u, err := ParseURL(tt.input, "", true)
			if err != nil {
				t.Fatalf("Failed to parse URL %q: %v", tt.input, err)
			}

			result := u.Path
			if result != tt.expected {
				t.Errorf("Input: %s\nExpected: %q\nGot: %q",
					u.Path, tt.expected, result)
			}
		})
	}
}

func TestExtractRedirectTargetFullURL(t *testing.T) {
	t.Parallel()
	currentURL, _ := ParseURL("gemini://smol.gr", "", true)
	input := "redirect: 31 gemini://target.gr"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	expected := "gemini://target.gr:1965"
	if err != nil || (result.String() != expected) {
		t.Errorf("fail: Expected %s got %s", expected, result)
	}
}

func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
	t.Parallel()
	currentURL, _ := ParseURL("gemini://smol.gr", "", true)
	input := "redirect: 31 gemini://target.gr/"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	expected := "gemini://target.gr:1965/"
	if err != nil || (result.String() != expected) {
		t.Errorf("fail: Expected %s got %s", expected, result)
	}
}

func TestExtractRedirectTargetRelativeURL(t *testing.T) {
	t.Parallel()
	currentURL, _ := ParseURL("gemini://smol.gr", "", true)
	input := "redirect: 31 /a/b"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
		t.Errorf("fail: %s", result)
	}
}

func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
	t.Parallel()
	currentURL, _ := ParseURL("gemini://nox.im:1965", "", true)
	input := "redirect: 31 ./"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if err != nil || (result.String() != "gemini://nox.im:1965/") {
		t.Errorf("fail: %s", result)
	}
}

func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
	t.Parallel()
	currentURL, _ := ParseURL("gemini://status.zvava.org:1965", "", true)
	input := "redirect: 31 index.gmi"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if err != nil || (result.String() != "gemini://status.zvava.org:1965/index.gmi") {
		t.Errorf("fail: %s", result)
	}
}

func TestExtractRedirectTargetWrong(t *testing.T) {
	t.Parallel()
	currentURL, _ := ParseURL("gemini://smol.gr", "", true)
	input := "redirect: 31"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if result != nil || err == nil {
		t.Errorf("fail: result should be nil, err is %s", err)
	}
}
74
common/whiteList/whitelist.go
Normal file
74
common/whiteList/whitelist.go
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
package whiteList
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"gemini-grc/config"
|
||||||
|
"git.antanst.com/antanst/logging"
|
||||||
|
"git.antanst.com/antanst/xerrors"
|
||||||
|
)
|
||||||
|
|
||||||
|
var whitelist []regexp.Regexp //nolint:gochecknoglobals
|
||||||
|
|
||||||
|
func Initialize() error {
|
||||||
|
var err error
|
||||||
|
|
||||||
|
// Initialize whitelist
|
||||||
|
if config.CONFIG.WhitelistPath != "" {
|
||||||
|
if err = loadWhitelist(config.CONFIG.WhitelistPath); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadWhitelist(filePath string) error {
|
||||||
|
if whitelist != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := os.ReadFile(filePath)
|
||||||
|
if err != nil {
|
||||||
|
whitelist = []regexp.Regexp{}
|
||||||
|
return xerrors.NewError(fmt.Errorf("could not load whitelist file: %w", err), 0, "", true)
|
||||||
|
}
|
||||||
|
|
||||||
|
lines := strings.Split(string(data), "\n")
|
||||||
|
whitelist = []regexp.Regexp{}
|
||||||
|
|
||||||
|
for _, line := range lines {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" || strings.HasPrefix(line, "#") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
regex, err := regexp.Compile(line)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.NewError(fmt.Errorf("could not compile whitelist line %s: %w", line, err), 0, "", true)
|
||||||
|
}
|
||||||
|
whitelist = append(whitelist, *regex)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(whitelist) > 0 {
|
||||||
|
logging.LogInfo("Loaded %d whitelist entries", len(whitelist))
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func Shutdown() error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsWhitelisted checks if the URL matches any whitelist pattern
|
||||||
|
func IsWhitelisted(u string) bool {
|
||||||
|
for _, v := range whitelist {
|
||||||
|
if v.MatchString(u) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
87
common/whiteList/whitelist_test.go
Normal file
87
common/whiteList/whitelist_test.go
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
package whiteList
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"gemini-grc/config"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestIsWhitelisted(t *testing.T) {
|
||||||
|
// Set up a test whitelist
|
||||||
|
whitelist = []regexp.Regexp{
|
||||||
|
*regexp.MustCompile(`^gemini://example\.com`),
|
||||||
|
*regexp.MustCompile(`^gemini://test\.org/path`),
|
||||||
|
}
|
||||||
|
|
||||||
|
testCases := []struct {
|
||||||
|
url string
|
||||||
|
expected bool
|
||||||
|
}{
|
||||||
|
{"gemini://example.com", true},
|
||||||
|
{"gemini://example.com/path", true},
|
||||||
|
{"gemini://test.org", false},
|
||||||
|
{"gemini://test.org/path", true},
|
||||||
|
{"gemini://test.org/path/subpath", true},
|
||||||
|
{"gemini://other.site", false},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
result := IsWhitelisted(tc.url)
|
||||||
|
if result != tc.expected {
|
||||||
|
t.Errorf("IsWhitelisted(%s) = %v, want %v", tc.url, result, tc.expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadWhitelist(t *testing.T) {
|
||||||
|
// Create a temporary whitelist file
|
||||||
|
content := `# This is a test whitelist
|
||||||
|
^gemini://example\.com
|
||||||
|
^gemini://test\.org/path
|
||||||
|
`
|
||||||
|
tmpfile, err := os.CreateTemp("", "whitelist")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer os.Remove(tmpfile.Name())
|
||||||
|
|
||||||
|
if _, err := tmpfile.Write([]byte(content)); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := tmpfile.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset whitelist
|
||||||
|
whitelist = nil
|
||||||
|
|
||||||
|
// Set up configuration to use the temporary file
|
||||||
|
oldPath := config.CONFIG.WhitelistPath
|
||||||
|
config.CONFIG.WhitelistPath = tmpfile.Name()
|
||||||
|
defer func() {
|
||||||
|
config.CONFIG.WhitelistPath = oldPath
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Load whitelist from the file
|
||||||
|
err = loadWhitelist(tmpfile.Name())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("loadWhitelist() error = %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if whitelist was loaded correctly
|
||||||
|
if len(whitelist) != 2 {
|
||||||
|
t.Errorf("loadWhitelist() loaded %d entries, want 2", len(whitelist))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test a whitelisted URL
|
||||||
|
if !IsWhitelisted("gemini://example.com") {
|
||||||
|
t.Error("IsWhitelisted(\"gemini://example.com\") = false, want true")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test a URL in a whitelisted path
|
||||||
|
if !IsWhitelisted("gemini://test.org/path/subpage.gmi") {
|
||||||
|
t.Error("IsWhitelisted(\"gemini://test.org/path/subpage.gmi\") = false, want true")
|
||||||
|
}
|
||||||
|
}
|
||||||
394
common/worker.go
Normal file
394
common/worker.go
Normal file
@@ -0,0 +1,394 @@
|
|||||||
|
package common
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"gemini-grc/common/blackList"
|
||||||
|
"gemini-grc/common/contextlog"
|
||||||
|
commonErrors "gemini-grc/common/errors"
|
||||||
|
"gemini-grc/common/snapshot"
|
||||||
|
url2 "gemini-grc/common/url"
|
||||||
|
"gemini-grc/common/whiteList"
|
||||||
|
"gemini-grc/config"
|
||||||
|
"gemini-grc/contextutil"
|
||||||
|
gemdb "gemini-grc/db"
|
||||||
|
"gemini-grc/gemini"
|
||||||
|
"gemini-grc/gopher"
|
||||||
|
"gemini-grc/hostPool"
|
||||||
|
"gemini-grc/robotsMatch"
|
||||||
|
"git.antanst.com/antanst/logging"
|
||||||
|
"git.antanst.com/antanst/xerrors"
|
||||||
|
"github.com/guregu/null/v5"
|
||||||
|
"github.com/jmoiron/sqlx"
|
||||||
|
)
|
||||||
|
|
||||||
|
func RunWorkerWithTx(workerID int, job string) {
|
||||||
|
parsedURL, err := url2.ParseURL(job, "", true)
|
||||||
|
if err != nil {
|
||||||
|
logging.LogInfo("Failed to parse URL: %s Error: %s", job, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
host := parsedURL.Hostname
|
||||||
|
|
||||||
|
// Create a new worker context
|
||||||
|
baseCtx := context.Background()
|
||||||
|
ctx, cancel := contextutil.NewRequestContext(baseCtx, job, host, workerID)
|
||||||
|
ctx = contextutil.ContextWithComponent(ctx, "worker")
|
||||||
|
defer cancel() // Ensure the context is cancelled when we're done
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting worker for URL %s", job)
|
||||||
|
|
||||||
|
// Create a new db transaction
|
||||||
|
tx, err := gemdb.Database.NewTx(ctx)
|
||||||
|
if err != nil {
|
||||||
|
FatalErrorsChan <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
err = runWorker(ctx, tx, []string{job})
|
||||||
|
WorkerWG.Done()
|
||||||
|
if err != nil {
|
||||||
|
// Two cases to handle:
|
||||||
|
// - context cancellation/timeout errors (log and ignore)
|
||||||
|
// - fatal errors (log and send to chan)
|
||||||
|
// non-fatal errors should've been handled within
|
||||||
|
// the runWorker() function and not bubble up here.
|
||||||
|
if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker timed out or canceled: %v", err)
|
||||||
|
rollbackErr := gemdb.SafeRollback(ctx, tx)
|
||||||
|
if rollbackErr != nil {
|
||||||
|
FatalErrorsChan <- rollbackErr
|
||||||
|
return
|
||||||
|
}
|
||||||
|
return
|
||||||
|
} else if xerrors.IsFatal(err) {
|
||||||
|
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
|
||||||
|
rollbackErr := gemdb.SafeRollback(ctx, tx)
|
||||||
|
if rollbackErr != nil {
|
||||||
|
FatalErrorsChan <- rollbackErr
|
||||||
|
return
|
||||||
|
}
|
||||||
|
FatalErrorsChan <- err
|
||||||
|
return
|
||||||
|
|
||||||
|
}
|
||||||
|
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
|
||||||
|
rollbackErr := gemdb.SafeRollback(ctx, tx)
|
||||||
|
if rollbackErr != nil {
|
||||||
|
FatalErrorsChan <- rollbackErr
|
||||||
|
return
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
err = tx.Commit()
|
||||||
|
if err != nil && !errors.Is(err, sql.ErrTxDone) {
|
||||||
|
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to commit transaction: %v", err)
|
||||||
|
if rollbackErr := gemdb.SafeRollback(ctx, tx); rollbackErr != nil {
|
||||||
|
FatalErrorsChan <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker done.")
|
||||||
|
}
|
||||||
|
|
||||||
|
func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error {
|
||||||
|
for _, u := range urls {
|
||||||
|
err := WorkOnUrl(ctx, tx, u)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// WorkOnUrl visits a URL and stores the result.
|
||||||
|
// unexpected errors are returned.
|
||||||
|
// expected errors are stored within the snapshot.
|
||||||
|
func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker visiting URL %s", url)
|
||||||
|
|
||||||
|
s, err := snapshot.SnapshotFromURL(url, true)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// We always use the normalized URL
|
||||||
|
if url != s.URL.Full {
|
||||||
|
url = s.URL.Full
|
||||||
|
}
|
||||||
|
|
||||||
|
isGemini := url2.IsGeminiUrl(s.URL.String())
|
||||||
|
isGopher := url2.IsGopherURL(s.URL.String())
|
||||||
|
|
||||||
|
if !isGemini && !isGopher {
|
||||||
|
return xerrors.NewSimpleError(fmt.Errorf("not a Gopher or Gemini URL: %s", s.URL.String()))
|
||||||
|
}
|
||||||
|
|
||||||
|
if isGopher && !config.CONFIG.GopherEnable {
|
||||||
|
return xerrors.NewSimpleError(fmt.Errorf("gopher disabled, not processing Gopher URL: %s", s.URL.String()))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if URL is whitelisted
|
||||||
|
isUrlWhitelisted := whiteList.IsWhitelisted(s.URL.String())
|
||||||
|
if isUrlWhitelisted {
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches whitelist, forcing crawl %s", url)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only check blacklist if URL is not whitelisted
|
||||||
|
if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) {
|
||||||
|
s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error())
|
||||||
|
return saveSnapshotAndRemoveURL(ctx, tx, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only check robots.txt if URL is not whitelisted and is a Gemini URL
|
||||||
|
var robotMatch bool
|
||||||
|
if !isUrlWhitelisted && isGemini {
|
||||||
|
// If URL matches a robots.txt disallow line,
|
||||||
|
// add it as an error and remove url
|
||||||
|
robotMatch = robotsMatch.RobotMatch(ctx, s.URL.String())
|
||||||
|
if robotMatch {
|
||||||
|
s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error())
|
||||||
|
return saveSnapshotAndRemoveURL(ctx, tx, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err = hostPool.AddHostToHostPool(ctx, s.Host)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
defer func(ctx context.Context, host string) {
|
||||||
|
hostPool.RemoveHostFromPool(ctx, host)
|
||||||
|
}(ctx, s.Host)
|
||||||
|
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Visiting %s", s.URL.String())
|
||||||
|
|
||||||
|
// Use context-aware visits for both protocols
|
||||||
|
if isGopher {
|
||||||
|
s, err = gopher.VisitWithContext(ctx, s.URL.String())
|
||||||
|
} else {
|
||||||
|
s, err = gemini.Visit(ctx, s.URL.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle Gemini redirection.
|
||||||
|
if isGemini &&
|
||||||
|
s.ResponseCode.ValueOrZero() >= 30 &&
|
||||||
|
s.ResponseCode.ValueOrZero() < 40 {
|
||||||
|
err = saveRedirectURL(ctx, tx, s)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.NewSimpleError(fmt.Errorf("error while handling redirection: %s", err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we should skip a potentially
|
||||||
|
// identical snapshot with one from history
|
||||||
|
isIdentical, err := isContentIdentical(ctx, tx, s)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if isIdentical {
|
||||||
|
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, updating crawl timestamp")
|
||||||
|
// Update the last_crawled timestamp to track that we processed this URL
|
||||||
|
err = gemdb.Database.UpdateLastCrawled(ctx, tx, s.URL.String())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return removeURL(ctx, tx, s.URL.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process and store links since content has changed
|
||||||
|
if len(s.Links.ValueOrZero()) > 0 {
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Found %d links", len(s.Links.ValueOrZero()))
|
||||||
|
err = storeLinks(ctx, tx, s)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return saveSnapshotAndRemoveURL(ctx, tx, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func shouldUpdateSnapshotData(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) {
|
||||||
|
// If we don't have an error, save the new snapshot.
|
||||||
|
if !s.Error.Valid {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
prevSnapshot, err := gemdb.Database.GetLatestSnapshot(ctx, tx, s.URL.String())
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
// If we don't have a previous snapshot, save it anyway.
|
||||||
|
if prevSnapshot == nil {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
// If we have a previous snapshot,
|
||||||
|
// and it didn't have an error, save.
|
||||||
|
// This means that we can have a max
|
||||||
|
// of one consecutive snapshot with
|
||||||
|
// an error.
|
||||||
|
if prevSnapshot.Error.ValueOrZero() == "" {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func isContentIdentical(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) {
|
||||||
|
// Always check if content is identical to previous snapshot
|
||||||
|
identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
return identical, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// storeLinks checks and stores the snapshot links in the database.
|
||||||
|
func storeLinks(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||||
|
if s.Links.Valid { //nolint:nestif
|
||||||
|
for _, link := range s.Links.ValueOrZero() {
|
||||||
|
if shouldPersistURL(&link) {
|
||||||
|
visited, err := haveWeVisitedURL(ctx, tx, link.Full)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !visited {
|
||||||
|
err := gemdb.Database.InsertURL(ctx, tx, link.Full)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Link already persisted: %s", link.Full)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func removeURL(ctx context.Context, tx *sqlx.Tx, url string) error {
|
||||||
|
return gemdb.Database.DeleteURL(ctx, tx, url)
|
||||||
|
}
|
||||||
|
|
||||||
|
func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||||
|
shouldUpdateSnapshot, err := shouldUpdateSnapshotData(ctx, tx, s)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if shouldUpdateSnapshot {
|
||||||
|
err := gemdb.Database.SaveSnapshot(ctx, tx, s)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero())
|
||||||
|
return removeURL(ctx, tx, s.URL.String())
|
||||||
|
} else {
|
||||||
|
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s (updating crawl date)", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
|
||||||
|
err = gemdb.Database.UpdateLastCrawled(ctx, tx, s.URL.String())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return removeURL(ctx, tx, s.URL.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// shouldPersistURL returns true given URL is a
|
||||||
|
// non-blacklisted Gemini or Gopher URL.
|
||||||
|
func shouldPersistURL(u *url2.URL) bool {
|
||||||
|
if blackList.IsBlacklisted(u.String()) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if config.CONFIG.GopherEnable && url2.IsGopherURL(u.String()) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return url2.IsGeminiUrl(u.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
func haveWeVisitedURL(ctx context.Context, tx *sqlx.Tx, u string) (bool, error) {
|
||||||
|
var result []bool
|
||||||
|
|
||||||
|
// Check if the context is cancelled
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return false, xerrors.NewSimpleError(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the urls table which holds the crawl queue.
|
||||||
|
err := tx.SelectContext(ctx, &result, `SELECT TRUE FROM urls WHERE url=$1`, u)
|
||||||
|
if err != nil {
|
||||||
|
return false, xerrors.NewError(fmt.Errorf("database error: %w", err), 0, "", true)
|
||||||
|
}
|
||||||
|
if len(result) > 0 {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we're skipping URLs based on recent updates, check if this URL has been
|
||||||
|
// crawled within the specified number of days
|
||||||
|
if config.CONFIG.SkipIfUpdatedDays > 0 {
|
||||||
|
var recentSnapshots []bool
|
||||||
|
cutoffDate := time.Now().AddDate(0, 0, -config.CONFIG.SkipIfUpdatedDays)
|
||||||
|
|
||||||
|
// Check if the context is cancelled
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
err = tx.SelectContext(ctx, &recentSnapshots, `
|
||||||
|
SELECT TRUE FROM snapshots
|
||||||
|
WHERE snapshots.url=$1
|
||||||
|
AND timestamp > $2
|
||||||
|
LIMIT 1`, u, cutoffDate)
|
||||||
|
if err != nil {
|
||||||
|
return false, xerrors.NewError(fmt.Errorf("database error checking recent snapshots: %w", err), 0, "", true)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(recentSnapshots) > 0 {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func saveRedirectURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||||
|
newURL, err := url2.ExtractRedirectTargetFromHeader(s.URL, s.Header.ValueOrZero())
|
||||||
|
if err != nil {
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to extract redirect target: %v", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Page redirects to %s", newURL)
|
||||||
|
|
||||||
|
haveWeVisited, err := haveWeVisitedURL(ctx, tx, newURL.String())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if shouldPersistURL(newURL) && !haveWeVisited {
|
||||||
|
err = gemdb.Database.InsertURL(ctx, tx, newURL.Full)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Saved redirection URL %s", newURL.String())
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
//func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]snapshot.Snapshot, error) {
|
||||||
|
// query := `
|
||||||
|
// SELECT *
|
||||||
|
// FROM snapshots
|
||||||
|
// WHERE url=$1
|
||||||
|
// LIMIT 1
|
||||||
|
// `
|
||||||
|
// var snapshots []snapshot.Snapshot
|
||||||
|
// err := tx.Select(&snapshots, query, url)
|
||||||
|
// if err != nil {
|
||||||
|
// return nil, err
|
||||||
|
// }
|
||||||
|
// return snapshots, nil
|
||||||
|
//}
|
||||||
194
config/config.go
194
config/config.go
@@ -1,156 +1,90 @@
|
|||||||
package config
|
package config
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
|
||||||
|
|
||||||
"github.com/rs/zerolog"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Environment variable names.
|
|
||||||
const (
|
|
||||||
EnvLogLevel = "LOG_LEVEL"
|
|
||||||
EnvNumWorkers = "NUM_OF_WORKERS"
|
|
||||||
EnvWorkerBatchSize = "WORKER_BATCH_SIZE"
|
|
||||||
EnvMaxResponseSize = "MAX_RESPONSE_SIZE"
|
|
||||||
EnvResponseTimeout = "RESPONSE_TIMEOUT"
|
|
||||||
EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR"
|
|
||||||
EnvBlacklistPath = "BLACKLIST_PATH"
|
|
||||||
EnvDryRun = "DRY_RUN"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config holds the application configuration loaded from environment variables.
|
// Config holds the application configuration loaded from environment variables.
|
||||||
type Config struct {
|
type Config struct {
|
||||||
LogLevel zerolog.Level // Logging level (debug, info, warn, error)
|
PgURL string
|
||||||
|
LogLevel slog.Level // Logging level (debug, info, warn, error)
|
||||||
MaxResponseSize int // Maximum size of response in bytes
|
MaxResponseSize int // Maximum size of response in bytes
|
||||||
|
MaxDbConnections int // Maximum number of database connections.
|
||||||
NumOfWorkers int // Number of concurrent workers
|
NumOfWorkers int // Number of concurrent workers
|
||||||
ResponseTimeout int // Timeout for responses in seconds
|
ResponseTimeout int // Timeout for responses in seconds
|
||||||
WorkerBatchSize int // Batch size for worker processing
|
|
||||||
PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
|
|
||||||
BlacklistPath string // File that has blacklisted strings of "host:port"
|
BlacklistPath string // File that has blacklisted strings of "host:port"
|
||||||
|
WhitelistPath string // File with URLs that should always be crawled regardless of blacklist
|
||||||
DryRun bool // If false, don't write to disk
|
DryRun bool // If false, don't write to disk
|
||||||
|
GopherEnable bool // Enable Gopher crawling
|
||||||
|
SeedUrlPath string // Add URLs from file to queue
|
||||||
|
SkipIfUpdatedDays int // Skip re-crawling URLs updated within this many days (0 to disable)
|
||||||
}
|
}
|
||||||
|
|
||||||
var CONFIG Config //nolint:gochecknoglobals
|
var CONFIG Config //nolint:gochecknoglobals
|
||||||
|
|
||||||
// parsePositiveInt parses and validates positive integer values.
|
// Initialize loads and validates configuration from environment variables
|
||||||
func parsePositiveInt(param, value string) (int, error) {
|
func Initialize() *Config {
|
||||||
val, err := strconv.Atoi(value)
|
|
||||||
if err != nil {
|
|
||||||
return 0, ValidationError{
|
|
||||||
Param: param,
|
|
||||||
Value: value,
|
|
||||||
Reason: "must be a valid integer",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if val <= 0 {
|
|
||||||
return 0, ValidationError{
|
|
||||||
Param: param,
|
|
||||||
Value: value,
|
|
||||||
Reason: "must be positive",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return val, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseBool(param, value string) (bool, error) {
|
|
||||||
val, err := strconv.ParseBool(value)
|
|
||||||
if err != nil {
|
|
||||||
return false, ValidationError{
|
|
||||||
Param: param,
|
|
||||||
Value: value,
|
|
||||||
Reason: "cannot be converted to boolean",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return val, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetConfig loads and validates configuration from environment variables
|
|
||||||
func GetConfig() *Config {
|
|
||||||
config := &Config{}
|
config := &Config{}
|
||||||
|
|
||||||
// Map of environment variables to their parsing functions
|
loglevel := flag.String("log-level", "info", "Logging level (debug, info, warn, error)")
|
||||||
parsers := map[string]func(string) error{
|
pgURL := flag.String("pgurl", "", "Postgres URL")
|
||||||
EnvLogLevel: func(v string) error {
|
dryRun := flag.Bool("dry-run", false, "Dry run mode")
|
||||||
level, err := zerolog.ParseLevel(v)
|
gopherEnable := flag.Bool("gopher", false, "Enable crawling of Gopher holes")
|
||||||
|
maxDbConnections := flag.Int("max-db-connections", 100, "Maximum number of database connections")
|
||||||
|
numOfWorkers := flag.Int("workers", 1, "Number of concurrent workers")
|
||||||
|
maxResponseSize := flag.Int("max-response-size", 1024*1024, "Maximum size of response in bytes")
|
||||||
|
responseTimeout := flag.Int("response-timeout", 10, "Timeout for network responses in seconds")
|
||||||
|
blacklistPath := flag.String("blacklist-path", "", "File that has blacklist regexes")
|
||||||
|
skipIfUpdatedDays := flag.Int("skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable)")
|
||||||
|
whitelistPath := flag.String("whitelist-path", "", "File with URLs that should always be crawled regardless of blacklist")
|
||||||
|
seedUrlPath := flag.String("seed-url-path", "", "File with seed URLs that should be added to the queue immediatelly")
|
||||||
|
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
config.PgURL = *pgURL
|
||||||
|
config.DryRun = *dryRun
|
||||||
|
config.GopherEnable = *gopherEnable
|
||||||
|
config.NumOfWorkers = *numOfWorkers
|
||||||
|
config.MaxResponseSize = *maxResponseSize
|
||||||
|
config.ResponseTimeout = *responseTimeout
|
||||||
|
config.BlacklistPath = *blacklistPath
|
||||||
|
config.WhitelistPath = *whitelistPath
|
||||||
|
config.SeedUrlPath = *seedUrlPath
|
||||||
|
config.MaxDbConnections = *maxDbConnections
|
||||||
|
config.SkipIfUpdatedDays = *skipIfUpdatedDays
|
||||||
|
|
||||||
|
level, err := ParseSlogLevel(*loglevel)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return ValidationError{
|
_, _ = fmt.Fprint(os.Stderr, err.Error())
|
||||||
Param: EnvLogLevel,
|
os.Exit(-1)
|
||||||
Value: v,
|
|
||||||
Reason: "must be one of: debug, info, warn, error",
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
config.LogLevel = level
|
config.LogLevel = level
|
||||||
return nil
|
|
||||||
},
|
|
||||||
EnvNumWorkers: func(v string) error {
|
|
||||||
val, err := parsePositiveInt(EnvNumWorkers, v)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
config.NumOfWorkers = val
|
|
||||||
return nil
|
|
||||||
},
|
|
||||||
EnvWorkerBatchSize: func(v string) error {
|
|
||||||
val, err := parsePositiveInt(EnvWorkerBatchSize, v)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
config.WorkerBatchSize = val
|
|
||||||
return nil
|
|
||||||
},
|
|
||||||
EnvMaxResponseSize: func(v string) error {
|
|
||||||
val, err := parsePositiveInt(EnvMaxResponseSize, v)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
config.MaxResponseSize = val
|
|
||||||
return nil
|
|
||||||
},
|
|
||||||
EnvResponseTimeout: func(v string) error {
|
|
||||||
val, err := parsePositiveInt(EnvResponseTimeout, v)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
config.ResponseTimeout = val
|
|
||||||
return nil
|
|
||||||
},
|
|
||||||
EnvPanicOnUnexpectedError: func(v string) error {
|
|
||||||
val, err := parseBool(EnvPanicOnUnexpectedError, v)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
config.PanicOnUnexpectedError = val
|
|
||||||
return nil
|
|
||||||
},
|
|
||||||
EnvBlacklistPath: func(v string) error {
|
|
||||||
config.BlacklistPath = v
|
|
||||||
return nil
|
|
||||||
},
|
|
||||||
EnvDryRun: func(v string) error {
|
|
||||||
val, err := parseBool(EnvDryRun, v)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
config.DryRun = val
|
|
||||||
return nil
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process each environment variable
|
|
||||||
for envVar, parser := range parsers {
|
|
||||||
value, ok := os.LookupEnv(envVar)
|
|
||||||
if !ok {
|
|
||||||
fmt.Fprintf(os.Stderr, "Missing required environment variable: %s\n", envVar)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := parser(value); err != nil {
|
|
||||||
fmt.Fprintf(os.Stderr, "Configuration error: %v\n", err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return config
|
return config
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ParseSlogLevel converts a string level to slog.Level
|
||||||
|
func ParseSlogLevel(levelStr string) (slog.Level, error) {
|
||||||
|
switch levelStr {
|
||||||
|
case "debug":
|
||||||
|
return slog.LevelDebug, nil
|
||||||
|
case "info":
|
||||||
|
return slog.LevelInfo, nil
|
||||||
|
case "warn":
|
||||||
|
return slog.LevelWarn, nil
|
||||||
|
case "error":
|
||||||
|
return slog.LevelError, nil
|
||||||
|
default:
|
||||||
|
return slog.LevelInfo, fmt.Errorf("invalid log level: %s", levelStr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert method for backward compatibility with existing codebase
|
||||||
|
// This can be removed once all references to Convert() are updated
|
||||||
|
func (c *Config) Convert() *Config {
|
||||||
|
// Just return the config itself as it now directly contains slog.Level
|
||||||
|
return c
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,14 +0,0 @@
|
|||||||
package config
|
|
||||||
|
|
||||||
import "fmt"
|
|
||||||
|
|
||||||
// ValidationError represents a config validation error
|
|
||||||
type ValidationError struct {
|
|
||||||
Param string
|
|
||||||
Value string
|
|
||||||
Reason string
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e ValidationError) Error() string {
|
|
||||||
return fmt.Sprintf("invalid value '%s' for %s: %s", e.Value, e.Param, e.Reason)
|
|
||||||
}
|
|
||||||
89
contextutil/context.go
Normal file
89
contextutil/context.go
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
package contextutil
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"git.antanst.com/antanst/uid"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ContextKey type for context values
|
||||||
|
type ContextKey string
|
||||||
|
|
||||||
|
// Context keys
|
||||||
|
const (
|
||||||
|
CtxKeyURL ContextKey = "url" // Full URL being processed
|
||||||
|
CtxKeyHost ContextKey = "host" // Host of the URL
|
||||||
|
CtxKeyRequestID ContextKey = "request_id" // Unique ID for this processing request
|
||||||
|
CtxKeyWorkerID ContextKey = "worker_id" // Worker ID processing this request
|
||||||
|
CtxKeyStartTime ContextKey = "start_time" // When processing started
|
||||||
|
CtxKeyComponent ContextKey = "component" // Component name for logging
|
||||||
|
)
|
||||||
|
|
||||||
|
// NewRequestContext creates a new, cancellable context
|
||||||
|
// with a timeout and
|
||||||
|
func NewRequestContext(parentCtx context.Context, url string, host string, workerID int) (context.Context, context.CancelFunc) {
|
||||||
|
ctx, cancel := context.WithTimeout(parentCtx, 120*time.Second)
|
||||||
|
requestID := uid.UID()
|
||||||
|
ctx = context.WithValue(ctx, CtxKeyURL, url)
|
||||||
|
ctx = context.WithValue(ctx, CtxKeyHost, host)
|
||||||
|
ctx = context.WithValue(ctx, CtxKeyRequestID, requestID)
|
||||||
|
ctx = context.WithValue(ctx, CtxKeyWorkerID, workerID)
|
||||||
|
ctx = context.WithValue(ctx, CtxKeyStartTime, time.Now())
|
||||||
|
return ctx, cancel
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper functions to get values from context
|
||||||
|
|
||||||
|
// GetURLFromContext retrieves the URL from the context
|
||||||
|
func GetURLFromContext(ctx context.Context) string {
|
||||||
|
if url, ok := ctx.Value(CtxKeyURL).(string); ok {
|
||||||
|
return url
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetHostFromContext retrieves the host from the context
|
||||||
|
func GetHostFromContext(ctx context.Context) string {
|
||||||
|
if host, ok := ctx.Value(CtxKeyHost).(string); ok {
|
||||||
|
return host
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetRequestIDFromContext retrieves the request ID from the context
|
||||||
|
func GetRequestIDFromContext(ctx context.Context) string {
|
||||||
|
if id, ok := ctx.Value(CtxKeyRequestID).(string); ok {
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetWorkerIDFromContext retrieves the worker ID from the context
|
||||||
|
func GetWorkerIDFromContext(ctx context.Context) int {
|
||||||
|
if id, ok := ctx.Value(CtxKeyWorkerID).(int); ok {
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetStartTimeFromContext retrieves the start time from the context
|
||||||
|
func GetStartTimeFromContext(ctx context.Context) time.Time {
|
||||||
|
if startTime, ok := ctx.Value(CtxKeyStartTime).(time.Time); ok {
|
||||||
|
return startTime
|
||||||
|
}
|
||||||
|
return time.Time{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetComponentFromContext retrieves the component name from the context
|
||||||
|
func GetComponentFromContext(ctx context.Context) string {
|
||||||
|
if component, ok := ctx.Value(CtxKeyComponent).(string); ok {
|
||||||
|
return component
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContextWithComponent adds or updates the component name in the context
|
||||||
|
func ContextWithComponent(ctx context.Context, component string) context.Context {
|
||||||
|
return context.WithValue(ctx, CtxKeyComponent, component)
|
||||||
|
}
|
||||||
646
db/db.go
646
db/db.go
@@ -1,51 +1,66 @@
|
|||||||
package db
|
package db
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"gemini-grc/common"
|
"strings"
|
||||||
"os"
|
"sync"
|
||||||
"strconv"
|
"time"
|
||||||
|
|
||||||
|
"gemini-grc/common/contextlog"
|
||||||
|
"gemini-grc/common/snapshot"
|
||||||
|
commonUrl "gemini-grc/common/url"
|
||||||
"gemini-grc/config"
|
"gemini-grc/config"
|
||||||
"gemini-grc/logging"
|
"gemini-grc/contextutil"
|
||||||
|
"git.antanst.com/antanst/logging"
|
||||||
|
"git.antanst.com/antanst/xerrors"
|
||||||
|
"github.com/guregu/null/v5"
|
||||||
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
|
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
|
||||||
"github.com/jmoiron/sqlx"
|
"github.com/jmoiron/sqlx"
|
||||||
"github.com/lib/pq"
|
"github.com/lib/pq"
|
||||||
)
|
)
|
||||||
|
|
||||||
func ConnectToDB() *sqlx.DB {
|
type DbService interface {
|
||||||
connStr := fmt.Sprintf("postgres://%s:%s@%s:%s/%s", //nolint:nosprintfhostport
|
// Core database methods
|
||||||
os.Getenv("PG_USER"),
|
Initialize(ctx context.Context) error
|
||||||
os.Getenv("PG_PASSWORD"),
|
Shutdown(ctx context.Context) error
|
||||||
os.Getenv("PG_HOST"),
|
NewTx(ctx context.Context) (*sqlx.Tx, error)
|
||||||
os.Getenv("PG_PORT"),
|
|
||||||
os.Getenv("PG_DATABASE"),
|
|
||||||
)
|
|
||||||
|
|
||||||
// Create a connection pool
|
// URL methods
|
||||||
db, err := sqlx.Open("pgx", connStr)
|
InsertURL(ctx context.Context, tx *sqlx.Tx, url string) error
|
||||||
if err != nil {
|
CheckAndUpdateNormalizedURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error
|
||||||
panic(fmt.Sprintf("Unable to connect to database with URL %s: %v\n", connStr, err))
|
DeleteURL(ctx context.Context, tx *sqlx.Tx, url string) error
|
||||||
}
|
MarkURLsAsBeingProcessed(ctx context.Context, tx *sqlx.Tx, urls []string) error
|
||||||
// TODO move PG_MAX_OPEN_CONNECTIONS to config env variables
|
GetUrlHosts(ctx context.Context, tx *sqlx.Tx) ([]string, error)
|
||||||
maxConnections, err := strconv.Atoi(os.Getenv("PG_MAX_OPEN_CONNECTIONS"))
|
GetRandomUrlsFromHosts(ctx context.Context, hosts []string, limit int, tx *sqlx.Tx) ([]string, error)
|
||||||
if err != nil {
|
|
||||||
panic(fmt.Sprintf("Unable to set max DB connections: %s\n", err))
|
// Snapshot methods
|
||||||
}
|
SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
|
||||||
db.SetMaxOpenConns(maxConnections)
|
OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
|
||||||
err = db.Ping()
|
UpdateLastCrawled(ctx context.Context, tx *sqlx.Tx, url string) error
|
||||||
if err != nil {
|
GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url string) (*snapshot.Snapshot, error)
|
||||||
panic(fmt.Sprintf("Unable to ping database: %v\n", err))
|
GetSnapshotAtTimestamp(ctx context.Context, tx *sqlx.Tx, url string, timestamp time.Time) (*snapshot.Snapshot, error)
|
||||||
|
GetAllSnapshotsForURL(ctx context.Context, tx *sqlx.Tx, url string) ([]*snapshot.Snapshot, error)
|
||||||
|
GetSnapshotsByDateRange(ctx context.Context, tx *sqlx.Tx, url string, startTime, endTime time.Time) ([]*snapshot.Snapshot, error)
|
||||||
|
IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
logging.LogDebug("Connected to database")
|
type DbServiceImpl struct {
|
||||||
return db
|
db *sqlx.DB
|
||||||
|
connected bool
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsDeadlockError checks if the error is a PostgreSQL deadlock error
|
var Database DbServiceImpl
|
||||||
|
|
||||||
|
// IsDeadlockError checks if the error is a PostgreSQL deadlock error.
|
||||||
func IsDeadlockError(err error) bool {
|
func IsDeadlockError(err error) bool {
|
||||||
|
err = errors.Unwrap(err)
|
||||||
var pqErr *pq.Error
|
var pqErr *pq.Error
|
||||||
if errors.As(err, &pqErr) {
|
if errors.As(err, &pqErr) {
|
||||||
return pqErr.Code == "40P01" // PostgreSQL deadlock error code
|
return pqErr.Code == "40P01" // PostgreSQL deadlock error code
|
||||||
@@ -53,134 +68,501 @@ func IsDeadlockError(err error) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetURLsToVisit(tx *sqlx.Tx) ([]string, error) {
|
// Initialize initializes the database with context
|
||||||
var urls []string
|
func (d *DbServiceImpl) Initialize(ctx context.Context) error {
|
||||||
err := tx.Select(&urls, SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS, config.CONFIG.WorkerBatchSize)
|
// Create a database-specific context
|
||||||
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Initializing database connection")
|
||||||
|
|
||||||
|
d.mu.Lock()
|
||||||
|
defer d.mu.Unlock()
|
||||||
|
|
||||||
|
if d.connected {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if the context is cancelled before proceeding
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a connection pool
|
||||||
|
connStr := config.CONFIG.PgURL
|
||||||
|
db, err := sqlx.Open("pgx", connStr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", common.ErrDatabase, err)
|
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Unable to connect to database with URL %s: %v", connStr, err)
|
||||||
|
return xerrors.NewError(fmt.Errorf("unable to connect to database with URL %s: %w", connStr, err), 0, "", true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Configure connection pool
|
||||||
|
db.SetMaxOpenConns(config.CONFIG.MaxDbConnections)
|
||||||
|
db.SetMaxIdleConns(config.CONFIG.MaxDbConnections / 2)
|
||||||
|
db.SetConnMaxLifetime(time.Minute * 5)
|
||||||
|
db.SetConnMaxIdleTime(time.Minute * 1)
|
||||||
|
|
||||||
|
// Check if the context is cancelled before proceeding with ping
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use PingContext for context-aware ping
|
||||||
|
err = db.PingContext(ctx)
|
||||||
|
if err != nil {
|
||||||
|
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Unable to ping database: %v", err)
|
||||||
|
return xerrors.NewError(fmt.Errorf("unable to ping database: %w", err), 0, "", true)
|
||||||
|
}
|
||||||
|
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Database connection initialized successfully")
|
||||||
|
d.db = db
|
||||||
|
d.connected = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *DbServiceImpl) Shutdown(ctx context.Context) error {
|
||||||
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Shutting down database connections")
|
||||||
|
_, err := d.db.Query("UPDATE urls SET being_processed=false")
|
||||||
|
if err != nil {
|
||||||
|
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Unable to update urls table: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
d.mu.Lock()
|
||||||
|
defer d.mu.Unlock()
|
||||||
|
|
||||||
|
if !d.connected {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
err = d.db.Close()
|
||||||
|
if err != nil {
|
||||||
|
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Error closing database connection: %v", err)
|
||||||
|
} else {
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Database connection closed successfully")
|
||||||
|
d.connected = false
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewTx creates a new transaction with context
|
||||||
|
func (d *DbServiceImpl) NewTx(ctx context.Context) (*sqlx.Tx, error) {
|
||||||
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
|
|
||||||
|
// Check if the context is cancelled before proceeding
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Context error before creating transaction: %v", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
tx, err := d.db.BeginTxx(ctx, nil)
|
||||||
|
if err != nil {
|
||||||
|
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Failed to create transaction: %v", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return tx, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// InsertURL inserts a URL with context
|
||||||
|
func (d *DbServiceImpl) InsertURL(ctx context.Context, tx *sqlx.Tx, url string) error {
|
||||||
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Inserting URL %s", url)
|
||||||
|
|
||||||
|
// Check if the context is cancelled before proceeding
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Context-aware implementation
|
||||||
|
normalizedURL, err := commonUrl.ParseURL(url, "", true)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
a := struct {
|
||||||
|
Url string
|
||||||
|
Host string
|
||||||
|
Timestamp time.Time
|
||||||
|
}{
|
||||||
|
Url: normalizedURL.Full,
|
||||||
|
Host: normalizedURL.Hostname,
|
||||||
|
Timestamp: time.Now(),
|
||||||
|
}
|
||||||
|
|
||||||
|
query := SQL_INSERT_URL
|
||||||
|
_, err = tx.NamedExecContext(ctx, query, a)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.NewError(fmt.Errorf("cannot insert URL: database error %w URL %s", err, url), 0, "", true)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// NormalizeURL normalizes a URL with context
|
||||||
|
func (d *DbServiceImpl) CheckAndUpdateNormalizedURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error {
|
||||||
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
|
|
||||||
|
// Check if URLs are already the same
|
||||||
|
if url == normalizedURL {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if the context is cancelled before proceeding
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Updating normalized URL %s -> %s", url, normalizedURL)
|
||||||
|
|
||||||
|
query := SQL_UPDATE_URL
|
||||||
|
a := struct {
|
||||||
|
Url string `db:"Url"`
|
||||||
|
NormalizedURL string `db:"NormalizedURL"`
|
||||||
|
}{
|
||||||
|
Url: url,
|
||||||
|
NormalizedURL: normalizedURL,
|
||||||
|
}
|
||||||
|
_, err := tx.NamedExecContext(ctx, query, a)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.NewError(fmt.Errorf("cannot update normalized URL: %w URL %s", err, url), 0, "", true)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteURL deletes a URL with context
|
||||||
|
func (d *DbServiceImpl) DeleteURL(ctx context.Context, tx *sqlx.Tx, url string) error {
|
||||||
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Deleting URL %s", url)
|
||||||
|
|
||||||
|
// Check if the context is cancelled before proceeding
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Context-aware implementation
|
||||||
|
query := SQL_DELETE_URL
|
||||||
|
_, err := tx.ExecContext(ctx, query, url)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.NewError(fmt.Errorf("cannot delete URL: database error %w URL %s", err, url), 0, "", true)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// MarkURLsAsBeingProcessed marks URLs as being processed with context
|
||||||
|
func (d *DbServiceImpl) MarkURLsAsBeingProcessed(ctx context.Context, tx *sqlx.Tx, urls []string) error {
|
||||||
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
|
|
||||||
|
// Skip if no URLs provided
|
||||||
|
if len(urls) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if the context is cancelled before proceeding
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Marking %d URLs as being processed", len(urls))
|
||||||
|
|
||||||
|
// Context-aware implementation
|
||||||
|
if len(urls) > 0 {
|
||||||
|
// Build a query with multiple parameters instead of using pq.Array
|
||||||
|
placeholders := make([]string, len(urls))
|
||||||
|
args := make([]interface{}, len(urls))
|
||||||
|
for i, url := range urls {
|
||||||
|
placeholders[i] = fmt.Sprintf("$%d", i+1)
|
||||||
|
args[i] = url
|
||||||
|
}
|
||||||
|
query := fmt.Sprintf(SQL_MARK_URLS_BEING_PROCESSED, strings.Join(placeholders, ","))
|
||||||
|
_, err := tx.ExecContext(ctx, query, args...)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.NewError(fmt.Errorf("cannot mark URLs as being processed: %w", err), 0, "", true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetUrlHosts gets URL hosts with context
|
||||||
|
func (d *DbServiceImpl) GetUrlHosts(ctx context.Context, tx *sqlx.Tx) ([]string, error) {
|
||||||
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Getting URL hosts")
|
||||||
|
|
||||||
|
// Check if the context is cancelled before proceeding
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Context-aware implementation
|
||||||
|
var hosts []string
|
||||||
|
var query string
|
||||||
|
if config.CONFIG.GopherEnable {
|
||||||
|
query = "SELECT DISTINCT(host) FROM urls WHERE being_processed IS NOT TRUE"
|
||||||
|
} else {
|
||||||
|
query = "SELECT DISTINCT(host) FROM urls WHERE url like 'gemini://%' AND being_processed IS NOT TRUE"
|
||||||
|
}
|
||||||
|
err := tx.SelectContext(ctx, &hosts, query)
|
||||||
|
if err != nil {
|
||||||
|
return nil, xerrors.NewError(err, 0, "", true)
|
||||||
|
}
|
||||||
|
return hosts, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetRandomUrlsFromHosts gets random URLs from hosts with context
|
||||||
|
func (d *DbServiceImpl) GetRandomUrlsFromHosts(ctx context.Context, hosts []string, limit int, tx *sqlx.Tx) ([]string, error) {
|
||||||
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Getting random URLs from %d hosts with limit %d", len(hosts), limit)
|
||||||
|
|
||||||
|
// Check if the context is cancelled before proceeding
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Context-aware implementation
|
||||||
|
var urls []string
|
||||||
|
var query string
|
||||||
|
for _, host := range hosts {
|
||||||
|
var results []string
|
||||||
|
if !config.CONFIG.GopherEnable {
|
||||||
|
query = "SELECT url FROM urls WHERE host=$1 AND url like 'gemini://%' AND being_processed IS NOT TRUE ORDER BY RANDOM() LIMIT $2"
|
||||||
|
} else {
|
||||||
|
query = "SELECT url FROM urls WHERE host=$1 AND being_processed IS NOT TRUE ORDER BY RANDOM() LIMIT $2"
|
||||||
|
}
|
||||||
|
err := tx.SelectContext(ctx, &results, query, host, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, xerrors.NewError(err, 0, "", true)
|
||||||
|
}
|
||||||
|
urls = append(urls, results...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check context cancellation before mark operation
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use context-aware method for marking URLs
|
||||||
|
err := d.MarkURLsAsBeingProcessed(ctx, tx, urls)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
}
|
}
|
||||||
return urls, nil
|
return urls, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func InsertURL(tx *sqlx.Tx, url string) error {
|
// SaveSnapshot saves a snapshot with context
|
||||||
query := SQL_INSERT_URL
|
func (d *DbServiceImpl) SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||||
_, err := tx.NamedExec(query, url)
|
dbCtx := contextutil.ContextWithComponent(ctx, "database")
|
||||||
if err != nil {
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Saving snapshot for URL %s", s.URL.String())
|
||||||
return fmt.Errorf("%w inserting URL: %w", common.ErrDatabase, err)
|
|
||||||
}
|
// Check if the context is cancelled before proceeding
|
||||||
return nil
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveSnapshotIfNew(tx *sqlx.Tx, s *common.Snapshot) error {
|
// Context-aware implementation
|
||||||
if config.CONFIG.DryRun {
|
if config.CONFIG.DryRun {
|
||||||
marshalled, err := json.MarshalIndent(s, "", " ")
|
marshalled, err := json.MarshalIndent(s, "", " ")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(fmt.Sprintf("JSON serialization error for %v", s))
|
return xerrors.NewError(fmt.Errorf("JSON serialization error for %v", s), 0, "", true)
|
||||||
}
|
|
||||||
logging.LogDebug("Would insert (if new) snapshot %s", marshalled)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
query := SQL_INSERT_SNAPSHOT_IF_NEW
|
|
||||||
_, err := tx.NamedExec(query, s)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err)
|
|
||||||
}
|
}
|
||||||
|
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Would save snapshot %s", marshalled)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func OverwriteSnapshot(workedID int, tx *sqlx.Tx, s *common.Snapshot) (err error) {
|
// Check context before expensive operations
|
||||||
// if config.CONFIG.DryRun {
|
if err := ctx.Err(); err != nil {
|
||||||
//marshalled, err := json.MarshalIndent(s, "", " ")
|
return err
|
||||||
//if err != nil {
|
}
|
||||||
// panic(fmt.Sprintf("JSON serialization error for %v", s))
|
|
||||||
//}
|
// Always ensure we have current timestamps
|
||||||
//logging.LogDebug("[%d] Would upsert snapshot %s", workedID, marshalled)
|
currentTime := time.Now()
|
||||||
// return nil
|
s.Timestamp = null.TimeFrom(currentTime)
|
||||||
// }
|
s.LastCrawled = null.TimeFrom(currentTime)
|
||||||
query := SQL_UPSERT_SNAPSHOT
|
|
||||||
-	rows, err := tx.NamedQuery(query, s)
+	// For PostgreSQL, use the global sqlx.NamedQueryContext function
+	// The SQL_INSERT_SNAPSHOT already has a RETURNING id clause
+	query := SQL_INSERT_SNAPSHOT
+	rows, err := sqlx.NamedQueryContext(ctx, tx, query, s)
 	if err != nil {
-		return fmt.Errorf("[%d] %w while upserting snapshot: %w", workedID, common.ErrDatabase, err)
+		return xerrors.NewError(fmt.Errorf("cannot save snapshot: %w", err), 0, "", true)
 	}
-	defer func() {
-		_err := rows.Close()
-		if _err != nil {
-			err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, common.ErrDatabase, _err)
-		}
-	}()
+	defer rows.Close()
+
+	// Scan the returned ID
 	if rows.Next() {
-		var returnedID int
-		err = rows.Scan(&returnedID)
+		err = rows.Scan(&s.ID)
 		if err != nil {
-			return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, common.ErrDatabase, err)
+			return xerrors.NewError(fmt.Errorf("cannot save snapshot: error scanning returned ID: %w", err), 0, "", true)
 		}
-		s.ID = returnedID
-		// logging.LogDebug("[%d] Upserted snapshot with ID %d", workedID, returnedID)
 	}
 	return nil
 }

-func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *common.Snapshot) (err error) {
-	// if config.CONFIG.DryRun {
-	//	marshalled, err := json.MarshalIndent(s, "", " ")
-	//	if err != nil {
-	//		panic(fmt.Sprintf("JSON serialization error for %v", s))
-	//	}
-	//	logging.LogDebug("[%d] Would upsert snapshot %s", workedID, marshalled)
-	//	return nil
-	// }
-	query := SQL_UPDATE_SNAPSHOT
-	rows, err := tx.NamedQuery(query, s)
-	if err != nil {
-		return fmt.Errorf("[%d] %w while updating snapshot: %w", workedID, common.ErrDatabase, err)
-	}
-	defer func() {
-		_err := rows.Close()
-		if _err != nil {
-			err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, common.ErrDatabase, _err)
-		}
-	}()
-	if rows.Next() {
-		var returnedID int
-		err = rows.Scan(&returnedID)
-		if err != nil {
-			return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, common.ErrDatabase, err)
-		}
-		s.ID = returnedID
-		// logging.LogDebug("[%d] Updated snapshot with ID %d", workedID, returnedID)
-	}
-	return nil
-}
-
-func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*common.Snapshot) error {
-	if config.CONFIG.DryRun {
-		return nil
-	}
-	const batchSize = 5000
-	query := SQL_INSERT_SNAPSHOT_IF_NEW
-	for i := 0; i < len(snapshots); i += batchSize {
-		end := i + batchSize
-		if end > len(snapshots) {
-			end = len(snapshots)
-		}
-		batch := snapshots[i:end]
-		_, err := tx.NamedExec(query, batch)
-		if err != nil {
-			return fmt.Errorf("%w: While saving links in batches: %w", common.ErrDatabase, err)
-		}
-	}
-	return nil
-}
-
-func SaveLinksToDB(tx *sqlx.Tx, snapshots []*common.Snapshot) error {
-	if config.CONFIG.DryRun {
-		return nil
-	}
-	query := SQL_INSERT_SNAPSHOT_IF_NEW
-	_, err := tx.NamedExec(query, snapshots)
-	if err != nil {
-		logging.LogError("GeminiError batch inserting snapshots: %w", err)
-		return fmt.Errorf("DB error: %w", err)
-	}
-	return nil
-}
+// OverwriteSnapshot overwrites a snapshot with context (maintained for backward compatibility)
+func (d *DbServiceImpl) OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
+	dbCtx := contextutil.ContextWithComponent(ctx, "database")
+	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Overwriting snapshot for URL %s", s.URL.String())
+
+	// Check if the context is cancelled before proceeding
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+
+	// Now simply delegate to SaveSnapshot which is already context-aware
+	return d.SaveSnapshot(ctx, tx, s)
+}
+
+// UpdateLastCrawled updates the last_crawled timestamp for the most recent snapshot of a URL
+func (d *DbServiceImpl) UpdateLastCrawled(ctx context.Context, tx *sqlx.Tx, url string) error {
+	dbCtx := contextutil.ContextWithComponent(ctx, "database")
+	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Updating last_crawled timestamp for URL %s", url)
+
+	// Check if the context is cancelled before proceeding
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+
+	// Update the last_crawled timestamp for the most recent snapshot
+	_, err := tx.ExecContext(ctx, SQL_UPDATE_LAST_CRAWLED, url)
+	if err != nil {
+		return xerrors.NewError(fmt.Errorf("cannot update last_crawled for URL %s: %w", url, err), 0, "", true)
+	}
+
+	return nil
+}
+
+// GetLatestSnapshot gets the latest snapshot with context
+func (d *DbServiceImpl) GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url string) (*snapshot.Snapshot, error) {
+	dbCtx := contextutil.ContextWithComponent(ctx, "database")
+	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Getting latest snapshot for URL %s", url)
+
+	// Check if the context is cancelled before proceeding
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	// Context-aware implementation
+	s := &snapshot.Snapshot{}
+	err := tx.GetContext(ctx, s, SQL_GET_LATEST_SNAPSHOT, url)
+	if err != nil {
+		if errors.Is(err, sql.ErrNoRows) {
+			return nil, nil
+		}
+		return nil, xerrors.NewError(fmt.Errorf("cannot get latest snapshot for URL %s: %w", url, err), 0, "", true)
+	}
+	return s, nil
+}
+
+// GetSnapshotAtTimestamp gets a snapshot at a specific timestamp with context
+func (d *DbServiceImpl) GetSnapshotAtTimestamp(ctx context.Context, tx *sqlx.Tx, url string, timestamp time.Time) (*snapshot.Snapshot, error) {
+	dbCtx := contextutil.ContextWithComponent(ctx, "database")
+	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Getting snapshot for URL %s at timestamp %v", url, timestamp)
+
+	// Check if the context is cancelled before proceeding
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	// Context-aware implementation
+	s := &snapshot.Snapshot{}
+	err := tx.GetContext(ctx, s, SQL_GET_SNAPSHOT_AT_TIMESTAMP, url, timestamp)
+	if err != nil {
+		if errors.Is(err, sql.ErrNoRows) {
+			return nil, xerrors.NewError(fmt.Errorf("no snapshot found for URL %s at or before %v", url, timestamp), 0, "", false)
+		}
+		return nil, xerrors.NewError(fmt.Errorf("cannot get snapshot for URL %s at timestamp %v: %w", url, timestamp, err), 0, "", false)
+	}
+	return s, nil
+}
+
+// GetAllSnapshotsForURL gets all snapshots for a URL with context
+func (d *DbServiceImpl) GetAllSnapshotsForURL(ctx context.Context, tx *sqlx.Tx, url string) ([]*snapshot.Snapshot, error) {
+	dbCtx := contextutil.ContextWithComponent(ctx, "database")
+	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Getting all snapshots for URL %s", url)
+
+	// Check if the context is cancelled before proceeding
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	// Context-aware implementation
+	snapshots := []*snapshot.Snapshot{}
+	err := tx.SelectContext(ctx, &snapshots, SQL_GET_ALL_SNAPSHOTS_FOR_URL, url)
+	if err != nil {
+		return nil, xerrors.NewError(fmt.Errorf("cannot get all snapshots for URL %s: %w", url, err), 0, "", false)
+	}
+	return snapshots, nil
+}
+
+// GetSnapshotsByDateRange gets snapshots by date range with context
+func (d *DbServiceImpl) GetSnapshotsByDateRange(ctx context.Context, tx *sqlx.Tx, url string, startTime, endTime time.Time) ([]*snapshot.Snapshot, error) {
+	dbCtx := contextutil.ContextWithComponent(ctx, "database")
+	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Getting snapshots for URL %s in date range %v to %v", url, startTime, endTime)
+
+	// Check if the context is cancelled before proceeding
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	// Context-aware implementation
+	snapshots := []*snapshot.Snapshot{}
+	err := tx.SelectContext(ctx, &snapshots, SQL_GET_SNAPSHOTS_BY_DATE_RANGE, url, startTime, endTime)
+	if err != nil {
+		return nil, xerrors.NewError(fmt.Errorf("cannot get snapshots for URL %s in date range %v to %v: %w",
+			url, startTime, endTime, err), 0, "", false)
+	}
+	return snapshots, nil
+}
+
+// IsContentIdentical checks if content is identical with context
+func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) {
+	dbCtx := contextutil.ContextWithComponent(ctx, "database")
+	contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Checking if content is identical for URL %s", s.URL.String())
+
+	// Check if the context is cancelled before proceeding
+	if err := ctx.Err(); err != nil {
+		return false, err
+	}
+
+	// Try to get the latest snapshot for this URL
+	latestSnapshot := &snapshot.Snapshot{}
+	err := tx.GetContext(ctx, latestSnapshot, SQL_GET_LATEST_SNAPSHOT, s.URL.String())
+	if err != nil {
+		// If there's no snapshot yet, it can't be identical
+		if errors.Is(err, sql.ErrNoRows) {
+			return false, nil
+		}
+		return false, xerrors.NewError(err, 0, "", true)
+	}
+
+	// Check context cancellation before potentially expensive comparison
+	if err := ctx.Err(); err != nil {
+		return false, err
+	}
+
+	// Check if the content is identical
+	if s.GemText.Valid && latestSnapshot.GemText.Valid {
+		return s.GemText.String == latestSnapshot.GemText.String, nil
+	} else if s.Data.Valid && latestSnapshot.Data.Valid {
+		return bytes.Equal(s.Data.V, latestSnapshot.Data.V), nil
+	}
+
+	return false, nil
+}
+
+// SafeRollback attempts to roll back a transaction,
+// handling the case if the tx was already finalized.
+func SafeRollback(ctx context.Context, tx *sqlx.Tx) error {
+	rollbackErr := tx.Rollback()
+	if rollbackErr != nil {
+		// Check if it's the standard "transaction already finalized" error
+		if errors.Is(rollbackErr, sql.ErrTxDone) {
+			contextlog.LogWarnWithContext(ctx, logging.GetSlogger(), "Rollback failed because transaction is already finalized")
+			return nil
+		}
+		// Only return error for other types of rollback failures
+		contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to rollback transaction: %v", rollbackErr)
+		return xerrors.NewError(fmt.Errorf("failed to rollback transaction: %w", rollbackErr), 0, "", true)
+	}
+	return nil
+}
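The insert-and-scan pattern above (a named `INSERT ... RETURNING id`, then reading the generated key from the returned cursor) is easy to reproduce in isolation. Below is a minimal, self-contained sketch of that pattern with sqlx; the DSN and the `snapshots_demo` table are hypothetical stand-ins, and the real crawler binds a full snapshot struct rather than a single column.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jmoiron/sqlx"
	_ "github.com/lib/pq"
)

type row struct {
	ID  int    `db:"id"`
	URL string `db:"url"`
}

func main() {
	ctx := context.Background()
	// Hypothetical DSN and table, for illustration only.
	db, err := sqlx.Connect("postgres", "dbname=example sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	tx, err := db.BeginTxx(ctx, nil)
	if err != nil {
		log.Fatal(err)
	}

	r := row{URL: "gemini://example.org/"}
	// NamedQueryContext binds :url from the struct tag and honors ctx cancellation;
	// RETURNING id hands the generated key back on the same round trip.
	rows, err := sqlx.NamedQueryContext(ctx, tx,
		`INSERT INTO snapshots_demo (url) VALUES (:url) RETURNING id`, &r)
	if err != nil {
		log.Fatal(err)
	}
	if rows.Next() {
		if err := rows.Scan(&r.ID); err != nil {
			log.Fatal(err)
		}
	}
	_ = rows.Close() // drain the cursor before issuing COMMIT on the same connection
	if err := tx.Commit(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("inserted id:", r.ID)
}
```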
db/db_queries.go
@@ -1,88 +1,74 @@
 package db

 const (
-	SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS = `
-SELECT *
-FROM snapshots
-WHERE response_code IS NULL
-AND error IS NULL
-ORDER BY RANDOM()
-FOR UPDATE SKIP LOCKED
-LIMIT $1
-`
 	SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS = `
 SELECT url
 FROM urls u
 WHERE u.id IN (
-    SELECT MIN(id)
+    SELECT id FROM (
+        SELECT id, ROW_NUMBER() OVER (PARTITION BY host ORDER BY id) as rn
     FROM urls
-    GROUP BY host
+    ) t
+    WHERE rn <= 3
 )
-LIMIT $1
-`
-	SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS = `
-SELECT *
-FROM snapshots s
-WHERE response_code IS NULL
-AND error IS NULL
-AND s.id IN (
-    SELECT MIN(id)
-    FROM snapshots
-    WHERE response_code IS NULL
-    AND error IS NULL
-    GROUP BY host
-)
 ORDER BY RANDOM()
 FOR UPDATE SKIP LOCKED
 LIMIT $1
 `
-	SQL_SELECT_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS = `
-SELECT *
-FROM snapshots s
-WHERE response_code IS NULL
-AND error IS NULL
-AND s.id IN (
-    SELECT MIN(id)
-    FROM snapshots
-    WHERE response_code IS NULL
-    AND error IS NULL
-    GROUP BY host
-)
-FOR UPDATE SKIP LOCKED
-LIMIT $1
-`
+	SQL_SELECT_RANDOM_URLS = `
+SELECT url
+FROM urls u
+WHERE u.being_processed IS NOT TRUE
+ORDER BY RANDOM()
+FOR UPDATE SKIP LOCKED
+LIMIT $1
+`
+	SQL_MARK_URLS_BEING_PROCESSED = `UPDATE urls SET being_processed = true WHERE url IN (%s)`
+	SQL_SELECT_RANDOM_URLS_GEMINI_ONLY = `
+SELECT url
+FROM urls u
+WHERE u.url like 'gemini://%'
+AND u.being_processed IS NOT TRUE
+ORDER BY RANDOM()
+FOR UPDATE SKIP LOCKED
+LIMIT $1
+`
+	SQL_SELECT_RANDOM_URLS_GEMINI_ONLY_2 = `
+WITH RankedUrls AS (
+    -- Step 1: Assign a random rank to each URL within its host group
+    SELECT
+        url,
+        host,
+        ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
+    FROM
+        urls
+    WHERE url like 'gemini://%'
+    AND being_processed IS NOT TRUE
+),
+OneUrlPerHost AS (
+    -- Step 2: Filter to keep only the first-ranked (random) URL per host
+    SELECT
+        url,
+        host
+    FROM
+        RankedUrls
+    WHERE
+        rn = 1
+)
+-- Step 3: From the set of one URL per host, randomly select X
+SELECT
+    url
+FROM
+    OneUrlPerHost
+ORDER BY
+    RANDOM()
+FOR UPDATE SKIP LOCKED
+LIMIT $1
+`
-	SQL_INSERT_SNAPSHOT_IF_NEW = `
-INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
-VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
-ON CONFLICT (url) DO NOTHING
-`
-	SQL_UPSERT_SNAPSHOT = `INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
-VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
-ON CONFLICT (url) DO UPDATE SET
-    url = EXCLUDED.url,
-    host = EXCLUDED.host,
-    timestamp = EXCLUDED.timestamp,
-    mimetype = EXCLUDED.mimetype,
-    data = EXCLUDED.data,
-    gemtext = EXCLUDED.gemtext,
-    links = EXCLUDED.links,
-    lang = EXCLUDED.lang,
-    response_code = EXCLUDED.response_code,
-    error = EXCLUDED.error
-RETURNING id
-`
-	SQL_UPDATE_SNAPSHOT = `UPDATE snapshots
-SET url = :url,
-    host = :host,
-    timestamp = :timestamp,
-    mimetype = :mimetype,
-    data = :data,
-    gemtext = :gemtext,
-    links = :links,
-    lang = :lang,
-    response_code = :response_code,
-    error = :error
-WHERE id = :id
+	// New query - always insert a new snapshot without conflict handling
+	SQL_INSERT_SNAPSHOT = `
+INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header, last_crawled)
+VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header, :last_crawled)
 RETURNING id
 `
 	SQL_INSERT_URL = `
@@ -90,4 +76,95 @@ RETURNING id
 VALUES (:url, :host, :timestamp)
 ON CONFLICT (url) DO NOTHING
 `
+	SQL_UPDATE_URL = `
+UPDATE urls
+SET url = :NormalizedURL
+WHERE url = :Url
+AND NOT EXISTS (
+    SELECT 1 FROM urls WHERE url = :NormalizedURL
+)
+`
+	SQL_DELETE_URL = `
+DELETE FROM urls WHERE url=$1
+`
+	SQL_GET_LATEST_SNAPSHOT = `
+SELECT * FROM snapshots
+WHERE url = $1
+ORDER BY timestamp DESC
+LIMIT 1
+`
+	SQL_GET_SNAPSHOT_AT_TIMESTAMP = `
+SELECT * FROM snapshots
+WHERE url = $1
+AND timestamp <= $2
+ORDER BY timestamp DESC
+LIMIT 1
+`
+	SQL_GET_ALL_SNAPSHOTS_FOR_URL = `
+SELECT * FROM snapshots
+WHERE url = $1
+ORDER BY timestamp DESC
+`
+	SQL_GET_SNAPSHOTS_BY_DATE_RANGE = `
+SELECT * FROM snapshots
+WHERE url = $1
+AND timestamp BETWEEN $2 AND $3
+ORDER BY timestamp DESC
+`
+	// Update last_crawled timestamp for the most recent snapshot of a URL
+	SQL_UPDATE_LAST_CRAWLED = `
+UPDATE snapshots
+SET last_crawled = CURRENT_TIMESTAMP
+WHERE url = $1
+`
+	// SQL_FETCH_SNAPSHOTS_FROM_HISTORY Fetches URLs from snapshots for re-crawling based on last_crawled timestamp
+	// This query finds root domain URLs that haven't been crawled recently and selects
+	// one URL per host for diversity. Uses CTEs to:
+	// 1. Find latest crawl attempt per URL (via MAX(last_crawled))
+	// 2. Filter to URLs with actual content and successful responses (20-29)
+	// 3. Select URLs where latest crawl is older than cutoff date
+	// 4. Rank randomly within each host and pick one URL per host
+	// Parameters: $1 = cutoff_date, $2 = limit
+	SQL_FETCH_SNAPSHOTS_FROM_HISTORY = `
+WITH latest_attempts AS (
+    SELECT
+        url,
+        host,
+        COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt
+    FROM snapshots
+    WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini' AND error IS NULL
+    GROUP BY url, host
+),
+root_urls_with_content AS (
+    SELECT DISTINCT
+        la.url,
+        la.host,
+        la.latest_attempt
+    FROM latest_attempts la
+    JOIN snapshots s ON s.url = la.url
+    WHERE (s.gemtext IS NOT NULL OR s.data IS NOT NULL)
+    AND s.response_code BETWEEN 20 AND 29
+),
+eligible_urls AS (
+    SELECT
+        url,
+        host,
+        latest_attempt
+    FROM root_urls_with_content
+    WHERE latest_attempt < $1
+),
+ranked_urls AS (
+    SELECT
+        url,
+        host,
+        latest_attempt,
+        ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rank
+    FROM eligible_urls
+)
+SELECT url, host
+FROM ranked_urls
+WHERE rank = 1
+ORDER BY RANDOM()
+LIMIT $2
+`
 )
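The host-diversity trick in these queries (rank rows randomly within each host with `ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM())`, then keep `rn = 1`) can be exercised on its own. A hedged sketch of driving it from Go follows; the DSN is a placeholder, the query is a trimmed variant of `SQL_SELECT_RANDOM_URLS_GEMINI_ONLY_2`, and `FOR UPDATE SKIP LOCKED` is omitted because it only matters inside a transaction competing with other workers.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jmoiron/sqlx"
	_ "github.com/lib/pq"
)

// One random URL per host, then a random sample of those hosts.
const oneURLPerHost = `
WITH RankedUrls AS (
    SELECT url, host,
           ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) AS rn
    FROM urls
    WHERE url LIKE 'gemini://%'
)
SELECT url FROM RankedUrls
WHERE rn = 1
ORDER BY RANDOM()
LIMIT $1`

func main() {
	ctx := context.Background()
	db, err := sqlx.Connect("postgres", "dbname=example sslmode=disable") // placeholder DSN
	if err != nil {
		log.Fatal(err)
	}
	var urls []string
	// SelectContext scans each single-column result row into the slice.
	if err := db.SelectContext(ctx, &urls, oneURLPerHost, 10); err != nil {
		log.Fatal(err)
	}
	for _, u := range urls {
		fmt.Println(u)
	}
}
```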
@@ -1,55 +0,0 @@
-package gemini
-
-import (
-	"fmt"
-	"gemini-grc/common"
-	"os"
-	"strings"
-
-	"gemini-grc/config"
-	"gemini-grc/logging"
-)
-
-var Blacklist *[]string //nolint:gochecknoglobals
-
-func LoadBlacklist() {
-	if Blacklist == nil {
-		data, err := os.ReadFile(config.CONFIG.BlacklistPath)
-		if err != nil {
-			Blacklist = &[]string{}
-			logging.LogWarn("Could not load Blacklist file: %v", err)
-			return
-		}
-		lines := strings.Split(string(data), "\n")
-
-		// Ignore lines starting with '#' (comments)
-		filteredLines := func() []string {
-			out := make([]string, 0, len(lines))
-			for _, line := range lines {
-				if !strings.HasPrefix(line, "#") {
-					out = append(out, line)
-				}
-			}
-			return out
-		}()
-
-		if len(lines) > 0 {
-			Blacklist = &filteredLines
-			logging.LogInfo("Blacklist has %d entries", len(*Blacklist))
-		}
-	}
-}
-
-func IsBlacklisted(u string) bool {
-	url, err := common.ParseURL(u, "")
-	if err != nil {
-		return false
-	}
-	hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
-	for _, v := range *Blacklist {
-		if v == url.Hostname || v == hostWithPort {
-			return true
-		}
-	}
-	return false
-}
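The deleted blacklist matched an entry against either the bare hostname or a `host:port` pair. A self-contained sketch of that check, using the standard library's URL parser and a hard-coded list standing in for the file the crawler loaded:

```go
package main

import (
	"fmt"
	"net/url"
)

// Illustrative entries only; the crawler read these from a blacklist file.
var blacklist = []string{"spam.example", "slow.example:1965"}

func isBlacklisted(raw string) bool {
	u, err := url.Parse(raw)
	if err != nil {
		return false
	}
	host := u.Hostname()
	port := u.Port()
	if port == "" {
		port = "1965" // Gemini default port
	}
	hostWithPort := fmt.Sprintf("%s:%s", host, port)
	// An entry blocks either the whole host or one specific host:port.
	for _, entry := range blacklist {
		if entry == host || entry == hostWithPort {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(isBlacklisted("gemini://spam.example/page"))  // true
	fmt.Println(isBlacklisted("gemini://slow.example:1965/")) // true
	fmt.Println(isBlacklisted("gemini://ok.example/"))        // false
}
```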
@@ -1,28 +0,0 @@
-package gemini
-
-import (
-	"gemini-grc/logging"
-)
-
-var IPPool = IpAddressPool{IPs: make(map[string]int)}
-
-func AddIPsToPool(ips []string) {
-	IPPool.Lock.Lock()
-	for _, ip := range ips {
-		logging.LogDebug("Adding %s to pool", ip)
-		IPPool.IPs[ip] = 1
-	}
-	IPPool.Lock.Unlock()
-}
-
-func RemoveIPsFromPool(IPs []string) {
-	IPPool.Lock.Lock()
-	for _, ip := range IPs {
-		_, ok := IPPool.IPs[ip]
-		if ok {
-			logging.LogDebug("Removing %s from pool", ip)
-			delete(IPPool.IPs, ip)
-		}
-	}
-	IPPool.Lock.Unlock()
-}
gemini/errors.go
@@ -0,0 +1,64 @@
+package gemini
+
+import (
+	"errors"
+	"fmt"
+)
+
+// GeminiError is used to represent
+// Gemini network protocol errors only.
+// Should be recorded to the snapshot.
+// See https://geminiprotocol.net/docs/protocol-specification.gmi
+type GeminiError struct {
+	Msg    string
+	Code   int
+	Header string
+}
+
+func (e *GeminiError) Error() string {
+	return fmt.Sprintf("gemini error: code %d %s", e.Code, e.Msg)
+}
+
+func (e *GeminiError) String() string {
+	return e.Error()
+}
+
+// NewGeminiError creates a new GeminiError based on the status code and header.
+// Status codes are based on the Gemini protocol specification:
+// - 1x: Input required
+// - 2x: Success (not handled as errors)
+// - 3x: Redirect
+// - 4x: Temporary failure
+// - 5x: Permanent failure
+// - 6x: Client certificate required/rejected
+func NewGeminiError(code int, header string) error {
+	var msg string
+	switch {
+	case code >= 10 && code < 20:
+		msg = fmt.Sprintf("input required: %s", header)
+	case code >= 30 && code < 40:
+		msg = fmt.Sprintf("redirect: %s", header)
+	case code >= 40 && code < 50:
+		msg = fmt.Sprintf("request failed: %s", header)
+	case code >= 50 && code < 60:
+		msg = fmt.Sprintf("server error: %s", header)
+	case code >= 60 && code < 70:
+		msg = fmt.Sprintf("TLS error: %s", header)
+	default:
+		msg = fmt.Sprintf("unexpected status code %d: %s", code, header)
+	}
+	return &GeminiError{
+		Msg:    msg,
+		Code:   code,
+		Header: header,
+	}
+}
+
+// IsGeminiError checks if the given error is a GeminiError.
+func IsGeminiError(err error) bool {
+	if err == nil {
+		return false
+	}
+	var asError *GeminiError
+	return errors.As(err, &asError)
+}
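Because `GeminiError` carries the protocol status code and header, callers can recover those details even from wrapped errors. A short sketch of such a caller, where only `GeminiError` comes from this package and `classifyResponse` is a hypothetical helper:

```go
package gemini

import (
	"errors"
	"fmt"
)

// classifyResponse is a hypothetical caller illustrating the pattern.
func classifyResponse(err error) string {
	var ge *GeminiError
	if errors.As(err, &ge) {
		// Code and Header survive wrapping with fmt.Errorf("...: %w", err).
		return fmt.Sprintf("gemini failure, code=%d header=%q", ge.Code, ge.Header)
	}
	if err != nil {
		return "non-protocol error: " + err.Error()
	}
	return "success"
}
```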
gemini/errors_test.go
@@ -0,0 +1,36 @@
+package gemini
+
+import (
+	"errors"
+	"fmt"
+	"testing"
+)
+
+func TestErrGemini(t *testing.T) {
+	t.Parallel()
+	err := NewGeminiError(50, "50 server error")
+	if !errors.As(err, new(*GeminiError)) {
+		t.Errorf("TestErrGemini fail")
+	}
+}
+
+func TestErrGeminiWrapped(t *testing.T) {
+	t.Parallel()
+	err := NewGeminiError(50, "50 server error")
+	errWrapped := fmt.Errorf("%w wrapped", err)
+	if !errors.As(errWrapped, new(*GeminiError)) {
+		t.Errorf("TestErrGeminiWrapped fail")
+	}
+}
+
+func TestIsGeminiError(t *testing.T) {
+	t.Parallel()
+	err1 := NewGeminiError(50, "50 server error")
+	if !IsGeminiError(err1) {
+		t.Errorf("TestGeminiError fail #1")
+	}
+	wrappedErr1 := fmt.Errorf("wrapped %w", err1)
+	if !IsGeminiError(wrappedErr1) {
+		t.Errorf("TestGeminiError fail #2")
+	}
+}
gemini/files.go
@@ -1,114 +0,0 @@
-package gemini
-
-import (
-	"fmt"
-	"gemini-grc/common"
-	"net/url"
-	"os"
-	"path"
-	"path/filepath"
-	"strings"
-
-	"gemini-grc/logging"
-)
-
-// sanitizePath encodes invalid filesystem characters using URL encoding.
-// Example:
-// /example/path/to/page?query=param&another=value
-// would become
-// example/path/to/page%3Fquery%3Dparam%26another%3Dvalue
-func sanitizePath(p string) string {
-	// Split the path into its components
-	components := strings.Split(p, "/")
-
-	// Encode each component separately
-	for i, component := range components {
-		// Decode any existing percent-encoded characters
-		decodedComponent, err := url.PathUnescape(component)
-		if err != nil {
-			decodedComponent = component // Fallback to original if unescape fails
-		}
-
-		// Encode the component to escape invalid filesystem characters
-		encodedComponent := url.QueryEscape(decodedComponent)
-
-		// Replace '+' (from QueryEscape) with '%20' to handle spaces correctly
-		encodedComponent = strings.ReplaceAll(encodedComponent, "+", "%20")
-
-		components[i] = encodedComponent
-	}
-
-	// Rejoin the components into a sanitized path
-	safe := filepath.Join(components...)
-
-	return safe
-}
-
-// getFilePath constructs a safe file path from the root path and URL path.
-// It URL-encodes invalid filesystem characters to ensure the path is valid.
-func calcFilePath(rootPath, urlPath string) (string, error) {
-	// Normalize the URL path
-	cleanPath := filepath.Clean(urlPath)
-
-	// Safe check to prevent directory traversal
-	if strings.Contains(cleanPath, "..") {
-		return "", fmt.Errorf("Invalid URL path: contains directory traversal")
-	}
-
-	// Sanitize the path by encoding invalid characters
-	safePath := sanitizePath(cleanPath)
-
-	// Join the root path and the sanitized URL path
-	finalPath := filepath.Join(rootPath, safePath)
-
-	return finalPath, nil
-}
-
-func SaveToFile(rootPath string, s *common.Snapshot, done chan struct{}) {
-	parentPath := path.Join(rootPath, s.URL.Hostname)
-	urlPath := s.URL.Path
-	// If path is empty, add `index.gmi` as the file to save
-	if urlPath == "" || urlPath == "." {
-		urlPath = "index.gmi"
-	}
-	// If path ends with '/' then add index.gmi for the
-	// directory to be created.
-	if strings.HasSuffix(urlPath, "/") {
-		urlPath = strings.Join([]string{urlPath, "index.gmi"}, "")
-	}
-
-	finalPath, err := calcFilePath(parentPath, urlPath)
-	if err != nil {
-		logging.LogError("GeminiError saving %s: %w", s.URL, err)
-		return
-	}
-	// Ensure the directory exists
-	dir := filepath.Dir(finalPath)
-	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
-		logging.LogError("Failed to create directory: %w", err)
-		return
-	}
-	if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
-		err = os.WriteFile(finalPath, (*s).Data.V, 0o666)
-	} else {
-		err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
-	}
-	if err != nil {
-		logging.LogError("GeminiError saving %s: %w", s.URL.Full, err)
-	}
-	close(done)
-}
-
-func ReadLines(path string) []string {
-	data, err := os.ReadFile(path)
-	if err != nil {
-		panic(fmt.Sprintf("Failed to read file: %s", err))
-	}
-	lines := strings.Split(string(data), "\n")
-	// Remove last line if empty
-	// (happens when file ends with '\n')
-	if lines[len(lines)-1] == "" {
-		lines = lines[:len(lines)-1]
-	}
-	return lines
-}
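The per-component encoding trick in the deleted `sanitizePath` (decode first, re-encode with `QueryEscape`, then map `+` back to `%20`) is worth seeing in isolation, since it is what keeps `?` and `&` out of on-disk filenames. A standalone sketch of the same transformation:

```go
package main

import (
	"fmt"
	"net/url"
	"path/filepath"
	"strings"
)

func sanitizePath(p string) string {
	components := strings.Split(p, "/")
	for i, component := range components {
		// Undo any existing percent-encoding first, falling back on error.
		decoded, err := url.PathUnescape(component)
		if err != nil {
			decoded = component
		}
		encoded := url.QueryEscape(decoded)
		// QueryEscape turns spaces into '+'; filenames want '%20' instead.
		components[i] = strings.ReplaceAll(encoded, "+", "%20")
	}
	// filepath.Join drops empty leading components, so the result is relative.
	return filepath.Join(components...)
}

func main() {
	fmt.Println(sanitizePath("/example/path/to/page?query=param&another=value"))
	// Output: example/path/to/page%3Fquery%3Dparam%26another%3Dvalue
}
```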
gemini/gemini.go
@@ -1,140 +0,0 @@
-package gemini
-
-import (
-	"fmt"
-	"gemini-grc/common"
-	"net/url"
-	"regexp"
-	"strconv"
-
-	"gemini-grc/logging"
-)
-
-func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList {
-	// Grab link lines
-	linkLines := ExtractLinkLines(gemtext)
-	if len(linkLines) == 0 {
-		return nil
-	}
-	var linkURLs common.LinkList
-	// Normalize URLs in links, and store them in snapshot
-	for _, line := range linkLines {
-		linkURL, err := NormalizeLink(line, currentURL.String())
-		if err != nil {
-			logging.LogDebug("%s: %s", common.ErrGeminiLinkLineParse, err)
-			continue
-		}
-		linkURLs = append(linkURLs, *linkURL)
-	}
-	return linkURLs
-}
-
-// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
-func ExtractLinkLines(gemtext string) []string {
-	// Define the regular expression pattern to match link lines
-	re := regexp.MustCompile(`(?m)^=>[ \t]+.*`)
-
-	// Find all matches using the regular expression
-	matches := re.FindAllString(gemtext, -1)
-
-	return matches
-}
-
-// NormalizeLink takes a single link line and the current URL,
-// return the URL converted to an absolute URL
-// and its description.
-func NormalizeLink(linkLine string, currentURL string) (*common.URL, error) {
-	// Parse the current URL
-	baseURL, err := url.Parse(currentURL)
-	if err != nil {
-		return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
-	}
-
-	// Regular expression to extract the URL part from a link line
-	re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)
-
-	// Use regex to extract the URL and the rest of the line
-	matches := re.FindStringSubmatch(linkLine)
-	if len(matches) == 0 {
-		// If the line doesn't match the expected format, return it unchanged
-		return nil, fmt.Errorf("%w for link line %s", common.ErrGeminiLinkLineParse, linkLine)
-	}
-
-	originalURLStr := matches[1]
-	_, err = url.QueryUnescape(originalURLStr)
-	if err != nil {
-		return nil, fmt.Errorf("%w: %w", common.ErrURLDecode, err)
-	}
-
-	restOfLine := ""
-	if len(matches) > 2 {
-		restOfLine = matches[2]
-	}
-
-	// Parse the URL from the link line
-	parsedURL, err := url.Parse(originalURLStr)
-	if err != nil {
-		// If URL parsing fails, return an error
-		return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
-	}
-
-	// Resolve relative URLs against the base URL
-	if !parsedURL.IsAbs() {
-		parsedURL = baseURL.ResolveReference(parsedURL)
-	}
-
-	// Remove usual first space from URL description:
-	// => URL description
-	//        ^^^^^^^^^^^^
-	if len(restOfLine) > 0 && restOfLine[0] == ' ' {
-		restOfLine = restOfLine[1:]
-	}
-
-	finalURL, err := common.ParseURL(parsedURL.String(), restOfLine)
-	if err != nil {
-		// If URL parsing fails, return an error
-		return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
-	}
-
-	return finalURL, nil
-}
-
-// ParseFirstTwoDigits takes a string and returns the first one or two digits as an int.
-// If no valid digits are found, it returns an error.
-func ParseFirstTwoDigits(input string) (int, error) {
-	// Define the regular expression pattern to match one or two leading digits
-	re := regexp.MustCompile(`^(\d{1,2})`)
-
-	// Find the first match in the string
-	matches := re.FindStringSubmatch(input)
-	if len(matches) == 0 {
-		return 0, fmt.Errorf("%w", common.ErrGeminiResponseHeader)
-	}
-
-	// Parse the captured match as an integer
-	snapshot, err := strconv.Atoi(matches[1])
-	if err != nil {
-		return 0, fmt.Errorf("%w: %w", common.ErrTextParse, err)
-	}
-
-	return snapshot, nil
-}
-
-// extractRedirectTarget returns the redirection
-// URL by parsing the header (or error message)
-func extractRedirectTarget(currentURL common.URL, input string) (*common.URL, error) {
-	// \d+      - matches one or more digits
-	// \s+      - matches one or more whitespace
-	// ([^\r]+) - captures everything until it hits a \r (or end of string)
-	pattern := `\d+\s+([^\r]+)`
-	re := regexp.MustCompile(pattern)
-	matches := re.FindStringSubmatch(input)
-	if len(matches) < 2 {
-		return nil, fmt.Errorf("%w: %s", common.ErrGeminiRedirect, input)
-	}
-	newURL, err := common.DeriveAbsoluteURL(currentURL, matches[1])
-	if err != nil {
-		return nil, fmt.Errorf("%w: %w: %s", common.ErrGeminiRedirect, err, input)
-	}
-	return newURL, nil
-}
gemini/geminiLinks.go
@@ -0,0 +1,87 @@
+package gemini
+
+import (
+	"fmt"
+	"net/url"
+	"regexp"
+
+	"gemini-grc/common/linkList"
+	url2 "gemini-grc/common/url"
+	"gemini-grc/util"
+	"git.antanst.com/antanst/logging"
+	"git.antanst.com/antanst/xerrors"
+)
+
+func GetPageLinks(currentURL url2.URL, gemtext string) linkList.LinkList {
+	linkLines := util.GetLinesMatchingRegex(gemtext, `(?m)^=>[ \t]+.*`)
+	if len(linkLines) == 0 {
+		return nil
+	}
+	var linkURLs linkList.LinkList
+	// Normalize URLs in links
+	for _, line := range linkLines {
+		linkUrl, err := ParseGeminiLinkLine(line, currentURL.String())
+		if err != nil {
+			logging.LogDebug("error parsing gemini link line: %s", err)
+			continue
+		}
+		linkURLs = append(linkURLs, *linkUrl)
+	}
+	return linkURLs
+}
+
+// ParseGeminiLinkLine takes a single link line and the current URL,
+// return the URL converted to an absolute URL
+// and its description.
+func ParseGeminiLinkLine(linkLine string, currentURL string) (*url2.URL, error) {
+	// Check: currentURL is parseable
+	baseURL, err := url.Parse(currentURL)
+	if err != nil {
+		return nil, xerrors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine), 0, "", false)
+	}
+
+	// Extract the actual URL and the description
+	re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)
+	matches := re.FindStringSubmatch(linkLine)
+	if len(matches) == 0 {
+		return nil, xerrors.NewError(fmt.Errorf("error parsing link line: no regexp match for line %s", linkLine), 0, "", false)
+	}
+
+	originalURLStr := matches[1]
+
+	// Check: Unescape the URL if escaped
+	_, err = url.QueryUnescape(originalURLStr)
+	if err != nil {
+		return nil, xerrors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine), 0, "", false)
+	}
+
+	description := ""
+	if len(matches) > 2 {
+		description = matches[2]
+	}
+
+	// Parse the URL from the link line
+	parsedURL, err := url.Parse(originalURLStr)
+	if err != nil {
+		return nil, xerrors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine), 0, "", false)
+	}
+
+	// If link URL is relative, resolve full URL
+	if !parsedURL.IsAbs() {
+		parsedURL = baseURL.ResolveReference(parsedURL)
+	}
+
+	// remove usual first space from URL description:
+	// => URL description
+	//        ^^^^^^^^^^^^
+	if len(description) > 0 && description[0] == ' ' {
+		description = description[1:]
+	}
+
+	finalURL, err := url2.ParseURL(parsedURL.String(), description, true)
+	if err != nil {
+		return nil, xerrors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine), 0, "", false)
+	}
+
+	return finalURL, nil
+}
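The two regexes above carry the whole gemtext link grammar: `(?m)^=>[ \t]+.*` selects link lines, and `^=>[ \t]+(\S+)([ \t]+.*)?` splits each into target and description. The following self-contained sketch applies them to a small document without the package's URL types, so the behavior can be checked in isolation (the tests in the next file exercise the full `ParseGeminiLinkLine`):

```go
package main

import (
	"fmt"
	"regexp"
)

var (
	linkLineRe  = regexp.MustCompile(`(?m)^=>[ \t]+.*`)
	linkPartsRe = regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)
)

func main() {
	gemtext := "# Home\n=> gemini://example.org/a First link\n=>\tb/c.gmi\nplain text\n"
	for _, line := range linkLineRe.FindAllString(gemtext, -1) {
		m := linkPartsRe.FindStringSubmatch(line)
		if m == nil {
			continue
		}
		target, descr := m[1], m[2]
		// Drop the usual single separator before the description.
		if len(descr) > 0 && (descr[0] == ' ' || descr[0] == '\t') {
			descr = descr[1:]
		}
		fmt.Printf("target=%q descr=%q\n", target, descr)
	}
	// target="gemini://example.org/a" descr="First link"
	// target="b/c.gmi" descr=""
}
```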
gemini/geminiLinks_test.go
@@ -0,0 +1,122 @@
+package gemini
+
+import (
+	"reflect"
+	"strings"
+	"testing"
+
+	"gemini-grc/common/url"
+)
+
+type TestData struct {
+	currentURL string
+	link       string
+	value      *url.URL
+	error      string
+}
+
+var data = []TestData{
+	{
+		currentURL: "https://gemini.com/",
+		link:       "https://gemini.com/",
+		value:      nil,
+		error:      "error parsing link line",
+	},
+	{
+		currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
+		link:       "=> archive/ Complete Archive",
+		value: &url.URL{
+			Protocol: "gemini",
+			Hostname: "gemi.dev",
+			Port:     1965,
+			Path:     "/cgi-bin/xkcd/archive/",
+			Descr:    "Complete Archive",
+			Full:     "gemini://gemi.dev:1965/cgi-bin/xkcd/archive/",
+		},
+		error: "",
+	},
+	{
+		currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
+		link:       "=> /cgi-bin/xkcd.cgi?a=5&b=6 Example",
+		value: &url.URL{
+			Protocol: "gemini",
+			Hostname: "gemi.dev",
+			Port:     1965,
+			Path:     "/cgi-bin/xkcd.cgi",
+			Descr:    "Example",
+			Full:     "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?a=5&b=6",
+		},
+		error: "",
+	},
+	{
+		currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
+		link:       "=> /cgi-bin/xkcd.cgi?1494 XKCD 1494: Insurance",
+		value: &url.URL{
+			Protocol: "gemini",
+			Hostname: "gemi.dev",
+			Port:     1965,
+			Path:     "/cgi-bin/xkcd.cgi",
+			Descr:    "XKCD 1494: Insurance",
+			Full:     "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494",
+		},
+		error: "",
+	},
+	{
+		currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
+		link:       "=> /cgi-bin/xkcd.cgi?1494#f XKCD 1494: Insurance",
+		value: &url.URL{
+			Protocol: "gemini",
+			Hostname: "gemi.dev",
+			Port:     1965,
+			Path:     "/cgi-bin/xkcd.cgi",
+			Descr:    "XKCD 1494: Insurance",
+			Full:     "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494#f",
+		},
+		error: "",
+	},
+	{
+		currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
+		link:       "=> /cgi-bin/xkcd.cgi?c=5#d XKCD 1494: Insurance",
+		value: &url.URL{
+			Protocol: "gemini",
+			Hostname: "gemi.dev",
+			Port:     1965,
+			Path:     "/cgi-bin/xkcd.cgi",
+			Descr:    "XKCD 1494: Insurance",
+			Full:     "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?c=5#d",
+		},
+		error: "",
+	},
+	{
+		currentURL: "gemini://a.b/c#d",
+		link:       "=> /d/e#f",
+		value: &url.URL{
+			Protocol: "gemini",
+			Hostname: "a.b",
+			Port:     1965,
+			Path:     "/d/e",
+			Descr:    "",
+			Full:     "gemini://a.b:1965/d/e#f",
+		},
+		error: "",
+	},
+}
+
+func Test(t *testing.T) {
+	t.Parallel()
+	for i, expected := range data {
+		result, err := ParseGeminiLinkLine(expected.link, expected.currentURL)
+		if err != nil { //nolint:nestif
+			if expected.value != nil {
+				t.Errorf("data[%d]: Expected value %v, got %v", i, nil, expected.value)
+			}
+			if !strings.HasPrefix(err.Error(), expected.error) {
+				t.Errorf("data[%d]: expected error %v, got %v", i, expected.error, err)
+			}
+		} else {
+			if !(reflect.DeepEqual(result, expected.value)) {
+				t.Errorf("data[%d]: expected %#v, got %#v", i, expected.value, result)
+			}
+		}
+	}
+}
@@ -1,68 +0,0 @@
-package gemini
-
-import (
-	"gemini-grc/common"
-	"testing"
-)
-
-func TestExtractRedirectTargetFullURL(t *testing.T) {
-	t.Parallel()
-	currentURL, _ := common.ParseURL("gemini://smol.gr", "")
-	input := "redirect: 31 gemini://target.gr"
-	result, err := extractRedirectTarget(*currentURL, input)
-	expected := "gemini://target.gr:1965"
-	if err != nil || (result.String() != expected) {
-		t.Errorf("fail: Expected %s got %s", expected, result)
-	}
-}
-
-func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
-	t.Parallel()
-	currentURL, _ := common.ParseURL("gemini://smol.gr", "")
-	input := "redirect: 31 gemini://target.gr/"
-	result, err := extractRedirectTarget(*currentURL, input)
-	expected := "gemini://target.gr:1965/"
-	if err != nil || (result.String() != expected) {
-		t.Errorf("fail: Expected %s got %s", expected, result)
-	}
-}
-
-func TestExtractRedirectTargetRelativeURL(t *testing.T) {
-	t.Parallel()
-	currentURL, _ := common.ParseURL("gemini://smol.gr", "")
-	input := "redirect: 31 /a/b"
-	result, err := extractRedirectTarget(*currentURL, input)
-	if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
-		t.Errorf("fail: %s", result)
-	}
-}
-
-func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
-	t.Parallel()
-	currentURL, _ := common.ParseURL("gemini://nox.im:1965", "")
-	input := "redirect: 31 ./"
-	result, err := extractRedirectTarget(*currentURL, input)
-	if err != nil || (result.String() != "gemini://nox.im:1965/") {
-		t.Errorf("fail: %s", result)
-	}
-}
-
-func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
-	t.Parallel()
-	currentURL, _ := common.ParseURL("gemini://status.zvava.org:1965", "")
-	input := "redirect: 31 index.gmi"
-	result, err := extractRedirectTarget(*currentURL, input)
-	if err != nil || (result.String() != "gemini://status.zvava.org:1965/index.gmi") {
-		t.Errorf("fail: %s", result)
-	}
-}
-
-func TestExtractRedirectTargetWrong(t *testing.T) {
-	t.Parallel()
-	currentURL, _ := common.ParseURL("gemini://smol.gr", "")
-	input := "redirect: 31"
-	result, err := extractRedirectTarget(*currentURL, input)
-	if result != nil || err == nil {
-		t.Errorf("fail: result should be nil, err is %s", err)
-	}
-}
@@ -1,54 +0,0 @@
-package gemini
-
-import "sync"
-
-// Used to limit requests per
-// IP address. Maps IP address
-// to number of active connections.
-type IpAddressPool struct {
-	IPs  map[string]int
-	Lock sync.RWMutex
-}
-
-func (p *IpAddressPool) Set(key string, value int) {
-	p.Lock.Lock()         // Lock for writing
-	defer p.Lock.Unlock() // Ensure mutex is unlocked after the write
-	p.IPs[key] = value
-}
-
-func (p *IpAddressPool) Get(key string) int {
-	p.Lock.RLock()         // Lock for reading
-	defer p.Lock.RUnlock() // Ensure mutex is unlocked after reading
-	if value, ok := p.IPs[key]; !ok {
-		return 0
-	} else {
-		return value
-	}
-}
-
-func (p *IpAddressPool) Delete(key string) {
-	p.Lock.Lock()
-	defer p.Lock.Unlock()
-	delete(p.IPs, key)
-}
-
-func (p *IpAddressPool) Incr(key string) {
-	p.Lock.Lock()
-	defer p.Lock.Unlock()
-	if _, ok := p.IPs[key]; !ok {
-		p.IPs[key] = 1
-	} else {
-		p.IPs[key] = p.IPs[key] + 1
-	}
-}
-
-func (p *IpAddressPool) Decr(key string) {
-	p.Lock.Lock()
-	defer p.Lock.Unlock()
-	if val, ok := p.IPs[key]; ok {
-		p.IPs[key] = val - 1
-		if p.IPs[key] == 0 {
-			delete(p.IPs, key)
-		}
-	}
-}
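The pool removed here is essentially a mutex-guarded reference counter keyed by IP: increment on connect, decrement on disconnect, and drop the entry when its count reaches zero. A generic, self-contained sketch of that pattern, under the assumption one wanted to reuse it elsewhere:

```go
package main

import (
	"fmt"
	"sync"
)

type refCounter struct {
	mu     sync.Mutex
	counts map[string]int
}

func newRefCounter() *refCounter {
	return &refCounter{counts: make(map[string]int)}
}

func (c *refCounter) Incr(key string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.counts[key]++
}

func (c *refCounter) Decr(key string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if v, ok := c.counts[key]; ok {
		if v <= 1 {
			delete(c.counts, key) // drop the entry at zero, as the pool did
		} else {
			c.counts[key] = v - 1
		}
	}
}

func (c *refCounter) Get(key string) int {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.counts[key]
}

func main() {
	pool := newRefCounter()
	pool.Incr("203.0.113.7")
	pool.Incr("203.0.113.7")
	pool.Decr("203.0.113.7")
	fmt.Println(pool.Get("203.0.113.7")) // 1
}
```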
@@ -1,201 +1,243 @@
 package gemini

 import (
+	"context"
 	"crypto/tls"
 	"errors"
 	"fmt"
-	"gemini-grc/common"
 	"io"
 	"net"
-	gourl "net/url"
+	stdurl "net/url"
 	"regexp"
 	"slices"
 	"strconv"
 	"strings"
 	"time"

+	"gemini-grc/common/contextlog"
+	"gemini-grc/common/snapshot"
+	_url "gemini-grc/common/url"
 	"gemini-grc/config"
-	"gemini-grc/logging"
+	"gemini-grc/contextutil"
+	"git.antanst.com/antanst/logging"
+	"git.antanst.com/antanst/xerrors"
 	"github.com/guregu/null/v5"
 )

-type PageData struct {
-	ResponseCode   int
-	ResponseHeader string
-	MimeType       string
-	Lang           string
-	GemText        string
-	Data           []byte
-}
+// Visit visits a given URL using the Gemini protocol,
+// and returns a populated snapshot. Any relevant errors
+// when visiting the URL are stored in the snapshot;
+// an error is returned only when construction of a
+// snapshot was not possible (context cancellation errors,
+// not a valid URL etc.)
+func Visit(ctx context.Context, url string) (s *snapshot.Snapshot, err error) {
+	geminiCtx := contextutil.ContextWithComponent(ctx, "network")
+
+	s, err = snapshot.SnapshotFromURL(url, true)
+	if err != nil {
+		return nil, err
+	}
+
+	// Check if the context has been canceled
+	if err := ctx.Err(); err != nil {
+		return nil, xerrors.NewSimpleError(err)
+	}

-// Resolve the URL hostname and
-// check if we already have an open
-// connection to this host.
-// If we can connect, return a list
-// of the resolved IPs.
-func getHostIPAddresses(hostname string) ([]string, error) {
-	addrs, err := net.LookupHost(hostname)
-	if err != nil {
-		return nil, fmt.Errorf("%w:%w", common.ErrNetworkDNS, err)
-	}
-	IPPool.Lock.RLock()
-	defer func() {
-		IPPool.Lock.RUnlock()
-	}()
-	return addrs, nil
-}
+	data, err := ConnectAndGetData(geminiCtx, s.URL.String())
+	if err != nil {
+		s.Error = null.StringFrom(err.Error())
+		return s, nil
+	}

-func ConnectAndGetData(url string) ([]byte, error) {
-	parsedURL, err := gourl.Parse(url)
-	if err != nil {
-		return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
-	}
+	// Check if the context has been canceled
+	if err := ctx.Err(); err != nil {
+		return nil, xerrors.NewSimpleError(err)
+	}
+
+	s = UpdateSnapshotWithData(*s, data)
+
+	if !s.Error.Valid &&
+		s.MimeType.Valid &&
+		s.MimeType.String == "text/gemini" &&
+		len(s.GemText.ValueOrZero()) > 0 {
+		links := GetPageLinks(s.URL, s.GemText.String)
+		if len(links) > 0 {
+			s.Links = null.ValueFrom(links)
+		}
+	}
+
+	return s, nil
+}
+
+// ConnectAndGetData is a context-aware version of ConnectAndGetData
+// that returns the data from a GET request to a Gemini URL. It uses the context
+// for cancellation, timeout, and logging.
+func ConnectAndGetData(ctx context.Context, url string) ([]byte, error) {
+	parsedURL, err := stdurl.Parse(url)
+	if err != nil {
+		return nil, xerrors.NewSimpleError(fmt.Errorf("error parsing URL: %w", err))
+	}
 	hostname := parsedURL.Hostname()
 	port := parsedURL.Port()
 	if port == "" {
 		port = "1965"
 	}
 	host := fmt.Sprintf("%s:%s", hostname, port)
-	// Establish the underlying TCP connection.
+
+	// Check if the context has been canceled before proceeding
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
+
+	// Establish the underlying TCP connection with context-based cancellation
 	dialer := &net.Dialer{
-		Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
+		Timeout: timeoutDuration,
 	}
-	conn, err := dialer.Dial("tcp", host)
+	conn, err := dialer.DialContext(ctx, "tcp", host)
 	if err != nil {
-		return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err)
+		contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to establish TCP connection: %v", err)
+		return nil, xerrors.NewSimpleError(err)
 	}
-	// Make sure we always close the connection.
+	// Make sure we always close the connection
 	defer func() {
-		// No need to handle error:
-		// Connection will time out eventually if still open somehow.
 		_ = conn.Close()
 	}()
-	// Set read and write timeouts on the TCP connection.
-	err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
+	err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
 	if err != nil {
-		return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err)
+		return nil, xerrors.NewSimpleError(err)
 	}
-	err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
+	err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
 	if err != nil {
-		return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err)
+		return nil, xerrors.NewSimpleError(err)
 	}
+
+	// Check if the context has been canceled before proceeding with TLS handshake
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
 	// Perform the TLS handshake
 	tlsConfig := &tls.Config{
 		InsecureSkipVerify: true,                 //nolint:gosec // Accept all TLS certs, even if insecure.
 		ServerName:         parsedURL.Hostname(), // SNI says we should not include port in hostname
-		// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
 	}
-	tlsConn := tls.Client(conn, tlsConfig)
-	if err := tlsConn.Handshake(); err != nil {
-		return nil, fmt.Errorf("%w: %w", common.ErrNetworkTLS, err)
-	}
-
-	// We read `buf`-sized chunks and add data to `data`.
+	tlsConn := tls.Client(conn, tlsConfig)
+	err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration))
+	if err != nil {
+		return nil, xerrors.NewSimpleError(err)
+	}
+	err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration))
+	if err != nil {
+		return nil, xerrors.NewSimpleError(err)
+	}
+
+	// Check if the context is done before attempting handshake
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	// Perform TLS handshake with regular method
+	// (HandshakeContext is only available in Go 1.17+)
+	err = tlsConn.Handshake()
+	if err != nil {
+		return nil, xerrors.NewSimpleError(err)
+	}
+
+	// Check again if the context is done after handshake
+	if err := ctx.Err(); err != nil {
+		return nil, xerrors.NewSimpleError(err)
+	}
+
+	// We read `buf`-sized chunks and add data to `data`
 	buf := make([]byte, 4096)
 	var data []byte
-	// Send Gemini request to trigger server response.
+
+	// Check if the context has been canceled before sending request
+	if err := ctx.Err(); err != nil {
+		return nil, xerrors.NewSimpleError(err)
+	}
+
+	// Send Gemini request to trigger server response
 	// Fix for stupid server bug:
 	// Some servers return 'Header: 53 No proxying to other hosts or ports!'
 	// when the port is 1965 and is still specified explicitly in the URL.
-	_url, _ := common.ParseURL(url, "")
-	_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
+	url2, _ := _url.ParseURL(url, "", true)
+	_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort())))
 	if err != nil {
-		return nil, fmt.Errorf("%w: %w", common.ErrNetworkCannotWrite, err)
+		return nil, xerrors.NewSimpleError(err)
 	}
 	// Read response bytes in len(buf) byte chunks
 	for {
+		// Check if the context has been canceled before each read
+		if err := ctx.Err(); err != nil {
+			return nil, xerrors.NewSimpleError(err)
+		}
+
 		n, err := tlsConn.Read(buf)
 		if n > 0 {
 			data = append(data, buf[:n]...)
 		}
 		if len(data) > config.CONFIG.MaxResponseSize {
-			return nil, fmt.Errorf("%w: %v", common.ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
+			contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Response too large (max: %d bytes)", config.CONFIG.MaxResponseSize)
+			return nil, xerrors.NewSimpleError(fmt.Errorf("response too large"))
 		}
 		if err != nil {
 			if errors.Is(err, io.EOF) {
 				break
 			}
-			return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err)
+			contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Error reading data: %v", err)
+			return nil, xerrors.NewSimpleError(err)
 		}
 	}
+
+	contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Received %d bytes of data", len(data))
 	return data, nil
 }

-// Visit given URL, using the Gemini protocol.
-// Mutates given Snapshot with the data.
-// In case of error, we store the error string
-// inside snapshot and return the error.
-func Visit(s *common.Snapshot) (err error) {
-	// Don't forget to also store error
-	// response code (if we have one)
-	// and header
-	defer func() {
-		if err != nil {
-			s.Error = null.StringFrom(err.Error())
-			if errors.As(err, new(*common.GeminiError)) {
-				s.Header = null.StringFrom(err.(*common.GeminiError).Header)
-				s.ResponseCode = null.IntFrom(int64(err.(*common.GeminiError).Code))
-			}
-		}
-	}()
-	s.Timestamp = null.TimeFrom(time.Now())
-	data, err := ConnectAndGetData(s.URL.String())
-	if err != nil {
-		return err
-	}
-	pageData, err := processData(data)
-	if err != nil {
-		return err
-	}
-	s.Header = null.StringFrom(pageData.ResponseHeader)
-	s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
-	s.MimeType = null.StringFrom(pageData.MimeType)
-	s.Lang = null.StringFrom(pageData.Lang)
-	if pageData.GemText != "" {
-		s.GemText = null.StringFrom(pageData.GemText)
-	}
-	if pageData.Data != nil {
-		s.Data = null.ValueFrom(pageData.Data)
-	}
-	return nil
-}
-
-// processData returne results from
-// parsing Gemini header data:
-// Code, mime type and lang (optional)
-// Returns error if header was invalid
-func processData(data []byte) (*PageData, error) {
+// UpdateSnapshotWithData processes the raw data from a Gemini response and populates the Snapshot.
+// This function is exported for use by the robotsMatch package.
+func UpdateSnapshotWithData(s snapshot.Snapshot, data []byte) *snapshot.Snapshot {
 	header, body, err := getHeadersAndData(data)
 	if err != nil {
-		return nil, err
+		s.Error = null.StringFrom(err.Error())
+		return &s
 	}
 	code, mimeType, lang := getMimeTypeAndLang(header)
-	logging.LogDebug("Header: %s", strings.TrimSpace(header))
-	if code != 20 {
-		return nil, common.NewErrGeminiStatusCode(code, header)
+	if code != 0 {
+		s.ResponseCode = null.IntFrom(int64(code))
+	}
+	if header != "" {
+		s.Header = null.StringFrom(header)
+	}
+	if mimeType != "" {
+		s.MimeType = null.StringFrom(mimeType)
+	}
+	if lang != "" {
+		s.Lang = null.StringFrom(lang)
 	}
-	pageData := PageData{
-		ResponseCode:   code,
-		ResponseHeader: header,
-		MimeType:       mimeType,
-		Lang:           lang,
-	}
 	// If we've got a Gemini document, populate
 	// `GemText` field, otherwise raw data goes to `Data`.
 	if mimeType == "text/gemini" {
 		validBody, err := BytesToValidUTF8(body)
 		if err != nil {
-			return nil, fmt.Errorf("%w: %w", common.ErrUTF8Parse, err)
+			s.Error = null.StringFrom(err.Error())
+			return &s
 		}
-		pageData.GemText = validBody
+		s.GemText = null.StringFrom(validBody)
 	} else {
-		pageData.Data = body
+		s.Data = null.ValueFrom(body)
 	}
-	return &pageData, nil
+	return &s
 }
// Checks for a Gemini header, which is
|
// Checks for a Gemini header, which is
|
||||||
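Replacing `Visit`/`processData` with `UpdateSnapshotWithData` changes the error contract: parse failures are recorded on the snapshot and a value is always returned, instead of being propagated to the caller. A minimal, self-contained sketch of that "errors are data" shape (types here are illustrative, not the repo's):

```go
package main

import "fmt"

// Result mimics the Snapshot contract in the new code:
// parse failures are recorded on the result, not returned.
type Result struct {
	Header string
	Err    string
}

func process(data []byte, parse func([]byte) (string, error)) Result {
	var r Result
	header, err := parse(data)
	if err != nil {
		r.Err = err.Error() // store the error string instead of returning it
		return r
	}
	r.Header = header
	return r
}

func main() {
	parse := func(b []byte) (string, error) {
		if len(b) == 0 {
			return "", fmt.Errorf("error parsing header")
		}
		return string(b), nil
	}
	fmt.Printf("%+v\n", process([]byte("20 text/gemini"), parse))
	fmt.Printf("%+v\n", process(nil, parse))
}
```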
@@ -205,28 +247,35 @@ func processData(data []byte) (*PageData, error) {
 func getHeadersAndData(data []byte) (string, []byte, error) {
 	firstLineEnds := slices.Index(data, '\n')
 	if firstLineEnds == -1 {
-		return "", nil, common.ErrGeminiResponseHeader
+		return "", nil, xerrors.NewSimpleError(fmt.Errorf("error parsing header"))
 	}
 	firstLine := string(data[:firstLineEnds])
 	rest := data[firstLineEnds+1:]
-	return firstLine, rest, nil
+	return strings.TrimSpace(firstLine), rest, nil
 }
 
-// Parses code, mime type and language
-// from a Gemini header.
-// Examples:
-// `20 text/gemini lang=en` (code, mimetype, lang)
-// `20 text/gemini` (code, mimetype)
-// `31 gemini://redirected.to/other/site` (code)
+// getMimeTypeAndLang Parses code, mime type and language
+// given a Gemini header.
 func getMimeTypeAndLang(headers string) (int, string, string) {
-	// Regex that parses code, mimetype & optional charset/lang parameters
-	re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`)
+	// First try to match the full format: "<code> <mimetype> [charset=<value>] [lang=<value>]"
+	// The regex looks for:
+	// - A number (\d+)
+	// - Followed by whitespace and a mimetype ([a-zA-Z0-9/\-+]+)
+	// - Optionally followed by charset and/or lang parameters in any order
+	// - Only capturing the lang value, ignoring charset
+	re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:(?:[\s;]+(?:charset=[^;\s]+|lang=([a-zA-Z0-9-]+)))*)\s*$`)
 	matches := re.FindStringSubmatch(headers)
-	if matches == nil || len(matches) <= 1 {
-		// Try to get code at least
-		re := regexp.MustCompile(`^(\d+)\s+`)
+	if len(matches) <= 1 {
+		// If full format doesn't match, try to match redirect format: "<code> <URL>"
+		// This handles cases like "31 gemini://example.com"
+		re := regexp.MustCompile(`^(\d+)\s+(.+)$`)
 		matches := re.FindStringSubmatch(headers)
-		if matches == nil || len(matches) <= 1 {
+		if len(matches) <= 1 {
+			// If redirect format doesn't match, try to match just a status code
+			// This handles cases like "99"
+			re := regexp.MustCompile(`^(\d+)\s*$`)
+			matches := re.FindStringSubmatch(headers)
+			if len(matches) <= 1 {
 				return 0, "", ""
+			}
 			code, err := strconv.Atoi(matches[1])
@@ -239,7 +288,13 @@ func getMimeTypeAndLang(headers string) (int, string, string) {
 			if err != nil {
 				return 0, "", ""
 			}
-		mimeType := matches[2]
-		param := matches[3] // This will capture either charset or lang value
-		return code, mimeType, param
+			return code, "", ""
+		}
+		code, err := strconv.Atoi(matches[1])
+		if err != nil {
+			return 0, "", ""
+		}
+		mimeType := matches[2]
+		lang := matches[3] // Will be empty string if no lang parameter was found
+		return code, mimeType, lang
 	}
 }
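The three regexes above fall back from the full header format to a redirect line to a bare status code. A standalone program exercising the same expressions, copied from the diff, against representative headers:

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// Mirrors the three-tier matching in getMimeTypeAndLang:
// full header, then redirect, then bare status code.
var (
	reFull     = regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:(?:[\s;]+(?:charset=[^;\s]+|lang=([a-zA-Z0-9-]+)))*)\s*$`)
	reRedirect = regexp.MustCompile(`^(\d+)\s+(.+)$`)
	reCode     = regexp.MustCompile(`^(\d+)\s*$`)
)

func parseHeader(h string) (int, string, string) {
	if m := reFull.FindStringSubmatch(h); len(m) > 1 {
		code, _ := strconv.Atoi(m[1])
		return code, m[2], m[3] // m[3] is "" when no lang parameter is present
	}
	if m := reRedirect.FindStringSubmatch(h); len(m) > 1 {
		code, _ := strconv.Atoi(m[1])
		return code, "", ""
	}
	if m := reCode.FindStringSubmatch(h); len(m) > 1 {
		code, _ := strconv.Atoi(m[1])
		return code, "", ""
	}
	return 0, "", ""
}

func main() {
	for _, h := range []string{
		"20 text/gemini;lang=en-US;charset=utf-8",
		"31 gemini://example.com",
		"99",
		"Invalid header",
	} {
		code, mime, lang := parseHeader(h)
		fmt.Printf("%-42q -> (%d, %q, %q)\n", h, code, mime, lang)
	}
}
```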
@@ -1,78 +1,167 @@
 package gemini
 
 import (
+	"slices"
+	"strings"
 	"testing"
+
+	"gemini-grc/common/snapshot"
 )
 
-// Test for input: `20 text/gemini`
-func TestGetMimeTypeAndLang1(t *testing.T) {
+func TestGetHeadersAndData(t *testing.T) {
 	t.Parallel()
-	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini")
-	if code != 20 || mimeType != "text/gemini" || lang != "" {
-		t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
+	tests := []struct {
+		input       []byte
+		header      string
+		body        []byte
+		expectError bool
+	}{
+		{[]byte("20 text/gemini\r\nThis is the body"), "20 text/gemini", []byte("This is the body"), false},
+		{[]byte("20 text/gemini\nThis is the body"), "20 text/gemini", []byte("This is the body"), false},
+		{[]byte("53 No proxying!\r\n"), "53 No proxying!", []byte(""), false},
+		{[]byte("No header"), "", nil, true},
+	}
+
+	for _, test := range tests {
+		header, body, err := getHeadersAndData(test.input)
+
+		if test.expectError && err == nil {
+			t.Errorf("Expected error, got nil for input: %s", test.input)
+		}
+
+		if !test.expectError && err != nil {
+			t.Errorf("Unexpected error for input '%s': %v", test.input, err)
+		}
+
+		if header != test.header {
+			t.Errorf("Expected header '%s', got '%s' for input: %s", test.header, header, test.input)
+		}
+
+		if !slices.Equal(body, test.body) {
+			t.Errorf("Expected body '%s', got '%s' for input: %s", test.body, string(body), test.input)
+		}
 	}
 }
 
-func TestGetMimeTypeAndLang11(t *testing.T) {
+func TestGetMimeTypeAndLang(t *testing.T) {
 	t.Parallel()
-	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n")
-	if code != 20 || mimeType != "text/gemini" || lang != "" {
-		t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
+	tests := []struct {
+		header   string
+		code     int
+		mimeType string
+		lang     string
+	}{
+		{"20 text/gemini lang=en", 20, "text/gemini", "en"},
+		{"20 text/gemini", 20, "text/gemini", ""},
+		{"31 gemini://redirected.to/other/site", 31, "", ""},
+		{"20 text/plain;charset=utf-8", 20, "text/plain", ""},
+		{"20 text/plain;lang=el-GR", 20, "text/plain", "el-GR"},
+		{"20 text/gemini;lang=en-US;charset=utf-8", 20, "text/gemini", "en-US"}, // charset should be ignored
+		{"Invalid header", 0, "", ""},
+		{"99", 99, "", ""},
+	}
+
+	for _, test := range tests {
+		code, mimeType, lang := getMimeTypeAndLang(test.header)
+
+		if code != test.code {
+			t.Errorf("Expected code %d, got %d for header: %s", test.code, code, test.header)
+		}
+
+		if mimeType != test.mimeType {
+			t.Errorf("Expected mimeType '%s', got '%s' for header: %s", test.mimeType, mimeType, test.header)
+		}
+
+		if lang != test.lang {
+			t.Errorf("Expected lang '%s', got '%s' for header: %s", test.lang, lang, test.header)
+		}
 	}
 }
 
-func TestGetMimeTypeAndLang12(t *testing.T) {
-	t.Parallel()
-	code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8")
-	if code != 20 || mimeType != "text/plain" || lang != "utf-8" {
-		t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
-	}
-}
-
-func TestGetMimeTypeAndLang13(t *testing.T) {
-	t.Parallel()
-	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8")
-	if code != 20 || mimeType != "text/gemini" || lang != "utf-8" {
-		t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
-	}
-}
-
-func TestGetTypeAndLang2(t *testing.T) {
-	t.Parallel()
-	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini charset=en")
-	if code != 20 || mimeType != "text/gemini" || lang != "en" {
-		t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
-	}
-}
-
-func TestGetTypeAndLang21(t *testing.T) {
-	t.Parallel()
-	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
-	if code != 20 || mimeType != "text/gemini" || lang != "en" {
-		t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
-	}
-}
-
-func TestGetMimeTypeAndLang3(t *testing.T) {
-	t.Parallel()
-	code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page")
-	if code != 31 || mimeType != "" || lang != "" {
-		t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
-	}
-}
-
-func TestGetMimeTypeAndLang4(t *testing.T) {
-	t.Parallel()
-	code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd")
-	if code != 0 || mimeType != "" || lang != "" {
-		t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
-	}
-}
-
-func TestGetMimeTypeAndLang5(t *testing.T) {
+func TestProcessData(t *testing.T) {
 	t.Parallel()
-	code, mimeType, lang := getMimeTypeAndLang("")
-	if code != 0 || mimeType != "" || lang != "" {
-		t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
+	tests := []struct {
+		name          string
+		inputData     []byte
+		expectedCode  int
+		expectedMime  string
+		expectedLang  string
+		expectedData  []byte
+		expectedError bool
+	}{
+		{
+			name:          "Gemini document",
+			inputData:     []byte("20 text/gemini\r\n# Hello\nWorld"),
+			expectedCode:  20,
+			expectedMime:  "text/gemini",
+			expectedLang:  "",
+			expectedData:  []byte("# Hello\nWorld"),
+			expectedError: false,
+		},
+		{
+			name:          "Gemini document with language",
+			inputData:     []byte("20 text/gemini lang=en\r\n# Hello\nWorld"),
+			expectedCode:  20,
+			expectedMime:  "text/gemini",
+			expectedLang:  "en",
+			expectedData:  []byte("# Hello\nWorld"),
+			expectedError: false,
+		},
+		{
+			name:          "Non-Gemini document",
+			inputData:     []byte("20 text/html\r\n<h1>Hello</h1>"),
+			expectedCode:  20,
+			expectedMime:  "text/html",
+			expectedLang:  "",
+			expectedData:  []byte("<h1>Hello</h1>"),
+			expectedError: false,
+		},
+		{
+			name:          "Error header",
+			inputData:     []byte("53 No proxying!\r\n"),
+			expectedCode:  53,
+			expectedMime:  "",
+			expectedLang:  "",
+			expectedData:  []byte(""),
+			expectedError: false,
+		},
+		{
+			name:          "Invalid header",
+			inputData:     []byte("Invalid header"),
+			expectedError: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := snapshot.Snapshot{}
+			result := UpdateSnapshotWithData(s, test.inputData)
+
+			if test.expectedError {
+				return
+			}
+
+			if int(result.ResponseCode.ValueOrZero()) != test.expectedCode {
+				t.Errorf("Expected code %d, got %d", test.expectedCode, int(result.ResponseCode.ValueOrZero()))
+			}
+
+			if result.MimeType.ValueOrZero() != test.expectedMime {
+				t.Errorf("Expected mimeType '%s', got '%s'", test.expectedMime, result.MimeType.ValueOrZero())
+			}
+
+			if result.Lang.ValueOrZero() != test.expectedLang {
+				t.Errorf("Expected lang '%s', got '%s'", test.expectedLang, result.Lang.ValueOrZero())
+			}
+
+			if test.expectedMime == "text/gemini" {
+				if !strings.Contains(result.GemText.String, string(test.expectedData)) {
+					t.Errorf("Expected GemText '%s', got '%s'", test.expectedData, result.GemText.String)
+				}
+			} else {
+				if !slices.Equal(result.Data.ValueOrZero(), test.expectedData) {
+					t.Errorf("Expected data '%s', got '%s'", test.expectedData, result.Data.ValueOrZero())
+				}
+			}
+		})
 	}
 }
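To run only these parser tests, standard Go tooling suffices (the compare view doesn't show this test file's path, so `./...` is used here rather than a specific package):

```
go test -v -run 'TestGetHeadersAndData|TestGetMimeTypeAndLang|TestProcessData' ./...
```

Since `TestProcessData` uses `t.Run` subtests, a single case can be selected too, e.g. `-run 'TestProcessData/Gemini_document'` (spaces in subtest names are matched as underscores).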
@@ -7,6 +7,8 @@ import (
 	"io"
 	"unicode/utf8"
 
+	"gemini-grc/config"
+	"git.antanst.com/antanst/xerrors"
 	"golang.org/x/text/encoding/charmap"
 	"golang.org/x/text/encoding/japanese"
 	"golang.org/x/text/encoding/korean"
@@ -22,11 +24,16 @@ func BytesToValidUTF8(input []byte) (string, error) {
 	if len(input) == 0 {
 		return "", nil
 	}
-	const maxSize = 10 * 1024 * 1024 // 10MB
-	if len(input) > maxSize {
-		return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize)
+	maxSize := config.CONFIG.MaxResponseSize
+	if maxSize == 0 {
+		maxSize = 1024 * 1024 // Default 1MB for tests
 	}
-	// Remove NULL byte 0x00 (ReplaceAll accepts slices)
+	if len(input) > maxSize {
+		return "", xerrors.NewError(fmt.Errorf("BytesToValidUTF8: %w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize), 0, "", false)
+	}
+
+	// Always remove NULL bytes first (before UTF-8 validity check)
 	inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
 	if utf8.Valid(inputNoNull) {
 		return string(inputNoNull), nil
@@ -41,6 +48,8 @@ func BytesToValidUTF8(input []byte) (string, error) {
 		japanese.EUCJP.NewDecoder(), // Japanese
 		korean.EUCKR.NewDecoder(),   // Korean
 	}
+
+	// Still invalid Unicode. Try some encodings to convert to.
 	// First successful conversion wins.
 	var lastErr error
 	for _, encoding := range encodings {
@@ -55,5 +64,5 @@ func BytesToValidUTF8(input []byte) (string, error) {
 		}
 	}
 
-	return "", fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr)
+	return "", xerrors.NewError(fmt.Errorf("BytesToValidUTF8: %w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr), 0, "", false)
 }
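The fallback loop tries each decoder until one yields valid UTF-8. A self-contained sketch of that strategy using the same golang.org/x/text decoders (the helper name is illustrative):

```go
package main

import (
	"fmt"
	"unicode/utf8"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/encoding/japanese"
)

// toUTF8 returns the input as-is when it's already valid UTF-8,
// otherwise tries each decoder and keeps the first conversion
// that produces valid UTF-8.
func toUTF8(input []byte, decoders []*encoding.Decoder) (string, error) {
	if utf8.Valid(input) {
		return string(input), nil
	}
	var lastErr error
	for _, d := range decoders {
		out, err := d.Bytes(input)
		if err == nil && utf8.Valid(out) {
			return string(out), nil
		}
		lastErr = err
	}
	return "", fmt.Errorf("no decoder produced valid UTF-8: %v", lastErr)
}

func main() {
	latin1 := []byte{0x63, 0x61, 0x66, 0xE9} // "café" in ISO-8859-1
	s, err := toUTF8(latin1, []*encoding.Decoder{
		charmap.ISO8859_1.NewDecoder(),
		japanese.EUCJP.NewDecoder(),
	})
	fmt.Println(s, err) // café <nil>
}
```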
@@ -1,87 +0,0 @@
-package gemini
-
-import (
-	"fmt"
-	"gemini-grc/common"
-	"strings"
-	"sync"
-
-	"gemini-grc/logging"
-)
-
-// RobotsCache is a map of blocked URLs
-// key: URL
-// value: []string list of disallowed URLs
-// If a key has no blocked URLs, an empty
-// list is stored for caching.
-var RobotsCache sync.Map //nolint:gochecknoglobals
-
-func populateBlacklist(key string) (entries []string) {
-	// We either store an empty list when
-	// no rules, or a list of disallowed URLs.
-	// This applies even if we have an error
-	// finding/downloading robots.txt
-	defer func() {
-		RobotsCache.Store(key, entries)
-	}()
-	url := fmt.Sprintf("gemini://%s/robots.txt", key)
-	robotsContent, err := ConnectAndGetData(url)
-	if err != nil {
-		logging.LogDebug("robots.txt error %s", err)
-		return []string{}
-	}
-	robotsData, err := processData(robotsContent)
-	if err != nil {
-		logging.LogDebug("robots.txt error %s", err)
-		return []string{}
-	}
-	if robotsData.ResponseCode != 20 {
-		logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
-		return []string{}
-	}
-	// Some return text/plain, others text/gemini.
-	// According to spec, the first is correct,
-	// however let's be lenient
-	var data string
-	switch {
-	case robotsData.MimeType == "text/plain":
-		data = string(robotsData.Data)
-	case robotsData.MimeType == "text/gemini":
-		data = robotsData.GemText
-	default:
-		return []string{}
-	}
-	entries = ParseRobotsTxt(data, key)
-	return entries
-}
-
-// RobotMatch checks if the snapshot URL matches
-// a robots.txt allow rule.
-func RobotMatch(u string) bool {
-	url, err := common.ParseURL(u, "")
-	if err != nil {
-		return false
-	}
-	key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
-	logging.LogDebug("Checking robots.txt cache for %s", key)
-	var disallowedURLs []string
-	cacheEntries, ok := RobotsCache.Load(key)
-	if !ok {
-		// First time check, populate robot cache
-		disallowedURLs = populateBlacklist(key)
-		logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
-	} else {
-		disallowedURLs, _ = cacheEntries.([]string)
-	}
-	return isURLblocked(disallowedURLs, url.Full)
-}
-
-func isURLblocked(disallowedURLs []string, input string) bool {
-	for _, url := range disallowedURLs {
-		if strings.HasPrefix(strings.ToLower(input), url) {
-			logging.LogDebug("robots.txt match: %s matches %s", input, url)
-			return true
-		}
-	}
-	return false
-}
@@ -1,31 +0,0 @@
-package gemini
-
-import (
-	"fmt"
-	"strings"
-)
-
-// ParseRobotsTxt takes robots.txt content and a host, and
-// returns a list of full URLs that shouldn't
-// be visited.
-// TODO Also take into account the user agent?
-// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
-func ParseRobotsTxt(content string, host string) []string {
-	var disallowedPaths []string
-	for _, line := range strings.Split(content, "\n") {
-		line = strings.TrimSpace(line)
-		line = strings.ToLower(line)
-		if strings.HasPrefix(line, "disallow:") {
-			parts := strings.SplitN(line, ":", 2)
-			if len(parts) == 2 {
-				path := strings.TrimSpace(parts[1])
-				if path != "" {
-					// Construct full Gemini URL
-					disallowedPaths = append(disallowedPaths,
-						fmt.Sprintf("gemini://%s%s", host, path))
-				}
-			}
-		}
-	}
-	return disallowedPaths
-}
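These two deleted files implemented robots.txt handling: `Disallow:` lines become absolute URL prefixes for a host, and candidate URLs are blocked by case-insensitive prefix match (presumably superseded by the robotsMatch package mentioned earlier in the diff). A self-contained sketch of that logic:

```go
package main

import (
	"fmt"
	"strings"
)

// parseRobots turns Disallow lines into absolute gemini:// URL prefixes.
func parseRobots(content, host string) []string {
	var prefixes []string
	for _, line := range strings.Split(content, "\n") {
		line = strings.ToLower(strings.TrimSpace(line))
		if path, ok := strings.CutPrefix(line, "disallow:"); ok {
			if path = strings.TrimSpace(path); path != "" {
				prefixes = append(prefixes, fmt.Sprintf("gemini://%s%s", host, path))
			}
		}
	}
	return prefixes
}

// blocked reports whether u falls under any disallowed prefix.
func blocked(prefixes []string, u string) bool {
	u = strings.ToLower(u)
	for _, p := range prefixes {
		if strings.HasPrefix(u, p) {
			return true
		}
	}
	return false
}

func main() {
	prefixes := parseRobots("User-agent: *\nDisallow: /cgi-bin/\n", "example.org:1965")
	fmt.Println(blocked(prefixes, "gemini://example.org:1965/cgi-bin/foo")) // true
	fmt.Println(blocked(prefixes, "gemini://example.org:1965/index.gmi"))  // false
}
```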
344 gemini/worker.go
@@ -1,344 +0,0 @@
-package gemini
-
-import (
-	"errors"
-	"fmt"
-	"gemini-grc/common"
-	_db "gemini-grc/db"
-	"strings"
-	"time"
-
-	"gemini-grc/logging"
-	"gemini-grc/util"
-	"github.com/guregu/null/v5"
-	"github.com/jmoiron/sqlx"
-)
-
-func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
-	logging.LogInfo("Spawning %d workers", numOfWorkers)
-	statusChan = make(chan WorkerStatus, numOfWorkers)
-	go PrintWorkerStatus(numOfWorkers, statusChan)
-
-	for i := range numOfWorkers {
-		go func(i int) {
-			// Jitter to avoid starting everything at the same time
-			time.Sleep(time.Duration(util.SecureRandomInt(10)) * time.Second)
-			for {
-				RunWorkerWithTx(i, db, nil)
-			}
-		}(i)
-	}
-}
-
-func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) {
-	statusChan <- WorkerStatus{
-		id:     workerID,
-		status: "Starting up",
-	}
-	defer func() {
-		statusChan <- WorkerStatus{
-			id:     workerID,
-			status: "Done",
-		}
-	}()
-	tx, err := db.Beginx()
-	if err != nil {
-		panic(fmt.Sprintf("Failed to begin transaction: %v", err))
-	}
-	runWorker(workerID, tx, url)
-	logging.LogDebug("[%d] Committing transaction", workerID)
-	err = tx.Commit()
-	// On deadlock errors, rollback and return, otherwise panic.
-	if err != nil {
-		logging.LogError("[%d] Failed to commit transaction: %w", workerID, err)
-		if _db.IsDeadlockError(err) {
-			logging.LogError("[%d] Deadlock detected. Rolling back", workerID)
-			time.Sleep(time.Duration(10) * time.Second)
-			err := tx.Rollback()
-			if err != nil {
-				panic(fmt.Sprintf("[%d] Failed to roll back transaction: %v", workerID, err))
-			}
-			return
-		}
-		panic(fmt.Sprintf("[%d] Failed to commit transaction: %v", workerID, err))
-	}
-	logging.LogDebug("[%d] Worker done!", workerID)
-}
-
-func runWorker(workerID int, tx *sqlx.Tx, url *string) {
-	var urls []string
-	var err error
-
-	// If not given a specific URL,
-	// get some random ones to visit from db.
-	if url == nil {
-		statusChan <- WorkerStatus{
-			id:     workerID,
-			status: "Getting URLs",
-		}
-		urls, err = _db.GetURLsToVisit(tx)
-		if err != nil {
-			logging.LogError("[%d] GeminiError retrieving snapshot: %w", workerID, err)
-			panic("This should never happen")
-		} else if len(urls) == 0 {
-			logging.LogInfo("[%d] No URLs to visit.", workerID)
-			time.Sleep(1 * time.Minute)
-			return
-		}
-	} else {
-		geminiURL, err := common.ParseURL(*url, "")
-		if err != nil {
-			logging.LogError("Invalid URL given: %s", *url)
-			return
-		}
-		urls = []string{geminiURL.String()}
-	}
-
-	// Start visiting URLs.
-	total := len(urls)
-	for i, u := range urls {
-		logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, u)
-		// We differentiate between errors:
-		// Unexpected errors are the ones returned from the following function.
-		// If an error is unexpected (which should never happen) we panic.
-		// Expected errors are stored as strings within the snapshot.
-		err := workOnUrl(workerID, tx, u)
-		if err != nil {
-			logging.LogError("[%d] Unexpected GeminiError %w while visiting %s", workerID, err, u)
-			util.PrintStackAndPanic(err)
-		}
-		logging.LogDebug("[%d] Done %d/%d.", workerID, i+1, total)
-	}
-}
-
-// workOnUrl visits a URL and stores the result.
-// unexpected errors are returned.
-// expected errors are stored within the snapshot.
-func workOnUrl(workerID int, tx *sqlx.Tx, url string) (err error) {
-	if url == "" {
-		return fmt.Errorf("nil URL given")
-	}
-
-	if IsBlacklisted(url) {
-		logging.LogDebug("[%d] URL matches Blacklist, ignoring %s", workerID, url)
-		return nil
-	}
-
-	s := common.SnapshotFromURL(url)
-
-	// If URL matches a robots.txt disallow line,
-	// add it as an error so next time it won't be
-	// crawled.
-	if RobotMatch(url) {
-		s.Error = null.StringFrom(common.ErrGeminiRobotsDisallowed.Error())
-		err = _db.OverwriteSnapshot(workerID, tx, s)
-		if err != nil {
-			return fmt.Errorf("[%d] %w", workerID, err)
-		}
-		return nil
-	}
-
-	// Resolve IP address via DNS
-	statusChan <- WorkerStatus{
-		id:     workerID,
-		status: fmt.Sprintf("Resolving %s", url),
-	}
-	IPs, err := getHostIPAddresses(s.Host)
-	if err != nil {
-		s.Error = null.StringFrom(err.Error())
-		err = _db.OverwriteSnapshot(workerID, tx, s)
-		if err != nil {
-			return fmt.Errorf("[%d] %w", workerID, err)
-		}
-		return nil
-	}
-
-	for {
-		count := 1
-		if isAnotherWorkerVisitingHost(workerID, IPs) {
-			logging.LogDebug("[%d] Another worker is visiting this host, waiting", workerID)
-			statusChan <- WorkerStatus{
-				id:     workerID,
-				status: fmt.Sprintf("Waiting to grab lock for host %s", s.Host),
-			}
-			time.Sleep(2 * time.Second) // Avoid flood-retrying
-			count++
-			if count == 3 {
-				return
-			}
-		} else {
-			break
-		}
-	}
-
-	statusChan <- WorkerStatus{
-		id:     workerID,
-		status: fmt.Sprintf("Adding to pool %s", url),
-	}
-	AddIPsToPool(IPs)
-	// After finishing, remove the host IPs from
-	// the connections pool, with a small delay
-	// to avoid potentially hitting the same IP quickly.
-	defer func() {
-		go func() {
-			time.Sleep(1 * time.Second)
-			statusChan <- WorkerStatus{
-				id:     workerID,
-				status: fmt.Sprintf("Removing from pool %s", url),
-			}
-			RemoveIPsFromPool(IPs)
-		}()
-	}()
-
-	statusChan <- WorkerStatus{
-		id:     workerID,
-		status: fmt.Sprintf("Visiting %s", url),
-	}
-
-	err = Visit(s)
-	if err != nil {
-		if !common.IsKnownError(err) {
-			logging.LogError("[%d] Unknown error visiting %s: %w", workerID, url, err)
-			return err
-		}
-		s.Error = null.StringFrom(err.Error())
-		// Check if error is redirection, and handle it
-		if errors.As(err, new(*common.GeminiError)) &&
-			err.(*common.GeminiError).Msg == "redirect" {
-			err = handleRedirection(workerID, tx, s)
-			if err != nil {
-				if common.IsKnownError(err) {
-					s.Error = null.StringFrom(err.Error())
-				} else {
-					return err
-				}
-			}
-		}
-	}
-	// If this is a gemini page, parse possible links inside
-	if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
-		links := GetPageLinks(s.URL, s.GemText.String)
-		if len(links) > 0 {
-			logging.LogDebug("[%d] Found %d links", workerID, len(links))
-			s.Links = null.ValueFrom(links)
-			err = storeLinks(tx, s)
-			if err != nil {
-				return err
-			}
-		}
-	} else {
-		logging.LogDebug("[%d] Not text/gemini, so not looking for page links", workerID)
-	}
-
-	err = _db.OverwriteSnapshot(workerID, tx, s)
-	logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
-	if err != nil {
-		return err
-	}
-
-	return nil
-}
-
-func isAnotherWorkerVisitingHost(workerID int, IPs []string) bool {
-	IPPool.Lock.RLock()
-	defer func() {
-		IPPool.Lock.RUnlock()
-	}()
-	logging.LogDebug("[%d] Checking pool for IPs", workerID)
-	for _, ip := range IPs {
-		_, ok := IPPool.IPs[ip]
-		if ok {
-			return true
-		}
-	}
-	return false
-}
-
-func storeLinks(tx *sqlx.Tx, s *common.Snapshot) error {
-	if s.Links.Valid {
-		var batchSnapshots []*common.Snapshot
-		for _, link := range s.Links.ValueOrZero() {
-			if shouldPersistURL(&link) {
-				newSnapshot := &common.Snapshot{
-					URL:       link,
-					Host:      link.Hostname,
-					Timestamp: null.TimeFrom(time.Now()),
-				}
-				batchSnapshots = append(batchSnapshots, newSnapshot)
-			}
-		}
-
-		if len(batchSnapshots) > 0 {
-			err := _db.SaveLinksToDBinBatches(tx, batchSnapshots)
-			if err != nil {
-				return err
-			}
-		}
-	}
-	return nil
-}
-
-// shouldPersistURL returns true if we
-// should save the URL in the _db.
-// Only gemini:// urls are saved.
-func shouldPersistURL(u *common.URL) bool {
-	return strings.HasPrefix(u.String(), "gemini://")
-}
-
-func haveWeVisitedURL(tx *sqlx.Tx, u *common.URL) (bool, error) {
-	var result bool
-	err := tx.Select(&result, `SELECT TRUE FROM urls WHERE url=$1`, u.String())
-	if err != nil {
-		return false, fmt.Errorf("%w: %w", common.ErrDatabase, err)
-	}
-	if result {
-		return result, nil
-	}
-	err = tx.Select(&result, `SELECT TRUE FROM snapshots WHERE snapshot.url=$1`, u.String())
-	if err != nil {
-		return false, fmt.Errorf("%w: %w", common.ErrDatabase, err)
-	}
-	return result, nil
-}
-
-// handleRedirection saves redirect URL as new snapshot
-func handleRedirection(workerID int, tx *sqlx.Tx, s *common.Snapshot) error {
-	newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
-	if err != nil {
-		if errors.Is(err, common.ErrGeminiRedirect) {
-			logging.LogDebug("[%d] %s", workerID, err)
-		}
-		return err
-	}
-	logging.LogDebug("[%d] Page redirects to %s", workerID, newURL)
-	// Insert fresh snapshot with new URL
-	if shouldPersistURL(newURL) {
-		snapshot := &common.Snapshot{
-			// UID: uid.UID(),
-			URL:       *newURL,
-			Host:      newURL.Hostname,
-			Timestamp: null.TimeFrom(time.Now()),
-		}
-		logging.LogDebug("[%d] Saving redirection URL %s", workerID, snapshot.URL.String())
-		err = _db.SaveSnapshotIfNew(tx, snapshot)
-		if err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]common.Snapshot, error) {
-	query := `
-		SELECT *
-		FROM snapshots
-		WHERE url=$1
-		LIMIT 1
-	`
-	var snapshots []common.Snapshot
-	err := tx.Select(&snapshots, query, url)
-	if err != nil {
-		return nil, err
-	}
-	return snapshots, nil
-}
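The deleted worker kept a shared pool of in-use host IPs so no two workers would hit the same host concurrently. Note that in the removed wait loop, `count` was declared inside the `for`, so it reset on every iteration and the retry cap of 3 could never trigger. A self-contained sketch of the pool idea:

```go
package main

import (
	"fmt"
	"sync"
)

// ipPool is a mutex-protected set of IPs currently being visited.
type ipPool struct {
	mu  sync.RWMutex
	ips map[string]struct{}
}

// busy reports whether any of the given IPs is already in the pool.
func (p *ipPool) busy(ips []string) bool {
	p.mu.RLock()
	defer p.mu.RUnlock()
	for _, ip := range ips {
		if _, ok := p.ips[ip]; ok {
			return true
		}
	}
	return false
}

func (p *ipPool) add(ips []string) {
	p.mu.Lock()
	defer p.mu.Unlock()
	for _, ip := range ips {
		p.ips[ip] = struct{}{}
	}
}

func (p *ipPool) remove(ips []string) {
	p.mu.Lock()
	defer p.mu.Unlock()
	for _, ip := range ips {
		delete(p.ips, ip)
	}
}

func main() {
	pool := &ipPool{ips: make(map[string]struct{})}
	host := []string{"192.0.2.1"}
	if !pool.busy(host) {
		pool.add(host)
		defer pool.remove(host)
		fmt.Println("visiting host")
	}
}
```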
@@ -1,54 +0,0 @@
-package gemini
-
-import (
-	"fmt"
-	"strings"
-)
-
-type WorkerStatus struct {
-	id     int
-	status string
-}
-
-var statusChan chan WorkerStatus
-
-func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
-	// Create a slice to store current status of each worker
-	statuses := make([]string, totalWorkers)
-
-	// Initialize empty statuses
-	for i := range statuses {
-		statuses[i] = ""
-	}
-
-	// Initial print
-	var output strings.Builder
-	// \033[H moves the cursor to the top left corner of the screen
-	// (ie, the first column of the first row in the screen).
-	// \033[J clears the part of the screen from the cursor to the end of the screen.
-	output.WriteString("\033[H\033[J") // Clear screen and move cursor to top
-	for i := range statuses {
-		output.WriteString(fmt.Sprintf("[%2d] \n", i))
-	}
-	fmt.Print(output.String())
-
-	// Continuously receive status updates
-	for update := range statusChan {
-		if update.id >= totalWorkers {
-			continue
-		}
-
-		// Update the status
-		statuses[update.id] = update.status
-
-		// Build the complete output string
-		output.Reset()
-		output.WriteString("\033[H\033[J") // Clear screen and move cursor to top
-		for i, status := range statuses {
-			output.WriteString(fmt.Sprintf("[%2d] %.100s\n", i, status))
-		}
-
-		// Print the entire status
-		fmt.Print(output.String())
-	}
-}
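The removed status printer redraws the whole screen on every update using two ANSI escape sequences. A minimal runnable demo of just those codes:

```go
package main

import (
	"fmt"
	"time"
)

// \033[H homes the cursor (top-left), \033[J clears from the cursor
// to the end of the screen; together they give a full-screen redraw.
func main() {
	for i := 0; i <= 3; i++ {
		fmt.Print("\033[H\033[J")
		fmt.Printf("[ 0] step %d of 3\n", i)
		time.Sleep(300 * time.Millisecond)
	}
}
```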
30 go.mod
@@ -1,24 +1,34 @@
 module gemini-grc
 
-go 1.23.1
+go 1.24.3
 
 require (
+	git.antanst.com/antanst/logging v0.0.1
+	git.antanst.com/antanst/uid v0.0.1
+	git.antanst.com/antanst/xerrors v0.0.2
 	github.com/guregu/null/v5 v5.0.0
-	github.com/jackc/pgx/v5 v5.7.1
+	github.com/jackc/pgx/v5 v5.7.2
 	github.com/jmoiron/sqlx v1.4.0
 	github.com/lib/pq v1.10.9
-	github.com/matoous/go-nanoid/v2 v2.1.0
-	github.com/rs/zerolog v1.33.0
-	golang.org/x/text v0.19.0
+	github.com/stretchr/testify v1.9.0
+	golang.org/x/text v0.21.0
 )
 
 require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/jackc/pgpassfile v1.0.0 // indirect
 	github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
 	github.com/jackc/puddle/v2 v2.2.2 // indirect
-	github.com/mattn/go-colorable v0.1.13 // indirect
-	github.com/mattn/go-isatty v0.0.20 // indirect
-	golang.org/x/crypto v0.27.0 // indirect
-	golang.org/x/sync v0.8.0 // indirect
-	golang.org/x/sys v0.25.0 // indirect
+	github.com/kr/text v0.2.0 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/rogpeppe/go-internal v1.13.1 // indirect
+	golang.org/x/crypto v0.32.0 // indirect
+	golang.org/x/sync v0.10.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
+
+replace git.antanst.com/antanst/xerrors => ../xerrors
+
+replace git.antanst.com/antanst/uid => ../uid
+
+replace git.antanst.com/antanst/logging => ../logging
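The three new replace directives point at sibling checkouts of the antanst libraries, so local builds use the working copies instead of fetched modules. With standard Go tooling they can be added or dropped without editing go.mod by hand:

```
go mod edit -replace git.antanst.com/antanst/xerrors=../xerrors
go mod edit -dropreplace git.antanst.com/antanst/xerrors
```

Note that such path-based replaces only work where the sibling directories exist; they would need to be dropped (or replaced with version pins) before tagging a release consumed by others.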
44 go.sum
@@ -1,59 +1,49 @@
 filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
 filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
-github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
+github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y=
 github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
-github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
 github.com/guregu/null/v5 v5.0.0 h1:PRxjqyOekS11W+w/7Vfz6jgJE/BCwELWtgvOJzddimw=
 github.com/guregu/null/v5 v5.0.0/go.mod h1:SjupzNy+sCPtwQTKWhUCqjhVCO69hpsl2QsZrWHjlwU=
 github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
 github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
 github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
 github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
-github.com/jackc/pgx/v5 v5.7.1 h1:x7SYsPBYDkHDksogeSmZZ5xzThcTgRz++I5E+ePFUcs=
-github.com/jackc/pgx/v5 v5.7.1/go.mod h1:e7O26IywZZ+naJtWWos6i6fvWK+29etgITqrqHLfoZA=
+github.com/jackc/pgx/v5 v5.7.2 h1:mLoDLV6sonKlvjIEsV56SkWNCnuNv531l94GaIzO+XI=
+github.com/jackc/pgx/v5 v5.7.2/go.mod h1:ncY89UGWxg82EykZUwSpUKEfccBGGYq1xjrOpsbsfGQ=
 github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
 github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
 github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
 github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
+github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
+github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
 github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
 github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
-github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE=
-github.com/matoous/go-nanoid/v2 v2.1.0/go.mod h1:KlbGNQ+FhrUNIHUxZdL63t7tl4LaPkZNpUULS8H4uVM=
-github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
-github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
-github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
-github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
-github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
-github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
 github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
-github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
-github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
-github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
+github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
+github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
 github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
-golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A=
-golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70=
-golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
-golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
-golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
-golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
-golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
+golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc=
+golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
+golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
+golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
+golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
30 gopher/errors.go Normal file
@@ -0,0 +1,30 @@
+package gopher
+
+import "errors"
+
+// GopherError is an error encountered while
+// visiting a Gopher host, and is only for
+// Gopher errors (item type indicator 3).
+type GopherError struct {
+	Err error
+}
+
+func (e *GopherError) Error() string {
+	return e.Err.Error()
+}
+
+func (e *GopherError) Unwrap() error {
+	return e.Err
+}
+
+func NewGopherError(err error) error {
+	return &GopherError{Err: err}
+}
+
+func IsGopherError(err error) bool {
+	if err == nil {
+		return false
+	}
+	var asError *GopherError
+	return errors.As(err, &asError)
+}
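GopherError follows Go's wrap/unwrap convention, so IsGopherError works through arbitrarily nested `fmt.Errorf` `%w` wrapping. A self-contained demonstration (the type is reproduced locally to keep the example runnable):

```go
package main

import (
	"errors"
	"fmt"
)

// Same wrap/unwrap shape as gopher.GopherError.
type GopherError struct{ Err error }

func (e *GopherError) Error() string { return e.Err.Error() }
func (e *GopherError) Unwrap() error { return e.Err }

func main() {
	err := fmt.Errorf("visit failed: %w", &GopherError{Err: errors.New("3 not found")})
	var gerr *GopherError
	// errors.As walks the wrap chain, so the check succeeds even though
	// the GopherError is buried under another layer of wrapping.
	if errors.As(err, &gerr) {
		fmt.Println("gopher-level error:", gerr.Err)
	}
}
```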
220
gopher/network.go
Normal file
220
gopher/network.go
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
package gopher
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net"
|
||||||
|
stdurl "net/url"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
commonErrors "gemini-grc/common/errors"
|
||||||
|
"gemini-grc/config"
|
||||||
|
"git.antanst.com/antanst/logging"
|
||||||
|
"git.antanst.com/antanst/xerrors"
|
||||||
|
)
|
||||||
|
|
||||||
|
// References:
|
||||||
|
// RFC 1436 https://www.rfc-editor.org/rfc/rfc1436.html
|
||||||
|
|
||||||
|
// The default port for Gopher is 70.
|
||||||
|
// Originally Gopher used ASCII or
|
||||||
|
// ISO-8859-1, now others use UTF-8.
|
||||||
|
// In any case, just converting to UTF-8
|
||||||
|
// will work. If not, we bail.
|
||||||
|
|
||||||
|
// Here's the complete list of Gopher item type indicators (prefixes):
|
||||||
|
//
|
||||||
|
// `0` - Plain Text File
|
||||||
|
// `1` - Directory/Menu
|
||||||
|
// `2` - CSO Phone Book Server
|
||||||
|
// `3` - Error Message
|
||||||
|
// `4` - BinHexed Macintosh File
|
||||||
|
// `5` - DOS Binary Archive
|
||||||
|
// `6` - UNIX uuencoded File
|
||||||
|
// `7` - Index/Search Server
|
||||||
|
// `8` - Telnet Session
|
||||||
|
// `9` - Binary File
|
||||||
|
// `+` - Mirror/Redundant Server
|
||||||
|
// `g` - GIF Image
|
||||||
|
// `I` - Image File (non-GIF)
|
||||||
|
// `T` - TN3270 Session
|
||||||
|
// `i` - Informational Message (menu line)
|
||||||
|
// `h` - HTML File
|
||||||
|
// `s` - Sound/Music File
|
||||||
|
// `d` - Document File
|
||||||
|
// `w` - WHOIS Service
|
||||||
|
// `;` - Document File with Alternative View
|
||||||
|
// `<` - Video File
|
||||||
|
// `M` - MIME File (mail message or similar)
|
||||||
|
// `:` - Bitmap Image
|
||||||
|
// `c` - Calendar File
|
||||||
|
// `p` - PostScript File
|
||||||
|
|
||||||
|
// The most commonly used ones are `0` (text), `1` (directory), `i` (info), and `3` (error).
|
||||||
|
// The original Gopher protocol only specified types 0-9, `+`, `g`, `I`, and `T`.
|
||||||
|
// The others were added by various implementations and extensions over time.
|
||||||
|
|
||||||
|
func connectAndGetData(url string) ([]byte, error) {
|
||||||
|
parsedURL, err := stdurl.Parse(url)
|
||||||
|
if err != nil {
|
||||||
|
return nil, xerrors.NewError(fmt.Errorf("error parsing URL: %w", err), 0, "", false)
|
||||||
|
}
|
||||||
|
|
||||||
|
hostname := parsedURL.Hostname()
|
||||||
|
port := parsedURL.Port()
|
||||||
|
if port == "" {
|
||||||
|
port = "70"
|
||||||
|
}
|
||||||
|
host := fmt.Sprintf("%s:%s", hostname, port)
|
||||||
|
timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
|
||||||
|
// Establish the underlying TCP connection.
|
||||||
|
dialer := &net.Dialer{
|
||||||
|
Timeout: timeoutDuration,
|
||||||
|
}
|
||||||
|
logging.LogDebug("Dialing %s", host)
|
||||||
|
conn, err := dialer.Dial("tcp", host)
|
||||||
|
if err != nil {
|
||||||
|
return nil, commonErrors.NewHostError(err)
|
||||||
|
}
|
||||||
|
// Make sure we always close the connection.
|
||||||
|
defer func() {
|
||||||
|
_ = conn.Close()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Set read and write timeouts on the TCP connection.
|
||||||
|
err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
|
||||||
|
if err != nil {
|
||||||
|
return nil, commonErrors.NewHostError(err)
|
||||||
|
}
|
||||||
|
err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
|
||||||
	if err != nil {
		return nil, commonErrors.NewHostError(err)
	}

	// We read `buf`-sized chunks and append the data to `data`.
	buf := make([]byte, 4096)
	var data []byte

	// Send the Gopher request to trigger the server response.
	payload := constructPayloadFromPath(parsedURL.Path)
	_, err = conn.Write([]byte(fmt.Sprintf("%s\r\n", payload)))
	if err != nil {
		return nil, commonErrors.NewHostError(err)
	}

	// Read response bytes in len(buf) byte chunks
	for {
		n, err := conn.Read(buf)
		if n > 0 {
			data = append(data, buf[:n]...)
		}
		if err != nil {
			if errors.Is(err, io.EOF) {
				break
			}
			return nil, commonErrors.NewHostError(err)
		}
		if len(data) > config.CONFIG.MaxResponseSize {
			return nil, commonErrors.NewHostError(fmt.Errorf("response exceeded max"))
		}
	}
	logging.LogDebug("Got %d bytes", len(data))
	return data, nil
}

func constructPayloadFromPath(urlpath string) string {
	// Remove the Gopher item type from the payload, if present.
	re := regexp.MustCompile(`^/[\w]/.*`)
	payloadWithoutItemtype := urlpath
	if re.Match([]byte(urlpath)) {
		payloadWithoutItemtype = strings.Join(strings.Split(urlpath, "/")[2:], "/")
	}
	if !strings.HasPrefix(payloadWithoutItemtype, "/") {
		payloadWithoutItemtype = fmt.Sprintf("/%s", payloadWithoutItemtype)
	}
	return payloadWithoutItemtype
}

func checkForError(utfData string) error {
	lines := strings.Split(strings.TrimSpace(utfData), "\n")
	var firstLine string
	if len(lines) > 0 {
		firstLine = lines[0]
	} else {
		return nil
	}
	if strings.HasPrefix(firstLine, "3") {
		split := strings.Split(firstLine, "\t")
		return NewGopherError(fmt.Errorf("gopher error: %s", strings.TrimSpace(split[0])))
	}
	return nil
}

func getGopherPageLinks(content string) []string {
	var links []string

	lines := strings.Split(strings.TrimSpace(content), "\n")

	for _, line := range lines {
		if line == "" || line == "." {
			continue
		}

		if len(line) < 1 {
			continue
		}

		itemType := line[0]
		if itemType == 'i' {
			continue
		}

		parts := strings.SplitN(line[1:], "\t", 4)
		if len(parts) < 3 {
			continue
		}

		selector := strings.TrimSpace(parts[1])
		host := strings.TrimSpace(parts[2])

		if host == "" {
			continue
		}

		// Handle HTML links first
		if itemType == 'h' && strings.HasPrefix(selector, "URL:") {
			if url := strings.TrimSpace(selector[4:]); url != "" {
				links = append(links, url)
			}
			continue
		}

		// For gopher links, build the URL carefully
		var url strings.Builder

		// Protocol and host:port
		url.WriteString("gopher://")
		url.WriteString(host)
		url.WriteString(":")
		if len(parts) > 3 && strings.TrimSpace(parts[3]) != "" {
			url.WriteString(strings.TrimSpace(parts[3]))
		} else {
			url.WriteString("70")
		}

		// Path: always /type + selector
		url.WriteString("/")
		url.WriteString(string(itemType))
		if strings.HasPrefix(selector, "/") {
			url.WriteString(selector)
		} else {
			url.WriteString("/" + selector)
		}

		links = append(links, url.String())
	}

	return links
}
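The two functions above are halves of one convention: getGopherPageLinks embeds the item type in the URL path, and constructPayloadFromPath strips it again before the bare selector is sent to the server. A standalone sketch of that round trip (stripItemType is a renamed inline copy for illustration, not a crawler symbol):

```go
package main

import (
    "fmt"
    "regexp"
    "strings"
)

// stripItemType mirrors constructPayloadFromPath above: the crawler
// embeds the Gopher item type in the URL path ("/1/about") and strips
// it again before sending the selector ("/about") to the server.
func stripItemType(urlpath string) string {
    re := regexp.MustCompile(`^/[\w]/.*`)
    payload := urlpath
    if re.MatchString(urlpath) {
        payload = strings.Join(strings.Split(urlpath, "/")[2:], "/")
    }
    if !strings.HasPrefix(payload, "/") {
        payload = "/" + payload
    }
    return payload
}

func main() {
    // Menu line "1About\t/about\texample.com\t70" becomes gopher://example.com:70/1/about;
    // when that URL is visited, only the selector is sent:
    fmt.Println(stripItemType("/1/about")) // -> /about
    fmt.Println(stripItemType("/about"))   // unchanged -> /about
}
```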
205
gopher/network_context.go
Normal file
@@ -0,0 +1,205 @@
package gopher

import (
    "context"
    "errors"
    "fmt"
    "io"
    "net"
    stdurl "net/url"
    "time"
    "unicode/utf8"

    "gemini-grc/common/contextlog"
    commonErrors "gemini-grc/common/errors"
    "gemini-grc/common/linkList"
    "gemini-grc/common/snapshot"
    "gemini-grc/common/text"
    _url "gemini-grc/common/url"
    "gemini-grc/config"
    "gemini-grc/contextutil"
    "git.antanst.com/antanst/logging"
    "git.antanst.com/antanst/xerrors"
    "github.com/guregu/null/v5"
)

// VisitWithContext is a context-aware version of Visit that visits
// a given URL using the Gopher protocol. It uses the context for
// cancellation, timeout, and logging.
func VisitWithContext(ctx context.Context, url string) (*snapshot.Snapshot, error) {
    // Create a gopher-specific context with the "gopher" component
    gopherCtx := contextutil.ContextWithComponent(ctx, "gopher")

    if !config.CONFIG.GopherEnable {
        contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Gopher protocol is disabled")
        return nil, nil
    }

    s, err := snapshot.SnapshotFromURL(url, true)
    if err != nil {
        contextlog.LogErrorWithContext(gopherCtx, logging.GetSlogger(), "Failed to create snapshot from URL: %v", err)
        return nil, err
    }

    // Check if the context is canceled
    if err := ctx.Err(); err != nil {
        return nil, err
    }

    data, err := connectAndGetDataWithContext(gopherCtx, url)
    if err != nil {
        contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Error: %s", err.Error())
        if IsGopherError(err) || commonErrors.IsHostError(err) {
            s.Error = null.StringFrom(err.Error())
            return s, nil
        }
        return nil, err
    }

    // Check if the context is canceled
    if err := ctx.Err(); err != nil {
        return nil, err
    }

    isValidUTF8 := utf8.ValidString(string(data))
    if isValidUTF8 {
        s.GemText = null.StringFrom(text.RemoveNullChars(string(data)))
        contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Response is valid UTF-8 text (%d bytes)", len(data))
    } else {
        s.Data = null.ValueFrom(data)
        contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Response is binary data (%d bytes)", len(data))
    }

    if !isValidUTF8 {
        return s, nil
    }

    responseError := checkForError(string(data))
    if responseError != nil {
        contextlog.LogErrorWithContext(gopherCtx, logging.GetSlogger(), "Gopher server returned error: %v", responseError)
        s.Error = null.StringFrom(responseError.Error())
        return s, nil
    }

    // Extract links from the response
    links := getGopherPageLinks(string(data))
    linkURLs := linkList.LinkList(make([]_url.URL, len(links)))

    for i, link := range links {
        linkURL, err := _url.ParseURL(link, "", true)
        if err == nil {
            linkURLs[i] = *linkURL
        }
    }

    if len(links) != 0 {
        s.Links = null.ValueFrom(linkURLs)
        contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Found %d links in gopher page", len(links))
    }

    contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Successfully visited Gopher URL: %s", url)
    return s, nil
}

// connectAndGetDataWithContext is a context-aware version of connectAndGetData
func connectAndGetDataWithContext(ctx context.Context, url string) ([]byte, error) {
    parsedURL, err := stdurl.Parse(url)
    if err != nil {
        return nil, xerrors.NewError(fmt.Errorf("error parsing URL: %w", err), 0, "", false)
    }

    hostname := parsedURL.Hostname()
    port := parsedURL.Port()
    if port == "" {
        port = "70"
    }
    host := fmt.Sprintf("%s:%s", hostname, port)

    // Use the context's deadline if it has one, otherwise use the config timeout
    var timeoutDuration time.Duration
    deadline, ok := ctx.Deadline()
    if ok {
        timeoutDuration = time.Until(deadline)
    } else {
        timeoutDuration = time.Duration(config.CONFIG.ResponseTimeout) * time.Second
    }

    // Check if the context is canceled
    if err := ctx.Err(); err != nil {
        return nil, err
    }

    contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Dialing %s", host)

    // Establish the underlying TCP connection with context-based cancellation
    dialer := &net.Dialer{
        Timeout: timeoutDuration,
    }

    // Use DialContext to allow cancellation via context
    conn, err := dialer.DialContext(ctx, "tcp", host)
    if err != nil {
        contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to connect: %v", err)
        return nil, commonErrors.NewHostError(err)
    }

    // Make sure we always close the connection
    defer func() {
        _ = conn.Close()
    }()

    // Set read and write timeouts on the TCP connection
    err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
    if err != nil {
        return nil, commonErrors.NewHostError(err)
    }
    err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
    if err != nil {
        return nil, commonErrors.NewHostError(err)
    }

    // We read `buf`-sized chunks and append the data to `data`
    buf := make([]byte, 4096)
    var data []byte

    // Check if the context is canceled before sending the request
    if err := ctx.Err(); err != nil {
        return nil, err
    }

    // Send the Gopher request to trigger the server response
    payload := constructPayloadFromPath(parsedURL.Path)
    contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Sending request with payload: %s", payload)
    _, err = conn.Write([]byte(fmt.Sprintf("%s\r\n", payload)))
    if err != nil {
        contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to send request: %v", err)
        return nil, commonErrors.NewHostError(err)
    }

    // Read response bytes in len(buf) byte chunks
    for {
        // Check if the context is canceled before each read
        if err := ctx.Err(); err != nil {
            return nil, err
        }

        n, err := conn.Read(buf)
        if n > 0 {
            data = append(data, buf[:n]...)
        }
        if err != nil {
            if errors.Is(err, io.EOF) {
                break
            }
            contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Error reading data: %v", err)
            return nil, commonErrors.NewHostError(err)
        }
        if len(data) > config.CONFIG.MaxResponseSize {
            contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Response too large (max: %d bytes)", config.CONFIG.MaxResponseSize)
            return nil, commonErrors.NewHostError(fmt.Errorf("response exceeded max"))
        }
    }

    contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Received %d bytes", len(data))
    return data, nil
}
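A minimal sketch of driving this entry point with a deadline. The gopher.floodgap.com URL is only an example, and the config assignment assumes the remaining defaults are usable:

```go
package main

import (
    "context"
    "fmt"
    "time"

    "gemini-grc/config"
    "gemini-grc/gopher"
)

func main() {
    // VisitWithContext returns nil, nil when Gopher support is disabled.
    config.CONFIG.GopherEnable = true

    // The deadline flows into connectAndGetDataWithContext and bounds
    // dialing, the request write, and every read.
    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()

    s, err := gopher.VisitWithContext(ctx, "gopher://gopher.floodgap.com:70/1/")
    if err != nil {
        panic(err) // non-host, non-gopher error
    }
    if s == nil {
        return
    }
    if s.Error.Valid {
        fmt.Println("host or protocol error recorded in snapshot:", s.Error.String)
        return
    }
    fmt.Println("links found:", len(s.Links.ValueOrZero()))
}
```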
298
gopher/network_test.go
Normal file
@@ -0,0 +1,298 @@
package gopher

import (
    "net"
    "testing"

    commonErrors "gemini-grc/common/errors"
    "gemini-grc/config"
    "github.com/stretchr/testify/assert"
)

func TestConstructPayloadFromPath(t *testing.T) {
    tests := []struct {
        name     string
        input    string
        expected string
    }{
        {
            name:     "Path with Gopher item type",
            input:    "/1/path/to/resource",
            expected: "/path/to/resource",
        },
        {
            name:     "Path with different item type",
            input:    "/0/another/path",
            expected: "/another/path",
        },
        {
            name:     "Path without item type but with leading slash",
            input:    "/simple/path",
            expected: "/simple/path",
        },
        {
            name:     "Path without item type and without leading slash",
            input:    "no/leading/slash",
            expected: "/no/leading/slash",
        },
        {
            name:     "Empty path",
            input:    "",
            expected: "/",
        },
        {
            name:     "Single character item type",
            input:    "/h/homepage",
            expected: "/homepage",
        },
        {
            name:     "Single slash",
            input:    "/",
            expected: "/",
        },
        {
            name:     "Item type-looking path",
            input:    "/1",
            expected: "/1",
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            result := constructPayloadFromPath(tt.input)
            if result != tt.expected {
                t.Errorf("constructPayloadFromPath(%q) = %q, want %q",
                    tt.input, result, tt.expected)
            }
        })
    }
}

func TestParseLinks(t *testing.T) {
    tests := []struct {
        name       string
        currentURL string
        input      string
        want       int // number of expected links
        wantErr    bool
    }{
        {
            name:       "Empty input",
            currentURL: "gopher://example.com:70",
            input:      "",
            want:       0,
            wantErr:    false,
        },
        {
            name:       "Single directory link",
            currentURL: "gopher://example.com:70",
            input:      "1About Us\t/about\texample.com",
            want:       1,
            wantErr:    false,
        },
        {
            name:       "Single text file link",
            currentURL: "gopher://example.com:70",
            input:      "0README\t/readme.txt\texample.com",
            want:       1,
            wantErr:    false,
        },
        {
            name:       "Multiple links of different types",
            currentURL: "gopher://example.com:70",
            input:      "1About Us\t/about\texample.com\n0README\t/readme.txt\texample.com\n1Contact\t/contact\texample.com",
            want:       3,
            wantErr:    false,
        },
        {
            name:       "Ignore non-linkable types",
            currentURL: "gopher://example.com:70",
            input:      "iInfo line\t/info\texample.com\n1Directory\t/dir\texample.com\n0Text\t/text.txt\texample.com",
            want:       2,
            wantErr:    false,
        },
        {
            name:       "Malformed lines",
            currentURL: "gopher://example.com:70",
            input:      "1Incomplete line\n0No tabs\n1Missing parts\t",
            want:       0,
            wantErr:    false,
        },
        {
            name:       "Mixed valid and invalid lines",
            currentURL: "gopher://example.com:70",
            input:      "1Valid link\t/valid\texample.com\n1Incomplete\t\n0Text file\t/text.txt\texample.com\n1Another valid\t/another\texample.com",
            want:       3,
            wantErr:    false,
        },
        {
            name:       "Absolute URLs",
            currentURL: "gopher://example.com:70",
            input:      "1External link\tgopher://external.com/path\texternal.com\n0Document\tgopher://other.com/doc.txt\tother.com",
            want:       2,
            wantErr:    false,
        },
        {
            name:       "With whitespace",
            currentURL: "gopher://example.com:70",
            input:      " 1Padded line \t/padded\texample.com\n0Text file \t/doc.txt\texample.com",
            want:       2,
            wantErr:    false,
        },
        {
            name:       "Special characters in paths",
            currentURL: "gopher://example.com:70",
            input:      "1Special chars\t/path with spaces\texample.com\n0Doc\t/über/päth.txt\texample.com",
            want:       2,
            wantErr:    false,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            got := getGopherPageLinks(tt.input)
            assert.Equal(t, tt.want, len(got), "expected %d links, got %d", tt.want, len(got))
        })
    }
}

func TestCheckForError(t *testing.T) {
    tests := []struct {
        name        string
        input       string
        wantError   bool
        errorPrefix string
    }{
        {
            name:        "No error",
            input:       "1Directory\t/dir\texample.com\n0Text\t/text.txt\texample.com",
            wantError:   false,
            errorPrefix: "",
        },
        {
            name:        "Simple error message",
            input:       "3Error: File not found\t\texample.com",
            wantError:   true,
            errorPrefix: "gopher error: 3Error: File not found",
        },
        {
            name:        "Error with multiple tabs",
            input:       "3File not found\t/error\texample.com\t70",
            wantError:   true,
            errorPrefix: "gopher error: 3File not found",
        },
        {
            name: "Error among valid entries",
            input: `1Welcome\t/welcome\texample.com
3Access denied\t\texample.com
0README\t/readme.txt\texample.com`,
            wantError:   false,
            errorPrefix: "",
        },
        {
            name:        "Error with no tabs",
            input:       "3Server is down for maintenance",
            wantError:   true,
            errorPrefix: "gopher error: 3Server is down for maintenance",
        },
        {
            name: "Multiple errors (should return first)",
            input: `3First error\t\texample.com
3Second error\t\texample.com`,
            wantError:   true,
            errorPrefix: "gopher error: 3First error",
        },
        {
            name:        "Error with whitespace",
            input:       " 3 Error with spaces \t\texample.com",
            wantError:   true,
            errorPrefix: "gopher error: 3 Error with spaces",
        },
        {
            name:        "Empty input",
            input:       "",
            wantError:   false,
            errorPrefix: "",
        },
        {
            name:        "Just newlines",
            input:       "\n\n\n",
            wantError:   false,
            errorPrefix: "",
        },
        {
            name: "Error after empty lines",
            input: `

3Error after empty lines\t\texample.com`,
            wantError:   true,
            errorPrefix: "gopher error: 3Error after empty lines",
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            err := checkForError(tt.input)

            if !tt.wantError {
                assert.NoError(t, err)
                return
            }

            assert.Error(t, err)
            assert.Contains(t, err.Error(), tt.errorPrefix)
        })
    }
}

func TestConnectAndGetDataTimeout(t *testing.T) {
    // Start a test server that doesn't respond
    listener, err := net.Listen("tcp", "localhost:0")
    if err != nil {
        t.Fatalf("Failed to start listener: %v", err)
    }
    defer listener.Close()

    // Accept the connection but don't respond
    go func() {
        conn, err := listener.Accept()
        if err != nil {
            t.Logf("Failed to accept connection: %v", err)
            return
        }
        defer conn.Close()

        // Keep the connection open without sending any data to simulate a timeout
        select {}
    }()

    // Construct the URL of our test server
    address := listener.Addr().String()
    testURL := "gopher://" + address + "/testpath"

    // Save original config values
    originalTimeout := config.CONFIG.ResponseTimeout
    originalMaxSize := config.CONFIG.MaxResponseSize

    // Set test config values
    config.CONFIG.ResponseTimeout = 1    // Set a very short timeout for this test
    config.CONFIG.MaxResponseSize = 1024 // Just for consistency, we won't reach this

    // Test the function
    _, err = connectAndGetData(testURL)

    // Reset config values
    config.CONFIG.ResponseTimeout = originalTimeout
    config.CONFIG.MaxResponseSize = originalMaxSize

    // Check if the error is due to timeout
    if err == nil {
        t.Error("Expected an error due to timeout, but got no error")
    } else if !commonErrors.IsHostError(err) {
        t.Errorf("Expected a HostError, but got: %v", err)
    } else {
        // Here you might want to check if the specific error message contains 'timeout'
        // However, since we don't have the exact error string, we're checking the type
        t.Logf("Successfully timed out: %v", err)
    }
}
75
hostPool/hostPool.go
Normal file
@@ -0,0 +1,75 @@
package hostPool

import (
    "context"
    "math/rand"
    "sync"
    "time"

    "gemini-grc/common/contextlog"
    "gemini-grc/contextutil"
    "git.antanst.com/antanst/logging"
    "git.antanst.com/antanst/xerrors"
)

var hostPool = HostPool{hostnames: make(map[string]struct{})}

type HostPool struct {
    hostnames map[string]struct{}
    lock      sync.RWMutex
}

// RemoveHostFromPool removes a host from the pool with context awareness
func RemoveHostFromPool(ctx context.Context, key string) {
    hostCtx := contextutil.ContextWithComponent(ctx, "hostPool")
    hostPool.lock.Lock()
    delete(hostPool.hostnames, key)
    hostPool.lock.Unlock()

    // Add some jitter
    time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond)
    contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Host %s removed from pool", key)
}

// AddHostToHostPool adds a host to the host pool with context awareness.
// Blocks until the host is added or the context is canceled.
func AddHostToHostPool(ctx context.Context, key string) error {
    // Create a hostPool-specific context
    hostCtx := contextutil.ContextWithComponent(ctx, "hostPool")

    // Use a ticker to periodically check if we can add the host
    ticker := time.NewTicker(500 * time.Millisecond)
    defer ticker.Stop()

    // We continuously poll the pool, and if the host
    // isn't already there, we add it.
    for {
        // Check if context is done before attempting to acquire the lock
        select {
        case <-ctx.Done():
            return xerrors.NewSimpleError(ctx.Err())
        default:
            // Continue with the attempt to add the host
        }

        hostPool.lock.Lock()
        _, exists := hostPool.hostnames[key]
        if !exists {
            hostPool.hostnames[key] = struct{}{}
            hostPool.lock.Unlock()
            contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Added host %s to pool", key)
            return nil
        }
        hostPool.lock.Unlock()

        // Wait for the next tick or context cancellation
        select {
        case <-ticker.C:
            // Try again on next tick
        case <-ctx.Done():
            contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Context canceled while waiting for host %s", key)
            return xerrors.NewSimpleError(ctx.Err())
        }
    }
}
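Intended usage, sketched: a worker claims the host before visiting a URL so that only one worker talks to a given host at a time, then releases it. visitOneURL is a hypothetical stand-in, not a function from this codebase:

```go
package main

import (
    "context"
    "time"

    "gemini-grc/hostPool"
)

// visitOneURL is a hypothetical stand-in for the real worker logic.
func visitOneURL(ctx context.Context, url string) error {
    _ = url
    return nil
}

// crawlPolitely claims the host, visits the URL, and releases the host.
// If every worker waiting on the same host is canceled, AddHostToHostPool
// returns the context error instead of blocking forever.
func crawlPolitely(ctx context.Context, host, url string) error {
    ctx, cancel := context.WithTimeout(ctx, 60*time.Second)
    defer cancel()

    if err := hostPool.AddHostToHostPool(ctx, host); err != nil {
        return err // canceled or timed out while waiting for the host
    }
    defer hostPool.RemoveHostFromPool(ctx, host)

    return visitOneURL(ctx, url)
}

func main() {
    _ = crawlPolitely(context.Background(), "example.com:70", "gopher://example.com:70/1/")
}
```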
@@ -1,23 +0,0 @@
package logging

import (
    "fmt"

    zlog "github.com/rs/zerolog/log"
)

func LogDebug(format string, args ...interface{}) {
    zlog.Debug().Msg(fmt.Sprintf(format, args...))
}

func LogInfo(format string, args ...interface{}) {
    zlog.Info().Msg(fmt.Sprintf(format, args...))
}

func LogWarn(format string, args ...interface{}) {
    zlog.Warn().Msg(fmt.Sprintf(format, args...))
}

func LogError(format string, args ...interface{}) {
    zlog.Error().Err(fmt.Errorf(format, args...)).Msg("")
}
57
main.go
@@ -1,57 +0,0 @@
package main

import (
    main2 "gemini-grc/db"
    "os"
    "os/signal"
    "syscall"

    "gemini-grc/config"
    "gemini-grc/gemini"
    "gemini-grc/logging"
    "github.com/jmoiron/sqlx"
    "github.com/rs/zerolog"
    zlog "github.com/rs/zerolog/log"
)

func main() {
    config.CONFIG = *config.GetConfig()
    zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
    zerolog.SetGlobalLevel(config.CONFIG.LogLevel)
    zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
    if err := runApp(); err != nil {
        logging.LogError("Application error: %w", err)
        os.Exit(1)
    }
}

func runApp() error {
    logging.LogInfo("Starting up. Press Ctrl+C to exit")
    signals := make(chan os.Signal, 1)
    signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)

    db := main2.ConnectToDB()

    defer func(db *sqlx.DB) {
        err := db.Close()
        if err != nil {
            // TODO properly log & handle error
            panic(err)
        }
    }(db)

    gemini.LoadBlacklist()

    // If there's an argument, visit this
    // URL only and don't spawn other workers
    if len(os.Args) > 1 {
        url := os.Args[1]
        go gemini.RunWorkerWithTx(0, db, &url)
    } else {
        go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
    }

    <-signals
    logging.LogWarn("Received SIGINT or SIGTERM signal, exiting")
    return nil
}
28
misc/sql/README.md
Normal file
@@ -0,0 +1,28 @@
# SQL Queries for Snapshot Analysis

This directory contains SQL queries to analyze snapshot data in the gemini-grc database.

## Usage

You can run these queries directly from psql using the `\i` directive:

```
\i misc/sql/snapshots_per_url.sql
```

## Available Queries

- **snapshots_per_url.sql** - Basic count of snapshots per URL
- **snapshots_date_range.sql** - Shows snapshot count with date range information for each URL
- **host_snapshot_stats.sql** - Groups snapshots by host and shows URLs with multiple snapshots
- **content_changes.sql** - Finds URLs with the most content changes between consecutive snapshots
- **snapshot_distribution.sql** - Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots)
- **recent_snapshot_activity.sql** - Shows URLs with the most snapshots in the last 7 days
- **storage_efficiency.sql** - Shows potential storage savings from deduplication
- **snapshots_by_timeframe.sql** - Shows snapshot count by timeframe (day, week, month)

## Notes

- These queries are designed to work with PostgreSQL and the gemini-grc database schema
- Some queries may be resource-intensive on large databases
- The results can help optimize storage and understand the effectiveness of the versioned snapshot feature
19
misc/sql/cleanup_duplicate_snapshots.sql
Normal file
@@ -0,0 +1,19 @@
WITH snapshot_rankings AS (
    SELECT
        id,
        url,
        ROW_NUMBER() OVER (
            PARTITION BY url
            ORDER BY
                CASE WHEN (gemtext IS NOT NULL AND gemtext != '') OR data IS NOT NULL
                     THEN 0 ELSE 1 END,
                timestamp DESC
        ) as rn
    FROM snapshots
)
DELETE FROM snapshots
WHERE id IN (
    SELECT id
    FROM snapshot_rankings
    WHERE rn > 1
);
26
misc/sql/content_changes.sql
Normal file
@@ -0,0 +1,26 @@
-- File: content_changes.sql
-- Finds URLs with the most content changes between consecutive snapshots
-- Usage: \i misc/sql/content_changes.sql

WITH snapshot_changes AS (
    SELECT
        s1.url,
        s1.timestamp as prev_timestamp,
        s2.timestamp as next_timestamp,
        s1.gemtext IS DISTINCT FROM s2.gemtext as gemtext_changed,
        s1.data IS DISTINCT FROM s2.data as data_changed
    FROM snapshots s1
    JOIN snapshots s2 ON s1.url = s2.url AND s1.timestamp < s2.timestamp
    WHERE NOT EXISTS (
        SELECT 1 FROM snapshots s3
        WHERE s3.url = s1.url AND s1.timestamp < s3.timestamp AND s3.timestamp < s2.timestamp
    )
)
SELECT
    url,
    COUNT(*) + 1 as snapshot_count,
    SUM(CASE WHEN gemtext_changed OR data_changed THEN 1 ELSE 0 END) as content_changes
FROM snapshot_changes
GROUP BY url
HAVING COUNT(*) + 1 > 1
ORDER BY content_changes DESC, snapshot_count DESC;
30
misc/sql/crawl_top_level.sql
Normal file
@@ -0,0 +1,30 @@
BEGIN;

WITH matching_urls AS (
    SELECT url, host
    FROM snapshots
    WHERE url ~ '^gemini://[^/]+/$'
      AND timestamp < (NOW() - INTERVAL '1 week')
    ORDER BY random()
    LIMIT 500
)
INSERT INTO urls (url, host)
SELECT url, host
FROM matching_urls
ON CONFLICT DO NOTHING;

-- WITH matching_urls AS (
--     SELECT url, host
--     FROM snapshots
--     WHERE url ~ '^gemini://[^/]+/$'
--       AND timestamp < (NOW() - INTERVAL '1 week')
--     ORDER BY random()
--     LIMIT 500
-- )
-- DELETE FROM snapshots
-- WHERE url IN (
--     SELECT url
--     FROM matching_urls
-- );

COMMIT;
6
misc/sql/fetch-snapshot-history.sql
Normal file
@@ -0,0 +1,6 @@
select count(*) from snapshots
where last_crawled < now() - interval '30 days'
  and error IS NULL
  and gemtext IS NOT NULL
  and mimetype='text/gemini'
  and url ~ '^gemini://[^/]+/?$';
20
misc/sql/host_snapshot_stats.sql
Normal file
@@ -0,0 +1,20 @@
-- File: host_snapshot_stats.sql
-- Groups snapshots by host and shows URLs with multiple snapshots
-- Usage: \i misc/sql/host_snapshot_stats.sql

SELECT
    host,
    COUNT(DISTINCT url) as unique_urls,
    SUM(CASE WHEN url_count > 1 THEN 1 ELSE 0 END) as urls_with_multiple_snapshots,
    SUM(snapshot_count) as total_snapshots
FROM (
    SELECT
        host,
        url,
        COUNT(*) as snapshot_count,
        COUNT(*) OVER (PARTITION BY url) as url_count
    FROM snapshots
    GROUP BY host, url
) subquery
GROUP BY host
ORDER BY total_snapshots DESC;
||||||
46
misc/sql/initdb.sql
Normal file
46
misc/sql/initdb.sql
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
DROP TABLE IF EXISTS snapshots;
|
||||||
|
DROP TABLE IF EXISTS urls;
|
||||||
|
|
||||||
|
CREATE TABLE urls (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
url TEXT NOT NULL,
|
||||||
|
host TEXT NOT NULL,
|
||||||
|
timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
being_processed BOOLEAN
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE UNIQUE INDEX urls_url_key ON urls (url);
|
||||||
|
CREATE INDEX idx_urls_url ON urls (url);
|
||||||
|
CREATE INDEX idx_urls_timestamp ON urls (timestamp);
|
||||||
|
CREATE INDEX idx_being_processed ON urls (being_processed);
|
||||||
|
|
||||||
|
CREATE TABLE snapshots (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
url TEXT NOT NULL,
|
||||||
|
host TEXT NOT NULL,
|
||||||
|
timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
mimetype TEXT,
|
||||||
|
data BYTEA,
|
||||||
|
gemtext TEXT,
|
||||||
|
links JSONB,
|
||||||
|
lang TEXT,
|
||||||
|
response_code INTEGER,
|
||||||
|
error TEXT,
|
||||||
|
header TEXT,
|
||||||
|
last_crawled TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE UNIQUE INDEX idx_url_timestamp ON snapshots (url, timestamp);
|
||||||
|
CREATE INDEX idx_url ON snapshots (url);
|
||||||
|
CREATE INDEX idx_timestamp ON snapshots (timestamp);
|
||||||
|
CREATE INDEX idx_mimetype ON snapshots (mimetype);
|
||||||
|
CREATE INDEX idx_lang ON snapshots (lang);
|
||||||
|
CREATE INDEX idx_response_code ON snapshots (response_code);
|
||||||
|
CREATE INDEX idx_error ON snapshots (error);
|
||||||
|
CREATE INDEX idx_host ON snapshots (host);
|
||||||
|
CREATE INDEX idx_response_code_error ON snapshots (response_code, error);
|
||||||
|
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;
|
||||||
|
CREATE INDEX idx_snapshots_unprocessed ON snapshots (host) WHERE response_code IS NULL AND error IS NULL;
|
||||||
|
CREATE INDEX idx_url_latest ON snapshots (url, timestamp DESC);
|
||||||
|
CREATE INDEX idx_last_crawled ON snapshots (last_crawled);
|
||||||
|
CREATE INDEX idx_url_last_crawled ON snapshots (url, last_crawled DESC);
|
||||||
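With several rows per URL, the most common read under this schema is "latest snapshot per URL". A hedged sketch of that query from Go, using sqlx as the rest of the codebase does; the lib/pq driver and the DSN are assumptions:

```go
package main

import (
    "fmt"
    "time"

    "github.com/jmoiron/sqlx"
    _ "github.com/lib/pq" // assumed Postgres driver
)

// DISTINCT ON keeps the newest row per URL; the (url, timestamp DESC)
// index idx_url_latest lets Postgres avoid a full sort.
func main() {
    db := sqlx.MustConnect("postgres", "dbname=gemini sslmode=disable") // assumed DSN
    defer db.Close()

    var rows []struct {
        URL       string    `db:"url"`
        Timestamp time.Time `db:"timestamp"`
    }
    err := db.Select(&rows, `
        SELECT DISTINCT ON (url) url, timestamp
        FROM snapshots
        ORDER BY url, timestamp DESC
        LIMIT 10`)
    if err != nil {
        panic(err)
    }
    for _, r := range rows {
        fmt.Println(r.URL, r.Timestamp)
    }
}
```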
1
misc/sql/mark_urls_processed_false.sql
Normal file
@@ -0,0 +1 @@
update urls set being_processed=false where being_processed is true;
13
misc/sql/recent_snapshot_activity.sql
Normal file
@@ -0,0 +1,13 @@
-- File: recent_snapshot_activity.sql
-- Shows URLs with the most snapshots in the last 7 days
-- Usage: \i misc/sql/recent_snapshot_activity.sql

SELECT
    url,
    COUNT(*) as snapshot_count
FROM snapshots
WHERE timestamp > NOW() - INTERVAL '7 days'
GROUP BY url
HAVING COUNT(*) > 1
ORDER BY snapshot_count DESC
LIMIT 20;
16
misc/sql/snapshot_distribution.sql
Normal file
@@ -0,0 +1,16 @@
-- File: snapshot_distribution.sql
-- Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots)
-- Usage: \i misc/sql/snapshot_distribution.sql

WITH counts AS (
    SELECT url, COUNT(*) as snapshot_count
    FROM snapshots
    GROUP BY url
)
SELECT
    snapshot_count,
    COUNT(*) as url_count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM counts
GROUP BY snapshot_count
ORDER BY snapshot_count;
37
misc/sql/snapshots_by_timeframe.sql
Normal file
@@ -0,0 +1,37 @@
-- File: snapshots_by_timeframe.sql
-- Shows snapshot count by timeframe (day, week, month)
-- Usage: \i misc/sql/snapshots_by_timeframe.sql

WITH daily_snapshots AS (
    SELECT
        date_trunc('day', timestamp) as day,
        COUNT(*) as snapshot_count,
        COUNT(DISTINCT url) as unique_urls
    FROM snapshots
    GROUP BY day
    ORDER BY day
),
weekly_snapshots AS (
    SELECT
        date_trunc('week', timestamp) as week,
        COUNT(*) as snapshot_count,
        COUNT(DISTINCT url) as unique_urls
    FROM snapshots
    GROUP BY week
    ORDER BY week
),
monthly_snapshots AS (
    SELECT
        date_trunc('month', timestamp) as month,
        COUNT(*) as snapshot_count,
        COUNT(DISTINCT url) as unique_urls
    FROM snapshots
    GROUP BY month
    ORDER BY month
)
SELECT 'Daily' as timeframe, * FROM daily_snapshots
UNION ALL
SELECT 'Weekly' as timeframe, * FROM weekly_snapshots
UNION ALL
SELECT 'Monthly' as timeframe, * FROM monthly_snapshots
ORDER BY timeframe, day;
14
misc/sql/snapshots_date_range.sql
Normal file
@@ -0,0 +1,14 @@
-- File: snapshots_date_range.sql
-- Shows snapshot count with date range information for each URL
-- Usage: \i misc/sql/snapshots_date_range.sql

SELECT
    url,
    COUNT(*) as snapshot_count,
    MIN(timestamp) as first_snapshot,
    MAX(timestamp) as last_snapshot,
    MAX(timestamp) - MIN(timestamp) as time_span
FROM snapshots
GROUP BY url
HAVING COUNT(*) > 1
ORDER BY snapshot_count DESC;
8
misc/sql/snapshots_per_url.sql
Normal file
@@ -0,0 +1,8 @@
-- File: snapshots_per_url.sql
-- Basic count of snapshots per URL
-- Usage: \i misc/sql/snapshots_per_url.sql

SELECT url, COUNT(*) as snapshot_count
FROM snapshots
GROUP BY url
ORDER BY snapshot_count DESC;
20
misc/sql/storage_efficiency.sql
Normal file
@@ -0,0 +1,20 @@
-- File: storage_efficiency.sql
-- Shows potential storage savings from deduplication
-- Usage: \i misc/sql/storage_efficiency.sql

WITH duplicate_stats AS (
    SELECT
        url,
        COUNT(*) as snapshot_count,
        COUNT(DISTINCT gemtext) as unique_gemtexts,
        COUNT(DISTINCT data) as unique_datas
    FROM snapshots
    GROUP BY url
    HAVING COUNT(*) > 1
)
SELECT
    SUM(snapshot_count) as total_snapshots,
    SUM(unique_gemtexts + unique_datas) as unique_contents,
    SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas) as duplicate_content_count,
    ROUND((SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas)) * 100.0 / SUM(snapshot_count), 2) as duplicate_percentage
FROM duplicate_stats;
73
robotsMatch/robots.go
Normal file
@@ -0,0 +1,73 @@
package robotsMatch

import (
    "context"
    "fmt"
    "strings"

    "gemini-grc/common/contextlog"
    "gemini-grc/contextutil"
    "git.antanst.com/antanst/logging"
)

// ParseRobotsTxt takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This is the legacy version without context support.
// TODO Also take the user agent into account?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
    // Call the context-aware version with a background context
    return ParseRobotsTxtWithContext(context.Background(), content, host)
}

// ParseRobotsTxtWithContext takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't be visited.
// This version supports context for logging.
// TODO Also take the user agent into account?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxtWithContext(ctx context.Context, content string, host string) []string {
    // Create a context for robots.txt parsing
    parseCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.parser")

    var disallowedPaths []string
    for _, line := range strings.Split(content, "\n") {
        line = strings.TrimSpace(line)
        line = strings.ToLower(line)
        if strings.HasPrefix(line, "disallow:") {
            parts := strings.SplitN(line, ":", 2)
            if len(parts) == 2 {
                path := strings.TrimSpace(parts[1])
                if path != "" {
                    // Construct the full Gemini URL
                    var fullURL string

                    // Handle the case where the path is already a full URL
                    if strings.HasPrefix(path, "gemini://") {
                        // Extract just the path from the full URL
                        urlParts := strings.SplitN(path, "/", 4)
                        if len(urlParts) >= 4 {
                            // Get the path part (everything after the domain)
                            pathPart := "/" + urlParts[3]
                            fullURL = fmt.Sprintf("gemini://%s%s", host, pathPart)
                        } else {
                            // If it's just a domain without a path, use the root path
                            fullURL = fmt.Sprintf("gemini://%s/", host)
                        }
                    } else {
                        // It's a relative path, just add it to the host
                        if !strings.HasPrefix(path, "/") {
                            path = "/" + path
                        }
                        fullURL = fmt.Sprintf("gemini://%s%s", host, path)
                    }

                    disallowedPaths = append(disallowedPaths, fullURL)

                    // Additional logging to debug robots.txt parsing
                    contextlog.LogDebugWithContext(parseCtx, logging.GetSlogger(), "Added robots.txt disallow rule: %s from original: %s", fullURL, path)
                }
            }
        }
    }
    return disallowedPaths
}
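A quick sketch of the transformation this parser performs; both relative and slash-less paths come out as full gemini:// URLs on the given host (note that the parser lowercases each line before matching):

```go
package main

import (
    "fmt"

    "gemini-grc/robotsMatch"
)

func main() {
    robots := "User-agent: *\nDisallow: /cgi-bin\nDisallow: private/"
    for _, u := range robotsMatch.ParseRobotsTxt(robots, "example.com") {
        fmt.Println(u)
    }
    // Output:
    // gemini://example.com/cgi-bin
    // gemini://example.com/private/
}
```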
161
robotsMatch/robotsMatch.go
Normal file
@@ -0,0 +1,161 @@
package robotsMatch

import (
    "context"
    "errors"
    "fmt"
    "strings"
    "sync"

    "gemini-grc/common/contextlog"
    "gemini-grc/common/snapshot"
    geminiUrl "gemini-grc/common/url"
    "gemini-grc/config"
    "gemini-grc/contextutil"
    "gemini-grc/gemini"
    "git.antanst.com/antanst/logging"
)

// RobotsCache is a map of robots.txt rules.
// key: host:port (lowercase)
// value: []string list of disallowed URLs
// If a key has no blocked URLs, an empty
// list is stored for caching.
var RobotsCache sync.Map //nolint:gochecknoglobals

func populateRobotsCache(ctx context.Context, key string) (entries []string, _err error) {
    // Create a context for robots cache population
    cacheCtx := contextutil.ContextWithComponent(ctx, "robotsCache")

    // We either store an empty list when there
    // are no rules, or a list of disallowed URLs.
    // This applies even if we have an error
    // finding/downloading robots.txt
    defer func() {
        RobotsCache.Store(key, entries)
    }()

    url := fmt.Sprintf("gemini://%s/robots.txt", key)
    contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Fetching robots.txt from %s", url)

    // Use the context-aware version to honor timeout and cancellation
    robotsContent, err := gemini.ConnectAndGetData(cacheCtx, url)
    if err != nil {
        // Check for context timeout or cancellation specifically
        if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
            contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Timeout or cancellation while fetching robots.txt: %v", err)
            // Don't cache the result on timeout, to allow retrying later
            return []string{}, err
        }
        // For other errors, we store an empty list for this host
        // to avoid continually hitting it
        contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Failed to get robots.txt: %v", err)
        RobotsCache.Store(key, []string{})
        return []string{}, err
    }

    s, err := snapshot.SnapshotFromURL(url, true)
    if err != nil {
        contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Failed to create snapshot from URL: %v", err)
        return []string{}, nil
    }

    s = gemini.UpdateSnapshotWithData(*s, robotsContent)

    if s.ResponseCode.ValueOrZero() != 20 {
        contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
        return []string{}, nil
    }

    // Some servers return text/plain, others text/gemini.
    // According to the spec, the first is correct,
    // but let's be lenient.
    var data string
    switch {
    case s.MimeType.ValueOrZero() == "text/plain":
        data = string(s.Data.ValueOrZero())
    case s.MimeType.ValueOrZero() == "text/gemini":
        data = s.GemText.ValueOrZero()
    default:
        contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Unsupported mime type: %s", s.MimeType.ValueOrZero())
        return []string{}, nil
    }

    entries = ParseRobotsTxtWithContext(ctx, data, key)
    return entries, nil
}

// RobotMatch checks if the given URL matches
// a robots.txt disallow rule.
func RobotMatch(ctx context.Context, u string) bool {
    // Create a context for robots operations
    robotsCtx := contextutil.ContextWithComponent(ctx, "robotsMatch")

    // TODO Missing Gopher functionality
    if config.CONFIG.GopherEnable {
        return false
    }

    url, err := geminiUrl.ParseURL(u, "", true)
    if err != nil {
        return false
    }

    key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
    contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Checking robots.txt for URL: %s with host key: %s", u, key)

    var disallowedURLs []string
    cacheEntries, ok := RobotsCache.Load(key)
    if !ok {
        // First time check, populate the robots cache
        contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "No robots.txt cache for %s, fetching...", key)
        var fetchErr error
        disallowedURLs, fetchErr = populateRobotsCache(ctx, key)
        if fetchErr != nil {
            return false
        }
        if len(disallowedURLs) > 0 {
            contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Added to robots.txt cache: %v => %v", key, disallowedURLs)
        } else {
            contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "No disallowed paths found in robots.txt for %s", key)
        }
    } else {
        var ok bool
        disallowedURLs, ok = cacheEntries.([]string)
        if !ok {
            contextlog.LogErrorWithContext(robotsCtx, logging.GetSlogger(), "Invalid type in robots.txt cache for %s", key)
            disallowedURLs = []string{} // Use empty list as fallback
        }
        contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Found %d disallowed paths in robots.txt cache for %s", len(disallowedURLs), key)
    }
    return isURLblocked(ctx, disallowedURLs, url.Full)
}

// Initialize initializes the robots.txt match package
func Initialize() error {
    logging.LogDebug("Initializing robotsMatch package")
    return nil
}

// Shutdown cleans up the robots.txt match package
func Shutdown() error {
    logging.LogDebug("Shutting down robotsMatch package")
    return nil
}

func isURLblocked(ctx context.Context, disallowedURLs []string, input string) bool {
    // Create a context for URL blocking checks
    blockCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.isURLblocked")

    inputLower := strings.ToLower(input)

    for _, url := range disallowedURLs {
        urlLower := strings.ToLower(url)
        if strings.HasPrefix(inputLower, urlLower) {
            contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "MATCH! robots.txt rule: %s blocks URL: %s", url, input)
            return true
        }
    }
    contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "No robots.txt rules matched URL: %s", input)
    return false
}
40
robotsMatch/robotsMatch_test.go
Normal file
@@ -0,0 +1,40 @@
package robotsMatch

import (
    "context"
    "sync"
    "testing"

    "gemini-grc/config"
)

func TestInitializeShutdown(t *testing.T) {
    err := Initialize()
    if err != nil {
        t.Errorf("Initialize() failed: %v", err)
    }

    err = Shutdown()
    if err != nil {
        t.Errorf("Shutdown() failed: %v", err)
    }
}

func TestRobotMatch_EmptyCache(t *testing.T) {
    // This test doesn't actually connect to gemini URLs, due to the complexity
    // of mocking the gemini client; it tests the caching behavior when no
    // robots.txt is found (the empty cache case).
    config.CONFIG.ResponseTimeout = 5

    // Clear the cache before testing
    RobotsCache = sync.Map{}

    // For an empty cache or DNS errors, RobotMatch should return false (allow the URL) without an error
    ctx := context.Background()
    blocked := RobotMatch(ctx, "gemini://nonexistent.example.com/")

    // The URL should be allowed (not blocked) when robots.txt can't be fetched
    if blocked {
        t.Errorf("Expected URL to be allowed when robots.txt can't be fetched")
    }
}
@@ -1,6 +1,7 @@
-package gemini
+package robotsMatch

 import (
+	"context"
 	"reflect"
 	"testing"
 )
@@ -44,12 +45,13 @@ func TestIsURLblocked(t *testing.T) {
 		"gemini://example.com/cgi-bin/wp.cgi/media",
 		"gemini://example.com/admin/",
 	}
+	ctx := context.Background()
 	url := "gemini://example.com/admin/index.html"
-	if !isURLblocked(disallowedURLs, url) {
+	if !isURLblocked(ctx, disallowedURLs, url) {
 		t.Errorf("Expected %s to be blocked", url)
 	}
 	url = "gemini://example1.com/admin/index.html"
-	if isURLblocked(disallowedURLs, url) {
+	if isURLblocked(ctx, disallowedURLs, url) {
 		t.Errorf("expected %s to not be blocked", url)
 	}
 }
10
seed_urls.txt
Normal file
@@ -0,0 +1,10 @@
gemini://geminiprotocol.net/
gemini://warmedal.se/~antenna/
gemini://skyjake.fi/~Cosmos/
gemini://gemini.circumlunar.space/capcom/
gemini://auragem.letz.dev/
gemini://gemplex.space/
gemini://kennedy.gemi.dev/
gemini://tlgs.one/
gemini://yesterday.gemlog.org/
gemini://gemini.cyberbot.space/feed.gmi
22
test.txt
Normal file
@@ -0,0 +1,22 @@
# Test redirect full url:
gemini://gemini.circumlunar.space

# Test blacklist:
gemi.dev

# Test robots disallow:
gemini://tlgs.one/search?aa

# Test TLS cert required:
gemini://astrobotany.mozz.us/app/plant

// 31 redirect
gemini://gemini.circumlunar.space

// body with null byte
gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False

// has invalid url
gemini://tlgs.one/known-hosts

// Needs SNI TLS info (our bug)
gemini://hanzbrix.pollux.casa/gemlog/20241002.gmi
14
uid/uid.go
@@ -1,14 +0,0 @@
package uid

import (
    nanoid "github.com/matoous/go-nanoid/v2"
)

func UID() string {
    // No 'o', 'O' and 'l'
    id, err := nanoid.Generate("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20)
    if err != nil {
        panic(err)
    }
    return id
}
42
util/util.go
@@ -5,14 +5,9 @@ import (
 	"encoding/json"
 	"fmt"
 	"math/big"
-	"runtime/debug"
+	"regexp"
 )

-func PrintStackAndPanic(err error) {
-	fmt.Printf("Error %s Stack trace:\n%s", err, debug.Stack())
-	panic("PANIC")
-}
-
 // SecureRandomInt returns a cryptographically secure random integer in the range [0,max).
 // Panics if max <= 0 or if there's an error reading from the system's secure
 // random number generator.
@@ -23,14 +18,45 @@ func SecureRandomInt(max int) int {
 	// Generate random number
 	n, err := rand.Int(rand.Reader, maxBig)
 	if err != nil {
-		PrintStackAndPanic(fmt.Errorf("could not generate a random integer between 0 and %d", max))
+		panic(fmt.Errorf("could not generate a random integer between 0 and %d", max))
 	}

 	// Convert back to int
 	return int(n.Int64())
 }

-func PrettyJson(data string) string {
+func PrettifyJson(data string) string {
 	marshalled, _ := json.MarshalIndent(data, "", " ")
 	return fmt.Sprintf("%s\n", marshalled)
 }
+
+// GetLinesMatchingRegex returns all lines that match the given regex
+func GetLinesMatchingRegex(input string, pattern string) []string {
+	re := regexp.MustCompile(pattern)
+	matches := re.FindAllString(input, -1)
+	return matches
+}
+
+// Filter applies a predicate function to each element in a slice and returns a new slice
+// containing only the elements for which the predicate returns true.
+// Type parameter T allows this function to work with slices of any type.
+func Filter[T any](slice []T, f func(T) bool) []T {
+	filtered := make([]T, 0)
+	for _, v := range slice {
+		if f(v) {
+			filtered = append(filtered, v)
+		}
+	}
+	return filtered
+}
+
+// Map applies a function to each element in a slice and returns a new slice
+// containing the results.
+// Type parameters T and R allow this function to work with different input and output types.
+func Map[T any, R any](slice []T, f func(T) R) []R {
+	result := make([]R, len(slice))
+	for i, v := range slice {
+		result[i] = f(v)
+	}
+	return result
+}
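A small usage sketch of the new generic helpers (the gemini-grc/util import path is assumed from the module name):

```go
package main

import (
    "fmt"
    "strings"

    "gemini-grc/util"
)

// Keep only gemini:// URLs, then map each one to its host.
func main() {
    links := []string{"gopher://a.example:70/1/", "gemini://b.example/", "gemini://c.example/x"}
    geminiOnly := util.Filter(links, func(s string) bool {
        return strings.HasPrefix(s, "gemini://")
    })
    hosts := util.Map(geminiOnly, func(s string) string {
        return strings.SplitN(strings.TrimPrefix(s, "gemini://"), "/", 2)[0]
    })
    fmt.Println(hosts) // [b.example c.example]
}
```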