Compare commits

...

18 Commits

Author SHA1 Message Date
43f2242558 Update README 2024-12-09 19:54:15 +02:00
aa4aecdc14 Preliminary web server 2024-12-09 19:54:08 +02:00
6cf507bdc9 DB scripts and migrations 2024-12-09 19:54:00 +02:00
7a36614232 Better error handling, many fixes all around 2024-12-09 19:53:15 +02:00
b52d4f6532 refactor: Update error message and remove commented JSON conversion methods 2024-11-26 09:51:25 +02:00
825c7e3391 . 2024-11-18 16:28:45 +02:00
f0452ff9f7 fix: Change URL Value method to use value receiver for database encoding 2024-11-11 16:19:07 +02:00
bb49ea8565 fix: Update URL Value method to use pointer receiver for consistency 2024-11-11 16:19:06 +02:00
a19f157c80 fix: Implement database serialization for gemini URL type 2024-11-11 16:14:59 +02:00
8b341d2ac6 refactor: Simplify URL parsing and remove JSON-related methods 2024-11-11 16:14:58 +02:00
2bb8589eb7 feat: Improve error handling with custom error types and detailed messages 2024-11-11 15:01:20 +02:00
6346c9a829 refactor: Remove default configuration constants and simplify config parsing 2024-11-11 15:01:18 +02:00
c7b0778b77 refactor: Add config validation, defaults, and unit tests 2024-11-11 14:58:09 +02:00
bea0d22c26 refactor: Replace switch statement with map-based config parsing 2024-11-11 14:56:41 +02:00
6bcc7081b2 Fix dependency on go-nanoid 2024-11-05 12:43:28 +02:00
f34ac651b7 Add Makefile and remove ad-hoc lint script 2024-11-05 12:40:30 +02:00
d5da9ac62d Better unicode conversion 2024-11-05 12:39:14 +02:00
a0563074ed Lint fixes. 2024-11-01 10:10:42 +02:00
41 changed files with 1346 additions and 558 deletions

2
.gitignore vendored
View File

@@ -1,6 +1,8 @@
.idea .idea
.goroot
**/.#* **/.#*
**/*~ **/*~
/.go
/cmd /cmd
/db/initdb.sql /db/initdb.sql
/db/*sh /db/*sh

28
Makefile Normal file
View File

@@ -0,0 +1,28 @@
# Shell used to run every recipe line.
SHELL := /usr/local/bin/oksh
export PATH := $(PATH)

# None of these targets produce a file of the same name, so declare all of
# them phony. Without this, a stray file or directory called e.g. `test`
# would make `make test` report "up to date" and skip the recipe.
.PHONY: all debug fmt lint test

# Default target: format, lint, then run the tests.
all: fmt lint test

# Print the environment and tool locations, to troubleshoot PATH problems.
debug:
	@echo "PATH: $(PATH)"
	@echo "GOPATH: $(shell go env GOPATH)"
	@which go
	@which gofumpt
	@which gci
	@which golangci-lint

# Test
test:
	go test -v ./...

# Format code
fmt:
	gofumpt -l -w .
	gci write .

# Run linter (formats first, via the fmt prerequisite)
lint: fmt
	golangci-lint run

View File

@@ -2,6 +2,8 @@
A Gemini crawler. A Gemini crawler.
URLs to visit as well as data from visited URLs are stored into "snapshots" in the database.
## Done ## Done
- [x] Concurrent downloading with workers - [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host - [x] Concurrent connection limit per host
@@ -10,22 +12,16 @@ A Gemini crawler.
- [x] Configuration via environment variables - [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL - [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation - [x] Proper response header & body UTF-8 and format validation
- [x] Follow robots.txt - [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
- [x] Handle redirects (3X status codes)
## TODO ## TODO
- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi - [ ] Better URL normalization
- [ ] Proper handling of all response codes - [ ] Provide a TLS cert for sites that require it, like Astrobotany
- [ ] Handle 3X redirects properly
- [ ] Handle URLs that need presentation of a TLS cert, like astrobotany
+ [ ] Probably have a common "grc" cert for all?
- [ ] Proper input and response validations:
+ [ ] When making a request, the URI MUST NOT exceed 1024 bytes
- [ ] Subscriptions to gemini pages? gemini://geminiprotocol.net/docs/companion/
## TODO for later ## TODO for later
- [ ] Add other protocols - [ ] Gopher
+ [ ] Gopher - [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi
+ [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi - [ ] Spartan
+ [ ] Spartan - [ ] Nex
+ [ ] Nex - [ ] SuperTXT https://supertxt.net/00-intro.html
+ [ ] SuperTXT https://supertxt.net/00-intro.html

2
blacklist.txt Normal file
View File

@@ -0,0 +1,2 @@
gemi.dev
mastogem.picasoft.net

View File

@@ -8,83 +8,149 @@ import (
"github.com/rs/zerolog" "github.com/rs/zerolog"
) )
// Environment variable names.
const (
EnvLogLevel = "LOG_LEVEL"
EnvNumWorkers = "NUM_OF_WORKERS"
EnvWorkerBatchSize = "WORKER_BATCH_SIZE"
EnvMaxResponseSize = "MAX_RESPONSE_SIZE"
EnvResponseTimeout = "RESPONSE_TIMEOUT"
EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR"
EnvBlacklistPath = "BLACKLIST_PATH"
EnvDryRun = "DRY_RUN"
)
// Config holds the application configuration loaded from environment variables.
type Config struct { type Config struct {
LogLevel zerolog.Level LogLevel zerolog.Level // Logging level (debug, info, warn, error)
rootPath string MaxResponseSize int // Maximum size of response in bytes
MaxResponseSize int NumOfWorkers int // Number of concurrent workers
NumOfWorkers int ResponseTimeout int // Timeout for responses in seconds
ResponseTimeout int WorkerBatchSize int // Batch size for worker processing
WorkerBatchSize int PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
BlacklistPath string // File that has blacklisted strings of "host:port"
DryRun bool // If false, don't write to disk
} }
var CONFIG Config var CONFIG Config //nolint:gochecknoglobals
func GetConfig() *Config { // parsePositiveInt parses and validates positive integer values.
var config Config func parsePositiveInt(param, value string) (int, error) {
for _, envVar := range []string{ val, err := strconv.Atoi(value)
"LOG_LEVEL",
"ROOT_PATH",
"NUM_OF_WORKERS",
"WORKER_BATCH_SIZE",
"MAX_RESPONSE_SIZE",
"RESPONSE_TIMEOUT",
} {
if env, ok := os.LookupEnv(envVar); !ok {
fmt.Fprintf(os.Stderr, "Missing env var %s\n", envVar)
os.Exit(1)
} else {
switch envVar {
case "LOG_LEVEL":
{
logLevel, err := zerolog.ParseLevel(env)
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "Invalid LOG_LEVEL value\n") return 0, ValidationError{
os.Exit(1) Param: param,
} Value: value,
config.LogLevel = logLevel Reason: "must be a valid integer",
}
case "ROOT_PATH":
{
config.rootPath = env
}
case "NUM_OF_WORKERS":
{
if numOfWorkers, err := strconv.Atoi(env); err != nil {
fmt.Fprintf(os.Stderr, "Invalid NUM_OF_WORKERS value\n")
os.Exit(1)
} else {
config.NumOfWorkers = numOfWorkers
} }
} }
case "WORKER_BATCH_SIZE": if val <= 0 {
{ return 0, ValidationError{
if workerBatchSize, err := strconv.Atoi(env); err != nil { Param: param,
fmt.Fprintf(os.Stderr, "Invalid WORKER_BATCH_SIZE value\n") Value: value,
os.Exit(1) Reason: "must be positive",
} else {
config.WorkerBatchSize = workerBatchSize
} }
} }
case "MAX_RESPONSE_SIZE": return val, nil
{ }
if maxResponseSize, err := strconv.Atoi(env); err != nil {
fmt.Fprintf(os.Stderr, "Invalid MAX_RESPONSE_SIZE value\n") func parseBool(param, value string) (bool, error) {
os.Exit(1) val, err := strconv.ParseBool(value)
} else { if err != nil {
config.MaxResponseSize = maxResponseSize return false, ValidationError{
} Param: param,
} Value: value,
case "RESPONSE_TIMEOUT": Reason: "cannot be converted to boolean",
{ }
if val, err := strconv.Atoi(env); err != nil { }
fmt.Fprintf(os.Stderr, "Invalid RESPONSE_TIMEOUT value\n") return val, nil
os.Exit(1) }
} else {
config.ResponseTimeout = val // GetConfig loads and validates configuration from environment variables
} func GetConfig() *Config {
} config := &Config{}
}
} // Map of environment variables to their parsing functions
} parsers := map[string]func(string) error{
return &config EnvLogLevel: func(v string) error {
level, err := zerolog.ParseLevel(v)
if err != nil {
return ValidationError{
Param: EnvLogLevel,
Value: v,
Reason: "must be one of: debug, info, warn, error",
}
}
config.LogLevel = level
return nil
},
EnvNumWorkers: func(v string) error {
val, err := parsePositiveInt(EnvNumWorkers, v)
if err != nil {
return err
}
config.NumOfWorkers = val
return nil
},
EnvWorkerBatchSize: func(v string) error {
val, err := parsePositiveInt(EnvWorkerBatchSize, v)
if err != nil {
return err
}
config.WorkerBatchSize = val
return nil
},
EnvMaxResponseSize: func(v string) error {
val, err := parsePositiveInt(EnvMaxResponseSize, v)
if err != nil {
return err
}
config.MaxResponseSize = val
return nil
},
EnvResponseTimeout: func(v string) error {
val, err := parsePositiveInt(EnvResponseTimeout, v)
if err != nil {
return err
}
config.ResponseTimeout = val
return nil
},
EnvPanicOnUnexpectedError: func(v string) error {
val, err := parseBool(EnvPanicOnUnexpectedError, v)
if err != nil {
return err
}
config.PanicOnUnexpectedError = val
return nil
},
EnvBlacklistPath: func(v string) error {
config.BlacklistPath = v
return nil
},
EnvDryRun: func(v string) error {
val, err := parseBool(EnvDryRun, v)
if err != nil {
return err
}
config.DryRun = val
return nil
},
}
// Process each environment variable
for envVar, parser := range parsers {
value, ok := os.LookupEnv(envVar)
if !ok {
fmt.Fprintf(os.Stderr, "Missing required environment variable: %s\n", envVar)
os.Exit(1)
}
if err := parser(value); err != nil {
fmt.Fprintf(os.Stderr, "Configuration error: %v\n", err)
os.Exit(1)
}
}
return config
} }

14
config/errors.go Normal file
View File

@@ -0,0 +1,14 @@
package config
import "fmt"
// ValidationError describes a configuration value that was rejected.
type ValidationError struct {
	// Param is the name of the offending parameter.
	Param string
	// Value is the raw value that was supplied.
	Value string
	// Reason is a short explanation of why Value was rejected.
	Reason string
}

// Error implements the error interface, rendering the rejected value,
// the parameter name and the reason in a single line.
func (e ValidationError) Error() string {
	const format = "invalid value '%s' for %s: %s"
	msg := fmt.Sprintf(format, e.Value, e.Param, e.Reason)
	return msg
}

View File

@@ -0,0 +1,7 @@
-- Delete the snapshots of every host that has at least one snapshot whose
-- error text starts with 'robots.txt'.
-- Only rows whose url starts with 'gemini://<host>/' are removed; the
-- extra url condition keeps the delete restricted to that URL shape.
delete FROM snapshots
WHERE host IN (
SELECT DISTINCT host
FROM snapshots
WHERE error LIKE 'robots.txt%'
)
AND url LIKE 'gemini://' || host || '/%';

5
db/error_stats.sql Normal file
View File

@@ -0,0 +1,5 @@
-- List the 20 most frequent error values stored in snapshots, most
-- frequent first.
-- count(error) only counts non-NULL values, so the group of rows without
-- an error is reported with a count of 0.
SELECT error, count(error) as count
FROM snapshots
GROUP BY error
ORDER BY count DESC
LIMIT 20;

22
db/fix-url-ports.sql Normal file
View File

@@ -0,0 +1,22 @@
-- Remove snapshots whose URL has no port number when a duplicate snapshot
-- with the default port 1965 exists.
-- A pair is matched only when the second URL is exactly the first URL with
-- ':1965' appended at the very end.
-- Before running the DELETE, verify the matches with this SELECT:
WITH duplicates AS (
SELECT s1.id as id_without_port, s2.id as id_with_port, s1.url as url_without_port, s2.url as url_with_port
FROM snapshots s1
JOIN snapshots s2
ON s2.url = s1.url || ':1965'
)
SELECT * FROM duplicates;
-- Now delete the rows without a port (the rows with ':1965' are kept):
WITH duplicates AS (
SELECT s1.id as id_without_port, s2.id as id_with_port, s1.url as url_without_port, s2.url as url_with_port
FROM snapshots s1
JOIN snapshots s2
ON s2.url = s1.url || ':1965'
)
DELETE FROM snapshots
WHERE id IN (SELECT id_without_port FROM duplicates);

View File

@@ -0,0 +1,7 @@
-- List the 10 hosts with the most snapshots that have a response code and
-- no error, largest count first.
SELECT host, COUNT(*) AS row_count
FROM snapshots
WHERE response_code IS NOT NULL
AND error IS NULL
GROUP BY host
ORDER BY row_count DESC
LIMIT 10;

View File

@@ -20,7 +20,7 @@ DROP TABLE IF EXISTS snapshots;
CREATE TABLE snapshots ( CREATE TABLE snapshots (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
uid TEXT NOT NULL UNIQUE, uid TEXT NOT NULL UNIQUE,
url TEXT NOT NULL, url TEXT NOT NULL UNIQUE,
host TEXT NOT NULL, host TEXT NOT NULL,
timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
mimetype TEXT, mimetype TEXT,
@@ -42,7 +42,10 @@ CREATE INDEX idx_lang ON snapshots (lang);
CREATE INDEX idx_response_code ON snapshots (response_code); CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error); CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host); CREATE INDEX idx_host ON snapshots (host);
CREATE INDEX unique_uid_url ON snapshots (uid, url);
CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host) CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host)
WHERE response_code IS NULL AND error IS NULL WHERE response_code IS NULL AND error IS NULL
INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang); INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang);
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL; CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;

View File

@@ -0,0 +1,18 @@
-- Step 1: Delete duplicate entries, keeping the last one based on timestamp
-- Use a CTE to mark duplicates and delete them efficiently
-- Rows are numbered per url, newest timestamp first; every row numbered
-- above 1 is deleted. Rows sharing the same url and timestamp are ordered
-- arbitrarily among themselves.
WITH ranked_snapshots AS (
SELECT
id,
url,
ROW_NUMBER() OVER(PARTITION BY url ORDER BY timestamp DESC) AS row_num
FROM
snapshots
)
DELETE FROM snapshots
USING ranked_snapshots
WHERE snapshots.id = ranked_snapshots.id
AND ranked_snapshots.row_num > 1;
-- Step 2: Add a unique constraint on the url column to prevent future duplicates
-- The constraint is named unique_url.
ALTER TABLE snapshots
ADD CONSTRAINT unique_url UNIQUE (url);

View File

@@ -2,44 +2,56 @@ package main
import ( import (
"fmt" "fmt"
"gemini-grc/gemini"
"os" "os"
"gemini-grc/gemini"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
// MANUALLY SET TO TRUE WHEN MIGRATION HAS BEEN APPLIED
func checkIfDone() bool { return true } func checkIfDone() bool { return true }
func MustSelect(tx *sqlx.Tx, dest interface{}, query string, args ...interface{}) {
if err := tx.Select(dest, query, args...); err != nil {
panic(err)
}
}
// Populates the `host` field // Populates the `host` field
func main() { func main() {
db := connectToDB()
if checkIfDone() { if checkIfDone() {
fmt.Println("Migration already applied") fmt.Println("Migration already applied")
return return
} }
count := 0 db := connectToDB()
defer db.Close()
batchSize := 1000
for { for {
// Start the transaction // Start the transaction
tx, err := db.Beginx() tx := db.MustBegin()
if err != nil { query := `SELECT * FROM snapshots WHERE url NOT LIKE '%1965%' LIMIT $1`
fmt.Println(err)
return
}
query := `
SELECT * FROM snapshots
WHERE host IS NULL
LIMIT 5000
`
var snapshots []gemini.Snapshot var snapshots []gemini.Snapshot
err = tx.Select(&snapshots, query) MustSelect(tx, &snapshots, query, batchSize)
if len(snapshots) == 0 { if len(snapshots) == 0 {
fmt.Println("Done!") fmt.Println("No snapshots remaining, done")
return break
} }
for i, s := range snapshots {
_, err := gemini.ParseURL(s.URL.String(), "")
if err != nil {
panic(fmt.Sprintf("Error parsing URL. ID %d URL %s\n", s.ID, s.URL))
}
fmt.Printf("Saving %d %d %s\n", i+1, s.ID, s.URL)
err = gemini.UpsertSnapshot(0, tx, &s)
if err != nil {
panic(fmt.Sprintf("Error saving %s: %s", s.URL, err))
}
tx.MustExec(`DELETE FROM snapshots WHERE id=$1`, s.ID)
}
err := tx.Commit()
if err != nil { if err != nil {
fmt.Println(err) fmt.Println(err)
err := tx.Rollback() err := tx.Rollback()
@@ -47,31 +59,7 @@ func main() {
panic(err) panic(err)
} }
} }
for _, s := range snapshots {
s.Host = s.URL.Hostname
fmt.Println(count, s.UID, s.URL.Hostname)
err := gemini.SaveSnapshotToDB(tx, &s)
if err != nil {
fmt.Println(err)
err := tx.Rollback()
if err != nil {
panic(err)
} }
}
count += 1
}
err = tx.Commit()
if err != nil {
fmt.Println(err)
err := tx.Rollback()
if err != nil {
panic(err)
}
}
}
} }
func connectToDB() *sqlx.DB { func connectToDB() *sqlx.DB {

View File

@@ -1,20 +1,29 @@
// func PopulateDB(db *sqlx.DB) { package main
// // Delete all rows in the snapshots table
// db.MustExec("TRUNCATE snapshots;")
// // Prepare the query for inserting a snapshot with uid, url, and timestamp import (
// query := `INSERT INTO snapshots(uid, url, timestamp) "time"
// VALUES ($1, $2, $3)`
// // Calculate the timestamp for 2 days ago "gemini-grc/uid"
// timestamp := time.Now().Add(-48 * time.Hour) "github.com/jmoiron/sqlx"
)
// db.MustExec(query, uid.UID(), "gemini://geminiprotocol.net/", timestamp) func PopulateDB(db *sqlx.DB) {
// db.MustExec(query, uid.UID(), "gemini://warmedal.se/~antenna", timestamp) // Delete all rows in the snapshots table
// db.MustExec(query, uid.UID(), "gemini://skyjake.fi/~Cosmos/", timestamp) db.MustExec("TRUNCATE snapshots;")
// db.MustExec(query, uid.UID(), "gemini://gemini.circumlunar.space/capcom/", timestamp)
// db.MustExec(query, uid.UID(), "gemini://auragem.letz.dev/", timestamp) // Prepare the query for inserting a snapshot with uid, url, and timestamp
// db.MustExec(query, uid.UID(), "gemini://gemplex.space/", timestamp) query := `INSERT INTO snapshots(uid, url, timestamp)
// db.MustExec(query, uid.UID(), "gemini://kennedy.gemi.dev/", timestamp) VALUES ($1, $2, $3)`
// db.MustExec(query, uid.UID(), "gemini://tlgs.one/", timestamp)
// } // Calculate the timestamp for 2 days ago
timestamp := time.Now().Add(-48 * time.Hour)
db.MustExec(query, uid.UID(), "gemini://geminiprotocol.net/", timestamp)
db.MustExec(query, uid.UID(), "gemini://warmedal.se/~antenna", timestamp)
db.MustExec(query, uid.UID(), "gemini://skyjake.fi/~Cosmos/", timestamp)
db.MustExec(query, uid.UID(), "gemini://gemini.circumlunar.space/capcom/", timestamp)
db.MustExec(query, uid.UID(), "gemini://auragem.letz.dev/", timestamp)
db.MustExec(query, uid.UID(), "gemini://gemplex.space/", timestamp)
db.MustExec(query, uid.UID(), "gemini://kennedy.gemi.dev/", timestamp)
db.MustExec(query, uid.UID(), "gemini://tlgs.one/", timestamp)
}

5
db/url_port_stats.sql Normal file
View File

@@ -0,0 +1,5 @@
-- Report how many snapshot URLs carry an explicit port and how many do not.
-- A URL counts as having a port when it matches '://', then one or more
-- non-colon characters, then ':' followed by digits.
-- Each CASE yields NULL for non-matching rows, which COUNT ignores.
SELECT
COUNT(*) AS "All",
COUNT(CASE WHEN URL ~ '://[^:]+:[0-9]+' THEN 1 END) AS "With port",
COUNT(CASE WHEN URL !~ '://[^:]+:[0-9]+' THEN 1 END) AS "Without port"
FROM snapshots;

View File

@@ -1,14 +1,16 @@
#!/bin/sh #!/bin/sh
set -eu set -eu
# Max response size 10MiB
MAX_RESPONSE_SIZE=10485760 \ MAX_RESPONSE_SIZE=10485760 \
LOG_LEVEL=info \ LOG_LEVEL=debug \
ROOT_PATH=./snaps \ ROOT_PATH=./snaps \
RESPONSE_TIMEOUT=10 \ RESPONSE_TIMEOUT=10 \
NUM_OF_WORKERS=5 \ NUM_OF_WORKERS=1 \
WORKER_BATCH_SIZE=1 \
PG_DATABASE=gemini \ PG_DATABASE=gemini \
PG_HOST=127.0.0.1 \ PG_HOST=127.0.0.1 \
PG_PORT=5433 \ PG_PORT=5433 \
PG_USER=gemini \ PG_USER=gemini \
PG_PASSWORD=gemini \ PG_PASSWORD=gemini \
go run ./migrate1_host.go dlv debug

View File

@@ -1,3 +1,6 @@
// 31 redirect
gemini://gemini.circumlunar.space
// body with null byte // body with null byte
gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False

51
gemini/blacklist.go Normal file
View File

@@ -0,0 +1,51 @@
package gemini
import (
"fmt"
"os"
"strings"
"gemini-grc/config"
"gemini-grc/logging"
)
var Blacklist *[]string //nolint:gochecknoglobals
// LoadBlacklist reads the blacklist file named by config.CONFIG.BlacklistPath
// into the package-level Blacklist, one entry per line.
//
// Surrounding whitespace (including a trailing '\r') is trimmed from every
// line; blank lines and lines starting with '#' are skipped. The function
// does nothing if Blacklist has already been set. If the file cannot be
// read, a warning is logged and Blacklist is set to an empty list.
func LoadBlacklist() {
	if Blacklist != nil {
		return
	}
	data, err := os.ReadFile(config.CONFIG.BlacklistPath)
	if err != nil {
		Blacklist = &[]string{}
		logging.LogWarn("Could not load Blacklist file: %v", err)
		return
	}
	lines := strings.Split(string(data), "\n")
	entries := make([]string, 0, len(lines))
	for _, line := range lines {
		entry := strings.TrimSpace(line)
		// Skip blank lines (a file ending in '\n' always yields one) and
		// comments, so they neither inflate the count nor match anything.
		if entry == "" || strings.HasPrefix(entry, "#") {
			continue
		}
		entries = append(entries, entry)
	}
	Blacklist = &entries
	logging.LogInfo("Blacklist has %d entries", len(*Blacklist))
}
// IsBlacklisted reports whether url matches an entry of Blacklist, either
// by hostname alone or by "hostname:port".
//
// If Blacklist has not been loaded (it is nil) nothing is considered
// blacklisted, instead of panicking on a nil dereference.
func IsBlacklisted(url URL) bool {
	if Blacklist == nil {
		return false
	}
	hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
	for _, entry := range *Blacklist {
		if entry == url.Hostname || entry == hostWithPort {
			return true
		}
	}
	return false
}

View File

@@ -4,11 +4,11 @@ import (
"gemini-grc/logging" "gemini-grc/logging"
) )
var IpPool IpAddressPool = IpAddressPool{IPs: make(map[string]int)} var IpPool = IpAddressPool{IPs: make(map[string]int)}
func AddIPsToPool(IPs []string) { func AddIPsToPool(ips []string) {
IpPool.Lock.Lock() IpPool.Lock.Lock()
for _, ip := range IPs { for _, ip := range ips {
logging.LogDebug("Adding %s to pool", ip) logging.LogDebug("Adding %s to pool", ip)
IpPool.IPs[ip]++ IpPool.IPs[ip]++
} }

101
gemini/errors.go Normal file
View File

@@ -0,0 +1,101 @@
package gemini
import (
"errors"
"fmt"
)
// GeminiError is the error produced for a Gemini response status code.
type GeminiError struct {
	// Msg is a short description of the status code class.
	Msg string
	// Code is the numeric status code.
	Code int
	// Header is the header text the error was built from.
	Header string
}

// Error implements the error interface as "<Msg>: <Header>".
func (e *GeminiError) Error() string {
	return fmt.Sprintf("%s: %s", e.Msg, e.Header)
}

// NewErrGeminiStatusCode builds a *GeminiError for the given status code and
// header. The message is chosen from the tens digit of the code; any code
// outside the 1x, 3x, 4x, 5x and 6x ranges gets "unexpected status code".
func NewErrGeminiStatusCode(code int, header string) error {
	description := "unexpected status code"
	// Integer division groups each block of ten codes under one case;
	// codes below 10 and negative codes never land on 1, 3, 4, 5 or 6.
	switch code / 10 {
	case 1:
		description = "needs input"
	case 3:
		description = "redirect"
	case 4:
		description = "bad request"
	case 5:
		description = "server error"
	case 6:
		description = "TLS error"
	}
	result := &GeminiError{Code: code, Header: header}
	result.Msg = description
	return result
}
// Sentinel errors of the gemini package, grouped by area: robots.txt and
// Gemini response parsing, URL and text decoding, networking, and the
// database. They are plain values created with errors.New, intended to be
// wrapped with %w and matched with errors.Is.
var (
ErrGeminiRobotsParse = errors.New("gemini robots.txt parse error")
ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
ErrGeminiResponseHeader = errors.New("gemini response header error")
ErrGeminiLinkLineParse = errors.New("gemini link line parse error")
ErrURLParse = errors.New("URL parse error")
ErrURLDecode = errors.New("URL decode error")
ErrUTF8Parse = errors.New("UTF-8 parse error")
ErrTextParse = errors.New("text parse error")
ErrNetwork = errors.New("network error")
ErrNetworkDNS = errors.New("network DNS error")
ErrNetworkTLS = errors.New("network TLS error")
ErrNetworkSetConnectionDeadline = errors.New("network error - cannot set connection deadline")
ErrNetworkCannotWrite = errors.New("network error - cannot write")
ErrNetworkResponseSizeExceededMax = errors.New("network error - response size exceeded maximum size")
ErrDatabase = errors.New("database error")
)
// We could have used a map for speed, but
// we would lose ability to check wrapped
// errors via errors.Is().

// errGemini is declared without a value, so it is a nil *GeminiError.
var errGemini *GeminiError
// knownErrors lists the errors that IsKnownError matches with errors.Is.
// NOTE(review): the errGemini entry is a typed nil pointer; errors.Is
// compares by equality, so it presumably never matches an actual
// *GeminiError value (IsKnownError detects those separately with
// errors.As). Confirm whether this entry is needed.
var knownErrors = []error{ //nolint:gochecknoglobals
errGemini,
ErrGeminiLinkLineParse,
ErrGeminiRobotsParse,
ErrGeminiRobotsDisallowed,
ErrGeminiResponseHeader,
ErrURLParse,
ErrURLDecode,
ErrUTF8Parse,
ErrTextParse,
ErrNetwork,
ErrNetworkDNS,
ErrNetworkTLS,
ErrNetworkSetConnectionDeadline,
ErrNetworkCannotWrite,
ErrNetworkResponseSizeExceededMax,
ErrDatabase,
}
// IsKnownError reports whether err is, or wraps, one of the errors listed
// in knownErrors, or has a *GeminiError somewhere in its chain.
func IsKnownError(err error) bool {
	// Sentinel errors: errors.Is walks the wrap chain for each candidate.
	for i := range knownErrors {
		if candidate := knownErrors[i]; errors.Is(err, candidate) {
			return true
		}
	}
	// Status-code errors carry data, so they are matched by type instead.
	var statusErr *GeminiError
	return errors.As(err, &statusErr)
}

24
gemini/errors_test.go Normal file
View File

@@ -0,0 +1,24 @@
package gemini
import (
"errors"
"fmt"
"testing"
)
func TestErrGemini(t *testing.T) {
t.Parallel()
err := NewErrGeminiStatusCode(50, "50 server error")
if !errors.As(err, new(*GeminiError)) {
t.Errorf("TestErrGemini fail")
}
}
// TestErrGeminiWrapped checks that a *GeminiError is still found by
// errors.As after being wrapped with %w.
func TestErrGeminiWrapped(t *testing.T) {
	t.Parallel()
	inner := NewErrGeminiStatusCode(50, "50 server error")
	outer := fmt.Errorf("%w wrapped", inner)
	var target *GeminiError
	if matched := errors.As(outer, &target); !matched {
		t.Errorf("TestErrGeminiWrapped fail")
	}
}

View File

@@ -2,12 +2,13 @@ package gemini
import ( import (
"fmt" "fmt"
"gemini-grc/logging"
"net/url" "net/url"
"os" "os"
"path" "path"
"path/filepath" "path/filepath"
"strings" "strings"
"gemini-grc/logging"
) )
// sanitizePath encodes invalid filesystem characters using URL encoding. // sanitizePath encodes invalid filesystem characters using URL encoding.
@@ -67,7 +68,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
urlPath := s.URL.Path urlPath := s.URL.Path
// If path is empty, add `index.gmi` as the file to save // If path is empty, add `index.gmi` as the file to save
if urlPath == "" || urlPath == "." { if urlPath == "" || urlPath == "." {
urlPath = fmt.Sprintf("index.gmi") urlPath = "index.gmi"
} }
// If path ends with '/' then add index.gmi for the // If path ends with '/' then add index.gmi for the
// directory to be created. // directory to be created.
@@ -77,7 +78,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
finalPath, err := calcFilePath(parentPath, urlPath) finalPath, err := calcFilePath(parentPath, urlPath)
if err != nil { if err != nil {
logging.LogError("Error saving %s: %w", s.URL, err) logging.LogError("GeminiError saving %s: %w", s.URL, err)
return return
} }
// Ensure the directory exists // Ensure the directory exists
@@ -87,12 +88,12 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
return return
} }
if s.MimeType.Valid && s.MimeType.String == "text/gemini" { if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
err = os.WriteFile(finalPath, (*s).Data.V, 0666) err = os.WriteFile(finalPath, (*s).Data.V, 0o666)
} else { } else {
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0666) err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
} }
if err != nil { if err != nil {
logging.LogError("Error saving %s: %w", s.URL.Full, err) logging.LogError("GeminiError saving %s: %w", s.URL.Full, err)
} }
close(done) close(done)
} }

View File

@@ -1,98 +1,37 @@
package gemini package gemini
import ( import (
"errors"
"fmt" "fmt"
"gemini-grc/logging"
"net/url" "net/url"
gourl "net/url"
"regexp" "regexp"
"strconv" "strconv"
"strings"
"gemini-grc/logging"
) )
func isGeminiURL(url string) bool { func GetPageLinks(currentURL URL, gemtext string) LinkList {
_, err := gourl.Parse(url)
if err != nil {
logging.LogWarn("[%s] Invalid URL: %v", url, err)
return false
}
return strings.HasPrefix(url, "gemini://")
}
func parseLinks(s Snapshot, queue chan string) {
for _, link := range *s.Links {
if strings.HasPrefix(link.Full, "gemini://") {
go func(link GeminiUrl) {
// fmt.Printf("LINK: %s\n", link)
queue <- link.Full
}(link)
}
}
}
func checkGeminiStatusCode(code int) error {
switch {
case code == 20:
return nil
case code >= 10 && code < 20:
return fmt.Errorf("gemini response %d needs data input", code)
case code >= 30 && code < 40:
return fmt.Errorf("gemini response %d redirect", code)
case code >= 40 && code < 50:
return fmt.Errorf("gemini response %d server error", code)
case code >= 50 && code < 60:
return fmt.Errorf("gemini response %d server permanent error", code)
case code >= 60 && code < 70:
return fmt.Errorf("gemini response %d certificate error", code)
default:
return fmt.Errorf("unexpected/unhandled Gemini response %d", code)
}
}
func ProcessGemini(snapshot *Snapshot) *Snapshot {
// Grab link lines // Grab link lines
linkLines := ExtractLinkLines(snapshot.GemText.String) linkLines := ExtractLinkLines(gemtext)
logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines)) if len(linkLines) == 0 {
return nil
}
var linkURLs LinkList
// Normalize URLs in links, and store them in snapshot // Normalize URLs in links, and store them in snapshot
for _, line := range linkLines { for _, line := range linkLines {
normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String()) normalizedLink, descr, err := NormalizeLink(line, currentURL.String())
if err != nil { if err != nil {
logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err) logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
continue continue
} }
geminiUrl, err := ParseUrl(normalizedLink, descr) geminiUrl, err := ParseURL(normalizedLink, descr)
if err != nil { if err != nil {
logging.LogDebug("Cannot parse URL in link '%s': %v", line, err) logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
continue continue
} }
if snapshot.Links == nil { logging.LogDebug(geminiUrl.String())
snapshot.Links = &LinkList{*geminiUrl} linkURLs = append(linkURLs, *geminiUrl)
} else {
*snapshot.Links = append(*snapshot.Links, *geminiUrl)
} }
} return linkURLs
return snapshot
}
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
u, err := url.Parse(input)
if err != nil {
return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
}
protocol := u.Scheme
hostname := u.Hostname()
strPort := u.Port()
path := u.Path
if strPort == "" {
strPort = "1965"
}
port, err := strconv.Atoi(strPort)
if err != nil {
return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
}
return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
} }
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines // ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
@@ -109,11 +48,11 @@ func ExtractLinkLines(gemtext string) []string {
// NormalizeLink takes a single link line and the current URL, // NormalizeLink takes a single link line and the current URL,
// return the URL converted to an absolute URL // return the URL converted to an absolute URL
// and its description. // and its description.
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) { func NormalizeLink(linkLine string, currentURL string) (string, string, error) {
// Parse the current URL // Parse the current URL
baseURL, err := url.Parse(currentURL) baseURL, err := url.Parse(currentURL)
if err != nil { if err != nil {
return "", "", fmt.Errorf("invalid current URL: %v", err) return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
} }
// Regular expression to extract the URL part from a link line // Regular expression to extract the URL part from a link line
@@ -123,13 +62,13 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
matches := re.FindStringSubmatch(linkLine) matches := re.FindStringSubmatch(linkLine)
if len(matches) == 0 { if len(matches) == 0 {
// If the line doesn't match the expected format, return it unchanged // If the line doesn't match the expected format, return it unchanged
return "", "", fmt.Errorf("not a link line: %v", linkLine) return "", "", fmt.Errorf("%w for link line %s", ErrGeminiLinkLineParse, linkLine)
} }
originalURLStr := matches[1] originalURLStr := matches[1]
_, err = url.QueryUnescape(originalURLStr) _, err = url.QueryUnescape(originalURLStr)
if err != nil { if err != nil {
return "", "", fmt.Errorf("error decoding URL: %w", err) return "", "", fmt.Errorf("%w: %w", ErrURLDecode, err)
} }
restOfLine := "" restOfLine := ""
@@ -141,7 +80,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
parsedURL, err := url.Parse(originalURLStr) parsedURL, err := url.Parse(originalURLStr)
if err != nil { if err != nil {
// If URL parsing fails, return an error // If URL parsing fails, return an error
return "", "", fmt.Errorf("invalid URL '%s': %v", originalURLStr, err) return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
} }
// Resolve relative URLs against the base URL // Resolve relative URLs against the base URL
@@ -173,14 +112,33 @@ func ParseFirstTwoDigits(input string) (int, error) {
// Find the first match in the string // Find the first match in the string
matches := re.FindStringSubmatch(input) matches := re.FindStringSubmatch(input)
if len(matches) == 0 { if len(matches) == 0 {
return 0, errors.New("no digits found at the beginning of the string") return 0, fmt.Errorf("%w", ErrGeminiResponseHeader)
} }
// Parse the captured match as an integer // Parse the captured match as an integer
snapshot, err := strconv.Atoi(matches[1]) snapshot, err := strconv.Atoi(matches[1])
if err != nil { if err != nil {
return 0, fmt.Errorf("failed to convert matched digits to int: %v", err) return 0, fmt.Errorf("%w: %w", ErrTextParse, err)
} }
return snapshot, nil return snapshot, nil
} }
// redirectTargetRe captures the redirect target of a response header:
//
//	\d+        - one or more digits (the status code)
//	\s+        - one or more whitespace characters
//	([^\r\n]+) - everything up to the line terminator (or end of string)
//
// It is compiled once at package scope instead of on every call.
var redirectTargetRe = regexp.MustCompile(`\d+\s+([^\r\n]+)`)

// extractRedirectTarget returns the redirection
// URL by parsing the header (or error message).
//
// The target found in input is resolved against currentURL with
// DeriveAbsoluteURL. An error wrapping ErrGeminiResponseHeader is returned
// when no target can be found in input or when it cannot be resolved.
func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
	matches := redirectTargetRe.FindStringSubmatch(input)
	if len(matches) < 2 {
		return nil, fmt.Errorf("%w: Cannot find redirect target from header %s", ErrGeminiResponseHeader, input)
	}
	newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
	if err != nil {
		return nil, fmt.Errorf("%w: Cannot find redirect target from header: %w", ErrGeminiResponseHeader, err)
	}
	return newURL, nil
}

47
gemini/gemini_test.go Normal file
View File

@@ -0,0 +1,47 @@
package gemini
import (
"fmt"
"testing"
)
// TestExtractRedirectTargetFullURL checks that an absolute redirect target
// is returned as-is, with the default port made explicit.
func TestExtractRedirectTargetFullURL(t *testing.T) {
	t.Parallel()
	const header = "redirect: 31 gemini://target.gr"
	base, _ := ParseURL("gemini://smol.gr", "")
	target, err := extractRedirectTarget(*base, header)
	if err == nil && target.String() == "gemini://target.gr:1965" {
		return
	}
	t.Errorf("fail: %s", target)
}
// TestExtractRedirectTargetRelativeURL checks that an absolute-path redirect
// target is resolved against the host of the current URL.
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
	t.Parallel()
	const header = "redirect: 31 /a/b"
	base, _ := ParseURL("gemini://smol.gr", "")
	target, err := extractRedirectTarget(*base, header)
	if err == nil && target.String() == "gemini://smol.gr:1965/a/b" {
		return
	}
	t.Errorf("fail: %s", target)
}
// TestExtractRedirectTargetRelativeURL2 checks that a "./" redirect target
// on a URL without a path resolves to the root of the same host.
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
	t.Parallel()
	const header = "redirect: 31 ./"
	base, _ := ParseURL("gemini://nox.im:1965", "")
	target, err := extractRedirectTarget(*base, header)
	if err == nil && target.String() == "gemini://nox.im:1965/" {
		return
	}
	t.Errorf("fail: %s", target)
}
// TestExtractRedirectTargetWrong checks that a redirect target which cannot
// be turned into a URL yields a nil result together with an error.
func TestExtractRedirectTargetWrong(t *testing.T) {
	t.Parallel()
	const header = "redirect: 31 fsdsdf"
	base, _ := ParseURL("gemini://smol.gr", "")
	target, err := extractRedirectTarget(*base, header)
	fmt.Println(err)
	if target == nil && err != nil {
		return
	}
	t.Errorf("fail: result should be nil, err is %s", err)
}

View File

@@ -1,12 +1,16 @@
package gemini package gemini
import ( import (
"encoding/json" "database/sql/driver"
"fmt" "fmt"
"gemini-grc/logging" "gemini-grc/logging"
"net/url"
"path"
"strconv"
"strings"
) )
type GeminiUrl struct { type URL struct {
Protocol string `json:"protocol,omitempty"` Protocol string `json:"protocol,omitempty"`
Hostname string `json:"hostname,omitempty"` Hostname string `json:"hostname,omitempty"`
Port int `json:"port,omitempty"` Port int `json:"port,omitempty"`
@@ -15,43 +19,79 @@ type GeminiUrl struct {
Full string `json:"full,omitempty"` Full string `json:"full,omitempty"`
} }
func (g *GeminiUrl) Scan(value interface{}) error { func (u *URL) Scan(value interface{}) error {
if value == nil { if value == nil {
// Clear the fields in the current GeminiUrl object (not the pointer itself) // Clear the fields in the current GeminiUrl object (not the pointer itself)
*g = GeminiUrl{} *u = URL{}
return nil return nil
} }
b, ok := value.(string) b, ok := value.(string)
if !ok { if !ok {
return fmt.Errorf("failed to scan GeminiUrl: expected string, got %T", value) return fmt.Errorf("failed to scan GeminiUrl: expected string, got %T", value)
} }
parsedUrl, err := ParseUrl(b, "") parsedURL, err := ParseURL(b, "")
if err != nil { if err != nil {
return err return err
} }
*g = *parsedUrl *u = *parsedURL
return nil return nil
} }
func (u GeminiUrl) String() string { func (u URL) String() string {
return u.Full return u.Full
// return fmt.Sprintf("%s://%s:%d%s", u.Protocol, u.Hostname, u.Port, u.Path)
} }
func GeminiUrltoJSON(g GeminiUrl) string { func (u URL) StringNoDefaultPort() string {
// Serialize the Person struct to JSON if u.Port == 1965 {
jsonData, err := json.Marshal(g) return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
if err != nil {
logging.LogError("Error serializing to JSON: %w", err)
} }
return string(jsonData) return u.Full
} }
func GeminiUrlFromJSON(input string) GeminiUrl { func (u URL) Value() (driver.Value, error) {
var geminiUrl GeminiUrl if u.Full == "" {
err := json.Unmarshal([]byte(input), &geminiUrl) return nil, nil
if err != nil {
logging.LogError("Error deserializing from JSON: %w", err)
} }
return geminiUrl return u.Full, nil
}
// ParseURL parses input into a URL and attaches descr as its description.
//
// When input carries no explicit port, 1965 is used. Full is rebuilt as
// "scheme://host:port/path" from the parsed pieces, so the port is always
// explicit in it; the query string and fragment of input are not carried
// over into Full.
//
// The returned error wraps ErrURLParse when input cannot be parsed or when
// its port is not a valid integer.
func ParseURL(input string, descr string) (*URL, error) {
	parsed, err := url.Parse(input)
	if err != nil {
		return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err)
	}
	protocol := parsed.Scheme
	hostname := parsed.Hostname()
	urlPath := parsed.Path // not named "path": that would shadow the imported package
	strPort := parsed.Port()
	if strPort == "" {
		strPort = "1965"
	}
	port, err := strconv.Atoi(strPort)
	if err != nil {
		return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err)
	}
	// Hostname() strips the square brackets from IPv6 literals. They must be
	// restored in Full, otherwise "host:port" is ambiguous and the resulting
	// string cannot be parsed back. The Hostname field stays unbracketed.
	hostInFull := hostname
	if strings.Contains(hostname, ":") {
		hostInFull = "[" + hostname + "]"
	}
	full := fmt.Sprintf("%s://%s:%d%s", protocol, hostInFull, port, urlPath)
	return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: urlPath, Descr: descr, Full: full}, nil
}
// DeriveAbsoluteURL turns input, as found in a redirect header, into a
// full URL, using currentURL for the scheme, host and port when input is
// only a path.
//
// Resolution rules, in order:
//   - input containing "://" is treated as absolute and parsed on its own;
//   - input starting with "/" replaces the current path (after cleaning);
//   - "./" or "." resolves to the (cleaned) current path, or "/" when the
//     current path is empty;
//   - anything else is appended to the current path itself, not to its
//     parent directory.
//
// The result always goes through ParseURL, so its errors are returned as-is.
func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
logging.LogDebug("Calculating redirect URL. Current %s header string %s", currentURL, input)
// If URL is absolute, return just it
if strings.Contains(input, "://") {
return ParseURL(input, "")
}
// input is a path. Clean it and construct
// new path
var newPath string
// Handle weird cases found in the wild
if strings.HasPrefix(input, "/") {
newPath = path.Clean(input)
} else if input == "./" || input == "." {
newPath = path.Join(currentURL.Path, "/")
} else {
// NOTE(review): when currentURL.Path is empty, newPath has no leading
// slash and ends up glued to the port in strURL below — confirm that
// relying on ParseURL to reject it is intended.
newPath = path.Join(currentURL.Path, path.Clean(input))
}
strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath)
return ParseURL(strURL, "")
} }

103
gemini/gemini_url_test.go Normal file
View File

@@ -0,0 +1,103 @@
package gemini
import (
"reflect"
"testing"
)
// TestParseURL checks that parsing a URL without a port yields a value
// whose database representation carries the default port explicitly.
func TestParseURL(t *testing.T) {
	t.Parallel()
	input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
	parsed, err := ParseURL(input, "")
	// Check the error before touching parsed: it is nil on failure.
	if err != nil {
		t.Fatalf("fail: ParseURL(%q) returned error: %v", input, err)
	}
	value, err := parsed.Value()
	if err != nil {
		t.Fatalf("fail: Value() returned error: %v", err)
	}
	if value != "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162" {
		t.Errorf("fail: %s", parsed)
	}
}
// TestDeriveAbsoluteURL_abs_url_input checks that an absolute URL input
// replaces the current URL entirely.
func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
	t.Parallel()
	base := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	want := &URL{
		Protocol: "gemini",
		Hostname: "a.b",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://a.b:1965/c",
	}
	got, err := DeriveAbsoluteURL(base, "gemini://a.b/c")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestDeriveAbsoluteURL_abs_path_input checks that an absolute path input
// replaces the path of the current URL and keeps its host and port.
func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
	t.Parallel()
	base := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	want := &URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/c",
	}
	got, err := DeriveAbsoluteURL(base, "/c")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestDeriveAbsoluteURL_rel_path_input checks that a relative path input is
// appended to the path of the current URL.
func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
	t.Parallel()
	base := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	want := &URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b/c/d",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/a/b/c/d",
	}
	got, err := DeriveAbsoluteURL(base, "c/d")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}

View File

@@ -2,21 +2,26 @@ package gemini
import ( import (
"crypto/tls" "crypto/tls"
"errors"
"fmt" "fmt"
"gemini-grc/config" "gemini-grc/logging"
"io" "io"
"net" "net"
go_url "net/url" gourl "net/url"
"regexp" "regexp"
"slices" "slices"
"strconv" "strconv"
"strings"
"time" "time"
"gemini-grc/config"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
) )
type GeminiPageData struct { type PageData struct {
ResponseCode int ResponseCode int
ResponseHeader string
MimeType string MimeType string
Lang string Lang string
GemText string GemText string
@@ -31,7 +36,7 @@ type GeminiPageData struct {
func getHostIPAddresses(hostname string) ([]string, error) { func getHostIPAddresses(hostname string) ([]string, error) {
addrs, err := net.LookupHost(hostname) addrs, err := net.LookupHost(hostname)
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("%w:%w", ErrNetworkDNS, err)
} }
IpPool.Lock.RLock() IpPool.Lock.RLock()
defer func() { defer func() {
@@ -41,12 +46,12 @@ func getHostIPAddresses(hostname string) ([]string, error) {
} }
func ConnectAndGetData(url string) ([]byte, error) { func ConnectAndGetData(url string) ([]byte, error) {
parsedUrl, err := go_url.Parse(url) parsedURL, err := gourl.Parse(url)
if err != nil { if err != nil {
return nil, fmt.Errorf("Could not parse URL, error %w", err) return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
} }
hostname := parsedUrl.Hostname() hostname := parsedURL.Hostname()
port := parsedUrl.Port() port := parsedURL.Port()
if port == "" { if port == "" {
port = "1965" port = "1965"
} }
@@ -54,39 +59,37 @@ func ConnectAndGetData(url string) ([]byte, error) {
// Establish the underlying TCP connection. // Establish the underlying TCP connection.
dialer := &net.Dialer{ dialer := &net.Dialer{
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second, Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
KeepAlive: 10 * time.Second,
} }
conn, err := dialer.Dial("tcp", host) conn, err := dialer.Dial("tcp", host)
if err != nil { if err != nil {
return nil, fmt.Errorf("TCP connection failed: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
} }
// Make sure we always close the connection. // Make sure we always close the connection.
defer func() { defer func() {
err := conn.Close() // No need to handle error:
if err != nil { // Connection will time out eventually if still open somehow.
// Do nothing! Connection will timeout eventually if still open somehow. _ = conn.Close()
}
}() }()
// Set read and write timeouts on the TCP connection. // Set read and write timeouts on the TCP connection.
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
if err != nil { if err != nil {
return nil, fmt.Errorf("Error setting connection deadline: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
} }
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
if err != nil { if err != nil {
return nil, fmt.Errorf("Error setting connection deadline: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
} }
// Perform the TLS handshake // Perform the TLS handshake
tlsConfig := &tls.Config{ tlsConfig := &tls.Config{
InsecureSkipVerify: true, // Accept all TLS certs, even if insecure. InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
ServerName: parsedUrl.Hostname(), // SNI should not include port ServerName: parsedURL.Hostname(), // SNI should not include port
// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites. // MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
} }
tlsConn := tls.Client(conn, tlsConfig) tlsConn := tls.Client(conn, tlsConfig)
if err := tlsConn.Handshake(); err != nil { if err := tlsConn.Handshake(); err != nil {
return nil, fmt.Errorf("TLS handshake error: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkTLS, err)
} }
// We read `buf`-sized chunks and add data to `data`. // We read `buf`-sized chunks and add data to `data`.
@@ -94,9 +97,13 @@ func ConnectAndGetData(url string) ([]byte, error) {
var data []byte var data []byte
// Send Gemini request to trigger server response. // Send Gemini request to trigger server response.
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url))) // Fix for stupid server bug:
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
// when the port is 1965 and is still specified explicitly in the URL.
_url, _ := ParseURL(url, "")
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
if err != nil { if err != nil {
return nil, fmt.Errorf("Error sending network request: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
} }
// Read response bytes in len(buf) byte chunks // Read response bytes in len(buf) byte chunks
for { for {
@@ -105,69 +112,83 @@ func ConnectAndGetData(url string) ([]byte, error) {
data = append(data, buf[:n]...) data = append(data, buf[:n]...)
} }
if len(data) > config.CONFIG.MaxResponseSize { if len(data) > config.CONFIG.MaxResponseSize {
data = []byte{} return nil, fmt.Errorf("%w: %v", ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
return nil, fmt.Errorf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize)
} }
if err != nil { if err != nil {
if err == io.EOF { if errors.Is(err, io.EOF) {
break break
} else {
return nil, fmt.Errorf("Network error: %s", err)
} }
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
} }
} }
return data, nil return data, nil
} }
// Connect to given URL, using the Gemini protocol. // Visit given URL, using the Gemini protocol.
// Mutate given Snapshot with the data or the error. // Mutates given Snapshot with the data.
func Visit(s *Snapshot) { // In case of error, we store the error string
data, err := ConnectAndGetData(s.URL.String()) // inside snapshot and return the error.
func Visit(s *Snapshot) (err error) {
// Don't forget to also store error
// response code (if we have one)
defer func() {
if err != nil { if err != nil {
s.Error = null.StringFrom(err.Error()) s.Error = null.StringFrom(err.Error())
return if errors.As(err, new(*GeminiError)) {
s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code))
}
}
}()
data, err := ConnectAndGetData(s.URL.String())
if err != nil {
return err
} }
pageData, err := processData(data) pageData, err := processData(data)
if err != nil { if err != nil {
s.Error = null.StringFrom(err.Error()) return err
return
} }
//marshalled, _ := json.MarshalIndent(pageData, "", " ")
//fmt.Printf("%s\n", marshalled)
s.Header = null.StringFrom(pageData.ResponseHeader)
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode)) s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
s.MimeType = null.StringFrom(pageData.MimeType) s.MimeType = null.StringFrom(pageData.MimeType)
s.Lang = null.StringFrom(pageData.Lang) s.Lang = null.StringFrom(pageData.Lang)
if pageData.GemText != "" { if pageData.GemText != "" {
s.GemText = null.StringFrom(string(pageData.GemText)) s.GemText = null.StringFrom(pageData.GemText)
} }
if pageData.Data != nil { if pageData.Data != nil {
s.Data = null.ValueFrom(pageData.Data) s.Data = null.ValueFrom(pageData.Data)
} }
return return nil
} }
// Update given snapshot with the // processData returns results from
// Gemini header data: response code, // parsing Gemini header data:
// mime type and lang (optional) // Code, mime type and lang (optional)
func processData(data []byte) (*GeminiPageData, error) { // Returns error if header was invalid
headers, body, err := getHeadersAndData(data) func processData(data []byte) (*PageData, error) {
header, body, err := getHeadersAndData(data)
if err != nil { if err != nil {
return nil, err return nil, err
} }
code, mimeType, lang := getMimeTypeAndLang(headers) code, mimeType, lang := getMimeTypeAndLang(header)
geminiError := checkGeminiStatusCode(code) logging.LogDebug("Header: %s", strings.TrimSpace(header))
if geminiError != nil { if code != 20 {
return nil, geminiError return nil, NewErrGeminiStatusCode(code, header)
} }
pageData := GeminiPageData{
pageData := PageData{
ResponseCode: code, ResponseCode: code,
ResponseHeader: header,
MimeType: mimeType, MimeType: mimeType,
Lang: lang, Lang: lang,
} }
// If we've got a Gemini document, populate // If we've got a Gemini document, populate
// `GemText` field, otherwise raw data goes to `Data`. // `GemText` field, otherwise raw data goes to `Data`.
if mimeType == "text/gemini" { if mimeType == "text/gemini" {
validBody, err := EnsureValidUTF8(body) validBody, err := BytesToValidUTF8(body)
if err != nil { if err != nil {
return nil, fmt.Errorf("UTF-8 error: %w", err) return nil, fmt.Errorf("%w: %w", ErrUTF8Parse, err)
} }
pageData.GemText = validBody pageData.GemText = validBody
} else { } else {
@@ -180,14 +201,14 @@ func processData(data []byte) (*GeminiPageData, error) {
// basically the first line of the response // basically the first line of the response
// and should contain the response code, // and should contain the response code,
// mimeType and language. // mimeType and language.
func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) { func getHeadersAndData(data []byte) (string, []byte, error) {
firstLineEnds := slices.Index(data, '\n') firstLineEnds := slices.Index(data, '\n')
if firstLineEnds == -1 { if firstLineEnds == -1 {
return "", nil, fmt.Errorf("Could not parse response header") return "", nil, ErrGeminiResponseHeader
} }
firstLine = string(data[:firstLineEnds]) firstLine := string(data[:firstLineEnds])
rest = data[firstLineEnds+1:] rest := data[firstLineEnds+1:]
return string(firstLine), rest, nil return firstLine, rest, nil
} }
// Parses code, mime type and language // Parses code, mime type and language
@@ -196,12 +217,12 @@ func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) {
// `20 text/gemini lang=en` (code, mimetype, lang) // `20 text/gemini lang=en` (code, mimetype, lang)
// `20 text/gemini` (code, mimetype) // `20 text/gemini` (code, mimetype)
// `31 gemini://redirected.to/other/site` (code) // `31 gemini://redirected.to/other/site` (code)
func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string) { func getMimeTypeAndLang(headers string) (int, string, string) {
// Regex that parses code, mimetype & lang // Regex that parses code, mimetype & optional charset/lang parameters
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`) re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`)
matches := re.FindStringSubmatch(headers) matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 { if matches == nil || len(matches) <= 1 {
// Try to get code at least. // Try to get code at least
re := regexp.MustCompile(`^(\d+)\s+`) re := regexp.MustCompile(`^(\d+)\s+`)
matches := re.FindStringSubmatch(headers) matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 { if matches == nil || len(matches) <= 1 {
@@ -217,7 +238,7 @@ func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string)
if err != nil { if err != nil {
return 0, "", "" return 0, "", ""
} }
mimeType = matches[2] mimeType := matches[2]
lang = matches[4] param := matches[3] // This will capture either charset or lang value
return code, mimeType, lang return code, mimeType, param
} }

View File

@@ -6,6 +6,7 @@ import (
// Test for input: `20 text/gemini` // Test for input: `20 text/gemini`
func TestGetMimeTypeAndLang1(t *testing.T) { func TestGetMimeTypeAndLang1(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini") code, mimeType, lang := getMimeTypeAndLang("20 text/gemini")
if code != 20 || mimeType != "text/gemini" || lang != "" { if code != 20 || mimeType != "text/gemini" || lang != "" {
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -13,13 +14,39 @@ func TestGetMimeTypeAndLang1(t *testing.T) {
} }
func TestGetMimeTypeAndLang11(t *testing.T) { func TestGetMimeTypeAndLang11(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n") code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n")
if code != 20 || mimeType != "text/gemini" || lang != "" { if code != 20 || mimeType != "text/gemini" || lang != "" {
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
} }
} }
// Test for input: `20 text/plain; charset=utf-8`
// The charset value is returned in the third result.
func TestGetMimeTypeAndLang12(t *testing.T) {
	t.Parallel()
	code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8")
	if code != 20 || mimeType != "text/plain" || lang != "utf-8" {
		// The message must state what the condition above actually expects.
		t.Errorf("Expected (20, 'text/plain', 'utf-8'), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}
// Test for input: `20 text/gemini; charset=utf-8`
// The charset value is returned in the third result.
func TestGetMimeTypeAndLang13(t *testing.T) {
	t.Parallel()
	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8")
	if code != 20 || mimeType != "text/gemini" || lang != "utf-8" {
		// The message must state what the condition above actually expects.
		t.Errorf("Expected (20, 'text/gemini', 'utf-8'), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}
func TestGetTypeAndLang2(t *testing.T) { func TestGetTypeAndLang2(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini charset=en")
if code != 20 || mimeType != "text/gemini" || lang != "en" {
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
}
}
func TestGetTypeAndLang21(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en") code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
if code != 20 || mimeType != "text/gemini" || lang != "en" { if code != 20 || mimeType != "text/gemini" || lang != "en" {
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -27,6 +54,7 @@ func TestGetTypeAndLang2(t *testing.T) {
} }
func TestGetMimeTypeAndLang3(t *testing.T) { func TestGetMimeTypeAndLang3(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page") code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page")
if code != 31 || mimeType != "" || lang != "" { if code != 31 || mimeType != "" || lang != "" {
t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -34,6 +62,7 @@ func TestGetMimeTypeAndLang3(t *testing.T) {
} }
func TestGetMimeTypeAndLang4(t *testing.T) { func TestGetMimeTypeAndLang4(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd") code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd")
if code != 0 || mimeType != "" || lang != "" { if code != 0 || mimeType != "" || lang != "" {
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -41,6 +70,7 @@ func TestGetMimeTypeAndLang4(t *testing.T) {
} }
func TestGetMimeTypeAndLang5(t *testing.T) { func TestGetMimeTypeAndLang5(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("") code, mimeType, lang := getMimeTypeAndLang("")
if code != 0 || mimeType != "" || lang != "" { if code != 0 || mimeType != "" || lang != "" {
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)

View File

@@ -1,10 +1,13 @@
package gemini package gemini
import ( import (
"encoding/json"
"fmt" "fmt"
"gemini-grc/logging" "gemini-grc/config"
"os" "os"
"gemini-grc/logging"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
@@ -33,11 +36,41 @@ func ConnectToDB() *sqlx.DB {
return db return db
} }
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error { func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error {
marshalled, err := json.MarshalIndent(s, "", " ")
if err != nil {
panic(fmt.Sprintf("JSON serialization error for %v", s))
}
if config.CONFIG.DryRun {
logging.LogDebug("Would insert (if new) snapshot %s", marshalled)
return nil
}
query := ` query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO UPDATE SET ON CONFLICT (url) DO NOTHING
`
_, err = tx.NamedExec(query, s)
if err != nil {
return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err)
}
return nil
}
func UpsertSnapshot(id int, tx *sqlx.Tx, s *Snapshot) error {
marshalled, err := json.MarshalIndent(s, "", " ")
if err != nil {
panic(fmt.Sprintf("JSON serialization error for %v", s))
}
if config.CONFIG.DryRun {
logging.LogDebug("[%d] Would upsert snapshot %s", id, marshalled)
return nil
}
query := `
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (url) DO UPDATE SET
url = EXCLUDED.url, url = EXCLUDED.url,
host = EXCLUDED.host, host = EXCLUDED.host,
timestamp = EXCLUDED.timestamp, timestamp = EXCLUDED.timestamp,
@@ -47,24 +80,30 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
links = EXCLUDED.links, links = EXCLUDED.links,
lang = EXCLUDED.lang, lang = EXCLUDED.lang,
response_code = EXCLUDED.response_code, response_code = EXCLUDED.response_code,
error = EXCLUDED.error error = EXCLUDED.error`
` _, err = tx.NamedExec(query, s)
_, err := tx.NamedExec(query, s) //if err != nil {
// logging.LogError("[%s] GeminiError upserting snapshot: %w", s.URL, err)
// panic("This shouldn't happen")
//}
if err != nil { if err != nil {
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err) return fmt.Errorf("[%s] GeminiError upserting snapshot: %w", s.URL, err)
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
} }
return nil return nil
} }
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error { func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
if config.CONFIG.DryRun {
return nil
}
// Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe // Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe
const batchSize = 5000 const batchSize = 5000
query := ` query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO NOTHING ON CONFLICT (url) DO NOTHING
` `
for i := 0; i < len(snapshots); i += batchSize { for i := 0; i < len(snapshots); i += batchSize {
@@ -77,7 +116,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
_, err := tx.NamedExec(query, batch) _, err := tx.NamedExec(query, batch)
if err != nil { if err != nil {
logging.LogError("Error batch inserting snapshots: %w", err) logging.LogError("GeminiError batch inserting snapshots: %w", err)
return fmt.Errorf("DB error: %w", err) return fmt.Errorf("DB error: %w", err)
} }
} }
@@ -86,14 +125,17 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
} }
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error { func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
if config.CONFIG.DryRun {
return nil
}
query := ` query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO NOTHING ON CONFLICT (url) DO NOTHING
` `
_, err := tx.NamedExec(query, snapshots) _, err := tx.NamedExec(query, snapshots)
if err != nil { if err != nil {
logging.LogError("Error batch inserting snapshots: %w", err) logging.LogError("GeminiError batch inserting snapshots: %w", err)
return fmt.Errorf("DB error: %w", err) return fmt.Errorf("DB error: %w", err)
} }
return nil return nil

View File

@@ -2,32 +2,58 @@ package gemini
import ( import (
"bytes" "bytes"
"errors"
"fmt" "fmt"
"io" "io"
"unicode/utf8" "unicode/utf8"
"golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/transform" "golang.org/x/text/transform"
) )
func EnsureValidUTF8(input []byte) (string, error) { var (
// Remove NULL byte 0x00 ErrInputTooLarge = errors.New("input too large")
inputNoNull := bytes.ReplaceAll(input, []byte{0}, nil) ErrUTF8Conversion = errors.New("UTF-8 conversion error")
isValidUTF8 := utf8.Valid(inputNoNull) )
if !isValidUTF8 {
encodings := []transform.Transformer{ func BytesToValidUTF8(input []byte) (string, error) {
charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1 if len(input) == 0 {
charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc return "", nil
// TODO: Try more encodings?
} }
const maxSize = 10 * 1024 * 1024 // 10MB
if len(input) > maxSize {
return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize)
}
// Remove NULL byte 0x00 (ReplaceAll accepts slices)
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
if utf8.Valid(inputNoNull) {
return string(inputNoNull), nil
}
encodings := []transform.Transformer{
charmap.ISO8859_1.NewDecoder(),
charmap.ISO8859_7.NewDecoder(),
charmap.Windows1250.NewDecoder(), // Central European
charmap.Windows1251.NewDecoder(), // Cyrillic
charmap.Windows1252.NewDecoder(),
charmap.Windows1256.NewDecoder(), // Arabic
japanese.EUCJP.NewDecoder(), // Japanese
korean.EUCKR.NewDecoder(), // Korean
}
// First successful conversion wins.
var lastErr error
for _, encoding := range encodings { for _, encoding := range encodings {
reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding) reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
result, err := io.ReadAll(reader) result, err := io.ReadAll(reader)
if err != nil { if err != nil {
return "", fmt.Errorf("UTF-8 error: %w", err) lastErr = err
continue
} }
if utf8.Valid(result) {
return string(result), nil return string(result), nil
} }
} }
return string(inputNoNull), nil
return "", fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr)
} }

View File

@@ -4,9 +4,10 @@ import "testing"
// Make sure NULL bytes are removed // Make sure NULL bytes are removed
func TestEnsureValidUTF8(t *testing.T) { func TestEnsureValidUTF8(t *testing.T) {
t.Parallel()
// Create a string with a null byte // Create a string with a null byte
strWithNull := "Hello" + string('\x00') + "world" strWithNull := "Hello" + string('\x00') + "world"
result, _ := EnsureValidUTF8([]byte(strWithNull)) result, _ := BytesToValidUTF8([]byte(strWithNull))
if result != "Helloworld" { if result != "Helloworld" {
t.Errorf("Expected string without NULL byte, got %s", result) t.Errorf("Expected string without NULL byte, got %s", result)
} }

View File

@@ -2,16 +2,18 @@ package gemini
import ( import (
"fmt" "fmt"
"gemini-grc/logging"
"strings" "strings"
"sync" "sync"
"gemini-grc/logging"
) )
// key: "host:port" (string) // RobotsCache is a map of blocked URLs
// value: // key: URL
// empty []string if no robots data, or // value: []string list of disallowed URLs
// list of URL prefixes ([]string) in robots // If a key has no blocked URLs, an empty
var RobotsCache sync.Map // list is stored for caching.
var RobotsCache sync.Map //nolint:gochecknoglobals
func populateBlacklist(key string) (entries []string) { func populateBlacklist(key string) (entries []string) {
// We either store an empty list when // We either store an empty list when
@@ -40,44 +42,41 @@ func populateBlacklist(key string) (entries []string) {
// According to spec, the first is correct, // According to spec, the first is correct,
// however let's be lenient // however let's be lenient
var data string var data string
if robotsData.MimeType == "text/plain" { switch {
case robotsData.MimeType == "text/plain":
data = string(robotsData.Data) data = string(robotsData.Data)
} else if robotsData.MimeType == "text/gemini" { case robotsData.MimeType == "text/gemini":
data = robotsData.GemText data = robotsData.GemText
} else { default:
return []string{} return []string{}
} }
entries = ParseRobotsTxt(string(data), key) entries = ParseRobotsTxt(data, key)
return entries return entries
} }
// Check if the snapshot URL matches // RobotMatch checks if the snapshot URL matches
// a robots.txt allow rule. // a robots.txt allow rule.
func RobotMatch(s *Snapshot) bool { func RobotMatch(url URL) bool {
logging.LogDebug("Checking robots.txt cache for %s", s.URL.String()) key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port) logging.LogDebug("Checking robots.txt cache for %s", key)
v, ok := RobotsCache.Load(key) var disallowedURLs []string
if ok == false { cacheEntries, ok := RobotsCache.Load(key)
if !ok {
// First time check, populate robot cache // First time check, populate robot cache
logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String()) disallowedURLs = populateBlacklist(key)
disallowedURLs := populateBlacklist(key) logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
for _, url := range disallowedURLs {
if strings.HasPrefix(s.URL.String(), url) {
logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
return true
}
}
} else { } else {
if len(v.([]string)) == 0 { disallowedURLs, _ = cacheEntries.([]string)
logging.LogDebug("No robots.txt or no rules, allowed")
return false
} }
for _, url := range v.([]string) { return isURLblocked(disallowedURLs, url.Full)
if strings.HasPrefix(s.URL.String(), url) { }
logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
func isURLblocked(disallowedURLs []string, input string) bool {
for _, url := range disallowedURLs {
if strings.HasPrefix(strings.ToLower(input), url) {
logging.LogDebug("robots.txt match: %s matches %s", input, url)
return true return true
} }
} }
}
return false return false
} }

View File

@@ -5,7 +5,7 @@ import (
"strings" "strings"
) )
// Takes robots.txt content and a host, and // ParseRobotsTxt takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't // returns a list of full URLs that shouldn't
// be visited. // be visited.
// TODO Also take into account the user agent? // TODO Also take into account the user agent?

View File

@@ -6,6 +6,7 @@ import (
) )
func TestParseRobotsTxt(t *testing.T) { func TestParseRobotsTxt(t *testing.T) {
t.Parallel()
input := `User-agent: * input := `User-agent: *
Disallow: /cgi-bin/wp.cgi/view Disallow: /cgi-bin/wp.cgi/view
Disallow: /cgi-bin/wp.cgi/media Disallow: /cgi-bin/wp.cgi/media
@@ -26,6 +27,7 @@ Disallow: /admin/`
} }
func TestParseRobotsTxtEmpty(t *testing.T) { func TestParseRobotsTxtEmpty(t *testing.T) {
t.Parallel()
input := `` input := ``
result := ParseRobotsTxt(input, "example.com") result := ParseRobotsTxt(input, "example.com")
@@ -34,3 +36,20 @@ func TestParseRobotsTxtEmpty(t *testing.T) {
t.Errorf("ParseRobotsTxt() = %v, want empty []string", result) t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
} }
} }
// TestIsURLblocked verifies that isURLblocked reports a URL as blocked
// when it starts with one of the disallowed prefixes, and as allowed
// when only a different host shares the same path.
func TestIsURLblocked(t *testing.T) {
	t.Parallel()

	prefixes := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}

	cases := []struct {
		target      string
		wantBlocked bool
	}{
		{target: "gemini://example.com/admin/index.html", wantBlocked: true},
		{target: "gemini://example1.com/admin/index.html", wantBlocked: false},
	}

	for _, tc := range cases {
		blocked := isURLblocked(prefixes, tc.target)
		switch {
		case tc.wantBlocked && !blocked:
			t.Errorf("Expected %s to be blocked", tc.target)
		case !tc.wantBlocked && blocked:
			t.Errorf("expected %s to not be blocked", tc.target)
		}
	}
}

View File

@@ -4,15 +4,13 @@ import (
"database/sql/driver" "database/sql/driver"
"encoding/json" "encoding/json"
"fmt" "fmt"
"gemini-grc/logging"
"strings"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
) )
type LinkList []GeminiUrl type LinkList []URL
func (l LinkList) Value() (driver.Value, error) { func (l *LinkList) Value() (driver.Value, error) {
return json.Marshal(l) return json.Marshal(l)
} }
@@ -30,45 +28,16 @@ func (l *LinkList) Scan(value interface{}) error {
type Snapshot struct { type Snapshot struct {
ID int `db:"id" json:"id,omitempty"` ID int `db:"id" json:"id,omitempty"`
UID string `db:"uid" json:"uid,omitempty"` //UID string `db:"uid" json:"uid,omitempty"`
URL GeminiUrl `db:"url" json:"url,omitempty"` URL URL `db:"url" json:"url,omitempty"`
Host string `db:"host" json:"host,omitempty"` Host string `db:"host" json:"host,omitempty"`
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"` Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"` MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files. Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files. GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
Links *LinkList `db:"links" json:"links,omitempty"` Header null.String `db:"header" json:"header,omitempty"` // Response header.
Links null.Value[LinkList] `db:"links" json:"links,omitempty"`
Lang null.String `db:"lang" json:"lang,omitempty"` Lang null.String `db:"lang" json:"lang,omitempty"`
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code. ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
Error null.String `db:"error" json:"error,omitempty"` // On network errors only Error null.String `db:"error" json:"error,omitempty"` // On network errors only
} }
func SnapshotToJSON(g Snapshot) string {
// Serialize the Person struct to JSON
jsonData, err := json.MarshalIndent(g, "", "\t")
if err != nil {
logging.LogError("Error serializing to JSON: %w", err)
}
return string(jsonData)
}
func SnapshotFromJSON(input string) Snapshot {
var snapshot Snapshot
err := json.Unmarshal([]byte(input), &snapshot)
if err != nil {
logging.LogError("Error deserializing from JSON: %w", err)
}
return snapshot
}
func ShouldPersistSnapshot(result *Snapshot) bool {
if !result.MimeType.Valid {
return false
}
if result.MimeType.String == "text/gemini" ||
strings.HasPrefix(result.MimeType.String, "image/") ||
strings.HasPrefix(result.MimeType.String, "text/") {
return true
}
return false
}

View File

@@ -1,36 +1,38 @@
package gemini package gemini
import ( import (
"errors"
"fmt" "fmt"
"gemini-grc/config"
"gemini-grc/logging"
"gemini-grc/uid"
"gemini-grc/util"
"strings" "strings"
"time" "time"
"gemini-grc/config"
"gemini-grc/logging"
"gemini-grc/util"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) { func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
logging.LogInfo("Spawning %d workers", numOfWorkers) logging.LogInfo("Spawning %d workers", numOfWorkers)
for i := 0; i < numOfWorkers; i++ { for i := range numOfWorkers {
go func(i int) { go func(i int) {
for { for {
runWorker(i, db) RunWorker(i, db, nil)
} }
}(i) }(i)
} }
} }
func runWorker(id int, db *sqlx.DB) { func RunWorker(id int, db *sqlx.DB, url *string) {
// Start the DB transaction // Each worker runs within a DB transaction.
tx, err := db.Beginx() tx, err := db.Beginx()
if err != nil { if err != nil {
logging.LogError("Failed to begin transaction: %w", err) logging.LogError("Failed to begin transaction: %w", err)
} }
// Commit/rollback at the end
defer func() { defer func() {
err = tx.Commit() err = tx.Commit()
if err != nil { if err != nil {
@@ -42,66 +44,97 @@ func runWorker(id int, db *sqlx.DB) {
} }
}() }()
snapshots, err := GetRandomSnapshotsDistinctHosts(tx) var snapshots []Snapshot
// If not given a specific URL,
// get some random ones to visit from DB.
if url == nil {
snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
if err != nil { if err != nil {
logging.LogError("[%d] Error retrieving snapshot: %w", id, err) logging.LogError("[%d] GeminiError retrieving snapshot: %w", id, err)
time.Sleep(10 * time.Second) panic("This should never happen")
return
} else if len(snapshots) == 0 { } else if len(snapshots) == 0 {
logging.LogInfo("[%d] No remaining snapshots to visit.", id) logging.LogInfo("[%d] No snapshots to visit.", id)
time.Sleep(1 * time.Minute) time.Sleep(1 * time.Minute)
return return
} }
} else {
snapshotURL, err := ParseURL(*url, "")
if err != nil {
logging.LogError("Invalid URL given: " + *url)
return
}
snapshots = []Snapshot{{
//UID: uid.UID(),
URL: *snapshotURL,
Host: snapshotURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}}
}
// Start visiting URLs.
total := len(snapshots) total := len(snapshots)
for i, s := range snapshots { for i, s := range snapshots {
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL) logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
// We differentiate between errors:
// Unexpected errors are the ones returned from the following function.
// If an error is unexpected (which should never happen) we panic.
// Expected errors are stored as strings within the snapshot,
// so that they can also be stored in DB.
err = workOnSnapshot(id, tx, &s) err = workOnSnapshot(id, tx, &s)
if err != nil { if err != nil {
logging.LogError("[%d] [%s] Error %w", id, s.URL, err) logging.LogError("[%d] [%s] Unexpected GeminiError %w", id, s.URL.String(), err)
util.PrintStackAndPanic(err) util.PrintStackAndPanic(err)
} }
if s.Error.Valid { if s.Error.Valid {
logging.LogWarn("[%d] [%s] Error: %v", id, s.URL, fmt.Errorf(s.Error.String)) logging.LogWarn("[%d] Error: %v", id, s.Error.String)
} }
logging.LogDebug("[%d] Done %d/%d.", id, i, total) logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
} }
logging.LogInfo("[%d] Worker done.", id) logging.LogInfo("[%d] Worker done.", id)
} }
// workOnSnapshot visits a URL and stores the result.
// errors should be returned only if they are unexpected.
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
if IsBlacklisted(s.URL) {
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
return nil
}
// If URL matches a robots.txt disallow line, // If URL matches a robots.txt disallow line,
// add it as an error so next time it won't be // add it as an error so next time it won't be
// crawled. // crawled.
if RobotMatch(s) { if RobotMatch(s.URL) {
s.Error = null.StringFrom("robots.txt disallow match") s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
err = SaveSnapshotToDB(tx, s) err = UpsertSnapshot(id, tx, s)
if err != nil { if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err) return fmt.Errorf("[%d] %w", id, err)
} }
return nil return nil
} }
// Resolve IP address via DNS
IPs, err := getHostIPAddresses(s.Host) IPs, err := getHostIPAddresses(s.Host)
if err != nil { if err != nil {
s.Error = null.StringFrom("DNS Resolve error") s.Error = null.StringFrom(err.Error())
err = SaveSnapshotToDB(tx, s) err = UpsertSnapshot(id, tx, s)
if err != nil { if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err) return fmt.Errorf("[%d] %w", id, err)
} }
return nil return nil
} }
// If the host's ip is in the connections pool, // If the host's ip is in the connections pool we stop
// stop and add the url in the queue later.
IpPool.Lock.RLock() IpPool.Lock.RLock()
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL) logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
for _, ip := range IPs { for _, ip := range IPs {
_, ok := IpPool.IPs[ip] _, ok := IpPool.IPs[ip]
if ok { if ok {
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL) logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
IpPool.Lock.RUnlock() IpPool.Lock.RUnlock()
time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain time.Sleep(1 * time.Second) // Avoid flood-retrying
return nil return nil
} }
} }
@@ -109,72 +142,114 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
AddIPsToPool(IPs) AddIPsToPool(IPs)
url := s.URL.String() // After finishing, remove the host IPs from
logging.LogDebug("[%d] Dialing %s", id, url) // the connections pool, with a small delay
Visit(s) // to avoid potentially hitting the same IP quickly.
logging.LogDebug("[%d] Finished dialing.", id) defer func() {
go func() {
time.Sleep(5 * time.Second) time.Sleep(5 * time.Second)
RemoveIPsFromPool(IPs) RemoveIPsFromPool(IPs)
}() }()
if s.MimeType.Valid && s.MimeType.String == "text/gemini" { url := s.URL.String()
logging.LogDebug("[%d] [%s] Processing", id, url) logging.LogDebug("[%d] Dialing %s", id, url)
s = ProcessGemini(s)
} err = Visit(s)
logging.LogDebug("[%d] Saving", id)
err = SaveSnapshotToDB(tx, s)
if err != nil { if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err) if !IsKnownError(err) {
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
return err
}
// Check if error is redirection, and handle it
s.Error = null.StringFrom(err.Error())
if errors.As(err, new(*GeminiError)) &&
err.(*GeminiError).Msg == "redirect" {
err = handleRedirection(id, tx, s)
if err != nil {
return err
}
}
}
logging.LogInfo("[%d] Done, response code %d.", id, s.ResponseCode.ValueOrZero())
// If this is a gemini page, parse possible links inside
if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
links := GetPageLinks(s.URL, s.GemText.String)
logging.LogDebug("[%d] Found %d links", id, len(links))
if len(links) > 0 {
s.Links = null.ValueFrom(links)
}
} else {
logging.LogDebug("[%d] Not looking for page links", id)
} }
// Store links in batch err = UpsertSnapshot(id, tx, s)
if s.Links != nil { if err != nil {
var batchSnapshots []*Snapshot return err
timestamp := null.TimeFrom(time.Now()) }
for _, link := range *s.Links { err = storeLinks(tx, s)
if shouldPersistURL(tx, link) { if err != nil {
return err
}
return nil
}
func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
if s.Links.Valid {
var batchSnapshots []*Snapshot
for _, link := range s.Links.ValueOrZero() {
if shouldPersistURL(link) {
newSnapshot := &Snapshot{ newSnapshot := &Snapshot{
UID: uid.UID(), //UID: uid.UID(),
URL: link, URL: link,
Host: link.Hostname, Host: link.Hostname,
Timestamp: timestamp, Timestamp: null.TimeFrom(time.Now()),
} }
batchSnapshots = append(batchSnapshots, newSnapshot) batchSnapshots = append(batchSnapshots, newSnapshot)
} }
} }
if len(batchSnapshots) > 0 { if len(batchSnapshots) > 0 {
logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots)) err := SaveLinksToDBinBatches(tx, batchSnapshots)
err = SaveLinksToDBinBatches(tx, batchSnapshots)
if err != nil { if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err) return err
} }
} }
} }
return nil return nil
} }
// Should we save the given URL for crawling? // shouldPersistURL returns true if we
func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool { // should save the URL in the DB.
if !strings.HasPrefix(u.String(), "gemini://") { // Only gemini:// urls are saved.
return false func shouldPersistURL(u URL) bool {
} return strings.HasPrefix(u.String(), "gemini://")
query := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)` }
var exists bool
err := tx.Get(&exists, query, u.String()) func handleRedirection(id int, tx *sqlx.Tx, s *Snapshot) error {
newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
if err != nil { if err != nil {
fmt.Println("Error executing query:", err) return err
return false
} }
return !exists logging.LogDebug("[%d] Page redirects to %s", id, newURL)
// Insert fresh snapshot with new URL
snapshot := &Snapshot{
//UID: uid.UID(),
URL: *newURL,
Host: newURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}
logging.LogDebug("[%d] Saving empty snapshot for %s", id, snapshot.URL.String())
err = SaveSnapshotIfNew(tx, snapshot)
if err != nil {
return err
}
return nil
} }
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) { func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
// Old, unoptimized query // Old, unoptimized query
//
// query := ` // query := `
// SELECT DISTINCT ON (host) * // SELECT DISTINCT ON (host) *
// FROM snapshots // FROM snapshots
@@ -184,20 +259,28 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
// LIMIT $1 // LIMIT $1
// ` // `
query := ` query := `
WITH RankedSnapshots AS ( SELECT *
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
links, lang, response_code, error,
ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
FROM snapshots FROM snapshots
WHERE response_code IS NULL WHERE response_code IS NULL
AND error IS NULL AND error IS NULL
) ORDER BY RANDOM()
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
links, lang, response_code, error
FROM RankedSnapshots
WHERE rn = 1
LIMIT $1 LIMIT $1
` `
//query := `
// WITH RankedSnapshots AS (
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
// links, lang, response_code, error,
// ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
// FROM snapshots
// WHERE response_code IS NULL
// AND error IS NULL
// )
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
// links, lang, response_code, error
// FROM RankedSnapshots
// WHERE rn = 1
// LIMIT $1
//`
var snapshots []Snapshot var snapshots []Snapshot
err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize) err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
if err != nil { if err != nil {
@@ -205,3 +288,18 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
} }
return snapshots, nil return snapshots, nil
} }
// GetSnapshotFromURL looks up the snapshot row whose url column equals
// the given url, within the supplied transaction. The query is limited
// to one row, so the returned slice holds at most a single snapshot.
// Any error from the underlying Select call is returned unchanged.
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) {
	const lookup = `
SELECT *
FROM snapshots
WHERE url=$1
LIMIT 1
`
	var found []Snapshot
	if err := tx.Select(&found, lookup, url); err != nil {
		return nil, err
	}
	return found, nil
}

18
go.mod
View File

@@ -3,23 +3,27 @@ module gemini-grc
go 1.23.1 go 1.23.1
require ( require (
github.com/jaevor/go-nanoid v1.4.0 github.com/guregu/null/v5 v5.0.0
github.com/jackc/pgx/v5 v5.7.1
github.com/jmoiron/sqlx v1.4.0
github.com/matoous/go-nanoid/v2 v2.1.0
github.com/rs/zerolog v1.33.0 github.com/rs/zerolog v1.33.0
github.com/stretchr/testify v1.9.0
golang.org/x/text v0.19.0
) )
require ( require (
github.com/guregu/null/v5 v5.0.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/pgx/v5 v5.7.1 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/jmoiron/sqlx v1.4.0 // indirect github.com/kr/text v0.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-isatty v0.0.20 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.13.1 // indirect
golang.org/x/crypto v0.27.0 // indirect golang.org/x/crypto v0.27.0 // indirect
golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 // indirect
golang.org/x/net v0.27.0 // indirect
golang.org/x/sync v0.8.0 // indirect golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.25.0 // indirect golang.org/x/sys v0.25.0 // indirect
golang.org/x/text v0.18.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect
) )

34
go.sum
View File

@@ -1,8 +1,11 @@
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gabriel-vasile/mimetype v1.4.5 h1:J7wGKdGu33ocBOhGy0z653k/lFKLFDPJMG8Gql0kxn4= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/gabriel-vasile/mimetype v1.4.5/go.mod h1:ibHel+/kbxn9x2407k1izTA1S81ku1z/DlgOW2QE0M4= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y=
github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/guregu/null/v5 v5.0.0 h1:PRxjqyOekS11W+w/7Vfz6jgJE/BCwELWtgvOJzddimw= github.com/guregu/null/v5 v5.0.0 h1:PRxjqyOekS11W+w/7Vfz6jgJE/BCwELWtgvOJzddimw=
@@ -15,32 +18,39 @@ github.com/jackc/pgx/v5 v5.7.1 h1:x7SYsPBYDkHDksogeSmZZ5xzThcTgRz++I5E+ePFUcs=
github.com/jackc/pgx/v5 v5.7.1/go.mod h1:e7O26IywZZ+naJtWWos6i6fvWK+29etgITqrqHLfoZA= github.com/jackc/pgx/v5 v5.7.1/go.mod h1:e7O26IywZZ+naJtWWos6i6fvWK+29etgITqrqHLfoZA=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/jaevor/go-nanoid v1.4.0 h1:mPz0oi3CrQyEtRxeRq927HHtZCJAAtZ7zdy7vOkrvWs=
github.com/jaevor/go-nanoid v1.4.0/go.mod h1:GIpPtsvl3eSBsjjIEFQdzzgpi50+Bo1Luk+aYlbJzlc=
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE=
github.com/matoous/go-nanoid/v2 v2.1.0/go.mod h1:KlbGNQ+FhrUNIHUxZdL63t7tl4LaPkZNpUULS8H4uVM=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A=
golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70=
golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 h1:1wqE9dj9NpSm04INVsJhhEUzhuDVjbcyKH91sVyPATw=
golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8=
golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -48,7 +58,11 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

52
http/http.go Normal file
View File

@@ -0,0 +1,52 @@
package http
import (
	"errors"
	"fmt"
	"net/http"
	"time"

	"gemini-grc/logging"
	_ "gemini-grc/logging"
)
// CreateServer builds an HTTP server bound to listenAddr that serves
// GET /ping, starts it in a background goroutine, and returns it so the
// caller can stop it later with Close or Shutdown.
//
// The background goroutine panics if the server stops for any reason
// other than a deliberate Close/Shutdown (for example when the address
// is already in use).
func CreateServer(listenAddr string) *http.Server {
	mux := http.NewServeMux()
	mux.HandleFunc("GET /ping", wrapForError(getPing))

	server := &http.Server{
		Addr:              listenAddr,
		Handler:           mux,
		ReadHeaderTimeout: 10 * time.Second,
	}

	go func() {
		// Start the server. Blocking call.
		logging.LogInfo("HTTP server listening on %s", listenAddr)
		err := server.ListenAndServe()
		// ListenAndServe always returns a non-nil error; after Close or
		// Shutdown it is http.ErrServerClosed, which signals a normal
		// stop and must not be treated as a startup failure.
		if err != nil && !errors.Is(err, http.ErrServerClosed) {
			panic(fmt.Sprintf("Server failed to start: %s", err))
		}
	}()

	return server
}
// wrapForError adapts a handler that returns an error into a standard
// http.HandlerFunc. When the wrapped handler returns a non-nil error,
// the error is logged as a warning and a 500 Internal Server Error
// response is sent to the client.
func wrapForError(f func(http.ResponseWriter, *http.Request) error) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		handlerErr := f(w, r)
		if handlerErr == nil {
			return
		}
		status := http.StatusInternalServerError
		logging.LogWarn("Error while handling request: %d %s", status, handlerErr)
		http.Error(w, http.StatusText(status), status)
	}
}
// getPing answers a ping request by echoing back the request method,
// the full request URL and the URL path, prefixed with "Pong".
// It returns a wrapped error if the response cannot be written.
func getPing(w http.ResponseWriter, r *http.Request) error {
	if _, err := fmt.Fprintf(w, "Pong %s %s %s", r.Method, r.URL.String(), r.URL.Path); err != nil {
		return fmt.Errorf("failed to write response: %w", err)
	}
	return nil
}

27
main.go
View File

@@ -3,15 +3,14 @@ package main
import ( import (
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/gemini" "gemini-grc/gemini"
"gemini-grc/http"
"gemini-grc/logging" "gemini-grc/logging"
"github.com/jmoiron/sqlx"
"github.com/rs/zerolog"
zlog "github.com/rs/zerolog/log"
"os" "os"
"os/signal" "os/signal"
"syscall" "syscall"
"github.com/jmoiron/sqlx"
"github.com/rs/zerolog"
zlog "github.com/rs/zerolog/log"
) )
func main() { func main() {
@@ -27,9 +26,10 @@ func main() {
func runApp() error { func runApp() error {
logging.LogInfo("Starting up. Press Ctrl+C to exit") logging.LogInfo("Starting up. Press Ctrl+C to exit")
sigs := make(chan os.Signal, 1) signals := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
server := http.CreateServer("localhost:8899")
db := gemini.ConnectToDB() db := gemini.ConnectToDB()
// !!! DANGER !!! // !!! DANGER !!!
@@ -44,9 +44,20 @@ func runApp() error {
} }
}(db) }(db)
gemini.LoadBlacklist()
// If there's an argument, assume it's a URL
// to visit and ignore database state.
if len(os.Args) > 1 {
url := os.Args[1]
go gemini.RunWorker(0, db, &url)
} else {
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db) go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
}
<-sigs <-signals
if err := server.Close(); err != nil {
logging.LogError("GeminiError during server shutdown: %s", err)
}
logging.LogInfo("Received SIGINT or SIGTERM signal, exiting") logging.LogInfo("Received SIGINT or SIGTERM signal, exiting")
return nil return nil
} }

View File

@@ -1,14 +1,14 @@
package uid package uid
import ( import (
nanoid "github.com/jaevor/go-nanoid" nanoid "github.com/matoous/go-nanoid/v2"
) )
func UID() string { func UID() string {
// Missing o,O and l // No 'o','O' and 'l'
uid, err := nanoid.CustomASCII("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20) id, err := nanoid.Generate("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20)
if err != nil { if err != nil {
panic(err) panic(err)
} }
return uid() return id
} }