Compare commits

...

14 Commits

41 changed files with 1268 additions and 492 deletions

2
.gitignore vendored
View File

@@ -1,6 +1,8 @@
.idea .idea
.goroot
**/.#* **/.#*
**/*~ **/*~
/.go
/cmd /cmd
/db/initdb.sql /db/initdb.sql
/db/*sh /db/*sh

View File

@@ -5,6 +5,15 @@ export PATH := $(PATH)
all: fmt lint test all: fmt lint test
.PHONY: debug
debug:
@echo "PATH: $(PATH)"
@echo "GOPATH: $(shell go env GOPATH)"
@which go
@which gofumpt
@which gci
@which golangci-lint
# Test # Test
test: test:
go test -v ./... go test -v ./...

View File

@@ -2,6 +2,8 @@
A Gemini crawler. A Gemini crawler.
URLs to visit, as well as data from visited URLs, are stored as "snapshots" in the database.
## Done ## Done
- [x] Concurrent downloading with workers - [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host - [x] Concurrent connection limit per host
@@ -10,22 +12,16 @@ A Gemini crawler.
- [x] Configuration via environment variables - [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL - [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation - [x] Proper response header & body UTF-8 and format validation
- [x] Follow robots.txt - [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
- [x] Handle redirects (3X status codes)
## TODO ## TODO
- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi - [ ] Better URL normalization
- [ ] Proper handling of all response codes - [ ] Provide a TLS cert for sites that require it, like Astrobotany
- [ ] Handle 3X redirects properly
- [ ] Handle URLs that need presentation of a TLS cert, like astrobotany
+ [ ] Probably have a common "grc" cert for all?
- [ ] Proper input and response validations:
+ [ ] When making a request, the URI MUST NOT exceed 1024 bytes
- [ ] Subscriptions to gemini pages? gemini://geminiprotocol.net/docs/companion/
## TODO for later ## TODO for later
- [ ] Add other protocols - [ ] Gopher
+ [ ] Gopher - [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi
+ [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi - [ ] Spartan
+ [ ] Spartan - [ ] Nex
+ [ ] Nex - [ ] SuperTXT https://supertxt.net/00-intro.html
+ [ ] SuperTXT https://supertxt.net/00-intro.html

2
blacklist.txt Normal file
View File

@@ -0,0 +1,2 @@
gemi.dev
mastogem.picasoft.net

View File

@@ -8,83 +8,149 @@ import (
"github.com/rs/zerolog" "github.com/rs/zerolog"
) )
// Environment variable names.
const (
EnvLogLevel = "LOG_LEVEL"
EnvNumWorkers = "NUM_OF_WORKERS"
EnvWorkerBatchSize = "WORKER_BATCH_SIZE"
EnvMaxResponseSize = "MAX_RESPONSE_SIZE"
EnvResponseTimeout = "RESPONSE_TIMEOUT"
EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR"
EnvBlacklistPath = "BLACKLIST_PATH"
EnvDryRun = "DRY_RUN"
)
// Config holds the application configuration loaded from environment variables.
type Config struct { type Config struct {
LogLevel zerolog.Level LogLevel zerolog.Level // Logging level (debug, info, warn, error)
rootPath string MaxResponseSize int // Maximum size of response in bytes
MaxResponseSize int NumOfWorkers int // Number of concurrent workers
NumOfWorkers int ResponseTimeout int // Timeout for responses in seconds
ResponseTimeout int WorkerBatchSize int // Batch size for worker processing
WorkerBatchSize int PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
BlacklistPath string // File that has blacklisted strings of "host:port"
DryRun bool // If true, don't write to disk (NOTE(review): this comment previously read "If false" — confirm against usage)
} }
var CONFIG Config var CONFIG Config //nolint:gochecknoglobals
func GetConfig() *Config { // parsePositiveInt parses and validates positive integer values.
var config Config func parsePositiveInt(param, value string) (int, error) {
for _, envVar := range []string{ val, err := strconv.Atoi(value)
"LOG_LEVEL", if err != nil {
"ROOT_PATH", return 0, ValidationError{
"NUM_OF_WORKERS", Param: param,
"WORKER_BATCH_SIZE", Value: value,
"MAX_RESPONSE_SIZE", Reason: "must be a valid integer",
"RESPONSE_TIMEOUT",
} {
if env, ok := os.LookupEnv(envVar); !ok {
fmt.Fprintf(os.Stderr, "Missing env var %s\n", envVar)
os.Exit(1)
} else {
switch envVar {
case "LOG_LEVEL":
{
logLevel, err := zerolog.ParseLevel(env)
if err != nil {
fmt.Fprintf(os.Stderr, "Invalid LOG_LEVEL value\n")
os.Exit(1)
}
config.LogLevel = logLevel
}
case "ROOT_PATH":
{
config.rootPath = env
}
case "NUM_OF_WORKERS":
{
if numOfWorkers, err := strconv.Atoi(env); err != nil {
fmt.Fprintf(os.Stderr, "Invalid NUM_OF_WORKERS value\n")
os.Exit(1)
} else {
config.NumOfWorkers = numOfWorkers
}
}
case "WORKER_BATCH_SIZE":
{
if workerBatchSize, err := strconv.Atoi(env); err != nil {
fmt.Fprintf(os.Stderr, "Invalid WORKER_BATCH_SIZE value\n")
os.Exit(1)
} else {
config.WorkerBatchSize = workerBatchSize
}
}
case "MAX_RESPONSE_SIZE":
{
if maxResponseSize, err := strconv.Atoi(env); err != nil {
fmt.Fprintf(os.Stderr, "Invalid MAX_RESPONSE_SIZE value\n")
os.Exit(1)
} else {
config.MaxResponseSize = maxResponseSize
}
}
case "RESPONSE_TIMEOUT":
{
if val, err := strconv.Atoi(env); err != nil {
fmt.Fprintf(os.Stderr, "Invalid RESPONSE_TIMEOUT value\n")
os.Exit(1)
} else {
config.ResponseTimeout = val
}
}
}
} }
} }
return &config if val <= 0 {
return 0, ValidationError{
Param: param,
Value: value,
Reason: "must be positive",
}
}
return val, nil
}
// parseBool converts value to a bool with strconv.ParseBool.
// When the conversion fails it returns false and a ValidationError
// that carries param and the rejected value.
func parseBool(param, value string) (bool, error) {
	parsed, parseErr := strconv.ParseBool(value)
	if parseErr == nil {
		return parsed, nil
	}
	return false, ValidationError{
		Param:  param,
		Value:  value,
		Reason: "cannot be converted to boolean",
	}
}
// GetConfig loads and validates configuration from environment variables
func GetConfig() *Config {
config := &Config{}
// Map of environment variables to their parsing functions
parsers := map[string]func(string) error{
EnvLogLevel: func(v string) error {
level, err := zerolog.ParseLevel(v)
if err != nil {
return ValidationError{
Param: EnvLogLevel,
Value: v,
Reason: "must be one of: debug, info, warn, error",
}
}
config.LogLevel = level
return nil
},
EnvNumWorkers: func(v string) error {
val, err := parsePositiveInt(EnvNumWorkers, v)
if err != nil {
return err
}
config.NumOfWorkers = val
return nil
},
EnvWorkerBatchSize: func(v string) error {
val, err := parsePositiveInt(EnvWorkerBatchSize, v)
if err != nil {
return err
}
config.WorkerBatchSize = val
return nil
},
EnvMaxResponseSize: func(v string) error {
val, err := parsePositiveInt(EnvMaxResponseSize, v)
if err != nil {
return err
}
config.MaxResponseSize = val
return nil
},
EnvResponseTimeout: func(v string) error {
val, err := parsePositiveInt(EnvResponseTimeout, v)
if err != nil {
return err
}
config.ResponseTimeout = val
return nil
},
EnvPanicOnUnexpectedError: func(v string) error {
val, err := parseBool(EnvPanicOnUnexpectedError, v)
if err != nil {
return err
}
config.PanicOnUnexpectedError = val
return nil
},
EnvBlacklistPath: func(v string) error {
config.BlacklistPath = v
return nil
},
EnvDryRun: func(v string) error {
val, err := parseBool(EnvDryRun, v)
if err != nil {
return err
}
config.DryRun = val
return nil
},
}
// Process each environment variable
for envVar, parser := range parsers {
value, ok := os.LookupEnv(envVar)
if !ok {
fmt.Fprintf(os.Stderr, "Missing required environment variable: %s\n", envVar)
os.Exit(1)
}
if err := parser(value); err != nil {
fmt.Fprintf(os.Stderr, "Configuration error: %v\n", err)
os.Exit(1)
}
}
return config
} }

14
config/errors.go Normal file
View File

@@ -0,0 +1,14 @@
package config
import "fmt"
// ValidationError describes a configuration value that failed validation.
type ValidationError struct {
	Param  string // Name of the parameter being validated.
	Value  string // Raw value that was rejected.
	Reason string // Explanation of why the value was rejected.
}

// Error implements the error interface, reporting the rejected value,
// the parameter name and the reason, in that order.
func (e ValidationError) Error() string {
	const layout = "invalid value '%s' for %s: %s"
	return fmt.Sprintf(layout, e.Value, e.Param, e.Reason)
}

View File

@@ -0,0 +1,7 @@
delete FROM snapshots
WHERE host IN (
SELECT DISTINCT host
FROM snapshots
WHERE error LIKE 'robots.txt%'
)
AND url LIKE 'gemini://' || host || '/%';

5
db/error_stats.sql Normal file
View File

@@ -0,0 +1,5 @@
SELECT error, count(error) as count
FROM snapshots
GROUP BY error
ORDER BY count DESC
LIMIT 20;

22
db/fix-url-ports.sql Normal file
View File

@@ -0,0 +1,22 @@
-- This script finds and removes snapshots whose URL has no port number
-- when a duplicate exists with the default port 1965 appended.
--
-- NOTE(review): the join appends ':1965' to the END of the port-less URL,
-- so it only pairs rows such as 'gemini://host' with 'gemini://host:1965'.
-- A URL that carries a path ('gemini://host/a' vs 'gemini://host:1965/a')
-- is not matched by this condition — confirm that this is the intended scope.
--
-- Before running the DELETE, verify the matches first with this SELECT:
WITH duplicates AS (
SELECT s1.id as id_without_port, s2.id as id_with_port, s1.url as url_without_port, s2.url as url_with_port
FROM snapshots s1
JOIN snapshots s2
ON s2.url = s1.url || ':1965'
)
SELECT * FROM duplicates;
-- Now delete the port-less rows (the rows with the explicit port are kept):
WITH duplicates AS (
SELECT s1.id as id_without_port, s2.id as id_with_port, s1.url as url_without_port, s2.url as url_with_port
FROM snapshots s1
JOIN snapshots s2
ON s2.url = s1.url || ':1965'
)
DELETE FROM snapshots
WHERE id IN (SELECT id_without_port FROM duplicates);

View File

@@ -0,0 +1,7 @@
SELECT host, COUNT(*) AS row_count
FROM snapshots
WHERE response_code IS NOT NULL
AND error IS NULL
GROUP BY host
ORDER BY row_count DESC
LIMIT 10;

View File

@@ -20,7 +20,7 @@ DROP TABLE IF EXISTS snapshots;
CREATE TABLE snapshots ( CREATE TABLE snapshots (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
uid TEXT NOT NULL UNIQUE, uid TEXT NOT NULL UNIQUE,
url TEXT NOT NULL, url TEXT NOT NULL UNIQUE,
host TEXT NOT NULL, host TEXT NOT NULL,
timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
mimetype TEXT, mimetype TEXT,
@@ -42,7 +42,10 @@ CREATE INDEX idx_lang ON snapshots (lang);
CREATE INDEX idx_response_code ON snapshots (response_code); CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error); CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host); CREATE INDEX idx_host ON snapshots (host);
CREATE INDEX unique_uid_url ON snapshots (uid, url);
CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host) CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host)
WHERE response_code IS NULL AND error IS NULL WHERE response_code IS NULL AND error IS NULL
INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang); INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang);
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL; CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;

View File

@@ -0,0 +1,18 @@
-- Step 1: Delete duplicate entries, keeping the last one based on timestamp
-- Use a CTE to mark duplicates and delete them efficiently
WITH ranked_snapshots AS (
SELECT
id,
url,
ROW_NUMBER() OVER(PARTITION BY url ORDER BY timestamp DESC) AS row_num
FROM
snapshots
)
DELETE FROM snapshots
USING ranked_snapshots
WHERE snapshots.id = ranked_snapshots.id
AND ranked_snapshots.row_num > 1;
-- Step 2: Add a unique constraint on the url column to prevent future duplicates
ALTER TABLE snapshots
ADD CONSTRAINT unique_url UNIQUE (url);

View File

@@ -2,44 +2,56 @@ package main
import ( import (
"fmt" "fmt"
"gemini-grc/gemini"
"os" "os"
"gemini-grc/gemini"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
// MANUALLY SET TO TRUE WHEN MIGRATION HAS BEEN APPLIED
func checkIfDone() bool { return true } func checkIfDone() bool { return true }
// MustSelect runs query with args inside tx, scanning the result into dest,
// and panics with the returned error if the select fails.
func MustSelect(tx *sqlx.Tx, dest interface{}, query string, args ...interface{}) {
	err := tx.Select(dest, query, args...)
	if err == nil {
		return
	}
	panic(err)
}
// Populates the `host` field // Populates the `host` field
func main() { func main() {
db := connectToDB()
if checkIfDone() { if checkIfDone() {
fmt.Println("Migration already applied") fmt.Println("Migration already applied")
return return
} }
count := 0 db := connectToDB()
defer db.Close()
batchSize := 1000
for { for {
// Start the transaction // Start the transaction
tx, err := db.Beginx() tx := db.MustBegin()
if err != nil { query := `SELECT * FROM snapshots WHERE url NOT LIKE '%1965%' LIMIT $1`
fmt.Println(err)
return
}
query := `
SELECT * FROM snapshots
WHERE host IS NULL
LIMIT 5000
`
var snapshots []gemini.Snapshot var snapshots []gemini.Snapshot
err = tx.Select(&snapshots, query) MustSelect(tx, &snapshots, query, batchSize)
if len(snapshots) == 0 { if len(snapshots) == 0 {
fmt.Println("Done!") fmt.Println("No snapshots remaining, done")
return break
} }
for i, s := range snapshots {
_, err := gemini.ParseURL(s.URL.String(), "")
if err != nil {
panic(fmt.Sprintf("Error parsing URL. ID %d URL %s\n", s.ID, s.URL))
}
fmt.Printf("Saving %d %d %s\n", i+1, s.ID, s.URL)
err = gemini.UpsertSnapshot(0, tx, &s)
if err != nil {
panic(fmt.Sprintf("Error saving %s: %s", s.URL, err))
}
tx.MustExec(`DELETE FROM snapshots WHERE id=$1`, s.ID)
}
err := tx.Commit()
if err != nil { if err != nil {
fmt.Println(err) fmt.Println(err)
err := tx.Rollback() err := tx.Rollback()
@@ -47,31 +59,7 @@ func main() {
panic(err) panic(err)
} }
} }
for _, s := range snapshots {
s.Host = s.URL.Hostname
fmt.Println(count, s.UID, s.URL.Hostname)
err := gemini.SaveSnapshotToDB(tx, &s)
if err != nil {
fmt.Println(err)
err := tx.Rollback()
if err != nil {
panic(err)
}
}
count += 1
}
err = tx.Commit()
if err != nil {
fmt.Println(err)
err := tx.Rollback()
if err != nil {
panic(err)
}
}
} }
} }
func connectToDB() *sqlx.DB { func connectToDB() *sqlx.DB {

View File

@@ -1,9 +1,9 @@
package main package main
import ( import (
"gemini-grc/uid"
"time" "time"
"gemini-grc/uid"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )

5
db/url_port_stats.sql Normal file
View File

@@ -0,0 +1,5 @@
-- Counts snapshots by whether their URL carries an explicit port.
-- The host part is matched with [^/:]+ so that the pattern cannot run past
-- the first '/' and mistake a ':<digits>' sequence inside the path for a port.
SELECT
    COUNT(*) AS "All",
    COUNT(CASE WHEN URL ~ '://[^/:]+:[0-9]+' THEN 1 END) AS "With port",
    COUNT(CASE WHEN URL !~ '://[^/:]+:[0-9]+' THEN 1 END) AS "Without port"
FROM snapshots;

View File

@@ -1,14 +1,16 @@
#!/bin/sh #!/bin/sh
set -eu set -eu
# Max response size 10MiB
MAX_RESPONSE_SIZE=10485760 \ MAX_RESPONSE_SIZE=10485760 \
LOG_LEVEL=info \ LOG_LEVEL=debug \
ROOT_PATH=./snaps \ ROOT_PATH=./snaps \
RESPONSE_TIMEOUT=10 \ RESPONSE_TIMEOUT=10 \
NUM_OF_WORKERS=5 \ NUM_OF_WORKERS=1 \
WORKER_BATCH_SIZE=1 \
PG_DATABASE=gemini \ PG_DATABASE=gemini \
PG_HOST=127.0.0.1 \ PG_HOST=127.0.0.1 \
PG_PORT=5433 \ PG_PORT=5433 \
PG_USER=gemini \ PG_USER=gemini \
PG_PASSWORD=gemini \ PG_PASSWORD=gemini \
go run ./migrate1_host.go dlv debug

View File

@@ -1,3 +1,6 @@
// 31 redirect
gemini://gemini.circumlunar.space
// body with null byte // body with null byte
gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False

51
gemini/blacklist.go Normal file
View File

@@ -0,0 +1,51 @@
package gemini
import (
"fmt"
"os"
"strings"
"gemini-grc/config"
"gemini-grc/logging"
)
var Blacklist *[]string //nolint:gochecknoglobals
// LoadBlacklist reads the file named by config.CONFIG.BlacklistPath into
// Blacklist. It does nothing when Blacklist has already been set.
//
// The file holds one entry per line. Surrounding whitespace (including a
// trailing "\r") is trimmed, and blank lines and lines starting with '#'
// are skipped, so a trailing newline no longer produces an empty entry.
//
// If the file cannot be read, a warning is logged and Blacklist is set to
// an empty list.
func LoadBlacklist() {
	if Blacklist != nil {
		return
	}

	data, err := os.ReadFile(config.CONFIG.BlacklistPath)
	if err != nil {
		Blacklist = &[]string{}
		logging.LogWarn("Could not load Blacklist file: %v", err)
		return
	}

	lines := strings.Split(string(data), "\n")
	entries := make([]string, 0, len(lines))
	for _, line := range lines {
		entry := strings.TrimSpace(line)
		// Skip blank lines and comments.
		if entry == "" || strings.HasPrefix(entry, "#") {
			continue
		}
		entries = append(entries, entry)
	}

	Blacklist = &entries
	logging.LogInfo("Blacklist has %d entries", len(*Blacklist))
}
// IsBlacklisted reports whether url matches a Blacklist entry, either by
// hostname alone or by "hostname:port".
//
// If the blacklist has not been loaded yet it is loaded first, so that a
// missing LoadBlacklist call cannot cause a nil pointer dereference here.
// NOTE(review): the lazy load is not synchronized — call LoadBlacklist once
// at startup if IsBlacklisted is used from several goroutines.
func IsBlacklisted(url URL) bool {
	if Blacklist == nil {
		LoadBlacklist()
	}
	hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
	for _, v := range *Blacklist {
		if v == url.Hostname || v == hostWithPort {
			return true
		}
	}
	return false
}

View File

@@ -4,11 +4,11 @@ import (
"gemini-grc/logging" "gemini-grc/logging"
) )
var IpPool IpAddressPool = IpAddressPool{IPs: make(map[string]int)} var IpPool = IpAddressPool{IPs: make(map[string]int)}
func AddIPsToPool(IPs []string) { func AddIPsToPool(ips []string) {
IpPool.Lock.Lock() IpPool.Lock.Lock()
for _, ip := range IPs { for _, ip := range ips {
logging.LogDebug("Adding %s to pool", ip) logging.LogDebug("Adding %s to pool", ip)
IpPool.IPs[ip]++ IpPool.IPs[ip]++
} }

101
gemini/errors.go Normal file
View File

@@ -0,0 +1,101 @@
package gemini
import (
"errors"
"fmt"
)
// GeminiError is an error that carries a Gemini status code together with
// a short message and the header it was created from.
type GeminiError struct {
	Msg    string // Short description of the error.
	Code   int    // Status code.
	Header string // Header text the error was built from.
}

// Error implements the error interface as "<Msg>: <Header>".
func (e *GeminiError) Error() string {
	const layout = "%s: %s"
	return fmt.Sprintf(layout, e.Msg, e.Header)
}
// NewErrGeminiStatusCode builds a *GeminiError for the given status code
// and header. The message is chosen from the tens digit of the code:
// 1x "needs input", 3x "redirect", 4x "bad request", 5x "server error",
// 6x "TLS error"; every other code yields "unexpected status code".
//
// NOTE(review): the Gemini specification names the 4x, 5x and 6x classes
// "temporary failure", "permanent failure" and "client certificates";
// the wording here is kept as-is — confirm whether it is intentional.
func NewErrGeminiStatusCode(code int, header string) error {
	description := "unexpected status code"
	switch code / 10 {
	case 1:
		description = "needs input"
	case 3:
		description = "redirect"
	case 4:
		description = "bad request"
	case 5:
		description = "server error"
	case 6:
		description = "TLS error"
	}
	return &GeminiError{
		Msg:    description,
		Code:   code,
		Header: header,
	}
}
// Sentinel errors used across the package. Callers wrap them with
// additional context and test for them with errors.Is.
var (
	// Gemini protocol and gemtext handling.
	ErrGeminiRobotsParse      = errors.New("gemini robots.txt parse error")
	ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
	ErrGeminiResponseHeader   = errors.New("gemini response header error")
	ErrGeminiLinkLineParse    = errors.New("gemini link line parse error")

	// URL and text parsing.
	ErrURLParse   = errors.New("URL parse error")
	ErrURLDecode  = errors.New("URL decode error")
	ErrUTF8Parse  = errors.New("UTF-8 parse error")
	ErrTextParse  = errors.New("text parse error")

	// Networking.
	ErrNetwork                        = errors.New("network error")
	ErrNetworkDNS                     = errors.New("network DNS error")
	ErrNetworkTLS                     = errors.New("network TLS error")
	ErrNetworkSetConnectionDeadline   = errors.New("network error - cannot set connection deadline")
	ErrNetworkCannotWrite             = errors.New("network error - cannot write")
	ErrNetworkResponseSizeExceededMax = errors.New("network error - response size exceeded maximum size")

	// Persistence.
	ErrDatabase = errors.New("database error")
)
// We could have used a map for speed, but
// we would lose ability to check wrapped
// errors via errors.Is().
// errGemini is a nil *GeminiError used as the first entry of knownErrors.
var errGemini *GeminiError
// knownErrors lists the errors that IsKnownError compares against with errors.Is.
var knownErrors = []error{ //nolint:gochecknoglobals
// NOTE(review): errGemini is stored as a typed nil pointer, so errors.Is
// only matches it when the checked error is itself a nil *GeminiError.
// Real GeminiError values are recognized by the errors.As check in
// IsKnownError — confirm whether this entry is still needed.
errGemini,
ErrGeminiLinkLineParse,
ErrGeminiRobotsParse,
ErrGeminiRobotsDisallowed,
ErrGeminiResponseHeader,
ErrURLParse,
ErrURLDecode,
ErrUTF8Parse,
ErrTextParse,
ErrNetwork,
ErrNetworkDNS,
ErrNetworkTLS,
ErrNetworkSetConnectionDeadline,
ErrNetworkCannotWrite,
ErrNetworkResponseSizeExceededMax,
ErrDatabase,
}
// IsKnownError reports whether err, or any error it wraps, is one of the
// entries of knownErrors or is a *GeminiError.
func IsKnownError(err error) bool {
	for i := range knownErrors {
		if errors.Is(err, knownErrors[i]) {
			return true
		}
	}
	// A *GeminiError anywhere in the chain also counts as known.
	var geminiErr *GeminiError
	return errors.As(err, &geminiErr)
}

24
gemini/errors_test.go Normal file
View File

@@ -0,0 +1,24 @@
package gemini
import (
"errors"
"fmt"
"testing"
)
func TestErrGemini(t *testing.T) {
t.Parallel()
err := NewErrGeminiStatusCode(50, "50 server error")
if !errors.As(err, new(*GeminiError)) {
t.Errorf("TestErrGemini fail")
}
}
// TestErrGeminiWrapped checks that a *GeminiError is still found by
// errors.As after it has been wrapped with fmt.Errorf and %w.
func TestErrGeminiWrapped(t *testing.T) {
	t.Parallel()
	inner := NewErrGeminiStatusCode(50, "50 server error")
	outer := fmt.Errorf("%w wrapped", inner)
	var target *GeminiError
	if !errors.As(outer, &target) {
		t.Errorf("TestErrGeminiWrapped fail")
	}
}

View File

@@ -2,12 +2,13 @@ package gemini
import ( import (
"fmt" "fmt"
"gemini-grc/logging"
"net/url" "net/url"
"os" "os"
"path" "path"
"path/filepath" "path/filepath"
"strings" "strings"
"gemini-grc/logging"
) )
// sanitizePath encodes invalid filesystem characters using URL encoding. // sanitizePath encodes invalid filesystem characters using URL encoding.
@@ -77,7 +78,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
finalPath, err := calcFilePath(parentPath, urlPath) finalPath, err := calcFilePath(parentPath, urlPath)
if err != nil { if err != nil {
logging.LogError("Error saving %s: %w", s.URL, err) logging.LogError("GeminiError saving %s: %w", s.URL, err)
return return
} }
// Ensure the directory exists // Ensure the directory exists
@@ -87,12 +88,12 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
return return
} }
if s.MimeType.Valid && s.MimeType.String == "text/gemini" { if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
err = os.WriteFile(finalPath, (*s).Data.V, 0666) err = os.WriteFile(finalPath, (*s).Data.V, 0o666)
} else { } else {
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0666) err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
} }
if err != nil { if err != nil {
logging.LogError("Error saving %s: %w", s.URL.Full, err) logging.LogError("GeminiError saving %s: %w", s.URL.Full, err)
} }
close(done) close(done)
} }

View File

@@ -1,76 +1,37 @@
package gemini package gemini
import ( import (
"errors"
"fmt" "fmt"
"gemini-grc/logging"
"net/url" "net/url"
"regexp" "regexp"
"strconv" "strconv"
"gemini-grc/logging"
) )
func checkGeminiStatusCode(code int) error { func GetPageLinks(currentURL URL, gemtext string) LinkList {
switch {
case code == 20:
return nil
case code >= 10 && code < 20:
return fmt.Errorf("gemini response %d needs data input", code)
case code >= 30 && code < 40:
return fmt.Errorf("gemini response %d redirect", code)
case code >= 40 && code < 50:
return fmt.Errorf("gemini response %d server error", code)
case code >= 50 && code < 60:
return fmt.Errorf("gemini response %d server permanent error", code)
case code >= 60 && code < 70:
return fmt.Errorf("gemini response %d certificate error", code)
default:
return fmt.Errorf("unexpected/unhandled Gemini response %d", code)
}
}
func ProcessGemini(snapshot *Snapshot) *Snapshot {
// Grab link lines // Grab link lines
linkLines := ExtractLinkLines(snapshot.GemText.String) linkLines := ExtractLinkLines(gemtext)
logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines)) if len(linkLines) == 0 {
return nil
}
var linkURLs LinkList
// Normalize URLs in links, and store them in snapshot // Normalize URLs in links, and store them in snapshot
for _, line := range linkLines { for _, line := range linkLines {
normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String()) normalizedLink, descr, err := NormalizeLink(line, currentURL.String())
if err != nil { if err != nil {
logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err) logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
continue continue
} }
geminiUrl, err := ParseUrl(normalizedLink, descr) geminiUrl, err := ParseURL(normalizedLink, descr)
if err != nil { if err != nil {
logging.LogDebug("Cannot parse URL in link '%s': %v", line, err) logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
continue continue
} }
if snapshot.Links == nil { logging.LogDebug(geminiUrl.String())
snapshot.Links = &LinkList{*geminiUrl} linkURLs = append(linkURLs, *geminiUrl)
} else {
*snapshot.Links = append(*snapshot.Links, *geminiUrl)
}
} }
return snapshot return linkURLs
}
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
u, err := url.Parse(input)
if err != nil {
return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
}
protocol := u.Scheme
hostname := u.Hostname()
strPort := u.Port()
path := u.Path
if strPort == "" {
strPort = "1965"
}
port, err := strconv.Atoi(strPort)
if err != nil {
return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
}
return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
} }
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines // ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
@@ -87,11 +48,11 @@ func ExtractLinkLines(gemtext string) []string {
// NormalizeLink takes a single link line and the current URL, // NormalizeLink takes a single link line and the current URL,
// return the URL converted to an absolute URL // return the URL converted to an absolute URL
// and its description. // and its description.
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) { func NormalizeLink(linkLine string, currentURL string) (string, string, error) {
// Parse the current URL // Parse the current URL
baseURL, err := url.Parse(currentURL) baseURL, err := url.Parse(currentURL)
if err != nil { if err != nil {
return "", "", fmt.Errorf("invalid current URL: %v", err) return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
} }
// Regular expression to extract the URL part from a link line // Regular expression to extract the URL part from a link line
@@ -101,13 +62,13 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
matches := re.FindStringSubmatch(linkLine) matches := re.FindStringSubmatch(linkLine)
if len(matches) == 0 { if len(matches) == 0 {
// If the line doesn't match the expected format, return it unchanged // If the line doesn't match the expected format, return it unchanged
return "", "", fmt.Errorf("not a link line: %v", linkLine) return "", "", fmt.Errorf("%w for link line %s", ErrGeminiLinkLineParse, linkLine)
} }
originalURLStr := matches[1] originalURLStr := matches[1]
_, err = url.QueryUnescape(originalURLStr) _, err = url.QueryUnescape(originalURLStr)
if err != nil { if err != nil {
return "", "", fmt.Errorf("error decoding URL: %w", err) return "", "", fmt.Errorf("%w: %w", ErrURLDecode, err)
} }
restOfLine := "" restOfLine := ""
@@ -119,7 +80,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
parsedURL, err := url.Parse(originalURLStr) parsedURL, err := url.Parse(originalURLStr)
if err != nil { if err != nil {
// If URL parsing fails, return an error // If URL parsing fails, return an error
return "", "", fmt.Errorf("invalid URL '%s': %v", originalURLStr, err) return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
} }
// Resolve relative URLs against the base URL // Resolve relative URLs against the base URL
@@ -151,14 +112,33 @@ func ParseFirstTwoDigits(input string) (int, error) {
// Find the first match in the string // Find the first match in the string
matches := re.FindStringSubmatch(input) matches := re.FindStringSubmatch(input)
if len(matches) == 0 { if len(matches) == 0 {
return 0, errors.New("no digits found at the beginning of the string") return 0, fmt.Errorf("%w", ErrGeminiResponseHeader)
} }
// Parse the captured match as an integer // Parse the captured match as an integer
snapshot, err := strconv.Atoi(matches[1]) snapshot, err := strconv.Atoi(matches[1])
if err != nil { if err != nil {
return 0, fmt.Errorf("failed to convert matched digits to int: %v", err) return 0, fmt.Errorf("%w: %w", ErrTextParse, err)
} }
return snapshot, nil return snapshot, nil
} }
// redirectTargetRe captures the redirect target that follows a status code:
//
//	\d+      - one or more digits
//	\s+      - one or more whitespace characters
//	([^\r]+) - everything up to a \r (or the end of the string)
//
// It is compiled once at package scope instead of on every call.
var redirectTargetRe = regexp.MustCompile(`\d+\s+([^\r]+)`)

// extractRedirectTarget returns the redirection URL by parsing the header
// (or error message) in input. The captured target is resolved against
// currentURL with DeriveAbsoluteURL.
//
// It returns an error wrapping ErrGeminiResponseHeader when no target can
// be found in input or when the target cannot be turned into a URL.
func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
	matches := redirectTargetRe.FindStringSubmatch(input)
	if len(matches) < 2 {
		return nil, fmt.Errorf("%w: Cannot find redirect target from header %s", ErrGeminiResponseHeader, input)
	}
	newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
	if err != nil {
		return nil, fmt.Errorf("%w: Cannot find redirect target from header: %w", ErrGeminiResponseHeader, err)
	}
	return newURL, nil
}

47
gemini/gemini_test.go Normal file
View File

@@ -0,0 +1,47 @@
package gemini
import (
"fmt"
"testing"
)
// TestExtractRedirectTargetFullURL checks that an absolute redirect target
// replaces the current URL and gets the default port.
func TestExtractRedirectTargetFullURL(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://smol.gr", "")
	if err != nil {
		// Fail here instead of dereferencing a nil URL below.
		t.Fatalf("cannot parse current URL: %v", err)
	}
	input := "redirect: 31 gemini://target.gr"
	result, err := extractRedirectTarget(*currentURL, input)
	if err != nil || (result.String() != "gemini://target.gr:1965") {
		t.Errorf("fail: %s", result)
	}
}
// TestExtractRedirectTargetRelativeURL checks that an absolute-path redirect
// target is resolved against the host and port of the current URL.
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://smol.gr", "")
	if err != nil {
		// Fail here instead of dereferencing a nil URL below.
		t.Fatalf("cannot parse current URL: %v", err)
	}
	input := "redirect: 31 /a/b"
	result, err := extractRedirectTarget(*currentURL, input)
	if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
		t.Errorf("fail: %s", result)
	}
}
// TestExtractRedirectTargetRelativeURL2 checks that a "./" redirect target
// on a URL without a path resolves to the root path.
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://nox.im:1965", "")
	if err != nil {
		// Fail here instead of dereferencing a nil URL below.
		t.Fatalf("cannot parse current URL: %v", err)
	}
	input := "redirect: 31 ./"
	result, err := extractRedirectTarget(*currentURL, input)
	if err != nil || (result.String() != "gemini://nox.im:1965/") {
		t.Errorf("fail: %s", result)
	}
}
// TestExtractRedirectTargetWrong checks that the redirect target "fsdsdf"
// is rejected with an error and a nil result.
func TestExtractRedirectTargetWrong(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://smol.gr", "")
	if err != nil {
		// Fail here instead of dereferencing a nil URL below.
		t.Fatalf("cannot parse current URL: %v", err)
	}
	input := "redirect: 31 fsdsdf"
	result, err := extractRedirectTarget(*currentURL, input)
	fmt.Println(err)
	if result != nil || err == nil {
		// %v because err may be nil on this path.
		t.Errorf("fail: result should be nil, err is %v", err)
	}
}

View File

@@ -1,12 +1,16 @@
package gemini package gemini
import ( import (
"encoding/json" "database/sql/driver"
"fmt" "fmt"
"gemini-grc/logging" "gemini-grc/logging"
"net/url"
"path"
"strconv"
"strings"
) )
type GeminiUrl struct { type URL struct {
Protocol string `json:"protocol,omitempty"` Protocol string `json:"protocol,omitempty"`
Hostname string `json:"hostname,omitempty"` Hostname string `json:"hostname,omitempty"`
Port int `json:"port,omitempty"` Port int `json:"port,omitempty"`
@@ -15,43 +19,79 @@ type GeminiUrl struct {
Full string `json:"full,omitempty"` Full string `json:"full,omitempty"`
} }
func (g *GeminiUrl) Scan(value interface{}) error { func (u *URL) Scan(value interface{}) error {
if value == nil { if value == nil {
// Clear the fields in the current GeminiUrl object (not the pointer itself) // Clear the fields in the current GeminiUrl object (not the pointer itself)
*g = GeminiUrl{} *u = URL{}
return nil return nil
} }
b, ok := value.(string) b, ok := value.(string)
if !ok { if !ok {
return fmt.Errorf("failed to scan GeminiUrl: expected string, got %T", value) return fmt.Errorf("failed to scan GeminiUrl: expected string, got %T", value)
} }
parsedUrl, err := ParseUrl(b, "") parsedURL, err := ParseURL(b, "")
if err != nil { if err != nil {
return err return err
} }
*g = *parsedUrl *u = *parsedURL
return nil return nil
} }
func (u GeminiUrl) String() string { func (u URL) String() string {
return u.Full return u.Full
// return fmt.Sprintf("%s://%s:%d%s", u.Protocol, u.Hostname, u.Port, u.Path)
} }
func GeminiUrltoJSON(g GeminiUrl) string { func (u URL) StringNoDefaultPort() string {
// Serialize the Person struct to JSON if u.Port == 1965 {
jsonData, err := json.Marshal(g) return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
if err != nil {
logging.LogError("Error serializing to JSON: %w", err)
} }
return string(jsonData) return u.Full
} }
func GeminiUrlFromJSON(input string) GeminiUrl { func (u URL) Value() (driver.Value, error) {
var geminiUrl GeminiUrl if u.Full == "" {
err := json.Unmarshal([]byte(input), &geminiUrl) return nil, nil
if err != nil {
logging.LogError("Error deserializing from JSON: %w", err)
} }
return geminiUrl return u.Full, nil
}
// ParseURL parses input into a URL, attaching descr as its description.
//
// When input carries no port, the default port 1965 is used. Full is
// rebuilt as "scheme://host:port/path", so it always contains a port.
// Any parsing failure is returned wrapped in ErrURLParse.
//
// NOTE(review): Full is assembled from the scheme, hostname, port and path
// only, so a query string or fragment present in input is not carried over
// — confirm that dropping them is intended.
func ParseURL(input string, descr string) (*URL, error) {
	u, err := url.Parse(input)
	if err != nil {
		return nil, fmt.Errorf("%w: input %s: %w", ErrURLParse, input, err)
	}
	protocol := u.Scheme
	hostname := u.Hostname()
	strPort := u.Port()
	// Named urlPath so that it does not shadow the imported path package.
	urlPath := u.Path
	if strPort == "" {
		strPort = "1965"
	}
	port, err := strconv.Atoi(strPort)
	if err != nil {
		return nil, fmt.Errorf("%w: input %s: %w", ErrURLParse, input, err)
	}
	full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, urlPath)
	return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: urlPath, Descr: descr, Full: full}, nil
}
func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
logging.LogDebug("Calculating redirect URL. Current %s header string %s", currentURL, input)
// If URL is absolute, return just it
if strings.Contains(input, "://") {
return ParseURL(input, "")
}
// input is a path. Clean it and construct
// new path
var newPath string
// Handle weird cases found in the wild
if strings.HasPrefix(input, "/") {
newPath = path.Clean(input)
} else if input == "./" || input == "." {
newPath = path.Join(currentURL.Path, "/")
} else {
newPath = path.Join(currentURL.Path, path.Clean(input))
}
strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath)
return ParseURL(strURL, "")
} }

103
gemini/gemini_url_test.go Normal file
View File

@@ -0,0 +1,103 @@
package gemini
import (
"reflect"
"testing"
)
// TestParseURL checks that a URL without an explicit port is parsed and
// that its database value carries the default Gemini port 1965.
func TestParseURL(t *testing.T) {
	t.Parallel()
	input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
	parsed, err := ParseURL(input, "")
	// Check the error first: parsed is nil on failure and calling Value
	// on it would panic instead of reporting a test failure.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	value, err := parsed.Value()
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	if value != "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162" {
		t.Errorf("fail: %s", parsed)
	}
}
// TestDeriveAbsoluteURL_abs_url_input verifies that an input which is
// already an absolute URL is returned parsed on its own, ignoring the
// current URL.
func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
	t.Parallel()

	base := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	want := &URL{
		Protocol: "gemini",
		Hostname: "a.b",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://a.b:1965/c",
	}

	got, err := DeriveAbsoluteURL(base, "gemini://a.b/c")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestDeriveAbsoluteURL_abs_path_input verifies that an input starting
// with "/" replaces the path of the current URL while protocol, hostname
// and port are kept.
func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
	t.Parallel()

	base := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	want := &URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/c",
	}

	got, err := DeriveAbsoluteURL(base, "/c")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestDeriveAbsoluteURL_rel_path_input verifies that a relative input is
// appended to the path of the current URL.
func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
	t.Parallel()

	base := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	want := &URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b/c/d",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/a/b/c/d",
	}

	got, err := DeriveAbsoluteURL(base, "c/d")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}

View File

@@ -2,25 +2,30 @@ package gemini
import ( import (
"crypto/tls" "crypto/tls"
"errors"
"fmt" "fmt"
"gemini-grc/config" "gemini-grc/logging"
"io" "io"
"net" "net"
go_url "net/url" gourl "net/url"
"regexp" "regexp"
"slices" "slices"
"strconv" "strconv"
"strings"
"time" "time"
"gemini-grc/config"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
) )
type GeminiPageData struct { type PageData struct {
ResponseCode int ResponseCode int
MimeType string ResponseHeader string
Lang string MimeType string
GemText string Lang string
Data []byte GemText string
Data []byte
} }
// Resolve the URL hostname and // Resolve the URL hostname and
@@ -31,7 +36,7 @@ type GeminiPageData struct {
func getHostIPAddresses(hostname string) ([]string, error) { func getHostIPAddresses(hostname string) ([]string, error) {
addrs, err := net.LookupHost(hostname) addrs, err := net.LookupHost(hostname)
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("%w:%w", ErrNetworkDNS, err)
} }
IpPool.Lock.RLock() IpPool.Lock.RLock()
defer func() { defer func() {
@@ -41,51 +46,50 @@ func getHostIPAddresses(hostname string) ([]string, error) {
} }
func ConnectAndGetData(url string) ([]byte, error) { func ConnectAndGetData(url string) ([]byte, error) {
parsedUrl, err := go_url.Parse(url) parsedURL, err := gourl.Parse(url)
if err != nil { if err != nil {
return nil, fmt.Errorf("Could not parse URL, error %w", err) return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
} }
hostname := parsedUrl.Hostname() hostname := parsedURL.Hostname()
port := parsedUrl.Port() port := parsedURL.Port()
if port == "" { if port == "" {
port = "1965" port = "1965"
} }
host := fmt.Sprintf("%s:%s", hostname, port) host := fmt.Sprintf("%s:%s", hostname, port)
// Establish the underlying TCP connection. // Establish the underlying TCP connection.
dialer := &net.Dialer{ dialer := &net.Dialer{
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second, Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
KeepAlive: 10 * time.Second,
} }
conn, err := dialer.Dial("tcp", host) conn, err := dialer.Dial("tcp", host)
if err != nil { if err != nil {
return nil, fmt.Errorf("TCP connection failed: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
} }
// Make sure we always close the connection. // Make sure we always close the connection.
defer func() { defer func() {
// No need to handle error: // No need to handle error:
// Connection will timeout eventually if still open somehow. // Connection will time out eventually if still open somehow.
conn.Close() _ = conn.Close()
}() }()
// Set read and write timeouts on the TCP connection. // Set read and write timeouts on the TCP connection.
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
if err != nil { if err != nil {
return nil, fmt.Errorf("Error setting connection deadline: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
} }
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
if err != nil { if err != nil {
return nil, fmt.Errorf("Error setting connection deadline: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
} }
// Perform the TLS handshake // Perform the TLS handshake
tlsConfig := &tls.Config{ tlsConfig := &tls.Config{
InsecureSkipVerify: true, // Accept all TLS certs, even if insecure. InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
ServerName: parsedUrl.Hostname(), // SNI should not include port ServerName: parsedURL.Hostname(), // SNI should not include port
// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites. // MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
} }
tlsConn := tls.Client(conn, tlsConfig) tlsConn := tls.Client(conn, tlsConfig)
if err := tlsConn.Handshake(); err != nil { if err := tlsConn.Handshake(); err != nil {
return nil, fmt.Errorf("TLS handshake error: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkTLS, err)
} }
// We read `buf`-sized chunks and add data to `data`. // We read `buf`-sized chunks and add data to `data`.
@@ -93,9 +97,13 @@ func ConnectAndGetData(url string) ([]byte, error) {
var data []byte var data []byte
// Send Gemini request to trigger server response. // Send Gemini request to trigger server response.
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url))) // Fix for stupid server bug:
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
// when the port is 1965 and is still specified explicitly in the URL.
_url, _ := ParseURL(url, "")
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
if err != nil { if err != nil {
return nil, fmt.Errorf("Error sending network request: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
} }
// Read response bytes in len(buf) byte chunks // Read response bytes in len(buf) byte chunks
for { for {
@@ -104,68 +112,83 @@ func ConnectAndGetData(url string) ([]byte, error) {
data = append(data, buf[:n]...) data = append(data, buf[:n]...)
} }
if len(data) > config.CONFIG.MaxResponseSize { if len(data) > config.CONFIG.MaxResponseSize {
data = []byte{} return nil, fmt.Errorf("%w: %v", ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
return nil, fmt.Errorf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize)
} }
if err != nil { if err != nil {
if err == io.EOF { if errors.Is(err, io.EOF) {
break break
} else {
return nil, fmt.Errorf("Network error: %s", err)
} }
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
} }
} }
return data, nil return data, nil
} }
// Connect to given URL, using the Gemini protocol. // Visit given URL, using the Gemini protocol.
// Mutate given Snapshot with the data or the error. // Mutates given Snapshot with the data.
func Visit(s *Snapshot) { // In case of error, we store the error string
// inside snapshot and return the error.
func Visit(s *Snapshot) (err error) {
// Don't forget to also store error
// response code (if we have one)
defer func() {
if err != nil {
s.Error = null.StringFrom(err.Error())
if errors.As(err, new(*GeminiError)) {
s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code))
}
}
}()
data, err := ConnectAndGetData(s.URL.String()) data, err := ConnectAndGetData(s.URL.String())
if err != nil { if err != nil {
s.Error = null.StringFrom(err.Error()) return err
return
} }
pageData, err := processData(data) pageData, err := processData(data)
if err != nil { if err != nil {
s.Error = null.StringFrom(err.Error()) return err
return
} }
//marshalled, _ := json.MarshalIndent(pageData, "", " ")
//fmt.Printf("%s\n", marshalled)
s.Header = null.StringFrom(pageData.ResponseHeader)
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode)) s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
s.MimeType = null.StringFrom(pageData.MimeType) s.MimeType = null.StringFrom(pageData.MimeType)
s.Lang = null.StringFrom(pageData.Lang) s.Lang = null.StringFrom(pageData.Lang)
if pageData.GemText != "" { if pageData.GemText != "" {
s.GemText = null.StringFrom(string(pageData.GemText)) s.GemText = null.StringFrom(pageData.GemText)
} }
if pageData.Data != nil { if pageData.Data != nil {
s.Data = null.ValueFrom(pageData.Data) s.Data = null.ValueFrom(pageData.Data)
} }
return nil
} }
// Update given snapshot with the // processData returns results from
// Gemini header data: response code, // parsing Gemini header data:
// mime type and lang (optional) // Code, mime type and lang (optional)
func processData(data []byte) (*GeminiPageData, error) { // Returns error if header was invalid
headers, body, err := getHeadersAndData(data) func processData(data []byte) (*PageData, error) {
header, body, err := getHeadersAndData(data)
if err != nil { if err != nil {
return nil, err return nil, err
} }
code, mimeType, lang := getMimeTypeAndLang(headers) code, mimeType, lang := getMimeTypeAndLang(header)
geminiError := checkGeminiStatusCode(code) logging.LogDebug("Header: %s", strings.TrimSpace(header))
if geminiError != nil { if code != 20 {
return nil, geminiError return nil, NewErrGeminiStatusCode(code, header)
} }
pageData := GeminiPageData{
ResponseCode: code, pageData := PageData{
MimeType: mimeType, ResponseCode: code,
Lang: lang, ResponseHeader: header,
MimeType: mimeType,
Lang: lang,
} }
// If we've got a Gemini document, populate // If we've got a Gemini document, populate
// `GemText` field, otherwise raw data goes to `Data`. // `GemText` field, otherwise raw data goes to `Data`.
if mimeType == "text/gemini" { if mimeType == "text/gemini" {
validBody, err := EnsureValidUTF8(body) validBody, err := BytesToValidUTF8(body)
if err != nil { if err != nil {
return nil, fmt.Errorf("UTF-8 error: %w", err) return nil, fmt.Errorf("%w: %w", ErrUTF8Parse, err)
} }
pageData.GemText = validBody pageData.GemText = validBody
} else { } else {
@@ -178,14 +201,14 @@ func processData(data []byte) (*GeminiPageData, error) {
// basically the first line of the response // basically the first line of the response
// and should contain the response code, // and should contain the response code,
// mimeType and language. // mimeType and language.
func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) { func getHeadersAndData(data []byte) (string, []byte, error) {
firstLineEnds := slices.Index(data, '\n') firstLineEnds := slices.Index(data, '\n')
if firstLineEnds == -1 { if firstLineEnds == -1 {
return "", nil, fmt.Errorf("Could not parse response header") return "", nil, ErrGeminiResponseHeader
} }
firstLine = string(data[:firstLineEnds]) firstLine := string(data[:firstLineEnds])
rest = data[firstLineEnds+1:] rest := data[firstLineEnds+1:]
return string(firstLine), rest, nil return firstLine, rest, nil
} }
// Parses code, mime type and language // Parses code, mime type and language
@@ -194,12 +217,12 @@ func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) {
// `20 text/gemini lang=en` (code, mimetype, lang) // `20 text/gemini lang=en` (code, mimetype, lang)
// `20 text/gemini` (code, mimetype) // `20 text/gemini` (code, mimetype)
// `31 gemini://redirected.to/other/site` (code) // `31 gemini://redirected.to/other/site` (code)
func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string) { func getMimeTypeAndLang(headers string) (int, string, string) {
// Regex that parses code, mimetype & lang // Regex that parses code, mimetype & optional charset/lang parameters
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`) re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`)
matches := re.FindStringSubmatch(headers) matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 { if matches == nil || len(matches) <= 1 {
// Try to get code at least. // Try to get code at least
re := regexp.MustCompile(`^(\d+)\s+`) re := regexp.MustCompile(`^(\d+)\s+`)
matches := re.FindStringSubmatch(headers) matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 { if matches == nil || len(matches) <= 1 {
@@ -215,7 +238,7 @@ func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string)
if err != nil { if err != nil {
return 0, "", "" return 0, "", ""
} }
mimeType = matches[2] mimeType := matches[2]
lang = matches[4] param := matches[3] // This will capture either charset or lang value
return code, mimeType, lang return code, mimeType, param
} }

View File

@@ -6,6 +6,7 @@ import (
// Test for input: `20 text/gemini` // Test for input: `20 text/gemini`
func TestGetMimeTypeAndLang1(t *testing.T) { func TestGetMimeTypeAndLang1(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini") code, mimeType, lang := getMimeTypeAndLang("20 text/gemini")
if code != 20 || mimeType != "text/gemini" || lang != "" { if code != 20 || mimeType != "text/gemini" || lang != "" {
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -13,13 +14,39 @@ func TestGetMimeTypeAndLang1(t *testing.T) {
} }
func TestGetMimeTypeAndLang11(t *testing.T) { func TestGetMimeTypeAndLang11(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n") code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n")
if code != 20 || mimeType != "text/gemini" || lang != "" { if code != 20 || mimeType != "text/gemini" || lang != "" {
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
} }
} }
// Test for input: `20 text/plain; charset=utf-8`
// The charset value is expected in the third return value.
func TestGetMimeTypeAndLang12(t *testing.T) {
	t.Parallel()
	code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8")
	if code != 20 || mimeType != "text/plain" || lang != "utf-8" {
		// The message states the values that are actually asserted above.
		t.Errorf("Expected (20, 'text/plain', 'utf-8'), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}
// Test for input: `20 text/gemini; charset=utf-8`
// The charset value is expected in the third return value.
func TestGetMimeTypeAndLang13(t *testing.T) {
	t.Parallel()
	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8")
	if code != 20 || mimeType != "text/gemini" || lang != "utf-8" {
		// The message states the values that are actually asserted above.
		t.Errorf("Expected (20, 'text/gemini', 'utf-8'), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}
func TestGetTypeAndLang2(t *testing.T) { func TestGetTypeAndLang2(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini charset=en")
if code != 20 || mimeType != "text/gemini" || lang != "en" {
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
}
}
func TestGetTypeAndLang21(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en") code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
if code != 20 || mimeType != "text/gemini" || lang != "en" { if code != 20 || mimeType != "text/gemini" || lang != "en" {
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -27,6 +54,7 @@ func TestGetTypeAndLang2(t *testing.T) {
} }
func TestGetMimeTypeAndLang3(t *testing.T) { func TestGetMimeTypeAndLang3(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page") code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page")
if code != 31 || mimeType != "" || lang != "" { if code != 31 || mimeType != "" || lang != "" {
t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -34,6 +62,7 @@ func TestGetMimeTypeAndLang3(t *testing.T) {
} }
func TestGetMimeTypeAndLang4(t *testing.T) { func TestGetMimeTypeAndLang4(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd") code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd")
if code != 0 || mimeType != "" || lang != "" { if code != 0 || mimeType != "" || lang != "" {
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -41,6 +70,7 @@ func TestGetMimeTypeAndLang4(t *testing.T) {
} }
func TestGetMimeTypeAndLang5(t *testing.T) { func TestGetMimeTypeAndLang5(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("") code, mimeType, lang := getMimeTypeAndLang("")
if code != 0 || mimeType != "" || lang != "" { if code != 0 || mimeType != "" || lang != "" {
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)

View File

@@ -1,10 +1,13 @@
package gemini package gemini
import ( import (
"encoding/json"
"fmt" "fmt"
"gemini-grc/logging" "gemini-grc/config"
"os" "os"
"gemini-grc/logging"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
@@ -33,11 +36,41 @@ func ConnectToDB() *sqlx.DB {
return db return db
} }
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error { func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error {
marshalled, err := json.MarshalIndent(s, "", " ")
if err != nil {
panic(fmt.Sprintf("JSON serialization error for %v", s))
}
if config.CONFIG.DryRun {
logging.LogDebug("Would insert (if new) snapshot %s", marshalled)
return nil
}
query := ` query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO UPDATE SET ON CONFLICT (url) DO NOTHING
`
_, err = tx.NamedExec(query, s)
if err != nil {
return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err)
}
return nil
}
func UpsertSnapshot(id int, tx *sqlx.Tx, s *Snapshot) error {
marshalled, err := json.MarshalIndent(s, "", " ")
if err != nil {
panic(fmt.Sprintf("JSON serialization error for %v", s))
}
if config.CONFIG.DryRun {
logging.LogDebug("[%d] Would upsert snapshot %s", id, marshalled)
return nil
}
query := `
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (url) DO UPDATE SET
url = EXCLUDED.url, url = EXCLUDED.url,
host = EXCLUDED.host, host = EXCLUDED.host,
timestamp = EXCLUDED.timestamp, timestamp = EXCLUDED.timestamp,
@@ -47,24 +80,30 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
links = EXCLUDED.links, links = EXCLUDED.links,
lang = EXCLUDED.lang, lang = EXCLUDED.lang,
response_code = EXCLUDED.response_code, response_code = EXCLUDED.response_code,
error = EXCLUDED.error error = EXCLUDED.error`
` _, err = tx.NamedExec(query, s)
_, err := tx.NamedExec(query, s) //if err != nil {
// logging.LogError("[%s] GeminiError upserting snapshot: %w", s.URL, err)
// panic("This shouldn't happen")
//}
if err != nil { if err != nil {
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err) return fmt.Errorf("[%s] GeminiError upserting snapshot: %w", s.URL, err)
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
} }
return nil return nil
} }
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error { func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
if config.CONFIG.DryRun {
return nil
}
// Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe // Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe
const batchSize = 5000 const batchSize = 5000
query := ` query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO NOTHING ON CONFLICT (url) DO NOTHING
` `
for i := 0; i < len(snapshots); i += batchSize { for i := 0; i < len(snapshots); i += batchSize {
@@ -77,7 +116,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
_, err := tx.NamedExec(query, batch) _, err := tx.NamedExec(query, batch)
if err != nil { if err != nil {
logging.LogError("Error batch inserting snapshots: %w", err) logging.LogError("GeminiError batch inserting snapshots: %w", err)
return fmt.Errorf("DB error: %w", err) return fmt.Errorf("DB error: %w", err)
} }
} }
@@ -86,14 +125,17 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
} }
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error { func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
if config.CONFIG.DryRun {
return nil
}
query := ` query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO NOTHING ON CONFLICT (url) DO NOTHING
` `
_, err := tx.NamedExec(query, snapshots) _, err := tx.NamedExec(query, snapshots)
if err != nil { if err != nil {
logging.LogError("Error batch inserting snapshots: %w", err) logging.LogError("GeminiError batch inserting snapshots: %w", err)
return fmt.Errorf("DB error: %w", err) return fmt.Errorf("DB error: %w", err)
} }
return nil return nil

View File

@@ -2,33 +2,58 @@ package gemini
import ( import (
"bytes" "bytes"
"errors"
"fmt" "fmt"
"io" "io"
"unicode/utf8" "unicode/utf8"
"golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/transform" "golang.org/x/text/transform"
) )
var (
ErrInputTooLarge = errors.New("input too large")
ErrUTF8Conversion = errors.New("UTF-8 conversion error")
)
func BytesToValidUTF8(input []byte) (string, error) { func BytesToValidUTF8(input []byte) (string, error) {
if len(input) == 0 {
return "", nil
}
const maxSize = 10 * 1024 * 1024 // 10MB
if len(input) > maxSize {
return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize)
}
// Remove NULL byte 0x00 (ReplaceAll accepts slices) // Remove NULL byte 0x00 (ReplaceAll accepts slices)
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{}) inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
isValidUTF8 := utf8.Valid(inputNoNull) if utf8.Valid(inputNoNull) {
if isValidUTF8 {
return string(inputNoNull), nil return string(inputNoNull), nil
} }
encodings := []transform.Transformer{ encodings := []transform.Transformer{
charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1 charmap.ISO8859_1.NewDecoder(),
charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc charmap.ISO8859_7.NewDecoder(),
// TODO: Try more encodings? charmap.Windows1250.NewDecoder(), // Central European
charmap.Windows1251.NewDecoder(), // Cyrillic
charmap.Windows1252.NewDecoder(),
charmap.Windows1256.NewDecoder(), // Arabic
japanese.EUCJP.NewDecoder(), // Japanese
korean.EUCKR.NewDecoder(), // Korean
} }
// First successful conversion wins. // First successful conversion wins.
var lastErr error
for _, encoding := range encodings { for _, encoding := range encodings {
reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding) reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
result, err := io.ReadAll(reader) result, err := io.ReadAll(reader)
if err == nil { if err != nil {
lastErr = err
continue
}
if utf8.Valid(result) {
return string(result), nil return string(result), nil
} }
} }
return "", fmt.Errorf("UTF-8 error: %w", err)
return "", fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr)
} }

View File

@@ -4,6 +4,7 @@ import "testing"
// Make sure NULL bytes are removed // Make sure NULL bytes are removed
func TestEnsureValidUTF8(t *testing.T) { func TestEnsureValidUTF8(t *testing.T) {
t.Parallel()
// Create a string with a null byte // Create a string with a null byte
strWithNull := "Hello" + string('\x00') + "world" strWithNull := "Hello" + string('\x00') + "world"
result, _ := BytesToValidUTF8([]byte(strWithNull)) result, _ := BytesToValidUTF8([]byte(strWithNull))

View File

@@ -2,16 +2,18 @@ package gemini
import ( import (
"fmt" "fmt"
"gemini-grc/logging"
"strings" "strings"
"sync" "sync"
"gemini-grc/logging"
) )
// key: "host:port" (string) // RobotsCache is a map of blocked URLs
// value: // key: "host:port" string, lowercased
// empty []string if no robots data, or // value: []string list of disallowed URLs
// list of URL prefixes ([]string) in robots // If a key has no blocked URLs, an empty
var RobotsCache sync.Map // list is stored for caching.
var RobotsCache sync.Map //nolint:gochecknoglobals
func populateBlacklist(key string) (entries []string) { func populateBlacklist(key string) (entries []string) {
// We either store an empty list when // We either store an empty list when
@@ -40,43 +42,40 @@ func populateBlacklist(key string) (entries []string) {
// According to spec, the first is correct, // According to spec, the first is correct,
// however let's be lenient // however let's be lenient
var data string var data string
if robotsData.MimeType == "text/plain" { switch {
case robotsData.MimeType == "text/plain":
data = string(robotsData.Data) data = string(robotsData.Data)
} else if robotsData.MimeType == "text/gemini" { case robotsData.MimeType == "text/gemini":
data = robotsData.GemText data = robotsData.GemText
} else { default:
return []string{} return []string{}
} }
entries = ParseRobotsTxt(string(data), key) entries = ParseRobotsTxt(data, key)
return entries return entries
} }
// Check if the snapshot URL matches // RobotMatch checks if the snapshot URL matches
// a robots.txt allow rule. // a robots.txt allow rule.
func RobotMatch(s *Snapshot) bool { func RobotMatch(url URL) bool {
logging.LogDebug("Checking robots.txt cache for %s", s.URL.String()) key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port) logging.LogDebug("Checking robots.txt cache for %s", key)
v, ok := RobotsCache.Load(key) var disallowedURLs []string
cacheEntries, ok := RobotsCache.Load(key)
if !ok { if !ok {
// First time check, populate robot cache // First time check, populate robot cache
logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String()) disallowedURLs = populateBlacklist(key)
disallowedURLs := populateBlacklist(key) logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
for _, url := range disallowedURLs {
if strings.HasPrefix(s.URL.String(), url) {
logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
return true
}
}
} else { } else {
if len(v.([]string)) == 0 { disallowedURLs, _ = cacheEntries.([]string)
logging.LogDebug("No robots.txt or no rules, allowed") }
return false return isURLblocked(disallowedURLs, url.Full)
} }
for _, url := range v.([]string) {
if strings.HasPrefix(s.URL.String(), url) { func isURLblocked(disallowedURLs []string, input string) bool {
logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url) for _, url := range disallowedURLs {
return true if strings.HasPrefix(strings.ToLower(input), url) {
} logging.LogDebug("robots.txt match: %s matches %s", input, url)
return true
} }
} }
return false return false

View File

@@ -5,7 +5,7 @@ import (
"strings" "strings"
) )
// Takes robots.txt content and a host, and // ParseRobotsTxt takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't // returns a list of full URLs that shouldn't
// be visited. // be visited.
// TODO Also take into account the user agent? // TODO Also take into account the user agent?

View File

@@ -6,6 +6,7 @@ import (
) )
func TestParseRobotsTxt(t *testing.T) { func TestParseRobotsTxt(t *testing.T) {
t.Parallel()
input := `User-agent: * input := `User-agent: *
Disallow: /cgi-bin/wp.cgi/view Disallow: /cgi-bin/wp.cgi/view
Disallow: /cgi-bin/wp.cgi/media Disallow: /cgi-bin/wp.cgi/media
@@ -26,6 +27,7 @@ Disallow: /admin/`
} }
func TestParseRobotsTxtEmpty(t *testing.T) { func TestParseRobotsTxtEmpty(t *testing.T) {
t.Parallel()
input := `` input := ``
result := ParseRobotsTxt(input, "example.com") result := ParseRobotsTxt(input, "example.com")
@@ -34,3 +36,20 @@ func TestParseRobotsTxtEmpty(t *testing.T) {
t.Errorf("ParseRobotsTxt() = %v, want empty []string", result) t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
} }
} }
// TestIsURLblocked checks that a URL under a disallowed prefix is reported
// as blocked and that a URL on a different host is not.
func TestIsURLblocked(t *testing.T) {
	t.Parallel()

	rules := []string{
		"gemini://example.com/cgi-bin/wp.cgi/view",
		"gemini://example.com/cgi-bin/wp.cgi/media",
		"gemini://example.com/admin/",
	}

	// Falls under the "/admin/" rule.
	blocked := "gemini://example.com/admin/index.html"
	if ok := isURLblocked(rules, blocked); !ok {
		t.Errorf("Expected %s to be blocked", blocked)
	}

	// Same path, but the host differs from every rule.
	allowed := "gemini://example1.com/admin/index.html"
	if ok := isURLblocked(rules, allowed); ok {
		t.Errorf("expected %s to not be blocked", allowed)
	}
}

View File

@@ -4,15 +4,13 @@ import (
"database/sql/driver" "database/sql/driver"
"encoding/json" "encoding/json"
"fmt" "fmt"
"gemini-grc/logging"
"strings"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
) )
type LinkList []GeminiUrl type LinkList []URL
func (l LinkList) Value() (driver.Value, error) { func (l *LinkList) Value() (driver.Value, error) {
return json.Marshal(l) return json.Marshal(l)
} }
@@ -29,46 +27,17 @@ func (l *LinkList) Scan(value interface{}) error {
} }
type Snapshot struct { type Snapshot struct {
ID int `db:"id" json:"id,omitempty"` ID int `db:"id" json:"id,omitempty"`
UID string `db:"uid" json:"uid,omitempty"` //UID string `db:"uid" json:"uid,omitempty"`
URL GeminiUrl `db:"url" json:"url,omitempty"` URL URL `db:"url" json:"url,omitempty"`
Host string `db:"host" json:"host,omitempty"` Host string `db:"host" json:"host,omitempty"`
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"` Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"` MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files. Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files. GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
Links *LinkList `db:"links" json:"links,omitempty"` Header null.String `db:"header" json:"header,omitempty"` // Response header.
Lang null.String `db:"lang" json:"lang,omitempty"` Links null.Value[LinkList] `db:"links" json:"links,omitempty"`
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code. Lang null.String `db:"lang" json:"lang,omitempty"`
Error null.String `db:"error" json:"error,omitempty"` // On network errors only ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
} Error null.String `db:"error" json:"error,omitempty"` // On network errors only
func SnapshotToJSON(g Snapshot) string {
// Serialize the Person struct to JSON
jsonData, err := json.MarshalIndent(g, "", "\t")
if err != nil {
logging.LogError("Error serializing to JSON: %w", err)
}
return string(jsonData)
}
func SnapshotFromJSON(input string) Snapshot {
var snapshot Snapshot
err := json.Unmarshal([]byte(input), &snapshot)
if err != nil {
logging.LogError("Error deserializing from JSON: %w", err)
}
return snapshot
}
func ShouldPersistSnapshot(result *Snapshot) bool {
if !result.MimeType.Valid {
return false
}
if result.MimeType.String == "text/gemini" ||
strings.HasPrefix(result.MimeType.String, "image/") ||
strings.HasPrefix(result.MimeType.String, "text/") {
return true
}
return false
} }

View File

@@ -1,36 +1,38 @@
package gemini package gemini
import ( import (
"errors"
"fmt" "fmt"
"gemini-grc/config"
"gemini-grc/logging"
"gemini-grc/uid"
"gemini-grc/util"
"strings" "strings"
"time" "time"
"gemini-grc/config"
"gemini-grc/logging"
"gemini-grc/util"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) { func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
logging.LogInfo("Spawning %d workers", numOfWorkers) logging.LogInfo("Spawning %d workers", numOfWorkers)
for i := 0; i < numOfWorkers; i++ { for i := range numOfWorkers {
go func(i int) { go func(i int) {
for { for {
runWorker(i, db) RunWorker(i, db, nil)
} }
}(i) }(i)
} }
} }
func runWorker(id int, db *sqlx.DB) { func RunWorker(id int, db *sqlx.DB, url *string) {
// Start the DB transaction // Each worker runs within a DB transaction.
tx, err := db.Beginx() tx, err := db.Beginx()
if err != nil { if err != nil {
logging.LogError("Failed to begin transaction: %w", err) logging.LogError("Failed to begin transaction: %w", err)
} }
// Commit/rollback at the end
defer func() { defer func() {
err = tx.Commit() err = tx.Commit()
if err != nil { if err != nil {
@@ -42,66 +44,97 @@ func runWorker(id int, db *sqlx.DB) {
} }
}() }()
snapshots, err := GetRandomSnapshotsDistinctHosts(tx) var snapshots []Snapshot
if err != nil { // If not given a specific URL,
logging.LogError("[%d] Error retrieving snapshot: %w", id, err) // get some random ones to visit from DB.
time.Sleep(10 * time.Second) if url == nil {
return snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
} else if len(snapshots) == 0 { if err != nil {
logging.LogInfo("[%d] No remaining snapshots to visit.", id) logging.LogError("[%d] GeminiError retrieving snapshot: %w", id, err)
time.Sleep(1 * time.Minute) panic("This should never happen")
return } else if len(snapshots) == 0 {
logging.LogInfo("[%d] No snapshots to visit.", id)
time.Sleep(1 * time.Minute)
return
}
} else {
snapshotURL, err := ParseURL(*url, "")
if err != nil {
logging.LogError("Invalid URL given: " + *url)
return
}
snapshots = []Snapshot{{
//UID: uid.UID(),
URL: *snapshotURL,
Host: snapshotURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}}
} }
// Start visiting URLs.
total := len(snapshots) total := len(snapshots)
for i, s := range snapshots { for i, s := range snapshots {
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL) logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
// We differentiate between errors:
// Unexpected errors are the ones returned from the following function.
// If an error is unexpected (which should never happen) we panic.
// Expected errors are stored as strings within the snapshot,
// so that they can also be stored in DB.
err = workOnSnapshot(id, tx, &s) err = workOnSnapshot(id, tx, &s)
if err != nil { if err != nil {
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL, err) logging.LogError("[%d] [%s] Unexpected GeminiError %w", id, s.URL.String(), err)
util.PrintStackAndPanic(err) util.PrintStackAndPanic(err)
} }
if s.Error.Valid { if s.Error.Valid {
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL, s.Error.String) logging.LogWarn("[%d] Error: %v", id, s.Error.String)
} }
logging.LogDebug("[%d] Done %d/%d.", id, i, total) logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
} }
logging.LogInfo("[%d] Worker done.", id) logging.LogInfo("[%d] Worker done.", id)
} }
// workOnSnapshot visits a URL and stores the result.
// errors should be returned only if they are unexpected.
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
if IsBlacklisted(s.URL) {
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
return nil
}
// If URL matches a robots.txt disallow line, // If URL matches a robots.txt disallow line,
// add it as an error so next time it won't be // add it as an error so next time it won't be
// crawled. // crawled.
if RobotMatch(s) { if RobotMatch(s.URL) {
s.Error = null.StringFrom("robots.txt disallow match") s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
err = SaveSnapshotToDB(tx, s) err = UpsertSnapshot(id, tx, s)
if err != nil { if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err) return fmt.Errorf("[%d] %w", id, err)
} }
return nil return nil
} }
// Resolve IP address via DNS
IPs, err := getHostIPAddresses(s.Host) IPs, err := getHostIPAddresses(s.Host)
if err != nil { if err != nil {
s.Error = null.StringFrom("DNS Resolve error") s.Error = null.StringFrom(err.Error())
err = SaveSnapshotToDB(tx, s) err = UpsertSnapshot(id, tx, s)
if err != nil { if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err) return fmt.Errorf("[%d] %w", id, err)
} }
return nil return nil
} }
// If the host's ip is in the connections pool, // If the host's ip is in the connections pool we stop
// stop and add the url in the queue later.
IpPool.Lock.RLock() IpPool.Lock.RLock()
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL) logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
for _, ip := range IPs { for _, ip := range IPs {
_, ok := IpPool.IPs[ip] _, ok := IpPool.IPs[ip]
if ok { if ok {
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL) logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
IpPool.Lock.RUnlock() IpPool.Lock.RUnlock()
time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain time.Sleep(1 * time.Second) // Avoid flood-retrying
return nil return nil
} }
} }
@@ -109,73 +142,115 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
AddIPsToPool(IPs) AddIPsToPool(IPs)
url := s.URL.String() // After finishing, remove the host IPs from
logging.LogDebug("[%d] Dialing %s", id, url) // the connections pool, with a small delay
Visit(s) // to avoid potentially hitting the same IP quickly.
logging.LogDebug("[%d] Finished dialing.", id) defer func() {
go func() {
time.Sleep(5 * time.Second) time.Sleep(5 * time.Second)
RemoveIPsFromPool(IPs) RemoveIPsFromPool(IPs)
}() }()
if s.MimeType.Valid && s.MimeType.String == "text/gemini" { url := s.URL.String()
logging.LogDebug("[%d] [%s] Processing", id, url) logging.LogDebug("[%d] Dialing %s", id, url)
s = ProcessGemini(s)
} err = Visit(s)
logging.LogDebug("[%d] Saving", id)
err = SaveSnapshotToDB(tx, s)
if err != nil { if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err) if !IsKnownError(err) {
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
return err
}
// Check if error is redirection, and handle it
s.Error = null.StringFrom(err.Error())
if errors.As(err, new(*GeminiError)) &&
err.(*GeminiError).Msg == "redirect" {
err = handleRedirection(id, tx, s)
if err != nil {
return err
}
}
}
logging.LogInfo("[%d] Done, response code %d.", id, s.ResponseCode.ValueOrZero())
// If this is a gemini page, parse possible links inside
if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
links := GetPageLinks(s.URL, s.GemText.String)
logging.LogDebug("[%d] Found %d links", id, len(links))
if len(links) > 0 {
s.Links = null.ValueFrom(links)
}
} else {
logging.LogDebug("[%d] Not looking for page links", id)
} }
// Store links in batch err = UpsertSnapshot(id, tx, s)
if s.Links != nil { if err != nil {
var batchSnapshots []*Snapshot return err
timestamp := null.TimeFrom(time.Now()) }
for _, link := range *s.Links { err = storeLinks(tx, s)
if shouldPersistURL(tx, link) { if err != nil {
return err
}
return nil
}
func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
if s.Links.Valid {
var batchSnapshots []*Snapshot
for _, link := range s.Links.ValueOrZero() {
if shouldPersistURL(link) {
newSnapshot := &Snapshot{ newSnapshot := &Snapshot{
UID: uid.UID(), //UID: uid.UID(),
URL: link, URL: link,
Host: link.Hostname, Host: link.Hostname,
Timestamp: timestamp, Timestamp: null.TimeFrom(time.Now()),
} }
batchSnapshots = append(batchSnapshots, newSnapshot) batchSnapshots = append(batchSnapshots, newSnapshot)
} }
} }
if len(batchSnapshots) > 0 { if len(batchSnapshots) > 0 {
logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots)) err := SaveLinksToDBinBatches(tx, batchSnapshots)
err = SaveLinksToDBinBatches(tx, batchSnapshots)
if err != nil { if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err) return err
} }
} }
} }
return nil return nil
} }
// Should we save the given URL for crawling? // shouldPersistURL returns true if we
func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool { // should save the URL in the DB.
if !strings.HasPrefix(u.String(), "gemini://") { // Only gemini:// urls are saved.
return false func shouldPersistURL(u URL) bool {
} return strings.HasPrefix(u.String(), "gemini://")
query := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)` }
var exists bool
err := tx.Get(&exists, query, u.String()) func handleRedirection(id int, tx *sqlx.Tx, s *Snapshot) error {
newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
if err != nil { if err != nil {
fmt.Println("Error executing query:", err) return err
return false
} }
return !exists logging.LogDebug("[%d] Page redirects to %s", id, newURL)
// Insert fresh snapshot with new URL
snapshot := &Snapshot{
//UID: uid.UID(),
URL: *newURL,
Host: newURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}
logging.LogDebug("[%d] Saving empty snapshot for %s", id, snapshot.URL.String())
err = SaveSnapshotIfNew(tx, snapshot)
if err != nil {
return err
}
return nil
} }
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) { func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
// Old, unoptimized query // Old, unoptimized query
//
// query := ` // query := `
// SELECT DISTINCT ON (host) * // SELECT DISTINCT ON (host) *
// FROM snapshots // FROM snapshots
// WHERE response_code IS NULL // WHERE response_code IS NULL
@@ -184,20 +259,28 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
// LIMIT $1 // LIMIT $1
// ` // `
query := ` query := `
WITH RankedSnapshots AS ( SELECT *
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext, FROM snapshots
links, lang, response_code, error, WHERE response_code IS NULL
ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn AND error IS NULL
FROM snapshots ORDER BY RANDOM()
WHERE response_code IS NULL LIMIT $1
AND error IS NULL `
) //query := `
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext, // WITH RankedSnapshots AS (
links, lang, response_code, error // SELECT id, url, host, timestamp, mimetype, data, gemtext,
FROM RankedSnapshots // links, lang, response_code, error,
WHERE rn = 1 // ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
LIMIT $1 // FROM snapshots
` // WHERE response_code IS NULL
// AND error IS NULL
// )
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
// links, lang, response_code, error
// FROM RankedSnapshots
// WHERE rn = 1
// LIMIT $1
//`
var snapshots []Snapshot var snapshots []Snapshot
err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize) err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
if err != nil { if err != nil {
@@ -205,3 +288,18 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
} }
return snapshots, nil return snapshots, nil
} }
// GetSnapshotFromURL selects the snapshot row whose url column equals url,
// running the query inside tx.
//
// The query is limited to one row, so the returned slice holds at most a
// single snapshot. If the query fails, the error is returned as-is together
// with a nil slice.
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) {
	const query = `
SELECT *
FROM snapshots
WHERE url=$1
LIMIT 1
`
	var found []Snapshot
	if err := tx.Select(&found, query, url); err != nil {
		return nil, err
	}
	return found, nil
}

6
go.mod
View File

@@ -8,16 +8,22 @@ require (
github.com/jmoiron/sqlx v1.4.0 github.com/jmoiron/sqlx v1.4.0
github.com/matoous/go-nanoid/v2 v2.1.0 github.com/matoous/go-nanoid/v2 v2.1.0
github.com/rs/zerolog v1.33.0 github.com/rs/zerolog v1.33.0
github.com/stretchr/testify v1.9.0
golang.org/x/text v0.19.0 golang.org/x/text v0.19.0
) )
require ( require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-isatty v0.0.20 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.13.1 // indirect
golang.org/x/crypto v0.27.0 // indirect golang.org/x/crypto v0.27.0 // indirect
golang.org/x/sync v0.8.0 // indirect golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.25.0 // indirect golang.org/x/sys v0.25.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
) )

9
go.sum
View File

@@ -1,6 +1,7 @@
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -19,6 +20,10 @@ github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE= github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE=
@@ -34,6 +39,8 @@ github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxU
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
@@ -54,6 +61,8 @@ golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

52
http/http.go Normal file
View File

@@ -0,0 +1,52 @@
package http
import (
	"errors"
	"fmt"
	"net/http"
	"time"

	"gemini-grc/logging"
	_ "gemini-grc/logging"
)
// CreateServer builds an HTTP server bound to listenAddr, registers the
// "GET /ping" handler on it and starts serving in a background goroutine.
//
// The returned server is already running; the caller is responsible for
// stopping it with Close or Shutdown. ListenAndServe always returns a non-nil
// error and reports http.ErrServerClosed after either of those calls, so that
// error is treated as a normal exit. Any other error (for example the
// address being unavailable) still panics.
func CreateServer(listenAddr string) *http.Server {
	mux := http.NewServeMux()
	mux.HandleFunc("GET /ping", wrapForError(getPing))

	server := &http.Server{
		Addr:              listenAddr,
		Handler:           mux,
		ReadHeaderTimeout: 10 * time.Second,
	}

	go func() {
		// Start the server. Blocking call.
		logging.LogInfo("HTTP server listening on %s", listenAddr)
		err := server.ListenAndServe()
		// A deliberate Close/Shutdown is not a failure; only panic on
		// genuine serve errors.
		if err != nil && !errors.Is(err, http.ErrServerClosed) {
			panic(fmt.Sprintf("Server failed to start: %s", err))
		}
	}()

	return server
}
// wrapForError adapts a handler that reports failure through an error return
// value into a plain http.HandlerFunc.
//
// When f returns nil nothing further happens. When f returns an error, the
// error is logged as a warning and http.Error is called with status 500 and
// the standard status text for that code.
func wrapForError(f func(http.ResponseWriter, *http.Request) error) http.HandlerFunc {
	const failureStatus = http.StatusInternalServerError

	return func(w http.ResponseWriter, r *http.Request) {
		handlerErr := f(w, r)
		if handlerErr == nil {
			return
		}
		logging.LogWarn("Error while handling request: %d %s", failureStatus, handlerErr)
		http.Error(w, http.StatusText(failureStatus), failureStatus)
	}
}
// getPing writes a "Pong" line that echoes the request method, the full
// request URL and the URL path, separated by single spaces.
//
// It returns a wrapped error if writing to the response fails, and nil
// otherwise.
func getPing(w http.ResponseWriter, r *http.Request) error {
	// Fprintf formats the whole message first and hands it to w in one Write.
	if _, err := fmt.Fprintf(w, "Pong %s %s %s", r.Method, r.URL.String(), r.URL.Path); err != nil {
		return fmt.Errorf("failed to write response: %w", err)
	}
	return nil
}

33
main.go
View File

@@ -3,15 +3,14 @@ package main
import ( import (
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/gemini" "gemini-grc/gemini"
"gemini-grc/http"
"gemini-grc/logging" "gemini-grc/logging"
"github.com/jmoiron/sqlx"
"github.com/rs/zerolog"
zlog "github.com/rs/zerolog/log"
"os" "os"
"os/signal" "os/signal"
"syscall" "syscall"
"github.com/jmoiron/sqlx"
"github.com/rs/zerolog"
zlog "github.com/rs/zerolog/log"
) )
func main() { func main() {
@@ -27,9 +26,10 @@ func main() {
func runApp() error { func runApp() error {
logging.LogInfo("Starting up. Press Ctrl+C to exit") logging.LogInfo("Starting up. Press Ctrl+C to exit")
sigs := make(chan os.Signal, 1) signals := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
server := http.CreateServer("localhost:8899")
db := gemini.ConnectToDB() db := gemini.ConnectToDB()
// !!! DANGER !!! // !!! DANGER !!!
@@ -44,13 +44,20 @@ func runApp() error {
} }
}(db) }(db)
// if len(os.Args) > 1 { gemini.LoadBlacklist()
// url := os.Args[1] // If there's an argument, assume it's a URL
// } // to visit and ignore database state.
// os.Exit(1) if len(os.Args) > 1 {
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db) url := os.Args[1]
go gemini.RunWorker(0, db, &url)
} else {
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
}
<-sigs <-signals
if err := server.Close(); err != nil {
logging.LogError("GeminiError during server shutdown: %s", err)
}
logging.LogInfo("Received SIGINT or SIGTERM signal, exiting") logging.LogInfo("Received SIGINT or SIGTERM signal, exiting")
return nil return nil
} }

View File

@@ -1,14 +1,14 @@
package uid package uid
import ( import (
nanoid "github.com/jaevor/go-nanoid" nanoid "github.com/matoous/go-nanoid/v2"
) )
func UID() string { func UID() string {
// Missing o,O and l // No 'o','O' and 'l'
uid, err := nanoid.CustomASCII("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20) id, err := nanoid.Generate("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20)
if err != nil { if err != nil {
panic(err) panic(err)
} }
return uid() return id
} }