Compare commits: ef3f009709...main (18 commits)

Commits:
- 43f2242558
- aa4aecdc14
- 6cf507bdc9
- 7a36614232
- b52d4f6532
- 825c7e3391
- f0452ff9f7
- bb49ea8565
- a19f157c80
- 8b341d2ac6
- 2bb8589eb7
- 6346c9a829
- c7b0778b77
- bea0d22c26
- 6bcc7081b2
- f34ac651b7
- d5da9ac62d
- a0563074ed
.gitignore (vendored): 2 additions

@@ -1,6 +1,8 @@
 .idea
+.goroot
 **/.#*
 **/*~
+/.go
 /cmd
 /db/initdb.sql
 /db/*sh
Makefile (new file): 28 additions

@@ -0,0 +1,28 @@
+SHELL := /usr/local/bin/oksh
+export PATH := $(PATH)
+
+.PHONY: all fmt lint
+
+all: fmt lint test
+
+.PHONY: debug
+debug:
+	@echo "PATH: $(PATH)"
+	@echo "GOPATH: $(shell go env GOPATH)"
+	@which go
+	@which gofumpt
+	@which gci
+	@which golangci-lint
+
+# Test
+test:
+	go test -v ./...
+
+# Format code
+fmt:
+	gofumpt -l -w .
+	gci write .
+
+# Run linter
+lint: fmt
+	golangci-lint run
README.md: 26 changes

@@ -2,6 +2,8 @@
 
 A Gemini crawler.
 
+URLs to visit as well as data from visited URLs are stored into "snapshots" in the database.
+
 ## Done
 - [x] Concurrent downloading with workers
 - [x] Concurrent connection limit per host
@@ -10,22 +12,16 @@ A Gemini crawler.
 - [x] Configuration via environment variables
 - [x] Storing snapshots in PostgreSQL
 - [x] Proper response header & body UTF-8 and format validation
-- [x] Follow robots.txt
+- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
+- [x] Handle redirects (3X status codes)
 
 ## TODO
-- [ ] Take into account gemini://geminiprotocol.net/docs/companion/robots.gmi
-- [ ] Proper handling of all response codes
-- [ ] Handle 3X redirects properly
-- [ ] Handle URLs that need presentation of a TLS cert, like astrobotany
-+ [ ] Probably have a common "grc" cert for all?
-- [ ] Proper input and response validations:
-+ [ ] When making a request, the URI MUST NOT exceed 1024 bytes
-- [ ] Subscriptions to gemini pages? gemini://geminiprotocol.net/docs/companion/
+- [ ] Better URL normalization
+- [ ] Provide a TLS cert for sites that require it, like Astrobotany
 
 ## TODO for later
-- [ ] Add other protocols
-+ [ ] Gopher
-+ [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi
-+ [ ] Spartan
-+ [ ] Nex
-+ [ ] SuperTXT https://supertxt.net/00-intro.html
+- [ ] Gopher
+- [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi
+- [ ] Spartan
+- [ ] Nex
+- [ ] SuperTXT https://supertxt.net/00-intro.html
blacklist.txt (new file): 2 additions

@@ -0,0 +1,2 @@
+gemi.dev
+mastogem.picasoft.net
config/config.go: 212 changes

@@ -8,83 +8,149 @@ import (
 	"github.com/rs/zerolog"
 )
 
+// Environment variable names.
+const (
+	EnvLogLevel               = "LOG_LEVEL"
+	EnvNumWorkers             = "NUM_OF_WORKERS"
+	EnvWorkerBatchSize        = "WORKER_BATCH_SIZE"
+	EnvMaxResponseSize        = "MAX_RESPONSE_SIZE"
+	EnvResponseTimeout        = "RESPONSE_TIMEOUT"
+	EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR"
+	EnvBlacklistPath          = "BLACKLIST_PATH"
+	EnvDryRun                 = "DRY_RUN"
+)
+
+// Config holds the application configuration loaded from environment variables.
 type Config struct {
-	LogLevel        zerolog.Level
-	rootPath        string
-	MaxResponseSize int
-	NumOfWorkers    int
-	ResponseTimeout int
-	WorkerBatchSize int
+	LogLevel               zerolog.Level // Logging level (debug, info, warn, error)
+	MaxResponseSize        int           // Maximum size of response in bytes
+	NumOfWorkers           int           // Number of concurrent workers
+	ResponseTimeout        int           // Timeout for responses in seconds
+	WorkerBatchSize        int           // Batch size for worker processing
+	PanicOnUnexpectedError bool          // Panic on unexpected errors when visiting a URL
+	BlacklistPath          string        // File that has blacklisted strings of "host:port"
+	DryRun                 bool          // If false, don't write to disk
 }
 
-var CONFIG Config
+var CONFIG Config //nolint:gochecknoglobals
 
-func GetConfig() *Config {
-	var config Config
-	for _, envVar := range []string{
-		"LOG_LEVEL",
-		"ROOT_PATH",
-		"NUM_OF_WORKERS",
-		"WORKER_BATCH_SIZE",
-		"MAX_RESPONSE_SIZE",
-		"RESPONSE_TIMEOUT",
-	} {
-		if env, ok := os.LookupEnv(envVar); !ok {
-			fmt.Fprintf(os.Stderr, "Missing env var %s\n", envVar)
-			os.Exit(1)
-		} else {
-			switch envVar {
-			case "LOG_LEVEL":
-				{
-					logLevel, err := zerolog.ParseLevel(env)
-					if err != nil {
-						fmt.Fprintf(os.Stderr, "Invalid LOG_LEVEL value\n")
-						os.Exit(1)
-					}
-					config.LogLevel = logLevel
-				}
-			case "ROOT_PATH":
-				{
-					config.rootPath = env
-				}
-			case "NUM_OF_WORKERS":
-				{
-					if numOfWorkers, err := strconv.Atoi(env); err != nil {
-						fmt.Fprintf(os.Stderr, "Invalid NUM_OF_WORKERS value\n")
-						os.Exit(1)
-					} else {
-						config.NumOfWorkers = numOfWorkers
-					}
-				}
-			case "WORKER_BATCH_SIZE":
-				{
-					if workerBatchSize, err := strconv.Atoi(env); err != nil {
-						fmt.Fprintf(os.Stderr, "Invalid WORKER_BATCH_SIZE value\n")
-						os.Exit(1)
-					} else {
-						config.WorkerBatchSize = workerBatchSize
-					}
-				}
-			case "MAX_RESPONSE_SIZE":
-				{
-					if maxResponseSize, err := strconv.Atoi(env); err != nil {
-						fmt.Fprintf(os.Stderr, "Invalid MAX_RESPONSE_SIZE value\n")
-						os.Exit(1)
-					} else {
-						config.MaxResponseSize = maxResponseSize
-					}
-				}
-			case "RESPONSE_TIMEOUT":
-				{
-					if val, err := strconv.Atoi(env); err != nil {
-						fmt.Fprintf(os.Stderr, "Invalid RESPONSE_TIMEOUT value\n")
-						os.Exit(1)
-					} else {
-						config.ResponseTimeout = val
-					}
-				}
-			}
-		}
-	}
-	return &config
+// parsePositiveInt parses and validates positive integer values.
+func parsePositiveInt(param, value string) (int, error) {
+	val, err := strconv.Atoi(value)
+	if err != nil {
+		return 0, ValidationError{
+			Param:  param,
+			Value:  value,
+			Reason: "must be a valid integer",
+		}
+	}
+	if val <= 0 {
+		return 0, ValidationError{
+			Param:  param,
+			Value:  value,
+			Reason: "must be positive",
+		}
+	}
+	return val, nil
+}
+
+func parseBool(param, value string) (bool, error) {
+	val, err := strconv.ParseBool(value)
+	if err != nil {
+		return false, ValidationError{
+			Param:  param,
+			Value:  value,
+			Reason: "cannot be converted to boolean",
+		}
+	}
+	return val, nil
+}
+
+// GetConfig loads and validates configuration from environment variables
+func GetConfig() *Config {
+	config := &Config{}
+
+	// Map of environment variables to their parsing functions
+	parsers := map[string]func(string) error{
+		EnvLogLevel: func(v string) error {
+			level, err := zerolog.ParseLevel(v)
+			if err != nil {
+				return ValidationError{
+					Param:  EnvLogLevel,
+					Value:  v,
+					Reason: "must be one of: debug, info, warn, error",
+				}
+			}
+			config.LogLevel = level
+			return nil
+		},
+		EnvNumWorkers: func(v string) error {
+			val, err := parsePositiveInt(EnvNumWorkers, v)
+			if err != nil {
+				return err
+			}
+			config.NumOfWorkers = val
+			return nil
+		},
+		EnvWorkerBatchSize: func(v string) error {
+			val, err := parsePositiveInt(EnvWorkerBatchSize, v)
+			if err != nil {
+				return err
+			}
+			config.WorkerBatchSize = val
+			return nil
+		},
+		EnvMaxResponseSize: func(v string) error {
+			val, err := parsePositiveInt(EnvMaxResponseSize, v)
+			if err != nil {
+				return err
+			}
+			config.MaxResponseSize = val
+			return nil
+		},
+		EnvResponseTimeout: func(v string) error {
+			val, err := parsePositiveInt(EnvResponseTimeout, v)
+			if err != nil {
+				return err
+			}
+			config.ResponseTimeout = val
+			return nil
+		},
+		EnvPanicOnUnexpectedError: func(v string) error {
+			val, err := parseBool(EnvPanicOnUnexpectedError, v)
+			if err != nil {
+				return err
+			}
+			config.PanicOnUnexpectedError = val
+			return nil
+		},
+		EnvBlacklistPath: func(v string) error {
+			config.BlacklistPath = v
+			return nil
+		},
+		EnvDryRun: func(v string) error {
+			val, err := parseBool(EnvDryRun, v)
+			if err != nil {
+				return err
+			}
+			config.DryRun = val
+			return nil
+		},
+	}
+
+	// Process each environment variable
+	for envVar, parser := range parsers {
+		value, ok := os.LookupEnv(envVar)
+		if !ok {
+			fmt.Fprintf(os.Stderr, "Missing required environment variable: %s\n", envVar)
+			os.Exit(1)
+		}
+
+		if err := parser(value); err != nil {
+			fmt.Fprintf(os.Stderr, "Configuration error: %v\n", err)
+			os.Exit(1)
+		}
+	}
+
+	return config
 }
config/errors.go (new file): 14 additions

@@ -0,0 +1,14 @@
+package config
+
+import "fmt"
+
+// ValidationError represents a config validation error
+type ValidationError struct {
+	Param  string
+	Value  string
+	Reason string
+}
+
+func (e ValidationError) Error() string {
+	return fmt.Sprintf("invalid value '%s' for %s: %s", e.Value, e.Param, e.Reason)
+}
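
A minimal sketch, not part of this diff, of how the rewritten config package might be wired up at startup; GetConfig, CONFIG and the field names come from the files above, while the main() wiring itself is hypothetical:

```go
package main

import (
	"fmt"

	"gemini-grc/config"
)

func main() {
	// GetConfig exits the process itself on a missing or invalid variable
	// (assumption based on the os.Exit calls shown above), so there is no
	// error value to handle here.
	config.CONFIG = *config.GetConfig()

	fmt.Printf("workers=%d batch=%d timeout=%ds\n",
		config.CONFIG.NumOfWorkers,
		config.CONFIG.WorkerBatchSize,
		config.CONFIG.ResponseTimeout)
}
```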
db/delete_robots_hosts.sql (new file): 7 additions

@@ -0,0 +1,7 @@
+delete FROM snapshots
+WHERE host IN (
+    SELECT DISTINCT host
+    FROM snapshots
+    WHERE error LIKE 'robots.txt%'
+)
+AND url LIKE 'gemini://' || host || '/%';
db/error_stats.sql (new file): 5 additions

@@ -0,0 +1,5 @@
+SELECT error, count(error) as count
+FROM snapshots
+GROUP BY error
+ORDER BY count DESC
+LIMIT 20;
db/fix-url-ports.sql (new file): 22 additions

@@ -0,0 +1,22 @@
+-- Here's an SQL script that will find and remove snapshots without port numbers
+-- when there exists a duplicate with the default port 1965.
+
+-- Before running this DELETE though, you might want to
+-- verify the matches first with this SELECT:
+WITH duplicates AS (
+    SELECT s1.id as id_without_port, s2.id as id_with_port, s1.url as url_without_port, s2.url as url_with_port
+    FROM snapshots s1
+    JOIN snapshots s2
+    ON s2.url = s1.url || ':1965'
+)
+SELECT * FROM duplicates;
+
+-- Now delete them for real:
+WITH duplicates AS (
+    SELECT s1.id as id_without_port, s2.id as id_with_port, s1.url as url_without_port, s2.url as url_with_port
+    FROM snapshots s1
+    JOIN snapshots s2
+    ON s2.url = s1.url || ':1965'
+)
+DELETE FROM snapshots
+WHERE id IN (SELECT id_without_port FROM duplicates);
db/host_stats_visited.sql (new file): 7 additions

@@ -0,0 +1,7 @@
+SELECT host, COUNT(*) AS row_count
+FROM snapshots
+WHERE response_code IS NOT NULL
+AND error IS NULL
+GROUP BY host
+ORDER BY row_count DESC
+LIMIT 10;
(file name not shown in this capture)

@@ -20,7 +20,7 @@ DROP TABLE IF EXISTS snapshots;
 CREATE TABLE snapshots (
     id SERIAL PRIMARY KEY,
     uid TEXT NOT NULL UNIQUE,
-    url TEXT NOT NULL,
+    url TEXT NOT NULL UNIQUE,
     host TEXT NOT NULL,
     timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
     mimetype TEXT,
@@ -42,7 +42,10 @@ CREATE INDEX idx_lang ON snapshots (lang);
 CREATE INDEX idx_response_code ON snapshots (response_code);
 CREATE INDEX idx_error ON snapshots (error);
 CREATE INDEX idx_host ON snapshots (host);
+CREATE INDEX unique_uid_url ON snapshots (uid, url);
+
 CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host)
 WHERE response_code IS NULL AND error IS NULL
 INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang);
 
 CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;
db/migrate2_unique_urls.sql (new file): 18 additions

@@ -0,0 +1,18 @@
+-- Step 1: Delete duplicate entries, keeping the last one based on timestamp
+-- Use a CTE to mark duplicates and delete them efficiently
+WITH ranked_snapshots AS (
+    SELECT
+        id,
+        url,
+        ROW_NUMBER() OVER(PARTITION BY url ORDER BY timestamp DESC) AS row_num
+    FROM
+        snapshots
+)
+DELETE FROM snapshots
+USING ranked_snapshots
+WHERE snapshots.id = ranked_snapshots.id
+AND ranked_snapshots.row_num > 1;
+
+-- Step 2: Add a unique constraint on the url column to prevent future duplicates
+ALTER TABLE snapshots
+ADD CONSTRAINT unique_url UNIQUE (url);
(file name not shown in this capture)

@@ -2,44 +2,56 @@ package main
 
 import (
 	"fmt"
-	"gemini-grc/gemini"
 	"os"
+
+	"gemini-grc/gemini"
 	_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
 	"github.com/jmoiron/sqlx"
 )
 
+// MANUALLY SET TO TRUE WHEN MIGRATION HAS BEEN APPLIED
 func checkIfDone() bool { return true }
 
+func MustSelect(tx *sqlx.Tx, dest interface{}, query string, args ...interface{}) {
+	if err := tx.Select(dest, query, args...); err != nil {
+		panic(err)
+	}
+}
+
 // Populates the `host` field
 func main() {
-	db := connectToDB()
-
 	if checkIfDone() {
 		fmt.Println("Migration already applied")
 		return
 	}
 
-	count := 0
+	db := connectToDB()
+	defer db.Close()
+
+	batchSize := 1000
 	for {
 		// Start the transaction
-		tx, err := db.Beginx()
-		if err != nil {
-			fmt.Println(err)
-			return
-		}
-
-		query := `
-		SELECT * FROM snapshots
-		WHERE host IS NULL
-		LIMIT 5000
-		`
+		tx := db.MustBegin()
+		query := `SELECT * FROM snapshots WHERE url NOT LIKE '%1965%' LIMIT $1`
 		var snapshots []gemini.Snapshot
-		err = tx.Select(&snapshots, query)
+		MustSelect(tx, &snapshots, query, batchSize)
 		if len(snapshots) == 0 {
-			fmt.Println("Done!")
-			return
+			fmt.Println("No snapshots remaining, done")
+			break
 		}
+		for i, s := range snapshots {
+			_, err := gemini.ParseURL(s.URL.String(), "")
+			if err != nil {
+				panic(fmt.Sprintf("Error parsing URL. ID %d URL %s\n", s.ID, s.URL))
+			}
+			fmt.Printf("Saving %d %d %s\n", i+1, s.ID, s.URL)
+			err = gemini.UpsertSnapshot(0, tx, &s)
+			if err != nil {
+				panic(fmt.Sprintf("Error saving %s: %s", s.URL, err))
+			}
+			tx.MustExec(`DELETE FROM snapshots WHERE id=$1`, s.ID)
+		}
+		err := tx.Commit()
 		if err != nil {
 			fmt.Println(err)
 			err := tx.Rollback()
@@ -47,31 +59,7 @@ func main() {
 			if err != nil {
 				panic(err)
 			}
 		}
-		for _, s := range snapshots {
-			s.Host = s.URL.Hostname
-			fmt.Println(count, s.UID, s.URL.Hostname)
-			err := gemini.SaveSnapshotToDB(tx, &s)
-			if err != nil {
-				fmt.Println(err)
-				err := tx.Rollback()
-				if err != nil {
-					panic(err)
-				}
-			}
-			count += 1
-		}
-
-		err = tx.Commit()
-		if err != nil {
-			fmt.Println(err)
-			err := tx.Rollback()
-			if err != nil {
-				panic(err)
-			}
-		}
-
 	}
 
 }
 
 func connectToDB() *sqlx.DB {
(file name not shown in this capture)

@@ -1,20 +1,29 @@
-// func PopulateDB(db *sqlx.DB) {
-// // Delete all rows in the snapshots table
-// db.MustExec("TRUNCATE snapshots;")
-
-// // Prepare the query for inserting a snapshot with uid, url, and timestamp
-// query := `INSERT INTO snapshots(uid, url, timestamp)
-// VALUES ($1, $2, $3)`
-
-// // Calculate the timestamp for 2 days ago
-// timestamp := time.Now().Add(-48 * time.Hour)
-
-// db.MustExec(query, uid.UID(), "gemini://geminiprotocol.net/", timestamp)
-// db.MustExec(query, uid.UID(), "gemini://warmedal.se/~antenna", timestamp)
-// db.MustExec(query, uid.UID(), "gemini://skyjake.fi/~Cosmos/", timestamp)
-// db.MustExec(query, uid.UID(), "gemini://gemini.circumlunar.space/capcom/", timestamp)
-// db.MustExec(query, uid.UID(), "gemini://auragem.letz.dev/", timestamp)
-// db.MustExec(query, uid.UID(), "gemini://gemplex.space/", timestamp)
-// db.MustExec(query, uid.UID(), "gemini://kennedy.gemi.dev/", timestamp)
-// db.MustExec(query, uid.UID(), "gemini://tlgs.one/", timestamp)
-// }
+package main
+
+import (
+	"time"
+
+	"gemini-grc/uid"
+	"github.com/jmoiron/sqlx"
+)
+
+func PopulateDB(db *sqlx.DB) {
+	// Delete all rows in the snapshots table
+	db.MustExec("TRUNCATE snapshots;")
+
+	// Prepare the query for inserting a snapshot with uid, url, and timestamp
+	query := `INSERT INTO snapshots(uid, url, timestamp)
+	          VALUES ($1, $2, $3)`
+
+	// Calculate the timestamp for 2 days ago
+	timestamp := time.Now().Add(-48 * time.Hour)
+
+	db.MustExec(query, uid.UID(), "gemini://geminiprotocol.net/", timestamp)
+	db.MustExec(query, uid.UID(), "gemini://warmedal.se/~antenna", timestamp)
+	db.MustExec(query, uid.UID(), "gemini://skyjake.fi/~Cosmos/", timestamp)
+	db.MustExec(query, uid.UID(), "gemini://gemini.circumlunar.space/capcom/", timestamp)
+	db.MustExec(query, uid.UID(), "gemini://auragem.letz.dev/", timestamp)
+	db.MustExec(query, uid.UID(), "gemini://gemplex.space/", timestamp)
+	db.MustExec(query, uid.UID(), "gemini://kennedy.gemi.dev/", timestamp)
+	db.MustExec(query, uid.UID(), "gemini://tlgs.one/", timestamp)
+}
db/url_port_stats.sql (new file): 5 additions

@@ -0,0 +1,5 @@
+SELECT
+    COUNT(*) AS "All",
+    COUNT(CASE WHEN URL ~ '://[^:]+:[0-9]+' THEN 1 END) AS "With port",
+    COUNT(CASE WHEN URL !~ '://[^:]+:[0-9]+' THEN 1 END) AS "Without port"
+FROM snapshots;
(file name not shown in this capture)

@@ -1,14 +1,16 @@
 #!/bin/sh
 set -eu
 
+# Max response size 10MiB
 MAX_RESPONSE_SIZE=10485760 \
-LOG_LEVEL=info \
+LOG_LEVEL=debug \
 ROOT_PATH=./snaps \
 RESPONSE_TIMEOUT=10 \
-NUM_OF_WORKERS=5 \
+NUM_OF_WORKERS=1 \
+WORKER_BATCH_SIZE=1 \
 PG_DATABASE=gemini \
 PG_HOST=127.0.0.1 \
 PG_PORT=5433 \
 PG_USER=gemini \
 PG_PASSWORD=gemini \
-go run ./migrate1_host.go
+dlv debug
(file name not shown in this capture)

@@ -1,3 +1,6 @@
+// 31 redirect
+gemini://gemini.circumlunar.space
+
 // body with null byte
 gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False
 
gemini/blacklist.go (new file): 51 additions

@@ -0,0 +1,51 @@
+package gemini
+
+import (
+	"fmt"
+	"os"
+	"strings"
+
+	"gemini-grc/config"
+	"gemini-grc/logging"
+)
+
+var Blacklist *[]string //nolint:gochecknoglobals
+
+func LoadBlacklist() {
+	if Blacklist == nil {
+		data, err := os.ReadFile(config.CONFIG.BlacklistPath)
+
+		if err != nil {
+			Blacklist = &[]string{}
+			logging.LogWarn("Could not load Blacklist file: %v", err)
+			return
+		}
+		lines := strings.Split(string(data), "\n")
+
+		// Ignore lines starting with '#' (comments)
+		filteredLines := func() []string {
+			out := make([]string, 0, len(lines))
+			for _, line := range lines {
+				if !strings.HasPrefix(line, "#") {
+					out = append(out, line)
+				}
+			}
+			return out
+		}()
+
+		if len(lines) > 0 {
+			Blacklist = &filteredLines
+			logging.LogInfo("Blacklist has %d entries", len(*Blacklist))
+		}
+	}
+}
+
+func IsBlacklisted(url URL) bool {
+	hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
+	for _, v := range *Blacklist {
+		if v == url.Hostname || v == hostWithPort {
+			return true
+		}
+	}
+	return false
+}
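
A small usage sketch for the new blacklist helpers; LoadBlacklist, ParseURL and IsBlacklisted are the functions introduced in this change, while shouldVisit and the example URL are hypothetical:

```go
package main

import (
	"fmt"

	"gemini-grc/gemini"
)

// shouldVisit is a hypothetical helper a worker could call before fetching.
func shouldVisit(rawURL string) (bool, error) {
	gemini.LoadBlacklist() // reads config.CONFIG.BlacklistPath on the first call only

	u, err := gemini.ParseURL(rawURL, "")
	if err != nil {
		return false, err
	}
	// Matches either "host" or "host:port" entries from the blacklist file.
	return !gemini.IsBlacklisted(*u), nil
}

func main() {
	ok, err := shouldVisit("gemini://gemi.dev/some/page.gmi")
	fmt.Println(ok, err) // gemi.dev is listed in blacklist.txt above, so ok is false
}
```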
(file name not shown in this capture)

@@ -4,11 +4,11 @@ import (
 	"gemini-grc/logging"
 )
 
-var IpPool IpAddressPool = IpAddressPool{IPs: make(map[string]int)}
+var IpPool = IpAddressPool{IPs: make(map[string]int)}
 
-func AddIPsToPool(IPs []string) {
+func AddIPsToPool(ips []string) {
 	IpPool.Lock.Lock()
-	for _, ip := range IPs {
+	for _, ip := range ips {
 		logging.LogDebug("Adding %s to pool", ip)
 		IpPool.IPs[ip]++
 	}
gemini/errors.go (new file): 101 additions

@@ -0,0 +1,101 @@
+package gemini
+
+import (
+	"errors"
+	"fmt"
+)
+
+type GeminiError struct {
+	Msg    string
+	Code   int
+	Header string
+}
+
+func (e *GeminiError) Error() string {
+	return fmt.Sprintf("%s: %s", e.Msg, e.Header)
+}
+
+func NewErrGeminiStatusCode(code int, header string) error {
+	var msg string
+	switch {
+	case code >= 10 && code < 20:
+		msg = "needs input"
+	case code >= 30 && code < 40:
+		msg = "redirect"
+	case code >= 40 && code < 50:
+		msg = "bad request"
+	case code >= 50 && code < 60:
+		msg = "server error"
+	case code >= 60 && code < 70:
+		msg = "TLS error"
+	default:
+		msg = "unexpected status code"
+	}
+	return &GeminiError{
+		Msg:    msg,
+		Code:   code,
+		Header: header,
+	}
+}
+
+var (
+	ErrGeminiRobotsParse      = errors.New("gemini robots.txt parse error")
+	ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
+	ErrGeminiResponseHeader   = errors.New("gemini response header error")
+	ErrGeminiLinkLineParse    = errors.New("gemini link line parse error")
+
+	ErrURLParse  = errors.New("URL parse error")
+	ErrURLDecode = errors.New("URL decode error")
+	ErrUTF8Parse = errors.New("UTF-8 parse error")
+	ErrTextParse = errors.New("text parse error")
+
+	ErrNetwork                        = errors.New("network error")
+	ErrNetworkDNS                     = errors.New("network DNS error")
+	ErrNetworkTLS                     = errors.New("network TLS error")
+	ErrNetworkSetConnectionDeadline   = errors.New("network error - cannot set connection deadline")
+	ErrNetworkCannotWrite             = errors.New("network error - cannot write")
+	ErrNetworkResponseSizeExceededMax = errors.New("network error - response size exceeded maximum size")
+
+	ErrDatabase = errors.New("database error")
+)
+
+// We could have used a map for speed, but
+// we would lose ability to check wrapped
+// errors via errors.Is().
+
+var errGemini *GeminiError
+
+var knownErrors = []error{ //nolint:gochecknoglobals
+	errGemini,
+	ErrGeminiLinkLineParse,
+	ErrGeminiRobotsParse,
+	ErrGeminiRobotsDisallowed,
+	ErrGeminiResponseHeader,
+
+	ErrURLParse,
+	ErrURLDecode,
+	ErrUTF8Parse,
+	ErrTextParse,
+
+	ErrNetwork,
+	ErrNetworkDNS,
+	ErrNetworkTLS,
+	ErrNetworkSetConnectionDeadline,
+	ErrNetworkCannotWrite,
+	ErrNetworkResponseSizeExceededMax,
+
+	ErrDatabase,
+}
+
+func IsKnownError(err error) bool {
+	for _, known := range knownErrors {
+		if errors.Is(err, known) {
+			return true
+		}
+	}
+	// Check for wrapped errors as well
+	if errors.As(err, new(*GeminiError)) {
+		return true
+	}
+	return false
+}
gemini/errors_test.go (new file): 24 additions

@@ -0,0 +1,24 @@
+package gemini
+
+import (
+	"errors"
+	"fmt"
+	"testing"
+)
+
+func TestErrGemini(t *testing.T) {
+	t.Parallel()
+	err := NewErrGeminiStatusCode(50, "50 server error")
+	if !errors.As(err, new(*GeminiError)) {
+		t.Errorf("TestErrGemini fail")
+	}
+}
+
+func TestErrGeminiWrapped(t *testing.T) {
+	t.Parallel()
+	err := NewErrGeminiStatusCode(50, "50 server error")
+	errWrapped := fmt.Errorf("%w wrapped", err)
+	if !errors.As(errWrapped, new(*GeminiError)) {
+		t.Errorf("TestErrGeminiWrapped fail")
+	}
+}
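
A short sketch of how the new error taxonomy could be consumed by a caller, for example to decide what counts as an expected failure; only NewErrGeminiStatusCode, IsKnownError, GeminiError and the Err* sentinels come from the diff, the surrounding main() is illustrative:

```go
package main

import (
	"errors"
	"fmt"

	"gemini-grc/gemini"
)

func main() {
	// A status-code error wrapped with context, roughly as a worker might produce it.
	err := fmt.Errorf("visiting %s: %w", "gemini://example.org/",
		gemini.NewErrGeminiStatusCode(51, "51 not found"))

	switch {
	case errors.Is(err, gemini.ErrGeminiRobotsDisallowed):
		fmt.Println("disallowed by robots.txt") // expected, just record it
	case gemini.IsKnownError(err):
		fmt.Println("known error:", err) // classified, store it on the snapshot
	default:
		fmt.Println("unexpected error:", err) // candidate for PANIC_ON_UNEXPECTED_ERROR
	}
}
```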
(file name not shown in this capture)

@@ -2,12 +2,13 @@ package gemini
 
 import (
 	"fmt"
-	"gemini-grc/logging"
 	"net/url"
 	"os"
 	"path"
 	"path/filepath"
 	"strings"
+
+	"gemini-grc/logging"
 )
 
 // sanitizePath encodes invalid filesystem characters using URL encoding.
@@ -67,7 +68,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
 	urlPath := s.URL.Path
 	// If path is empty, add `index.gmi` as the file to save
 	if urlPath == "" || urlPath == "." {
-		urlPath = fmt.Sprintf("index.gmi")
+		urlPath = "index.gmi"
 	}
 	// If path ends with '/' then add index.gmi for the
 	// directory to be created.
@@ -77,7 +78,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
 
 	finalPath, err := calcFilePath(parentPath, urlPath)
 	if err != nil {
-		logging.LogError("Error saving %s: %w", s.URL, err)
+		logging.LogError("GeminiError saving %s: %w", s.URL, err)
 		return
 	}
 	// Ensure the directory exists
@@ -87,12 +88,12 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
 		return
 	}
 	if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
-		err = os.WriteFile(finalPath, (*s).Data.V, 0666)
+		err = os.WriteFile(finalPath, (*s).Data.V, 0o666)
 	} else {
-		err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0666)
+		err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
 	}
 	if err != nil {
-		logging.LogError("Error saving %s: %w", s.URL.Full, err)
+		logging.LogError("GeminiError saving %s: %w", s.URL.Full, err)
 	}
 	close(done)
 }
gemini/gemini.go: 120 changes

@@ -1,98 +1,37 @@
 package gemini
 
 import (
-	"errors"
 	"fmt"
-	"gemini-grc/logging"
 	"net/url"
-	gourl "net/url"
 	"regexp"
 	"strconv"
-	"strings"
+
+	"gemini-grc/logging"
 )
 
-func isGeminiURL(url string) bool {
-	_, err := gourl.Parse(url)
-	if err != nil {
-		logging.LogWarn("[%s] Invalid URL: %v", url, err)
-		return false
-	}
-	return strings.HasPrefix(url, "gemini://")
-}
-
-func parseLinks(s Snapshot, queue chan string) {
-	for _, link := range *s.Links {
-		if strings.HasPrefix(link.Full, "gemini://") {
-			go func(link GeminiUrl) {
-				// fmt.Printf("LINK: %s\n", link)
-				queue <- link.Full
-			}(link)
-		}
-	}
-}
-
-func checkGeminiStatusCode(code int) error {
-	switch {
-	case code == 20:
-		return nil
-	case code >= 10 && code < 20:
-		return fmt.Errorf("gemini response %d needs data input", code)
-	case code >= 30 && code < 40:
-		return fmt.Errorf("gemini response %d redirect", code)
-	case code >= 40 && code < 50:
-		return fmt.Errorf("gemini response %d server error", code)
-	case code >= 50 && code < 60:
-		return fmt.Errorf("gemini response %d server permanent error", code)
-	case code >= 60 && code < 70:
-		return fmt.Errorf("gemini response %d certificate error", code)
-	default:
-		return fmt.Errorf("unexpected/unhandled Gemini response %d", code)
-	}
-}
-
-func ProcessGemini(snapshot *Snapshot) *Snapshot {
+func GetPageLinks(currentURL URL, gemtext string) LinkList {
 	// Grab link lines
-	linkLines := ExtractLinkLines(snapshot.GemText.String)
-	logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines))
+	linkLines := ExtractLinkLines(gemtext)
+	if len(linkLines) == 0 {
+		return nil
+	}
+	var linkURLs LinkList
 	// Normalize URLs in links, and store them in snapshot
 	for _, line := range linkLines {
-		normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String())
+		normalizedLink, descr, err := NormalizeLink(line, currentURL.String())
 		if err != nil {
 			logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
 			continue
 		}
-		geminiUrl, err := ParseUrl(normalizedLink, descr)
+		geminiUrl, err := ParseURL(normalizedLink, descr)
 		if err != nil {
 			logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
 			continue
 		}
-		if snapshot.Links == nil {
-			snapshot.Links = &LinkList{*geminiUrl}
-		} else {
-			*snapshot.Links = append(*snapshot.Links, *geminiUrl)
-		}
+		logging.LogDebug(geminiUrl.String())
+		linkURLs = append(linkURLs, *geminiUrl)
 	}
-	return snapshot
-}
-
-func ParseUrl(input string, descr string) (*GeminiUrl, error) {
-	u, err := url.Parse(input)
-	if err != nil {
-		return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
-	}
-	protocol := u.Scheme
-	hostname := u.Hostname()
-	strPort := u.Port()
-	path := u.Path
-	if strPort == "" {
-		strPort = "1965"
-	}
-	port, err := strconv.Atoi(strPort)
-	if err != nil {
-		return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
-	}
-	return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
+	return linkURLs
 }
 
 // ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
@@ -109,11 +48,11 @@ func ExtractLinkLines(gemtext string) []string {
 // NormalizeLink takes a single link line and the current URL,
 // return the URL converted to an absolute URL
 // and its description.
-func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
+func NormalizeLink(linkLine string, currentURL string) (string, string, error) {
 	// Parse the current URL
 	baseURL, err := url.Parse(currentURL)
 	if err != nil {
-		return "", "", fmt.Errorf("invalid current URL: %v", err)
+		return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
 	}
 
 	// Regular expression to extract the URL part from a link line
@@ -123,13 +62,13 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
 	matches := re.FindStringSubmatch(linkLine)
 	if len(matches) == 0 {
 		// If the line doesn't match the expected format, return it unchanged
-		return "", "", fmt.Errorf("not a link line: %v", linkLine)
+		return "", "", fmt.Errorf("%w for link line %s", ErrGeminiLinkLineParse, linkLine)
 	}
 
 	originalURLStr := matches[1]
 	_, err = url.QueryUnescape(originalURLStr)
 	if err != nil {
-		return "", "", fmt.Errorf("error decoding URL: %w", err)
+		return "", "", fmt.Errorf("%w: %w", ErrURLDecode, err)
 	}
 
 	restOfLine := ""
@@ -141,7 +80,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
 	parsedURL, err := url.Parse(originalURLStr)
 	if err != nil {
 		// If URL parsing fails, return an error
-		return "", "", fmt.Errorf("invalid URL '%s': %v", originalURLStr, err)
+		return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
 	}
 
 	// Resolve relative URLs against the base URL
@@ -173,14 +112,33 @@ func ParseFirstTwoDigits(input string) (int, error) {
 	// Find the first match in the string
 	matches := re.FindStringSubmatch(input)
 	if len(matches) == 0 {
-		return 0, errors.New("no digits found at the beginning of the string")
+		return 0, fmt.Errorf("%w", ErrGeminiResponseHeader)
 	}
 
 	// Parse the captured match as an integer
 	snapshot, err := strconv.Atoi(matches[1])
 	if err != nil {
-		return 0, fmt.Errorf("failed to convert matched digits to int: %v", err)
+		return 0, fmt.Errorf("%w: %w", ErrTextParse, err)
 	}
 
 	return snapshot, nil
 }
+
+// extractRedirectTarget returns the redirection
+// URL by parsing the header (or error message)
+func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
+	// \d+ - matches one or more digits
+	// \s+ - matches one or more whitespace
+	// ([^\r]+) - captures everything until it hits a \r (or end of string)
+	pattern := `\d+\s+([^\r]+)`
+	re := regexp.MustCompile(pattern)
+	matches := re.FindStringSubmatch(input)
+	if len(matches) < 2 {
+		return nil, fmt.Errorf("%w: Cannot find redirect target from header %s", ErrGeminiResponseHeader, input)
+	}
+	newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
+	if err != nil {
+		return nil, fmt.Errorf("%w: Cannot find redirect target from header: %w", ErrGeminiResponseHeader, err)
+	}
+	return newURL, nil
+}
gemini/gemini_test.go (new file): 47 additions

@@ -0,0 +1,47 @@
+package gemini
+
+import (
+	"fmt"
+	"testing"
+)
+
+func TestExtractRedirectTargetFullURL(t *testing.T) {
+	t.Parallel()
+	currentURL, _ := ParseURL("gemini://smol.gr", "")
+	input := "redirect: 31 gemini://target.gr"
+	result, err := extractRedirectTarget(*currentURL, input)
+	if err != nil || (result.String() != "gemini://target.gr:1965") {
+		t.Errorf("fail: %s", result)
+	}
+}
+
+func TestExtractRedirectTargetRelativeURL(t *testing.T) {
+	t.Parallel()
+	currentURL, _ := ParseURL("gemini://smol.gr", "")
+	input := "redirect: 31 /a/b"
+	result, err := extractRedirectTarget(*currentURL, input)
+	if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
+		t.Errorf("fail: %s", result)
+	}
+}
+
+func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
+	t.Parallel()
+	currentURL, _ := ParseURL("gemini://nox.im:1965", "")
+	input := "redirect: 31 ./"
+	result, err := extractRedirectTarget(*currentURL, input)
+	if err != nil || (result.String() != "gemini://nox.im:1965/") {
+		t.Errorf("fail: %s", result)
+	}
+}
+
+func TestExtractRedirectTargetWrong(t *testing.T) {
+	t.Parallel()
+	currentURL, _ := ParseURL("gemini://smol.gr", "")
+	input := "redirect: 31 fsdsdf"
+	result, err := extractRedirectTarget(*currentURL, input)
+	fmt.Println(err)
+	if result != nil || err == nil {
+		t.Errorf("fail: result should be nil, err is %s", err)
+	}
+}
(file name not shown in this capture)

@@ -1,12 +1,16 @@
 package gemini
 
 import (
-	"encoding/json"
+	"database/sql/driver"
 	"fmt"
 	"gemini-grc/logging"
+	"net/url"
+	"path"
+	"strconv"
+	"strings"
 )
 
-type GeminiUrl struct {
+type URL struct {
 	Protocol string `json:"protocol,omitempty"`
 	Hostname string `json:"hostname,omitempty"`
 	Port     int    `json:"port,omitempty"`
@@ -15,43 +19,79 @@ type GeminiUrl struct {
 	Full string `json:"full,omitempty"`
 }
 
-func (g *GeminiUrl) Scan(value interface{}) error {
+func (u *URL) Scan(value interface{}) error {
 	if value == nil {
 		// Clear the fields in the current GeminiUrl object (not the pointer itself)
-		*g = GeminiUrl{}
+		*u = URL{}
 		return nil
 	}
 	b, ok := value.(string)
 	if !ok {
 		return fmt.Errorf("failed to scan GeminiUrl: expected string, got %T", value)
 	}
-	parsedUrl, err := ParseUrl(b, "")
+	parsedURL, err := ParseURL(b, "")
 	if err != nil {
 		return err
 	}
-	*g = *parsedUrl
+	*u = *parsedURL
 	return nil
 }
 
-func (u GeminiUrl) String() string {
+func (u URL) String() string {
 	return u.Full
-	// return fmt.Sprintf("%s://%s:%d%s", u.Protocol, u.Hostname, u.Port, u.Path)
 }
 
-func GeminiUrltoJSON(g GeminiUrl) string {
-	// Serialize the Person struct to JSON
-	jsonData, err := json.Marshal(g)
-	if err != nil {
-		logging.LogError("Error serializing to JSON: %w", err)
+func (u URL) StringNoDefaultPort() string {
+	if u.Port == 1965 {
+		return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
 	}
-	return string(jsonData)
+	return u.Full
 }
 
-func GeminiUrlFromJSON(input string) GeminiUrl {
-	var geminiUrl GeminiUrl
-	err := json.Unmarshal([]byte(input), &geminiUrl)
-	if err != nil {
-		logging.LogError("Error deserializing from JSON: %w", err)
+func (u URL) Value() (driver.Value, error) {
+	if u.Full == "" {
+		return nil, nil
 	}
-	return geminiUrl
+	return u.Full, nil
+}
+
+func ParseURL(input string, descr string) (*URL, error) {
+	u, err := url.Parse(input)
+	if err != nil {
+		return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err)
+	}
+	protocol := u.Scheme
+	hostname := u.Hostname()
+	strPort := u.Port()
+	path := u.Path
+	if strPort == "" {
+		strPort = "1965"
+	}
+	port, err := strconv.Atoi(strPort)
+	if err != nil {
+		return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err)
+	}
+	full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, path)
+	return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: full}, nil
+}
+
+func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
+	logging.LogDebug("Calculating redirect URL. Current %s header string %s", currentURL, input)
+	// If URL is absolute, return just it
+	if strings.Contains(input, "://") {
+		return ParseURL(input, "")
+	}
+	// input is a path. Clean it and construct
+	// new path
+	var newPath string
+	// Handle weird cases found in the wild
+	if strings.HasPrefix(input, "/") {
+		newPath = path.Clean(input)
+	} else if input == "./" || input == "." {
+		newPath = path.Join(currentURL.Path, "/")
+	} else {
+		newPath = path.Join(currentURL.Path, path.Clean(input))
+	}
+	strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath)
+	return ParseURL(strURL, "")
 }
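
The new StringNoDefaultPort method exists so the request line can omit an explicit :1965, which some servers reject (see the comment in the network code further down). A tiny sketch of the intended behaviour, assuming the ParseURL shown above; the example hosts are made up:

```go
package main

import (
	"fmt"

	"gemini-grc/gemini"
)

func main() {
	u, _ := gemini.ParseURL("gemini://example.org/index.gmi", "")
	fmt.Println(u.String())              // gemini://example.org:1965/index.gmi (Full always carries a port)
	fmt.Println(u.StringNoDefaultPort()) // gemini://example.org/index.gmi (default port dropped)

	v, _ := gemini.ParseURL("gemini://example.org:1966/index.gmi", "")
	fmt.Println(v.StringNoDefaultPort()) // gemini://example.org:1966/index.gmi (non-default port kept)
}
```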
gemini/gemini_url_test.go (new file): 103 additions

@@ -0,0 +1,103 @@
+package gemini
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestParseURL(t *testing.T) {
+	t.Parallel()
+	input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
+	parsed, err := ParseURL(input, "")
+	value, _ := parsed.Value()
+	if err != nil || !(value == "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162") {
+		t.Errorf("fail: %s", parsed)
+	}
+}
+
+func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
+	t.Parallel()
+	currentURL := URL{
+		Protocol: "gemini",
+		Hostname: "smol.gr",
+		Port:     1965,
+		Path:     "/a/b",
+		Descr:    "Nothing",
+		Full:     "gemini://smol.gr:1965/a/b",
+	}
+	input := "gemini://a.b/c"
+	output, err := DeriveAbsoluteURL(currentURL, input)
+	if err != nil {
+		t.Errorf("fail: %v", err)
+	}
+	expected := &URL{
+		Protocol: "gemini",
+		Hostname: "a.b",
+		Port:     1965,
+		Path:     "/c",
+		Descr:    "",
+		Full:     "gemini://a.b:1965/c",
+	}
+	pass := reflect.DeepEqual(output, expected)
+	if !pass {
+		t.Errorf("fail: %#v != %#v", output, expected)
+	}
+}
+
+func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
+	t.Parallel()
+	currentURL := URL{
+		Protocol: "gemini",
+		Hostname: "smol.gr",
+		Port:     1965,
+		Path:     "/a/b",
+		Descr:    "Nothing",
+		Full:     "gemini://smol.gr:1965/a/b",
+	}
+	input := "/c"
+	output, err := DeriveAbsoluteURL(currentURL, input)
+	if err != nil {
+		t.Errorf("fail: %v", err)
+	}
+	expected := &URL{
+		Protocol: "gemini",
+		Hostname: "smol.gr",
+		Port:     1965,
+		Path:     "/c",
+		Descr:    "",
+		Full:     "gemini://smol.gr:1965/c",
+	}
+	pass := reflect.DeepEqual(output, expected)
+	if !pass {
+		t.Errorf("fail: %#v != %#v", output, expected)
+	}
+}
+
+func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
+	t.Parallel()
+	currentURL := URL{
+		Protocol: "gemini",
+		Hostname: "smol.gr",
+		Port:     1965,
+		Path:     "/a/b",
+		Descr:    "Nothing",
+		Full:     "gemini://smol.gr:1965/a/b",
+	}
+	input := "c/d"
+	output, err := DeriveAbsoluteURL(currentURL, input)
+	if err != nil {
+		t.Errorf("fail: %v", err)
+	}
+	expected := &URL{
+		Protocol: "gemini",
+		Hostname: "smol.gr",
+		Port:     1965,
+		Path:     "/a/b/c/d",
+		Descr:    "",
+		Full:     "gemini://smol.gr:1965/a/b/c/d",
+	}
+	pass := reflect.DeepEqual(output, expected)
+	if !pass {
+		t.Errorf("fail: %#v != %#v", output, expected)
+	}
+}
(file name not shown in this capture)

@@ -2,25 +2,30 @@ package gemini
 
 import (
 	"crypto/tls"
+	"errors"
 	"fmt"
-	"gemini-grc/config"
+	"gemini-grc/logging"
 	"io"
 	"net"
-	go_url "net/url"
+	gourl "net/url"
 	"regexp"
 	"slices"
 	"strconv"
+	"strings"
 	"time"
+
+	"gemini-grc/config"
+
 	"github.com/guregu/null/v5"
 )
 
-type GeminiPageData struct {
+type PageData struct {
 	ResponseCode   int
-	MimeType       string
-	Lang           string
-	GemText        string
-	Data           []byte
+	ResponseHeader string
+	MimeType       string
+	Lang           string
+	GemText        string
+	Data           []byte
 }
 
 // Resolve the URL hostname and
@@ -31,7 +36,7 @@ type GeminiPageData struct {
 func getHostIPAddresses(hostname string) ([]string, error) {
 	addrs, err := net.LookupHost(hostname)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("%w:%w", ErrNetworkDNS, err)
 	}
 	IpPool.Lock.RLock()
 	defer func() {
@@ -41,52 +46,50 @@ func getHostIPAddresses(hostname string) ([]string, error) {
 }
 
 func ConnectAndGetData(url string) ([]byte, error) {
-	parsedUrl, err := go_url.Parse(url)
+	parsedURL, err := gourl.Parse(url)
 	if err != nil {
-		return nil, fmt.Errorf("Could not parse URL, error %w", err)
+		return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
 	}
-	hostname := parsedUrl.Hostname()
-	port := parsedUrl.Port()
+	hostname := parsedURL.Hostname()
+	port := parsedURL.Port()
 	if port == "" {
 		port = "1965"
 	}
 	host := fmt.Sprintf("%s:%s", hostname, port)
 	// Establish the underlying TCP connection.
 	dialer := &net.Dialer{
 		Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
-		KeepAlive: 10 * time.Second,
 	}
 	conn, err := dialer.Dial("tcp", host)
 	if err != nil {
-		return nil, fmt.Errorf("TCP connection failed: %w", err)
+		return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
 	}
 	// Make sure we always close the connection.
 	defer func() {
-		err := conn.Close()
-		if err != nil {
-			// Do nothing! Connection will timeout eventually if still open somehow.
-		}
+		// No need to handle error:
+		// Connection will time out eventually if still open somehow.
+		_ = conn.Close()
 	}()
 
 	// Set read and write timeouts on the TCP connection.
 	err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
 	if err != nil {
-		return nil, fmt.Errorf("Error setting connection deadline: %w", err)
+		return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
 	}
 	err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
 	if err != nil {
-		return nil, fmt.Errorf("Error setting connection deadline: %w", err)
+		return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
 	}
 
 	// Perform the TLS handshake
 	tlsConfig := &tls.Config{
-		InsecureSkipVerify: true, // Accept all TLS certs, even if insecure.
-		ServerName:         parsedUrl.Hostname(), // SNI should not include port
+		InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
+		ServerName:         parsedURL.Hostname(), // SNI should not include port
 		// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
 	}
 	tlsConn := tls.Client(conn, tlsConfig)
 	if err := tlsConn.Handshake(); err != nil {
-		return nil, fmt.Errorf("TLS handshake error: %w", err)
+		return nil, fmt.Errorf("%w: %w", ErrNetworkTLS, err)
 	}
 
 	// We read `buf`-sized chunks and add data to `data`.
@@ -94,9 +97,13 @@ func ConnectAndGetData(url string) ([]byte, error) {
 	var data []byte
 
 	// Send Gemini request to trigger server response.
-	_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url)))
+	// Fix for stupid server bug:
+	// Some servers return 'Header: 53 No proxying to other hosts or ports!'
+	// when the port is 1965 and is still specified explicitely in the URL.
+	_url, _ := ParseURL(url, "")
+	_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
 	if err != nil {
-		return nil, fmt.Errorf("Error sending network request: %w", err)
+		return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
 	}
 	// Read response bytes in len(buf) byte chunks
 	for {
@@ -105,69 +112,83 @@ func ConnectAndGetData(url string) ([]byte, error) {
 			data = append(data, buf[:n]...)
 		}
 		if len(data) > config.CONFIG.MaxResponseSize {
-			data = []byte{}
-			return nil, fmt.Errorf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize)
+			return nil, fmt.Errorf("%w: %v", ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
 		}
 		if err != nil {
-			if err == io.EOF {
+			if errors.Is(err, io.EOF) {
 				break
-			} else {
-				return nil, fmt.Errorf("Network error: %s", err)
 			}
+			return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
 		}
 	}
 	return data, nil
 }
 
-// Connect to given URL, using the Gemini protocol.
-// Mutate given Snapshot with the data or the error.
-func Visit(s *Snapshot) {
+// Visit given URL, using the Gemini protocol.
+// Mutates given Snapshot with the data.
+// In case of error, we store the error string
+// inside snapshot and return the error.
+func Visit(s *Snapshot) (err error) {
+	// Don't forget to also store error
+	// response code (if we have one)
+	defer func() {
+		if err != nil {
+			s.Error = null.StringFrom(err.Error())
+			if errors.As(err, new(*GeminiError)) {
+				s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code))
+			}
+		}
+	}()
 	data, err := ConnectAndGetData(s.URL.String())
 	if err != nil {
-		s.Error = null.StringFrom(err.Error())
-		return
+		return err
 	}
 	pageData, err := processData(data)
 	if err != nil {
-		s.Error = null.StringFrom(err.Error())
-		return
+		return err
 	}
+	//marshalled, _ := json.MarshalIndent(pageData, "", " ")
+	//fmt.Printf("%s\n", marshalled)
+	s.Header = null.StringFrom(pageData.ResponseHeader)
 	s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
 	s.MimeType = null.StringFrom(pageData.MimeType)
|
||||||
s.Lang = null.StringFrom(pageData.Lang)
|
s.Lang = null.StringFrom(pageData.Lang)
|
||||||
if pageData.GemText != "" {
|
if pageData.GemText != "" {
|
||||||
s.GemText = null.StringFrom(string(pageData.GemText))
|
s.GemText = null.StringFrom(pageData.GemText)
|
||||||
}
|
}
|
||||||
if pageData.Data != nil {
|
if pageData.Data != nil {
|
||||||
s.Data = null.ValueFrom(pageData.Data)
|
s.Data = null.ValueFrom(pageData.Data)
|
||||||
}
|
}
|
||||||
return
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update given snapshot with the
|
// processData returne results from
|
||||||
// Gemini header data: response code,
|
// parsing Gemini header data:
|
||||||
// mime type and lang (optional)
|
// Code, mime type and lang (optional)
|
||||||
func processData(data []byte) (*GeminiPageData, error) {
|
// Returns error if header was invalid
|
||||||
headers, body, err := getHeadersAndData(data)
|
func processData(data []byte) (*PageData, error) {
|
||||||
|
header, body, err := getHeadersAndData(data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
code, mimeType, lang := getMimeTypeAndLang(headers)
|
code, mimeType, lang := getMimeTypeAndLang(header)
|
||||||
geminiError := checkGeminiStatusCode(code)
|
logging.LogDebug("Header: %s", strings.TrimSpace(header))
|
||||||
if geminiError != nil {
|
if code != 20 {
|
||||||
return nil, geminiError
|
return nil, NewErrGeminiStatusCode(code, header)
|
||||||
}
|
}
|
||||||
pageData := GeminiPageData{
|
|
||||||
ResponseCode: code,
|
pageData := PageData{
|
||||||
MimeType: mimeType,
|
ResponseCode: code,
|
||||||
Lang: lang,
|
ResponseHeader: header,
|
||||||
|
MimeType: mimeType,
|
||||||
|
Lang: lang,
|
||||||
}
|
}
|
||||||
// If we've got a Gemini document, populate
|
// If we've got a Gemini document, populate
|
||||||
// `GemText` field, otherwise raw data goes to `Data`.
|
// `GemText` field, otherwise raw data goes to `Data`.
|
||||||
if mimeType == "text/gemini" {
|
if mimeType == "text/gemini" {
|
||||||
validBody, err := EnsureValidUTF8(body)
|
validBody, err := BytesToValidUTF8(body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("UTF-8 error: %w", err)
|
return nil, fmt.Errorf("%w: %w", ErrUTF8Parse, err)
|
||||||
}
|
}
|
||||||
pageData.GemText = validBody
|
pageData.GemText = validBody
|
||||||
} else {
|
} else {
|
||||||
@@ -180,14 +201,14 @@ func processData(data []byte) (*GeminiPageData, error) {
|
|||||||
// basically the first line of the response
|
// basically the first line of the response
|
||||||
// and should contain the response code,
|
// and should contain the response code,
|
||||||
// mimeType and language.
|
// mimeType and language.
|
||||||
func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) {
|
func getHeadersAndData(data []byte) (string, []byte, error) {
|
||||||
firstLineEnds := slices.Index(data, '\n')
|
firstLineEnds := slices.Index(data, '\n')
|
||||||
if firstLineEnds == -1 {
|
if firstLineEnds == -1 {
|
||||||
return "", nil, fmt.Errorf("Could not parse response header")
|
return "", nil, ErrGeminiResponseHeader
|
||||||
}
|
}
|
||||||
firstLine = string(data[:firstLineEnds])
|
firstLine := string(data[:firstLineEnds])
|
||||||
rest = data[firstLineEnds+1:]
|
rest := data[firstLineEnds+1:]
|
||||||
return string(firstLine), rest, nil
|
return firstLine, rest, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parses code, mime type and language
|
// Parses code, mime type and language
|
||||||
@@ -196,12 +217,12 @@ func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) {
|
|||||||
// `20 text/gemini lang=en` (code, mimetype, lang)
|
// `20 text/gemini lang=en` (code, mimetype, lang)
|
||||||
// `20 text/gemini` (code, mimetype)
|
// `20 text/gemini` (code, mimetype)
|
||||||
// `31 gemini://redirected.to/other/site` (code)
|
// `31 gemini://redirected.to/other/site` (code)
|
||||||
func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string) {
|
func getMimeTypeAndLang(headers string) (int, string, string) {
|
||||||
// Regex that parses code, mimetype & lang
|
// Regex that parses code, mimetype & optional charset/lang parameters
|
||||||
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`)
|
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`)
|
||||||
matches := re.FindStringSubmatch(headers)
|
matches := re.FindStringSubmatch(headers)
|
||||||
if matches == nil || len(matches) <= 1 {
|
if matches == nil || len(matches) <= 1 {
|
||||||
// Try to get code at least.
|
// Try to get code at least
|
||||||
re := regexp.MustCompile(`^(\d+)\s+`)
|
re := regexp.MustCompile(`^(\d+)\s+`)
|
||||||
matches := re.FindStringSubmatch(headers)
|
matches := re.FindStringSubmatch(headers)
|
||||||
if matches == nil || len(matches) <= 1 {
|
if matches == nil || len(matches) <= 1 {
|
||||||
@@ -217,7 +238,7 @@ func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string)
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, "", ""
|
return 0, "", ""
|
||||||
}
|
}
|
||||||
mimeType = matches[2]
|
mimeType := matches[2]
|
||||||
lang = matches[4]
|
param := matches[3] // This will capture either charset or lang value
|
||||||
return code, mimeType, lang
|
return code, mimeType, param
|
||||||
}
|
}
|
||||||
|
|||||||
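For readers skimming the header-parsing change above, here is a small standalone sketch (not part of this changeset) that exercises the same regular expression on the example headers quoted in the comments:

    package main

    import (
        "fmt"
        "regexp"
    )

    // Same pattern as in the diff: captures status code, MIME type,
    // and an optional charset= or lang= parameter.
    var header = regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`)

    func main() {
        for _, h := range []string{
            "20 text/gemini; lang=en",
            "20 text/plain; charset=utf-8",
            "31 gemini://redirected.to/other/site",
        } {
            // The redirect header yields nil here; the crawler then
            // falls back to the code-only regex shown in the diff.
            fmt.Println(header.FindStringSubmatch(h))
        }
    }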
@@ -6,6 +6,7 @@ import (

 // Test for input: `20 text/gemini`
 func TestGetMimeTypeAndLang1(t *testing.T) {
+    t.Parallel()
     code, mimeType, lang := getMimeTypeAndLang("20 text/gemini")
     if code != 20 || mimeType != "text/gemini" || lang != "" {
         t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -13,13 +14,39 @@ func TestGetMimeTypeAndLang1(t *testing.T) {
     }
 }

 func TestGetMimeTypeAndLang11(t *testing.T) {
+    t.Parallel()
     code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n")
     if code != 20 || mimeType != "text/gemini" || lang != "" {
         t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
     }
 }

+func TestGetMimeTypeAndLang12(t *testing.T) {
+    t.Parallel()
+    code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8")
+    if code != 20 || mimeType != "text/plain" || lang != "utf-8" {
+        t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
+    }
+}
+
+func TestGetMimeTypeAndLang13(t *testing.T) {
+    t.Parallel()
+    code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8")
+    if code != 20 || mimeType != "text/gemini" || lang != "utf-8" {
+        t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
+    }
+}
+
 func TestGetTypeAndLang2(t *testing.T) {
+    t.Parallel()
+    code, mimeType, lang := getMimeTypeAndLang("20 text/gemini charset=en")
+    if code != 20 || mimeType != "text/gemini" || lang != "en" {
+        t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
+    }
+}
+
+func TestGetTypeAndLang21(t *testing.T) {
+    t.Parallel()
     code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
     if code != 20 || mimeType != "text/gemini" || lang != "en" {
         t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -27,6 +54,7 @@ func TestGetTypeAndLang2(t *testing.T) {
     }
 }

 func TestGetMimeTypeAndLang3(t *testing.T) {
+    t.Parallel()
     code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page")
     if code != 31 || mimeType != "" || lang != "" {
         t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -34,6 +62,7 @@ func TestGetMimeTypeAndLang3(t *testing.T) {
     }
 }

 func TestGetMimeTypeAndLang4(t *testing.T) {
+    t.Parallel()
     code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd")
     if code != 0 || mimeType != "" || lang != "" {
         t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -41,6 +70,7 @@ func TestGetMimeTypeAndLang4(t *testing.T) {
     }
 }

 func TestGetMimeTypeAndLang5(t *testing.T) {
+    t.Parallel()
     code, mimeType, lang := getMimeTypeAndLang("")
     if code != 0 || mimeType != "" || lang != "" {
         t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -1,10 +1,13 @@
 package gemini

 import (
+    "encoding/json"
     "fmt"
-    "gemini-grc/logging"
+    "gemini-grc/config"
     "os"

+    "gemini-grc/logging"
+
     _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
     "github.com/jmoiron/sqlx"
 )
@@ -33,11 +36,41 @@ func ConnectToDB() *sqlx.DB {
     return db
 }

-func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
+func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error {
+    marshalled, err := json.MarshalIndent(s, "", " ")
+    if err != nil {
+        panic(fmt.Sprintf("JSON serialization error for %v", s))
+    }
+    if config.CONFIG.DryRun {
+        logging.LogDebug("Would insert (if new) snapshot %s", marshalled)
+        return nil
+    }
     query := `
-    INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
-    VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
-    ON CONFLICT (uid) DO UPDATE SET
+    INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
+    VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
+    ON CONFLICT (url) DO NOTHING
+    `
+    _, err = tx.NamedExec(query, s)
+    if err != nil {
+        return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err)
+    }
+    return nil
+}
+
+func UpsertSnapshot(id int, tx *sqlx.Tx, s *Snapshot) error {
+    marshalled, err := json.MarshalIndent(s, "", " ")
+    if err != nil {
+        panic(fmt.Sprintf("JSON serialization error for %v", s))
+    }
+    if config.CONFIG.DryRun {
+        logging.LogDebug("[%d] Would upsert snapshot %s", id, marshalled)
+        return nil
+    }
+
+    query := `
+    INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
+    VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
+    ON CONFLICT (url) DO UPDATE SET
     url = EXCLUDED.url,
     host = EXCLUDED.host,
     timestamp = EXCLUDED.timestamp,
@@ -47,24 +80,30 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
     links = EXCLUDED.links,
     lang = EXCLUDED.lang,
     response_code = EXCLUDED.response_code,
-    error = EXCLUDED.error
-    `
-    _, err := tx.NamedExec(query, s)
+    error = EXCLUDED.error`
+    _, err = tx.NamedExec(query, s)
+    //if err != nil {
+    //    logging.LogError("[%s] GeminiError upserting snapshot: %w", s.URL, err)
+    //    panic("This shouldn't happen")
+    //}
     if err != nil {
-        logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
-        return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
+        return fmt.Errorf("[%s] GeminiError upserting snapshot: %w", s.URL, err)
     }
     return nil
 }

 func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
+    if config.CONFIG.DryRun {
+        return nil
+    }
+
     // Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe
     const batchSize = 5000

     query := `
-    INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
-    VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
-    ON CONFLICT (uid) DO NOTHING
+    INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
+    VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
+    ON CONFLICT (url) DO NOTHING
     `

     for i := 0; i < len(snapshots); i += batchSize {
@@ -77,7 +116,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {

         _, err := tx.NamedExec(query, batch)
         if err != nil {
-            logging.LogError("Error batch inserting snapshots: %w", err)
+            logging.LogError("GeminiError batch inserting snapshots: %w", err)
             return fmt.Errorf("DB error: %w", err)
         }
     }
@@ -86,14 +125,17 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
 }

 func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
+    if config.CONFIG.DryRun {
+        return nil
+    }
     query := `
-    INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
-    VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
-    ON CONFLICT (uid) DO NOTHING
+    INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
+    VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
+    ON CONFLICT (url) DO NOTHING
     `
     _, err := tx.NamedExec(query, snapshots)
     if err != nil {
-        logging.LogError("Error batch inserting snapshots: %w", err)
+        logging.LogError("GeminiError batch inserting snapshots: %w", err)
         return fmt.Errorf("DB error: %w", err)
     }
     return nil
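The batch size above comes from PostgreSQL's limit of 65535 bind parameters per statement: with 11 named columns the ceiling is 65535/11 ≈ 5957 rows (with the uid column now removed it would be 65535/10 ≈ 6553), so 5000 stays safely below either. A minimal sketch of the slicing logic only; the type and function names are placeholders, not this repository's code:

    // Row and insertBatch are illustrative stand-ins.
    type Row struct{ URL string }

    const batchSize = 5000

    func saveInBatches(rows []Row, insertBatch func([]Row) error) error {
        for i := 0; i < len(rows); i += batchSize {
            end := i + batchSize
            if end > len(rows) {
                end = len(rows) // last batch may be shorter
            }
            if err := insertBatch(rows[i:end]); err != nil {
                return err
            }
        }
        return nil
    }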
@@ -2,32 +2,58 @@ package gemini

 import (
     "bytes"
+    "errors"
     "fmt"
     "io"
     "unicode/utf8"

     "golang.org/x/text/encoding/charmap"
+    "golang.org/x/text/encoding/japanese"
+    "golang.org/x/text/encoding/korean"
     "golang.org/x/text/transform"
 )

-func EnsureValidUTF8(input []byte) (string, error) {
-    // Remove NULL byte 0x00
-    inputNoNull := bytes.ReplaceAll(input, []byte{0}, nil)
-    isValidUTF8 := utf8.Valid(inputNoNull)
-    if !isValidUTF8 {
-        encodings := []transform.Transformer{
-            charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1
-            charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc
-            // TODO: Try more encodings?
+var (
+    ErrInputTooLarge = errors.New("input too large")
+    ErrUTF8Conversion = errors.New("UTF-8 conversion error")
+)
+
+func BytesToValidUTF8(input []byte) (string, error) {
+    if len(input) == 0 {
+        return "", nil
+    }
+    const maxSize = 10 * 1024 * 1024 // 10MB
+    if len(input) > maxSize {
+        return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize)
+    }
+    // Remove NULL byte 0x00 (ReplaceAll accepts slices)
+    inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
+    if utf8.Valid(inputNoNull) {
+        return string(inputNoNull), nil
+    }
+    encodings := []transform.Transformer{
+        charmap.ISO8859_1.NewDecoder(),
+        charmap.ISO8859_7.NewDecoder(),
+        charmap.Windows1250.NewDecoder(), // Central European
+        charmap.Windows1251.NewDecoder(), // Cyrillic
+        charmap.Windows1252.NewDecoder(),
+        charmap.Windows1256.NewDecoder(), // Arabic
+        japanese.EUCJP.NewDecoder(), // Japanese
+        korean.EUCKR.NewDecoder(), // Korean
+    }
+    // First successful conversion wins.
+    var lastErr error
+    for _, encoding := range encodings {
+        reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
+        result, err := io.ReadAll(reader)
+        if err != nil {
+            lastErr = err
+            continue
         }
-        for _, encoding := range encodings {
-            reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
-            result, err := io.ReadAll(reader)
-            if err != nil {
-                return "", fmt.Errorf("UTF-8 error: %w", err)
-            }
+        if utf8.Valid(result) {
             return string(result), nil
         }
     }
-    return string(inputNoNull), nil
+    return "", fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr)
 }
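To illustrate the fallback decoding that BytesToValidUTF8 now performs, here is a self-contained sketch (assumed input bytes, not repository code) that converts a Windows-1252 sequence to UTF-8 with the same golang.org/x/text APIs:

    package main

    import (
        "bytes"
        "fmt"
        "io"

        "golang.org/x/text/encoding/charmap"
        "golang.org/x/text/transform"
    )

    func main() {
        // 0x93 and 0x94 are curly quotes in Windows-1252 and are invalid UTF-8 bytes.
        input := []byte{0x93, 'h', 'i', 0x94}
        r := transform.NewReader(bytes.NewReader(input), charmap.Windows1252.NewDecoder())
        out, err := io.ReadAll(r)
        if err != nil {
            panic(err)
        }
        fmt.Printf("%q\n", out) // "\u201chi\u201d"
    }

The crawler tries each decoder in order and keeps the first result that validates as UTF-8, which is why the list order matters.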
@@ -4,9 +4,10 @@ import "testing"

 // Make sure NULL bytes are removed
 func TestEnsureValidUTF8(t *testing.T) {
+    t.Parallel()
     // Create a string with a null byte
     strWithNull := "Hello" + string('\x00') + "world"
-    result, _ := EnsureValidUTF8([]byte(strWithNull))
+    result, _ := BytesToValidUTF8([]byte(strWithNull))
     if result != "Helloworld" {
         t.Errorf("Expected string without NULL byte, got %s", result)
     }
@@ -2,16 +2,18 @@ package gemini

 import (
     "fmt"
-    "gemini-grc/logging"
     "strings"
     "sync"

+    "gemini-grc/logging"
 )

-// key: "host:port" (string)
-// value:
-// empty []string if no robots data, or
-// list of URL prefixes ([]string) in robots
-var RobotsCache sync.Map
+// RobotsCache is a map of blocked URLs
+// key: URL
+// value: []string list of disallowed URLs
+// If a key has no blocked URLs, an empty
+// list is stored for caching.
+var RobotsCache sync.Map //nolint:gochecknoglobals

 func populateBlacklist(key string) (entries []string) {
     // We either store an empty list when
@@ -40,43 +42,40 @@ func populateBlacklist(key string) (entries []string) {
     // According to spec, the first is correct,
     // however let's be lenient
     var data string
-    if robotsData.MimeType == "text/plain" {
+    switch {
+    case robotsData.MimeType == "text/plain":
         data = string(robotsData.Data)
-    } else if robotsData.MimeType == "text/gemini" {
+    case robotsData.MimeType == "text/gemini":
         data = robotsData.GemText
-    } else {
+    default:
         return []string{}
     }
-    entries = ParseRobotsTxt(string(data), key)
+    entries = ParseRobotsTxt(data, key)
     return entries
 }

-// Check if the snapshot URL matches
+// RobotMatch checks if the snapshot URL matches
 // a robots.txt allow rule.
-func RobotMatch(s *Snapshot) bool {
-    logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
-    key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
-    v, ok := RobotsCache.Load(key)
-    if ok == false {
+func RobotMatch(url URL) bool {
+    key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
+    logging.LogDebug("Checking robots.txt cache for %s", key)
+    var disallowedURLs []string
+    cacheEntries, ok := RobotsCache.Load(key)
+    if !ok {
         // First time check, populate robot cache
-        logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
-        disallowedURLs := populateBlacklist(key)
-        for _, url := range disallowedURLs {
-            if strings.HasPrefix(s.URL.String(), url) {
-                logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
-                return true
-            }
-        }
+        disallowedURLs = populateBlacklist(key)
+        logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
     } else {
-        if len(v.([]string)) == 0 {
-            logging.LogDebug("No robots.txt or no rules, allowed")
-            return false
-        }
+        disallowedURLs, _ = cacheEntries.([]string)
     }
-    for _, url := range v.([]string) {
-        if strings.HasPrefix(s.URL.String(), url) {
-            logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
-            return true
-        }
+    return isURLblocked(disallowedURLs, url.Full)
+}
+
+func isURLblocked(disallowedURLs []string, input string) bool {
+    for _, url := range disallowedURLs {
+        if strings.HasPrefix(strings.ToLower(input), url) {
+            logging.LogDebug("robots.txt match: %s matches %s", input, url)
+            return true
         }
     }
     return false
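RobotMatch follows a populate-on-miss caching pattern over sync.Map: even a host with no robots.txt gets an empty slice stored, so the lookup is not repeated. A compact generic sketch of the same idea (the names here are illustrative, not the repository's):

    package main

    import "sync"

    var robotsCache sync.Map

    // fetchDisallowed stands in for populateBlacklist, which downloads
    // and parses robots.txt for the host on the first miss.
    func disallowedFor(key string, fetchDisallowed func(string) []string) []string {
        if v, ok := robotsCache.Load(key); ok {
            return v.([]string)
        }
        entries := fetchDisallowed(key)
        // Store even an empty slice so the miss is not repeated;
        // LoadOrStore keeps the first value if another goroutine raced us.
        actual, _ := robotsCache.LoadOrStore(key, entries)
        return actual.([]string)
    }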
@@ -5,7 +5,7 @@ import (
     "strings"
 )

-// Takes robots.txt content and a host, and
+// ParseRobotsTxt takes robots.txt content and a host, and
 // returns a list of full URLs that shouldn't
 // be visited.
 // TODO Also take into account the user agent?
@@ -6,6 +6,7 @@ import (
 )

 func TestParseRobotsTxt(t *testing.T) {
+    t.Parallel()
     input := `User-agent: *
 Disallow: /cgi-bin/wp.cgi/view
 Disallow: /cgi-bin/wp.cgi/media
@@ -26,6 +27,7 @@ Disallow: /admin/`
 }

 func TestParseRobotsTxtEmpty(t *testing.T) {
+    t.Parallel()
     input := ``

     result := ParseRobotsTxt(input, "example.com")
@@ -34,3 +36,20 @@ func TestParseRobotsTxtEmpty(t *testing.T) {
         t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
     }
 }
+
+func TestIsURLblocked(t *testing.T) {
+    t.Parallel()
+    disallowedURLs := []string{
+        "gemini://example.com/cgi-bin/wp.cgi/view",
+        "gemini://example.com/cgi-bin/wp.cgi/media",
+        "gemini://example.com/admin/",
+    }
+    url := "gemini://example.com/admin/index.html"
+    if !isURLblocked(disallowedURLs, url) {
+        t.Errorf("Expected %s to be blocked", url)
+    }
+    url = "gemini://example1.com/admin/index.html"
+    if isURLblocked(disallowedURLs, url) {
+        t.Errorf("expected %s to not be blocked", url)
+    }
+}
@@ -4,15 +4,13 @@ import (
     "database/sql/driver"
     "encoding/json"
     "fmt"
-    "gemini-grc/logging"
-    "strings"

     "github.com/guregu/null/v5"
 )

-type LinkList []GeminiUrl
+type LinkList []URL

-func (l LinkList) Value() (driver.Value, error) {
+func (l *LinkList) Value() (driver.Value, error) {
     return json.Marshal(l)
 }

@@ -29,46 +27,17 @@ func (l *LinkList) Scan(value interface{}) error {
 }

 type Snapshot struct {
     ID int `db:"id" json:"id,omitempty"`
-    UID string `db:"uid" json:"uid,omitempty"`
-    URL GeminiUrl `db:"url" json:"url,omitempty"`
+    //UID string `db:"uid" json:"uid,omitempty"`
+    URL URL `db:"url" json:"url,omitempty"`
     Host string `db:"host" json:"host,omitempty"`
     Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
     MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
     Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
     GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
-    Links *LinkList `db:"links" json:"links,omitempty"`
-    Lang null.String `db:"lang" json:"lang,omitempty"`
-    ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
-    Error null.String `db:"error" json:"error,omitempty"` // On network errors only
-}
+    Header null.String `db:"header" json:"header,omitempty"` // Response header.
+    Links null.Value[LinkList] `db:"links" json:"links,omitempty"`
+    Lang null.String `db:"lang" json:"lang,omitempty"`
+    ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
+    Error null.String `db:"error" json:"error,omitempty"` // On network errors only

-func SnapshotToJSON(g Snapshot) string {
-    // Serialize the Person struct to JSON
-    jsonData, err := json.MarshalIndent(g, "", "\t")
-    if err != nil {
-        logging.LogError("Error serializing to JSON: %w", err)
-    }
-    return string(jsonData)
-}
-
-func SnapshotFromJSON(input string) Snapshot {
-    var snapshot Snapshot
-    err := json.Unmarshal([]byte(input), &snapshot)
-    if err != nil {
-        logging.LogError("Error deserializing from JSON: %w", err)
-    }
-    return snapshot
-}
-
-func ShouldPersistSnapshot(result *Snapshot) bool {
-    if !result.MimeType.Valid {
-        return false
-    }
-    if result.MimeType.String == "text/gemini" ||
-        strings.HasPrefix(result.MimeType.String, "image/") ||
-        strings.HasPrefix(result.MimeType.String, "text/") {
-        return true
-    }
-    return false
 }
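The LinkList changes keep the JSON-in-a-column pattern: Value serializes the slice for writes and Scan decodes it on reads. A minimal self-contained version of that pattern, independent of this repository's URL type:

    package main

    import (
        "database/sql/driver"
        "encoding/json"
        "errors"
    )

    type StringList []string

    // Value implements driver.Valuer: the slice is stored as a JSON blob.
    func (l StringList) Value() (driver.Value, error) {
        return json.Marshal(l)
    }

    // Scan implements sql.Scanner: the JSON blob is decoded back into the slice.
    func (l *StringList) Scan(value interface{}) error {
        b, ok := value.([]byte)
        if !ok {
            return errors.New("expected []byte from the driver")
        }
        return json.Unmarshal(b, l)
    }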
gemini/worker.go
@@ -1,36 +1,38 @@
 package gemini

 import (
+    "errors"
     "fmt"
-    "gemini-grc/config"
-    "gemini-grc/logging"
-    "gemini-grc/uid"
-    "gemini-grc/util"
     "strings"
     "time"

+    "gemini-grc/config"
+    "gemini-grc/logging"
+    "gemini-grc/util"
+
     "github.com/guregu/null/v5"
     "github.com/jmoiron/sqlx"
 )

 func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
     logging.LogInfo("Spawning %d workers", numOfWorkers)
-    for i := 0; i < numOfWorkers; i++ {
+    for i := range numOfWorkers {
         go func(i int) {
             for {
-                runWorker(i, db)
+                RunWorker(i, db, nil)
             }
         }(i)
     }
 }

-func runWorker(id int, db *sqlx.DB) {
-    // Start the DB transaction
+func RunWorker(id int, db *sqlx.DB, url *string) {
+    // Each worker runs within a DB transaction.
     tx, err := db.Beginx()
     if err != nil {
         logging.LogError("Failed to begin transaction: %w", err)
     }

+    // Commit/rollback at the end
     defer func() {
         err = tx.Commit()
         if err != nil {
@@ -42,66 +44,97 @@ func runWorker(id int, db *sqlx.DB) {
         }
     }()

-    snapshots, err := GetRandomSnapshotsDistinctHosts(tx)
+    var snapshots []Snapshot

-    if err != nil {
-        logging.LogError("[%d] Error retrieving snapshot: %w", id, err)
-        time.Sleep(10 * time.Second)
-        return
-    } else if len(snapshots) == 0 {
-        logging.LogInfo("[%d] No remaining snapshots to visit.", id)
-        time.Sleep(1 * time.Minute)
-        return
+    // If not given a specific URL,
+    // get some random ones to visit from DB.
+    if url == nil {
+        snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
+        if err != nil {
+            logging.LogError("[%d] GeminiError retrieving snapshot: %w", id, err)
+            panic("This should never happen")
+        } else if len(snapshots) == 0 {
+            logging.LogInfo("[%d] No snapshots to visit.", id)
+            time.Sleep(1 * time.Minute)
+            return
+        }
+    } else {
+        snapshotURL, err := ParseURL(*url, "")
+        if err != nil {
+            logging.LogError("Invalid URL given: " + *url)
+            return
+        }
+        snapshots = []Snapshot{{
+            //UID: uid.UID(),
+            URL: *snapshotURL,
+            Host: snapshotURL.Hostname,
+            Timestamp: null.TimeFrom(time.Now()),
+        }}
     }

+    // Start visiting URLs.
     total := len(snapshots)
     for i, s := range snapshots {
-        logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
+        logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
+        // We differentiate between errors:
+        // Unexpected errors are the ones returned from the following function.
+        // If an error is unexpected (which should never happen) we panic.
+        // Expected errors are stored as strings within the snapshot,
+        // so that they can also be stored in DB.
         err = workOnSnapshot(id, tx, &s)
         if err != nil {
-            logging.LogError("[%d] [%s] Error %w", id, s.URL, err)
+            logging.LogError("[%d] [%s] Unexpected GeminiError %w", id, s.URL.String(), err)
             util.PrintStackAndPanic(err)
         }
         if s.Error.Valid {
-            logging.LogWarn("[%d] [%s] Error: %v", id, s.URL, fmt.Errorf(s.Error.String))
+            logging.LogWarn("[%d] Error: %v", id, s.Error.String)
         }
-        logging.LogDebug("[%d] Done %d/%d.", id, i, total)
+        logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
     }
     logging.LogInfo("[%d] Worker done.", id)
 }

+// workOnSnapshot visits a URL and stores the result.
+// errors should be returned only if they are unexpected.
 func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
+    if IsBlacklisted(s.URL) {
+        logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
+        return nil
+    }
+
     // If URL matches a robots.txt disallow line,
     // add it as an error so next time it won't be
     // crawled.
-    if RobotMatch(s) {
-        s.Error = null.StringFrom("robots.txt disallow match")
-        err = SaveSnapshotToDB(tx, s)
+    if RobotMatch(s.URL) {
+        s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
+        err = UpsertSnapshot(id, tx, s)
         if err != nil {
-            return fmt.Errorf("[%d] DB Error: %w", id, err)
+            return fmt.Errorf("[%d] %w", id, err)
         }
         return nil
     }

+    // Resolve IP address via DNS
     IPs, err := getHostIPAddresses(s.Host)
     if err != nil {
-        s.Error = null.StringFrom("DNS Resolve error")
-        err = SaveSnapshotToDB(tx, s)
+        s.Error = null.StringFrom(err.Error())
+        err = UpsertSnapshot(id, tx, s)
         if err != nil {
-            return fmt.Errorf("[%d] DB Error: %w", id, err)
+            return fmt.Errorf("[%d] %w", id, err)
         }
         return nil
     }

-    // If the host's ip is in the connections pool,
-    // stop and add the url in the queue later.
+    // If the host's ip is in the connections pool we stop
     IpPool.Lock.RLock()
-    logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL)
+    logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
     for _, ip := range IPs {
         _, ok := IpPool.IPs[ip]
         if ok {
-            logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL)
+            logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
             IpPool.Lock.RUnlock()
-            time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
+            time.Sleep(1 * time.Second) // Avoid flood-retrying
             return nil
         }
     }
@@ -109,73 +142,115 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {

     AddIPsToPool(IPs)

-    url := s.URL.String()
-    logging.LogDebug("[%d] Dialing %s", id, url)
-    Visit(s)
-    logging.LogDebug("[%d] Finished dialing.", id)
+    // After finishing, remove the host IPs from
+    // the connections pool, with a small delay
+    // to avoid potentially hitting the same IP quickly.
+    defer func() {

-    go func() {
         time.Sleep(5 * time.Second)
         RemoveIPsFromPool(IPs)
     }()

-    if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
-        logging.LogDebug("[%d] [%s] Processing", id, url)
-        s = ProcessGemini(s)
-    }
-    logging.LogDebug("[%d] Saving", id)
-    err = SaveSnapshotToDB(tx, s)
+    url := s.URL.String()
+    logging.LogDebug("[%d] Dialing %s", id, url)
+
+    err = Visit(s)
+
     if err != nil {
-        return fmt.Errorf("[%d] DB Error: %w", id, err)
+        if !IsKnownError(err) {
+            logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
+            return err
+        }
+        // Check if error is redirection, and handle it
+        s.Error = null.StringFrom(err.Error())
+        if errors.As(err, new(*GeminiError)) &&
+            err.(*GeminiError).Msg == "redirect" {
+            err = handleRedirection(id, tx, s)
+            if err != nil {
+                return err
+            }
+        }
+    }
+    logging.LogInfo("[%d] Done, response code %d.", id, s.ResponseCode.ValueOrZero())
+
+    // If this is a gemini page, parse possible links inside
+    if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
+        links := GetPageLinks(s.URL, s.GemText.String)
+        logging.LogDebug("[%d] Found %d links", id, len(links))
+        if len(links) > 0 {
+            s.Links = null.ValueFrom(links)
+        }
+    } else {
+        logging.LogDebug("[%d] Not looking for page links", id)
     }

-    // Store links in batch
-    if s.Links != nil {
-        var batchSnapshots []*Snapshot
-        timestamp := null.TimeFrom(time.Now())
-        for _, link := range *s.Links {
-            if shouldPersistURL(tx, link) {
+    err = UpsertSnapshot(id, tx, s)
+    if err != nil {
+        return err
+    }
+
+    err = storeLinks(tx, s)
+    if err != nil {
+        return err
+    }
+    return nil
+}
+
+func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
+    if s.Links.Valid {
+        var batchSnapshots []*Snapshot
+        for _, link := range s.Links.ValueOrZero() {
+            if shouldPersistURL(link) {
                 newSnapshot := &Snapshot{
-                    UID: uid.UID(),
+                    //UID: uid.UID(),
                     URL: link,
                     Host: link.Hostname,
-                    Timestamp: timestamp,
+                    Timestamp: null.TimeFrom(time.Now()),
                 }
                 batchSnapshots = append(batchSnapshots, newSnapshot)
             }
         }

         if len(batchSnapshots) > 0 {
-            logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
-            err = SaveLinksToDBinBatches(tx, batchSnapshots)
+            err := SaveLinksToDBinBatches(tx, batchSnapshots)
             if err != nil {
-                return fmt.Errorf("[%d] DB Error: %w", id, err)
+                return err
             }
         }
     }
     return nil
 }

-// Should we save the given URL for crawling?
-func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool {
-    if !strings.HasPrefix(u.String(), "gemini://") {
-        return false
-    }
-    query := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)`
-    var exists bool
-    err := tx.Get(&exists, query, u.String())
-    if err != nil {
-        fmt.Println("Error executing query:", err)
-        return false
-    }
-    return !exists
+// shouldPersistURL returns true if we
+// should save the URL in the DB.
+// Only gemini:// urls are saved.
+func shouldPersistURL(u URL) bool {
+    return strings.HasPrefix(u.String(), "gemini://")
+}
+
+func handleRedirection(id int, tx *sqlx.Tx, s *Snapshot) error {
+    newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
+    if err != nil {
+        return err
+    }
+    logging.LogDebug("[%d] Page redirects to %s", id, newURL)
+    // Insert fresh snapshot with new URL
+    snapshot := &Snapshot{
+        //UID: uid.UID(),
+        URL: *newURL,
+        Host: newURL.Hostname,
+        Timestamp: null.TimeFrom(time.Now()),
+    }
+    logging.LogDebug("[%d] Saving empty snapshot for %s", id, snapshot.URL.String())
+    err = SaveSnapshotIfNew(tx, snapshot)
+    if err != nil {
+        return err
+    }
+    return nil
 }

 func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
     // Old, unoptimized query
-    //
     // query := `
     // SELECT DISTINCT ON (host) *
     // FROM snapshots
     // WHERE response_code IS NULL
@@ -184,20 +259,28 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
     // LIMIT $1
     // `
     query := `
-    WITH RankedSnapshots AS (
-    SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
-    links, lang, response_code, error,
-    ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
-    FROM snapshots
-    WHERE response_code IS NULL
-    AND error IS NULL
-    )
-    SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
-    links, lang, response_code, error
-    FROM RankedSnapshots
-    WHERE rn = 1
-    LIMIT $1
-    `
+    SELECT *
+    FROM snapshots
+    WHERE response_code IS NULL
+    AND error IS NULL
+    ORDER BY RANDOM()
+    LIMIT $1
+    `
+    //query := `
+    // WITH RankedSnapshots AS (
+    // SELECT id, url, host, timestamp, mimetype, data, gemtext,
+    // links, lang, response_code, error,
+    // ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
+    // FROM snapshots
+    // WHERE response_code IS NULL
+    // AND error IS NULL
+    // )
+    // SELECT id, url, host, timestamp, mimetype, data, gemtext,
+    // links, lang, response_code, error
+    // FROM RankedSnapshots
+    // WHERE rn = 1
+    // LIMIT $1
+    //`
     var snapshots []Snapshot
     err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
     if err != nil {
@@ -205,3 +288,18 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
     }
     return snapshots, nil
 }
+
+func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) {
+    query := `
+    SELECT *
+    FROM snapshots
+    WHERE url=$1
+    LIMIT 1
+    `
+    var snapshots []Snapshot
+    err := tx.Select(&snapshots, query, url)
+    if err != nil {
+        return nil, err
+    }
+    return snapshots, nil
+}
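With the new RunWorker signature, a single URL can be crawled outside the worker pool. A usage sketch only; the calling code is not shown in this changeset and the target URL is just an example:

    // Hypothetical one-shot invocation.
    db := ConnectToDB()
    target := "gemini://geminiprotocol.net/"
    RunWorker(0, db, &target) // visit one specific URL
    // RunWorker(0, db, nil)  // or pick a random batch from the snapshots table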
go.mod
@@ -3,23 +3,27 @@ module gemini-grc
 go 1.23.1

 require (
-    github.com/jaevor/go-nanoid v1.4.0
+    github.com/guregu/null/v5 v5.0.0
+    github.com/jackc/pgx/v5 v5.7.1
+    github.com/jmoiron/sqlx v1.4.0
+    github.com/matoous/go-nanoid/v2 v2.1.0
     github.com/rs/zerolog v1.33.0
+    github.com/stretchr/testify v1.9.0
+    golang.org/x/text v0.19.0
 )

 require (
-    github.com/guregu/null/v5 v5.0.0 // indirect
+    github.com/davecgh/go-spew v1.1.1 // indirect
     github.com/jackc/pgpassfile v1.0.0 // indirect
     github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
-    github.com/jackc/pgx/v5 v5.7.1 // indirect
     github.com/jackc/puddle/v2 v2.2.2 // indirect
-    github.com/jmoiron/sqlx v1.4.0 // indirect
+    github.com/kr/text v0.2.0 // indirect
     github.com/mattn/go-colorable v0.1.13 // indirect
     github.com/mattn/go-isatty v0.0.20 // indirect
+    github.com/pmezard/go-difflib v1.0.0 // indirect
+    github.com/rogpeppe/go-internal v1.13.1 // indirect
     golang.org/x/crypto v0.27.0 // indirect
-    golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 // indirect
-    golang.org/x/net v0.27.0 // indirect
     golang.org/x/sync v0.8.0 // indirect
     golang.org/x/sys v0.25.0 // indirect
-    golang.org/x/text v0.18.0 // indirect
+    gopkg.in/yaml.v3 v3.0.1 // indirect
 )
go.sum
@@ -1,8 +1,11 @@
+filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
 filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
 github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
+github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/gabriel-vasile/mimetype v1.4.5 h1:J7wGKdGu33ocBOhGy0z653k/lFKLFDPJMG8Gql0kxn4=
-github.com/gabriel-vasile/mimetype v1.4.5/go.mod h1:ibHel+/kbxn9x2407k1izTA1S81ku1z/DlgOW2QE0M4=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y=
 github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
 github.com/guregu/null/v5 v5.0.0 h1:PRxjqyOekS11W+w/7Vfz6jgJE/BCwELWtgvOJzddimw=
@@ -15,32 +18,39 @@ github.com/jackc/pgx/v5 v5.7.1 h1:x7SYsPBYDkHDksogeSmZZ5xzThcTgRz++I5E+ePFUcs=
 github.com/jackc/pgx/v5 v5.7.1/go.mod h1:e7O26IywZZ+naJtWWos6i6fvWK+29etgITqrqHLfoZA=
 github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
 github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
-github.com/jaevor/go-nanoid v1.4.0 h1:mPz0oi3CrQyEtRxeRq927HHtZCJAAtZ7zdy7vOkrvWs=
-github.com/jaevor/go-nanoid v1.4.0/go.mod h1:GIpPtsvl3eSBsjjIEFQdzzgpi50+Bo1Luk+aYlbJzlc=
 github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
 github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
+github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
+github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
 github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
+github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE=
+github.com/matoous/go-nanoid/v2 v2.1.0/go.mod h1:KlbGNQ+FhrUNIHUxZdL63t7tl4LaPkZNpUULS8H4uVM=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
 github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
 github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
 github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
 github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
+github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
 github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
 github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
 github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A=
 golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70=
-golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 h1:1wqE9dj9NpSm04INVsJhhEUzhuDVjbcyKH91sVyPATw=
-golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8=
-golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
-golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
 golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
 golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -48,7 +58,11 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
 golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224=
+golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
|
golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||||
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|||||||

52
http/http.go
Normal file
@@ -0,0 +1,52 @@
+package http
+
+import (
+	"fmt"
+	"gemini-grc/logging"
+	_ "gemini-grc/logging"
+	"net/http"
+	"time"
+)
+
+func CreateServer(listenAddr string) *http.Server {
+	mux := http.NewServeMux()
+	mux.HandleFunc("GET /ping", wrapForError(getPing))
+
+	server := &http.Server{
+		Addr:              listenAddr,
+		Handler:           mux,
+		ReadHeaderTimeout: 10 * time.Second,
+	}
+
+	go func() {
+		// Start the server. Blocking call.
+		logging.LogInfo("HTTP server listening on %s", listenAddr)
+		if err := server.ListenAndServe(); err != nil {
+			panic(fmt.Sprintf("Server failed to start: %s", err))
+		}
+	}()
+	return server
+}
+
+func wrapForError(f func(http.ResponseWriter, *http.Request) error) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		err := f(w, r)
+		if err != nil {
+			code := http.StatusInternalServerError
+			logging.LogWarn("Error while handling request: %d %s", code, err)
+			http.Error(w, http.StatusText(code), code)
+		}
+	}
+}
+
+func getPing(w http.ResponseWriter, r *http.Request) error {
+	method := r.Method
+	url := r.URL.String()
+	path := r.URL.Path
+	response := fmt.Sprintf("Pong %s %s %s", method, url, path)
+	_, err := w.Write([]byte(response))
+	if err != nil {
+		return fmt.Errorf("failed to write response: %w", err)
+	}
+	return nil
+}
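
Not part of the diff: because wrapForError turns a handler's returned error into a plain 500 response, the /ping wiring above can be checked without starting the real server. Below is a minimal sketch of such a test using net/http/httptest, assuming it sits next to http.go in the same package; the file name http_test.go and the test body are illustrative, not something this change adds.

package http

// Hypothetical http_test.go: drives wrapForError(getPing) through httptest.
import (
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"
)

func TestGetPing(t *testing.T) {
	// wrapForError returns an http.HandlerFunc, so ServeHTTP is available directly.
	handler := wrapForError(getPing)

	req := httptest.NewRequest(http.MethodGet, "/ping", nil)
	rec := httptest.NewRecorder()

	handler.ServeHTTP(rec, req)

	if rec.Code != http.StatusOK {
		t.Fatalf("expected 200, got %d", rec.Code)
	}
	// getPing writes "Pong <method> <url> <path>", so the body should start with "Pong".
	if !strings.HasPrefix(rec.Body.String(), "Pong") {
		t.Fatalf("unexpected body: %q", rec.Body.String())
	}
}
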

29
main.go
@@ -3,15 +3,14 @@ package main
 import (
 	"gemini-grc/config"
 	"gemini-grc/gemini"
+	"gemini-grc/http"
 	"gemini-grc/logging"
+	"github.com/jmoiron/sqlx"
+	"github.com/rs/zerolog"
+	zlog "github.com/rs/zerolog/log"
 	"os"
 	"os/signal"
 	"syscall"
-
-	"github.com/jmoiron/sqlx"
-
-	"github.com/rs/zerolog"
-	zlog "github.com/rs/zerolog/log"
 )
 
 func main() {
@@ -27,9 +26,10 @@ func main() {
 
 func runApp() error {
 	logging.LogInfo("Starting up. Press Ctrl+C to exit")
-	sigs := make(chan os.Signal, 1)
-	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
+	signals := make(chan os.Signal, 1)
+	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
 
+	server := http.CreateServer("localhost:8899")
 	db := gemini.ConnectToDB()
 
 	// !!! DANGER !!!
@@ -44,9 +44,20 @@ func runApp() error {
 		}
 	}(db)
 
-	go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
+	gemini.LoadBlacklist()
+	// If there's an argument, assume it's a URL
+	// to visit and ignore database state.
+	if len(os.Args) > 1 {
+		url := os.Args[1]
+		go gemini.RunWorker(0, db, &url)
+	} else {
+		go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
+	}
 
-	<-sigs
+	<-signals
+	if err := server.Close(); err != nil {
+		logging.LogError("GeminiError during server shutdown: %s", err)
+	}
 	logging.LogInfo("Received SIGINT or SIGTERM signal, exiting")
 	return nil
 }

@@ -1,14 +1,14 @@
 package uid
 
 import (
-	nanoid "github.com/jaevor/go-nanoid"
+	nanoid "github.com/matoous/go-nanoid/v2"
 )
 
 func UID() string {
-	// Missing o,O and l
-	uid, err := nanoid.CustomASCII("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20)
+	// No 'o','O' and 'l'
+	id, err := nanoid.Generate("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20)
 	if err != nil {
 		panic(err)
 	}
-	return uid()
+	return id
 }
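
Not part of the diff: the library swap above also changes the call shape, which is why the returned value goes from uid() to id. As the diff itself shows, jaevor's CustomASCII hands back a generator function that is invoked afterwards, while matoous/go-nanoid/v2's Generate returns the ID string directly. A standalone sketch of the new call using the same alphabet as UID(); the package main wrapper is only for illustration.

package main

import (
	"fmt"

	nanoid "github.com/matoous/go-nanoid/v2"
)

func main() {
	// Same custom alphabet as UID(): the ambiguous characters o, O and l are left out.
	const alphabet = "abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789"

	// Generate returns the ID directly; there is no generator closure to call later.
	id, err := nanoid.Generate(alphabet, 20)
	if err != nil {
		panic(err)
	}
	fmt.Println(id) // a 20-character ID drawn from the alphabet above
}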