This commit is contained in:
2024-11-18 16:28:45 +02:00
parent f0452ff9f7
commit 825c7e3391
34 changed files with 624 additions and 426 deletions

View File

@@ -5,6 +5,15 @@ export PATH := $(PATH)
all: fmt lint test all: fmt lint test
.PHONY: debug
debug:
@echo "PATH: $(PATH)"
@echo "GOPATH: $(shell go env GOPATH)"
@which go
@which gofumpt
@which gci
@which golangci-lint
# Test # Test
test: test:
go test -v ./... go test -v ./...

2
blacklist.txt Normal file
View File

@@ -0,0 +1,2 @@
gemi.dev
mastogem.picasoft.net

View File

@@ -8,48 +8,42 @@ import (
"github.com/rs/zerolog" "github.com/rs/zerolog"
) )
// Environment variable names // Environment variable names.
const ( const (
EnvLogLevel = "LOG_LEVEL" EnvLogLevel = "LOG_LEVEL"
EnvRootPath = "ROOT_PATH" EnvNumWorkers = "NUM_OF_WORKERS"
EnvNumWorkers = "NUM_OF_WORKERS" EnvWorkerBatchSize = "WORKER_BATCH_SIZE"
EnvWorkerBatchSize = "WORKER_BATCH_SIZE" EnvMaxResponseSize = "MAX_RESPONSE_SIZE"
EnvMaxResponseSize = "MAX_RESPONSE_SIZE" EnvResponseTimeout = "RESPONSE_TIMEOUT"
EnvResponseTimeout = "RESPONSE_TIMEOUT" EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR"
EnvBlacklistPath = "BLACKLIST_PATH"
) )
// Config holds the application configuration loaded from environment variables // Config holds the application configuration loaded from environment variables.
type Config struct { type Config struct {
LogLevel zerolog.Level // Logging level (debug, info, warn, error) LogLevel zerolog.Level // Logging level (debug, info, warn, error)
RootPath string // Root path for the application MaxResponseSize int // Maximum size of response in bytes
MaxResponseSize int // Maximum size of response in bytes NumOfWorkers int // Number of concurrent workers
NumOfWorkers int // Number of concurrent workers ResponseTimeout int // Timeout for responses in seconds
ResponseTimeout int // Timeout for responses in seconds WorkerBatchSize int // Batch size for worker processing
WorkerBatchSize int // Batch size for worker processing PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
BlacklistPath string // File that has blacklisted strings of "host:port"
} }
// String returns a string representation of the configuration var CONFIG Config //nolint:gochecknoglobals
func (c *Config) String() string {
return fmt.Sprintf(
"Config{LogLevel: %s, RootPath: %s, MaxResponseSize: %d, NumWorkers: %d, ResponseTimeout: %d, WorkerBatchSize: %d}",
c.LogLevel, c.RootPath, c.MaxResponseSize, c.NumOfWorkers, c.ResponseTimeout, c.WorkerBatchSize,
)
}
var CONFIG Config // parsePositiveInt parses and validates positive integer values.
// parsePositiveInt parses and validates positive integer values
func parsePositiveInt(param, value string) (int, error) { func parsePositiveInt(param, value string) (int, error) {
val, err := strconv.Atoi(value) val, err := strconv.Atoi(value)
if err != nil { if err != nil {
return 0, &ValidationError{ return 0, ValidationError{
Param: param, Param: param,
Value: value, Value: value,
Reason: "must be a valid integer", Reason: "must be a valid integer",
} }
} }
if val <= 0 { if val <= 0 {
return 0, &ValidationError{ return 0, ValidationError{
Param: param, Param: param,
Value: value, Value: value,
Reason: "must be positive", Reason: "must be positive",
@@ -58,6 +52,18 @@ func parsePositiveInt(param, value string) (int, error) {
return val, nil return val, nil
} }
func parseBool(param, value string) (bool, error) {
val, err := strconv.ParseBool(value)
if err != nil {
return false, ValidationError{
Param: param,
Value: value,
Reason: "cannot be converted to boolean",
}
}
return val, nil
}
// GetConfig loads and validates configuration from environment variables // GetConfig loads and validates configuration from environment variables
func GetConfig() *Config { func GetConfig() *Config {
config := &Config{} config := &Config{}
@@ -67,7 +73,7 @@ func GetConfig() *Config {
EnvLogLevel: func(v string) error { EnvLogLevel: func(v string) error {
level, err := zerolog.ParseLevel(v) level, err := zerolog.ParseLevel(v)
if err != nil { if err != nil {
return &ValidationError{ return ValidationError{
Param: EnvLogLevel, Param: EnvLogLevel,
Value: v, Value: v,
Reason: "must be one of: debug, info, warn, error", Reason: "must be one of: debug, info, warn, error",
@@ -76,16 +82,6 @@ func GetConfig() *Config {
config.LogLevel = level config.LogLevel = level
return nil return nil
}, },
EnvRootPath: func(v string) error {
if _, err := os.Stat(v); err != nil {
return &ConfigError{
Param: EnvRootPath,
Err: err,
}
}
config.RootPath = v
return nil
},
EnvNumWorkers: func(v string) error { EnvNumWorkers: func(v string) error {
val, err := parsePositiveInt(EnvNumWorkers, v) val, err := parsePositiveInt(EnvNumWorkers, v)
if err != nil { if err != nil {
@@ -118,6 +114,18 @@ func GetConfig() *Config {
config.ResponseTimeout = val config.ResponseTimeout = val
return nil return nil
}, },
EnvPanicOnUnexpectedError: func(v string) error {
val, err := parseBool(EnvPanicOnUnexpectedError, v)
if err != nil {
return err
}
config.PanicOnUnexpectedError = val
return nil
},
EnvBlacklistPath: func(v string) error {
config.BlacklistPath = v
return nil
},
} }
// Process each environment variable // Process each environment variable

View File

@@ -1,146 +0,0 @@
package config
import (
"os"
"testing"
"github.com/rs/zerolog"
"github.com/stretchr/testify/assert"
)
func TestGetConfig(t *testing.T) {
// Set up test environment variables
envVars := map[string]string{
EnvLogLevel: "debug",
EnvRootPath: ".",
EnvNumWorkers: "5",
EnvWorkerBatchSize: "100",
EnvMaxResponseSize: "1048576",
EnvResponseTimeout: "30",
}
for k, v := range envVars {
os.Setenv(k, v)
defer os.Unsetenv(k)
}
// Get configuration
config := GetConfig()
// Assert configuration values
assert.Equal(t, zerolog.DebugLevel, config.LogLevel)
assert.Equal(t, ".", config.RootPath)
assert.Equal(t, 5, config.NumOfWorkers)
assert.Equal(t, 100, config.WorkerBatchSize)
assert.Equal(t, 1048576, config.MaxResponseSize)
assert.Equal(t, 30, config.ResponseTimeout)
}
func TestParsePositiveInt(t *testing.T) {
tests := []struct {
name string
param string
input string
want int
wantErr bool
errType interface{}
errMessage string
}{
{
name: "valid positive",
param: "TEST_PARAM",
input: "42",
want: 42,
wantErr: false,
},
{
name: "zero",
param: "TEST_PARAM",
input: "0",
wantErr: true,
errType: &ValidationError{},
errMessage: "invalid value '0' for TEST_PARAM: must be positive",
},
{
name: "negative",
param: "TEST_PARAM",
input: "-1",
wantErr: true,
errType: &ValidationError{},
errMessage: "invalid value '-1' for TEST_PARAM: must be positive",
},
{
name: "invalid",
param: "TEST_PARAM",
input: "abc",
wantErr: true,
errType: &ValidationError{},
errMessage: "invalid value 'abc' for TEST_PARAM: must be a valid integer",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := parsePositiveInt(tt.param, tt.input)
if tt.wantErr {
assert.Error(t, err)
assert.IsType(t, tt.errType, err)
if tt.errMessage != "" {
assert.Equal(t, tt.errMessage, err.Error())
}
} else {
assert.NoError(t, err)
assert.Equal(t, tt.want, got)
}
})
}
}
func TestConfigValidation(t *testing.T) {
tests := []struct {
name string
envVars map[string]string
wantErr bool
errMessage string
}{
{
name: "invalid log level",
envVars: map[string]string{
EnvLogLevel: "invalid",
},
wantErr: true,
errMessage: "invalid value 'invalid' for LOG_LEVEL: must be one of: debug, info, warn, error",
},
{
name: "invalid worker count",
envVars: map[string]string{
EnvLogLevel: "debug",
EnvRootPath: ".",
EnvNumWorkers: "-1",
},
wantErr: true,
errMessage: "invalid value '-1' for NUM_OF_WORKERS: must be positive",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Clear environment
os.Clearenv()
// Set required environment variables
for k, v := range tt.envVars {
os.Setenv(k, v)
}
// Defer cleanup
defer os.Clearenv()
if tt.wantErr {
assert.PanicsWithError(t, tt.errMessage, func() {
GetConfig()
})
}
})
}
}

View File

@@ -2,27 +2,13 @@ package config
import "fmt" import "fmt"
// ConfigError represents a configuration error // ValidationError represents a config validation error
type ConfigError struct {
Param string
Err error
}
func (e *ConfigError) Error() string {
return fmt.Sprintf("configuration error for %s: %v", e.Param, e.Err)
}
func (e *ConfigError) Unwrap() error {
return e.Err
}
// ValidationError represents a validation error
type ValidationError struct { type ValidationError struct {
Param string Param string
Value string Value string
Reason string Reason string
} }
func (e *ValidationError) Error() string { func (e ValidationError) Error() string {
return fmt.Sprintf("invalid value '%s' for %s: %s", e.Value, e.Param, e.Reason) return fmt.Sprintf("invalid value '%s' for %s: %s", e.Value, e.Param, e.Reason)
} }

View File

@@ -0,0 +1,7 @@
delete FROM snapshots
WHERE host IN (
SELECT DISTINCT host
FROM snapshots
WHERE error LIKE 'robots.txt%'
)
AND url LIKE 'gemini://' || host || '/%';

5
db/error_stats.sql Normal file
View File

@@ -0,0 +1,5 @@
SELECT error, count(error) as count
FROM snapshots
GROUP BY error
ORDER BY count DESC
LIMIT 20;

View File

@@ -0,0 +1,7 @@
SELECT host, COUNT(*) AS row_count
FROM snapshots
WHERE response_code IS NOT NULL
AND error IS NULL
GROUP BY host
ORDER BY row_count DESC
LIMIT 10;

View File

@@ -20,7 +20,7 @@ DROP TABLE IF EXISTS snapshots;
CREATE TABLE snapshots ( CREATE TABLE snapshots (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
uid TEXT NOT NULL UNIQUE, uid TEXT NOT NULL UNIQUE,
url TEXT NOT NULL, url TEXT NOT NULL UNIQUE,
host TEXT NOT NULL, host TEXT NOT NULL,
timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
mimetype TEXT, mimetype TEXT,
@@ -42,7 +42,10 @@ CREATE INDEX idx_lang ON snapshots (lang);
CREATE INDEX idx_response_code ON snapshots (response_code); CREATE INDEX idx_response_code ON snapshots (response_code);
CREATE INDEX idx_error ON snapshots (error); CREATE INDEX idx_error ON snapshots (error);
CREATE INDEX idx_host ON snapshots (host); CREATE INDEX idx_host ON snapshots (host);
CREATE INDEX unique_uid_url ON snapshots (uid, url);
CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host) CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host)
WHERE response_code IS NULL AND error IS NULL WHERE response_code IS NULL AND error IS NULL
INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang); INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang);
CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL; CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL;

View File

@@ -2,9 +2,9 @@ package main
import ( import (
"fmt" "fmt"
"gemini-grc/gemini"
"os" "os"
"gemini-grc/gemini"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
@@ -71,7 +71,6 @@ func main() {
} }
} }
} }
func connectToDB() *sqlx.DB { func connectToDB() *sqlx.DB {

View File

@@ -0,0 +1,18 @@
-- Step 1: Delete duplicate entries, keeping the last one based on timestamp
-- Use a CTE to mark duplicates and delete them efficiently
WITH ranked_snapshots AS (
SELECT
id,
url,
ROW_NUMBER() OVER(PARTITION BY url ORDER BY timestamp DESC) AS row_num
FROM
snapshots
)
DELETE FROM snapshots
USING ranked_snapshots
WHERE snapshots.id = ranked_snapshots.id
AND ranked_snapshots.row_num > 1;
-- Step 2: Add a unique constraint on the url column to prevent future duplicates
ALTER TABLE snapshots
ADD CONSTRAINT unique_url UNIQUE (url);

View File

@@ -1,9 +1,9 @@
package main package main
import ( import (
"gemini-grc/uid"
"time" "time"
"gemini-grc/uid"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )

16
debug.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/bin/sh
set -eu
# Max response size 10MiB
MAX_RESPONSE_SIZE=10485760 \
LOG_LEVEL=debug \
ROOT_PATH=./snaps \
RESPONSE_TIMEOUT=10 \
NUM_OF_WORKERS=1 \
WORKER_BATCH_SIZE=1 \
PG_DATABASE=gemini \
PG_HOST=127.0.0.1 \
PG_PORT=5433 \
PG_USER=gemini \
PG_PASSWORD=gemini \
dlv debug

View File

@@ -1,3 +1,6 @@
// 31 redirect
gemini://gemini.circumlunar.space
// body with null byte // body with null byte
gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False

51
gemini/blacklist.go Normal file
View File

@@ -0,0 +1,51 @@
package gemini
import (
"fmt"
"os"
"strings"
"gemini-grc/config"
"gemini-grc/logging"
)
var Blacklist *[]string //nolint:gochecknoglobals
func LoadBlacklist() {
if Blacklist == nil {
data, err := os.ReadFile(config.CONFIG.BlacklistPath)
if err != nil {
Blacklist = &[]string{}
logging.LogWarn("Could not load Blacklist file: %v", err)
return
}
lines := strings.Split(string(data), "\n")
// Ignore lines starting with '#' (comments)
filteredLines := func() []string {
out := make([]string, 0, len(lines))
for _, line := range lines {
if !strings.HasPrefix(line, "#") {
out = append(out, line)
}
}
return out
}()
if len(lines) > 0 {
Blacklist = &filteredLines
logging.LogInfo("Blacklist has %d entries", len(*Blacklist))
}
}
}
func IsBlacklisted(url URL) bool {
hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
for _, v := range *Blacklist {
if v == url.Hostname || v == hostWithPort {
return true
}
}
return false
}

View File

@@ -4,11 +4,11 @@ import (
"gemini-grc/logging" "gemini-grc/logging"
) )
var IpPool IpAddressPool = IpAddressPool{IPs: make(map[string]int)} var IpPool = IpAddressPool{IPs: make(map[string]int)}
func AddIPsToPool(IPs []string) { func AddIPsToPool(ips []string) {
IpPool.Lock.Lock() IpPool.Lock.Lock()
for _, ip := range IPs { for _, ip := range ips {
logging.LogDebug("Adding %s to pool", ip) logging.LogDebug("Adding %s to pool", ip)
IpPool.IPs[ip]++ IpPool.IPs[ip]++
} }

100
gemini/errors.go Normal file
View File

@@ -0,0 +1,100 @@
package gemini
import (
"errors"
"fmt"
)
type ErrGeminiStatusCode struct {
Msg string
Code int
Header string
}
func (e *ErrGeminiStatusCode) Error() string {
return fmt.Sprintf("%s: %s", e.Msg, e.Header)
}
func NewErrGeminiStatusCode(code int, header string) error {
var msg string
switch {
case code >= 10 && code < 20:
msg = "needs input"
case code >= 30 && code < 40:
msg = "redirect"
case code >= 40 && code < 50:
msg = "bad request"
case code >= 50 && code < 60:
msg = "server error"
case code >= 60 && code < 70:
msg = "TLS error"
default:
msg = "unexpected status code"
}
return &ErrGeminiStatusCode{
Msg: msg,
Code: code,
Header: header,
}
}
var (
ErrGemini = errors.New("gemini general error")
ErrGeminiRobotsParse = errors.New("gemini robots.txt parse error")
ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
ErrGeminiResponseHeader = errors.New("gemini response header error")
ErrGeminiLinkLineParse = errors.New("gemini link line parse error")
ErrURLParse = errors.New("URL parse error")
ErrURLDecode = errors.New("URL decode error")
ErrUTF8Parse = errors.New("UTF-8 parse error")
ErrTextParse = errors.New("text parse error")
ErrNetwork = errors.New("network error")
ErrNetworkDNS = errors.New("network DNS error")
ErrNetworkTLS = errors.New("network TLS error")
ErrNetworkSetConnectionDeadline = errors.New("network error - cannot set connection deadline")
ErrNetworkCannotWrite = errors.New("network error - cannot write")
ErrNetworkResponseSizeExceededMax = errors.New("network error - response size exceeded maximum size")
ErrDatabase = errors.New("database error")
)
// We could have used a map for speed, but
// we would lose ability to check wrapped
// errors via errors.Is().
var KnownErrors = []error{
ErrGemini,
ErrGeminiLinkLineParse,
ErrGeminiRobotsParse,
ErrGeminiRobotsDisallowed,
ErrGeminiResponseHeader,
ErrURLParse,
ErrURLDecode,
ErrUTF8Parse,
ErrTextParse,
ErrNetwork,
ErrNetworkDNS,
ErrNetworkTLS,
ErrNetworkSetConnectionDeadline,
ErrNetworkCannotWrite,
ErrNetworkResponseSizeExceededMax,
ErrDatabase,
}
func IsKnownError(err error) bool {
var errGeminiStatusCode *ErrGeminiStatusCode
if errors.As(err, &errGeminiStatusCode) {
return true
}
for _, known := range KnownErrors {
if errors.Is(err, known) {
return true
}
}
return false
}

View File

@@ -2,12 +2,13 @@ package gemini
import ( import (
"fmt" "fmt"
"gemini-grc/logging"
"net/url" "net/url"
"os" "os"
"path" "path"
"path/filepath" "path/filepath"
"strings" "strings"
"gemini-grc/logging"
) )
// sanitizePath encodes invalid filesystem characters using URL encoding. // sanitizePath encodes invalid filesystem characters using URL encoding.
@@ -87,9 +88,9 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
return return
} }
if s.MimeType.Valid && s.MimeType.String == "text/gemini" { if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
err = os.WriteFile(finalPath, (*s).Data.V, 0666) err = os.WriteFile(finalPath, (*s).Data.V, 0o666)
} else { } else {
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0666) err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
} }
if err != nil { if err != nil {
logging.LogError("Error saving %s: %w", s.URL.Full, err) logging.LogError("Error saving %s: %w", s.URL.Full, err)

View File

@@ -1,32 +1,13 @@
package gemini package gemini
import ( import (
"errors"
"fmt" "fmt"
"gemini-grc/logging"
"net/url" "net/url"
"regexp" "regexp"
"strconv" "strconv"
)
func checkGeminiStatusCode(code int) error { "gemini-grc/logging"
switch { )
case code == 20:
return nil
case code >= 10 && code < 20:
return fmt.Errorf("gemini response %d needs data input", code)
case code >= 30 && code < 40:
return fmt.Errorf("gemini response %d redirect", code)
case code >= 40 && code < 50:
return fmt.Errorf("gemini response %d server error", code)
case code >= 50 && code < 60:
return fmt.Errorf("gemini response %d server permanent error", code)
case code >= 60 && code < 70:
return fmt.Errorf("gemini response %d certificate error", code)
default:
return fmt.Errorf("unexpected/unhandled Gemini response %d", code)
}
}
func ProcessGemini(snapshot *Snapshot) *Snapshot { func ProcessGemini(snapshot *Snapshot) *Snapshot {
// Grab link lines // Grab link lines
@@ -40,7 +21,7 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot {
logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err) logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
continue continue
} }
geminiUrl, err := ParseUrl(normalizedLink, descr) geminiUrl, err := ParseURL(normalizedLink, descr)
if err != nil { if err != nil {
logging.LogDebug("Cannot parse URL in link '%s': %v", line, err) logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
continue continue
@@ -54,25 +35,6 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot {
return snapshot return snapshot
} }
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
u, err := url.Parse(input)
if err != nil {
return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
}
protocol := u.Scheme
hostname := u.Hostname()
strPort := u.Port()
path := u.Path
if strPort == "" {
strPort = "1965"
}
port, err := strconv.Atoi(strPort)
if err != nil {
return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
}
return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
}
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines // ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
func ExtractLinkLines(gemtext string) []string { func ExtractLinkLines(gemtext string) []string {
// Define the regular expression pattern to match link lines // Define the regular expression pattern to match link lines
@@ -87,11 +49,11 @@ func ExtractLinkLines(gemtext string) []string {
// NormalizeLink takes a single link line and the current URL, // NormalizeLink takes a single link line and the current URL,
// return the URL converted to an absolute URL // return the URL converted to an absolute URL
// and its description. // and its description.
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) { func NormalizeLink(linkLine string, currentURL string) (string, string, error) {
// Parse the current URL // Parse the current URL
baseURL, err := url.Parse(currentURL) baseURL, err := url.Parse(currentURL)
if err != nil { if err != nil {
return "", "", fmt.Errorf("invalid current URL: %v", err) return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
} }
// Regular expression to extract the URL part from a link line // Regular expression to extract the URL part from a link line
@@ -101,13 +63,13 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
matches := re.FindStringSubmatch(linkLine) matches := re.FindStringSubmatch(linkLine)
if len(matches) == 0 { if len(matches) == 0 {
// If the line doesn't match the expected format, return it unchanged // If the line doesn't match the expected format, return it unchanged
return "", "", fmt.Errorf("not a link line: %v", linkLine) return "", "", fmt.Errorf("%w for link line %s", ErrGeminiLinkLineParse, linkLine)
} }
originalURLStr := matches[1] originalURLStr := matches[1]
_, err = url.QueryUnescape(originalURLStr) _, err = url.QueryUnescape(originalURLStr)
if err != nil { if err != nil {
return "", "", fmt.Errorf("error decoding URL: %w", err) return "", "", fmt.Errorf("%w: %w", ErrURLDecode, err)
} }
restOfLine := "" restOfLine := ""
@@ -119,7 +81,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
parsedURL, err := url.Parse(originalURLStr) parsedURL, err := url.Parse(originalURLStr)
if err != nil { if err != nil {
// If URL parsing fails, return an error // If URL parsing fails, return an error
return "", "", fmt.Errorf("invalid URL '%s': %v", originalURLStr, err) return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
} }
// Resolve relative URLs against the base URL // Resolve relative URLs against the base URL
@@ -151,13 +113,13 @@ func ParseFirstTwoDigits(input string) (int, error) {
// Find the first match in the string // Find the first match in the string
matches := re.FindStringSubmatch(input) matches := re.FindStringSubmatch(input)
if len(matches) == 0 { if len(matches) == 0 {
return 0, errors.New("no digits found at the beginning of the string") return 0, fmt.Errorf("%w", ErrGeminiResponseHeader)
} }
// Parse the captured match as an integer // Parse the captured match as an integer
snapshot, err := strconv.Atoi(matches[1]) snapshot, err := strconv.Atoi(matches[1])
if err != nil { if err != nil {
return 0, fmt.Errorf("failed to convert matched digits to int: %v", err) return 0, fmt.Errorf("%w: %w", ErrTextParse, err)
} }
return snapshot, nil return snapshot, nil

View File

@@ -34,7 +34,7 @@ func (u *URL) Scan(value interface{}) error {
return nil return nil
} }
func (u *URL) String() string { func (u URL) String() string {
return u.Full return u.Full
} }
@@ -62,7 +62,8 @@ func ParseURL(input string, descr string) (*URL, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: Input %s Error %w", ErrURLParse, input, err) return nil, fmt.Errorf("%w: Input %s Error %w", ErrURLParse, input, err)
} }
return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, path)
return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: full}, nil
} }
//func GeminiUrltoJSON(g URL) string { //func GeminiUrltoJSON(g URL) string {

View File

@@ -2,25 +2,27 @@ package gemini
import ( import (
"crypto/tls" "crypto/tls"
"errors"
"fmt" "fmt"
"gemini-grc/config"
"io" "io"
"net" "net"
go_url "net/url" gourl "net/url"
"regexp" "regexp"
"slices" "slices"
"strconv" "strconv"
"time" "time"
"gemini-grc/config"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
) )
type GeminiPageData struct { type PageData struct {
ResponseCode int ResponseCode int
MimeType string ResponseHeader string
Lang string MimeType string
GemText string Lang string
Data []byte GemText string
Data []byte
} }
// Resolve the URL hostname and // Resolve the URL hostname and
@@ -31,7 +33,7 @@ type GeminiPageData struct {
func getHostIPAddresses(hostname string) ([]string, error) { func getHostIPAddresses(hostname string) ([]string, error) {
addrs, err := net.LookupHost(hostname) addrs, err := net.LookupHost(hostname)
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("%w:%w", ErrNetworkDNS, err)
} }
IpPool.Lock.RLock() IpPool.Lock.RLock()
defer func() { defer func() {
@@ -41,12 +43,12 @@ func getHostIPAddresses(hostname string) ([]string, error) {
} }
func ConnectAndGetData(url string) ([]byte, error) { func ConnectAndGetData(url string) ([]byte, error) {
parsedUrl, err := go_url.Parse(url) parsedURL, err := gourl.Parse(url)
if err != nil { if err != nil {
return nil, fmt.Errorf("Could not parse URL, error %w", err) return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
} }
hostname := parsedUrl.Hostname() hostname := parsedURL.Hostname()
port := parsedUrl.Port() port := parsedURL.Port()
if port == "" { if port == "" {
port = "1965" port = "1965"
} }
@@ -58,34 +60,34 @@ func ConnectAndGetData(url string) ([]byte, error) {
} }
conn, err := dialer.Dial("tcp", host) conn, err := dialer.Dial("tcp", host)
if err != nil { if err != nil {
return nil, fmt.Errorf("TCP connection failed: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
} }
// Make sure we always close the connection. // Make sure we always close the connection.
defer func() { defer func() {
// No need to handle error: // No need to handle error:
// Connection will timeout eventually if still open somehow. // Connection will time out eventually if still open somehow.
conn.Close() _ = conn.Close()
}() }()
// Set read and write timeouts on the TCP connection. // Set read and write timeouts on the TCP connection.
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
if err != nil { if err != nil {
return nil, fmt.Errorf("Error setting connection deadline: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
} }
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
if err != nil { if err != nil {
return nil, fmt.Errorf("Error setting connection deadline: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
} }
// Perform the TLS handshake // Perform the TLS handshake
tlsConfig := &tls.Config{ tlsConfig := &tls.Config{
InsecureSkipVerify: true, // Accept all TLS certs, even if insecure. InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
ServerName: parsedUrl.Hostname(), // SNI should not include port ServerName: parsedURL.Hostname(), // SNI should not include port
// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites. // MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
} }
tlsConn := tls.Client(conn, tlsConfig) tlsConn := tls.Client(conn, tlsConfig)
if err := tlsConn.Handshake(); err != nil { if err := tlsConn.Handshake(); err != nil {
return nil, fmt.Errorf("TLS handshake error: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkTLS, err)
} }
// We read `buf`-sized chunks and add data to `data`. // We read `buf`-sized chunks and add data to `data`.
@@ -95,7 +97,7 @@ func ConnectAndGetData(url string) ([]byte, error) {
// Send Gemini request to trigger server response. // Send Gemini request to trigger server response.
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url))) _, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url)))
if err != nil { if err != nil {
return nil, fmt.Errorf("Error sending network request: %w", err) return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
} }
// Read response bytes in len(buf) byte chunks // Read response bytes in len(buf) byte chunks
for { for {
@@ -104,68 +106,72 @@ func ConnectAndGetData(url string) ([]byte, error) {
data = append(data, buf[:n]...) data = append(data, buf[:n]...)
} }
if len(data) > config.CONFIG.MaxResponseSize { if len(data) > config.CONFIG.MaxResponseSize {
data = []byte{} return nil, fmt.Errorf("%w: %v", ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
return nil, fmt.Errorf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize)
} }
if err != nil { if err != nil {
if err == io.EOF { if errors.Is(err, io.EOF) {
break break
} else { } else {
return nil, fmt.Errorf("Network error: %s", err) return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
} }
} }
} }
return data, nil return data, nil
} }
// Connect to given URL, using the Gemini protocol. // Visit given URL, using the Gemini protocol.
// Mutate given Snapshot with the data or the error. // Mutates given Snapshot with the data.
func Visit(s *Snapshot) { func Visit(s *Snapshot) error {
data, err := ConnectAndGetData(s.URL.String()) data, err := ConnectAndGetData(s.URL.String())
if err != nil { if err != nil {
s.Error = null.StringFrom(err.Error()) return err
return
} }
pageData, err := processData(data) pageData, err := processData(data)
if err != nil { if err != nil {
s.Error = null.StringFrom(err.Error()) return err
return
} }
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode)) s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
s.MimeType = null.StringFrom(pageData.MimeType) s.MimeType = null.StringFrom(pageData.MimeType)
s.Lang = null.StringFrom(pageData.Lang) s.Lang = null.StringFrom(pageData.Lang)
if pageData.GemText != "" { if pageData.GemText != "" {
s.GemText = null.StringFrom(string(pageData.GemText)) s.GemText = null.StringFrom(pageData.GemText)
} }
if pageData.Data != nil { if pageData.Data != nil {
s.Data = null.ValueFrom(pageData.Data) s.Data = null.ValueFrom(pageData.Data)
} }
return nil
} }
// Update given snapshot with the // Update given snapshot with the
// Gemini header data: response code, // Gemini header data: response code,
// mime type and lang (optional) // mime type and lang (optional)
func processData(data []byte) (*GeminiPageData, error) { func processData(data []byte) (*PageData, error) {
headers, body, err := getHeadersAndData(data) header, body, err := getHeadersAndData(data)
if err != nil { if err != nil {
return nil, err return nil, err
} }
code, mimeType, lang := getMimeTypeAndLang(headers) code, mimeType, lang := getMimeTypeAndLang(header)
geminiError := checkGeminiStatusCode(code) var geminiError error
if code != 20 {
geminiError = NewErrGeminiStatusCode(code, header)
}
fmt.Printf("%v\n", header)
if geminiError != nil { if geminiError != nil {
return nil, geminiError return nil, geminiError
} }
pageData := GeminiPageData{ pageData := PageData{
ResponseCode: code, ResponseCode: code,
MimeType: mimeType, ResponseHeader: header,
Lang: lang, MimeType: mimeType,
Lang: lang,
} }
// If we've got a Gemini document, populate // If we've got a Gemini document, populate
// `GemText` field, otherwise raw data goes to `Data`. // `GemText` field, otherwise raw data goes to `Data`.
if mimeType == "text/gemini" { if mimeType == "text/gemini" {
validBody, err := EnsureValidUTF8(body) validBody, err := BytesToValidUTF8(body)
if err != nil { if err != nil {
return nil, fmt.Errorf("UTF-8 error: %w", err) return nil, fmt.Errorf("%w: %w", ErrUTF8Parse, err)
} }
pageData.GemText = validBody pageData.GemText = validBody
} else { } else {
@@ -178,14 +184,14 @@ func processData(data []byte) (*GeminiPageData, error) {
// basically the first line of the response // basically the first line of the response
// and should contain the response code, // and should contain the response code,
// mimeType and language. // mimeType and language.
func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) { func getHeadersAndData(data []byte) (string, []byte, error) {
firstLineEnds := slices.Index(data, '\n') firstLineEnds := slices.Index(data, '\n')
if firstLineEnds == -1 { if firstLineEnds == -1 {
return "", nil, fmt.Errorf("Could not parse response header") return "", nil, ErrGeminiResponseHeader
} }
firstLine = string(data[:firstLineEnds]) firstLine := string(data[:firstLineEnds])
rest = data[firstLineEnds+1:] rest := data[firstLineEnds+1:]
return string(firstLine), rest, nil return firstLine, rest, nil
} }
// Parses code, mime type and language // Parses code, mime type and language
@@ -194,7 +200,7 @@ func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) {
// `20 text/gemini lang=en` (code, mimetype, lang) // `20 text/gemini lang=en` (code, mimetype, lang)
// `20 text/gemini` (code, mimetype) // `20 text/gemini` (code, mimetype)
// `31 gemini://redirected.to/other/site` (code) // `31 gemini://redirected.to/other/site` (code)
func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string) { func getMimeTypeAndLang(headers string) (int, string, string) {
// Regex that parses code, mimetype & lang // Regex that parses code, mimetype & lang
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`) re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`)
matches := re.FindStringSubmatch(headers) matches := re.FindStringSubmatch(headers)
@@ -215,7 +221,7 @@ func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string)
if err != nil { if err != nil {
return 0, "", "" return 0, "", ""
} }
mimeType = matches[2] mimeType := matches[2]
lang = matches[4] lang := matches[4]
return code, mimeType, lang return code, mimeType, lang
} }

View File

@@ -6,6 +6,7 @@ import (
// Test for input: `20 text/gemini` // Test for input: `20 text/gemini`
func TestGetMimeTypeAndLang1(t *testing.T) { func TestGetMimeTypeAndLang1(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini") code, mimeType, lang := getMimeTypeAndLang("20 text/gemini")
if code != 20 || mimeType != "text/gemini" || lang != "" { if code != 20 || mimeType != "text/gemini" || lang != "" {
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -13,6 +14,7 @@ func TestGetMimeTypeAndLang1(t *testing.T) {
} }
func TestGetMimeTypeAndLang11(t *testing.T) { func TestGetMimeTypeAndLang11(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n") code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n")
if code != 20 || mimeType != "text/gemini" || lang != "" { if code != 20 || mimeType != "text/gemini" || lang != "" {
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -20,6 +22,7 @@ func TestGetMimeTypeAndLang11(t *testing.T) {
} }
func TestGetTypeAndLang2(t *testing.T) { func TestGetTypeAndLang2(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en") code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
if code != 20 || mimeType != "text/gemini" || lang != "en" { if code != 20 || mimeType != "text/gemini" || lang != "en" {
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -27,6 +30,7 @@ func TestGetTypeAndLang2(t *testing.T) {
} }
func TestGetMimeTypeAndLang3(t *testing.T) { func TestGetMimeTypeAndLang3(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page") code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page")
if code != 31 || mimeType != "" || lang != "" { if code != 31 || mimeType != "" || lang != "" {
t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -34,6 +38,7 @@ func TestGetMimeTypeAndLang3(t *testing.T) {
} }
func TestGetMimeTypeAndLang4(t *testing.T) { func TestGetMimeTypeAndLang4(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd") code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd")
if code != 0 || mimeType != "" || lang != "" { if code != 0 || mimeType != "" || lang != "" {
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
@@ -41,6 +46,7 @@ func TestGetMimeTypeAndLang4(t *testing.T) {
} }
func TestGetMimeTypeAndLang5(t *testing.T) { func TestGetMimeTypeAndLang5(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("") code, mimeType, lang := getMimeTypeAndLang("")
if code != 0 || mimeType != "" || lang != "" { if code != 0 || mimeType != "" || lang != "" {
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)

View File

@@ -2,9 +2,9 @@ package gemini
import ( import (
"fmt" "fmt"
"gemini-grc/logging"
"os" "os"
"gemini-grc/logging"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
@@ -33,11 +33,26 @@ func ConnectToDB() *sqlx.DB {
return db return db
} }
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error { func SaveSnapshotToDBIfNotExists(tx *sqlx.Tx, s *Snapshot) error {
query := ` query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO UPDATE SET ON CONFLICT (url) DO NOTHING
`
_, err := tx.NamedExec(query, s)
if err != nil {
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
}
return nil
}
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
fmt.Printf("%+v", s)
query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (url) DO UPDATE SET
url = EXCLUDED.url, url = EXCLUDED.url,
host = EXCLUDED.host, host = EXCLUDED.host,
timestamp = EXCLUDED.timestamp, timestamp = EXCLUDED.timestamp,
@@ -64,7 +79,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
query := ` query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO NOTHING ON CONFLICT (url) DO NOTHING
` `
for i := 0; i < len(snapshots); i += batchSize { for i := 0; i < len(snapshots); i += batchSize {
@@ -89,7 +104,7 @@ func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
query := ` query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO NOTHING ON CONFLICT (url) DO NOTHING
` `
_, err := tx.NamedExec(query, snapshots) _, err := tx.NamedExec(query, snapshots)
if err != nil { if err != nil {

View File

@@ -2,33 +2,58 @@ package gemini
import ( import (
"bytes" "bytes"
"errors"
"fmt" "fmt"
"io" "io"
"unicode/utf8" "unicode/utf8"
"golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/transform" "golang.org/x/text/transform"
) )
var (
ErrInputTooLarge = errors.New("input too large")
ErrUTF8Conversion = errors.New("UTF-8 conversion error")
)
func BytesToValidUTF8(input []byte) (string, error) { func BytesToValidUTF8(input []byte) (string, error) {
if len(input) == 0 {
return "", nil
}
const maxSize = 10 * 1024 * 1024 // 10MB
if len(input) > maxSize {
return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize)
}
// Remove NULL byte 0x00 (ReplaceAll accepts slices) // Remove NULL byte 0x00 (ReplaceAll accepts slices)
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{}) inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
isValidUTF8 := utf8.Valid(inputNoNull) if utf8.Valid(inputNoNull) {
if isValidUTF8 {
return string(inputNoNull), nil return string(inputNoNull), nil
} }
encodings := []transform.Transformer{ encodings := []transform.Transformer{
charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1 charmap.ISO8859_1.NewDecoder(),
charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc charmap.ISO8859_7.NewDecoder(),
// TODO: Try more encodings? charmap.Windows1250.NewDecoder(), // Central European
charmap.Windows1251.NewDecoder(), // Cyrillic
charmap.Windows1252.NewDecoder(),
charmap.Windows1256.NewDecoder(), // Arabic
japanese.EUCJP.NewDecoder(), // Japanese
korean.EUCKR.NewDecoder(), // Korean
} }
// First successful conversion wins. // First successful conversion wins.
var lastErr error
for _, encoding := range encodings { for _, encoding := range encodings {
reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding) reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
result, err := io.ReadAll(reader) result, err := io.ReadAll(reader)
if err == nil { if err != nil {
lastErr = err
continue
}
if utf8.Valid(result) {
return string(result), nil return string(result), nil
} }
} }
return "", fmt.Errorf("UTF-8 error: %w", err)
return "", fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr)
} }

View File

@@ -4,6 +4,7 @@ import "testing"
// Make sure NULL bytes are removed // Make sure NULL bytes are removed
func TestEnsureValidUTF8(t *testing.T) { func TestEnsureValidUTF8(t *testing.T) {
t.Parallel()
// Create a string with a null byte // Create a string with a null byte
strWithNull := "Hello" + string('\x00') + "world" strWithNull := "Hello" + string('\x00') + "world"
result, _ := BytesToValidUTF8([]byte(strWithNull)) result, _ := BytesToValidUTF8([]byte(strWithNull))

View File

@@ -2,16 +2,18 @@ package gemini
import ( import (
"fmt" "fmt"
"gemini-grc/logging"
"strings" "strings"
"sync" "sync"
"gemini-grc/logging"
) )
// key: "host:port" (string) // RobotsCache is a map of blocked URLs
// value: // key: URL
// empty []string if no robots data, or // value: []string list of disallowed URLs
// list of URL prefixes ([]string) in robots // If a key has no blocked URLs, an empty
var RobotsCache sync.Map // list is stored for caching.
var RobotsCache sync.Map //nolint:gochecknoglobals
func populateBlacklist(key string) (entries []string) { func populateBlacklist(key string) (entries []string) {
// We either store an empty list when // We either store an empty list when
@@ -40,43 +42,40 @@ func populateBlacklist(key string) (entries []string) {
// According to spec, the first is correct, // According to spec, the first is correct,
// however let's be lenient // however let's be lenient
var data string var data string
if robotsData.MimeType == "text/plain" { switch {
case robotsData.MimeType == "text/plain":
data = string(robotsData.Data) data = string(robotsData.Data)
} else if robotsData.MimeType == "text/gemini" { case robotsData.MimeType == "text/gemini":
data = robotsData.GemText data = robotsData.GemText
} else { default:
return []string{} return []string{}
} }
entries = ParseRobotsTxt(string(data), key) entries = ParseRobotsTxt(data, key)
return entries return entries
} }
// Check if the snapshot URL matches // RobotMatch checks if the snapshot URL matches
// a robots.txt allow rule. // a robots.txt allow rule.
func RobotMatch(s *Snapshot) bool { func RobotMatch(url URL) bool {
logging.LogDebug("Checking robots.txt cache for %s", s.URL.String()) logging.LogDebug("Checking robots.txt cache for %s", url.String())
key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port) key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
v, ok := RobotsCache.Load(key) var disallowedURLs []string
cacheEntries, ok := RobotsCache.Load(key)
if !ok { if !ok {
// First time check, populate robot cache // First time check, populate robot cache
logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String()) disallowedURLs = populateBlacklist(key)
disallowedURLs := populateBlacklist(key) logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
for _, url := range disallowedURLs {
if strings.HasPrefix(s.URL.String(), url) {
logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
return true
}
}
} else { } else {
if len(v.([]string)) == 0 { disallowedURLs, _ = cacheEntries.([]string)
logging.LogDebug("No robots.txt or no rules, allowed") }
return false return isURLblocked(disallowedURLs, url.Full)
} }
for _, url := range v.([]string) {
if strings.HasPrefix(s.URL.String(), url) { func isURLblocked(disallowedURLs []string, input string) bool {
logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url) for _, url := range disallowedURLs {
return true if strings.HasPrefix(strings.ToLower(input), url) {
} logging.LogDebug("robots.txt match: %s matches %s", input, url)
return true
} }
} }
return false return false

View File

@@ -5,7 +5,7 @@ import (
"strings" "strings"
) )
// Takes robots.txt content and a host, and // ParseRobotsTxt takes robots.txt content and a host, and
// returns a list of full URLs that shouldn't // returns a list of full URLs that shouldn't
// be visited. // be visited.
// TODO Also take into account the user agent? // TODO Also take into account the user agent?

View File

@@ -6,6 +6,7 @@ import (
) )
func TestParseRobotsTxt(t *testing.T) { func TestParseRobotsTxt(t *testing.T) {
t.Parallel()
input := `User-agent: * input := `User-agent: *
Disallow: /cgi-bin/wp.cgi/view Disallow: /cgi-bin/wp.cgi/view
Disallow: /cgi-bin/wp.cgi/media Disallow: /cgi-bin/wp.cgi/media
@@ -26,6 +27,7 @@ Disallow: /admin/`
} }
func TestParseRobotsTxtEmpty(t *testing.T) { func TestParseRobotsTxtEmpty(t *testing.T) {
t.Parallel()
input := `` input := ``
result := ParseRobotsTxt(input, "example.com") result := ParseRobotsTxt(input, "example.com")
@@ -34,3 +36,20 @@ func TestParseRobotsTxtEmpty(t *testing.T) {
t.Errorf("ParseRobotsTxt() = %v, want empty []string", result) t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
} }
} }
func TestIsURLblocked(t *testing.T) {
t.Parallel()
disallowedURLs := []string{
"gemini://example.com/cgi-bin/wp.cgi/view",
"gemini://example.com/cgi-bin/wp.cgi/media",
"gemini://example.com/admin/",
}
url := "gemini://example.com/admin/index.html"
if !isURLblocked(disallowedURLs, url) {
t.Errorf("Expected %s to be blocked", url)
}
url = "gemini://example1.com/admin/index.html"
if isURLblocked(disallowedURLs, url) {
t.Errorf("expected %s to not be blocked", url)
}
}

View File

@@ -4,15 +4,13 @@ import (
"database/sql/driver" "database/sql/driver"
"encoding/json" "encoding/json"
"fmt" "fmt"
"gemini-grc/logging"
"strings"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
) )
type LinkList []GeminiUrl type LinkList []URL
func (l LinkList) Value() (driver.Value, error) { func (l *LinkList) Value() (driver.Value, error) {
return json.Marshal(l) return json.Marshal(l)
} }
@@ -31,7 +29,7 @@ func (l *LinkList) Scan(value interface{}) error {
type Snapshot struct { type Snapshot struct {
ID int `db:"id" json:"id,omitempty"` ID int `db:"id" json:"id,omitempty"`
UID string `db:"uid" json:"uid,omitempty"` UID string `db:"uid" json:"uid,omitempty"`
URL GeminiUrl `db:"url" json:"url,omitempty"` URL URL `db:"url" json:"url,omitempty"`
Host string `db:"host" json:"host,omitempty"` Host string `db:"host" json:"host,omitempty"`
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"` Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"` MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
@@ -43,32 +41,32 @@ type Snapshot struct {
Error null.String `db:"error" json:"error,omitempty"` // On network errors only Error null.String `db:"error" json:"error,omitempty"` // On network errors only
} }
func SnapshotToJSON(g Snapshot) string { //func SnapshotToJSON(g Snapshot) string {
// Serialize the Person struct to JSON // // Serialize the Person struct to JSON
jsonData, err := json.MarshalIndent(g, "", "\t") // jsonData, err := json.MarshalIndent(g, "", "\t")
if err != nil { // if err != nil {
logging.LogError("Error serializing to JSON: %w", err) // logging.LogError("Error serializing to JSON: %w", err)
} // }
return string(jsonData) // return string(jsonData)
} //}
//
func SnapshotFromJSON(input string) Snapshot { //func SnapshotFromJSON(input string) Snapshot {
var snapshot Snapshot // var snapshot Snapshot
err := json.Unmarshal([]byte(input), &snapshot) // err := json.Unmarshal([]byte(input), &snapshot)
if err != nil { // if err != nil {
logging.LogError("Error deserializing from JSON: %w", err) // logging.LogError("Error deserializing from JSON: %w", err)
} // }
return snapshot // return snapshot
} //}
//
func ShouldPersistSnapshot(result *Snapshot) bool { //func ShouldPersistSnapshot(result *Snapshot) bool {
if !result.MimeType.Valid { // if !result.MimeType.Valid {
return false // return false
} // }
if result.MimeType.String == "text/gemini" || // if result.MimeType.String == "text/gemini" ||
strings.HasPrefix(result.MimeType.String, "image/") || // strings.HasPrefix(result.MimeType.String, "image/") ||
strings.HasPrefix(result.MimeType.String, "text/") { // strings.HasPrefix(result.MimeType.String, "text/") {
return true // return true
} // }
return false // return false
} //}

View File

@@ -1,30 +1,32 @@
package gemini package gemini
import ( import (
"errors"
"fmt" "fmt"
"regexp"
"strings"
"time"
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/logging" "gemini-grc/logging"
"gemini-grc/uid" "gemini-grc/uid"
"gemini-grc/util" "gemini-grc/util"
"strings"
"time"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) { func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
logging.LogInfo("Spawning %d workers", numOfWorkers) logging.LogInfo("Spawning %d workers", numOfWorkers)
for i := 0; i < numOfWorkers; i++ { for i := range numOfWorkers {
go func(i int) { go func(i int) {
for { for {
runWorker(i, db) RunWorker(i, db, nil)
} }
}(i) }(i)
} }
} }
func runWorker(id int, db *sqlx.DB) { func RunWorker(id int, db *sqlx.DB, url *string) {
// Start the DB transaction // Start the DB transaction
tx, err := db.Beginx() tx, err := db.Beginx()
if err != nil { if err != nil {
@@ -42,38 +44,85 @@ func runWorker(id int, db *sqlx.DB) {
} }
}() }()
snapshots, err := GetRandomSnapshotsDistinctHosts(tx) var snapshots []Snapshot
if url == nil {
snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
} else {
snapshots, err = GetSnapshotFromURL(tx, *url)
if len(snapshots) == 0 {
snapshotURL, err := ParseURL(*url, "")
if err != nil {
panic("Invalid URL: " + *url)
}
snapshots = []Snapshot{{
UID: uid.UID(),
URL: *snapshotURL,
Host: snapshotURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}}
}
}
if err != nil { if err != nil {
logging.LogError("[%d] Error retrieving snapshot: %w", id, err) logging.LogError("[%d] Error retrieving snapshot: %w", id, err)
time.Sleep(10 * time.Second) time.Sleep(10 * time.Second)
return return
} else if len(snapshots) == 0 { } else if len(snapshots) == 0 {
logging.LogInfo("[%d] No remaining snapshots to visit.", id) logging.LogInfo("[%d] No snapshots to visit.", id)
time.Sleep(1 * time.Minute) time.Sleep(1 * time.Minute)
return return
} }
total := len(snapshots) total := len(snapshots)
for i, s := range snapshots { for i, s := range snapshots {
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL) logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
err = workOnSnapshot(id, tx, &s) err = workOnSnapshot(id, tx, &s)
if err != nil { if err != nil {
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL, err) logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL.String(), err)
util.PrintStackAndPanic(err) util.PrintStackAndPanic(err)
} }
if s.Error.Valid { if s.Error.Valid {
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL, s.Error.String) logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL.String(), s.Error.String)
} }
logging.LogDebug("[%d] Done %d/%d.", id, i, total) logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
} }
logging.LogInfo("[%d] Worker done.", id) logging.LogInfo("[%d] Worker done.", id)
} }
func handleRedirection(tx *sqlx.Tx, s *Snapshot) error {
re := regexp.MustCompile(`gemini://\S+`)
matches := re.FindStringSubmatch(s.Error.ValueOrZero())
if len(matches) == 1 {
newURL := matches[0]
logging.LogDebug("Page redirects to %s", newURL)
_url, err := ParseURL(newURL, "")
// Insert fresh snapshot with new URL
if err == nil {
snapshot := &Snapshot{
UID: uid.UID(),
URL: *_url,
Host: _url.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}
err := SaveSnapshotToDBIfNotExists(tx, snapshot)
if err != nil {
return err
}
}
}
return nil
}
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
if IsBlacklisted(s.URL) {
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
return nil
}
// If URL matches a robots.txt disallow line, // If URL matches a robots.txt disallow line,
// add it as an error so next time it won't be // add it as an error so next time it won't be
// crawled. // crawled.
if RobotMatch(s) { if RobotMatch(s.URL) {
s.Error = null.StringFrom("robots.txt disallow match") s.Error = null.StringFrom("robots.txt disallow match")
err = SaveSnapshotToDB(tx, s) err = SaveSnapshotToDB(tx, s)
if err != nil { if err != nil {
@@ -92,14 +141,19 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
return nil return nil
} }
defer func() {
time.Sleep(5 * time.Second)
RemoveIPsFromPool(IPs)
}()
// If the host's ip is in the connections pool, // If the host's ip is in the connections pool,
// stop and add the url in the queue later. // stop and add the url in the queue later.
IpPool.Lock.RLock() IpPool.Lock.RLock()
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL) logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
for _, ip := range IPs { for _, ip := range IPs {
_, ok := IpPool.IPs[ip] _, ok := IpPool.IPs[ip]
if ok { if ok {
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL) logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
IpPool.Lock.RUnlock() IpPool.Lock.RUnlock()
time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
return nil return nil
@@ -111,15 +165,26 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
url := s.URL.String() url := s.URL.String()
logging.LogDebug("[%d] Dialing %s", id, url) logging.LogDebug("[%d] Dialing %s", id, url)
Visit(s) err = Visit(s)
if err != nil {
if !IsKnownError(err) {
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
if config.CONFIG.PanicOnUnexpectedError {
util.PrintStackAndPanic(err)
}
} else {
s.Error = null.StringFrom(err.Error())
}
if errors.As(err, new(*ErrGeminiStatusCode)) {
err = handleRedirection(tx, s)
if err != nil {
return err
}
}
}
logging.LogDebug("[%d] Finished dialing.", id) logging.LogDebug("[%d] Finished dialing.", id)
go func() { if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
time.Sleep(5 * time.Second)
RemoveIPsFromPool(IPs)
}()
if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
logging.LogDebug("[%d] [%s] Processing", id, url) logging.LogDebug("[%d] [%s] Processing", id, url)
s = ProcessGemini(s) s = ProcessGemini(s)
} }
@@ -158,7 +223,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
} }
// Should we save the given URL for crawling? // Should we save the given URL for crawling?
func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool { func shouldPersistURL(tx *sqlx.Tx, u URL) bool {
if !strings.HasPrefix(u.String(), "gemini://") { if !strings.HasPrefix(u.String(), "gemini://") {
return false return false
} }
@@ -205,3 +270,18 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
} }
return snapshots, nil return snapshots, nil
} }
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) {
query := `
SELECT *
FROM snapshots
WHERE url=$1
LIMIT 1
`
var snapshots []Snapshot
err := tx.Select(&snapshots, query, url)
if err != nil {
return nil, err
}
return snapshots, nil
}

6
go.mod
View File

@@ -8,16 +8,22 @@ require (
github.com/jmoiron/sqlx v1.4.0 github.com/jmoiron/sqlx v1.4.0
github.com/matoous/go-nanoid/v2 v2.1.0 github.com/matoous/go-nanoid/v2 v2.1.0
github.com/rs/zerolog v1.33.0 github.com/rs/zerolog v1.33.0
github.com/stretchr/testify v1.9.0
golang.org/x/text v0.19.0 golang.org/x/text v0.19.0
) )
require ( require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-isatty v0.0.20 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.13.1 // indirect
golang.org/x/crypto v0.27.0 // indirect golang.org/x/crypto v0.27.0 // indirect
golang.org/x/sync v0.8.0 // indirect golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.25.0 // indirect golang.org/x/sys v0.25.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
) )

9
go.sum
View File

@@ -1,6 +1,7 @@
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -19,6 +20,10 @@ github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE= github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE=
@@ -34,6 +39,8 @@ github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxU
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
@@ -54,6 +61,8 @@ golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

20
main.go
View File

@@ -1,15 +1,14 @@
package main package main
import ( import (
"gemini-grc/config"
"gemini-grc/gemini"
"gemini-grc/logging"
"os" "os"
"os/signal" "os/signal"
"syscall" "syscall"
"gemini-grc/config"
"gemini-grc/gemini"
"gemini-grc/logging"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
"github.com/rs/zerolog" "github.com/rs/zerolog"
zlog "github.com/rs/zerolog/log" zlog "github.com/rs/zerolog/log"
) )
@@ -44,11 +43,14 @@ func runApp() error {
} }
}(db) }(db)
// if len(os.Args) > 1 { gemini.LoadBlacklist()
// url := os.Args[1]
// } if len(os.Args) > 1 {
// os.Exit(1) url := os.Args[1]
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db) go gemini.RunWorker(0, db, &url)
} else {
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
}
<-sigs <-sigs
logging.LogInfo("Received SIGINT or SIGTERM signal, exiting") logging.LogInfo("Received SIGINT or SIGTERM signal, exiting")

View File

@@ -1,14 +1,14 @@
package uid package uid
import ( import (
nanoid "github.com/jaevor/go-nanoid" nanoid "github.com/matoous/go-nanoid/v2"
) )
func UID() string { func UID() string {
// Missing o,O and l // No 'o','O' and 'l'
uid, err := nanoid.CustomASCII("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20) id, err := nanoid.Generate("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20)
if err != nil { if err != nil {
panic(err) panic(err)
} }
return uid() return id
} }