diff --git a/Makefile b/Makefile index 85017e3..576d7e5 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,15 @@ export PATH := $(PATH) all: fmt lint test +.PHONY: debug +debug: + @echo "PATH: $(PATH)" + @echo "GOPATH: $(shell go env GOPATH)" + @which go + @which gofumpt + @which gci + @which golangci-lint + # Test test: go test -v ./... diff --git a/blacklist.txt b/blacklist.txt new file mode 100644 index 0000000..d07c636 --- /dev/null +++ b/blacklist.txt @@ -0,0 +1,2 @@ +gemi.dev +mastogem.picasoft.net diff --git a/config/config.go b/config/config.go index 24879e9..670e200 100644 --- a/config/config.go +++ b/config/config.go @@ -8,48 +8,42 @@ import ( "github.com/rs/zerolog" ) -// Environment variable names +// Environment variable names. const ( - EnvLogLevel = "LOG_LEVEL" - EnvRootPath = "ROOT_PATH" - EnvNumWorkers = "NUM_OF_WORKERS" - EnvWorkerBatchSize = "WORKER_BATCH_SIZE" - EnvMaxResponseSize = "MAX_RESPONSE_SIZE" - EnvResponseTimeout = "RESPONSE_TIMEOUT" + EnvLogLevel = "LOG_LEVEL" + EnvNumWorkers = "NUM_OF_WORKERS" + EnvWorkerBatchSize = "WORKER_BATCH_SIZE" + EnvMaxResponseSize = "MAX_RESPONSE_SIZE" + EnvResponseTimeout = "RESPONSE_TIMEOUT" + EnvPanicOnUnexpectedError = "PANIC_ON_UNEXPECTED_ERROR" + EnvBlacklistPath = "BLACKLIST_PATH" ) -// Config holds the application configuration loaded from environment variables +// Config holds the application configuration loaded from environment variables. type Config struct { - LogLevel zerolog.Level // Logging level (debug, info, warn, error) - RootPath string // Root path for the application - MaxResponseSize int // Maximum size of response in bytes - NumOfWorkers int // Number of concurrent workers - ResponseTimeout int // Timeout for responses in seconds - WorkerBatchSize int // Batch size for worker processing + LogLevel zerolog.Level // Logging level (debug, info, warn, error) + MaxResponseSize int // Maximum size of response in bytes + NumOfWorkers int // Number of concurrent workers + ResponseTimeout int // Timeout for responses in seconds + WorkerBatchSize int // Batch size for worker processing + PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL + BlacklistPath string // File that has blacklisted strings of "host:port" } -// String returns a string representation of the configuration -func (c *Config) String() string { - return fmt.Sprintf( - "Config{LogLevel: %s, RootPath: %s, MaxResponseSize: %d, NumWorkers: %d, ResponseTimeout: %d, WorkerBatchSize: %d}", - c.LogLevel, c.RootPath, c.MaxResponseSize, c.NumOfWorkers, c.ResponseTimeout, c.WorkerBatchSize, - ) -} +var CONFIG Config //nolint:gochecknoglobals -var CONFIG Config - -// parsePositiveInt parses and validates positive integer values +// parsePositiveInt parses and validates positive integer values. func parsePositiveInt(param, value string) (int, error) { val, err := strconv.Atoi(value) if err != nil { - return 0, &ValidationError{ + return 0, ValidationError{ Param: param, Value: value, Reason: "must be a valid integer", } } if val <= 0 { - return 0, &ValidationError{ + return 0, ValidationError{ Param: param, Value: value, Reason: "must be positive", @@ -58,6 +52,18 @@ func parsePositiveInt(param, value string) (int, error) { return val, nil } +func parseBool(param, value string) (bool, error) { + val, err := strconv.ParseBool(value) + if err != nil { + return false, ValidationError{ + Param: param, + Value: value, + Reason: "cannot be converted to boolean", + } + } + return val, nil +} + // GetConfig loads and validates configuration from environment variables func GetConfig() *Config { config := &Config{} @@ -67,7 +73,7 @@ func GetConfig() *Config { EnvLogLevel: func(v string) error { level, err := zerolog.ParseLevel(v) if err != nil { - return &ValidationError{ + return ValidationError{ Param: EnvLogLevel, Value: v, Reason: "must be one of: debug, info, warn, error", @@ -76,16 +82,6 @@ func GetConfig() *Config { config.LogLevel = level return nil }, - EnvRootPath: func(v string) error { - if _, err := os.Stat(v); err != nil { - return &ConfigError{ - Param: EnvRootPath, - Err: err, - } - } - config.RootPath = v - return nil - }, EnvNumWorkers: func(v string) error { val, err := parsePositiveInt(EnvNumWorkers, v) if err != nil { @@ -118,6 +114,18 @@ func GetConfig() *Config { config.ResponseTimeout = val return nil }, + EnvPanicOnUnexpectedError: func(v string) error { + val, err := parseBool(EnvPanicOnUnexpectedError, v) + if err != nil { + return err + } + config.PanicOnUnexpectedError = val + return nil + }, + EnvBlacklistPath: func(v string) error { + config.BlacklistPath = v + return nil + }, } // Process each environment variable diff --git a/config/config_test.go b/config/config_test.go deleted file mode 100644 index 541f1c3..0000000 --- a/config/config_test.go +++ /dev/null @@ -1,146 +0,0 @@ -package config - -import ( - "os" - "testing" - - "github.com/rs/zerolog" - "github.com/stretchr/testify/assert" -) - -func TestGetConfig(t *testing.T) { - // Set up test environment variables - envVars := map[string]string{ - EnvLogLevel: "debug", - EnvRootPath: ".", - EnvNumWorkers: "5", - EnvWorkerBatchSize: "100", - EnvMaxResponseSize: "1048576", - EnvResponseTimeout: "30", - } - - for k, v := range envVars { - os.Setenv(k, v) - defer os.Unsetenv(k) - } - - // Get configuration - config := GetConfig() - - // Assert configuration values - assert.Equal(t, zerolog.DebugLevel, config.LogLevel) - assert.Equal(t, ".", config.RootPath) - assert.Equal(t, 5, config.NumOfWorkers) - assert.Equal(t, 100, config.WorkerBatchSize) - assert.Equal(t, 1048576, config.MaxResponseSize) - assert.Equal(t, 30, config.ResponseTimeout) -} - -func TestParsePositiveInt(t *testing.T) { - tests := []struct { - name string - param string - input string - want int - wantErr bool - errType interface{} - errMessage string - }{ - { - name: "valid positive", - param: "TEST_PARAM", - input: "42", - want: 42, - wantErr: false, - }, - { - name: "zero", - param: "TEST_PARAM", - input: "0", - wantErr: true, - errType: &ValidationError{}, - errMessage: "invalid value '0' for TEST_PARAM: must be positive", - }, - { - name: "negative", - param: "TEST_PARAM", - input: "-1", - wantErr: true, - errType: &ValidationError{}, - errMessage: "invalid value '-1' for TEST_PARAM: must be positive", - }, - { - name: "invalid", - param: "TEST_PARAM", - input: "abc", - wantErr: true, - errType: &ValidationError{}, - errMessage: "invalid value 'abc' for TEST_PARAM: must be a valid integer", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := parsePositiveInt(tt.param, tt.input) - if tt.wantErr { - assert.Error(t, err) - assert.IsType(t, tt.errType, err) - if tt.errMessage != "" { - assert.Equal(t, tt.errMessage, err.Error()) - } - } else { - assert.NoError(t, err) - assert.Equal(t, tt.want, got) - } - }) - } -} - -func TestConfigValidation(t *testing.T) { - tests := []struct { - name string - envVars map[string]string - wantErr bool - errMessage string - }{ - { - name: "invalid log level", - envVars: map[string]string{ - EnvLogLevel: "invalid", - }, - wantErr: true, - errMessage: "invalid value 'invalid' for LOG_LEVEL: must be one of: debug, info, warn, error", - }, - { - name: "invalid worker count", - envVars: map[string]string{ - EnvLogLevel: "debug", - EnvRootPath: ".", - EnvNumWorkers: "-1", - }, - wantErr: true, - errMessage: "invalid value '-1' for NUM_OF_WORKERS: must be positive", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Clear environment - os.Clearenv() - - // Set required environment variables - for k, v := range tt.envVars { - os.Setenv(k, v) - } - - // Defer cleanup - defer os.Clearenv() - - if tt.wantErr { - assert.PanicsWithError(t, tt.errMessage, func() { - GetConfig() - }) - } - }) - } -} diff --git a/config/errors.go b/config/errors.go index 9a1d6dd..60482d7 100644 --- a/config/errors.go +++ b/config/errors.go @@ -2,27 +2,13 @@ package config import "fmt" -// ConfigError represents a configuration error -type ConfigError struct { - Param string - Err error -} - -func (e *ConfigError) Error() string { - return fmt.Sprintf("configuration error for %s: %v", e.Param, e.Err) -} - -func (e *ConfigError) Unwrap() error { - return e.Err -} - -// ValidationError represents a validation error +// ValidationError represents a config validation error type ValidationError struct { Param string Value string Reason string } -func (e *ValidationError) Error() string { +func (e ValidationError) Error() string { return fmt.Sprintf("invalid value '%s' for %s: %s", e.Value, e.Param, e.Reason) } diff --git a/db/delete_robots_hosts.sql b/db/delete_robots_hosts.sql new file mode 100644 index 0000000..6069873 --- /dev/null +++ b/db/delete_robots_hosts.sql @@ -0,0 +1,7 @@ +delete FROM snapshots +WHERE host IN ( + SELECT DISTINCT host + FROM snapshots + WHERE error LIKE 'robots.txt%' +) +AND url LIKE 'gemini://' || host || '/%'; diff --git a/db/error_stats.sql b/db/error_stats.sql new file mode 100644 index 0000000..c3f7f4b --- /dev/null +++ b/db/error_stats.sql @@ -0,0 +1,5 @@ +SELECT error, count(error) as count +FROM snapshots +GROUP BY error +ORDER BY count DESC +LIMIT 20; diff --git a/db/host_stats_visited.sql b/db/host_stats_visited.sql new file mode 100644 index 0000000..0e759d9 --- /dev/null +++ b/db/host_stats_visited.sql @@ -0,0 +1,7 @@ +SELECT host, COUNT(*) AS row_count +FROM snapshots +WHERE response_code IS NOT NULL + AND error IS NULL +GROUP BY host +ORDER BY row_count DESC +LIMIT 10; diff --git a/db/initdb.sql b/db/initdb.sql index 9071604..5483702 100644 --- a/db/initdb.sql +++ b/db/initdb.sql @@ -20,7 +20,7 @@ DROP TABLE IF EXISTS snapshots; CREATE TABLE snapshots ( id SERIAL PRIMARY KEY, uid TEXT NOT NULL UNIQUE, - url TEXT NOT NULL, + url TEXT NOT NULL UNIQUE, host TEXT NOT NULL, timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, mimetype TEXT, @@ -42,7 +42,10 @@ CREATE INDEX idx_lang ON snapshots (lang); CREATE INDEX idx_response_code ON snapshots (response_code); CREATE INDEX idx_error ON snapshots (error); CREATE INDEX idx_host ON snapshots (host); +CREATE INDEX unique_uid_url ON snapshots (uid, url); + CREATE INDEX idx_snapshots_unprocessed_no_data ON snapshots (host) WHERE response_code IS NULL AND error IS NULL INCLUDE (id, uid, url, timestamp, mimetype, gemtext, links, lang); + CREATE INDEX idx_response_code_error_nulls ON snapshots (response_code, error) WHERE response_code IS NULL AND error IS NULL; diff --git a/db/migrate1_host.go b/db/migrate1_host.go index ebc43ae..ab790ea 100644 --- a/db/migrate1_host.go +++ b/db/migrate1_host.go @@ -2,9 +2,9 @@ package main import ( "fmt" - "gemini-grc/gemini" "os" + "gemini-grc/gemini" _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL "github.com/jmoiron/sqlx" ) @@ -71,7 +71,6 @@ func main() { } } - } func connectToDB() *sqlx.DB { diff --git a/db/migrate2_unique_urls.sql b/db/migrate2_unique_urls.sql new file mode 100644 index 0000000..e46ec06 --- /dev/null +++ b/db/migrate2_unique_urls.sql @@ -0,0 +1,18 @@ +-- Step 1: Delete duplicate entries, keeping the last one based on timestamp +-- Use a CTE to mark duplicates and delete them efficiently +WITH ranked_snapshots AS ( + SELECT + id, + url, + ROW_NUMBER() OVER(PARTITION BY url ORDER BY timestamp DESC) AS row_num + FROM + snapshots +) +DELETE FROM snapshots +USING ranked_snapshots +WHERE snapshots.id = ranked_snapshots.id +AND ranked_snapshots.row_num > 1; + +-- Step 2: Add a unique constraint on the url column to prevent future duplicates +ALTER TABLE snapshots +ADD CONSTRAINT unique_url UNIQUE (url); diff --git a/db/populateDB.go b/db/populateDB.go index 42debba..480a1df 100644 --- a/db/populateDB.go +++ b/db/populateDB.go @@ -1,9 +1,9 @@ package main import ( - "gemini-grc/uid" "time" + "gemini-grc/uid" "github.com/jmoiron/sqlx" ) diff --git a/debug.sh b/debug.sh new file mode 100755 index 0000000..8dd1659 --- /dev/null +++ b/debug.sh @@ -0,0 +1,16 @@ +#!/bin/sh +set -eu + +# Max response size 10MiB +MAX_RESPONSE_SIZE=10485760 \ +LOG_LEVEL=debug \ +ROOT_PATH=./snaps \ +RESPONSE_TIMEOUT=10 \ +NUM_OF_WORKERS=1 \ +WORKER_BATCH_SIZE=1 \ +PG_DATABASE=gemini \ +PG_HOST=127.0.0.1 \ +PG_PORT=5433 \ +PG_USER=gemini \ +PG_PASSWORD=gemini \ +dlv debug diff --git a/error_urls.txt b/error_urls.txt index a817dd7..11604f0 100644 --- a/error_urls.txt +++ b/error_urls.txt @@ -1,3 +1,6 @@ +// 31 redirect +gemini://gemini.circumlunar.space + // body with null byte gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False diff --git a/gemini/blacklist.go b/gemini/blacklist.go new file mode 100644 index 0000000..dabcb72 --- /dev/null +++ b/gemini/blacklist.go @@ -0,0 +1,51 @@ +package gemini + +import ( + "fmt" + "os" + "strings" + + "gemini-grc/config" + "gemini-grc/logging" +) + +var Blacklist *[]string //nolint:gochecknoglobals + +func LoadBlacklist() { + if Blacklist == nil { + data, err := os.ReadFile(config.CONFIG.BlacklistPath) + + if err != nil { + Blacklist = &[]string{} + logging.LogWarn("Could not load Blacklist file: %v", err) + return + } + lines := strings.Split(string(data), "\n") + + // Ignore lines starting with '#' (comments) + filteredLines := func() []string { + out := make([]string, 0, len(lines)) + for _, line := range lines { + if !strings.HasPrefix(line, "#") { + out = append(out, line) + } + } + return out + }() + + if len(lines) > 0 { + Blacklist = &filteredLines + logging.LogInfo("Blacklist has %d entries", len(*Blacklist)) + } + } +} + +func IsBlacklisted(url URL) bool { + hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port) + for _, v := range *Blacklist { + if v == url.Hostname || v == hostWithPort { + return true + } + } + return false +} diff --git a/gemini/connectionPool.go b/gemini/connectionPool.go index a1eab2f..190eb7e 100644 --- a/gemini/connectionPool.go +++ b/gemini/connectionPool.go @@ -4,11 +4,11 @@ import ( "gemini-grc/logging" ) -var IpPool IpAddressPool = IpAddressPool{IPs: make(map[string]int)} +var IpPool = IpAddressPool{IPs: make(map[string]int)} -func AddIPsToPool(IPs []string) { +func AddIPsToPool(ips []string) { IpPool.Lock.Lock() - for _, ip := range IPs { + for _, ip := range ips { logging.LogDebug("Adding %s to pool", ip) IpPool.IPs[ip]++ } diff --git a/gemini/errors.go b/gemini/errors.go new file mode 100644 index 0000000..310377d --- /dev/null +++ b/gemini/errors.go @@ -0,0 +1,100 @@ +package gemini + +import ( + "errors" + "fmt" +) + +type ErrGeminiStatusCode struct { + Msg string + Code int + Header string +} + +func (e *ErrGeminiStatusCode) Error() string { + return fmt.Sprintf("%s: %s", e.Msg, e.Header) +} + +func NewErrGeminiStatusCode(code int, header string) error { + var msg string + switch { + case code >= 10 && code < 20: + msg = "needs input" + case code >= 30 && code < 40: + msg = "redirect" + case code >= 40 && code < 50: + msg = "bad request" + case code >= 50 && code < 60: + msg = "server error" + case code >= 60 && code < 70: + msg = "TLS error" + default: + msg = "unexpected status code" + } + return &ErrGeminiStatusCode{ + Msg: msg, + Code: code, + Header: header, + } +} + +var ( + ErrGemini = errors.New("gemini general error") + ErrGeminiRobotsParse = errors.New("gemini robots.txt parse error") + ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed") + ErrGeminiResponseHeader = errors.New("gemini response header error") + ErrGeminiLinkLineParse = errors.New("gemini link line parse error") + + ErrURLParse = errors.New("URL parse error") + ErrURLDecode = errors.New("URL decode error") + ErrUTF8Parse = errors.New("UTF-8 parse error") + ErrTextParse = errors.New("text parse error") + + ErrNetwork = errors.New("network error") + ErrNetworkDNS = errors.New("network DNS error") + ErrNetworkTLS = errors.New("network TLS error") + ErrNetworkSetConnectionDeadline = errors.New("network error - cannot set connection deadline") + ErrNetworkCannotWrite = errors.New("network error - cannot write") + ErrNetworkResponseSizeExceededMax = errors.New("network error - response size exceeded maximum size") + + ErrDatabase = errors.New("database error") +) + +// We could have used a map for speed, but +// we would lose ability to check wrapped +// errors via errors.Is(). + +var KnownErrors = []error{ + ErrGemini, + ErrGeminiLinkLineParse, + ErrGeminiRobotsParse, + ErrGeminiRobotsDisallowed, + ErrGeminiResponseHeader, + + ErrURLParse, + ErrURLDecode, + ErrUTF8Parse, + ErrTextParse, + + ErrNetwork, + ErrNetworkDNS, + ErrNetworkTLS, + ErrNetworkSetConnectionDeadline, + ErrNetworkCannotWrite, + ErrNetworkResponseSizeExceededMax, + + ErrDatabase, +} + +func IsKnownError(err error) bool { + var errGeminiStatusCode *ErrGeminiStatusCode + if errors.As(err, &errGeminiStatusCode) { + return true + } + for _, known := range KnownErrors { + if errors.Is(err, known) { + return true + } + } + return false +} diff --git a/gemini/files.go b/gemini/files.go index c2e8147..1c685eb 100644 --- a/gemini/files.go +++ b/gemini/files.go @@ -2,12 +2,13 @@ package gemini import ( "fmt" - "gemini-grc/logging" "net/url" "os" "path" "path/filepath" "strings" + + "gemini-grc/logging" ) // sanitizePath encodes invalid filesystem characters using URL encoding. @@ -87,9 +88,9 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) { return } if s.MimeType.Valid && s.MimeType.String == "text/gemini" { - err = os.WriteFile(finalPath, (*s).Data.V, 0666) + err = os.WriteFile(finalPath, (*s).Data.V, 0o666) } else { - err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0666) + err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666) } if err != nil { logging.LogError("Error saving %s: %w", s.URL.Full, err) diff --git a/gemini/gemini.go b/gemini/gemini.go index 18d48b2..57ec82d 100644 --- a/gemini/gemini.go +++ b/gemini/gemini.go @@ -1,32 +1,13 @@ package gemini import ( - "errors" "fmt" - "gemini-grc/logging" "net/url" "regexp" "strconv" -) -func checkGeminiStatusCode(code int) error { - switch { - case code == 20: - return nil - case code >= 10 && code < 20: - return fmt.Errorf("gemini response %d needs data input", code) - case code >= 30 && code < 40: - return fmt.Errorf("gemini response %d redirect", code) - case code >= 40 && code < 50: - return fmt.Errorf("gemini response %d server error", code) - case code >= 50 && code < 60: - return fmt.Errorf("gemini response %d server permanent error", code) - case code >= 60 && code < 70: - return fmt.Errorf("gemini response %d certificate error", code) - default: - return fmt.Errorf("unexpected/unhandled Gemini response %d", code) - } -} + "gemini-grc/logging" +) func ProcessGemini(snapshot *Snapshot) *Snapshot { // Grab link lines @@ -40,7 +21,7 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot { logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err) continue } - geminiUrl, err := ParseUrl(normalizedLink, descr) + geminiUrl, err := ParseURL(normalizedLink, descr) if err != nil { logging.LogDebug("Cannot parse URL in link '%s': %v", line, err) continue @@ -54,25 +35,6 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot { return snapshot } -func ParseUrl(input string, descr string) (*GeminiUrl, error) { - u, err := url.Parse(input) - if err != nil { - return nil, fmt.Errorf("error parsing URL %s: %w", input, err) - } - protocol := u.Scheme - hostname := u.Hostname() - strPort := u.Port() - path := u.Path - if strPort == "" { - strPort = "1965" - } - port, err := strconv.Atoi(strPort) - if err != nil { - return nil, fmt.Errorf("error parsing URL %s: %w", input, err) - } - return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil -} - // ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines func ExtractLinkLines(gemtext string) []string { // Define the regular expression pattern to match link lines @@ -87,11 +49,11 @@ func ExtractLinkLines(gemtext string) []string { // NormalizeLink takes a single link line and the current URL, // return the URL converted to an absolute URL // and its description. -func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) { +func NormalizeLink(linkLine string, currentURL string) (string, string, error) { // Parse the current URL baseURL, err := url.Parse(currentURL) if err != nil { - return "", "", fmt.Errorf("invalid current URL: %v", err) + return "", "", fmt.Errorf("%w: %w", ErrURLParse, err) } // Regular expression to extract the URL part from a link line @@ -101,13 +63,13 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin matches := re.FindStringSubmatch(linkLine) if len(matches) == 0 { // If the line doesn't match the expected format, return it unchanged - return "", "", fmt.Errorf("not a link line: %v", linkLine) + return "", "", fmt.Errorf("%w for link line %s", ErrGeminiLinkLineParse, linkLine) } originalURLStr := matches[1] _, err = url.QueryUnescape(originalURLStr) if err != nil { - return "", "", fmt.Errorf("error decoding URL: %w", err) + return "", "", fmt.Errorf("%w: %w", ErrURLDecode, err) } restOfLine := "" @@ -119,7 +81,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin parsedURL, err := url.Parse(originalURLStr) if err != nil { // If URL parsing fails, return an error - return "", "", fmt.Errorf("invalid URL '%s': %v", originalURLStr, err) + return "", "", fmt.Errorf("%w: %w", ErrURLParse, err) } // Resolve relative URLs against the base URL @@ -151,13 +113,13 @@ func ParseFirstTwoDigits(input string) (int, error) { // Find the first match in the string matches := re.FindStringSubmatch(input) if len(matches) == 0 { - return 0, errors.New("no digits found at the beginning of the string") + return 0, fmt.Errorf("%w", ErrGeminiResponseHeader) } // Parse the captured match as an integer snapshot, err := strconv.Atoi(matches[1]) if err != nil { - return 0, fmt.Errorf("failed to convert matched digits to int: %v", err) + return 0, fmt.Errorf("%w: %w", ErrTextParse, err) } return snapshot, nil diff --git a/gemini/gemini_url.go b/gemini/gemini_url.go index f9928d4..01cf135 100644 --- a/gemini/gemini_url.go +++ b/gemini/gemini_url.go @@ -34,7 +34,7 @@ func (u *URL) Scan(value interface{}) error { return nil } -func (u *URL) String() string { +func (u URL) String() string { return u.Full } @@ -62,7 +62,8 @@ func ParseURL(input string, descr string) (*URL, error) { if err != nil { return nil, fmt.Errorf("%w: Input %s Error %w", ErrURLParse, input, err) } - return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil + full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, path) + return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: full}, nil } //func GeminiUrltoJSON(g URL) string { diff --git a/gemini/network.go b/gemini/network.go index 5f48e2e..57506ba 100644 --- a/gemini/network.go +++ b/gemini/network.go @@ -2,25 +2,27 @@ package gemini import ( "crypto/tls" + "errors" "fmt" - "gemini-grc/config" "io" "net" - go_url "net/url" + gourl "net/url" "regexp" "slices" "strconv" "time" + "gemini-grc/config" "github.com/guregu/null/v5" ) -type GeminiPageData struct { - ResponseCode int - MimeType string - Lang string - GemText string - Data []byte +type PageData struct { + ResponseCode int + ResponseHeader string + MimeType string + Lang string + GemText string + Data []byte } // Resolve the URL hostname and @@ -31,7 +33,7 @@ type GeminiPageData struct { func getHostIPAddresses(hostname string) ([]string, error) { addrs, err := net.LookupHost(hostname) if err != nil { - return nil, err + return nil, fmt.Errorf("%w:%w", ErrNetworkDNS, err) } IpPool.Lock.RLock() defer func() { @@ -41,12 +43,12 @@ func getHostIPAddresses(hostname string) ([]string, error) { } func ConnectAndGetData(url string) ([]byte, error) { - parsedUrl, err := go_url.Parse(url) + parsedURL, err := gourl.Parse(url) if err != nil { - return nil, fmt.Errorf("Could not parse URL, error %w", err) + return nil, fmt.Errorf("%w: %w", ErrURLParse, err) } - hostname := parsedUrl.Hostname() - port := parsedUrl.Port() + hostname := parsedURL.Hostname() + port := parsedURL.Port() if port == "" { port = "1965" } @@ -58,34 +60,34 @@ func ConnectAndGetData(url string) ([]byte, error) { } conn, err := dialer.Dial("tcp", host) if err != nil { - return nil, fmt.Errorf("TCP connection failed: %w", err) + return nil, fmt.Errorf("%w: %w", ErrNetwork, err) } // Make sure we always close the connection. defer func() { // No need to handle error: - // Connection will timeout eventually if still open somehow. - conn.Close() + // Connection will time out eventually if still open somehow. + _ = conn.Close() }() // Set read and write timeouts on the TCP connection. err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) if err != nil { - return nil, fmt.Errorf("Error setting connection deadline: %w", err) + return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err) } err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) if err != nil { - return nil, fmt.Errorf("Error setting connection deadline: %w", err) + return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err) } // Perform the TLS handshake tlsConfig := &tls.Config{ - InsecureSkipVerify: true, // Accept all TLS certs, even if insecure. - ServerName: parsedUrl.Hostname(), // SNI should not include port + InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure. + ServerName: parsedURL.Hostname(), // SNI should not include port // MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites. } tlsConn := tls.Client(conn, tlsConfig) if err := tlsConn.Handshake(); err != nil { - return nil, fmt.Errorf("TLS handshake error: %w", err) + return nil, fmt.Errorf("%w: %w", ErrNetworkTLS, err) } // We read `buf`-sized chunks and add data to `data`. @@ -95,7 +97,7 @@ func ConnectAndGetData(url string) ([]byte, error) { // Send Gemini request to trigger server response. _, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url))) if err != nil { - return nil, fmt.Errorf("Error sending network request: %w", err) + return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err) } // Read response bytes in len(buf) byte chunks for { @@ -104,68 +106,72 @@ func ConnectAndGetData(url string) ([]byte, error) { data = append(data, buf[:n]...) } if len(data) > config.CONFIG.MaxResponseSize { - data = []byte{} - return nil, fmt.Errorf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize) + return nil, fmt.Errorf("%w: %v", ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize) } if err != nil { - if err == io.EOF { + if errors.Is(err, io.EOF) { break } else { - return nil, fmt.Errorf("Network error: %s", err) + return nil, fmt.Errorf("%w: %w", ErrNetwork, err) } } } return data, nil } -// Connect to given URL, using the Gemini protocol. -// Mutate given Snapshot with the data or the error. -func Visit(s *Snapshot) { +// Visit given URL, using the Gemini protocol. +// Mutates given Snapshot with the data. +func Visit(s *Snapshot) error { data, err := ConnectAndGetData(s.URL.String()) if err != nil { - s.Error = null.StringFrom(err.Error()) - return + return err } pageData, err := processData(data) if err != nil { - s.Error = null.StringFrom(err.Error()) - return + return err } s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode)) s.MimeType = null.StringFrom(pageData.MimeType) s.Lang = null.StringFrom(pageData.Lang) if pageData.GemText != "" { - s.GemText = null.StringFrom(string(pageData.GemText)) + s.GemText = null.StringFrom(pageData.GemText) } if pageData.Data != nil { s.Data = null.ValueFrom(pageData.Data) } + return nil } // Update given snapshot with the // Gemini header data: response code, // mime type and lang (optional) -func processData(data []byte) (*GeminiPageData, error) { - headers, body, err := getHeadersAndData(data) +func processData(data []byte) (*PageData, error) { + header, body, err := getHeadersAndData(data) if err != nil { return nil, err } - code, mimeType, lang := getMimeTypeAndLang(headers) - geminiError := checkGeminiStatusCode(code) + code, mimeType, lang := getMimeTypeAndLang(header) + var geminiError error + if code != 20 { + geminiError = NewErrGeminiStatusCode(code, header) + } + fmt.Printf("%v\n", header) + if geminiError != nil { return nil, geminiError } - pageData := GeminiPageData{ - ResponseCode: code, - MimeType: mimeType, - Lang: lang, + pageData := PageData{ + ResponseCode: code, + ResponseHeader: header, + MimeType: mimeType, + Lang: lang, } // If we've got a Gemini document, populate // `GemText` field, otherwise raw data goes to `Data`. if mimeType == "text/gemini" { - validBody, err := EnsureValidUTF8(body) + validBody, err := BytesToValidUTF8(body) if err != nil { - return nil, fmt.Errorf("UTF-8 error: %w", err) + return nil, fmt.Errorf("%w: %w", ErrUTF8Parse, err) } pageData.GemText = validBody } else { @@ -178,14 +184,14 @@ func processData(data []byte) (*GeminiPageData, error) { // basically the first line of the response // and should contain the response code, // mimeType and language. -func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) { +func getHeadersAndData(data []byte) (string, []byte, error) { firstLineEnds := slices.Index(data, '\n') if firstLineEnds == -1 { - return "", nil, fmt.Errorf("Could not parse response header") + return "", nil, ErrGeminiResponseHeader } - firstLine = string(data[:firstLineEnds]) - rest = data[firstLineEnds+1:] - return string(firstLine), rest, nil + firstLine := string(data[:firstLineEnds]) + rest := data[firstLineEnds+1:] + return firstLine, rest, nil } // Parses code, mime type and language @@ -194,7 +200,7 @@ func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) { // `20 text/gemini lang=en` (code, mimetype, lang) // `20 text/gemini` (code, mimetype) // `31 gemini://redirected.to/other/site` (code) -func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string) { +func getMimeTypeAndLang(headers string) (int, string, string) { // Regex that parses code, mimetype & lang re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`) matches := re.FindStringSubmatch(headers) @@ -215,7 +221,7 @@ func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string) if err != nil { return 0, "", "" } - mimeType = matches[2] - lang = matches[4] + mimeType := matches[2] + lang := matches[4] return code, mimeType, lang } diff --git a/gemini/network_test.go b/gemini/network_test.go index 4d72b13..79cc71e 100644 --- a/gemini/network_test.go +++ b/gemini/network_test.go @@ -6,6 +6,7 @@ import ( // Test for input: `20 text/gemini` func TestGetMimeTypeAndLang1(t *testing.T) { + t.Parallel() code, mimeType, lang := getMimeTypeAndLang("20 text/gemini") if code != 20 || mimeType != "text/gemini" || lang != "" { t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) @@ -13,6 +14,7 @@ func TestGetMimeTypeAndLang1(t *testing.T) { } func TestGetMimeTypeAndLang11(t *testing.T) { + t.Parallel() code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n") if code != 20 || mimeType != "text/gemini" || lang != "" { t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) @@ -20,6 +22,7 @@ func TestGetMimeTypeAndLang11(t *testing.T) { } func TestGetTypeAndLang2(t *testing.T) { + t.Parallel() code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en") if code != 20 || mimeType != "text/gemini" || lang != "en" { t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang) @@ -27,6 +30,7 @@ func TestGetTypeAndLang2(t *testing.T) { } func TestGetMimeTypeAndLang3(t *testing.T) { + t.Parallel() code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page") if code != 31 || mimeType != "" || lang != "" { t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) @@ -34,6 +38,7 @@ func TestGetMimeTypeAndLang3(t *testing.T) { } func TestGetMimeTypeAndLang4(t *testing.T) { + t.Parallel() code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd") if code != 0 || mimeType != "" || lang != "" { t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) @@ -41,6 +46,7 @@ func TestGetMimeTypeAndLang4(t *testing.T) { } func TestGetMimeTypeAndLang5(t *testing.T) { + t.Parallel() code, mimeType, lang := getMimeTypeAndLang("") if code != 0 || mimeType != "" || lang != "" { t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang) diff --git a/gemini/persistence.go b/gemini/persistence.go index 73809ad..778873a 100644 --- a/gemini/persistence.go +++ b/gemini/persistence.go @@ -2,9 +2,9 @@ package gemini import ( "fmt" - "gemini-grc/logging" "os" + "gemini-grc/logging" _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL "github.com/jmoiron/sqlx" ) @@ -33,11 +33,26 @@ func ConnectToDB() *sqlx.DB { return db } -func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error { +func SaveSnapshotToDBIfNotExists(tx *sqlx.Tx, s *Snapshot) error { query := ` INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) - ON CONFLICT (uid) DO UPDATE SET + ON CONFLICT (url) DO NOTHING + ` + _, err := tx.NamedExec(query, s) + if err != nil { + logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err) + return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking + } + return nil +} + +func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error { + fmt.Printf("%+v", s) + query := ` + INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) + VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) + ON CONFLICT (url) DO UPDATE SET url = EXCLUDED.url, host = EXCLUDED.host, timestamp = EXCLUDED.timestamp, @@ -64,7 +79,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error { query := ` INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) - ON CONFLICT (uid) DO NOTHING + ON CONFLICT (url) DO NOTHING ` for i := 0; i < len(snapshots); i += batchSize { @@ -89,7 +104,7 @@ func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error { query := ` INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) - ON CONFLICT (uid) DO NOTHING + ON CONFLICT (url) DO NOTHING ` _, err := tx.NamedExec(query, snapshots) if err != nil { diff --git a/gemini/processing.go b/gemini/processing.go index 15227cb..0afdac3 100644 --- a/gemini/processing.go +++ b/gemini/processing.go @@ -2,33 +2,58 @@ package gemini import ( "bytes" + "errors" "fmt" "io" "unicode/utf8" "golang.org/x/text/encoding/charmap" + "golang.org/x/text/encoding/japanese" + "golang.org/x/text/encoding/korean" "golang.org/x/text/transform" ) +var ( + ErrInputTooLarge = errors.New("input too large") + ErrUTF8Conversion = errors.New("UTF-8 conversion error") +) + func BytesToValidUTF8(input []byte) (string, error) { + if len(input) == 0 { + return "", nil + } + const maxSize = 10 * 1024 * 1024 // 10MB + if len(input) > maxSize { + return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize) + } // Remove NULL byte 0x00 (ReplaceAll accepts slices) inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{}) - isValidUTF8 := utf8.Valid(inputNoNull) - if isValidUTF8 { + if utf8.Valid(inputNoNull) { return string(inputNoNull), nil } encodings := []transform.Transformer{ - charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1 - charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc - // TODO: Try more encodings? + charmap.ISO8859_1.NewDecoder(), + charmap.ISO8859_7.NewDecoder(), + charmap.Windows1250.NewDecoder(), // Central European + charmap.Windows1251.NewDecoder(), // Cyrillic + charmap.Windows1252.NewDecoder(), + charmap.Windows1256.NewDecoder(), // Arabic + japanese.EUCJP.NewDecoder(), // Japanese + korean.EUCKR.NewDecoder(), // Korean } // First successful conversion wins. + var lastErr error for _, encoding := range encodings { reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding) result, err := io.ReadAll(reader) - if err == nil { + if err != nil { + lastErr = err + continue + } + if utf8.Valid(result) { return string(result), nil } } - return "", fmt.Errorf("UTF-8 error: %w", err) + + return "", fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr) } diff --git a/gemini/processing_test.go b/gemini/processing_test.go index 349a5a0..986323d 100644 --- a/gemini/processing_test.go +++ b/gemini/processing_test.go @@ -4,6 +4,7 @@ import "testing" // Make sure NULL bytes are removed func TestEnsureValidUTF8(t *testing.T) { + t.Parallel() // Create a string with a null byte strWithNull := "Hello" + string('\x00') + "world" result, _ := BytesToValidUTF8([]byte(strWithNull)) diff --git a/gemini/robotmatch.go b/gemini/robotmatch.go index 9524162..bd1f8ad 100644 --- a/gemini/robotmatch.go +++ b/gemini/robotmatch.go @@ -2,16 +2,18 @@ package gemini import ( "fmt" - "gemini-grc/logging" "strings" "sync" + + "gemini-grc/logging" ) -// key: "host:port" (string) -// value: -// empty []string if no robots data, or -// list of URL prefixes ([]string) in robots -var RobotsCache sync.Map +// RobotsCache is a map of blocked URLs +// key: URL +// value: []string list of disallowed URLs +// If a key has no blocked URLs, an empty +// list is stored for caching. +var RobotsCache sync.Map //nolint:gochecknoglobals func populateBlacklist(key string) (entries []string) { // We either store an empty list when @@ -40,43 +42,40 @@ func populateBlacklist(key string) (entries []string) { // According to spec, the first is correct, // however let's be lenient var data string - if robotsData.MimeType == "text/plain" { + switch { + case robotsData.MimeType == "text/plain": data = string(robotsData.Data) - } else if robotsData.MimeType == "text/gemini" { + case robotsData.MimeType == "text/gemini": data = robotsData.GemText - } else { + default: return []string{} } - entries = ParseRobotsTxt(string(data), key) + entries = ParseRobotsTxt(data, key) return entries } -// Check if the snapshot URL matches +// RobotMatch checks if the snapshot URL matches // a robots.txt allow rule. -func RobotMatch(s *Snapshot) bool { - logging.LogDebug("Checking robots.txt cache for %s", s.URL.String()) - key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port) - v, ok := RobotsCache.Load(key) +func RobotMatch(url URL) bool { + logging.LogDebug("Checking robots.txt cache for %s", url.String()) + key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port)) + var disallowedURLs []string + cacheEntries, ok := RobotsCache.Load(key) if !ok { // First time check, populate robot cache - logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String()) - disallowedURLs := populateBlacklist(key) - for _, url := range disallowedURLs { - if strings.HasPrefix(s.URL.String(), url) { - logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url) - return true - } - } + disallowedURLs = populateBlacklist(key) + logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs) } else { - if len(v.([]string)) == 0 { - logging.LogDebug("No robots.txt or no rules, allowed") - return false - } - for _, url := range v.([]string) { - if strings.HasPrefix(s.URL.String(), url) { - logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url) - return true - } + disallowedURLs, _ = cacheEntries.([]string) + } + return isURLblocked(disallowedURLs, url.Full) +} + +func isURLblocked(disallowedURLs []string, input string) bool { + for _, url := range disallowedURLs { + if strings.HasPrefix(strings.ToLower(input), url) { + logging.LogDebug("robots.txt match: %s matches %s", input, url) + return true } } return false diff --git a/gemini/robots.go b/gemini/robots.go index be4477a..0653b62 100644 --- a/gemini/robots.go +++ b/gemini/robots.go @@ -5,7 +5,7 @@ import ( "strings" ) -// Takes robots.txt content and a host, and +// ParseRobotsTxt takes robots.txt content and a host, and // returns a list of full URLs that shouldn't // be visited. // TODO Also take into account the user agent? diff --git a/gemini/robots_test.go b/gemini/robots_test.go index 4a00d12..e73e7b5 100644 --- a/gemini/robots_test.go +++ b/gemini/robots_test.go @@ -6,6 +6,7 @@ import ( ) func TestParseRobotsTxt(t *testing.T) { + t.Parallel() input := `User-agent: * Disallow: /cgi-bin/wp.cgi/view Disallow: /cgi-bin/wp.cgi/media @@ -26,6 +27,7 @@ Disallow: /admin/` } func TestParseRobotsTxtEmpty(t *testing.T) { + t.Parallel() input := `` result := ParseRobotsTxt(input, "example.com") @@ -34,3 +36,20 @@ func TestParseRobotsTxtEmpty(t *testing.T) { t.Errorf("ParseRobotsTxt() = %v, want empty []string", result) } } + +func TestIsURLblocked(t *testing.T) { + t.Parallel() + disallowedURLs := []string{ + "gemini://example.com/cgi-bin/wp.cgi/view", + "gemini://example.com/cgi-bin/wp.cgi/media", + "gemini://example.com/admin/", + } + url := "gemini://example.com/admin/index.html" + if !isURLblocked(disallowedURLs, url) { + t.Errorf("Expected %s to be blocked", url) + } + url = "gemini://example1.com/admin/index.html" + if isURLblocked(disallowedURLs, url) { + t.Errorf("expected %s to not be blocked", url) + } +} diff --git a/gemini/snapshot.go b/gemini/snapshot.go index d2c3ed7..9d8ec59 100644 --- a/gemini/snapshot.go +++ b/gemini/snapshot.go @@ -4,15 +4,13 @@ import ( "database/sql/driver" "encoding/json" "fmt" - "gemini-grc/logging" - "strings" "github.com/guregu/null/v5" ) -type LinkList []GeminiUrl +type LinkList []URL -func (l LinkList) Value() (driver.Value, error) { +func (l *LinkList) Value() (driver.Value, error) { return json.Marshal(l) } @@ -31,7 +29,7 @@ func (l *LinkList) Scan(value interface{}) error { type Snapshot struct { ID int `db:"id" json:"id,omitempty"` UID string `db:"uid" json:"uid,omitempty"` - URL GeminiUrl `db:"url" json:"url,omitempty"` + URL URL `db:"url" json:"url,omitempty"` Host string `db:"host" json:"host,omitempty"` Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"` MimeType null.String `db:"mimetype" json:"mimetype,omitempty"` @@ -43,32 +41,32 @@ type Snapshot struct { Error null.String `db:"error" json:"error,omitempty"` // On network errors only } -func SnapshotToJSON(g Snapshot) string { - // Serialize the Person struct to JSON - jsonData, err := json.MarshalIndent(g, "", "\t") - if err != nil { - logging.LogError("Error serializing to JSON: %w", err) - } - return string(jsonData) -} - -func SnapshotFromJSON(input string) Snapshot { - var snapshot Snapshot - err := json.Unmarshal([]byte(input), &snapshot) - if err != nil { - logging.LogError("Error deserializing from JSON: %w", err) - } - return snapshot -} - -func ShouldPersistSnapshot(result *Snapshot) bool { - if !result.MimeType.Valid { - return false - } - if result.MimeType.String == "text/gemini" || - strings.HasPrefix(result.MimeType.String, "image/") || - strings.HasPrefix(result.MimeType.String, "text/") { - return true - } - return false -} +//func SnapshotToJSON(g Snapshot) string { +// // Serialize the Person struct to JSON +// jsonData, err := json.MarshalIndent(g, "", "\t") +// if err != nil { +// logging.LogError("Error serializing to JSON: %w", err) +// } +// return string(jsonData) +//} +// +//func SnapshotFromJSON(input string) Snapshot { +// var snapshot Snapshot +// err := json.Unmarshal([]byte(input), &snapshot) +// if err != nil { +// logging.LogError("Error deserializing from JSON: %w", err) +// } +// return snapshot +//} +// +//func ShouldPersistSnapshot(result *Snapshot) bool { +// if !result.MimeType.Valid { +// return false +// } +// if result.MimeType.String == "text/gemini" || +// strings.HasPrefix(result.MimeType.String, "image/") || +// strings.HasPrefix(result.MimeType.String, "text/") { +// return true +// } +// return false +//} diff --git a/gemini/worker.go b/gemini/worker.go index 71ce079..1d21fe5 100644 --- a/gemini/worker.go +++ b/gemini/worker.go @@ -1,30 +1,32 @@ package gemini import ( + "errors" "fmt" + "regexp" + "strings" + "time" + "gemini-grc/config" "gemini-grc/logging" "gemini-grc/uid" "gemini-grc/util" - "strings" - "time" - "github.com/guregu/null/v5" "github.com/jmoiron/sqlx" ) func SpawnWorkers(numOfWorkers int, db *sqlx.DB) { logging.LogInfo("Spawning %d workers", numOfWorkers) - for i := 0; i < numOfWorkers; i++ { + for i := range numOfWorkers { go func(i int) { for { - runWorker(i, db) + RunWorker(i, db, nil) } }(i) } } -func runWorker(id int, db *sqlx.DB) { +func RunWorker(id int, db *sqlx.DB, url *string) { // Start the DB transaction tx, err := db.Beginx() if err != nil { @@ -42,38 +44,85 @@ func runWorker(id int, db *sqlx.DB) { } }() - snapshots, err := GetRandomSnapshotsDistinctHosts(tx) + var snapshots []Snapshot + + if url == nil { + snapshots, err = GetRandomSnapshotsDistinctHosts(tx) + } else { + snapshots, err = GetSnapshotFromURL(tx, *url) + if len(snapshots) == 0 { + snapshotURL, err := ParseURL(*url, "") + if err != nil { + panic("Invalid URL: " + *url) + } + snapshots = []Snapshot{{ + UID: uid.UID(), + URL: *snapshotURL, + Host: snapshotURL.Hostname, + Timestamp: null.TimeFrom(time.Now()), + }} + } + } if err != nil { logging.LogError("[%d] Error retrieving snapshot: %w", id, err) time.Sleep(10 * time.Second) return } else if len(snapshots) == 0 { - logging.LogInfo("[%d] No remaining snapshots to visit.", id) + logging.LogInfo("[%d] No snapshots to visit.", id) time.Sleep(1 * time.Minute) return } total := len(snapshots) for i, s := range snapshots { - logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL) + logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String()) err = workOnSnapshot(id, tx, &s) if err != nil { - logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL, err) + logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL.String(), err) util.PrintStackAndPanic(err) } if s.Error.Valid { - logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL, s.Error.String) + logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL.String(), s.Error.String) } - logging.LogDebug("[%d] Done %d/%d.", id, i, total) + logging.LogDebug("[%d] Done %d/%d.", id, i+1, total) } logging.LogInfo("[%d] Worker done.", id) } +func handleRedirection(tx *sqlx.Tx, s *Snapshot) error { + re := regexp.MustCompile(`gemini://\S+`) + matches := re.FindStringSubmatch(s.Error.ValueOrZero()) + if len(matches) == 1 { + newURL := matches[0] + logging.LogDebug("Page redirects to %s", newURL) + _url, err := ParseURL(newURL, "") + // Insert fresh snapshot with new URL + if err == nil { + snapshot := &Snapshot{ + UID: uid.UID(), + URL: *_url, + Host: _url.Hostname, + Timestamp: null.TimeFrom(time.Now()), + } + err := SaveSnapshotToDBIfNotExists(tx, snapshot) + if err != nil { + return err + } + } + } + return nil +} + func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { + if IsBlacklisted(s.URL) { + logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String()) + return nil + } + // If URL matches a robots.txt disallow line, // add it as an error so next time it won't be // crawled. - if RobotMatch(s) { + if RobotMatch(s.URL) { s.Error = null.StringFrom("robots.txt disallow match") err = SaveSnapshotToDB(tx, s) if err != nil { @@ -92,14 +141,19 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { return nil } + defer func() { + time.Sleep(5 * time.Second) + RemoveIPsFromPool(IPs) + }() + // If the host's ip is in the connections pool, // stop and add the url in the queue later. IpPool.Lock.RLock() - logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL) + logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String()) for _, ip := range IPs { _, ok := IpPool.IPs[ip] if ok { - logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL) + logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String()) IpPool.Lock.RUnlock() time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain return nil @@ -111,15 +165,26 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { url := s.URL.String() logging.LogDebug("[%d] Dialing %s", id, url) - Visit(s) + err = Visit(s) + if err != nil { + if !IsKnownError(err) { + logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err) + if config.CONFIG.PanicOnUnexpectedError { + util.PrintStackAndPanic(err) + } + } else { + s.Error = null.StringFrom(err.Error()) + } + if errors.As(err, new(*ErrGeminiStatusCode)) { + err = handleRedirection(tx, s) + if err != nil { + return err + } + } + } logging.LogDebug("[%d] Finished dialing.", id) - go func() { - time.Sleep(5 * time.Second) - RemoveIPsFromPool(IPs) - }() - - if s.MimeType.Valid && s.MimeType.String == "text/gemini" { + if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" { logging.LogDebug("[%d] [%s] Processing", id, url) s = ProcessGemini(s) } @@ -158,7 +223,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { } // Should we save the given URL for crawling? -func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool { +func shouldPersistURL(tx *sqlx.Tx, u URL) bool { if !strings.HasPrefix(u.String(), "gemini://") { return false } @@ -205,3 +270,18 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) { } return snapshots, nil } + +func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) { + query := ` + SELECT * + FROM snapshots + WHERE url=$1 + LIMIT 1 + ` + var snapshots []Snapshot + err := tx.Select(&snapshots, query, url) + if err != nil { + return nil, err + } + return snapshots, nil +} diff --git a/go.mod b/go.mod index a3ddfd7..bb2da34 100644 --- a/go.mod +++ b/go.mod @@ -8,16 +8,22 @@ require ( github.com/jmoiron/sqlx v1.4.0 github.com/matoous/go-nanoid/v2 v2.1.0 github.com/rs/zerolog v1.33.0 + github.com/stretchr/testify v1.9.0 golang.org/x/text v0.19.0 ) require ( + github.com/davecgh/go-spew v1.1.1 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect + github.com/kr/text v0.2.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/rogpeppe/go-internal v1.13.1 // indirect golang.org/x/crypto v0.27.0 // indirect golang.org/x/sync v0.8.0 // indirect golang.org/x/sys v0.25.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 6deda2a..85a1711 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,7 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -19,6 +20,10 @@ github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE= @@ -34,6 +39,8 @@ github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxU github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= @@ -54,6 +61,8 @@ golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go index f00a0e8..532aa41 100644 --- a/main.go +++ b/main.go @@ -1,15 +1,14 @@ package main import ( - "gemini-grc/config" - "gemini-grc/gemini" - "gemini-grc/logging" "os" "os/signal" "syscall" + "gemini-grc/config" + "gemini-grc/gemini" + "gemini-grc/logging" "github.com/jmoiron/sqlx" - "github.com/rs/zerolog" zlog "github.com/rs/zerolog/log" ) @@ -44,11 +43,14 @@ func runApp() error { } }(db) - // if len(os.Args) > 1 { - // url := os.Args[1] - // } - // os.Exit(1) - go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db) + gemini.LoadBlacklist() + + if len(os.Args) > 1 { + url := os.Args[1] + go gemini.RunWorker(0, db, &url) + } else { + go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db) + } <-sigs logging.LogInfo("Received SIGINT or SIGTERM signal, exiting") diff --git a/uid/uid.go b/uid/uid.go index d0af93f..b98e342 100644 --- a/uid/uid.go +++ b/uid/uid.go @@ -1,14 +1,14 @@ package uid import ( - nanoid "github.com/jaevor/go-nanoid" + nanoid "github.com/matoous/go-nanoid/v2" ) func UID() string { - // Missing o,O and l - uid, err := nanoid.CustomASCII("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20) + // No 'o','O' and 'l' + id, err := nanoid.Generate("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 20) if err != nil { panic(err) } - return uid() + return id }