Compare commits

...

14 Commits

Author SHA1 Message Date
bbef81a4ce Update license and readme. 2025-02-26 10:37:37 +02:00
4f47521401 update gitignore 2025-02-26 10:37:20 +02:00
96a39ec3b6 Improve main error handling 2025-02-26 10:37:09 +02:00
54474d45cd Use Go race detector 2025-02-26 10:36:51 +02:00
d306c44f3d Tidy go mod 2025-02-26 10:36:41 +02:00
79e3175467 Add gemget script that downloads Gemini pages 2025-02-26 10:35:54 +02:00
d89dd72fe9 Add Gopherspace crawling! 2025-02-26 10:35:28 +02:00
29877cb2da Simplify host pool 2025-02-26 10:35:11 +02:00
4bceb75695 Reorganize code for more granular imports 2025-02-26 10:34:46 +02:00
a9983f3531 Reorganize errors 2025-02-26 10:32:38 +02:00
5cf720103f Improve blacklist to use regex matching 2025-02-26 10:32:01 +02:00
b6dd77e57e Add regex matching function to util 2025-01-16 22:37:39 +02:00
973a4f3a2d Add tidy & update Makefile targets 2025-01-16 22:37:39 +02:00
b30b7274ec Simplify duplicate code 2025-01-16 22:37:39 +02:00
48 changed files with 3207 additions and 1549 deletions

11
.gitignore vendored
View File

@@ -1,10 +1,17 @@
.idea
.goroot
**/.#* **/.#*
**/*~ **/*~
/.idea
/.goroot
/blacklist.txt
/check.sh
/debug.sh
/run.sh
/.go /.go
/cmd /cmd
/db/initdb.sql /db/initdb.sql
/gemini-grc /gemini-grc
run*.sh run*.sh
/main /main
/db/migration*/**
/db/populate/**
/db/sql/**

14
COPYING
View File

@@ -1,14 +0,0 @@
Copyright (c) Antanst
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

15
LICENSE Normal file
View File

@@ -0,0 +1,15 @@
ISC License
Copyright (c) Antanst 2014-2015
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

View File

@@ -1,10 +1,10 @@
SHELL := /usr/local/bin/oksh SHELL := /bin/env oksh
export PATH := $(PATH) export PATH := $(PATH)
all: fmt lintfix test clean build all: fmt lintfix tidy test clean build
clean: clean:
rm ./main rm -f ./gemini-grc
debug: debug:
@echo "PATH: $(PATH)" @echo "PATH: $(PATH)"
@@ -16,7 +16,10 @@ debug:
# Test # Test
test: test:
go test ./... go test -race ./...
tidy:
go mod tidy
# Format code # Format code
fmt: fmt:
@@ -32,4 +35,13 @@ lintfix: fmt
golangci-lint run --fix golangci-lint run --fix
build: build:
go build ./main.go go build -race -o gemini-grc ./main.go
show-updates:
go list -m -u all
update:
go get -u all
update-patch:
go get -u=patch all

View File

@@ -1,27 +1,83 @@
# gemini-grc # gemini-grc
A crawler for the [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) network. Easily extendable as a "wayback machine" of Gemini. A crawler for the [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) network.
Easily extendable as a "wayback machine" of Gemini.
## Features done ## Features
- [x] URL normalization
- [x] Handle redirects (3X status codes)
- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi - [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
- [x] Save image/* and text/* files - [x] Save image/* and text/* files
- [x] Concurrent downloading with workers - [x] Concurrent downloading with configurable number of workers
- [x] Connection limit per host - [x] Connection limit per host
- [x] URL Blacklist - [x] URL Blacklist
- [x] Configuration via environment variables - [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL - [x] Storing capsule snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation - [x] Proper response header & body UTF-8 and format validation
- [x] Proper URL normalization
- [x] Handle redirects (3X status codes)
## How to run
Spin up a PostgreSQL instance, create the tables using `db/sql/initdb.sql`, and start the crawler.
All configuration is done via environment variables.
## Configuration
Boolean values can be `true`/`false` or `1`/`0`.
```text
LogLevel string // Logging level (debug, info, warn, error)
MaxResponseSize int // Maximum size of response in bytes
NumOfWorkers int // Number of concurrent workers
ResponseTimeout int // Timeout for responses in seconds
WorkerBatchSize int // Batch size for worker processing
PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
BlacklistPath string // File with one blacklist regular expression per line; empty lines and lines starting with # are skipped
DryRun bool // If true, don't write to disk
PrintWorkerStatus bool // If true, print the worker status table instead of logs
```
Example (the blacklist file holds one regular expression per line and can be empty):
```shell
LOG_LEVEL=info \
NUM_OF_WORKERS=10 \
WORKER_BATCH_SIZE=10 \
BLACKLIST_PATH="./blacklist.txt" \
MAX_RESPONSE_SIZE=10485760 \
RESPONSE_TIMEOUT=10 \
PANIC_ON_UNEXPECTED_ERROR=true \
PG_DATABASE=test \
PG_HOST=127.0.0.1 \
PG_MAX_OPEN_CONNECTIONS=100 \
PG_PORT=5434 \
PG_USER=test \
PG_PASSWORD=test \
DRY_RUN=false \
./gemini-grc
```
## Development
Install linters. Check the versions first.
```shell
go install mvdan.cc/gofumpt@v0.7.0
go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.63.4
```
## TODO ## TODO
- [ ] Add snapshot history - [ ] Add snapshot history
- [ ] Add a web interface - [ ] Add a web interface
- [ ] Provide to servers a TLS cert for sites that require it, like Astrobotany - [ ] Provide to servers a TLS cert for sites that require it, like Astrobotany
- [ ] Use pledge/unveil in OpenBSD hosts
## TODO (lower priority) ## TODO (lower priority)
- [ ] Gopher - [ ] Gopher
- [ ] Scroll gemini://auragem.letz.dev/devlog/20240316.gmi - [ ] More? http://dbohdan.sdf.org/smolnet/
- [ ] Spartan
- [ ] Nex ## Notes
- [ ] SuperTXT https://supertxt.net/00-intro.html Good starting points:
gemini://warmedal.se/~antenna/
gemini://tlgs.one/
gopher://i-logout.cz:70/1/bongusta/
gopher://gopher.quux.org:70/

47
bin/gemget/main.go Normal file
View File

@@ -0,0 +1,47 @@
package main
import (
"encoding/json"
"fmt"
"os"
"gemini-grc/common/snapshot"
_url "gemini-grc/common/url"
"gemini-grc/config"
"gemini-grc/errors"
"gemini-grc/gemini"
"gemini-grc/gopher"
"gemini-grc/logging"
)
// main stores the loaded configuration in config.CONFIG and runs the app.
// When runApp fails, the error is printed, logged, and the process exits
// with status 1.
func main() {
	config.CONFIG = *config.GetConfig()
	if err := runApp(); err != nil {
		fmt.Printf("%v\n", err)
		logging.LogError("%v", err)
		os.Exit(1)
	}
}
// runApp visits the URL given as the single command-line argument and prints
// the resulting snapshot to stdout as indented JSON.
//
// It returns an error when the argument count is not exactly one, when the
// URL is neither a Gemini nor a Gopher URL, when the visit itself fails, or
// when the snapshot cannot be serialized to JSON.
func runApp() error {
	if len(os.Args) != 2 {
		return errors.NewError(fmt.Errorf("missing URL to visit"))
	}
	url := os.Args[1]

	var (
		s   *snapshot.Snapshot
		err error
	)
	switch {
	case _url.IsGeminiUrl(url):
		s, err = gemini.Visit(url)
	case _url.IsGopherURL(url):
		s, err = gopher.Visit(url)
	default:
		return errors.NewFatalError(fmt.Errorf("not a Gemini or Gopher URL"))
	}
	if err != nil {
		return err
	}

	// Report serialization failures instead of silently printing nothing useful.
	out, err := json.MarshalIndent(s, "", " ")
	if err != nil {
		return errors.NewError(fmt.Errorf("could not serialize snapshot: %w", err))
	}
	fmt.Printf("%s\n", out)
	return nil
}

View File

@@ -4,7 +4,9 @@ import (
"fmt" "fmt"
"os" "os"
"gemini-grc/gemini" "gemini-grc/common/snapshot"
"gemini-grc/common/url"
main2 "gemini-grc/db"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
@@ -21,7 +23,7 @@ func main() {
ORDER BY id ORDER BY id
LIMIT 10000 OFFSET $1 LIMIT 10000 OFFSET $1
` `
var snapshots []gemini.Snapshot var snapshots []snapshot.Snapshot
err := tx.Select(&snapshots, query, count) err := tx.Select(&snapshots, query, count)
if err != nil { if err != nil {
printErrorAndExit(tx, err) printErrorAndExit(tx, err)
@@ -32,8 +34,8 @@ func main() {
} }
for _, s := range snapshots { for _, s := range snapshots {
count++ count++
escaped := gemini.EscapeURL(s.URL.String()) escaped := url.EscapeURL(s.URL.String())
normalizedGeminiURL, err := gemini.ParseURL(escaped, "") normalizedGeminiURL, err := url.ParseURL(escaped, "", true)
if err != nil { if err != nil {
fmt.Println(s.URL.String()) fmt.Println(s.URL.String())
fmt.Println(escaped) fmt.Println(escaped)
@@ -47,7 +49,7 @@ func main() {
} }
// If a snapshot already exists with the normalized // If a snapshot already exists with the normalized
// URL, delete the current snapshot and leave the other. // URL, delete the current snapshot and leave the other.
var ss []gemini.Snapshot var ss []snapshot.Snapshot
err = tx.Select(&ss, "SELECT * FROM snapshots WHERE URL=$1", normalizedURLString) err = tx.Select(&ss, "SELECT * FROM snapshots WHERE URL=$1", normalizedURLString)
if err != nil { if err != nil {
printErrorAndExit(tx, err) printErrorAndExit(tx, err)
@@ -69,7 +71,7 @@ func main() {
// Saves the snapshot with the normalized URL // Saves the snapshot with the normalized URL
tx.MustExec("DELETE FROM snapshots WHERE id=$1", s.ID) tx.MustExec("DELETE FROM snapshots WHERE id=$1", s.ID)
s.URL = *normalizedGeminiURL s.URL = *normalizedGeminiURL
err = gemini.UpsertSnapshot(0, tx, &s) err = main2.OverwriteSnapshot(tx, &s)
if err != nil { if err != nil {
printErrorAndExit(tx, err) printErrorAndExit(tx, err)
} }

View File

@@ -0,0 +1,55 @@
package blackList
import (
"fmt"
"os"
"regexp"
"strings"
"gemini-grc/config"
"gemini-grc/errors"
"gemini-grc/logging"
)
// Blacklist holds the compiled blacklist patterns checked by IsBlacklisted.
// LoadBlacklist fills it and treats a nil value as "not loaded yet".
var Blacklist []regexp.Regexp //nolint:gochecknoglobals
// LoadBlacklist reads the file at config.CONFIG.BlacklistPath and compiles
// every line that is neither empty nor starts with "#" into a regular
// expression stored in Blacklist.
//
// It does nothing when no path is configured or when Blacklist is already
// non-nil. If the file cannot be read, Blacklist is set to an empty slice and
// an error is returned. If any line fails to compile, an error is returned
// and Blacklist is left untouched, so a partially loaded list is never
// mistaken for a complete one.
func LoadBlacklist() error {
	if config.CONFIG.BlacklistPath == "" || Blacklist != nil {
		return nil
	}
	data, err := os.ReadFile(config.CONFIG.BlacklistPath)
	if err != nil {
		Blacklist = []regexp.Regexp{}
		return errors.NewError(fmt.Errorf("could not load Blacklist file: %w", err))
	}

	// Compile into a local slice first; the global is only replaced once
	// every line has compiled successfully.
	loaded := []regexp.Regexp{}
	for _, line := range strings.Split(string(data), "\n") {
		// Strip surrounding whitespace so CRLF files and padded entries
		// compile to the pattern the author intended.
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		regex, err := regexp.Compile(line)
		if err != nil {
			return errors.NewError(fmt.Errorf("could not compile Blacklist line %s: %w", line, err))
		}
		loaded = append(loaded, *regex)
	}
	Blacklist = loaded
	logging.LogInfo("Loaded %d blacklist entries", len(Blacklist))
	return nil
}
// IsBlacklisted reports whether u matches at least one of the compiled
// patterns currently stored in Blacklist.
func IsBlacklisted(u string) bool {
	for i := range Blacklist {
		if Blacklist[i].MatchString(u) {
			return true
		}
	}
	return false
}

View File

@@ -0,0 +1,295 @@
package blackList
import (
"os"
"regexp"
"testing"
"gemini-grc/config"
)
// TestIsBlacklisted checks IsBlacklisted against hand-built pattern lists.
// Each case installs its own patterns into the package-level Blacklist
// before calling IsBlacklisted.
func TestIsBlacklisted(t *testing.T) {
	// Save original blacklist to restore after test
	originalBlacklist := Blacklist
	defer func() {
		Blacklist = originalBlacklist
	}()

	tests := []struct {
		name     string
		patterns []string
		url      string
		expected bool
	}{
		{
			name:     "empty blacklist",
			patterns: nil,
			url:      "https://example.com",
			expected: false,
		},
		{
			name:     "exact hostname match",
			patterns: []string{`example\.com`},
			url:      "example.com",
			expected: true,
		},
		{
			name:     "hostname in URL match",
			patterns: []string{`example\.com`},
			url:      "https://example.com/path",
			expected: true,
		},
		{
			name:     "partial hostname match",
			patterns: []string{`example\.com`},
			url:      "https://safe-example.com",
			expected: true,
		},
		{
			name:     "full URL match",
			patterns: []string{`https://example\.com/bad-path`},
			url:      "https://example.com/bad-path",
			expected: true,
		},
		{
			name:     "path match",
			patterns: []string{"/malicious-path"},
			url:      "https://example.com/malicious-path",
			expected: true,
		},
		{
			name:     "subdomain match with word boundary",
			patterns: []string{`bad\.example\.com`},
			url:      "https://bad.example.com/path",
			expected: true,
		},
		{
			name:     "multiple patterns, one match",
			patterns: []string{`badsite\.com`, `malicious\.org`, `example\.com/sensitive`},
			url:      "https://example.com/sensitive/data",
			expected: true,
		},
		{
			name:     "multiple patterns, no match",
			patterns: []string{`badsite\.com`, `malicious\.org`, `example\.com/sensitive`},
			url:      "https://example.com/safe/data",
			expected: false,
		},
		{
			name:     "pattern with wildcard",
			patterns: []string{`.*\.evil\.com`},
			url:      "https://subdomain.evil.com/path",
			expected: true,
		},
		{
			name:     "pattern with special characters",
			patterns: []string{`example\.com/path\?id=[0-9]+`},
			url:      "https://example.com/path?id=12345",
			expected: true,
		},
		{
			name:     "unicode character support",
			patterns: []string{`example\.com/[\p{L}]+`},
			url:      "https://example.com/café",
			expected: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// MustCompile fails loudly on a bad fixture pattern instead of
			// dereferencing a nil *Regexp from an ignored Compile error.
			Blacklist = make([]regexp.Regexp, 0, len(tt.patterns))
			for _, p := range tt.patterns {
				Blacklist = append(Blacklist, *regexp.MustCompile(p))
			}
			result := IsBlacklisted(tt.url)
			if result != tt.expected {
				t.Errorf("IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
			}
		})
	}
}
// TestLoadBlacklist exercises LoadBlacklist with an empty path, a valid file
// containing a comment, a file with an invalid regex, and a missing file.
func TestLoadBlacklist(t *testing.T) {
	// Save original blacklist to restore after test
	originalBlacklist := Blacklist
	originalConfigPath := config.CONFIG.BlacklistPath
	defer func() {
		Blacklist = originalBlacklist
		config.CONFIG.BlacklistPath = originalConfigPath
	}()

	// Create a temporary blacklist file for testing
	tmpFile, err := os.CreateTemp("", "blacklist-*.txt")
	if err != nil {
		t.Fatalf("Failed to create temporary file: %v", err)
	}
	tmpPath := tmpFile.Name()
	defer os.Remove(tmpPath)
	// Only the path is used below (os.WriteFile reopens it), so release the
	// handle right away instead of leaking it for the rest of the test.
	if err := tmpFile.Close(); err != nil {
		t.Fatalf("Failed to close temporary file: %v", err)
	}

	// Test cases for LoadBlacklist
	tests := []struct {
		name           string
		blacklistLines []string
		configPath     string
		wantErr        bool
		expectedLen    int
	}{
		{
			name:           "empty path",
			blacklistLines: []string{},
			configPath:     "",
			wantErr:        false,
			expectedLen:    0,
		},
		{
			name:           "valid blacklist with comments",
			blacklistLines: []string{"example\\.com", "# This is a comment", "malicious\\.org"},
			configPath:     tmpPath,
			wantErr:        false,
			expectedLen:    2,
		},
		{
			name:           "invalid regex",
			blacklistLines: []string{"example\\.com", "[invalid regex"},
			configPath:     tmpPath,
			wantErr:        true,
			expectedLen:    0,
		},
		{
			name:           "nonexistent file",
			blacklistLines: []string{},
			configPath:     "nonexistent-file.txt",
			wantErr:        true,
			expectedLen:    0,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Reset blacklist
			Blacklist = nil
			// Set config path
			config.CONFIG.BlacklistPath = tt.configPath

			// Write test data to file if needed
			if tt.configPath == tmpPath {
				content := ""
				for _, line := range tt.blacklistLines {
					content += line + "\n"
				}
				if err := os.WriteFile(tmpPath, []byte(content), 0o644); err != nil {
					t.Fatalf("Failed to write to temporary file: %v", err)
				}
			}

			// Call the function
			err := LoadBlacklist()

			// Check results
			if (err != nil) != tt.wantErr {
				t.Errorf("LoadBlacklist() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !tt.wantErr && len(Blacklist) != tt.expectedLen {
				t.Errorf("LoadBlacklist() loaded %d entries, want %d", len(Blacklist), tt.expectedLen)
			}
		})
	}
}
// TestIsBlacklistedIntegration tests the integration between LoadBlacklist and IsBlacklisted:
// patterns are written to a temporary file, loaded through LoadBlacklist, and
// then a set of URLs is checked with IsBlacklisted.
func TestIsBlacklistedIntegration(t *testing.T) {
	// Save original blacklist to restore after test
	originalBlacklist := Blacklist
	originalConfigPath := config.CONFIG.BlacklistPath
	defer func() {
		Blacklist = originalBlacklist
		config.CONFIG.BlacklistPath = originalConfigPath
	}()

	// Create a temporary blacklist file for testing
	tmpFile, err := os.CreateTemp("", "blacklist-*.txt")
	if err != nil {
		t.Fatalf("Failed to create temporary file: %v", err)
	}
	tmpPath := tmpFile.Name()
	defer os.Remove(tmpPath)
	// Only the path is used below (os.WriteFile reopens it), so release the
	// handle right away instead of leaking it for the rest of the test.
	if err := tmpFile.Close(); err != nil {
		t.Fatalf("Failed to close temporary file: %v", err)
	}

	// Write test patterns to the blacklist file
	blacklistContent := `# Test blacklist file
example\.com
malicious\.org
/phishing
.*\.evil\.com
\w+@spam\.com
`
	if err := os.WriteFile(tmpPath, []byte(blacklistContent), 0o644); err != nil {
		t.Fatalf("Failed to write to temporary file: %v", err)
	}

	// Set up the test
	Blacklist = nil
	config.CONFIG.BlacklistPath = tmpPath

	// Load the blacklist
	if err := LoadBlacklist(); err != nil {
		t.Fatalf("LoadBlacklist() failed: %v", err)
	}

	// Test URLs against the loaded blacklist
	tests := []struct {
		url      string
		expected bool
	}{
		{"https://example.com", true},
		{"https://safe-site.com", false},
		{"https://malicious.org/path", true},
		{"https://example.org/phishing", true},
		{"https://subdomain.evil.com", true},
		{"https://safe-site.com/safe-path", false},
		{"mailto:user@spam.com", true},
	}
	for _, tt := range tests {
		result := IsBlacklisted(tt.url)
		if result != tt.expected {
			t.Errorf("IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
		}
	}
}

View File

@@ -1,106 +0,0 @@
package common
import (
"errors"
"fmt"
)
// GeminiError is the error value produced by NewErrGeminiStatusCode for a
// Gemini response status code.
type GeminiError struct {
	Msg    string // Short description chosen from the status code range.
	Code   int    // Status code as given to the constructor.
	Header string // Header text as given to the constructor.
}

// Error renders the error as "<Msg>: <Header>".
func (e *GeminiError) Error() string {
	return fmt.Sprintf("%s: %s", e.Msg, e.Header)
}

// NewErrGeminiStatusCode builds a *GeminiError for code and header. The
// message is selected by the tens digit of code; any code outside the
// 10-19, 30-39, 40-49, 50-59 and 60-69 ranges gets the generic message.
func NewErrGeminiStatusCode(code int, header string) error {
	description := "unexpected Status code"
	// Integer division groups each two-digit range under its tens digit.
	switch code / 10 {
	case 1:
		description = "needs input"
	case 3:
		description = "redirect"
	case 4:
		description = "bad request"
	case 5:
		description = "server error"
	case 6:
		description = "TLS error"
	}
	return &GeminiError{
		Msg:    description,
		Code:   code,
		Header: header,
	}
}
// Sentinel errors, grouped by area: Gemini protocol handling, URL and text
// parsing, blacklist matching, networking, and database access.
var (
	ErrGeminiRobotsParse      = errors.New("gemini robots.txt parse error")
	ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
	ErrGeminiResponseHeader   = errors.New("gemini response header error")
	ErrGeminiRedirect         = errors.New("gemini redirection error")
	ErrGeminiLinkLineParse    = errors.New("gemini link line parse error")

	ErrURLParse     = errors.New("URL parse error")
	ErrURLNotGemini = errors.New("not a Gemini URL")
	ErrURLDecode    = errors.New("URL decode error")
	ErrUTF8Parse    = errors.New("UTF-8 parse error")
	ErrTextParse    = errors.New("text parse error")

	ErrBlacklistMatches = errors.New("url matches blacklist")

	ErrNetwork                        = errors.New("network error")
	ErrNetworkDNS                     = errors.New("network DNS error")
	ErrNetworkTLS                     = errors.New("network TLS error")
	ErrNetworkSetConnectionDeadline   = errors.New("network error - cannot set connection deadline")
	ErrNetworkCannotWrite             = errors.New("network error - cannot write")
	ErrNetworkResponseSizeExceededMax = errors.New("network error - response size exceeded maximum size")

	ErrDatabase     = errors.New("database error")
	ErrDatabaseScan = errors.New("database scan error")
)
// We could have used a map for speed, but
// we would lose ability to check wrapped
// errors via errors.Is().

// errGemini is declared without a value, so it is a nil *GeminiError.
// NOTE(review): as an errors.Is target it is unlikely to match real errors;
// IsKnownError separately detects GeminiError values via errors.As.
var errGemini *GeminiError

// knownErrors lists the errors that IsKnownError checks with errors.Is.
// NOTE(review): ErrURLNotGemini is declared with the other sentinels but is
// not listed here — confirm whether that is intentional.
var knownErrors = []error{ //nolint:gochecknoglobals
	errGemini,
	ErrGeminiLinkLineParse,
	ErrGeminiRobotsParse,
	ErrGeminiRobotsDisallowed,
	ErrGeminiResponseHeader,
	ErrGeminiRedirect,
	ErrBlacklistMatches,
	ErrURLParse,
	ErrURLDecode,
	ErrUTF8Parse,
	ErrTextParse,
	ErrNetwork,
	ErrNetworkDNS,
	ErrNetworkTLS,
	ErrNetworkSetConnectionDeadline,
	ErrNetworkCannotWrite,
	ErrNetworkResponseSizeExceededMax,
	ErrDatabase,
	ErrDatabaseScan,
}
// IsKnownError reports whether err matches any entry of knownErrors via
// errors.Is, or has a *GeminiError somewhere in its chain.
func IsKnownError(err error) bool {
	for i := range knownErrors {
		if errors.Is(err, knownErrors[i]) {
			return true
		}
	}
	var geminiErr *GeminiError
	return errors.As(err, &geminiErr)
}

41
common/errors/errors.go Normal file
View File

@@ -0,0 +1,41 @@
package errors
import (
"fmt"
"gemini-grc/errors"
)
// HostError wraps an error encountered while visiting a host. Such errors
// should be recorded to the snapshot.
type HostError struct {
	Err error
}

// Error returns the message of the wrapped error.
func (e *HostError) Error() string {
	return e.Err.Error()
}

// Unwrap returns the wrapped error so that error-chain helpers can see it.
func (e *HostError) Unwrap() error {
	return e.Err
}

// NewHostError wraps err in a *HostError.
func NewHostError(err error) error {
	wrapped := HostError{Err: err}
	return &wrapped
}

// IsHostError reports whether err is non-nil and errors.As finds a
// *HostError in its chain.
func IsHostError(err error) bool {
	var target *HostError
	return err != nil && errors.As(err, &target)
}
// Sentinel errors used primarily for their string message.
// They are not meant to be returned on their own; wrap them in a
// HostError (see NewHostError) instead.
var (
	ErrBlacklistMatch = fmt.Errorf("black list match")
	ErrRobotsMatch    = fmt.Errorf("robots match")
)

View File

@@ -0,0 +1,38 @@
package errors_test
import (
"errors"
"fmt"
"testing"
"gemini-grc/gemini"
)
// TestErrGemini checks that the value returned by gemini.NewGeminiError can
// be extracted as a *gemini.GeminiError with errors.As.
func TestErrGemini(t *testing.T) {
	t.Parallel()
	var target *gemini.GeminiError
	if err := gemini.NewGeminiError(50, "50 server error"); !errors.As(err, &target) {
		t.Errorf("TestErrGemini fail")
	}
}
// TestErrGeminiWrapped checks that a *gemini.GeminiError is still found by
// errors.As after being wrapped with %w.
func TestErrGeminiWrapped(t *testing.T) {
	t.Parallel()
	wrapped := fmt.Errorf("%w wrapped", gemini.NewGeminiError(50, "50 server error"))
	var target *gemini.GeminiError
	if !errors.As(wrapped, &target) {
		t.Errorf("TestErrGeminiWrapped fail")
	}
}
// TestIsGeminiError checks gemini.IsGeminiError on a direct error and on the
// same error wrapped with %w.
func TestIsGeminiError(t *testing.T) {
	t.Parallel()
	direct := gemini.NewGeminiError(50, "50 server error")
	wrapped := fmt.Errorf("wrapped %w", direct)
	if !gemini.IsGeminiError(direct) {
		t.Errorf("TestGeminiError fail #1")
	}
	if !gemini.IsGeminiError(wrapped) {
		t.Errorf("TestGeminiError fail #2")
	}
}

View File

@@ -1,25 +0,0 @@
package common_test
import (
"errors"
"fmt"
"gemini-grc/common"
"testing"
)
// TestErrGemini checks that the value returned by
// common.NewErrGeminiStatusCode can be extracted as a *common.GeminiError
// with errors.As.
func TestErrGemini(t *testing.T) {
	t.Parallel()
	var target *common.GeminiError
	if err := common.NewErrGeminiStatusCode(50, "50 server error"); !errors.As(err, &target) {
		t.Errorf("TestErrGemini fail")
	}
}
// TestErrGeminiWrapped checks that a *common.GeminiError is still found by
// errors.As after being wrapped with %w.
func TestErrGeminiWrapped(t *testing.T) {
	t.Parallel()
	wrapped := fmt.Errorf("%w wrapped", common.NewErrGeminiStatusCode(50, "50 server error"))
	var target *common.GeminiError
	if !errors.As(wrapped, &target) {
		t.Errorf("TestErrGeminiWrapped fail")
	}
}

View File

@@ -1,251 +0,0 @@
package common_test
import (
"gemini-grc/common"
"reflect"
"testing"
)
// TestParseURL checks that parsing a Gemini URL without an explicit port
// yields a value that includes port 1965.
func TestParseURL(t *testing.T) {
	t.Parallel()
	input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
	parsed, err := common.ParseURL(input, "")
	// Stop before touching parsed: it cannot be used when parsing failed.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	value, err := parsed.Value()
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	if value != "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162" {
		t.Errorf("fail: %s", parsed)
	}
}
// TestDeriveAbsoluteURL_abs_url_input checks that an absolute URL input is
// returned as its own URL, independent of the current URL.
func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
	t.Parallel()
	base := common.URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	got, err := common.DeriveAbsoluteURL(base, "gemini://a.b/c")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	want := &common.URL{
		Protocol: "gemini",
		Hostname: "a.b",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://a.b:1965/c",
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestDeriveAbsoluteURL_abs_path_input checks that an absolute path input
// keeps the current host and port and replaces the path.
func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
	t.Parallel()
	base := common.URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	got, err := common.DeriveAbsoluteURL(base, "/c")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	want := &common.URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/c",
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestDeriveAbsoluteURL_rel_path_input checks that a relative path input is
// appended to the current URL's path.
func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
	t.Parallel()
	base := common.URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	got, err := common.DeriveAbsoluteURL(base, "c/d")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	want := &common.URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b/c/d",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/a/b/c/d",
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeURLSlash checks that a trailing slash on a path is preserved.
func TestNormalizeURLSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/retro-computing/magazines/"
	normalized, err := common.NormalizeURL(input)
	// Fail with the real cause instead of using an invalid result.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	if output, expected := normalized.String(), input; output != expected {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
// TestNormalizeURLNoSlash checks that no trailing slash is added to a path
// that lacks one.
func TestNormalizeURLNoSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/retro-computing/magazines"
	normalized, err := common.NormalizeURL(input)
	// Fail with the real cause instead of using an invalid result.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	if output, expected := normalized.String(), input; output != expected {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
// TestNormalizeMultiSlash checks that runs of slashes in the path collapse
// to a single slash.
func TestNormalizeMultiSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/retro-computing/////////a///magazines"
	normalized, err := common.NormalizeURL(input)
	// Fail with the real cause instead of using an invalid result.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	expected := "gemini://uscoffings.net/retro-computing/a/magazines"
	if output := normalized.String(); output != expected {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
// TestNormalizeTrailingSlash checks that a host-only URL ending in a slash
// keeps that slash.
func TestNormalizeTrailingSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/"
	normalized, err := common.NormalizeURL(input)
	// Fail with the real cause instead of using an invalid result.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	expected := "gemini://uscoffings.net/"
	if output := normalized.String(); output != expected {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
// TestNormalizeNoTrailingSlash checks that a host-only URL without a slash
// does not gain one.
func TestNormalizeNoTrailingSlash(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net"
	normalized, err := common.NormalizeURL(input)
	// Fail with the real cause instead of using an invalid result.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	expected := "gemini://uscoffings.net"
	if output := normalized.String(); output != expected {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
// TestNormalizeTrailingSlashPath checks that a one-segment path ending in a
// slash keeps that slash.
func TestNormalizeTrailingSlashPath(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/a/"
	normalized, err := common.NormalizeURL(input)
	// Fail with the real cause instead of using an invalid result.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	expected := "gemini://uscoffings.net/a/"
	if output := normalized.String(); output != expected {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
// TestNormalizeNoTrailingSlashPath checks that a one-segment path without a
// trailing slash does not gain one.
func TestNormalizeNoTrailingSlashPath(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/a"
	normalized, err := common.NormalizeURL(input)
	// Fail with the real cause instead of using an invalid result.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	expected := "gemini://uscoffings.net/a"
	if output := normalized.String(); output != expected {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
// TestNormalizeDot checks that "." segments and repeated slashes are removed
// from the path.
func TestNormalizeDot(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net/retro-computing/./././////a///magazines"
	normalized, err := common.NormalizeURL(input)
	// Fail with the real cause instead of using an invalid result.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	expected := "gemini://uscoffings.net/retro-computing/a/magazines"
	if output := normalized.String(); output != expected {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
// TestNormalizePort checks that an explicit port 1965 is dropped from the
// normalized URL.
func TestNormalizePort(t *testing.T) {
	t.Parallel()
	input := "gemini://uscoffings.net:1965/a"
	normalized, err := common.NormalizeURL(input)
	// Fail with the real cause instead of using an invalid result.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	expected := "gemini://uscoffings.net/a"
	if output := normalized.String(); output != expected {
		t.Errorf("fail: %#v != %#v", output, expected)
	}
}
// TestNormalizeURL checks that a non-default port, query strings and
// fragments survive normalization unchanged.
func TestNormalizeURL(t *testing.T) {
	t.Parallel()
	cases := []struct {
		input    string
		expected string
	}{
		{"gemini://chat.gemini.lehmann.cx:11965/", "gemini://chat.gemini.lehmann.cx:11965/"},
		{"gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c", "gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c"},
		{"gemini://chat.gemini.lehmann.cx:11965/index#1", "gemini://chat.gemini.lehmann.cx:11965/index#1"},
		{"gemini://gemi.dev/cgi-bin/xkcd.cgi?1494", "gemini://gemi.dev/cgi-bin/xkcd.cgi?1494"},
	}
	for _, c := range cases {
		normalized, err := common.NormalizeURL(c.input)
		// Report the failure and move on; the result is unusable for this case.
		if err != nil {
			t.Errorf("fail: %v", err)
			continue
		}
		if output := normalized.String(); output != c.expected {
			t.Errorf("fail: %#v != %#v", output, c.expected)
		}
	}
}

View File

@@ -0,0 +1,27 @@
package linkList
import (
"database/sql/driver"
"encoding/json"
"fmt"
"gemini-grc/common/url"
)
// LinkList is a list of URLs that is stored in the database as JSON.
type LinkList []url.URL

// Value implements driver.Valuer by encoding the list as JSON.
func (l *LinkList) Value() (driver.Value, error) {
	return json.Marshal(l)
}

// Scan implements sql.Scanner by decoding a JSON document into the list.
// A nil value resets the list to nil. The JSON may arrive as []byte or as a
// string, since drivers can hand back either representation; any other type
// is rejected with an error.
func (l *LinkList) Scan(value interface{}) error {
	switch v := value.(type) {
	case nil:
		*l = nil
		return nil
	case []byte:
		return json.Unmarshal(v, l)
	case string:
		return json.Unmarshal([]byte(v), l)
	default:
		return fmt.Errorf("failed to scan LinkList: expected []byte or string, got %T", value)
	}
}

13
common/shared.go Normal file
View File

@@ -0,0 +1,13 @@
package common
// Package-level channels shared across the application. Both are declared
// here without being created, so they are nil until assigned elsewhere.
var (
	// StatusChan carries WorkerStatus values.
	StatusChan chan WorkerStatus
	// ErrorsChan accepts errors from workers.
	// In case of fatal error, gracefully
	// exits the application.
	ErrorsChan chan error
)

// VERSION is the application version string.
const VERSION string = "0.0.1"

// CtxKeyLogger is a string key.
// NOTE(review): judging by the name, presumably the context key under which
// a logger is stored — confirm against callers.
const CtxKeyLogger string = "CtxKeyLogger"

View File

@@ -1,56 +0,0 @@
package common
import (
"database/sql/driver"
"encoding/json"
"fmt"
"time"
"github.com/guregu/null/v5"
)
// LinkList is a list of URLs that is stored in the database as JSON.
type LinkList []URL

// Value implements driver.Valuer by encoding the list as JSON.
func (l *LinkList) Value() (driver.Value, error) {
	return json.Marshal(l)
}

// Scan implements sql.Scanner by decoding a JSON document into the list.
// A nil value resets the list to nil. The JSON may arrive as []byte or as a
// string, since drivers can hand back either representation; any other type
// is rejected with an error.
func (l *LinkList) Scan(value interface{}) error {
	switch v := value.(type) {
	case nil:
		*l = nil
		return nil
	case []byte:
		return json.Unmarshal(v, l)
	case string:
		return json.Unmarshal([]byte(v), l)
	default:
		return fmt.Errorf("failed to scan LinkList: expected []byte or string, got %T", value)
	}
}
// Snapshot is one stored record for a visited URL. The db tags name the
// database columns and the json tags name the fields used when the snapshot
// is serialized to JSON.
type Snapshot struct {
	ID           int                  `db:"id" json:"id,omitempty"`
	URL          URL                  `db:"url" json:"url,omitempty"`
	Host         string               `db:"host" json:"host,omitempty"`
	Timestamp    null.Time            `db:"timestamp" json:"timestamp,omitempty"`
	MimeType     null.String          `db:"mimetype" json:"mimetype,omitempty"`
	Data         null.Value[[]byte]   `db:"data" json:"data,omitempty"`       // For non text/gemini files.
	GemText      null.String          `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
	Header       null.String          `db:"header" json:"header,omitempty"`   // Response header.
	Links        null.Value[LinkList] `db:"links" json:"links,omitempty"`
	Lang         null.String          `db:"lang" json:"lang,omitempty"`
	ResponseCode null.Int             `db:"response_code" json:"code,omitempty"` // Gemini response status code.
	Error        null.String          `db:"error" json:"error,omitempty"`        // On network errors only
}
// SnapshotFromURL parses u and returns a new Snapshot carrying the parsed
// URL, its hostname and the current time as timestamp. It returns nil when
// u cannot be parsed.
func SnapshotFromURL(u string) *Snapshot {
	parsed, err := ParseURL(u, "")
	if err != nil {
		return nil
	}
	return &Snapshot{
		URL:       *parsed,
		Host:      parsed.Hostname,
		Timestamp: null.TimeFrom(time.Now()),
	}
}

View File

@@ -0,0 +1,38 @@
package snapshot
import (
"time"
"gemini-grc/common/linkList"
commonUrl "gemini-grc/common/url"
"gemini-grc/errors"
"github.com/guregu/null/v5"
)
// Snapshot is one stored record for a visited URL. The db tags name the
// database columns and the json tags name the fields used when the snapshot
// is serialized to JSON.
type Snapshot struct {
	// NOTE(review): this is the only field whose db/json tags are upper-case
	// ("ID"); every other tag is lower-case — confirm it matches the column name.
	ID           int                           `db:"ID" json:"ID,omitempty"`
	URL          commonUrl.URL                 `db:"url" json:"url,omitempty"`
	Host         string                        `db:"host" json:"host,omitempty"`
	Timestamp    null.Time                     `db:"timestamp" json:"timestamp,omitempty"`
	MimeType     null.String                   `db:"mimetype" json:"mimetype,omitempty"`
	Data         null.Value[[]byte]            `db:"data" json:"data,omitempty"`       // For non text/gemini files.
	GemText      null.String                   `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
	Header       null.String                   `db:"header" json:"header,omitempty"`   // Response header.
	Links        null.Value[linkList.LinkList] `db:"links" json:"links,omitempty"`
	Lang         null.String                   `db:"lang" json:"lang,omitempty"`
	ResponseCode null.Int                      `db:"response_code" json:"code,omitempty"` // Gemini response Status code.
	Error        null.String                   `db:"error" json:"error,omitempty"`        // On network errors only
}
// SnapshotFromURL parses u (normalizing it first when normalize is true)
// and returns a new Snapshot holding the parsed URL, its hostname and the
// current time as timestamp. A parse failure is returned wrapped with
// errors.NewError.
func SnapshotFromURL(u string, normalize bool) (*Snapshot, error) {
	parsed, parseErr := commonUrl.ParseURL(u, "", normalize)
	if parseErr != nil {
		return nil, errors.NewError(parseErr)
	}
	snap := &Snapshot{
		URL:       *parsed,
		Host:      parsed.Hostname,
		Timestamp: null.TimeFrom(time.Now()),
	}
	return snap, nil
}

View File

@@ -1,12 +1,15 @@
package common package url
import ( import (
"database/sql/driver" "database/sql/driver"
"fmt" "fmt"
"net/url" "net/url"
"path" "path"
"regexp"
"strconv" "strconv"
"strings" "strings"
"gemini-grc/errors"
) )
type URL struct { type URL struct {
@@ -26,11 +29,10 @@ func (u *URL) Scan(value interface{}) error {
} }
b, ok := value.(string) b, ok := value.(string)
if !ok { if !ok {
return fmt.Errorf("%w: expected string, got %T", ErrDatabaseScan, value) return errors.NewFatalError(fmt.Errorf("database scan error: expected string, got %T", value))
} }
parsedURL, err := ParseURLNoNormalize(b, "") parsedURL, err := ParseURL(b, "", false)
if err != nil { if err != nil {
err = fmt.Errorf("%w: failed to scan GeminiUrl %s: %v", ErrDatabaseScan, b, err)
return err return err
} }
*u = *parsedURL *u = *parsedURL
@@ -42,8 +44,14 @@ func (u URL) String() string {
} }
func (u URL) StringNoDefaultPort() string { func (u URL) StringNoDefaultPort() string {
if u.Port == 1965 { if IsGeminiUrl(u.String()) {
return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path) if u.Port == 1965 {
return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
}
} else {
if u.Port == 70 {
return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
}
} }
return u.Full return u.Full
} }
@@ -55,54 +63,43 @@ func (u URL) Value() (driver.Value, error) {
return u.Full, nil return u.Full, nil
} }
func ParseURLNoNormalize(input string, descr string) (*URL, error) { func IsGeminiUrl(url string) bool {
u, err := url.Parse(input) return strings.HasPrefix(url, "gemini://")
if err != nil {
return nil, fmt.Errorf("%w: Input %s URL Parse Error: %w", ErrURLParse, input, err)
}
if u.Scheme != "gemini" {
return nil, fmt.Errorf("%w: URL scheme '%s' is not supported", ErrURLNotGemini, u.Scheme)
}
protocol := u.Scheme
hostname := u.Hostname()
strPort := u.Port()
urlPath := u.Path
if strPort == "" {
strPort = "1965"
}
port, err := strconv.Atoi(strPort)
if err != nil {
return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err)
}
full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, urlPath)
// full field should also contain query params and url fragments
if u.RawQuery != "" {
full += "?" + u.RawQuery
}
if u.Fragment != "" {
full += "#" + u.Fragment
}
return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: urlPath, Descr: descr, Full: full}, nil
} }
func ParseURL(input string, descr string) (*URL, error) { func IsGopherURL(s string) bool {
u, err := NormalizeURL(input) return strings.HasPrefix(s, "gopher://")
if err != nil { }
return nil, fmt.Errorf("%w: Input %s URL Parse Error: %w", ErrURLParse, input, err)
} func ParseURL(input string, descr string, normalize bool) (*URL, error) {
if u.Scheme != "gemini" { var u *url.URL
return nil, fmt.Errorf("%w: URL scheme '%s' is not supported", ErrURLNotGemini, u.Scheme) var err error
if normalize {
u, err = NormalizeURL(input)
if err != nil {
return nil, err
}
} else {
u, err = url.Parse(input)
if err != nil {
return nil, errors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input))
}
} }
protocol := u.Scheme protocol := u.Scheme
hostname := u.Hostname() hostname := u.Hostname()
strPort := u.Port() strPort := u.Port()
// urlPath := u.EscapedPath()
urlPath := u.Path urlPath := u.Path
if strPort == "" { if strPort == "" {
strPort = "1965" if u.Scheme == "gemini" {
strPort = "1965" // default Gemini port
} else {
strPort = "70" // default Gopher port
}
} }
port, err := strconv.Atoi(strPort) port, err := strconv.Atoi(strPort)
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err) return nil, errors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input))
} }
full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, urlPath) full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, urlPath)
// full field should also contain query params and url fragments // full field should also contain query params and url fragments
@@ -121,7 +118,7 @@ func ParseURL(input string, descr string) (*URL, error) {
func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) { func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
// If target URL is absolute, return just it // If target URL is absolute, return just it
if strings.Contains(input, "://") { if strings.Contains(input, "://") {
return ParseURL(input, "") return ParseURL(input, "", true)
} }
// input is a relative path. Clean it and construct absolute. // input is a relative path. Clean it and construct absolute.
var newPath string var newPath string
@@ -134,10 +131,10 @@ func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
newPath = path.Join(currentURL.Path, "/", path.Clean(input)) newPath = path.Join(currentURL.Path, "/", path.Clean(input))
} }
strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath) strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath)
return ParseURL(strURL, "") return ParseURL(strURL, "", true)
} }
// NormalizeURL takes a URL string and returns a normalized version. // NormalizeURL takes a URL string and returns a normalized version
// Normalized meaning: // Normalized meaning:
// - Path normalization (removing redundant slashes, . and .. segments) // - Path normalization (removing redundant slashes, . and .. segments)
// - Proper escaping of special characters // - Proper escaping of special characters
@@ -148,7 +145,13 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
// Parse the URL // Parse the URL
u, err := url.Parse(rawURL) u, err := url.Parse(rawURL)
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", ErrURLParse, err) return nil, errors.NewError(fmt.Errorf("error normalizing URL: %w: %s", err, rawURL))
}
if u.Scheme == "" {
return nil, errors.NewError(fmt.Errorf("error normalizing URL: No scheme: %s", rawURL))
}
if u.Host == "" {
return nil, errors.NewError(fmt.Errorf("error normalizing URL: No host: %s", rawURL))
} }
// Convert scheme to lowercase // Convert scheme to lowercase
@@ -159,7 +162,7 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
u.Host = strings.ToLower(u.Host) u.Host = strings.ToLower(u.Host)
} }
// Remove default ports // remove default ports
if u.Port() != "" { if u.Port() != "" {
switch { switch {
case u.Scheme == "http" && u.Port() == "80": case u.Scheme == "http" && u.Port() == "80":
@@ -168,6 +171,8 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
u.Host = u.Hostname() u.Host = u.Hostname()
case u.Scheme == "gemini" && u.Port() == "1965": case u.Scheme == "gemini" && u.Port() == "1965":
u.Host = u.Hostname() u.Host = u.Hostname()
case u.Scheme == "gopher" && u.Port() == "70":
u.Host = u.Hostname()
} }
} }
@@ -176,7 +181,7 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
// Check if there was a trailing slash before cleaning // Check if there was a trailing slash before cleaning
hadTrailingSlash := strings.HasSuffix(u.Path, "/") hadTrailingSlash := strings.HasSuffix(u.Path, "/")
u.Path = path.Clean(u.Path) u.Path = path.Clean(u.EscapedPath())
// If path was "/", path.Clean() will return "." // If path was "/", path.Clean() will return "."
if u.Path == "." { if u.Path == "." {
u.Path = "/" u.Path = "/"
@@ -186,20 +191,25 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
} }
} }
// Properly escape the path // Properly escape the path, but only for unescaped parts
// First split on '/' to avoid escaping them
parts := strings.Split(u.Path, "/") parts := strings.Split(u.Path, "/")
for i, part := range parts { for i, part := range parts {
parts[i] = url.PathEscape(part) // Try to unescape to check if it's already escaped
unescaped, err := url.PathUnescape(part)
if err != nil || unescaped == part {
// Part is not escaped, so escape it
parts[i] = url.PathEscape(part)
}
// If already escaped, leave as is
} }
u.Path = strings.Join(parts, "/") u.Path = strings.Join(parts, "/")
// Remove trailing fragment if empty // remove trailing fragment if empty
if u.Fragment == "" { if u.Fragment == "" {
u.Fragment = "" u.Fragment = ""
} }
// Remove trailing query if empty // remove trailing query if empty
if u.RawQuery == "" { if u.RawQuery == "" {
u.RawQuery = "" u.RawQuery = ""
} }
@@ -212,7 +222,7 @@ func EscapeURL(input string) string {
if strings.Contains(input, "%") && !strings.Contains(input, "% ") { if strings.Contains(input, "%") && !strings.Contains(input, "% ") {
return input return input
} }
// Split URL into parts (protocol, host, path) // Split URL into parts (protocol, host, p)
parts := strings.SplitN(input, "://", 2) parts := strings.SplitN(input, "://", 2)
if len(parts) != 2 { if len(parts) != 2 {
return input return input
@@ -226,18 +236,50 @@ func EscapeURL(input string) string {
return input return input
} }
// Split host and path // Split host and p
parts = strings.SplitN(remainder, "/", 2) parts = strings.SplitN(remainder, "/", 2)
host := parts[0] host := parts[0]
if len(parts) == 1 { if len(parts) == 1 {
return protocol + "://" + host return protocol + "://" + host
} }
path := parts[1]
// Escape the path portion // Escape the path portion
escapedPath := url.PathEscape(path) escapedPath := url.PathEscape(parts[1])
// Reconstruct the URL // Reconstruct the URL
return protocol + "://" + host + "/" + escapedPath return protocol + "://" + host + "/" + escapedPath
} }
// TrimTrailingPathSlash removes a single trailing slash from path.
// An empty path and the root path "/" both yield "/".
func TrimTrailingPathSlash(path string) string {
	switch trimmed := strings.TrimSuffix(path, "/"); trimmed {
	case "":
		// Either the path was empty or it was exactly "/": treat as root.
		return "/"
	default:
		return trimmed
	}
}
// redirectTargetRe captures the target of a redirect line:
//
//	\d+      - one or more digits
//	\s+      - one or more whitespace characters
//	([^\r]+) - everything until a \r (or end of string)
//
// It is compiled once at package initialisation instead of on every call.
var redirectTargetRe = regexp.MustCompile(`\d+\s+([^\r]+)`)

// ExtractRedirectTargetFromHeader returns the redirection URL found in
// input (a response header or error message), resolved against currentURL.
//
// The captured target is trimmed of surrounding whitespace, so a trailing
// "\n" or a whitespace-only capture is not passed on to URL parsing.
// An error is returned when input contains no target, or when the target
// cannot be turned into an absolute URL.
func ExtractRedirectTargetFromHeader(currentURL URL, input string) (*URL, error) {
	matches := redirectTargetRe.FindStringSubmatch(input)
	if len(matches) < 2 {
		return nil, errors.NewError(fmt.Errorf("error extracting redirect target from string %s", input))
	}
	target := strings.TrimSpace(matches[1])
	if target == "" {
		return nil, errors.NewError(fmt.Errorf("error extracting redirect target from string %s", input))
	}
	return DeriveAbsoluteURL(currentURL, target)
}

420
common/url/url_test.go Normal file
View File

@@ -0,0 +1,420 @@
package url
import (
"reflect"
"testing"
)
// TestURLOperations groups table-driven tests for ParseURL,
// DeriveAbsoluteURL and NormalizeURL.
func TestURLOperations(t *testing.T) {
	t.Parallel()

	t.Run("ParseURL", func(t *testing.T) {
		t.Parallel()
		// The table fields are named after the ParseURL parameters they
		// are passed as: (input, descr, normalize).
		tests := []struct {
			name      string
			input     string
			descr     string
			normalize bool
			want      string
			wantErr   bool
		}{
			{
				name:      "parse CGI URL",
				input:     "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162",
				descr:     "",
				normalize: true,
				want:      "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162",
			},
		}
		for _, tt := range tests {
			tt := tt
			t.Run(tt.name, func(t *testing.T) {
				t.Parallel()
				parsed, err := ParseURL(tt.input, tt.descr, tt.normalize)
				if (err != nil) != tt.wantErr {
					t.Errorf("ParseURL() error = %v, wantErr %v", err, tt.wantErr)
					return
				}
				if !tt.wantErr {
					value, _ := parsed.Value()
					if value != tt.want {
						t.Errorf("ParseURL() = %v, want %v", value, tt.want)
					}
				}
			})
		}
	})

	t.Run("DeriveAbsoluteURL", func(t *testing.T) {
		t.Parallel()
		baseURL := URL{
			Protocol: "gemini",
			Hostname: "smol.gr",
			Port:     1965,
			Path:     "/a/b",
			Descr:    "Nothing",
			Full:     "gemini://smol.gr:1965/a/b",
		}
		tests := []struct {
			name     string
			current  URL
			input    string
			expected *URL
		}{
			{
				name:    "absolute URL input",
				current: baseURL,
				input:   "gemini://a.b/c",
				expected: &URL{
					Protocol: "gemini",
					Hostname: "a.b",
					Port:     1965,
					Path:     "/c",
					Full:     "gemini://a.b:1965/c",
				},
			},
			{
				name:    "absolute path input",
				current: baseURL,
				input:   "/c",
				expected: &URL{
					Protocol: "gemini",
					Hostname: "smol.gr",
					Port:     1965,
					Path:     "/c",
					Full:     "gemini://smol.gr:1965/c",
				},
			},
			{
				name:    "relative path input",
				current: baseURL,
				input:   "c/d",
				expected: &URL{
					Protocol: "gemini",
					Hostname: "smol.gr",
					Port:     1965,
					Path:     "/a/b/c/d",
					Full:     "gemini://smol.gr:1965/a/b/c/d",
				},
			},
		}
		for _, tt := range tests {
			tt := tt
			t.Run(tt.name, func(t *testing.T) {
				t.Parallel()
				output, err := DeriveAbsoluteURL(tt.current, tt.input)
				if err != nil {
					t.Fatalf("unexpected error: %v", err)
				}
				if !reflect.DeepEqual(output, tt.expected) {
					t.Errorf("got %#v, want %#v", output, tt.expected)
				}
			})
		}
	})

	t.Run("NormalizeURL", func(t *testing.T) {
		t.Parallel()
		tests := []struct {
			name     string
			input    string
			expected string
		}{
			{
				name:     "with trailing slash",
				input:    "gemini://uscoffings.net/retro-computing/magazines/",
				expected: "gemini://uscoffings.net/retro-computing/magazines/",
			},
			{
				name:     "without trailing slash",
				input:    "gemini://uscoffings.net/retro-computing/magazines",
				expected: "gemini://uscoffings.net/retro-computing/magazines",
			},
			{
				name:     "multiple slashes",
				input:    "gemini://uscoffings.net/retro-computing/////////a///magazines",
				expected: "gemini://uscoffings.net/retro-computing/a/magazines",
			},
			{
				name:     "root with trailing slash",
				input:    "gemini://uscoffings.net/",
				expected: "gemini://uscoffings.net/",
			},
			{
				name:     "root without trailing slash",
				input:    "gemini://uscoffings.net",
				expected: "gemini://uscoffings.net",
			},
			{
				name:     "path with trailing slash",
				input:    "gemini://uscoffings.net/a/",
				expected: "gemini://uscoffings.net/a/",
			},
			{
				name:     "path without trailing slash",
				input:    "gemini://uscoffings.net/a",
				expected: "gemini://uscoffings.net/a",
			},
			{
				name:     "with dot segments",
				input:    "gemini://uscoffings.net/retro-computing/./././////a///magazines",
				expected: "gemini://uscoffings.net/retro-computing/a/magazines",
			},
			{
				name:     "with default port",
				input:    "gemini://uscoffings.net:1965/a",
				expected: "gemini://uscoffings.net/a",
			},
		}
		for _, tt := range tests {
			tt := tt
			t.Run(tt.name, func(t *testing.T) {
				t.Parallel()
				normalized, err := NormalizeURL(tt.input)
				if err != nil {
					t.Fatalf("unexpected error: %v", err)
				}
				output := normalized.String()
				if output != tt.expected {
					t.Errorf("got %#v, want %#v", output, tt.expected)
				}
			})
		}
	})
}
// TestNormalizeURL checks that NormalizeURL leaves non-default ports,
// query strings and fragments untouched.
func TestNormalizeURL(t *testing.T) {
	t.Parallel()

	type testCase struct {
		name     string
		input    string
		expected string
	}
	cases := []testCase{
		{
			name:     "URL with non-default port",
			input:    "gemini://chat.gemini.lehmann.cx:11965/",
			expected: "gemini://chat.gemini.lehmann.cx:11965/",
		},
		{
			name:     "URL with query parameters",
			input:    "gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c",
			expected: "gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c",
		},
		{
			name:     "URL with fragment",
			input:    "gemini://chat.gemini.lehmann.cx:11965/index#1",
			expected: "gemini://chat.gemini.lehmann.cx:11965/index#1",
		},
		{
			name:     "URL with CGI script and query",
			input:    "gemini://gemi.dev/cgi-bin/xkcd.cgi?1494",
			expected: "gemini://gemi.dev/cgi-bin/xkcd.cgi?1494",
		},
	}

	for _, tc := range cases {
		tc := tc // each parallel subtest gets its own copy
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			normalized, err := NormalizeURL(tc.input)
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if got := normalized.String(); got != tc.expected {
				t.Errorf("got %#v, want %#v", got, tc.expected)
			}
		})
	}
}
// TestNormalizePath checks the Path field produced by ParseURL with
// normalization enabled, across slash, escaping, query/fragment and
// unicode cases.
func TestNormalizePath(t *testing.T) {
	t.Parallel()
	tests := []struct {
		name     string
		input    string // URL string to parse
		expected string // Expected normalized path
	}{
		// Basic cases
		{
			name:     "empty_path",
			input:    "http://example.com",
			expected: "",
		},
		{
			name:     "root_path",
			input:    "http://example.com/",
			expected: "/",
		},
		{
			name:     "single_trailing_slash",
			input:    "http://example.com/test/",
			expected: "/test/",
		},
		{
			name:     "no_trailing_slash",
			input:    "http://example.com/test",
			expected: "/test",
		},
		// Edge cases with slashes
		{
			name:     "multiple_trailing_slashes",
			input:    "http://example.com/test//",
			expected: "/test/",
		},
		{
			name:     "multiple_consecutive_slashes",
			input:    "http://example.com//test//",
			expected: "/test/",
		},
		{
			name:     "only_slashes",
			input:    "http://example.com////",
			expected: "/",
		},
		// Encoded characters
		{
			name:     "encoded_spaces",
			input:    "http://example.com/foo%20bar/",
			expected: "/foo%20bar/",
		},
		{
			name:     "encoded_special_chars",
			input:    "http://example.com/foo%2Fbar/",
			expected: "/foo%2Fbar/",
		},
		// Query parameters and fragments
		{
			name:     "with_query_parameters",
			input:    "http://example.com/path?query=param",
			expected: "/path",
		},
		{
			name:     "with_fragment",
			input:    "http://example.com/path#fragment",
			expected: "/path",
		},
		{
			name:     "with_both_query_and_fragment",
			input:    "http://example.com/path?query=param#fragment",
			expected: "/path",
		},
		// Unicode paths
		{
			name:     "unicode_characters",
			input:    "http://example.com/über/path/",
			expected: "/%C3%BCber/path/",
		},
		{
			name:     "unicode_encoded",
			input:    "http://example.com/%C3%BCber/path/",
			expected: "/%C3%BCber/path/",
		},
		// Weird but valid cases
		{
			name:     "dot_in_path",
			input:    "http://example.com/./path/",
			expected: "/path/",
		},
		{
			name:     "double_dot_in_path",
			input:    "http://example.com/../path/",
			expected: "/path/",
		},
		{
			name:     "mixed_case",
			input:    "http://example.com/PaTh/",
			expected: "/PaTh/",
		},
	}
	for _, tt := range tests {
		tt := tt // capture range variable for parallel testing
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()
			u, err := ParseURL(tt.input, "", true)
			if err != nil {
				t.Fatalf("Failed to parse URL %q: %v", tt.input, err)
			}
			result := u.Path
			if result != tt.expected {
				// Report the original input, not the already-normalized path.
				t.Errorf("Input: %s\nExpected: %q\nGot: %q",
					tt.input, tt.expected, result)
			}
		})
	}
}
// TestExtractRedirectTargetFullURL checks that an absolute redirect target
// without a path is returned with the default Gemini port added.
func TestExtractRedirectTargetFullURL(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://smol.gr", "", true)
	if err != nil {
		t.Fatalf("unexpected error parsing current URL: %v", err)
	}
	input := "redirect: 31 gemini://target.gr"
	expected := "gemini://target.gr:1965"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if result.String() != expected {
		t.Errorf("fail: Expected %s got %s", expected, result)
	}
}
// TestExtractRedirectTargetFullURLSlash checks that an absolute redirect
// target ending in "/" keeps its trailing slash.
func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://smol.gr", "", true)
	if err != nil {
		t.Fatalf("unexpected error parsing current URL: %v", err)
	}
	input := "redirect: 31 gemini://target.gr/"
	expected := "gemini://target.gr:1965/"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if result.String() != expected {
		t.Errorf("fail: Expected %s got %s", expected, result)
	}
}
// TestExtractRedirectTargetRelativeURL checks that an absolute-path
// redirect target is resolved against the current host.
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://smol.gr", "", true)
	if err != nil {
		t.Fatalf("unexpected error parsing current URL: %v", err)
	}
	input := "redirect: 31 /a/b"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if result.String() != "gemini://smol.gr:1965/a/b" {
		t.Errorf("fail: %s", result)
	}
}
// TestExtractRedirectTargetRelativeURL2 checks that the relative target
// "./" resolves to the root of the current host.
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://nox.im:1965", "", true)
	if err != nil {
		t.Fatalf("unexpected error parsing current URL: %v", err)
	}
	input := "redirect: 31 ./"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if result.String() != "gemini://nox.im:1965/" {
		t.Errorf("fail: %s", result)
	}
}
// TestExtractRedirectTargetRelativeURL3 checks that a bare file name is
// resolved against the current host.
func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://status.zvava.org:1965", "", true)
	if err != nil {
		t.Fatalf("unexpected error parsing current URL: %v", err)
	}
	input := "redirect: 31 index.gmi"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if result.String() != "gemini://status.zvava.org:1965/index.gmi" {
		t.Errorf("fail: %s", result)
	}
}
// TestExtractRedirectTargetWrong checks that a header without a redirect
// target yields an error and a nil result.
func TestExtractRedirectTargetWrong(t *testing.T) {
	t.Parallel()
	currentURL, err := ParseURL("gemini://smol.gr", "", true)
	if err != nil {
		t.Fatalf("unexpected error parsing current URL: %v", err)
	}
	input := "redirect: 31"
	result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
	if result != nil || err == nil {
		// %v, not %s: err may be nil on this path.
		t.Errorf("fail: result should be nil, err is %v", err)
	}
}

320
common/worker.go Normal file
View File

@@ -0,0 +1,320 @@
package common
import (
"fmt"
"time"
"gemini-grc/common/blackList"
errors2 "gemini-grc/common/errors"
"gemini-grc/common/snapshot"
url2 "gemini-grc/common/url"
_db "gemini-grc/db"
"gemini-grc/errors"
"gemini-grc/gemini"
"gemini-grc/gopher"
"gemini-grc/hostPool"
"gemini-grc/logging"
"github.com/guregu/null/v5"
"github.com/jmoiron/sqlx"
)
// CrawlOneURL crawls a single URL inside its own transaction: the URL is
// normalized, inserted into the URL table, visited via workOnUrl, and the
// transaction is committed.
//
// Only Gemini and Gopher URLs are accepted. If any step after the
// transaction has begun fails, the transaction is rolled back before the
// error is returned, so the connection is not left holding an open
// transaction.
func CrawlOneURL(db *sqlx.DB, url *string) error {
	if url == nil {
		return errors.NewError(fmt.Errorf("error parsing URL: nil URL"))
	}
	parsedURL, err := url2.ParseURL(*url, "", true)
	if err != nil {
		return err
	}
	if !url2.IsGeminiUrl(parsedURL.String()) && !url2.IsGopherURL(parsedURL.String()) {
		return errors.NewError(fmt.Errorf("error parsing URL: not a Gemini or Gopher URL: %s", parsedURL.String()))
	}

	tx, err := db.Beginx()
	if err != nil {
		return errors.NewFatalError(err)
	}
	// finished is set once Commit has been attempted. After that the
	// transaction is over either way, so a rollback must not be attempted.
	finished := false
	defer func() {
		if finished {
			return
		}
		if rbErr := tx.Rollback(); rbErr != nil {
			logging.LogError("Failed to roll back transaction: %v", rbErr)
		}
	}()

	err = _db.InsertURL(tx, parsedURL.Full)
	if err != nil {
		return err
	}
	err = workOnUrl(0, tx, parsedURL.Full)
	if err != nil {
		return err
	}

	err = tx.Commit()
	finished = true
	if err != nil {
		return errors.NewFatalError(err)
	}
	logging.LogInfo("Done")
	return nil
}
// SpawnWorkers starts the status printer and numOfWorkers worker
// goroutines. Each worker waits a little longer than the previous one
// before starting, then runs RunWorkerWithTx in an endless loop.
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
	logging.LogInfo("Spawning %d workers", numOfWorkers)
	go PrintWorkerStatus(numOfWorkers, StatusChan)

	workerLoop := func(id int) {
		UpdateWorkerStatus(id, "Waiting to start")
		// Stagger the start so workers do not all begin at the same time.
		time.Sleep(time.Duration(id+2) * time.Second)
		for {
			// TODO: Use cancellable context with tx, logger & worker ID.
			RunWorkerWithTx(id, db)
		}
	}

	for id := 0; id < numOfWorkers; id++ {
		go workerLoop(id)
	}
}
// RunWorkerWithTx runs one worker batch inside a fresh transaction.
//
//   - If the transaction cannot be started, the error is sent to ErrorsChan.
//   - If the batch fails, the transaction is rolled back (so it does not
//     stay open on its connection) and the error is sent to ErrorsChan.
//   - If the commit fails with a deadlock, the worker backs off for ten
//     seconds and returns; any other commit failure panics.
//
// The worker status is set to "Done" on every exit path.
func RunWorkerWithTx(workerID int, db *sqlx.DB) {
	defer func() {
		UpdateWorkerStatus(workerID, "Done")
	}()

	tx, err := db.Beginx()
	if err != nil {
		ErrorsChan <- err
		return
	}

	err = runWorker(workerID, tx)
	if err != nil {
		// Release the transaction before reporting the error.
		if rbErr := tx.Rollback(); rbErr != nil {
			logging.LogError("[%3d] Failed to roll back transaction: %v", workerID, rbErr)
		}
		ErrorsChan <- err
		return
	}

	logging.LogDebug("[%3d] Committing transaction", workerID)
	err = tx.Commit()
	if err != nil {
		logging.LogError("[%3d] Failed to commit transaction: %v", workerID, err)
		if _db.IsDeadlockError(err) {
			// A transaction is finished once Commit has been attempted,
			// even when Commit fails. Calling Rollback here would only
			// return sql.ErrTxDone, so just back off and let the caller
			// retry with a new transaction.
			logging.LogError("[%3d] Deadlock detected, transaction aborted", workerID)
			time.Sleep(time.Duration(10) * time.Second)
			return
		}
		panic(fmt.Sprintf("[%3d] Failed to commit transaction: %v", workerID, err))
	}
	logging.LogDebug("[%3d] Worker done!", workerID)
}
// runWorker fetches a batch of random URLs from the database and visits
// them one after another within tx. When there is nothing to visit it
// sleeps for a minute and returns nil. The first error from the database
// or from workOnUrl aborts the batch and is returned.
func runWorker(workerID int, tx *sqlx.Tx) error {
	UpdateWorkerStatus(workerID, "Getting URLs from DB")
	urls, err := _db.GetRandomUrls(tx)
	if err != nil {
		return err
	}
	if len(urls) == 0 {
		logging.LogInfo("[%3d] No URLs to visit, sleeping...", workerID)
		UpdateWorkerStatus(workerID, "No URLs to visit, sleeping...")
		time.Sleep(1 * time.Minute)
		return nil
	}

	// Visit every URL of the batch in order.
	total := len(urls)
	for idx, target := range urls {
		position := idx + 1
		logging.LogInfo("[%3d] Starting %d/%d %s", workerID, position, total, target)
		UpdateWorkerStatus(workerID, fmt.Sprintf("Starting %d/%d %s", position, total, target))
		if visitErr := workOnUrl(workerID, tx, target); visitErr != nil {
			return visitErr
		}
		logging.LogDebug("[%3d] Done %d/%d.", workerID, position, total)
		UpdateWorkerStatus(workerID, fmt.Sprintf("Done %d/%d %s", position, total, target))
	}
	return nil
}
// workOnUrl visits a URL and stores the result.
// Unexpected errors are returned.
// Expected errors are stored within the snapshot.
//
// Steps, in order:
//  1. Build a snapshot for url; reject anything that is not Gemini or Gopher.
//  2. Blacklisted URLs, and Gemini URLs disallowed by robots.txt, are saved
//     with the matching error text and removed from the URL queue.
//  3. The host is added to the host pool for the duration of the visit.
//  4. The URL is visited with the Gopher or Gemini client.
//  5. Gemini responses with a 3x status have their redirect target queued.
//  6. Links found in the response are queued, then the snapshot is saved
//     and the URL removed from the queue.
func workOnUrl(workerID int, tx *sqlx.Tx, url string) (err error) {
	s, err := snapshot.SnapshotFromURL(url, false)
	if err != nil {
		return err
	}

	isGemini := url2.IsGeminiUrl(s.URL.String())
	isGopher := url2.IsGopherURL(s.URL.String())
	if !isGemini && !isGopher {
		return errors.NewError(fmt.Errorf("not a Gopher or Gemini URL: %s", s.URL.String()))
	}

	if blackList.IsBlacklisted(s.URL.String()) {
		logging.LogInfo("[%3d] URL matches blacklist, ignoring", workerID)
		s.Error = null.StringFrom(errors2.ErrBlacklistMatch.Error())
		return saveSnapshotAndRemoveURL(tx, s)
	}

	if isGemini {
		// If URL matches a robots.txt disallow line,
		// add it as an error and remove url
		robotMatch, err := gemini.RobotMatch(s.URL.String())
		if err != nil {
			// robotMatch returns only network errors!
			// we stop because we don't want to hit
			// the server with another request on this case.
			return err
		}
		if robotMatch {
			logging.LogInfo("[%3d] URL matches robots.txt, ignoring", workerID)
			s.Error = null.StringFrom(errors2.ErrRobotsMatch.Error())
			return saveSnapshotAndRemoveURL(tx, s)
		}
	}

	logging.LogDebug("[%3d] Adding to pool %s", workerID, s.URL.String())
	UpdateWorkerStatus(workerID, fmt.Sprintf("Adding to pool %s", s.URL.String()))
	hostPool.AddHostToHostPool(s.Host)
	// The host is captured now: s is reassigned by Visit below.
	defer hostPool.RemoveHostFromPool(s.Host)

	logging.LogDebug("[%3d] Visiting %s", workerID, s.URL.String())
	UpdateWorkerStatus(workerID, fmt.Sprintf("Visiting %s", s.URL.String()))

	if isGopher {
		s, err = gopher.Visit(s.URL.String())
	} else {
		s, err = gemini.Visit(s.URL.String())
	}
	if err != nil {
		return err
	}

	// Handle Gemini redirection.
	if isGemini &&
		s.ResponseCode.ValueOrZero() >= 30 &&
		s.ResponseCode.ValueOrZero() < 40 {
		err = handleRedirection(workerID, tx, s)
		if err != nil {
			// Wrap with %w so the underlying error stays inspectable
			// further up the call chain.
			return fmt.Errorf("error while handling redirection: %w", err)
		}
	}

	// Store links
	if len(s.Links.ValueOrZero()) > 0 {
		logging.LogDebug("[%3d] Found %d links", workerID, len(s.Links.ValueOrZero()))
		err = storeLinks(tx, s)
		if err != nil {
			return err
		}
	}

	logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
	return saveSnapshotAndRemoveURL(tx, s)
}
// storeLinks queues every link of the snapshot that should be persisted
// and has not been seen before. Links that are already known are only
// logged. The first database error stops the loop and is returned.
func storeLinks(tx *sqlx.Tx, s *snapshot.Snapshot) error {
	if !s.Links.Valid {
		return nil
	}
	for _, link := range s.Links.ValueOrZero() {
		if !shouldPersistURL(&link) {
			continue
		}
		seen, err := haveWeVisitedURL(tx, link.Full)
		if err != nil {
			return err
		}
		if seen {
			logging.LogDebug("Link already persisted: %s", link.Full)
			continue
		}
		if insertErr := _db.InsertURL(tx, link.Full); insertErr != nil {
			return insertErr
		}
	}
	return nil
}
// saveSnapshotAndRemoveURL writes the snapshot with OverwriteSnapshot and
// then deletes its URL with DeleteURL, returning the first error met.
func saveSnapshotAndRemoveURL(tx *sqlx.Tx, s *snapshot.Snapshot) error {
	if saveErr := _db.OverwriteSnapshot(tx, s); saveErr != nil {
		return saveErr
	}
	if deleteErr := _db.DeleteURL(tx, s.URL.String()); deleteErr != nil {
		return deleteErr
	}
	return nil
}
// shouldPersistURL reports whether u should be saved in the database.
// Both gemini:// and gopher:// URLs qualify.
func shouldPersistURL(u *url2.URL) bool {
	if url2.IsGeminiUrl(u.String()) {
		return true
	}
	return url2.IsGopherURL(u.String())
}
// haveWeVisitedURL reports whether u is already known, looking first in
// the urls table and then in the snapshots table. A query failure is
// returned as a fatal "database error".
func haveWeVisitedURL(tx *sqlx.Tx, u string) (bool, error) {
	lookups := [...]string{
		`SELECT TRUE FROM urls WHERE url=$1`,
		`SELECT TRUE FROM snapshots WHERE snapshots.url=$1`,
	}
	for _, lookup := range lookups {
		var rows []bool
		if err := tx.Select(&rows, lookup, u); err != nil {
			return false, errors.NewFatalError(fmt.Errorf("database error: %w", err))
		}
		if len(rows) > 0 {
			return rows[0], nil
		}
	}
	return false, nil
}
// handleRedirection saves the redirection URL.
//
// The target is extracted from the snapshot's Error field and resolved
// against the snapshot's URL. It is inserted into the URL table only when
// it should be persisted and is not already known. Errors from extracting
// the target, from the "already visited" lookup and from the insert are
// all returned to the caller.
func handleRedirection(workerID int, tx *sqlx.Tx, s *snapshot.Snapshot) error {
	newURL, err := url2.ExtractRedirectTargetFromHeader(s.URL, s.Error.ValueOrZero())
	if err != nil {
		return err
	}
	logging.LogDebug("[%3d] Page redirects to %s", workerID, newURL)

	// Skip the database lookup for URLs we would never store anyway.
	if !shouldPersistURL(newURL) {
		return nil
	}
	visited, err := haveWeVisitedURL(tx, newURL.String())
	if err != nil {
		return err
	}
	if visited {
		return nil
	}
	if err := _db.InsertURL(tx, newURL.Full); err != nil {
		return err
	}
	logging.LogDebug("[%3d] Saved redirection URL %s", workerID, newURL.String())
	return nil
}
// GetSnapshotFromURL selects at most one row from the snapshots table
// whose url column equals url and returns the result as a slice. A query
// error is returned unchanged.
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]snapshot.Snapshot, error) {
	const selectByURL = `
SELECT *
FROM snapshots
WHERE url=$1
LIMIT 1
`
	var found []snapshot.Snapshot
	if err := tx.Select(&found, selectByURL, url); err != nil {
		return nil, err
	}
	return found, nil
}

View File

@@ -1,19 +1,35 @@
package gemini package common
import ( import (
"fmt" "fmt"
"strings" "strings"
"gemini-grc/config"
) )
type WorkerStatus struct { type WorkerStatus struct {
id int ID int
status string Status string
} }
var statusChan chan WorkerStatus func UpdateWorkerStatus(workerID int, status string) {
if !config.GetConfig().PrintWorkerStatus {
return
}
if config.CONFIG.NumOfWorkers > 1 {
StatusChan <- WorkerStatus{
ID: workerID,
Status: status,
}
}
}
func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) { func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
// Create a slice to store current status of each worker if !config.GetConfig().PrintWorkerStatus {
return
}
// Create a slice to store current Status of each worker
statuses := make([]string, totalWorkers) statuses := make([]string, totalWorkers)
// Initialize empty statuses // Initialize empty statuses
@@ -32,14 +48,14 @@ func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
} }
fmt.Print(output.String()) fmt.Print(output.String())
// Continuously receive status updates // Continuously receive Status updates
for update := range statusChan { for update := range statusChan {
if update.id >= totalWorkers { if update.ID >= totalWorkers {
continue continue
} }
// Update the status // Update the Status
statuses[update.id] = update.status statuses[update.ID] = update.Status
// Build the complete output string // Build the complete output string
output.Reset() output.Reset()
@@ -48,7 +64,7 @@ func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
output.WriteString(fmt.Sprintf("[%2d] %.100s\n", i, status)) output.WriteString(fmt.Sprintf("[%2d] %.100s\n", i, status))
} }
// Print the entire status // Print the entire Status
fmt.Print(output.String()) fmt.Print(output.String())
} }
} }

160
db/db.go
View File

@@ -2,20 +2,22 @@ package db
import ( import (
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"gemini-grc/common"
"os" "os"
"strconv" "strconv"
"time"
"gemini-grc/common/snapshot"
commonUrl "gemini-grc/common/url"
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/errors"
"gemini-grc/logging" "gemini-grc/logging"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
"github.com/lib/pq" "github.com/lib/pq"
) )
func ConnectToDB() *sqlx.DB { func ConnectToDB() (*sqlx.DB, error) {
connStr := fmt.Sprintf("postgres://%s:%s@%s:%s/%s", //nolint:nosprintfhostport connStr := fmt.Sprintf("postgres://%s:%s@%s:%s/%s", //nolint:nosprintfhostport
os.Getenv("PG_USER"), os.Getenv("PG_USER"),
os.Getenv("PG_PASSWORD"), os.Getenv("PG_PASSWORD"),
@@ -27,25 +29,26 @@ func ConnectToDB() *sqlx.DB {
// Create a connection pool // Create a connection pool
db, err := sqlx.Open("pgx", connStr) db, err := sqlx.Open("pgx", connStr)
if err != nil { if err != nil {
panic(fmt.Sprintf("Unable to connect to database with URL %s: %v\n", connStr, err)) return nil, errors.NewFatalError(fmt.Errorf("unable to connect to database with URL %s: %w", connStr, err))
} }
// TODO move PG_MAX_OPEN_CONNECTIONS to config env variables // TODO move PG_MAX_OPEN_CONNECTIONS to config env variables
maxConnections, err := strconv.Atoi(os.Getenv("PG_MAX_OPEN_CONNECTIONS")) maxConnections, err := strconv.Atoi(os.Getenv("PG_MAX_OPEN_CONNECTIONS"))
if err != nil { if err != nil {
panic(fmt.Sprintf("Unable to set max DB connections: %s\n", err)) return nil, errors.NewFatalError(fmt.Errorf("unable to set DB max connections: %w", err))
} }
db.SetMaxOpenConns(maxConnections) db.SetMaxOpenConns(maxConnections)
err = db.Ping() err = db.Ping()
if err != nil { if err != nil {
panic(fmt.Sprintf("Unable to ping database: %v\n", err)) return nil, errors.NewFatalError(fmt.Errorf("unable to ping database: %w", err))
} }
logging.LogDebug("Connected to database") logging.LogDebug("Connected to database")
return db return db, nil
} }
// IsDeadlockError checks if the error is a PostgreSQL deadlock error // IsDeadlockError checks if the error is a PostgreSQL deadlock error.
func IsDeadlockError(err error) bool { func IsDeadlockError(err error) bool {
err = errors.Unwrap(err)
var pqErr *pq.Error var pqErr *pq.Error
if errors.As(err, &pqErr) { if errors.As(err, &pqErr) {
return pqErr.Code == "40P01" // PostgreSQL deadlock error code return pqErr.Code == "40P01" // PostgreSQL deadlock error code
@@ -53,134 +56,85 @@ func IsDeadlockError(err error) bool {
return false return false
} }
func GetURLsToVisit(tx *sqlx.Tx) ([]string, error) { func GetRandomUrls(tx *sqlx.Tx) ([]string, error) {
var urls []string var urls []string
err := tx.Select(&urls, SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS, config.CONFIG.WorkerBatchSize) err := tx.Select(&urls, SQL_SELECT_RANDOM_URLS, config.CONFIG.WorkerBatchSize)
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrDatabase, err) return nil, errors.NewFatalError(err)
}
return urls, nil
}
func GetRandomUrlsWithBasePath(tx *sqlx.Tx) ([]string, error) {
SqlQuery := `SELECT url FROM snapshots WHERE url ~ '^[^:]+://[^/]+/?$' ORDER BY RANDOM() LIMIT $1`
var urls []string
err := tx.Select(&urls, SqlQuery, config.CONFIG.WorkerBatchSize)
if err != nil {
return nil, errors.NewFatalError(err)
} }
return urls, nil return urls, nil
} }
func InsertURL(tx *sqlx.Tx, url string) error { func InsertURL(tx *sqlx.Tx, url string) error {
logging.LogDebug("Inserting URL %s", url)
query := SQL_INSERT_URL query := SQL_INSERT_URL
_, err := tx.NamedExec(query, url) normalizedURL, err := commonUrl.ParseURL(url, "", true)
if err != nil { if err != nil {
return fmt.Errorf("%w inserting URL: %w", common.ErrDatabase, err) return err
}
a := struct {
Url string
Host string
Timestamp time.Time
}{
Url: normalizedURL.Full,
Host: normalizedURL.Hostname,
Timestamp: time.Now(),
}
_, err = tx.NamedExec(query, a)
if err != nil {
return errors.NewFatalError(fmt.Errorf("cannot insert URL: database error %w URL %s", err, url))
} }
return nil return nil
} }
func SaveSnapshotIfNew(tx *sqlx.Tx, s *common.Snapshot) error { func DeleteURL(tx *sqlx.Tx, url string) error {
logging.LogDebug("Deleting URL %s", url)
query := SQL_DELETE_URL
_, err := tx.Exec(query, url)
if err != nil {
return errors.NewFatalError(fmt.Errorf("cannot delete URL: database error %w URL %s", err, url))
}
return nil
}
func OverwriteSnapshot(tx *sqlx.Tx, s *snapshot.Snapshot) (err error) {
if config.CONFIG.DryRun { if config.CONFIG.DryRun {
marshalled, err := json.MarshalIndent(s, "", " ") marshalled, err := json.MarshalIndent(s, "", " ")
if err != nil { if err != nil {
panic(fmt.Sprintf("JSON serialization error for %v", s)) return errors.NewFatalError(fmt.Errorf("JSON serialization error for %v", s))
} }
logging.LogDebug("Would insert (if new) snapshot %s", marshalled) logging.LogDebug("Would upsert snapshot %s", marshalled)
return nil return nil
} }
query := SQL_INSERT_SNAPSHOT_IF_NEW
_, err := tx.NamedExec(query, s)
if err != nil {
return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err)
}
return nil
}
func OverwriteSnapshot(workedID int, tx *sqlx.Tx, s *common.Snapshot) (err error) {
// if config.CONFIG.DryRun {
//marshalled, err := json.MarshalIndent(s, "", " ")
//if err != nil {
// panic(fmt.Sprintf("JSON serialization error for %v", s))
//}
//logging.LogDebug("[%d] Would upsert snapshot %s", workedID, marshalled)
// return nil
// }
query := SQL_UPSERT_SNAPSHOT query := SQL_UPSERT_SNAPSHOT
rows, err := tx.NamedQuery(query, s) rows, err := tx.NamedQuery(query, s)
if err != nil { if err != nil {
return fmt.Errorf("[%d] %w while upserting snapshot: %w", workedID, common.ErrDatabase, err) return errors.NewFatalError(fmt.Errorf("cannot overwrite snapshot: %w", err))
} }
defer func() { defer func() {
_err := rows.Close() _err := rows.Close()
if _err != nil { if err == nil && _err != nil {
err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, common.ErrDatabase, _err) err = errors.NewFatalError(fmt.Errorf("cannot overwrite snapshot: error closing rows: %w", err))
} }
}() }()
if rows.Next() { if rows.Next() {
var returnedID int var returnedID int
err = rows.Scan(&returnedID) err = rows.Scan(&returnedID)
if err != nil { if err != nil {
return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, common.ErrDatabase, err) return errors.NewFatalError(fmt.Errorf("cannot overwrite snapshot: error scanning rows: %w", err))
} }
s.ID = returnedID s.ID = returnedID
// logging.LogDebug("[%d] Upserted snapshot with ID %d", workedID, returnedID)
}
return nil
}
func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *common.Snapshot) (err error) {
// if config.CONFIG.DryRun {
//marshalled, err := json.MarshalIndent(s, "", " ")
//if err != nil {
// panic(fmt.Sprintf("JSON serialization error for %v", s))
//}
//logging.LogDebug("[%d] Would upsert snapshot %s", workedID, marshalled)
// return nil
// }
query := SQL_UPDATE_SNAPSHOT
rows, err := tx.NamedQuery(query, s)
if err != nil {
return fmt.Errorf("[%d] %w while updating snapshot: %w", workedID, common.ErrDatabase, err)
}
defer func() {
_err := rows.Close()
if _err != nil {
err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, common.ErrDatabase, _err)
}
}()
if rows.Next() {
var returnedID int
err = rows.Scan(&returnedID)
if err != nil {
return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, common.ErrDatabase, err)
}
s.ID = returnedID
// logging.LogDebug("[%d] Updated snapshot with ID %d", workedID, returnedID)
}
return nil
}
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*common.Snapshot) error {
if config.CONFIG.DryRun {
return nil
}
const batchSize = 5000
query := SQL_INSERT_SNAPSHOT_IF_NEW
for i := 0; i < len(snapshots); i += batchSize {
end := i + batchSize
if end > len(snapshots) {
end = len(snapshots)
}
batch := snapshots[i:end]
_, err := tx.NamedExec(query, batch)
if err != nil {
return fmt.Errorf("%w: While saving links in batches: %w", common.ErrDatabase, err)
}
}
return nil
}
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*common.Snapshot) error {
if config.CONFIG.DryRun {
return nil
}
query := SQL_INSERT_SNAPSHOT_IF_NEW
_, err := tx.NamedExec(query, snapshots)
if err != nil {
logging.LogError("GeminiError batch inserting snapshots: %w", err)
return fmt.Errorf("DB error: %w", err)
} }
return nil return nil
} }

View File

@@ -1,53 +1,24 @@
package db package db
const ( const (
SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS = `
SELECT *
FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
ORDER BY RANDOM()
FOR UPDATE SKIP LOCKED
LIMIT $1
`
SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS = ` SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS = `
SELECT url SELECT url
FROM urls u FROM urls u
WHERE u.id IN ( WHERE u.id IN (
SELECT MIN(id) SELECT id FROM (
FROM urls SELECT id, ROW_NUMBER() OVER (PARTITION BY host ORDER BY id) as rn
GROUP BY host FROM urls
) t
WHERE rn <= 3
) )
LIMIT $1
`
SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS = `
SELECT *
FROM snapshots s
WHERE response_code IS NULL
AND error IS NULL
AND s.id IN (
SELECT MIN(id)
FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
GROUP BY host
)
ORDER BY RANDOM() ORDER BY RANDOM()
FOR UPDATE SKIP LOCKED FOR UPDATE SKIP LOCKED
LIMIT $1 LIMIT $1
` `
SQL_SELECT_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS = ` SQL_SELECT_RANDOM_URLS = `
SELECT * SELECT url
FROM snapshots s FROM urls u
WHERE response_code IS NULL ORDER BY RANDOM()
AND error IS NULL
AND s.id IN (
SELECT MIN(id)
FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
GROUP BY host
)
FOR UPDATE SKIP LOCKED FOR UPDATE SKIP LOCKED
LIMIT $1 LIMIT $1
` `
@@ -90,4 +61,7 @@ RETURNING id
VALUES (:url, :host, :timestamp) VALUES (:url, :host, :timestamp)
ON CONFLICT (url) DO NOTHING ON CONFLICT (url) DO NOTHING
` `
SQL_DELETE_URL = `
DELETE FROM urls WHERE url=$1
`
) )

104
errors/errors.go Normal file
View File

@@ -0,0 +1,104 @@
package errors
import (
"errors"
"fmt"
"runtime"
"strings"
)
// fatal is implemented by errors that can report whether they are fatal.
// IsFatal uses it as the target type when searching an error chain.
type fatal interface {
// Fatal reports whether the error is fatal.
Fatal() bool
}
// IsFatal reports whether err's chain contains an error that implements
// the fatal interface and whose Fatal method returns true. The first error
// in the chain that implements the interface decides the result.
func IsFatal(err error) bool {
	var marker fatal
	if !As(err, &marker) {
		// Nothing in the chain can tell us about fatality (covers nil too).
		return false
	}
	return marker.Fatal()
}
// As delegates to the standard library errors.As, so callers can use this
// package in place of the standard errors package.
func As(err error, target any) bool {
return errors.As(err, target)
}
// Is delegates to the standard library errors.Is, so callers can use this
// package in place of the standard errors package.
func Is(err, target error) bool {
return errors.Is(err, target)
}
// Unwrap delegates to the standard library errors.Unwrap, so callers can
// use this package in place of the standard errors package.
func Unwrap(err error) error {
return errors.Unwrap(err)
}
// Error decorates an underlying error with a formatted stack trace and a
// flag that marks it as fatal.
type Error struct {
	Err   error  // the wrapped error
	Stack string // formatted stack trace text
	fatal bool   // reported by Fatal
}

// Error renders the wrapped error on the first line, followed by a
// "Stack Trace:" header and the stored stack text.
func (e *Error) Error() string {
	var out strings.Builder
	fmt.Fprintf(&out, "%v\n", e.Err)
	out.WriteString("Stack Trace:\n")
	out.WriteString(e.Stack)
	return out.String()
}

// Fatal reports whether this error has been marked as fatal.
func (e *Error) Fatal() bool {
	return e.fatal
}

// Unwrap returns the wrapped error so errors.Is and errors.As can see
// through this type.
func (e *Error) Unwrap() error {
	return e.Err
}
// NewError wraps err in an *Error that records the call stack of the
// caller. It returns nil for a nil err, and returns err unchanged when an
// *Error is already present in its chain, so a stack is never added twice.
func NewError(err error) error {
	if err == nil {
		return nil
	}

	var existing *Error
	if errors.As(err, &existing) {
		return err
	}

	// Skip runtime.Callers and NewError itself; capture at most 50 frames.
	pcs := make([]uintptr, 50)
	frames := runtime.CallersFrames(pcs[:runtime.Callers(2, pcs)])

	var trace strings.Builder
	for more := true; more; {
		var frame runtime.Frame
		frame, more = frames.Next()
		// Frames whose file path contains "runtime/" are left out.
		if strings.Contains(frame.File, "runtime/") {
			continue
		}
		fmt.Fprintf(&trace, "\t%s:%d - %s\n", frame.File, frame.Line, frame.Function)
	}

	return &Error{Err: err, Stack: trace.String()}
}
// NewFatalError returns err marked as fatal. A nil err yields nil. When an
// *Error already exists in err's chain, that *Error is flagged in place and
// err itself is returned; otherwise err is wrapped with NewError and the
// new wrapper is flagged.
func NewFatalError(err error) error {
	if err == nil {
		return nil
	}

	result := err
	var target *Error
	if !errors.As(err, &target) {
		// No *Error in the chain yet: create one that records the stack.
		result = NewError(err)
		target = result.(*Error)
	}
	target.fatal = true
	return result
}

184
errors/errors_test.go Normal file
View File

@@ -0,0 +1,184 @@
package errors
import (
"errors"
"fmt"
"testing"
)
// CustomError is a minimal error wrapper used by the tests to check that a
// caller-defined error type stays discoverable after wrapping.
type CustomError struct {
	Err error
}

// Error returns the message of the wrapped error.
func (e *CustomError) Error() string {
	return e.Err.Error()
}
// IsCustomError reports whether err's chain contains a *CustomError.
func IsCustomError(err error) bool {
	target := new(*CustomError)
	return errors.As(err, target)
}
// TestWrapping verifies that an error wrapped by NewError (and, for
// comparison, by fmt.Errorf with %w) still matches the original error via
// both the standard errors.Is and this package's Is, before and after
// unwrapping with errors.Unwrap and Unwrap.
func TestWrapping(t *testing.T) {
	t.Parallel()
	base := errors.New("original error")
	viaNewError := NewError(base)

	candidates := []error{
		viaNewError,
		errors.Unwrap(viaNewError),
		Unwrap(viaNewError),
		fmt.Errorf("wrapped: %w", base),
	}
	for _, candidate := range candidates {
		if !errors.Is(candidate, base) {
			t.Errorf("original error is not wrapped")
		}
		if !Is(candidate, base) {
			t.Errorf("original error is not wrapped")
		}
	}
}
// TestNewError verifies that a *CustomError stays discoverable through
// IsCustomError at every stage: bare, wrapped by NewError, wrapped again
// by fmt.Errorf, and after one Unwrap of that outer wrapper.
func TestNewError(t *testing.T) {
	t.Parallel()
	base := &CustomError{errors.New("err1")}
	decorated := NewError(base)
	outer := fmt.Errorf("wrapped %w", decorated)
	inner := Unwrap(outer)

	if !IsCustomError(base) {
		t.Errorf("TestNewError fail #1")
	}
	if !IsCustomError(decorated) {
		t.Errorf("TestNewError fail #2")
	}
	if !IsCustomError(outer) {
		t.Errorf("TestNewError fail #3")
	}
	if !IsCustomError(inner) {
		t.Errorf("TestNewError fail #4")
	}
}
// TestIsFatal is a table-driven test for IsFatal. Each case builds an error
// (plain, NewError, NewFatalError, or nested combinations wrapped with
// fmt.Errorf and %w) and states whether IsFatal should report it as fatal.
func TestIsFatal(t *testing.T) {
t.Parallel()
tests := []struct {
name string
err error
want bool
}{
{
name: "nil error",
err: nil,
want: false,
},
{
name: "simple non-fatal error",
err: fmt.Errorf("regular error"),
want: false,
},
{
name: "direct fatal error",
err: NewFatalError(fmt.Errorf("fatal error")),
want: true,
},
{
name: "non-fatal Error type",
err: NewError(fmt.Errorf("non-fatal error")),
want: false,
},
// Fatal errors must stay detectable through any depth of %w wrapping.
{
name: "wrapped fatal error - one level",
err: fmt.Errorf("outer: %w", NewFatalError(fmt.Errorf("inner fatal"))),
want: true,
},
{
name: "wrapped fatal error - two levels",
err: fmt.Errorf("outer: %w",
fmt.Errorf("middle: %w",
NewFatalError(fmt.Errorf("inner fatal")))),
want: true,
},
{
name: "wrapped fatal error - three levels",
err: fmt.Errorf("outer: %w",
fmt.Errorf("middle1: %w",
fmt.Errorf("middle2: %w",
NewFatalError(fmt.Errorf("inner fatal"))))),
want: true,
},
{
name: "multiple wrapped errors - non-fatal",
err: fmt.Errorf("outer: %w",
fmt.Errorf("middle: %w",
fmt.Errorf("inner: %w",
NewError(fmt.Errorf("base error"))))),
want: false,
},
{
name: "wrapped non-fatal Error type",
err: fmt.Errorf("outer: %w", NewError(fmt.Errorf("inner"))),
want: false,
},
{
name: "wrapped basic error",
err: fmt.Errorf("outer: %w", fmt.Errorf("inner")),
want: false,
},
// Combinations of the two constructors applied to each other.
{
name: "fatal error wrapping fatal error",
err: NewFatalError(NewFatalError(fmt.Errorf("double fatal"))),
want: true,
},
{
name: "fatal error wrapping non-fatal Error",
err: NewFatalError(NewError(fmt.Errorf("mixed"))),
want: true,
},
{
name: "non-fatal Error wrapping fatal error",
err: NewError(NewFatalError(fmt.Errorf("mixed"))),
want: true,
},
{
name: "Error wrapping Error",
err: NewError(NewError(fmt.Errorf("double wrapped"))),
want: false,
},
// nil inputs.
{
name: "wrapped nil error",
err: fmt.Errorf("outer: %w", nil),
want: false,
},
{
name: "fatal wrapping nil",
err: NewFatalError(nil),
want: false,
},
}
for _, tt := range tests {
// Copy the loop variable so each parallel subtest captures its own case.
tt := tt
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := IsFatal(tt.err)
if got != tt.want {
t.Errorf("IsFatal() = %v, want %v", got, tt.want)
if tt.err != nil {
t.Errorf("Error was: %v", tt.err)
}
}
})
}
}

16
gemget.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# gemget.sh - run the gemget program (./bin/gemget/main.go) with a fixed
# single-worker configuration passed through environment variables.
# All command-line arguments are forwarded unchanged to the program.
#
# The interpreter is located through /usr/bin/env because env is not
# installed under /bin on every system.

# Abort on errors, on use of unset variables, and on failures inside pipelines.
set -euo pipefail

# MAX_RESPONSE_SIZE below is 10 MiB (10485760 bytes).
LOG_LEVEL=debug \
PRINT_WORKER_STATUS=false \
DRY_RUN=false \
NUM_OF_WORKERS=1 \
WORKER_BATCH_SIZE=1 \
BLACKLIST_PATH="$(pwd)/blacklist.txt" \
MAX_RESPONSE_SIZE=10485760 \
RESPONSE_TIMEOUT=10 \
PANIC_ON_UNEXPECTED_ERROR=true \
go run ./bin/gemget/main.go "$@"

View File

@@ -1,55 +0,0 @@
package gemini
import (
"fmt"
"gemini-grc/common"
"os"
"strings"
"gemini-grc/config"
"gemini-grc/logging"
)
var Blacklist *[]string //nolint:gochecknoglobals
// LoadBlacklist reads the file at config.CONFIG.BlacklistPath into the
// package-level Blacklist. It does nothing when Blacklist is already set.
//
// Each line is trimmed of surrounding whitespace (which also removes the
// "\r" of CRLF files); blank lines and lines starting with '#' are skipped.
// If the file cannot be read, a warning is logged and Blacklist is set to
// an empty list. After this function returns, Blacklist is never nil.
func LoadBlacklist() {
	if Blacklist != nil {
		return
	}

	data, err := os.ReadFile(config.CONFIG.BlacklistPath)
	if err != nil {
		Blacklist = &[]string{}
		logging.LogWarn("Could not load Blacklist file: %v", err)
		return
	}

	lines := strings.Split(string(data), "\n")
	entries := make([]string, 0, len(lines))
	for _, line := range lines {
		entry := strings.TrimSpace(line)
		// Skip blank lines (e.g. the one produced by a trailing newline)
		// and comments, so neither is counted or matched as an entry.
		if entry == "" || strings.HasPrefix(entry, "#") {
			continue
		}
		entries = append(entries, entry)
	}

	Blacklist = &entries
	logging.LogInfo("Blacklist has %d entries", len(*Blacklist))
}
// IsBlacklisted reports whether the host of URL u is listed in Blacklist.
// An entry matches when it equals either the hostname or "hostname:port".
// It returns false when u cannot be parsed or when the blacklist has not
// been loaded yet.
func IsBlacklisted(u string) bool {
	url, err := common.ParseURL(u, "")
	if err != nil {
		return false
	}
	// Guard against a call made before LoadBlacklist: dereferencing a nil
	// Blacklist would panic.
	if Blacklist == nil {
		return false
	}
	hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
	for _, entry := range *Blacklist {
		if entry == url.Hostname || entry == hostWithPort {
			return true
		}
	}
	return false
}

52
gemini/errors.go Normal file
View File

@@ -0,0 +1,52 @@
package gemini
import (
"fmt"
"gemini-grc/errors"
)
// GeminiError describes an error at the Gemini network protocol level only.
// Errors of this type should be recorded to the snapshot.
// See https://geminiprotocol.net/docs/protocol-specification.gmi
type GeminiError struct {
	Msg    string // short description of the status category
	Code   int    // status code
	Header string // response header
}

// Error formats the status code and its description.
func (e *GeminiError) Error() string {
	code, msg := e.Code, e.Msg
	return fmt.Sprintf("gemini error: code %d %s", code, msg)
}
// NewGeminiError builds a *GeminiError for the given status code and
// response header. The message is chosen from the tens digit of the code;
// codes outside the 10-19 and 30-69 ranges get a generic message.
func NewGeminiError(code int, header string) error {
	msg := "unexpected Status code"
	// Integer division maps each range of ten codes onto one case.
	switch code / 10 {
	case 1:
		msg = "needs input"
	case 3:
		msg = "redirect"
	case 4:
		msg = "bad request"
	case 5:
		msg = "server error"
	case 6:
		msg = "TLS error"
	}
	return &GeminiError{Msg: msg, Code: code, Header: header}
}
// IsGeminiError reports whether err is non-nil and its chain contains a
// *GeminiError.
func IsGeminiError(err error) bool {
	var target *GeminiError
	return err != nil && errors.As(err, &target)
}

View File

@@ -2,13 +2,13 @@ package gemini
import ( import (
"fmt" "fmt"
"gemini-grc/common"
"net/url" "net/url"
"os" "os"
"path" "path"
"path/filepath" "path/filepath"
"strings" "strings"
"gemini-grc/common/snapshot"
"gemini-grc/logging" "gemini-grc/logging"
) )
@@ -64,7 +64,7 @@ func calcFilePath(rootPath, urlPath string) (string, error) {
return finalPath, nil return finalPath, nil
} }
func SaveToFile(rootPath string, s *common.Snapshot, done chan struct{}) { func SaveToFile(rootPath string, s *snapshot.Snapshot, done chan struct{}) {
parentPath := path.Join(rootPath, s.URL.Hostname) parentPath := path.Join(rootPath, s.URL.Hostname)
urlPath := s.URL.Path urlPath := s.URL.Path
// If path is empty, add `index.gmi` as the file to save // If path is empty, add `index.gmi` as the file to save
@@ -105,7 +105,7 @@ func ReadLines(path string) []string {
panic(fmt.Sprintf("Failed to read file: %s", err)) panic(fmt.Sprintf("Failed to read file: %s", err))
} }
lines := strings.Split(string(data), "\n") lines := strings.Split(string(data), "\n")
// Remove last line if empty // remove last line if empty
// (happens when file ends with '\n') // (happens when file ends with '\n')
if lines[len(lines)-1] == "" { if lines[len(lines)-1] == "" {
lines = lines[:len(lines)-1] lines = lines[:len(lines)-1]

View File

@@ -1,49 +0,0 @@
package gemini
import (
"fmt"
"regexp"
"strconv"
"gemini-grc/common"
)
// leadingStatusDigitsRe matches the one or two digits at the very start of
// a string. It is compiled once at package initialisation instead of on
// every call.
var leadingStatusDigitsRe = regexp.MustCompile(`^(\d{1,2})`)

// ParseFirstTwoDigits takes a string and returns the first one or two
// leading digits as an int.
//
// It returns an error wrapping common.ErrGeminiResponseHeader when input
// does not start with a digit, and an error wrapping common.ErrTextParse
// when the matched digits cannot be converted to an int.
func ParseFirstTwoDigits(input string) (int, error) {
	matches := leadingStatusDigitsRe.FindStringSubmatch(input)
	if len(matches) == 0 {
		return 0, fmt.Errorf("%w", common.ErrGeminiResponseHeader)
	}

	// matches[1] is the captured group holding the leading digits.
	code, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0, fmt.Errorf("%w: %w", common.ErrTextParse, err)
	}
	return code, nil
}
// redirectTargetRe captures the redirect target that follows a status code:
//
//	\d+       one or more digits
//	\s+       one or more whitespace characters
//	([^\r]+)  everything up to the next \r (or end of string)
//
// It is compiled once at package initialisation instead of on every call.
var redirectTargetRe = regexp.MustCompile(`\d+\s+([^\r]+)`)

// extractRedirectTarget returns the redirection URL by parsing the header
// (or error message) in input and resolving the target against currentURL
// with common.DeriveAbsoluteURL.
//
// It returns an error wrapping common.ErrGeminiRedirect when input contains
// no redirect target or when the target cannot be resolved.
func extractRedirectTarget(currentURL common.URL, input string) (*common.URL, error) {
	matches := redirectTargetRe.FindStringSubmatch(input)
	if len(matches) < 2 {
		return nil, fmt.Errorf("%w: %s", common.ErrGeminiRedirect, input)
	}
	newURL, err := common.DeriveAbsoluteURL(currentURL, matches[1])
	if err != nil {
		return nil, fmt.Errorf("%w: %w: %s", common.ErrGeminiRedirect, err, input)
	}
	return newURL, nil
}

View File

@@ -5,22 +5,24 @@ import (
"net/url" "net/url"
"regexp" "regexp"
"gemini-grc/common" "gemini-grc/common/linkList"
url2 "gemini-grc/common/url"
"gemini-grc/errors"
"gemini-grc/logging" "gemini-grc/logging"
"gemini-grc/util" "gemini-grc/util"
) )
func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList { func GetPageLinks(currentURL url2.URL, gemtext string) linkList.LinkList {
linkLines := util.GetLinesMatchingRegex(gemtext, `(?m)^=>[ \t]+.*`) linkLines := util.GetLinesMatchingRegex(gemtext, `(?m)^=>[ \t]+.*`)
if len(linkLines) == 0 { if len(linkLines) == 0 {
return nil return nil
} }
var linkURLs common.LinkList var linkURLs linkList.LinkList
// Normalize URLs in links // Normalize URLs in links
for _, line := range linkLines { for _, line := range linkLines {
linkUrl, err := ParseGeminiLinkLine(line, currentURL.String()) linkUrl, err := ParseGeminiLinkLine(line, currentURL.String())
if err != nil { if err != nil {
logging.LogDebug("%s: %s", common.ErrGeminiLinkLineParse, err) logging.LogDebug("error parsing gemini link line: %s", err)
continue continue
} }
linkURLs = append(linkURLs, *linkUrl) linkURLs = append(linkURLs, *linkUrl)
@@ -31,19 +33,18 @@ func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList {
// ParseGeminiLinkLine takes a single link line and the current URL, // ParseGeminiLinkLine takes a single link line and the current URL,
// return the URL converted to an absolute URL // return the URL converted to an absolute URL
// and its description. // and its description.
func ParseGeminiLinkLine(linkLine string, currentURL string) (*common.URL, error) { func ParseGeminiLinkLine(linkLine string, currentURL string) (*url2.URL, error) {
// Check: currentURL is parseable // Check: currentURL is parseable
baseURL, err := url.Parse(currentURL) baseURL, err := url.Parse(currentURL)
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) return nil, errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
} }
// Extract the actual URL and the description // Extract the actual URL and the description
re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`) re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)
matches := re.FindStringSubmatch(linkLine) matches := re.FindStringSubmatch(linkLine)
if len(matches) == 0 { if len(matches) == 0 {
// If the line doesn't match the expected format, return it unchanged return nil, errors.NewError(fmt.Errorf("error parsing link line: no regexp match for line %s", linkLine))
return nil, fmt.Errorf("%w could not parse gemini link %s", common.ErrGeminiLinkLineParse, linkLine)
} }
originalURLStr := matches[1] originalURLStr := matches[1]
@@ -51,7 +52,7 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*common.URL, error
// Check: Unescape the URL if escaped // Check: Unescape the URL if escaped
_, err = url.QueryUnescape(originalURLStr) _, err = url.QueryUnescape(originalURLStr)
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrURLDecode, err) return nil, errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
} }
description := "" description := ""
@@ -62,8 +63,7 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*common.URL, error
// Parse the URL from the link line // Parse the URL from the link line
parsedURL, err := url.Parse(originalURLStr) parsedURL, err := url.Parse(originalURLStr)
if err != nil { if err != nil {
// If URL parsing fails, return an error return nil, errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
} }
// If link URL is relative, resolve full URL // If link URL is relative, resolve full URL
@@ -71,17 +71,16 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*common.URL, error
parsedURL = baseURL.ResolveReference(parsedURL) parsedURL = baseURL.ResolveReference(parsedURL)
} }
// Remove usual first space from URL description: // remove usual first space from URL description:
// => URL description // => URL description
// ^^^^^^^^^^^^ // ^^^^^^^^^^^^
if len(description) > 0 && description[0] == ' ' { if len(description) > 0 && description[0] == ' ' {
description = description[1:] description = description[1:]
} }
finalURL, err := common.ParseURL(parsedURL.String(), description) finalURL, err := url2.ParseURL(parsedURL.String(), description, true)
if err != nil { if err != nil {
// If URL parsing fails, return an error return nil, errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
} }
return finalURL, nil return finalURL, nil

View File

@@ -1,18 +1,18 @@
package gemini package gemini
import ( import (
"errors"
"reflect" "reflect"
"strings"
"testing" "testing"
"gemini-grc/common" "gemini-grc/common/url"
) )
type TestData struct { type TestData struct {
currentURL string currentURL string
link string link string
value *common.URL value *url.URL
error error error string
} }
var data = []TestData{ var data = []TestData{
@@ -20,12 +20,12 @@ var data = []TestData{
currentURL: "https://gemini.com/", currentURL: "https://gemini.com/",
link: "https://gemini.com/", link: "https://gemini.com/",
value: nil, value: nil,
error: common.ErrGeminiLinkLineParse, error: "error parsing link line",
}, },
{ {
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> archive/ Complete Archive", link: "=> archive/ Complete Archive",
value: &common.URL{ value: &url.URL{
Protocol: "gemini", Protocol: "gemini",
Hostname: "gemi.dev", Hostname: "gemi.dev",
Port: 1965, Port: 1965,
@@ -33,12 +33,12 @@ var data = []TestData{
Descr: "Complete Archive", Descr: "Complete Archive",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd/archive/", Full: "gemini://gemi.dev:1965/cgi-bin/xkcd/archive/",
}, },
error: nil, error: "",
}, },
{ {
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> /cgi-bin/xkcd.cgi?a=5&b=6 Example", link: "=> /cgi-bin/xkcd.cgi?a=5&b=6 Example",
value: &common.URL{ value: &url.URL{
Protocol: "gemini", Protocol: "gemini",
Hostname: "gemi.dev", Hostname: "gemi.dev",
Port: 1965, Port: 1965,
@@ -46,12 +46,12 @@ var data = []TestData{
Descr: "Example", Descr: "Example",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?a=5&b=6", Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?a=5&b=6",
}, },
error: nil, error: "",
}, },
{ {
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> /cgi-bin/xkcd.cgi?1494 XKCD 1494: Insurance", link: "=> /cgi-bin/xkcd.cgi?1494 XKCD 1494: Insurance",
value: &common.URL{ value: &url.URL{
Protocol: "gemini", Protocol: "gemini",
Hostname: "gemi.dev", Hostname: "gemi.dev",
Port: 1965, Port: 1965,
@@ -59,12 +59,12 @@ var data = []TestData{
Descr: "XKCD 1494: Insurance", Descr: "XKCD 1494: Insurance",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494", Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494",
}, },
error: nil, error: "",
}, },
{ {
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> /cgi-bin/xkcd.cgi?1494#f XKCD 1494: Insurance", link: "=> /cgi-bin/xkcd.cgi?1494#f XKCD 1494: Insurance",
value: &common.URL{ value: &url.URL{
Protocol: "gemini", Protocol: "gemini",
Hostname: "gemi.dev", Hostname: "gemi.dev",
Port: 1965, Port: 1965,
@@ -72,12 +72,12 @@ var data = []TestData{
Descr: "XKCD 1494: Insurance", Descr: "XKCD 1494: Insurance",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494#f", Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494#f",
}, },
error: nil, error: "",
}, },
{ {
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> /cgi-bin/xkcd.cgi?c=5#d XKCD 1494: Insurance", link: "=> /cgi-bin/xkcd.cgi?c=5#d XKCD 1494: Insurance",
value: &common.URL{ value: &url.URL{
Protocol: "gemini", Protocol: "gemini",
Hostname: "gemi.dev", Hostname: "gemi.dev",
Port: 1965, Port: 1965,
@@ -85,12 +85,12 @@ var data = []TestData{
Descr: "XKCD 1494: Insurance", Descr: "XKCD 1494: Insurance",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?c=5#d", Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?c=5#d",
}, },
error: nil, error: "",
}, },
{ {
currentURL: "gemini://a.b/c#d", currentURL: "gemini://a.b/c#d",
link: "=> /d/e#f", link: "=> /d/e#f",
value: &common.URL{ value: &url.URL{
Protocol: "gemini", Protocol: "gemini",
Hostname: "a.b", Hostname: "a.b",
Port: 1965, Port: 1965,
@@ -98,7 +98,7 @@ var data = []TestData{
Descr: "", Descr: "",
Full: "gemini://a.b:1965/d/e#f", Full: "gemini://a.b:1965/d/e#f",
}, },
error: nil, error: "",
}, },
} }
@@ -110,13 +110,10 @@ func Test(t *testing.T) {
if expected.value != nil { if expected.value != nil {
t.Errorf("data[%d]: Expected value %v, got %v", i, nil, expected.value) t.Errorf("data[%d]: Expected value %v, got %v", i, nil, expected.value)
} }
if !errors.Is(err, common.ErrGeminiLinkLineParse) { if !strings.HasPrefix(err.Error(), expected.error) {
t.Errorf("data[%d]: expected error %v, got %v", i, expected.error, err) t.Errorf("data[%d]: expected error %v, got %v", i, expected.error, err)
} }
} else { } else {
if expected.error != nil {
t.Errorf("data[%d]: Expected error %v, got %v", i, nil, expected.error)
}
if !(reflect.DeepEqual(result, expected.value)) { if !(reflect.DeepEqual(result, expected.value)) {
t.Errorf("data[%d]: expected %#v, got %#v", i, expected.value, result) t.Errorf("data[%d]: expected %#v, got %#v", i, expected.value, result)
} }

View File

@@ -1,69 +0,0 @@
package gemini
import (
"testing"
"gemini-grc/common"
)
// TestExtractRedirectTargetFullURL checks that an absolute redirect target
// without a path is returned with the port 1965 added.
func TestExtractRedirectTargetFullURL(t *testing.T) {
	t.Parallel()
	const want = "gemini://target.gr:1965"
	base, _ := common.ParseURL("gemini://smol.gr", "")
	got, err := extractRedirectTarget(*base, "redirect: 31 gemini://target.gr")
	if err == nil && got.String() == want {
		return
	}
	t.Errorf("fail: Expected %s got %s", want, got)
}
// TestExtractRedirectTargetFullURLSlash checks that an absolute redirect
// target ending in "/" keeps its trailing slash and gets the port 1965.
func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
	t.Parallel()
	const want = "gemini://target.gr:1965/"
	base, _ := common.ParseURL("gemini://smol.gr", "")
	got, err := extractRedirectTarget(*base, "redirect: 31 gemini://target.gr/")
	if err == nil && got.String() == want {
		return
	}
	t.Errorf("fail: Expected %s got %s", want, got)
}
// TestExtractRedirectTargetRelativeURL checks that an absolute-path
// redirect target is resolved against the host of the current URL.
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
	t.Parallel()
	base, _ := common.ParseURL("gemini://smol.gr", "")
	got, err := extractRedirectTarget(*base, "redirect: 31 /a/b")
	if err == nil && got.String() == "gemini://smol.gr:1965/a/b" {
		return
	}
	t.Errorf("fail: %s", got)
}
// TestExtractRedirectTargetRelativeURL2 checks that the relative target
// "./" resolves to the root path of the current host.
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
	t.Parallel()
	base, _ := common.ParseURL("gemini://nox.im:1965", "")
	got, err := extractRedirectTarget(*base, "redirect: 31 ./")
	if err == nil && got.String() == "gemini://nox.im:1965/" {
		return
	}
	t.Errorf("fail: %s", got)
}
// TestExtractRedirectTargetRelativeURL3 checks that a bare file name is
// resolved against the root of the current host.
func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
	t.Parallel()
	base, _ := common.ParseURL("gemini://status.zvava.org:1965", "")
	got, err := extractRedirectTarget(*base, "redirect: 31 index.gmi")
	if err == nil && got.String() == "gemini://status.zvava.org:1965/index.gmi" {
		return
	}
	t.Errorf("fail: %s", got)
}
// TestExtractRedirectTargetWrong checks that a header with a status code
// but no target yields a nil result and a non-nil error.
func TestExtractRedirectTargetWrong(t *testing.T) {
	t.Parallel()
	base, _ := common.ParseURL("gemini://smol.gr", "")
	got, err := extractRedirectTarget(*base, "redirect: 31")
	if got == nil && err != nil {
		return
	}
	t.Errorf("fail: result should be nil, err is %s", err)
}

View File

@@ -1,54 +0,0 @@
package gemini
import "sync"
// IpAddressPool is used to limit requests per IP address. It maps an IP
// address to its number of active connections. All methods take Lock, and
// the zero value is ready to use: the map is created on first write.
type IpAddressPool struct {
	IPs  map[string]int // IP address -> number of active connections
	Lock sync.RWMutex   // guards IPs
}

// ensureMap allocates IPs if it is still nil. The caller must hold the
// write lock.
func (p *IpAddressPool) ensureMap() {
	if p.IPs == nil {
		p.IPs = make(map[string]int)
	}
}

// Set stores value as the connection count for key.
func (p *IpAddressPool) Set(key string, value int) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	p.ensureMap()
	p.IPs[key] = value
}

// Get returns the connection count for key, or 0 when key is not present.
func (p *IpAddressPool) Get(key string) int {
	p.Lock.RLock()
	defer p.Lock.RUnlock()
	// Reading a missing key, or reading from a nil map, yields 0.
	return p.IPs[key]
}

// Delete removes key from the pool. It is a no-op when key is not present.
func (p *IpAddressPool) Delete(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	delete(p.IPs, key)
}

// Incr adds one to the connection count for key, starting from 0 when key
// is not present.
func (p *IpAddressPool) Incr(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	p.ensureMap()
	p.IPs[key]++
}

// Decr subtracts one from the connection count for key and removes the
// entry when the count reaches 0. It is a no-op when key is not present.
func (p *IpAddressPool) Decr(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	count, ok := p.IPs[key]
	if !ok {
		return
	}
	count--
	if count == 0 {
		delete(p.IPs, key)
		return
	}
	p.IPs[key] = count
}

View File

@@ -2,44 +2,78 @@ package gemini
import ( import (
"crypto/tls" "crypto/tls"
"errors"
"fmt" "fmt"
"io" "io"
"net" "net"
gourl "net/url" stdurl "net/url"
"regexp" "regexp"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
"gemini-grc/common" errors2 "gemini-grc/common/errors"
"gemini-grc/common/snapshot"
_url "gemini-grc/common/url"
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/errors"
"gemini-grc/logging" "gemini-grc/logging"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
) )
type PageData struct { // Visit given URL, using the Gemini protocol.
ResponseCode int // Mutates given Snapshot with the data.
ResponseHeader string // In case of error, we store the error string
MimeType string // inside snapshot and return the error.
Lang string func Visit(url string) (s *snapshot.Snapshot, err error) {
GemText string s, err = snapshot.SnapshotFromURL(url, true)
Data []byte
}
func getHostIPAddresses(hostname string) ([]string, error) {
addrs, err := net.LookupHost(hostname)
if err != nil { if err != nil {
return nil, fmt.Errorf("%w:%w", common.ErrNetworkDNS, err) return nil, err
} }
return addrs, nil
defer func() {
if err != nil {
// GeminiError and HostError should
// be stored in the snapshot. Other
// errors are returned.
if errors2.IsHostError(err) {
s.Error = null.StringFrom(err.Error())
err = nil
} else if IsGeminiError(err) {
s.Error = null.StringFrom(err.Error())
s.Header = null.StringFrom(errors.Unwrap(err).(*GeminiError).Header)
s.ResponseCode = null.IntFrom(int64(errors.Unwrap(err).(*GeminiError).Code))
err = nil
} else {
s = nil
}
}
}()
data, err := ConnectAndGetData(s.URL.String())
if err != nil {
return s, err
}
s, err = processData(*s, data)
if err != nil {
return s, err
}
if isGeminiCapsule(s) {
links := GetPageLinks(s.URL, s.GemText.String)
if len(links) > 0 {
logging.LogDebug("Found %d links", len(links))
s.Links = null.ValueFrom(links)
}
}
return s, nil
} }
func ConnectAndGetData(url string) ([]byte, error) { func ConnectAndGetData(url string) ([]byte, error) {
parsedURL, err := gourl.Parse(url) parsedURL, err := stdurl.Parse(url)
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) return nil, errors.NewError(err)
} }
hostname := parsedURL.Hostname() hostname := parsedURL.Hostname()
port := parsedURL.Port() port := parsedURL.Port()
@@ -47,29 +81,28 @@ func ConnectAndGetData(url string) ([]byte, error) {
port = "1965" port = "1965"
} }
host := fmt.Sprintf("%s:%s", hostname, port) host := fmt.Sprintf("%s:%s", hostname, port)
timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
// Establish the underlying TCP connection. // Establish the underlying TCP connection.
dialer := &net.Dialer{ dialer := &net.Dialer{
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second, Timeout: timeoutDuration,
} }
conn, err := dialer.Dial("tcp", host) conn, err := dialer.Dial("tcp", host)
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err) return nil, errors2.NewHostError(err)
} }
// Make sure we always close the connection. // Make sure we always close the connection.
defer func() { defer func() {
// No need to handle error:
// Connection will time out eventually if still open somehow.
_ = conn.Close() _ = conn.Close()
}() }()
// Set read and write timeouts on the TCP connection. // Set read and write timeouts on the TCP connection.
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err) return nil, errors2.NewHostError(err)
} }
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err) return nil, errors2.NewHostError(err)
} }
// Perform the TLS handshake // Perform the TLS handshake
@@ -79,8 +112,17 @@ func ConnectAndGetData(url string) ([]byte, error) {
// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites. // MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
} }
tlsConn := tls.Client(conn, tlsConfig) tlsConn := tls.Client(conn, tlsConfig)
if err := tlsConn.Handshake(); err != nil { err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration))
return nil, fmt.Errorf("%w: %w", common.ErrNetworkTLS, err) if err != nil {
return nil, errors2.NewHostError(err)
}
err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, errors2.NewHostError(err)
}
err = tlsConn.Handshake()
if err != nil {
return nil, errors2.NewHostError(err)
} }
// We read `buf`-sized chunks and add data to `data`. // We read `buf`-sized chunks and add data to `data`.
@@ -91,10 +133,10 @@ func ConnectAndGetData(url string) ([]byte, error) {
// Fix for stupid server bug: // Fix for stupid server bug:
// Some servers return 'Header: 53 No proxying to other hosts or ports!' // Some servers return 'Header: 53 No proxying to other hosts or ports!'
// when the port is 1965 and is still specified explicitly in the URL. // when the port is 1965 and is still specified explicitly in the URL.
_url, _ := common.ParseURL(url, "") url2, _ := _url.ParseURL(url, "", true)
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort()))) _, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort())))
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrNetworkCannotWrite, err) return nil, errors2.NewHostError(err)
} }
// Read response bytes in len(buf) byte chunks // Read response bytes in len(buf) byte chunks
for { for {
@@ -103,90 +145,50 @@ func ConnectAndGetData(url string) ([]byte, error) {
data = append(data, buf[:n]...) data = append(data, buf[:n]...)
} }
if len(data) > config.CONFIG.MaxResponseSize { if len(data) > config.CONFIG.MaxResponseSize {
return nil, fmt.Errorf("%w: %v", common.ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize) return nil, errors2.NewHostError(err)
} }
if err != nil { if err != nil {
if errors.Is(err, io.EOF) { if errors.Is(err, io.EOF) {
break break
} }
return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err) return nil, errors2.NewHostError(err)
} }
} }
return data, nil return data, nil
} }
// Visit given URL, using the Gemini protocol. func processData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
// Mutates given Snapshot with the data.
// In case of error, we store the error string
// inside snapshot and return the error.
func Visit(s *common.Snapshot) (err error) {
// Don't forget to also store error
// response code (if we have one)
// and header
defer func() {
if err != nil {
s.Error = null.StringFrom(err.Error())
if errors.As(err, new(*common.GeminiError)) {
s.Header = null.StringFrom(err.(*common.GeminiError).Header)
s.ResponseCode = null.IntFrom(int64(err.(*common.GeminiError).Code))
}
}
}()
s.Timestamp = null.TimeFrom(time.Now())
data, err := ConnectAndGetData(s.URL.String())
if err != nil {
return err
}
pageData, err := processData(data)
if err != nil {
return err
}
s.Header = null.StringFrom(pageData.ResponseHeader)
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
s.MimeType = null.StringFrom(pageData.MimeType)
s.Lang = null.StringFrom(pageData.Lang)
if pageData.GemText != "" {
s.GemText = null.StringFrom(pageData.GemText)
}
if pageData.Data != nil {
s.Data = null.ValueFrom(pageData.Data)
}
return nil
}
// processData returne results from
// parsing Gemini header data:
// Code, mime type and lang (optional)
// Returns error if header was invalid
func processData(data []byte) (*PageData, error) {
header, body, err := getHeadersAndData(data) header, body, err := getHeadersAndData(data)
if err != nil { if err != nil {
return nil, err return nil, err
} }
code, mimeType, lang := getMimeTypeAndLang(header) code, mimeType, lang := getMimeTypeAndLang(header)
logging.LogDebug("Header: %s", strings.TrimSpace(header))
if code != 20 { if code != 0 {
return nil, common.NewErrGeminiStatusCode(code, header) s.ResponseCode = null.IntFrom(int64(code))
}
if header != "" {
s.Header = null.StringFrom(header)
}
if mimeType != "" {
s.MimeType = null.StringFrom(mimeType)
}
if lang != "" {
s.Lang = null.StringFrom(lang)
} }
pageData := PageData{
ResponseCode: code,
ResponseHeader: header,
MimeType: mimeType,
Lang: lang,
}
// If we've got a Gemini document, populate // If we've got a Gemini document, populate
// `GemText` field, otherwise raw data goes to `Data`. // `GemText` field, otherwise raw data goes to `Data`.
if mimeType == "text/gemini" { if mimeType == "text/gemini" {
validBody, err := BytesToValidUTF8(body) validBody, err := BytesToValidUTF8(body)
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrUTF8Parse, err) return nil, errors.NewError(err)
} }
pageData.GemText = validBody s.GemText = null.StringFrom(validBody)
} else { } else {
pageData.Data = body s.Data = null.ValueFrom(body)
} }
return &pageData, nil return &s, nil
} }
// Checks for a Gemini header, which is // Checks for a Gemini header, which is
@@ -196,29 +198,42 @@ func processData(data []byte) (*PageData, error) {
func getHeadersAndData(data []byte) (string, []byte, error) { func getHeadersAndData(data []byte) (string, []byte, error) {
firstLineEnds := slices.Index(data, '\n') firstLineEnds := slices.Index(data, '\n')
if firstLineEnds == -1 { if firstLineEnds == -1 {
return "", nil, common.ErrGeminiResponseHeader return "", nil, errors2.NewHostError(fmt.Errorf("error parsing header"))
} }
firstLine := string(data[:firstLineEnds]) firstLine := string(data[:firstLineEnds])
rest := data[firstLineEnds+1:] rest := data[firstLineEnds+1:]
return firstLine, rest, nil return strings.TrimSpace(firstLine), rest, nil
} }
// Parses code, mime type and language // getMimeTypeAndLang Parses code, mime type and language
// from a Gemini header. // given a Gemini header.
// Examples:
// `20 text/gemini lang=en` (code, mimetype, lang)
// `20 text/gemini` (code, mimetype)
// `31 gemini://redirected.to/other/site` (code)
func getMimeTypeAndLang(headers string) (int, string, string) { func getMimeTypeAndLang(headers string) (int, string, string) {
// Regex that parses code, mimetype & optional charset/lang parameters // First try to match the full format: "<code> <mimetype> [charset=<value>] [lang=<value>]"
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`) // The regex looks for:
// - A number (\d+)
// - Followed by whitespace and a mimetype ([a-zA-Z0-9/\-+]+)
// - Optionally followed by charset and/or lang parameters in any order
// - Only capturing the lang value, ignoring charset
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:(?:[\s;]+(?:charset=[^;\s]+|lang=([a-zA-Z0-9-]+)))*)\s*$`)
matches := re.FindStringSubmatch(headers) matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 { if matches == nil || len(matches) <= 1 {
// Try to get code at least // If full format doesn't match, try to match redirect format: "<code> <URL>"
re := regexp.MustCompile(`^(\d+)\s+`) // This handles cases like "31 gemini://example.com"
re := regexp.MustCompile(`^(\d+)\s+(.+)$`)
matches := re.FindStringSubmatch(headers) matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 { if matches == nil || len(matches) <= 1 {
return 0, "", "" // If redirect format doesn't match, try to match just a status code
// This handles cases like "99"
re := regexp.MustCompile(`^(\d+)\s*$`)
matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 {
return 0, "", ""
}
code, err := strconv.Atoi(matches[1])
if err != nil {
return 0, "", ""
}
return code, "", ""
} }
code, err := strconv.Atoi(matches[1]) code, err := strconv.Atoi(matches[1])
if err != nil { if err != nil {
@@ -231,6 +246,10 @@ func getMimeTypeAndLang(headers string) (int, string, string) {
return 0, "", "" return 0, "", ""
} }
mimeType := matches[2] mimeType := matches[2]
param := matches[3] // This will capture either charset or lang value lang := matches[3] // Will be empty string if no lang parameter was found
return code, mimeType, param return code, mimeType, lang
}
func isGeminiCapsule(s *snapshot.Snapshot) bool {
return !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini"
} }

View File

@@ -1,78 +1,366 @@
package gemini package gemini
import ( import (
"slices"
"strings"
"testing" "testing"
"gemini-grc/common/snapshot"
) )
// Test for input: `20 text/gemini` func TestGetHeadersAndData(t *testing.T) {
func TestGetMimeTypeAndLang1(t *testing.T) {
t.Parallel() t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini") tests := []struct {
if code != 20 || mimeType != "text/gemini" || lang != "" { input []byte
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) header string
body []byte
expectError bool
}{
{[]byte("20 text/gemini\r\nThis is the body"), "20 text/gemini", []byte("This is the body"), false},
{[]byte("20 text/gemini\nThis is the body"), "20 text/gemini", []byte("This is the body"), false},
{[]byte("53 No proxying!\r\n"), "53 No proxying!", []byte(""), false},
{[]byte("No header"), "", nil, true},
}
for _, test := range tests {
header, body, err := getHeadersAndData(test.input)
if test.expectError && err == nil {
t.Errorf("Expected error, got nil for input: %s", test.input)
}
if !test.expectError && err != nil {
t.Errorf("Unexpected error for input '%s': %v", test.input, err)
}
if header != test.header {
t.Errorf("Expected header '%s', got '%s' for input: %s", test.header, header, test.input)
}
if !slices.Equal(body, test.body) {
t.Errorf("Expected body '%s', got '%s' for input: %s", test.body, string(body), test.input)
}
} }
} }
func TestGetMimeTypeAndLang11(t *testing.T) { func TestGetMimeTypeAndLang(t *testing.T) {
t.Parallel() t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n") tests := []struct {
if code != 20 || mimeType != "text/gemini" || lang != "" { header string
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang) code int
mimeType string
lang string
}{
{"20 text/gemini lang=en", 20, "text/gemini", "en"},
{"20 text/gemini", 20, "text/gemini", ""},
{"31 gemini://redirected.to/other/site", 31, "", ""},
{"20 text/plain;charset=utf-8", 20, "text/plain", ""},
{"20 text/plain;lang=el-GR", 20, "text/plain", "el-GR"},
{"20 text/gemini;lang=en-US;charset=utf-8", 20, "text/gemini", "en-US"}, // charset should be ignored
{"Invalid header", 0, "", ""},
{"99", 99, "", ""},
}
for _, test := range tests {
code, mimeType, lang := getMimeTypeAndLang(test.header)
if code != test.code {
t.Errorf("Expected code %d, got %d for header: %s", test.code, code, test.header)
}
if mimeType != test.mimeType {
t.Errorf("Expected mimeType '%s', got '%s' for header: %s", test.mimeType, mimeType, test.header)
}
if lang != test.lang {
t.Errorf("Expected lang '%s', got '%s' for header: %s", test.lang, lang, test.header)
}
} }
} }
func TestGetMimeTypeAndLang12(t *testing.T) { func TestProcessData(t *testing.T) {
t.Parallel() t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8") tests := []struct {
if code != 20 || mimeType != "text/plain" || lang != "utf-8" { name string
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang) inputData []byte
expectedCode int
expectedMime string
expectedLang string
expectedData []byte
expectedError bool
}{
{
name: "Gemini document",
inputData: []byte("20 text/gemini\r\n# Hello\nWorld"),
expectedCode: 20,
expectedMime: "text/gemini",
expectedLang: "",
expectedData: []byte("# Hello\nWorld"),
expectedError: false,
},
{
name: "Gemini document with language",
inputData: []byte("20 text/gemini lang=en\r\n# Hello\nWorld"),
expectedCode: 20,
expectedMime: "text/gemini",
expectedLang: "en",
expectedData: []byte("# Hello\nWorld"),
expectedError: false,
},
{
name: "Non-Gemini document",
inputData: []byte("20 text/html\r\n<h1>Hello</h1>"),
expectedCode: 20,
expectedMime: "text/html",
expectedLang: "",
expectedData: []byte("<h1>Hello</h1>"),
expectedError: false,
},
{
name: "Error header",
inputData: []byte("53 No proxying!\r\n"),
expectedCode: 53,
expectedMime: "",
expectedLang: "",
expectedData: []byte(""),
expectedError: false,
},
{
name: "Invalid header",
inputData: []byte("Invalid header"),
expectedError: true,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
s := snapshot.Snapshot{}
result, err := processData(s, test.inputData)
if test.expectedError && err == nil {
t.Errorf("Expected error, got nil")
return
}
if !test.expectedError && err != nil {
t.Errorf("Unexpected error: %v", err)
return
}
if test.expectedError {
return
}
if int(result.ResponseCode.ValueOrZero()) != test.expectedCode {
t.Errorf("Expected code %d, got %d", test.expectedCode, int(result.ResponseCode.ValueOrZero()))
}
if result.MimeType.ValueOrZero() != test.expectedMime {
t.Errorf("Expected mimeType '%s', got '%s'", test.expectedMime, result.MimeType.ValueOrZero())
}
if result.Lang.ValueOrZero() != test.expectedLang {
t.Errorf("Expected lang '%s', got '%s'", test.expectedLang, result.Lang.ValueOrZero())
}
if test.expectedMime == "text/gemini" {
if !strings.Contains(result.GemText.String, string(test.expectedData)) {
t.Errorf("Expected GemText '%s', got '%s'", test.expectedData, result.GemText.String)
}
} else {
if !slices.Equal(result.Data.ValueOrZero(), test.expectedData) {
t.Errorf("Expected data '%s', got '%s'", test.expectedData, result.Data.ValueOrZero())
}
}
})
} }
} }
func TestGetMimeTypeAndLang13(t *testing.T) { //// Mock Gemini server for testing ConnectAndGetData
//func mockGeminiServer(response string, delay time.Duration, closeConnection bool) net.Listener {
// listener, err := net.Listen("tcp", "127.0.0.1:0") // Bind to a random available port
// if err != nil {
// panic(fmt.Sprintf("Failed to create mock server: %v", err))
// }
//
// go func() {
// conn, err := listener.Accept()
// if err != nil {
// if !closeConnection { // Don't panic if we closed the connection on purpose
// panic(fmt.Sprintf("Failed to accept connection: %v", err))
// }
// return
// }
// defer conn.Close()
//
// time.Sleep(delay) // Simulate network latency
//
// _, err = conn.Write([]byte(response))
// if err != nil && !closeConnection {
// panic(fmt.Sprintf("Failed to write response: %v", err))
// }
// }()
//
// return listener
//}
// func TestConnectAndGetData(t *testing.T) {
// config.CONFIG = config.ConfigStruct{
// ResponseTimeout: 5,
// MaxResponseSize: 1024 * 1024,
// }
// tests := []struct {
// name string
// serverResponse string
// serverDelay time.Duration
// expectedData []byte
// expectedError bool
// closeConnection bool
// }{
// {
// name: "Successful response",
// serverResponse: "20 text/gemini\r\n# Hello",
// expectedData: []byte("20 text/gemini\r\n# Hello"),
// expectedError: false,
// },
// {
// name: "Server error",
// serverResponse: "50 Server error\r\n",
// expectedData: []byte("50 Server error\r\n"),
// expectedError: false,
// },
// {
// name: "Timeout",
// serverDelay: 6 * time.Second, // Longer than the timeout
// expectedError: true,
// },
// {
// name: "Server closes connection",
// closeConnection: true,
// expectedError: true,
// },
// }
// for _, test := range tests {
// t.Run(test.name, func(t *testing.T) {
// listener := mockGeminiServer(test.serverResponse, test.serverDelay, test.closeConnection)
// defer func() {
// test.closeConnection = true // Prevent panic in mock server
// listener.Close()
// }()
// addr := listener.Addr().String()
// data, err := ConnectAndGetData(fmt.Sprintf("gemini://%s/", addr))
// if test.expectedError && err == nil {
// t.Errorf("Expected error, got nil")
// }
// if !test.expectedError && err != nil {
// t.Errorf("Unexpected error: %v", err)
// }
// if !slices.Equal(data, test.expectedData) {
// t.Errorf("Expected data '%s', got '%s'", test.expectedData, data)
// }
// })
// }
// }
// func TestVisit(t *testing.T) {
// config.CONFIG = config.ConfigStruct{
// ResponseTimeout: 5,
// MaxResponseSize: 1024 * 1024,
// }
// tests := []struct {
// name string
// serverResponse string
// expectedCode int
// expectedMime string
// expectedError bool
// expectedLinks []string
// }{
// {
// name: "Successful response",
// serverResponse: "20 text/gemini\r\n# Hello\n=> /link1 Link 1\n=> /link2 Link 2",
// expectedCode: 20,
// expectedMime: "text/gemini",
// expectedError: false,
// expectedLinks: []string{"gemini://127.0.0.1:1965/link1", "gemini://127.0.0.1:1965/link2"},
// },
// {
// name: "Server error",
// serverResponse: "50 Server error\r\n",
// expectedCode: 50,
// expectedMime: "Server error",
// expectedError: false,
// expectedLinks: []string{},
// },
// }
// for _, test := range tests {
// t.Run(test.name, func(t *testing.T) {
// listener := mockGeminiServer(test.serverResponse, 0, false)
// defer listener.Close()
// addr := listener.Addr().String()
// snapshot, err := Visit(fmt.Sprintf("gemini://%s/", addr))
// if test.expectedError && err == nil {
// t.Errorf("Expected error, got nil")
// }
// if !test.expectedError && err != nil {
// t.Errorf("Unexpected error: %v", err)
// }
// if snapshot.ResponseCode.ValueOrZero() != int64(test.expectedCode) {
// t.Errorf("Expected code %d, got %d", test.expectedCode, snapshot.ResponseCode.ValueOrZero())
// }
// if snapshot.MimeType.ValueOrZero() != test.expectedMime {
// t.Errorf("Expected mimeType '%s', got '%s'", test.expectedMime, snapshot.MimeType.ValueOrZero())
// }
// if test.expectedLinks != nil {
// links, _ := snapshot.Links.Value()
// if len(links) != len(test.expectedLinks) {
// t.Errorf("Expected %d links, got %d", len(test.expectedLinks), len(links))
// }
// for i, link := range links {
// if link != test.expectedLinks[i] {
// t.Errorf("Expected link '%s', got '%s'", test.expectedLinks[i], link)
// }
// }
// }
// })
// }
// }
func TestVisit_InvalidURL(t *testing.T) {
t.Parallel() t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8") _, err := Visit("invalid-url")
if code != 20 || mimeType != "text/gemini" || lang != "utf-8" { if err == nil {
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang) t.Errorf("Expected error for invalid URL, got nil")
} }
} }
func TestGetTypeAndLang2(t *testing.T) { //func TestVisit_GeminiError(t *testing.T) {
t.Parallel() // listener := mockGeminiServer("51 Not Found\r\n", 0, false)
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini charset=en") // defer listener.Close()
if code != 20 || mimeType != "text/gemini" || lang != "en" { // addr := listener.Addr().String()
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang) //
} // s, err := Visit(fmt.Sprintf("gemini://%s/", addr))
} // if err != nil {
// t.Errorf("Unexpected error: %v", err)
func TestGetTypeAndLang21(t *testing.T) { // }
t.Parallel() //
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en") // expectedError := "51 Not Found"
if code != 20 || mimeType != "text/gemini" || lang != "en" { // if s.Error.ValueOrZero() != expectedError {
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang) // t.Errorf("Expected error in snapshot: %v, got %v", expectedError, s.Error)
} // }
} //
// expectedCode := 51
func TestGetMimeTypeAndLang3(t *testing.T) { // if s.ResponseCode.ValueOrZero() != int64(expectedCode) {
t.Parallel() // t.Errorf("Expected code %d, got %d", expectedCode, s.ResponseCode.ValueOrZero())
code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page") // }
if code != 31 || mimeType != "" || lang != "" { //}
t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
}
}
func TestGetMimeTypeAndLang4(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd")
if code != 0 || mimeType != "" || lang != "" {
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
}
}
func TestGetMimeTypeAndLang5(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("")
if code != 0 || mimeType != "" || lang != "" {
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
}
}

View File

@@ -26,7 +26,7 @@ func BytesToValidUTF8(input []byte) (string, error) {
if len(input) > maxSize { if len(input) > maxSize {
return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize) return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize)
} }
// Remove NULL byte 0x00 (ReplaceAll accepts slices) // remove NULL byte 0x00 (ReplaceAll accepts slices)
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{}) inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
if utf8.Valid(inputNoNull) { if utf8.Valid(inputNoNull) {
return string(inputNoNull), nil return string(inputNoNull), nil

View File

@@ -2,10 +2,11 @@ package gemini
import ( import (
"fmt" "fmt"
"gemini-grc/common"
"strings" "strings"
"sync" "sync"
"gemini-grc/common/snapshot"
geminiUrl "gemini-grc/common/url"
"gemini-grc/logging" "gemini-grc/logging"
) )
@@ -16,7 +17,7 @@ import (
// list is stored for caching. // list is stored for caching.
var RobotsCache sync.Map //nolint:gochecknoglobals var RobotsCache sync.Map //nolint:gochecknoglobals
func populateBlacklist(key string) (entries []string) { func populateRobotsCache(key string) (entries []string, _err error) {
// We either store an empty list when // We either store an empty list when
// no rules, or a list of disallowed URLs. // no rules, or a list of disallowed URLs.
// This applies even if we have an error // This applies even if we have an error
@@ -27,53 +28,60 @@ func populateBlacklist(key string) (entries []string) {
url := fmt.Sprintf("gemini://%s/robots.txt", key) url := fmt.Sprintf("gemini://%s/robots.txt", key)
robotsContent, err := ConnectAndGetData(url) robotsContent, err := ConnectAndGetData(url)
if err != nil { if err != nil {
logging.LogDebug("robots.txt error %s", err) return []string{}, err
return []string{}
} }
robotsData, err := processData(robotsContent) s, err := snapshot.SnapshotFromURL(url, true)
if err != nil {
return []string{}, nil
}
s, err = processData(*s, robotsContent)
if err != nil { if err != nil {
logging.LogDebug("robots.txt error %s", err) logging.LogDebug("robots.txt error %s", err)
return []string{} return []string{}, nil
} }
if robotsData.ResponseCode != 20 { if s.ResponseCode.ValueOrZero() != 20 {
logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode) logging.LogDebug("robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
return []string{} return []string{}, nil
} }
// Some return text/plain, others text/gemini. // Some return text/plain, others text/gemini.
// According to spec, the first is correct, // According to spec, the first is correct,
// however let's be lenient // however let's be lenient
var data string var data string
switch { switch {
case robotsData.MimeType == "text/plain": case s.MimeType.ValueOrZero() == "text/plain":
data = string(robotsData.Data) data = string(s.Data.ValueOrZero())
case robotsData.MimeType == "text/gemini": case s.MimeType.ValueOrZero() == "text/gemini":
data = robotsData.GemText data = s.GemText.ValueOrZero()
default: default:
return []string{} return []string{}, nil
} }
entries = ParseRobotsTxt(data, key) entries = ParseRobotsTxt(data, key)
return entries return entries, nil
} }
// RobotMatch checks if the snapshot URL matches // RobotMatch checks if the snapshot URL matches
// a robots.txt allow rule. // a robots.txt allow rule.
func RobotMatch(u string) bool { func RobotMatch(u string) (bool, error) {
url, err := common.ParseURL(u, "") url, err := geminiUrl.ParseURL(u, "", true)
if err != nil { if err != nil {
return false return false, err
} }
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port)) key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
logging.LogDebug("Checking robots.txt cache for %s", key)
var disallowedURLs []string var disallowedURLs []string
cacheEntries, ok := RobotsCache.Load(key) cacheEntries, ok := RobotsCache.Load(key)
if !ok { if !ok {
// First time check, populate robot cache // First time check, populate robot cache
disallowedURLs = populateBlacklist(key) disallowedURLs, err := populateRobotsCache(key)
logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs) if err != nil {
return false, err
}
if len(disallowedURLs) > 0 {
logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
}
} else { } else {
disallowedURLs, _ = cacheEntries.([]string) disallowedURLs, _ = cacheEntries.([]string)
} }
return isURLblocked(disallowedURLs, url.Full) return isURLblocked(disallowedURLs, url.Full), nil
} }
func isURLblocked(disallowedURLs []string, input string) bool { func isURLblocked(disallowedURLs []string, input string) bool {

View File

@@ -1,344 +0,0 @@
package gemini
import (
	"database/sql"
	"errors"
	"fmt"
	"strings"
	"time"

	"gemini-grc/common"
	_db "gemini-grc/db"
	"gemini-grc/logging"
	"gemini-grc/util"

	"github.com/guregu/null/v5"
	"github.com/jmoiron/sqlx"
)
// SpawnWorkers creates the shared status channel, starts the status
// printer and launches numOfWorkers goroutines. Each goroutine sleeps
// for a randomized number of seconds and then calls RunWorkerWithTx in
// an endless loop; the function itself returns immediately.
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
	logging.LogInfo("Spawning %d workers", numOfWorkers)

	statusChan = make(chan WorkerStatus, numOfWorkers)
	go PrintWorkerStatus(numOfWorkers, statusChan)

	for workerID := range numOfWorkers {
		// Each iteration has its own workerID, so the closure can
		// capture it directly.
		go func() {
			// Jitter so that workers do not all start at the same time.
			jitter := time.Duration(util.SecureRandomInt(10)) * time.Second
			time.Sleep(jitter)
			for {
				RunWorkerWithTx(workerID, db, nil)
			}
		}()
	}
}
// RunWorkerWithTx runs one round of work for the given worker inside a
// single database transaction.
//
// It reports "Starting up" and, on return, "Done" on statusChan, opens
// a transaction, hands it to runWorker together with the optional url,
// and commits. A commit that fails with a deadlock error is logged,
// followed by a 10 second pause, and the function returns normally so
// the caller can retry. Failing to begin the transaction, or any other
// commit failure, panics.
func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) {
	statusChan <- WorkerStatus{id: workerID, status: "Starting up"}
	defer func() {
		statusChan <- WorkerStatus{id: workerID, status: "Done"}
	}()

	tx, err := db.Beginx()
	if err != nil {
		panic(fmt.Sprintf("Failed to begin transaction: %v", err))
	}

	runWorker(workerID, tx, url)

	logging.LogDebug("[%d] Committing transaction", workerID)
	err = tx.Commit()
	if err == nil {
		logging.LogDebug("[%d] Worker done!", workerID)
		return
	}

	// %v rather than %w: the wrapping verb is only meaningful to
	// fmt.Errorf and is rendered as "%!w(...)" by Sprintf-style loggers.
	logging.LogError("[%d] Failed to commit transaction: %v", workerID, err)
	if !_db.IsDeadlockError(err) {
		panic(fmt.Sprintf("[%d] Failed to commit transaction: %v", workerID, err))
	}

	logging.LogError("[%d] Deadlock detected. Rolling back", workerID)
	time.Sleep(10 * time.Second)
	// Commit marks the transaction as finished even when it fails, so
	// Rollback reports sql.ErrTxDone here. That is expected and must
	// not abort the worker; only a different rollback error is fatal.
	if rbErr := tx.Rollback(); rbErr != nil && !errors.Is(rbErr, sql.ErrTxDone) {
		panic(fmt.Sprintf("[%d] Failed to roll back transaction: %v", workerID, rbErr))
	}
}
// runWorker visits a batch of URLs within the given transaction.
//
// When url is nil the batch is fetched with _db.GetURLsToVisit; an
// error from that call panics, and an empty batch makes the worker
// sleep for a minute and return. When url is non-nil only that URL is
// visited, after being normalized through common.ParseURL; an invalid
// URL is logged and skipped.
//
// Each URL is handed to workOnUrl. Expected errors are stored by
// workOnUrl inside the snapshot; any error it returns is treated as
// unexpected and passed to util.PrintStackAndPanic.
func runWorker(workerID int, tx *sqlx.Tx, url *string) {
	var urls []string

	if url == nil {
		statusChan <- WorkerStatus{id: workerID, status: "Getting URLs"}
		var err error
		urls, err = _db.GetURLsToVisit(tx)
		if err != nil {
			// %v rather than %w: the wrapping verb is only meaningful
			// to fmt.Errorf.
			logging.LogError("[%d] GeminiError retrieving snapshot: %v", workerID, err)
			panic("This should never happen")
		}
		if len(urls) == 0 {
			logging.LogInfo("[%d] No URLs to visit.", workerID)
			time.Sleep(1 * time.Minute)
			return
		}
	} else {
		geminiURL, err := common.ParseURL(*url, "")
		if err != nil {
			logging.LogError("Invalid URL given: %s", *url)
			return
		}
		urls = []string{geminiURL.String()}
	}

	// Start visiting URLs.
	total := len(urls)
	for i, u := range urls {
		logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, u)
		if err := workOnUrl(workerID, tx, u); err != nil {
			logging.LogError("[%d] Unexpected GeminiError %v while visiting %s", workerID, err, u)
			util.PrintStackAndPanic(err)
		}
		logging.LogDebug("[%d] Done %d/%d.", workerID, i+1, total)
	}
}
// workOnUrl visits a URL and stores the result as a snapshot.
//
// Unexpected errors are returned to the caller. Expected errors
// (robots.txt exclusion, DNS resolution failure, known visit errors)
// are stored within the snapshot and nil is returned.
//
// Blacklisted URLs are ignored. If another worker keeps visiting the
// same host IPs after a bounded number of waits, the URL is skipped
// for now: nil is returned and no snapshot is written.
func workOnUrl(workerID int, tx *sqlx.Tx, url string) (err error) {
	if url == "" {
		return fmt.Errorf("nil URL given")
	}
	if IsBlacklisted(url) {
		logging.LogDebug("[%d] URL matches Blacklist, ignoring %s", workerID, url)
		return nil
	}
	s := common.SnapshotFromURL(url)

	// If URL matches a robots.txt disallow line,
	// add it as an error so next time it won't be
	// crawled.
	if RobotMatch(url) {
		s.Error = null.StringFrom(common.ErrGeminiRobotsDisallowed.Error())
		err = _db.OverwriteSnapshot(workerID, tx, s)
		if err != nil {
			return fmt.Errorf("[%d] %w", workerID, err)
		}
		return nil
	}

	// Resolve IP address via DNS
	statusChan <- WorkerStatus{
		id:     workerID,
		status: fmt.Sprintf("Resolving %s", url),
	}
	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
		s.Error = null.StringFrom(err.Error())
		err = _db.OverwriteSnapshot(workerID, tx, s)
		if err != nil {
			return fmt.Errorf("[%d] %w", workerID, err)
		}
		return nil
	}

	// Wait for other workers to finish with this host, but only a
	// bounded number of times. The counter lives outside the loop body
	// so the give-up condition can actually be reached.
	const maxHostWaits = 2
	for waits := 0; isAnotherWorkerVisitingHost(workerID, IPs); waits++ {
		if waits == maxHostWaits {
			return nil
		}
		logging.LogDebug("[%d] Another worker is visiting this host, waiting", workerID)
		statusChan <- WorkerStatus{
			id:     workerID,
			status: fmt.Sprintf("Waiting to grab lock for host %s", s.Host),
		}
		time.Sleep(2 * time.Second) // Avoid flood-retrying
	}

	statusChan <- WorkerStatus{
		id:     workerID,
		status: fmt.Sprintf("Adding to pool %s", url),
	}
	// NOTE(review): the check above and this add are separate steps;
	// confirm whether two workers can both pass the check for one host.
	AddIPsToPool(IPs)
	// After finishing, remove the host IPs from
	// the connections pool, with a small delay
	// to avoid potentially hitting the same IP quickly.
	defer func() {
		go func() {
			time.Sleep(1 * time.Second)
			statusChan <- WorkerStatus{
				id:     workerID,
				status: fmt.Sprintf("Removing from pool %s", url),
			}
			RemoveIPsFromPool(IPs)
		}()
	}()

	statusChan <- WorkerStatus{
		id:     workerID,
		status: fmt.Sprintf("Visiting %s", url),
	}
	err = Visit(s)
	if err != nil {
		if !common.IsKnownError(err) {
			// %v, not %w: the %w verb is only meaningful to fmt.Errorf.
			logging.LogError("[%d] Unknown error visiting %s: %v", workerID, url, err)
			return err
		}
		s.Error = null.StringFrom(err.Error())
		// Check if error is redirection, and handle it. Use the value
		// extracted by errors.As: a direct type assertion on err would
		// panic if the GeminiError were wrapped.
		var gemErr *common.GeminiError
		if errors.As(err, &gemErr) && gemErr.Msg == "redirect" {
			err = handleRedirection(workerID, tx, s)
			if err != nil {
				if !common.IsKnownError(err) {
					return err
				}
				s.Error = null.StringFrom(err.Error())
			}
		}
	}

	// If this is a gemini page, parse possible links inside
	if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
		links := GetPageLinks(s.URL, s.GemText.String)
		if len(links) > 0 {
			logging.LogDebug("[%d] Found %d links", workerID, len(links))
			s.Links = null.ValueFrom(links)
			err = storeLinks(tx, s)
			if err != nil {
				return err
			}
		}
	} else {
		logging.LogDebug("[%d] Not text/gemini, so not looking for page links", workerID)
	}

	err = _db.OverwriteSnapshot(workerID, tx, s)
	logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
	if err != nil {
		return err
	}
	return nil
}
// isAnotherWorkerVisitingHost reports whether at least one of the
// given IPs is currently present in IPPool. The lookup is done while
// holding the pool's read lock.
func isAnotherWorkerVisitingHost(workerID int, IPs []string) bool {
	IPPool.Lock.RLock()
	defer IPPool.Lock.RUnlock()

	logging.LogDebug("[%d] Checking pool for IPs", workerID)
	for i := range IPs {
		if _, inPool := IPPool.IPs[IPs[i]]; inPool {
			return true
		}
	}
	return false
}
// storeLinks saves the links found in snapshot s as new snapshots,
// in batches. Links rejected by shouldPersistURL are dropped. It does
// nothing when s has no valid links or none of them qualify.
func storeLinks(tx *sqlx.Tx, s *common.Snapshot) error {
	if !s.Links.Valid {
		return nil
	}

	var pending []*common.Snapshot
	for _, link := range s.Links.ValueOrZero() {
		if !shouldPersistURL(&link) {
			continue
		}
		pending = append(pending, &common.Snapshot{
			URL:       link,
			Host:      link.Hostname,
			Timestamp: null.TimeFrom(time.Now()),
		})
	}
	if len(pending) == 0 {
		return nil
	}
	return _db.SaveLinksToDBinBatches(tx, pending)
}
// shouldPersistURL reports whether u should be saved in the database.
// Only URLs whose string form starts with the gemini:// scheme qualify.
func shouldPersistURL(u *common.URL) bool {
	const geminiScheme = "gemini://"
	return strings.HasPrefix(u.String(), geminiScheme)
}
// haveWeVisitedURL reports whether u is already present in the urls
// table or in the snapshots table.
//
// Each lookup is a SELECT EXISTS query read with tx.Get: it always
// yields exactly one boolean row, so a missing URL is reported as
// false rather than as an error. Database failures are returned
// wrapped in common.ErrDatabase.
func haveWeVisitedURL(tx *sqlx.Tx, u *common.URL) (bool, error) {
	target := u.String()
	queries := []string{
		`SELECT EXISTS(SELECT 1 FROM urls WHERE url=$1)`,
		`SELECT EXISTS(SELECT 1 FROM snapshots WHERE url=$1)`,
	}
	for _, query := range queries {
		var found bool
		if err := tx.Get(&found, query, target); err != nil {
			return false, fmt.Errorf("%w: %w", common.ErrDatabase, err)
		}
		if found {
			return true, nil
		}
	}
	return false, nil
}
// handleRedirection extracts the redirect target from the error text
// stored in snapshot s and, if shouldPersistURL accepts it, saves the
// target as a fresh snapshot (only when it is new).
// Extraction and save errors are returned to the caller.
func handleRedirection(workerID int, tx *sqlx.Tx, s *common.Snapshot) error {
	target, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
	if err != nil {
		if errors.Is(err, common.ErrGeminiRedirect) {
			logging.LogDebug("[%d] %s", workerID, err)
		}
		return err
	}
	logging.LogDebug("[%d] Page redirects to %s", workerID, target)

	if !shouldPersistURL(target) {
		return nil
	}

	// Insert fresh snapshot with new URL
	redirected := &common.Snapshot{
		URL:       *target,
		Host:      target.Hostname,
		Timestamp: null.TimeFrom(time.Now()),
	}
	logging.LogDebug("[%d] Saving redirection URL %s", workerID, redirected.URL.String())
	return _db.SaveSnapshotIfNew(tx, redirected)
}
// GetSnapshotFromURL selects at most one row from the snapshots table
// whose url column equals url. The result is returned as a slice; on
// a query error the slice is nil.
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]common.Snapshot, error) {
	const query = `
SELECT *
FROM snapshots
WHERE url=$1
LIMIT 1
`
	var rows []common.Snapshot
	if err := tx.Select(&rows, query, url); err != nil {
		return nil, err
	}
	return rows, nil
}

12
go.mod
View File

@@ -4,13 +4,13 @@ go 1.23.1
require ( require (
github.com/guregu/null/v5 v5.0.0 github.com/guregu/null/v5 v5.0.0
github.com/jackc/pgx/v5 v5.7.1 github.com/jackc/pgx/v5 v5.7.2
github.com/jmoiron/sqlx v1.4.0 github.com/jmoiron/sqlx v1.4.0
github.com/lib/pq v1.10.9 github.com/lib/pq v1.10.9
github.com/matoous/go-nanoid/v2 v2.1.0 github.com/matoous/go-nanoid/v2 v2.1.0
github.com/rs/zerolog v1.33.0 github.com/rs/zerolog v1.33.0
github.com/stretchr/testify v1.9.0 github.com/stretchr/testify v1.9.0
golang.org/x/text v0.19.0 golang.org/x/text v0.21.0
) )
require ( require (
@@ -19,12 +19,12 @@ require (
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/kr/text v0.2.0 // indirect github.com/kr/text v0.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-colorable v0.1.14 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-isatty v0.0.20 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.13.1 // indirect github.com/rogpeppe/go-internal v1.13.1 // indirect
golang.org/x/crypto v0.27.0 // indirect golang.org/x/crypto v0.32.0 // indirect
golang.org/x/sync v0.8.0 // indirect golang.org/x/sync v0.10.0 // indirect
golang.org/x/sys v0.25.0 // indirect golang.org/x/sys v0.29.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect
) )

23
go.sum
View File

@@ -14,8 +14,8 @@ github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsI
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgx/v5 v5.7.1 h1:x7SYsPBYDkHDksogeSmZZ5xzThcTgRz++I5E+ePFUcs= github.com/jackc/pgx/v5 v5.7.2 h1:mLoDLV6sonKlvjIEsV56SkWNCnuNv531l94GaIzO+XI=
github.com/jackc/pgx/v5 v5.7.1/go.mod h1:e7O26IywZZ+naJtWWos6i6fvWK+29etgITqrqHLfoZA= github.com/jackc/pgx/v5 v5.7.2/go.mod h1:ncY89UGWxg82EykZUwSpUKEfccBGGYq1xjrOpsbsfGQ=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
@@ -28,8 +28,9 @@ github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE= github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE=
github.com/matoous/go-nanoid/v2 v2.1.0/go.mod h1:KlbGNQ+FhrUNIHUxZdL63t7tl4LaPkZNpUULS8H4uVM= github.com/matoous/go-nanoid/v2 v2.1.0/go.mod h1:KlbGNQ+FhrUNIHUxZdL63t7tl4LaPkZNpUULS8H4uVM=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
@@ -49,17 +50,17 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc=
golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU=
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=

32
gopher/errors.go Normal file
View File

@@ -0,0 +1,32 @@
package gopher
import (
"gemini-grc/errors"
)
// GopherError is an error encountered while
// visiting a Gopher host, and is only for
// Gopher errors (item type indicator 3).
type GopherError struct {
	// Err is the underlying error being wrapped.
	Err error
}

// Error returns the message of the wrapped error. A nil receiver or
// a nil wrapped error yields a generic message instead of panicking.
func (e *GopherError) Error() string {
	if e == nil || e.Err == nil {
		return "unknown gopher error"
	}
	return e.Err.Error()
}

// Unwrap returns the wrapped error so that errors.Is and errors.As
// can inspect it. It returns nil for a nil receiver.
func (e *GopherError) Unwrap() error {
	if e == nil {
		return nil
	}
	return e.Err
}

// NewGopherError wraps err in a *GopherError.
func NewGopherError(err error) error {
	return &GopherError{Err: err}
}

// IsGopherError reports whether err, or any error it wraps, is a
// *GopherError. A nil err is never a Gopher error.
func IsGopherError(err error) bool {
	if err == nil {
		return false
	}
	var asError *GopherError
	return errors.As(err, &asError)
}

283
gopher/network.go Normal file
View File

@@ -0,0 +1,283 @@
package gopher
import (
"fmt"
"io"
"net"
stdurl "net/url"
"regexp"
"strings"
"time"
"unicode/utf8"
errors2 "gemini-grc/common/errors"
"gemini-grc/common/linkList"
"gemini-grc/common/snapshot"
_url "gemini-grc/common/url"
"gemini-grc/config"
"gemini-grc/errors"
"gemini-grc/logging"
"github.com/guregu/null/v5"
)
// References:
// RFC 1436 https://www.rfc-editor.org/rfc/rfc1436.html
// The default port for Gopher is 70.
// Originally Gopher used ASCII or
// ISO-8859-1, now others use UTF-8.
// In any case, just converting to UTF-8
// will work. If not, we bail.
// Here's the complete list of Gopher item type indicators (prefixes):
//
// `0` - Plain Text File
// `1` - Directory/Menu
// `2` - CSO Phone Book Server
// `3` - Error Message
// `4` - BinHexed Macintosh File
// `5` - DOS Binary Archive
// `6` - UNIX uuencoded File
// `7` - Index/Search Server
// `8` - Telnet Session
// `9` - Binary File
// `+` - Mirror/Redundant Server
// `g` - GIF Image
// `I` - Image File (non-GIF)
// `T` - TN3270 Session
// `i` - Informational Message (menu line)
// `h` - HTML File
// `s` - Sound/Music File
// `d` - Document File
// `w` - WHOIS Service
// `;` - Document File with Alternative View
// `<` - Video File
// `M` - MIME File (mail message or similar)
// `:` - Bitmap Image
// `c` - Calendar File
// `p` - PostScript File
// The most commonly used ones are `0` (text), `1` (directory), `i` (info), and `3` (error).
// The original Gopher protocol only specified types 0-9, `+`, `g`, `I`, and `T`.
// The others were added by various implementations and extensions over time.
// Error methodology:
// HostError for DNS/network errors
// GopherError for Gopher protocol errors (item type 3 responses)
// NewError for other errors
// NewFatalError for other fatal errors
// Visit fetches the given Gopher URL and returns a snapshot of it.
//
// Gopher and host errors met while connecting are stored in the
// snapshot's Error field and the snapshot is returned with a nil
// error; any other error is returned as-is with a nil snapshot.
//
// Responses that are not valid UTF-8 are stored as raw Data. Valid
// UTF-8 responses are stored as text (null characters removed), then
// checked for a Gopher error line and scanned for links. Only links
// that parse successfully are stored.
func Visit(url string) (*snapshot.Snapshot, error) {
	s, err := snapshot.SnapshotFromURL(url, false)
	if err != nil {
		return nil, err
	}

	data, err := connectAndGetData(url)
	if err != nil {
		logging.LogDebug("Error: %s", err.Error())
		if IsGopherError(err) || errors2.IsHostError(err) {
			s.Error = null.StringFrom(err.Error())
			return s, nil
		}
		return nil, err
	}

	text := string(data)
	if !utf8.ValidString(text) {
		s.Data = null.ValueFrom(data)
		return s, nil
	}
	s.GemText = null.StringFrom(removeNullChars(text))

	if responseError := checkForError(text); responseError != nil {
		s.Error = null.StringFrom(responseError.Error())
		return s, nil
	}

	links := getGopherPageLinks(text)
	// Append only links that parse, so the stored list never contains
	// zero-value URLs for the ones that failed.
	linkURLs := linkList.LinkList(make([]_url.URL, 0, len(links)))
	for _, link := range links {
		linkURL, err := _url.ParseURL(link, "", true)
		if err != nil {
			continue
		}
		linkURLs = append(linkURLs, *linkURL)
	}
	if len(linkURLs) != 0 {
		s.Links = null.ValueFrom(linkURLs)
	}
	return s, nil
}
// connectAndGetData opens a TCP connection to the host of the given
// Gopher URL (port 70 when the URL has none), sends the selector
// derived from the URL path and returns everything the server sends
// until it closes the connection.
//
// The dial, read and write all use config.CONFIG.ResponseTimeout
// seconds as their timeout. A URL that cannot be parsed yields a
// generic error; dial, deadline, write and read failures, as well as
// a response larger than config.CONFIG.MaxResponseSize bytes, yield
// a host error.
func connectAndGetData(url string) ([]byte, error) {
	parsedURL, err := stdurl.Parse(url)
	if err != nil {
		return nil, errors.NewError(err)
	}

	port := parsedURL.Port()
	if port == "" {
		port = "70"
	}
	// JoinHostPort brackets IPv6 literals, which plain "%s:%s"
	// formatting would turn into an undialable address.
	host := net.JoinHostPort(parsedURL.Hostname(), port)
	timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second

	// Establish the underlying TCP connection.
	dialer := &net.Dialer{
		Timeout: timeoutDuration,
	}
	logging.LogDebug("Dialing %s", host)
	conn, err := dialer.Dial("tcp", host)
	if err != nil {
		return nil, errors2.NewHostError(err)
	}
	// Make sure we always close the connection.
	defer func() {
		_ = conn.Close()
	}()

	// Set read and write timeouts on the TCP connection.
	err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
	if err != nil {
		return nil, errors2.NewHostError(err)
	}
	err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
	if err != nil {
		return nil, errors2.NewHostError(err)
	}

	// Send Gopher request to trigger server response.
	payload := constructPayloadFromPath(parsedURL.Path)
	_, err = conn.Write([]byte(payload + "\r\n"))
	if err != nil {
		return nil, errors2.NewHostError(err)
	}

	// Read response bytes in len(buf) byte chunks and add them to data.
	buf := make([]byte, 4096)
	var data []byte
	for {
		n, err := conn.Read(buf)
		if n > 0 {
			data = append(data, buf[:n]...)
		}
		// Enforce the size limit before looking at err, so a final
		// chunk delivered together with EOF is checked as well.
		if len(data) > config.CONFIG.MaxResponseSize {
			return nil, errors2.NewHostError(fmt.Errorf("response exceeded max"))
		}
		if err != nil {
			if errors.Is(err, io.EOF) {
				break
			}
			return nil, errors2.NewHostError(err)
		}
	}
	logging.LogDebug("Got %d bytes", len(data))
	return data, nil
}
// itemTypePathRe matches a URL path that starts with a single
// word-character Gopher item type segment, e.g. "/1/docs".
// Compiled once at package scope instead of on every call.
var itemTypePathRe = regexp.MustCompile(`^/[\w]/.*`)

// constructPayloadFromPath turns a Gopher URL path into the selector
// sent to the server. A leading "/<item type>/" segment is removed if
// present, and the result always starts with "/".
//
// Examples: "/1/path" -> "/path", "no/slash" -> "/no/slash",
// "" -> "/", "/1" -> "/1" (no trailing slash, so nothing is removed).
func constructPayloadFromPath(urlpath string) string {
	payload := urlpath
	if itemTypePathRe.MatchString(urlpath) {
		// \w only matches a single ASCII byte, so the item type
		// prefix is always exactly three bytes long: "/x/".
		payload = urlpath[len("/x/"):]
	}
	if !strings.HasPrefix(payload, "/") {
		payload = "/" + payload
	}
	return payload
}
// checkForError inspects the first line of a (whitespace-trimmed)
// Gopher response. If that line starts with the error item type "3",
// a Gopher error is returned whose message is the text before the
// first tab, trimmed. Otherwise nil is returned.
func checkForError(utfData string) error {
	firstLine, _, _ := strings.Cut(strings.TrimSpace(utfData), "\n")
	if !strings.HasPrefix(firstLine, "3") {
		return nil
	}
	display, _, _ := strings.Cut(firstLine, "\t")
	return NewGopherError(fmt.Errorf("gopher error: %s", strings.TrimSpace(display)))
}
// getGopherPageLinks extracts link URLs from a Gopher menu.
//
// Each menu line has the form
// <type><display>TAB<selector>TAB<host>[TAB<port>[TAB<extra>...]].
// Informational lines (type 'i'), the "." terminator, and lines with
// fewer than three fields or an empty host are skipped. For type 'h'
// lines whose selector starts with "URL:", the URL after that prefix
// is returned as-is. Every other line becomes
// gopher://<host>:<port>/<type>/<selector>, with port 70 when the
// line has no port. Fields after the port are ignored.
func getGopherPageLinks(content string) []string {
	var links []string
	for _, line := range strings.Split(strings.TrimSpace(content), "\n") {
		if line == "" || line == "." {
			continue
		}
		itemType := line[0]
		if itemType == 'i' {
			continue
		}

		// Split on every tab so that extra trailing fields (e.g. the
		// Gopher+ "+" marker) do not end up glued to the port.
		fields := strings.Split(line[1:], "\t")
		if len(fields) < 3 {
			continue
		}
		selector := strings.TrimSpace(fields[1])
		host := strings.TrimSpace(fields[2])
		if host == "" {
			continue
		}

		// Handle HTML links first
		if itemType == 'h' && strings.HasPrefix(selector, "URL:") {
			if target := strings.TrimSpace(selector[4:]); target != "" {
				links = append(links, target)
			}
			continue
		}

		port := "70"
		if len(fields) > 3 {
			if p := strings.TrimSpace(fields[3]); p != "" {
				port = p
			}
		}

		// For gopher links, build URL carefully:
		// protocol, host:port, then /type + selector.
		var b strings.Builder
		b.WriteString("gopher://")
		b.WriteString(host)
		b.WriteString(":")
		b.WriteString(port)
		b.WriteString("/")
		b.WriteByte(itemType)
		if !strings.HasPrefix(selector, "/") {
			b.WriteString("/")
		}
		b.WriteString(selector)
		links = append(links, b.String())
	}
	return links
}
// removeNullChars returns input with every NUL (U+0000) character
// removed; all other bytes are left untouched.
func removeNullChars(input string) string {
	const nul = "\u0000"
	return strings.Replace(input, nul, "", -1)
}

298
gopher/network_test.go Normal file
View File

@@ -0,0 +1,298 @@
package gopher
import (
"net"
"testing"
"gemini-grc/common/errors"
"gemini-grc/config"
"github.com/stretchr/testify/assert"
)
// TestConstructPayloadFromPath checks that a leading Gopher item type
// segment is stripped from URL paths and that the resulting selector
// always starts with a slash.
func TestConstructPayloadFromPath(t *testing.T) {
	type payloadCase struct {
		name     string
		input    string
		expected string
	}
	cases := []payloadCase{
		{"Path with Gopher item type", "/1/path/to/resource", "/path/to/resource"},
		{"Path with different item type", "/0/another/path", "/another/path"},
		{"Path without item type but with leading slash", "/simple/path", "/simple/path"},
		{"Path without item type and without leading slash", "no/leading/slash", "/no/leading/slash"},
		{"Empty path", "", "/"},
		{"Single character item type", "/h/homepage", "/homepage"},
		{"Single slash", "/", "/"},
		{"Item type-looking path", "/1", "/1"},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := constructPayloadFromPath(tc.input)
			if got == tc.expected {
				return
			}
			t.Errorf("constructPayloadFromPath(%q) = %q, want %q",
				tc.input, got, tc.expected)
		})
	}
}
// TestParseLinks checks how many links getGopherPageLinks extracts
// from various Gopher menu inputs. Only the number of links is
// asserted, not their content.
func TestParseLinks(t *testing.T) {
	tests := []struct {
		name  string
		input string
		want  int // number of expected links
	}{
		{
			name:  "Empty input",
			input: "",
			want:  0,
		},
		{
			name:  "Single directory link",
			input: "1About Us\t/about\texample.com",
			want:  1,
		},
		{
			name:  "Single text file link",
			input: "0README\t/readme.txt\texample.com",
			want:  1,
		},
		{
			name:  "Multiple links of different types",
			input: "1About Us\t/about\texample.com\n0README\t/readme.txt\texample.com\n1Contact\t/contact\texample.com",
			want:  3,
		},
		{
			name:  "Ignore non-linkable types",
			input: "iInfo line\t/info\texample.com\n1Directory\t/dir\texample.com\n0Text\t/text.txt\texample.com",
			want:  2,
		},
		{
			name:  "Malformed lines",
			input: "1Incomplete line\n0No tabs\n1Missing parts\t",
			want:  0,
		},
		{
			name:  "Mixed valid and invalid lines",
			input: "1Valid link\t/valid\texample.com\n1Incomplete\t\n0Text file\t/text.txt\texample.com\n1Another valid\t/another\texample.com",
			want:  3,
		},
		{
			name:  "Absolute URLs",
			input: "1External link\tgopher://external.com/path\texternal.com\n0Document\tgopher://other.com/doc.txt\tother.com",
			want:  2,
		},
		{
			name:  "With whitespace",
			input: " 1Padded line \t/padded\texample.com\n0Text file \t/doc.txt\texample.com",
			want:  2,
		},
		{
			name:  "Special characters in paths",
			input: "1Special chars\t/path with spaces\texample.com\n0Doc\t/über/päth.txt\texample.com",
			want:  2,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := getGopherPageLinks(tt.input)
			assert.Equal(t, tt.want, len(got), "expected %d links, got %d", tt.want, len(got))
		})
	}
}
// TestCheckForError checks that checkForError reports an error only
// when the first non-blank line of the response is a Gopher error
// line (item type 3), and that the message contains the expected text.
//
// All inputs are interpreted string literals so that \t and \n are
// real tab and newline characters, as in an actual Gopher response.
func TestCheckForError(t *testing.T) {
	tests := []struct {
		name        string
		input       string
		wantError   bool
		errorPrefix string
	}{
		{
			name:        "No error",
			input:       "1Directory\t/dir\texample.com\n0Text\t/text.txt\texample.com",
			wantError:   false,
			errorPrefix: "",
		},
		{
			name:        "Simple error message",
			input:       "3Error: File not found\t\texample.com",
			wantError:   true,
			errorPrefix: "gopher error: 3Error: File not found",
		},
		{
			name:        "Error with multiple tabs",
			input:       "3File not found\t/error\texample.com\t70",
			wantError:   true,
			errorPrefix: "gopher error: 3File not found",
		},
		{
			// Only the first line is inspected, so an error line
			// further down is not reported.
			name:        "Error among valid entries",
			input:       "1Welcome\t/welcome\texample.com\n3Access denied\t\texample.com\n0README\t/readme.txt\texample.com",
			wantError:   false,
			errorPrefix: "",
		},
		{
			name:        "Error with no tabs",
			input:       "3Server is down for maintenance",
			wantError:   true,
			errorPrefix: "gopher error: 3Server is down for maintenance",
		},
		{
			name:        "Multiple errors (should return first)",
			input:       "3First error\t\texample.com\n3Second error\t\texample.com",
			wantError:   true,
			errorPrefix: "gopher error: 3First error",
		},
		{
			name:        "Error with whitespace",
			input:       " 3 Error with spaces \t\texample.com",
			wantError:   true,
			errorPrefix: "gopher error: 3 Error with spaces",
		},
		{
			name:        "Empty input",
			input:       "",
			wantError:   false,
			errorPrefix: "",
		},
		{
			name:        "Just newlines",
			input:       "\n\n\n",
			wantError:   false,
			errorPrefix: "",
		},
		{
			name:        "Error after empty lines",
			input:       "\n\n3Error after empty lines\t\texample.com",
			wantError:   true,
			errorPrefix: "gopher error: 3Error after empty lines",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := checkForError(tt.input)
			if !tt.wantError {
				assert.NoError(t, err)
				return
			}
			assert.Error(t, err)
			assert.Contains(t, err.Error(), tt.errorPrefix)
		})
	}
}
// TestConnectAndGetDataTimeout checks that connectAndGetData returns
// a host error when the server accepts the connection but never
// sends any data.
func TestConnectAndGetDataTimeout(t *testing.T) {
	// Start a test server that doesn't respond
	listener, err := net.Listen("tcp", "localhost:0")
	if err != nil {
		t.Fatalf("Failed to start listener: %v", err)
	}
	defer listener.Close()

	// done is closed when the test returns, releasing the server
	// goroutine so it does not outlive the test.
	done := make(chan struct{})
	defer close(done)

	// Accept the connection but don't respond
	go func() {
		conn, err := listener.Accept()
		if err != nil {
			// Not logged through t: this goroutine may still be
			// running after the test has completed.
			return
		}
		defer conn.Close()
		// Keep the connection open without sending any data to simulate a timeout
		<-done
	}()

	// Construct the URL of our test server
	address := listener.Addr().String()
	testURL := "gopher://" + address + "/testpath"

	// Save original config values and restore them on every exit path
	originalTimeout := config.CONFIG.ResponseTimeout
	originalMaxSize := config.CONFIG.MaxResponseSize
	defer func() {
		config.CONFIG.ResponseTimeout = originalTimeout
		config.CONFIG.MaxResponseSize = originalMaxSize
	}()

	// Set test config values
	config.CONFIG.ResponseTimeout = 1    // Set a very short timeout for this test
	config.CONFIG.MaxResponseSize = 1024 // Just for consistency, we won't reach this

	// Test the function
	_, err = connectAndGetData(testURL)

	// Check if the error is due to timeout
	switch {
	case err == nil:
		t.Error("Expected an error due to timeout, but got no error")
	case !errors.IsHostError(err):
		t.Errorf("Expected a HostError, but got: %v", err)
	default:
		// Only the error type is checked, not the exact message.
		t.Logf("Successfully timed out: %v", err)
	}
}

View File

@@ -11,44 +11,39 @@ var hostPool = HostPool{hostnames: make(map[string]struct{})} //nolint:gocheckno
type HostPool struct { type HostPool struct {
hostnames map[string]struct{} hostnames map[string]struct{}
Lock sync.RWMutex lock sync.RWMutex
} }
func (p *HostPool) Add(key string) { //func (p *HostPool) add(key string) {
p.Lock.Lock() // p.lock.Lock()
defer p.Lock.Unlock() // defer p.lock.Unlock()
p.hostnames[key] = struct{}{} // p.hostnames[key] = struct{}{}
} //}
//
//func (p *HostPool) has(key string) bool {
// p.lock.RLock()
// defer p.lock.RUnlock()
// _, ok := p.hostnames[key]
// return ok
//}
func (p *HostPool) Get(key string) bool { func RemoveHostFromPool(key string) {
p.Lock.RLock() hostPool.lock.Lock()
defer p.Lock.RUnlock() defer hostPool.lock.Unlock()
_, ok := p.hostnames[key] delete(hostPool.hostnames, key)
return ok
}
func (p *HostPool) Delete(key string) {
p.Lock.Lock()
defer p.Lock.Unlock()
delete(p.hostnames, key)
} }
func AddHostToHostPool(key string) { func AddHostToHostPool(key string) {
for { for {
// Sleep until the host doesn't exist in pool, hostPool.lock.Lock()
// then add it. _, exists := hostPool.hostnames[key]
if hostPool.Get(key) { if !exists {
time.Sleep(1 * time.Second) // Avoid flood-retrying hostPool.hostnames[key] = struct{}{}
logging.LogInfo("Waiting to add %s to pool...", key) hostPool.lock.Unlock()
} else {
hostPool.Add(key)
return return
} }
} hostPool.lock.Unlock()
} time.Sleep(1 * time.Second)
logging.LogInfo("Waiting to add %s to pool...", key)
func RemoveHostFromHostPool(key string) {
if hostPool.Get(key) {
hostPool.Delete(key)
} }
} }

65
main.go
View File

@@ -1,13 +1,16 @@
package main package main
import ( import (
main2 "gemini-grc/db" "fmt"
"os" "os"
"os/signal" "os/signal"
"syscall" "syscall"
"gemini-grc/common"
"gemini-grc/common/blackList"
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/gemini" "gemini-grc/db"
"gemini-grc/errors"
"gemini-grc/logging" "gemini-grc/logging"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
"github.com/rs/zerolog" "github.com/rs/zerolog"
@@ -19,39 +22,61 @@ func main() {
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
zerolog.SetGlobalLevel(config.CONFIG.LogLevel) zerolog.SetGlobalLevel(config.CONFIG.LogLevel)
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"}) zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
if err := runApp(); err != nil { err := runApp()
logging.LogError("Application error: %w", err) if err != nil {
var asErr *errors.Error
if errors.As(err, &asErr) {
logging.LogError("Unexpected error: %v", err)
_, _ = fmt.Fprintf(os.Stderr, "Unexpected error: %v", err)
} else {
logging.LogError("Unexpected error: %v", err)
}
os.Exit(1) os.Exit(1)
} }
} }
func runApp() error { func runApp() (err error) {
logging.LogInfo("Starting up. Press Ctrl+C to exit") logging.LogInfo("gemcrawl %s starting up. Press Ctrl+C to exit", common.VERSION)
signals := make(chan os.Signal, 1) signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM) signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
db := main2.ConnectToDB() _db, err := db.ConnectToDB()
if err != nil {
return err
}
defer func(db *sqlx.DB) { defer func(db *sqlx.DB) {
err := db.Close() _ = db.Close()
if err != nil { }(_db)
// TODO properly log & hangle error
panic(err)
}
}(db)
gemini.LoadBlacklist() err = blackList.LoadBlacklist()
if err != nil {
return err
}
common.StatusChan = make(chan common.WorkerStatus, config.CONFIG.NumOfWorkers)
common.ErrorsChan = make(chan error, config.CONFIG.NumOfWorkers)
// If there's an argument, visit this // If there's an argument, visit this
// URL only and don't spawn other workers // URL only and don't spawn other workers
if len(os.Args) > 1 { if len(os.Args) > 1 {
url := os.Args[1] url := os.Args[1]
go gemini.RunWorkerWithTx(0, db, &url) err = common.CrawlOneURL(_db, &url)
} else { return err
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
} }
<-signals go common.SpawnWorkers(config.CONFIG.NumOfWorkers, _db)
logging.LogWarn("Received SIGINT or SIGTERM signal, exiting")
return nil for {
select {
case <-signals:
logging.LogWarn("Received SIGINT or SIGTERM signal, exiting")
return nil
case err := <-common.ErrorsChan:
if errors.IsFatal(err) {
return err
}
logging.LogError("%s", fmt.Sprintf("%v", err))
}
}
} }

View File

@@ -5,11 +5,12 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"math/big" "math/big"
"regexp"
"runtime/debug" "runtime/debug"
) )
func PrintStackAndPanic(err error) { func PrintStackAndPanic(err error) {
fmt.Printf("Error %s Stack trace:\n%s", err, debug.Stack()) fmt.Printf("PANIC Error %s Stack trace:\n%s", err, debug.Stack())
panic("PANIC") panic("PANIC")
} }
@@ -34,3 +35,10 @@ func PrettyJson(data string) string {
marshalled, _ := json.MarshalIndent(data, "", " ") marshalled, _ := json.MarshalIndent(data, "", " ")
return fmt.Sprintf("%s\n", marshalled) return fmt.Sprintf("%s\n", marshalled)
} }
// GetLinesMatchingRegex returns all lines that match given regex
func GetLinesMatchingRegex(input string, pattern string) []string {
re := regexp.MustCompile(pattern)
matches := re.FindAllString(input, -1)
return matches
}