Better error handling, many fixes all around

This commit is contained in:
2024-12-09 19:53:15 +02:00
parent b52d4f6532
commit 7a36614232
15 changed files with 520 additions and 233 deletions

2
.gitignore vendored
View File

@@ -1,6 +1,8 @@
.idea
.goroot
**/.#*
**/*~
/.go
/cmd
/db/initdb.sql
/db/*sh

View File

@@ -5,13 +5,13 @@ import (
"fmt"
)
type ErrGeminiStatusCode struct {
type GeminiError struct {
Msg string
Code int
Header string
}
func (e *ErrGeminiStatusCode) Error() string {
func (e *GeminiError) Error() string {
return fmt.Sprintf("%s: %s", e.Msg, e.Header)
}
@@ -31,7 +31,7 @@ func NewErrGeminiStatusCode(code int, header string) error {
default:
msg = "unexpected status code"
}
return &ErrGeminiStatusCode{
return &GeminiError{
Msg: msg,
Code: code,
Header: header,
@@ -39,7 +39,6 @@ func NewErrGeminiStatusCode(code int, header string) error {
}
var (
ErrGemini = errors.New("gemini general error")
ErrGeminiRobotsParse = errors.New("gemini robots.txt parse error")
ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
ErrGeminiResponseHeader = errors.New("gemini response header error")
@@ -64,8 +63,10 @@ var (
// we would lose ability to check wrapped
// errors via errors.Is().
var KnownErrors = []error{
ErrGemini,
var errGemini *GeminiError
var knownErrors = []error{ //nolint:gochecknoglobals
errGemini,
ErrGeminiLinkLineParse,
ErrGeminiRobotsParse,
ErrGeminiRobotsDisallowed,
@@ -87,14 +88,14 @@ var KnownErrors = []error{
}
func IsKnownError(err error) bool {
var errGeminiStatusCode *ErrGeminiStatusCode
if errors.As(err, &errGeminiStatusCode) {
return true
}
for _, known := range KnownErrors {
for _, known := range knownErrors {
if errors.Is(err, known) {
return true
}
}
// Check for wrapped errors as well
if errors.As(err, new(*GeminiError)) {
return true
}
return false
}

24
gemini/errors_test.go Normal file
View File

@@ -0,0 +1,24 @@
package gemini
import (
"errors"
"fmt"
"testing"
)
func TestErrGemini(t *testing.T) {
t.Parallel()
err := NewErrGeminiStatusCode(50, "50 server error")
if !errors.As(err, new(*GeminiError)) {
t.Errorf("TestErrGemini fail")
}
}
func TestErrGeminiWrapped(t *testing.T) {
t.Parallel()
err := NewErrGeminiStatusCode(50, "50 server error")
errWrapped := fmt.Errorf("%w wrapped", err)
if !errors.As(errWrapped, new(*GeminiError)) {
t.Errorf("TestErrGeminiWrapped fail")
}
}

View File

@@ -78,7 +78,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
finalPath, err := calcFilePath(parentPath, urlPath)
if err != nil {
logging.LogError("Error saving %s: %w", s.URL, err)
logging.LogError("GeminiError saving %s: %w", s.URL, err)
return
}
// Ensure the directory exists
@@ -93,7 +93,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
}
if err != nil {
logging.LogError("Error saving %s: %w", s.URL.Full, err)
logging.LogError("GeminiError saving %s: %w", s.URL.Full, err)
}
close(done)
}

View File

@@ -9,14 +9,16 @@ import (
"gemini-grc/logging"
)
func ProcessGemini(snapshot *Snapshot) *Snapshot {
func GetPageLinks(currentURL URL, gemtext string) LinkList {
// Grab link lines
linkLines := ExtractLinkLines(snapshot.GemText.String)
logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines))
linkLines := ExtractLinkLines(gemtext)
if len(linkLines) == 0 {
return nil
}
var linkURLs LinkList
// Normalize URLs in links, and store them in snapshot
for _, line := range linkLines {
normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String())
normalizedLink, descr, err := NormalizeLink(line, currentURL.String())
if err != nil {
logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
continue
@@ -26,13 +28,10 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot {
logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
continue
}
if snapshot.Links == nil {
snapshot.Links = &LinkList{*geminiUrl}
} else {
*snapshot.Links = append(*snapshot.Links, *geminiUrl)
}
logging.LogDebug(geminiUrl.String())
linkURLs = append(linkURLs, *geminiUrl)
}
return snapshot
return linkURLs
}
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
@@ -124,3 +123,22 @@ func ParseFirstTwoDigits(input string) (int, error) {
return snapshot, nil
}
// extractRedirectTarget returns the redirection
// URL by parsing the header (or error message)
func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
// \d+ - matches one or more digits
// \s+ - matches one or more whitespace
// ([^\r]+) - captures everything until it hits a \r (or end of string)
pattern := `\d+\s+([^\r]+)`
re := regexp.MustCompile(pattern)
matches := re.FindStringSubmatch(input)
if len(matches) < 2 {
return nil, fmt.Errorf("%w: Cannot find redirect target from header %s", ErrGeminiResponseHeader, input)
}
newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
if err != nil {
return nil, fmt.Errorf("%w: Cannot find redirect target from header: %w", ErrGeminiResponseHeader, err)
}
return newURL, nil
}

47
gemini/gemini_test.go Normal file
View File

@@ -0,0 +1,47 @@
package gemini
import (
"fmt"
"testing"
)
func TestExtractRedirectTargetFullURL(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://smol.gr", "")
input := "redirect: 31 gemini://target.gr"
result, err := extractRedirectTarget(*currentURL, input)
if err != nil || (result.String() != "gemini://target.gr:1965") {
t.Errorf("fail: %s", result)
}
}
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://smol.gr", "")
input := "redirect: 31 /a/b"
result, err := extractRedirectTarget(*currentURL, input)
if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
t.Errorf("fail: %s", result)
}
}
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://nox.im:1965", "")
input := "redirect: 31 ./"
result, err := extractRedirectTarget(*currentURL, input)
if err != nil || (result.String() != "gemini://nox.im:1965/") {
t.Errorf("fail: %s", result)
}
}
func TestExtractRedirectTargetWrong(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://smol.gr", "")
input := "redirect: 31 fsdsdf"
result, err := extractRedirectTarget(*currentURL, input)
fmt.Println(err)
if result != nil || err == nil {
t.Errorf("fail: result should be nil, err is %s", err)
}
}

View File

@@ -3,8 +3,11 @@ package gemini
import (
"database/sql/driver"
"fmt"
"gemini-grc/logging"
"net/url"
"path"
"strconv"
"strings"
)
type URL struct {
@@ -38,7 +41,13 @@ func (u URL) String() string {
return u.Full
}
// Value implements the driver.Valuer interface
func (u URL) StringNoDefaultPort() string {
if u.Port == 1965 {
return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
}
return u.Full
}
func (u URL) Value() (driver.Value, error) {
if u.Full == "" {
return nil, nil
@@ -65,3 +74,24 @@ func ParseURL(input string, descr string) (*URL, error) {
full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, path)
return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: full}, nil
}
func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
logging.LogDebug("Calculating redirect URL. Current %s header string %s", currentURL, input)
// If URL is absolute, return just it
if strings.Contains(input, "://") {
return ParseURL(input, "")
}
// input is a path. Clean it and construct
// new path
var newPath string
// Handle weird cases found in the wild
if strings.HasPrefix(input, "/") {
newPath = path.Clean(input)
} else if input == "./" || input == "." {
newPath = path.Join(currentURL.Path, "/")
} else {
newPath = path.Join(currentURL.Path, path.Clean(input))
}
strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath)
return ParseURL(strURL, "")
}

103
gemini/gemini_url_test.go Normal file
View File

@@ -0,0 +1,103 @@
package gemini
import (
"reflect"
"testing"
)
func TestParseURL(t *testing.T) {
t.Parallel()
input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
parsed, err := ParseURL(input, "")
value, _ := parsed.Value()
if err != nil || !(value == "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162") {
t.Errorf("fail: %s", parsed)
}
}
func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
t.Parallel()
currentURL := URL{
Protocol: "gemini",
Hostname: "smol.gr",
Port: 1965,
Path: "/a/b",
Descr: "Nothing",
Full: "gemini://smol.gr:1965/a/b",
}
input := "gemini://a.b/c"
output, err := DeriveAbsoluteURL(currentURL, input)
if err != nil {
t.Errorf("fail: %v", err)
}
expected := &URL{
Protocol: "gemini",
Hostname: "a.b",
Port: 1965,
Path: "/c",
Descr: "",
Full: "gemini://a.b:1965/c",
}
pass := reflect.DeepEqual(output, expected)
if !pass {
t.Errorf("fail: %#v != %#v", output, expected)
}
}
func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
t.Parallel()
currentURL := URL{
Protocol: "gemini",
Hostname: "smol.gr",
Port: 1965,
Path: "/a/b",
Descr: "Nothing",
Full: "gemini://smol.gr:1965/a/b",
}
input := "/c"
output, err := DeriveAbsoluteURL(currentURL, input)
if err != nil {
t.Errorf("fail: %v", err)
}
expected := &URL{
Protocol: "gemini",
Hostname: "smol.gr",
Port: 1965,
Path: "/c",
Descr: "",
Full: "gemini://smol.gr:1965/c",
}
pass := reflect.DeepEqual(output, expected)
if !pass {
t.Errorf("fail: %#v != %#v", output, expected)
}
}
func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
t.Parallel()
currentURL := URL{
Protocol: "gemini",
Hostname: "smol.gr",
Port: 1965,
Path: "/a/b",
Descr: "Nothing",
Full: "gemini://smol.gr:1965/a/b",
}
input := "c/d"
output, err := DeriveAbsoluteURL(currentURL, input)
if err != nil {
t.Errorf("fail: %v", err)
}
expected := &URL{
Protocol: "gemini",
Hostname: "smol.gr",
Port: 1965,
Path: "/a/b/c/d",
Descr: "",
Full: "gemini://smol.gr:1965/a/b/c/d",
}
pass := reflect.DeepEqual(output, expected)
if !pass {
t.Errorf("fail: %#v != %#v", output, expected)
}
}

View File

@@ -4,15 +4,18 @@ import (
"crypto/tls"
"errors"
"fmt"
"gemini-grc/logging"
"io"
"net"
gourl "net/url"
"regexp"
"slices"
"strconv"
"strings"
"time"
"gemini-grc/config"
"github.com/guregu/null/v5"
)
@@ -55,8 +58,7 @@ func ConnectAndGetData(url string) ([]byte, error) {
host := fmt.Sprintf("%s:%s", hostname, port)
// Establish the underlying TCP connection.
dialer := &net.Dialer{
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
KeepAlive: 10 * time.Second,
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
}
conn, err := dialer.Dial("tcp", host)
if err != nil {
@@ -95,7 +97,11 @@ func ConnectAndGetData(url string) ([]byte, error) {
var data []byte
// Send Gemini request to trigger server response.
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url)))
// Fix for stupid server bug:
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
// when the port is 1965 and is still specified explicitely in the URL.
_url, _ := ParseURL(url, "")
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
}
@@ -111,9 +117,8 @@ func ConnectAndGetData(url string) ([]byte, error) {
if err != nil {
if errors.Is(err, io.EOF) {
break
} else {
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
}
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
}
}
return data, nil
@@ -121,7 +126,19 @@ func ConnectAndGetData(url string) ([]byte, error) {
// Visit given URL, using the Gemini protocol.
// Mutates given Snapshot with the data.
func Visit(s *Snapshot) error {
// In case of error, we store the error string
// inside snapshot and return the error.
func Visit(s *Snapshot) (err error) {
// Don't forget to also store error
// response code (if we have one)
defer func() {
if err != nil {
s.Error = null.StringFrom(err.Error())
if errors.As(err, new(*GeminiError)) {
s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code))
}
}
}()
data, err := ConnectAndGetData(s.URL.String())
if err != nil {
return err
@@ -130,6 +147,9 @@ func Visit(s *Snapshot) error {
if err != nil {
return err
}
//marshalled, _ := json.MarshalIndent(pageData, "", " ")
//fmt.Printf("%s\n", marshalled)
s.Header = null.StringFrom(pageData.ResponseHeader)
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
s.MimeType = null.StringFrom(pageData.MimeType)
s.Lang = null.StringFrom(pageData.Lang)
@@ -142,24 +162,21 @@ func Visit(s *Snapshot) error {
return nil
}
// Update given snapshot with the
// Gemini header data: response code,
// mime type and lang (optional)
// processData returne results from
// parsing Gemini header data:
// Code, mime type and lang (optional)
// Returns error if header was invalid
func processData(data []byte) (*PageData, error) {
header, body, err := getHeadersAndData(data)
if err != nil {
return nil, err
}
code, mimeType, lang := getMimeTypeAndLang(header)
var geminiError error
logging.LogDebug("Header: %s", strings.TrimSpace(header))
if code != 20 {
geminiError = NewErrGeminiStatusCode(code, header)
return nil, NewErrGeminiStatusCode(code, header)
}
fmt.Printf("%v\n", header)
if geminiError != nil {
return nil, geminiError
}
pageData := PageData{
ResponseCode: code,
ResponseHeader: header,
@@ -201,11 +218,11 @@ func getHeadersAndData(data []byte) (string, []byte, error) {
// `20 text/gemini` (code, mimetype)
// `31 gemini://redirected.to/other/site` (code)
func getMimeTypeAndLang(headers string) (int, string, string) {
// Regex that parses code, mimetype & lang
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`)
// Regex that parses code, mimetype & optional charset/lang parameters
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`)
matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 {
// Try to get code at least.
// Try to get code at least
re := regexp.MustCompile(`^(\d+)\s+`)
matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 {
@@ -222,6 +239,6 @@ func getMimeTypeAndLang(headers string) (int, string, string) {
return 0, "", ""
}
mimeType := matches[2]
lang := matches[4]
return code, mimeType, lang
param := matches[3] // This will capture either charset or lang value
return code, mimeType, param
}

View File

@@ -21,7 +21,31 @@ func TestGetMimeTypeAndLang11(t *testing.T) {
}
}
func TestGetMimeTypeAndLang12(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8")
if code != 20 || mimeType != "text/plain" || lang != "utf-8" {
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
}
}
func TestGetMimeTypeAndLang13(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8")
if code != 20 || mimeType != "text/gemini" || lang != "utf-8" {
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
}
}
func TestGetTypeAndLang2(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini charset=en")
if code != 20 || mimeType != "text/gemini" || lang != "en" {
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
}
}
func TestGetTypeAndLang21(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
if code != 20 || mimeType != "text/gemini" || lang != "en" {

View File

@@ -1,10 +1,13 @@
package gemini
import (
"encoding/json"
"fmt"
"gemini-grc/config"
"os"
"gemini-grc/logging"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx"
)
@@ -33,25 +36,40 @@ func ConnectToDB() *sqlx.DB {
return db
}
func SaveSnapshotToDBIfNotExists(tx *sqlx.Tx, s *Snapshot) error {
func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error {
marshalled, err := json.MarshalIndent(s, "", " ")
if err != nil {
panic(fmt.Sprintf("JSON serialization error for %v", s))
}
if config.CONFIG.DryRun {
logging.LogDebug("Would insert (if new) snapshot %s", marshalled)
return nil
}
query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (url) DO NOTHING
`
_, err := tx.NamedExec(query, s)
_, err = tx.NamedExec(query, s)
if err != nil {
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err)
}
return nil
}
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
fmt.Printf("%+v", s)
func UpsertSnapshot(id int, tx *sqlx.Tx, s *Snapshot) error {
marshalled, err := json.MarshalIndent(s, "", " ")
if err != nil {
panic(fmt.Sprintf("JSON serialization error for %v", s))
}
if config.CONFIG.DryRun {
logging.LogDebug("[%d] Would upsert snapshot %s", id, marshalled)
return nil
}
query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (url) DO UPDATE SET
url = EXCLUDED.url,
host = EXCLUDED.host,
@@ -62,23 +80,29 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
links = EXCLUDED.links,
lang = EXCLUDED.lang,
response_code = EXCLUDED.response_code,
error = EXCLUDED.error
`
_, err := tx.NamedExec(query, s)
error = EXCLUDED.error`
_, err = tx.NamedExec(query, s)
//if err != nil {
// logging.LogError("[%s] GeminiError upserting snapshot: %w", s.URL, err)
// panic("This shouldn't happen")
//}
if err != nil {
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
return fmt.Errorf("[%s] GeminiError upserting snapshot: %w", s.URL, err)
}
return nil
}
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
if config.CONFIG.DryRun {
return nil
}
// Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe
const batchSize = 5000
query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (url) DO NOTHING
`
@@ -92,7 +116,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
_, err := tx.NamedExec(query, batch)
if err != nil {
logging.LogError("Error batch inserting snapshots: %w", err)
logging.LogError("GeminiError batch inserting snapshots: %w", err)
return fmt.Errorf("DB error: %w", err)
}
}
@@ -101,14 +125,17 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
}
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
if config.CONFIG.DryRun {
return nil
}
query := `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (url) DO NOTHING
`
_, err := tx.NamedExec(query, snapshots)
if err != nil {
logging.LogError("Error batch inserting snapshots: %w", err)
logging.LogError("GeminiError batch inserting snapshots: %w", err)
return fmt.Errorf("DB error: %w", err)
}
return nil

View File

@@ -57,8 +57,8 @@ func populateBlacklist(key string) (entries []string) {
// RobotMatch checks if the snapshot URL matches
// a robots.txt allow rule.
func RobotMatch(url URL) bool {
logging.LogDebug("Checking robots.txt cache for %s", url.String())
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
logging.LogDebug("Checking robots.txt cache for %s", key)
var disallowedURLs []string
cacheEntries, ok := RobotsCache.Load(key)
if !ok {

View File

@@ -27,46 +27,17 @@ func (l *LinkList) Scan(value interface{}) error {
}
type Snapshot struct {
ID int `db:"id" json:"id,omitempty"`
UID string `db:"uid" json:"uid,omitempty"`
URL URL `db:"url" json:"url,omitempty"`
Host string `db:"host" json:"host,omitempty"`
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
Links *LinkList `db:"links" json:"links,omitempty"`
Lang null.String `db:"lang" json:"lang,omitempty"`
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
ID int `db:"id" json:"id,omitempty"`
//UID string `db:"uid" json:"uid,omitempty"`
URL URL `db:"url" json:"url,omitempty"`
Host string `db:"host" json:"host,omitempty"`
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
Header null.String `db:"header" json:"header,omitempty"` // Response header.
Links null.Value[LinkList] `db:"links" json:"links,omitempty"`
Lang null.String `db:"lang" json:"lang,omitempty"`
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
}
//func SnapshotToJSON(g Snapshot) string {
// // Serialize the Person struct to JSON
// jsonData, err := json.MarshalIndent(g, "", "\t")
// if err != nil {
// logging.LogError("Error serializing to JSON: %w", err)
// }
// return string(jsonData)
//}
//
//func SnapshotFromJSON(input string) Snapshot {
// var snapshot Snapshot
// err := json.Unmarshal([]byte(input), &snapshot)
// if err != nil {
// logging.LogError("Error deserializing from JSON: %w", err)
// }
// return snapshot
//}
//
//func ShouldPersistSnapshot(result *Snapshot) bool {
// if !result.MimeType.Valid {
// return false
// }
// if result.MimeType.String == "text/gemini" ||
// strings.HasPrefix(result.MimeType.String, "image/") ||
// strings.HasPrefix(result.MimeType.String, "text/") {
// return true
// }
// return false
//}

View File

@@ -3,14 +3,13 @@ package gemini
import (
"errors"
"fmt"
"regexp"
"strings"
"time"
"gemini-grc/config"
"gemini-grc/logging"
"gemini-grc/uid"
"gemini-grc/util"
"github.com/guregu/null/v5"
"github.com/jmoiron/sqlx"
)
@@ -27,12 +26,13 @@ func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
}
func RunWorker(id int, db *sqlx.DB, url *string) {
// Start the DB transaction
// Each worker runs within a DB transaction.
tx, err := db.Beginx()
if err != nil {
logging.LogError("Failed to begin transaction: %w", err)
}
// Commit/rollback at the end
defer func() {
err = tx.Commit()
if err != nil {
@@ -46,73 +46,57 @@ func RunWorker(id int, db *sqlx.DB, url *string) {
var snapshots []Snapshot
// If not given a specific URL,
// get some random ones to visit from DB.
if url == nil {
snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
} else {
snapshots, err = GetSnapshotFromURL(tx, *url)
if len(snapshots) == 0 {
snapshotURL, err := ParseURL(*url, "")
if err != nil {
panic("Invalid URL: " + *url)
}
snapshots = []Snapshot{{
UID: uid.UID(),
URL: *snapshotURL,
Host: snapshotURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}}
if err != nil {
logging.LogError("[%d] GeminiError retrieving snapshot: %w", id, err)
panic("This should never happen")
} else if len(snapshots) == 0 {
logging.LogInfo("[%d] No snapshots to visit.", id)
time.Sleep(1 * time.Minute)
return
}
} else {
snapshotURL, err := ParseURL(*url, "")
if err != nil {
logging.LogError("Invalid URL given: " + *url)
return
}
snapshots = []Snapshot{{
//UID: uid.UID(),
URL: *snapshotURL,
Host: snapshotURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}}
}
if err != nil {
logging.LogError("[%d] Error retrieving snapshot: %w", id, err)
time.Sleep(10 * time.Second)
return
} else if len(snapshots) == 0 {
logging.LogInfo("[%d] No snapshots to visit.", id)
time.Sleep(1 * time.Minute)
return
}
// Start visiting URLs.
total := len(snapshots)
for i, s := range snapshots {
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
// We differentiate between errors:
// Unexpected errors are the ones returned from the following function.
// If an error is unexpected (which should never happen) we panic.
// Expected errors are stored as strings within the snapshot,
// so that they can also be stored in DB.
err = workOnSnapshot(id, tx, &s)
if err != nil {
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL.String(), err)
logging.LogError("[%d] [%s] Unexpected GeminiError %w", id, s.URL.String(), err)
util.PrintStackAndPanic(err)
}
if s.Error.Valid {
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL.String(), s.Error.String)
logging.LogWarn("[%d] Error: %v", id, s.Error.String)
}
logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
}
logging.LogInfo("[%d] Worker done.", id)
}
func handleRedirection(tx *sqlx.Tx, s *Snapshot) error {
re := regexp.MustCompile(`gemini://\S+`)
matches := re.FindStringSubmatch(s.Error.ValueOrZero())
if len(matches) == 1 {
newURL := matches[0]
logging.LogDebug("Page redirects to %s", newURL)
_url, err := ParseURL(newURL, "")
// Insert fresh snapshot with new URL
if err == nil {
snapshot := &Snapshot{
UID: uid.UID(),
URL: *_url,
Host: _url.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}
err := SaveSnapshotToDBIfNotExists(tx, snapshot)
if err != nil {
return err
}
}
}
return nil
}
// workOnSnapshot visits a URL and stores the result.
// errors should be returned only if they are unexpected.
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
if IsBlacklisted(s.URL) {
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
@@ -123,31 +107,26 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
// add it as an error so next time it won't be
// crawled.
if RobotMatch(s.URL) {
s.Error = null.StringFrom("robots.txt disallow match")
err = SaveSnapshotToDB(tx, s)
s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
err = UpsertSnapshot(id, tx, s)
if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err)
return fmt.Errorf("[%d] %w", id, err)
}
return nil
}
// Resolve IP address via DNS
IPs, err := getHostIPAddresses(s.Host)
if err != nil {
s.Error = null.StringFrom("DNS Resolve error")
err = SaveSnapshotToDB(tx, s)
s.Error = null.StringFrom(err.Error())
err = UpsertSnapshot(id, tx, s)
if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err)
return fmt.Errorf("[%d] %w", id, err)
}
return nil
}
defer func() {
time.Sleep(5 * time.Second)
RemoveIPsFromPool(IPs)
}()
// If the host's ip is in the connections pool,
// stop and add the url in the queue later.
// If the host's ip is in the connections pool we stop
IpPool.Lock.RLock()
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
for _, ip := range IPs {
@@ -155,7 +134,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
if ok {
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
IpPool.Lock.RUnlock()
time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
time.Sleep(1 * time.Second) // Avoid flood-retrying
return nil
}
}
@@ -163,84 +142,115 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
AddIPsToPool(IPs)
// After finishing, remove the host IPs from
// the connections pool, with a small delay
// to avoid potentially hitting the same IP quickly.
defer func() {
time.Sleep(5 * time.Second)
RemoveIPsFromPool(IPs)
}()
url := s.URL.String()
logging.LogDebug("[%d] Dialing %s", id, url)
err = Visit(s)
if err != nil {
if !IsKnownError(err) {
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
if config.CONFIG.PanicOnUnexpectedError {
util.PrintStackAndPanic(err)
}
} else {
s.Error = null.StringFrom(err.Error())
return err
}
if errors.As(err, new(*ErrGeminiStatusCode)) {
err = handleRedirection(tx, s)
// Check if error is redirection, and handle it
s.Error = null.StringFrom(err.Error())
if errors.As(err, new(*GeminiError)) &&
err.(*GeminiError).Msg == "redirect" {
err = handleRedirection(id, tx, s)
if err != nil {
return err
}
}
}
logging.LogDebug("[%d] Finished dialing.", id)
logging.LogInfo("[%d] Done, response code %d.", id, s.ResponseCode.ValueOrZero())
// If this is a gemini page, parse possible links inside
if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
logging.LogDebug("[%d] [%s] Processing", id, url)
s = ProcessGemini(s)
links := GetPageLinks(s.URL, s.GemText.String)
logging.LogDebug("[%d] Found %d links", id, len(links))
if len(links) > 0 {
s.Links = null.ValueFrom(links)
}
} else {
logging.LogDebug("[%d] Not looking for page links", id)
}
logging.LogDebug("[%d] Saving", id)
err = SaveSnapshotToDB(tx, s)
err = UpsertSnapshot(id, tx, s)
if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err)
return err
}
// Store links in batch
if s.Links != nil {
var batchSnapshots []*Snapshot
timestamp := null.TimeFrom(time.Now())
err = storeLinks(tx, s)
if err != nil {
return err
}
return nil
}
for _, link := range *s.Links {
if shouldPersistURL(tx, link) {
func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
if s.Links.Valid {
var batchSnapshots []*Snapshot
for _, link := range s.Links.ValueOrZero() {
if shouldPersistURL(link) {
newSnapshot := &Snapshot{
UID: uid.UID(),
//UID: uid.UID(),
URL: link,
Host: link.Hostname,
Timestamp: timestamp,
Timestamp: null.TimeFrom(time.Now()),
}
batchSnapshots = append(batchSnapshots, newSnapshot)
}
}
if len(batchSnapshots) > 0 {
logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
err = SaveLinksToDBinBatches(tx, batchSnapshots)
err := SaveLinksToDBinBatches(tx, batchSnapshots)
if err != nil {
return fmt.Errorf("[%d] DB Error: %w", id, err)
return err
}
}
}
return nil
}
// Should we save the given URL for crawling?
func shouldPersistURL(tx *sqlx.Tx, u URL) bool {
if !strings.HasPrefix(u.String(), "gemini://") {
return false
}
query := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)`
var exists bool
err := tx.Get(&exists, query, u.String())
// shouldPersistURL returns true if we
// should save the URL in the DB.
// Only gemini:// urls are saved.
func shouldPersistURL(u URL) bool {
return strings.HasPrefix(u.String(), "gemini://")
}
func handleRedirection(id int, tx *sqlx.Tx, s *Snapshot) error {
newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
if err != nil {
fmt.Println("Error executing query:", err)
return false
return err
}
return !exists
logging.LogDebug("[%d] Page redirects to %s", id, newURL)
// Insert fresh snapshot with new URL
snapshot := &Snapshot{
//UID: uid.UID(),
URL: *newURL,
Host: newURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}
logging.LogDebug("[%d] Saving empty snapshot for %s", id, snapshot.URL.String())
err = SaveSnapshotIfNew(tx, snapshot)
if err != nil {
return err
}
return nil
}
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
// Old, unoptimized query
//
// query := `
// query := `
// SELECT DISTINCT ON (host) *
// FROM snapshots
// WHERE response_code IS NULL
@@ -249,20 +259,28 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
// LIMIT $1
// `
query := `
WITH RankedSnapshots AS (
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
links, lang, response_code, error,
ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
)
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
links, lang, response_code, error
FROM RankedSnapshots
WHERE rn = 1
LIMIT $1
`
SELECT *
FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
ORDER BY RANDOM()
LIMIT $1
`
//query := `
// WITH RankedSnapshots AS (
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
// links, lang, response_code, error,
// ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
// FROM snapshots
// WHERE response_code IS NULL
// AND error IS NULL
// )
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
// links, lang, response_code, error
// FROM RankedSnapshots
// WHERE rn = 1
// LIMIT $1
//`
var snapshots []Snapshot
err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
if err != nil {

21
main.go
View File

@@ -1,16 +1,16 @@
package main
import (
"os"
"os/signal"
"syscall"
"gemini-grc/config"
"gemini-grc/gemini"
"gemini-grc/http"
"gemini-grc/logging"
"github.com/jmoiron/sqlx"
"github.com/rs/zerolog"
zlog "github.com/rs/zerolog/log"
"os"
"os/signal"
"syscall"
)
func main() {
@@ -26,9 +26,10 @@ func main() {
func runApp() error {
logging.LogInfo("Starting up. Press Ctrl+C to exit")
sigs := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
server := http.CreateServer("localhost:8899")
db := gemini.ConnectToDB()
// !!! DANGER !!!
@@ -44,7 +45,8 @@ func runApp() error {
}(db)
gemini.LoadBlacklist()
// If there's an argument, assume it's a URL
// to visit and ignore database state.
if len(os.Args) > 1 {
url := os.Args[1]
go gemini.RunWorker(0, db, &url)
@@ -52,7 +54,10 @@ func runApp() error {
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
}
<-sigs
<-signals
if err := server.Close(); err != nil {
logging.LogError("GeminiError during server shutdown: %s", err)
}
logging.LogInfo("Received SIGINT or SIGTERM signal, exiting")
return nil
}