Reorganize code for more granular imports

This commit is contained in:
2025-02-26 10:34:25 +02:00
parent 8350e106d6
commit ca008b0796
23 changed files with 1549 additions and 1232 deletions

View File

@@ -2,13 +2,13 @@ package gemini
import (
"fmt"
"gemini-grc/common"
"net/url"
"os"
"path"
"path/filepath"
"strings"
"gemini-grc/common/snapshot"
"gemini-grc/logging"
)
@@ -64,7 +64,7 @@ func calcFilePath(rootPath, urlPath string) (string, error) {
return finalPath, nil
}
func SaveToFile(rootPath string, s *common.Snapshot, done chan struct{}) {
func SaveToFile(rootPath string, s *snapshot.Snapshot, done chan struct{}) {
parentPath := path.Join(rootPath, s.URL.Hostname)
urlPath := s.URL.Path
// If path is empty, add `index.gmi` as the file to save
@@ -105,7 +105,7 @@ func ReadLines(path string) []string {
panic(fmt.Sprintf("Failed to read file: %s", err))
}
lines := strings.Split(string(data), "\n")
// Remove last line if empty
// remove last line if empty
// (happens when file ends with '\n')
if lines[len(lines)-1] == "" {
lines = lines[:len(lines)-1]

View File

@@ -1,49 +0,0 @@
package gemini
import (
"fmt"
"regexp"
"strconv"
"gemini-grc/common"
)
// firstTwoDigitsRe matches one or two leading digits. It is compiled once
// at package scope instead of on every call.
var firstTwoDigitsRe = regexp.MustCompile(`^(\d{1,2})`)

// ParseFirstTwoDigits takes a string and returns its first one or two
// leading digits as an int.
//
// It returns an error wrapping common.ErrGeminiResponseHeader when the
// input does not start with a digit, and an error wrapping
// common.ErrTextParse when the matched digits cannot be converted.
func ParseFirstTwoDigits(input string) (int, error) {
	matches := firstTwoDigitsRe.FindStringSubmatch(input)
	if len(matches) == 0 {
		return 0, fmt.Errorf("%w", common.ErrGeminiResponseHeader)
	}
	// matches[1] is the captured digit group.
	code, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0, fmt.Errorf("%w: %w", common.ErrTextParse, err)
	}
	return code, nil
}
// redirectTargetRe locates the redirect target inside a header (or error
// message):
//
//	\d+      - one or more digits
//	\s+      - one or more whitespace characters
//	([^\r]+) - captures everything until a \r (or end of string)
//
// It is compiled once at package scope instead of on every call.
var redirectTargetRe = regexp.MustCompile(`\d+\s+([^\r]+)`)

// extractRedirectTarget returns the redirection URL found in input, which is
// a response header (or an error message containing one). The captured
// target is resolved against currentURL via common.DeriveAbsoluteURL.
//
// It returns an error wrapping common.ErrGeminiRedirect when no target can
// be found in input or when the target cannot be turned into a URL.
func extractRedirectTarget(currentURL common.URL, input string) (*common.URL, error) {
	matches := redirectTargetRe.FindStringSubmatch(input)
	if len(matches) < 2 {
		return nil, fmt.Errorf("%w: %s", common.ErrGeminiRedirect, input)
	}
	newURL, err := common.DeriveAbsoluteURL(currentURL, matches[1])
	if err != nil {
		return nil, fmt.Errorf("%w: %w: %s", common.ErrGeminiRedirect, err, input)
	}
	return newURL, nil
}

View File

@@ -5,22 +5,24 @@ import (
"net/url"
"regexp"
"gemini-grc/common"
"gemini-grc/common/linkList"
url2 "gemini-grc/common/url"
"gemini-grc/errors"
"gemini-grc/logging"
"gemini-grc/util"
)
func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList {
func GetPageLinks(currentURL url2.URL, gemtext string) linkList.LinkList {
linkLines := util.GetLinesMatchingRegex(gemtext, `(?m)^=>[ \t]+.*`)
if len(linkLines) == 0 {
return nil
}
var linkURLs common.LinkList
var linkURLs linkList.LinkList
// Normalize URLs in links
for _, line := range linkLines {
linkUrl, err := ParseGeminiLinkLine(line, currentURL.String())
if err != nil {
logging.LogDebug("%s: %s", common.ErrGeminiLinkLineParse, err)
logging.LogDebug("error parsing gemini link line: %s", err)
continue
}
linkURLs = append(linkURLs, *linkUrl)
@@ -31,19 +33,18 @@ func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList {
// ParseGeminiLinkLine takes a single link line and the current URL,
// return the URL converted to an absolute URL
// and its description.
func ParseGeminiLinkLine(linkLine string, currentURL string) (*common.URL, error) {
func ParseGeminiLinkLine(linkLine string, currentURL string) (*url2.URL, error) {
// Check: currentURL is parseable
baseURL, err := url.Parse(currentURL)
if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
return nil, errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
}
// Extract the actual URL and the description
re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)
matches := re.FindStringSubmatch(linkLine)
if len(matches) == 0 {
// If the line doesn't match the expected format, return it unchanged
return nil, fmt.Errorf("%w could not parse gemini link %s", common.ErrGeminiLinkLineParse, linkLine)
return nil, errors.NewError(fmt.Errorf("error parsing link line: no regexp match for line %s", linkLine))
}
originalURLStr := matches[1]
@@ -51,7 +52,7 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*common.URL, error
// Check: Unescape the URL if escaped
_, err = url.QueryUnescape(originalURLStr)
if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrURLDecode, err)
return nil, errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
}
description := ""
@@ -62,8 +63,7 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*common.URL, error
// Parse the URL from the link line
parsedURL, err := url.Parse(originalURLStr)
if err != nil {
// If URL parsing fails, return an error
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
return nil, errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
}
// If link URL is relative, resolve full URL
@@ -71,17 +71,16 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*common.URL, error
parsedURL = baseURL.ResolveReference(parsedURL)
}
// Remove usual first space from URL description:
// remove usual first space from URL description:
// => URL description
// ^^^^^^^^^^^^
if len(description) > 0 && description[0] == ' ' {
description = description[1:]
}
finalURL, err := common.ParseURL(parsedURL.String(), description, true)
finalURL, err := url2.ParseURL(parsedURL.String(), description, true)
if err != nil {
// If URL parsing fails, return an error
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
return nil, errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
}
return finalURL, nil

View File

@@ -1,18 +1,18 @@
package gemini
import (
"errors"
"reflect"
"strings"
"testing"
"gemini-grc/common"
"gemini-grc/common/url"
)
type TestData struct {
currentURL string
link string
value *common.URL
error error
value *url.URL
error string
}
var data = []TestData{
@@ -20,12 +20,12 @@ var data = []TestData{
currentURL: "https://gemini.com/",
link: "https://gemini.com/",
value: nil,
error: common.ErrGeminiLinkLineParse,
error: "error parsing link line",
},
{
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> archive/ Complete Archive",
value: &common.URL{
value: &url.URL{
Protocol: "gemini",
Hostname: "gemi.dev",
Port: 1965,
@@ -33,12 +33,12 @@ var data = []TestData{
Descr: "Complete Archive",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd/archive/",
},
error: nil,
error: "",
},
{
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> /cgi-bin/xkcd.cgi?a=5&b=6 Example",
value: &common.URL{
value: &url.URL{
Protocol: "gemini",
Hostname: "gemi.dev",
Port: 1965,
@@ -46,12 +46,12 @@ var data = []TestData{
Descr: "Example",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?a=5&b=6",
},
error: nil,
error: "",
},
{
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> /cgi-bin/xkcd.cgi?1494 XKCD 1494: Insurance",
value: &common.URL{
value: &url.URL{
Protocol: "gemini",
Hostname: "gemi.dev",
Port: 1965,
@@ -59,12 +59,12 @@ var data = []TestData{
Descr: "XKCD 1494: Insurance",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494",
},
error: nil,
error: "",
},
{
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> /cgi-bin/xkcd.cgi?1494#f XKCD 1494: Insurance",
value: &common.URL{
value: &url.URL{
Protocol: "gemini",
Hostname: "gemi.dev",
Port: 1965,
@@ -72,12 +72,12 @@ var data = []TestData{
Descr: "XKCD 1494: Insurance",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494#f",
},
error: nil,
error: "",
},
{
currentURL: "gemini://gemi.dev/cgi-bin/xkcd/",
link: "=> /cgi-bin/xkcd.cgi?c=5#d XKCD 1494: Insurance",
value: &common.URL{
value: &url.URL{
Protocol: "gemini",
Hostname: "gemi.dev",
Port: 1965,
@@ -85,12 +85,12 @@ var data = []TestData{
Descr: "XKCD 1494: Insurance",
Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?c=5#d",
},
error: nil,
error: "",
},
{
currentURL: "gemini://a.b/c#d",
link: "=> /d/e#f",
value: &common.URL{
value: &url.URL{
Protocol: "gemini",
Hostname: "a.b",
Port: 1965,
@@ -98,7 +98,7 @@ var data = []TestData{
Descr: "",
Full: "gemini://a.b:1965/d/e#f",
},
error: nil,
error: "",
},
}
@@ -110,13 +110,10 @@ func Test(t *testing.T) {
if expected.value != nil {
t.Errorf("data[%d]: Expected value %v, got %v", i, nil, expected.value)
}
if !errors.Is(err, common.ErrGeminiLinkLineParse) {
if !strings.HasPrefix(err.Error(), expected.error) {
t.Errorf("data[%d]: expected error %v, got %v", i, expected.error, err)
}
} else {
if expected.error != nil {
t.Errorf("data[%d]: Expected error %v, got %v", i, nil, expected.error)
}
if !(reflect.DeepEqual(result, expected.value)) {
t.Errorf("data[%d]: expected %#v, got %#v", i, expected.value, result)
}

View File

@@ -1,69 +0,0 @@
package gemini
import (
"testing"
"gemini-grc/common"
)
// TestExtractRedirectTargetFullURL checks that an absolute redirect target
// without a path is returned with port 1965 made explicit.
func TestExtractRedirectTargetFullURL(t *testing.T) {
	t.Parallel()
	// Fail early instead of dereferencing a nil URL if the fixture is bad.
	currentURL, err := common.ParseURL("gemini://smol.gr", "")
	if err != nil {
		t.Fatalf("parsing base URL: %v", err)
	}
	input := "redirect: 31 gemini://target.gr"
	expected := "gemini://target.gr:1965"
	result, err := extractRedirectTarget(*currentURL, input)
	if err != nil {
		t.Fatalf("fail: unexpected error: %v", err)
	}
	if result.String() != expected {
		t.Errorf("fail: Expected %s got %s", expected, result)
	}
}
// TestExtractRedirectTargetFullURLSlash checks that an absolute redirect
// target ending in "/" keeps its trailing slash.
func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
	t.Parallel()
	// Fail early instead of dereferencing a nil URL if the fixture is bad.
	currentURL, err := common.ParseURL("gemini://smol.gr", "")
	if err != nil {
		t.Fatalf("parsing base URL: %v", err)
	}
	input := "redirect: 31 gemini://target.gr/"
	expected := "gemini://target.gr:1965/"
	result, err := extractRedirectTarget(*currentURL, input)
	if err != nil {
		t.Fatalf("fail: unexpected error: %v", err)
	}
	if result.String() != expected {
		t.Errorf("fail: Expected %s got %s", expected, result)
	}
}
// TestExtractRedirectTargetRelativeURL checks that an absolute-path
// redirect target is resolved against the host of the current URL.
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
	t.Parallel()
	// Fail early instead of dereferencing a nil URL if the fixture is bad.
	currentURL, err := common.ParseURL("gemini://smol.gr", "")
	if err != nil {
		t.Fatalf("parsing base URL: %v", err)
	}
	input := "redirect: 31 /a/b"
	result, err := extractRedirectTarget(*currentURL, input)
	if err != nil {
		t.Fatalf("fail: unexpected error: %v", err)
	}
	if result.String() != "gemini://smol.gr:1965/a/b" {
		t.Errorf("fail: %s", result)
	}
}
// TestExtractRedirectTargetRelativeURL2 checks that the relative target
// "./" resolves to the root of the current host.
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
	t.Parallel()
	// Fail early instead of dereferencing a nil URL if the fixture is bad.
	currentURL, err := common.ParseURL("gemini://nox.im:1965", "")
	if err != nil {
		t.Fatalf("parsing base URL: %v", err)
	}
	input := "redirect: 31 ./"
	result, err := extractRedirectTarget(*currentURL, input)
	if err != nil {
		t.Fatalf("fail: unexpected error: %v", err)
	}
	if result.String() != "gemini://nox.im:1965/" {
		t.Errorf("fail: %s", result)
	}
}
// TestExtractRedirectTargetRelativeURL3 checks that a bare file name target
// is resolved against the current URL.
func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
	t.Parallel()
	// Fail early instead of dereferencing a nil URL if the fixture is bad.
	currentURL, err := common.ParseURL("gemini://status.zvava.org:1965", "")
	if err != nil {
		t.Fatalf("parsing base URL: %v", err)
	}
	input := "redirect: 31 index.gmi"
	result, err := extractRedirectTarget(*currentURL, input)
	if err != nil {
		t.Fatalf("fail: unexpected error: %v", err)
	}
	if result.String() != "gemini://status.zvava.org:1965/index.gmi" {
		t.Errorf("fail: %s", result)
	}
}
// TestExtractRedirectTargetWrong checks that a header with a status code
// but no target yields a nil result and a non-nil error.
func TestExtractRedirectTargetWrong(t *testing.T) {
	t.Parallel()
	// Fail early instead of dereferencing a nil URL if the fixture is bad.
	currentURL, err := common.ParseURL("gemini://smol.gr", "")
	if err != nil {
		t.Fatalf("parsing base URL: %v", err)
	}
	input := "redirect: 31"
	result, err := extractRedirectTarget(*currentURL, input)
	if result != nil || err == nil {
		// %v rather than %s: err may be nil on this path.
		t.Errorf("fail: result should be nil, err is %v", err)
	}
}

View File

@@ -1,54 +0,0 @@
package gemini
import "sync"
// IpAddressPool is used to limit requests per IP address. It maps an
// IP address to its number of active connections.
//
// The zero value is ready to use: the map is allocated lazily on the first
// write. All methods take Lock, so a pool must be shared by pointer.
type IpAddressPool struct {
	IPs  map[string]int
	Lock sync.RWMutex
}

// ensureMap allocates the underlying map if needed, so that writes on a
// zero-value pool do not panic. The caller must hold the write lock.
func (p *IpAddressPool) ensureMap() {
	if p.IPs == nil {
		p.IPs = make(map[string]int)
	}
}

// Set stores value as the count for key, replacing any previous count.
func (p *IpAddressPool) Set(key string, value int) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	p.ensureMap()
	p.IPs[key] = value
}

// Get returns the count stored for key, or 0 when key is absent.
func (p *IpAddressPool) Get(key string) int {
	p.Lock.RLock()
	defer p.Lock.RUnlock()
	// Reading a missing key (or a nil map) yields the zero value, 0.
	return p.IPs[key]
}

// Delete removes key from the pool. It is a no-op when key is absent.
func (p *IpAddressPool) Delete(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	delete(p.IPs, key)
}

// Incr increases the count for key by one, starting from 0 when key is
// absent.
func (p *IpAddressPool) Incr(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	p.ensureMap()
	p.IPs[key]++
}

// Decr decreases the count for key by one and removes the entry once the
// count reaches exactly 0. It is a no-op when key is absent.
func (p *IpAddressPool) Decr(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	val, ok := p.IPs[key]
	if !ok {
		return
	}
	if val-1 == 0 {
		delete(p.IPs, key)
		return
	}
	p.IPs[key] = val - 1
}

View File

@@ -2,44 +2,78 @@ package gemini
import (
"crypto/tls"
"errors"
"fmt"
"io"
"net"
gourl "net/url"
stdurl "net/url"
"regexp"
"slices"
"strconv"
"strings"
"time"
"gemini-grc/common"
errors2 "gemini-grc/common/errors"
"gemini-grc/common/snapshot"
_url "gemini-grc/common/url"
"gemini-grc/config"
"gemini-grc/errors"
"gemini-grc/logging"
"github.com/guregu/null/v5"
)
type PageData struct {
ResponseCode int
ResponseHeader string
MimeType string
Lang string
GemText string
Data []byte
}
func getHostIPAddresses(hostname string) ([]string, error) {
addrs, err := net.LookupHost(hostname)
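// Visit fetches the given URL using the Gemini protocol
// and returns a Snapshot populated with the response.
// Host errors and Gemini errors are stored inside the
// snapshot and a nil error is returned; for any other
// error the returned snapshot is nil and the error is returned.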
func Visit(url string) (s *snapshot.Snapshot, err error) {
s, err = snapshot.SnapshotFromURL(url, true)
if err != nil {
return nil, fmt.Errorf("%w:%w", common.ErrNetworkDNS, err)
return nil, err
}
return addrs, nil
defer func() {
if err != nil {
// GeminiError and HostError should
// be stored in the snapshot. Other
// errors are returned.
if errors2.IsHostError(err) {
s.Error = null.StringFrom(err.Error())
err = nil
} else if IsGeminiError(err) {
s.Error = null.StringFrom(err.Error())
s.Header = null.StringFrom(errors.Unwrap(err).(*GeminiError).Header)
s.ResponseCode = null.IntFrom(int64(errors.Unwrap(err).(*GeminiError).Code))
err = nil
} else {
s = nil
}
}
}()
data, err := ConnectAndGetData(s.URL.String())
if err != nil {
return s, err
}
s, err = processData(*s, data)
if err != nil {
return s, err
}
if isGeminiCapsule(s) {
links := GetPageLinks(s.URL, s.GemText.String)
if len(links) > 0 {
logging.LogDebug("Found %d links", len(links))
s.Links = null.ValueFrom(links)
}
}
return s, nil
}
func ConnectAndGetData(url string) ([]byte, error) {
parsedURL, err := gourl.Parse(url)
parsedURL, err := stdurl.Parse(url)
if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
return nil, errors.NewError(err)
}
hostname := parsedURL.Hostname()
port := parsedURL.Port()
@@ -47,29 +81,28 @@ func ConnectAndGetData(url string) ([]byte, error) {
port = "1965"
}
host := fmt.Sprintf("%s:%s", hostname, port)
timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
// Establish the underlying TCP connection.
dialer := &net.Dialer{
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
Timeout: timeoutDuration,
}
conn, err := dialer.Dial("tcp", host)
if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err)
return nil, errors2.NewHostError(err)
}
// Make sure we always close the connection.
defer func() {
// No need to handle error:
// Connection will time out eventually if still open somehow.
_ = conn.Close()
}()
// Set read and write timeouts on the TCP connection.
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err)
return nil, errors2.NewHostError(err)
}
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err)
return nil, errors2.NewHostError(err)
}
// Perform the TLS handshake
@@ -79,8 +112,17 @@ func ConnectAndGetData(url string) ([]byte, error) {
// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
}
tlsConn := tls.Client(conn, tlsConfig)
if err := tlsConn.Handshake(); err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrNetworkTLS, err)
err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, errors2.NewHostError(err)
}
err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, errors2.NewHostError(err)
}
err = tlsConn.Handshake()
if err != nil {
return nil, errors2.NewHostError(err)
}
// We read `buf`-sized chunks and add data to `data`.
@@ -91,10 +133,10 @@ func ConnectAndGetData(url string) ([]byte, error) {
// Fix for stupid server bug:
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
// when the port is 1965 and is still specified explicitly in the URL.
_url, _ := common.ParseURL(url, "")
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
url2, _ := _url.ParseURL(url, "", true)
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort())))
if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrNetworkCannotWrite, err)
return nil, errors2.NewHostError(err)
}
// Read response bytes in len(buf) byte chunks
for {
@@ -103,90 +145,50 @@ func ConnectAndGetData(url string) ([]byte, error) {
data = append(data, buf[:n]...)
}
if len(data) > config.CONFIG.MaxResponseSize {
return nil, fmt.Errorf("%w: %v", common.ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
return nil, errors2.NewHostError(err)
}
if err != nil {
if errors.Is(err, io.EOF) {
break
}
return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err)
return nil, errors2.NewHostError(err)
}
}
return data, nil
}
// Visit given URL, using the Gemini protocol.
// Mutates given Snapshot with the data.
// In case of error, we store the error string
// inside snapshot and return the error.
func Visit(s *common.Snapshot) (err error) {
// Don't forget to also store error
// response code (if we have one)
// and header
defer func() {
if err != nil {
s.Error = null.StringFrom(err.Error())
if errors.As(err, new(*common.GeminiError)) {
s.Header = null.StringFrom(err.(*common.GeminiError).Header)
s.ResponseCode = null.IntFrom(int64(err.(*common.GeminiError).Code))
}
}
}()
s.Timestamp = null.TimeFrom(time.Now())
data, err := ConnectAndGetData(s.URL.String())
if err != nil {
return err
}
pageData, err := processData(data)
if err != nil {
return err
}
s.Header = null.StringFrom(pageData.ResponseHeader)
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
s.MimeType = null.StringFrom(pageData.MimeType)
s.Lang = null.StringFrom(pageData.Lang)
if pageData.GemText != "" {
s.GemText = null.StringFrom(pageData.GemText)
}
if pageData.Data != nil {
s.Data = null.ValueFrom(pageData.Data)
}
return nil
}
// processData returns results from
// parsing Gemini header data:
// Code, mime type and lang (optional)
// Returns error if header was invalid
func processData(data []byte) (*PageData, error) {
func processData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
header, body, err := getHeadersAndData(data)
if err != nil {
return nil, err
}
code, mimeType, lang := getMimeTypeAndLang(header)
logging.LogDebug("Header: %s", strings.TrimSpace(header))
if code != 20 {
return nil, common.NewErrGeminiStatusCode(code, header)
if code != 0 {
s.ResponseCode = null.IntFrom(int64(code))
}
if header != "" {
s.Header = null.StringFrom(header)
}
if mimeType != "" {
s.MimeType = null.StringFrom(mimeType)
}
if lang != "" {
s.Lang = null.StringFrom(lang)
}
pageData := PageData{
ResponseCode: code,
ResponseHeader: header,
MimeType: mimeType,
Lang: lang,
}
// If we've got a Gemini document, populate
// `GemText` field, otherwise raw data goes to `Data`.
if mimeType == "text/gemini" {
validBody, err := BytesToValidUTF8(body)
if err != nil {
return nil, fmt.Errorf("%w: %w", common.ErrUTF8Parse, err)
return nil, errors.NewError(err)
}
pageData.GemText = validBody
s.GemText = null.StringFrom(validBody)
} else {
pageData.Data = body
s.Data = null.ValueFrom(body)
}
return &pageData, nil
return &s, nil
}
// Checks for a Gemini header, which is
@@ -196,29 +198,42 @@ func processData(data []byte) (*PageData, error) {
func getHeadersAndData(data []byte) (string, []byte, error) {
firstLineEnds := slices.Index(data, '\n')
if firstLineEnds == -1 {
return "", nil, common.ErrGeminiResponseHeader
return "", nil, errors2.NewHostError(fmt.Errorf("error parsing header"))
}
firstLine := string(data[:firstLineEnds])
rest := data[firstLineEnds+1:]
return firstLine, rest, nil
return strings.TrimSpace(firstLine), rest, nil
}
// Parses code, mime type and language
// from a Gemini header.
// Examples:
// `20 text/gemini lang=en` (code, mimetype, lang)
// `20 text/gemini` (code, mimetype)
// `31 gemini://redirected.to/other/site` (code)
// getMimeTypeAndLang Parses code, mime type and language
// given a Gemini header.
func getMimeTypeAndLang(headers string) (int, string, string) {
// Regex that parses code, mimetype & optional charset/lang parameters
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`)
// First try to match the full format: "<code> <mimetype> [charset=<value>] [lang=<value>]"
// The regex looks for:
// - A number (\d+)
// - Followed by whitespace and a mimetype ([a-zA-Z0-9/\-+]+)
// - Optionally followed by charset and/or lang parameters in any order
// - Only capturing the lang value, ignoring charset
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:(?:[\s;]+(?:charset=[^;\s]+|lang=([a-zA-Z0-9-]+)))*)\s*$`)
matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 {
// Try to get code at least
re := regexp.MustCompile(`^(\d+)\s+`)
// If full format doesn't match, try to match redirect format: "<code> <URL>"
// This handles cases like "31 gemini://example.com"
re := regexp.MustCompile(`^(\d+)\s+(.+)$`)
matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 {
return 0, "", ""
// If redirect format doesn't match, try to match just a status code
// This handles cases like "99"
re := regexp.MustCompile(`^(\d+)\s*$`)
matches := re.FindStringSubmatch(headers)
if matches == nil || len(matches) <= 1 {
return 0, "", ""
}
code, err := strconv.Atoi(matches[1])
if err != nil {
return 0, "", ""
}
return code, "", ""
}
code, err := strconv.Atoi(matches[1])
if err != nil {
@@ -231,6 +246,10 @@ func getMimeTypeAndLang(headers string) (int, string, string) {
return 0, "", ""
}
mimeType := matches[2]
param := matches[3] // This will capture either charset or lang value
return code, mimeType, param
lang := matches[3] // Will be empty string if no lang parameter was found
return code, mimeType, lang
}
// isGeminiCapsule reports whether the snapshot carries no error and has a
// valid MimeType equal to "text/gemini".
func isGeminiCapsule(s *snapshot.Snapshot) bool {
	if s.Error.Valid {
		return false
	}
	if !s.MimeType.Valid {
		return false
	}
	return s.MimeType.String == "text/gemini"
}

View File

@@ -1,78 +1,366 @@
package gemini
import (
"slices"
"strings"
"testing"
"gemini-grc/common/snapshot"
)
// Test for input: `20 text/gemini`
func TestGetMimeTypeAndLang1(t *testing.T) {
func TestGetHeadersAndData(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini")
if code != 20 || mimeType != "text/gemini" || lang != "" {
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
tests := []struct {
input []byte
header string
body []byte
expectError bool
}{
{[]byte("20 text/gemini\r\nThis is the body"), "20 text/gemini", []byte("This is the body"), false},
{[]byte("20 text/gemini\nThis is the body"), "20 text/gemini", []byte("This is the body"), false},
{[]byte("53 No proxying!\r\n"), "53 No proxying!", []byte(""), false},
{[]byte("No header"), "", nil, true},
}
for _, test := range tests {
header, body, err := getHeadersAndData(test.input)
if test.expectError && err == nil {
t.Errorf("Expected error, got nil for input: %s", test.input)
}
if !test.expectError && err != nil {
t.Errorf("Unexpected error for input '%s': %v", test.input, err)
}
if header != test.header {
t.Errorf("Expected header '%s', got '%s' for input: %s", test.header, header, test.input)
}
if !slices.Equal(body, test.body) {
t.Errorf("Expected body '%s', got '%s' for input: %s", test.body, string(body), test.input)
}
}
}
func TestGetMimeTypeAndLang11(t *testing.T) {
func TestGetMimeTypeAndLang(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n")
if code != 20 || mimeType != "text/gemini" || lang != "" {
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
tests := []struct {
header string
code int
mimeType string
lang string
}{
{"20 text/gemini lang=en", 20, "text/gemini", "en"},
{"20 text/gemini", 20, "text/gemini", ""},
{"31 gemini://redirected.to/other/site", 31, "", ""},
{"20 text/plain;charset=utf-8", 20, "text/plain", ""},
{"20 text/plain;lang=el-GR", 20, "text/plain", "el-GR"},
{"20 text/gemini;lang=en-US;charset=utf-8", 20, "text/gemini", "en-US"}, // charset should be ignored
{"Invalid header", 0, "", ""},
{"99", 99, "", ""},
}
for _, test := range tests {
code, mimeType, lang := getMimeTypeAndLang(test.header)
if code != test.code {
t.Errorf("Expected code %d, got %d for header: %s", test.code, code, test.header)
}
if mimeType != test.mimeType {
t.Errorf("Expected mimeType '%s', got '%s' for header: %s", test.mimeType, mimeType, test.header)
}
if lang != test.lang {
t.Errorf("Expected lang '%s', got '%s' for header: %s", test.lang, lang, test.header)
}
}
}
func TestGetMimeTypeAndLang12(t *testing.T) {
func TestProcessData(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8")
if code != 20 || mimeType != "text/plain" || lang != "utf-8" {
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
tests := []struct {
name string
inputData []byte
expectedCode int
expectedMime string
expectedLang string
expectedData []byte
expectedError bool
}{
{
name: "Gemini document",
inputData: []byte("20 text/gemini\r\n# Hello\nWorld"),
expectedCode: 20,
expectedMime: "text/gemini",
expectedLang: "",
expectedData: []byte("# Hello\nWorld"),
expectedError: false,
},
{
name: "Gemini document with language",
inputData: []byte("20 text/gemini lang=en\r\n# Hello\nWorld"),
expectedCode: 20,
expectedMime: "text/gemini",
expectedLang: "en",
expectedData: []byte("# Hello\nWorld"),
expectedError: false,
},
{
name: "Non-Gemini document",
inputData: []byte("20 text/html\r\n<h1>Hello</h1>"),
expectedCode: 20,
expectedMime: "text/html",
expectedLang: "",
expectedData: []byte("<h1>Hello</h1>"),
expectedError: false,
},
{
name: "Error header",
inputData: []byte("53 No proxying!\r\n"),
expectedCode: 53,
expectedMime: "",
expectedLang: "",
expectedData: []byte(""),
expectedError: false,
},
{
name: "Invalid header",
inputData: []byte("Invalid header"),
expectedError: true,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
s := snapshot.Snapshot{}
result, err := processData(s, test.inputData)
if test.expectedError && err == nil {
t.Errorf("Expected error, got nil")
return
}
if !test.expectedError && err != nil {
t.Errorf("Unexpected error: %v", err)
return
}
if test.expectedError {
return
}
if int(result.ResponseCode.ValueOrZero()) != test.expectedCode {
t.Errorf("Expected code %d, got %d", test.expectedCode, int(result.ResponseCode.ValueOrZero()))
}
if result.MimeType.ValueOrZero() != test.expectedMime {
t.Errorf("Expected mimeType '%s', got '%s'", test.expectedMime, result.MimeType.ValueOrZero())
}
if result.Lang.ValueOrZero() != test.expectedLang {
t.Errorf("Expected lang '%s', got '%s'", test.expectedLang, result.Lang.ValueOrZero())
}
if test.expectedMime == "text/gemini" {
if !strings.Contains(result.GemText.String, string(test.expectedData)) {
t.Errorf("Expected GemText '%s', got '%s'", test.expectedData, result.GemText.String)
}
} else {
if !slices.Equal(result.Data.ValueOrZero(), test.expectedData) {
t.Errorf("Expected data '%s', got '%s'", test.expectedData, result.Data.ValueOrZero())
}
}
})
}
}
func TestGetMimeTypeAndLang13(t *testing.T) {
//// Mock Gemini server for testing ConnectAndGetData
//func mockGeminiServer(response string, delay time.Duration, closeConnection bool) net.Listener {
// listener, err := net.Listen("tcp", "127.0.0.1:0") // Bind to a random available port
// if err != nil {
// panic(fmt.Sprintf("Failed to create mock server: %v", err))
// }
//
// go func() {
// conn, err := listener.Accept()
// if err != nil {
// if !closeConnection { // Don't panic if we closed the connection on purpose
// panic(fmt.Sprintf("Failed to accept connection: %v", err))
// }
// return
// }
// defer conn.Close()
//
// time.Sleep(delay) // Simulate network latency
//
// _, err = conn.Write([]byte(response))
// if err != nil && !closeConnection {
// panic(fmt.Sprintf("Failed to write response: %v", err))
// }
// }()
//
// return listener
//}
// func TestConnectAndGetData(t *testing.T) {
// config.CONFIG = config.ConfigStruct{
// ResponseTimeout: 5,
// MaxResponseSize: 1024 * 1024,
// }
// tests := []struct {
// name string
// serverResponse string
// serverDelay time.Duration
// expectedData []byte
// expectedError bool
// closeConnection bool
// }{
// {
// name: "Successful response",
// serverResponse: "20 text/gemini\r\n# Hello",
// expectedData: []byte("20 text/gemini\r\n# Hello"),
// expectedError: false,
// },
// {
// name: "Server error",
// serverResponse: "50 Server error\r\n",
// expectedData: []byte("50 Server error\r\n"),
// expectedError: false,
// },
// {
// name: "Timeout",
// serverDelay: 6 * time.Second, // Longer than the timeout
// expectedError: true,
// },
// {
// name: "Server closes connection",
// closeConnection: true,
// expectedError: true,
// },
// }
// for _, test := range tests {
// t.Run(test.name, func(t *testing.T) {
// listener := mockGeminiServer(test.serverResponse, test.serverDelay, test.closeConnection)
// defer func() {
// test.closeConnection = true // Prevent panic in mock server
// listener.Close()
// }()
// addr := listener.Addr().String()
// data, err := ConnectAndGetData(fmt.Sprintf("gemini://%s/", addr))
// if test.expectedError && err == nil {
// t.Errorf("Expected error, got nil")
// }
// if !test.expectedError && err != nil {
// t.Errorf("Unexpected error: %v", err)
// }
// if !slices.Equal(data, test.expectedData) {
// t.Errorf("Expected data '%s', got '%s'", test.expectedData, data)
// }
// })
// }
// }
// func TestVisit(t *testing.T) {
// config.CONFIG = config.ConfigStruct{
// ResponseTimeout: 5,
// MaxResponseSize: 1024 * 1024,
// }
// tests := []struct {
// name string
// serverResponse string
// expectedCode int
// expectedMime string
// expectedError bool
// expectedLinks []string
// }{
// {
// name: "Successful response",
// serverResponse: "20 text/gemini\r\n# Hello\n=> /link1 Link 1\n=> /link2 Link 2",
// expectedCode: 20,
// expectedMime: "text/gemini",
// expectedError: false,
// expectedLinks: []string{"gemini://127.0.0.1:1965/link1", "gemini://127.0.0.1:1965/link2"},
// },
// {
// name: "Server error",
// serverResponse: "50 Server error\r\n",
// expectedCode: 50,
// expectedMime: "Server error",
// expectedError: false,
// expectedLinks: []string{},
// },
// }
// for _, test := range tests {
// t.Run(test.name, func(t *testing.T) {
// listener := mockGeminiServer(test.serverResponse, 0, false)
// defer listener.Close()
// addr := listener.Addr().String()
// snapshot, err := Visit(fmt.Sprintf("gemini://%s/", addr))
// if test.expectedError && err == nil {
// t.Errorf("Expected error, got nil")
// }
// if !test.expectedError && err != nil {
// t.Errorf("Unexpected error: %v", err)
// }
// if snapshot.ResponseCode.ValueOrZero() != int64(test.expectedCode) {
// t.Errorf("Expected code %d, got %d", test.expectedCode, snapshot.ResponseCode.ValueOrZero())
// }
// if snapshot.MimeType.ValueOrZero() != test.expectedMime {
// t.Errorf("Expected mimeType '%s', got '%s'", test.expectedMime, snapshot.MimeType.ValueOrZero())
// }
// if test.expectedLinks != nil {
// links, _ := snapshot.Links.Value()
// if len(links) != len(test.expectedLinks) {
// t.Errorf("Expected %d links, got %d", len(test.expectedLinks), len(links))
// }
// for i, link := range links {
// if link != test.expectedLinks[i] {
// t.Errorf("Expected link '%s', got '%s'", test.expectedLinks[i], link)
// }
// }
// }
// })
// }
// }
func TestVisit_InvalidURL(t *testing.T) {
t.Parallel()
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8")
if code != 20 || mimeType != "text/gemini" || lang != "utf-8" {
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
_, err := Visit("invalid-url")
if err == nil {
t.Errorf("Expected error for invalid URL, got nil")
}
}
// TestGetTypeAndLang2 feeds getMimeTypeAndLang a header whose parameter is
// written as "charset=en" and expects code 20, MIME type "text/gemini" and
// language "en" back.
func TestGetTypeAndLang2(t *testing.T) {
	t.Parallel()
	const header = "20 text/gemini charset=en"
	gotCode, gotMime, gotLang := getMimeTypeAndLang(header)
	if gotCode == 20 && gotMime == "text/gemini" && gotLang == "en" {
		return
	}
	t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", gotCode, gotMime, gotLang)
}
// TestGetTypeAndLang21 feeds getMimeTypeAndLang a header whose parameter is
// written as "lang=en" and expects code 20, MIME type "text/gemini" and
// language "en" back.
func TestGetTypeAndLang21(t *testing.T) {
	t.Parallel()
	const header = "20 text/gemini lang=en"
	gotCode, gotMime, gotLang := getMimeTypeAndLang(header)
	if gotCode == 20 && gotMime == "text/gemini" && gotLang == "en" {
		return
	}
	t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", gotCode, gotMime, gotLang)
}
// TestGetMimeTypeAndLang3 feeds getMimeTypeAndLang a redirect header (code 31
// followed by a URL) and expects the code to be reported with an empty MIME
// type and an empty language.
func TestGetMimeTypeAndLang3(t *testing.T) {
	t.Parallel()
	code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page")
	if code != 31 || mimeType != "" || lang != "" {
		// The message must name the code that is actually asserted (31).
		t.Errorf("Expected (31, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}
// TestGetMimeTypeAndLang4 feeds getMimeTypeAndLang a string that is not a
// response header and expects all three results to be zero values.
func TestGetMimeTypeAndLang4(t *testing.T) {
	t.Parallel()
	const header = "aaafdasdasd"
	gotCode, gotMime, gotLang := getMimeTypeAndLang(header)
	if gotCode == 0 && gotMime == "" && gotLang == "" {
		return
	}
	t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", gotCode, gotMime, gotLang)
}
// TestGetMimeTypeAndLang5 feeds getMimeTypeAndLang an empty string and
// expects all three results to be zero values.
func TestGetMimeTypeAndLang5(t *testing.T) {
	t.Parallel()
	const header = ""
	gotCode, gotMime, gotLang := getMimeTypeAndLang(header)
	if gotCode == 0 && gotMime == "" && gotLang == "" {
		return
	}
	t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", gotCode, gotMime, gotLang)
}
//func TestVisit_GeminiError(t *testing.T) {
// listener := mockGeminiServer("51 Not Found\r\n", 0, false)
// defer listener.Close()
// addr := listener.Addr().String()
//
// s, err := Visit(fmt.Sprintf("gemini://%s/", addr))
// if err != nil {
// t.Errorf("Unexpected error: %v", err)
// }
//
// expectedError := "51 Not Found"
// if s.Error.ValueOrZero() != expectedError {
// t.Errorf("Expected error in snapshot: %v, got %v", expectedError, s.Error)
// }
//
// expectedCode := 51
// if s.ResponseCode.ValueOrZero() != int64(expectedCode) {
// t.Errorf("Expected code %d, got %d", expectedCode, s.ResponseCode.ValueOrZero())
// }
//}

View File

@@ -26,7 +26,7 @@ func BytesToValidUTF8(input []byte) (string, error) {
if len(input) > maxSize {
return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize)
}
// Remove NULL byte 0x00 (ReplaceAll accepts slices)
// remove NULL byte 0x00 (ReplaceAll accepts slices)
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
if utf8.Valid(inputNoNull) {
return string(inputNoNull), nil

View File

@@ -2,10 +2,11 @@ package gemini
import (
"fmt"
"gemini-grc/common"
"strings"
"sync"
"gemini-grc/common/snapshot"
geminiUrl "gemini-grc/common/url"
"gemini-grc/logging"
)
@@ -16,7 +17,7 @@ import (
// list is stored for caching.
var RobotsCache sync.Map //nolint:gochecknoglobals
func populateBlacklist(key string) (entries []string) {
func populateRobotsCache(key string) (entries []string, _err error) {
// We either store an empty list when
// no rules, or a list of disallowed URLs.
// This applies even if we have an error
@@ -27,53 +28,60 @@ func populateBlacklist(key string) (entries []string) {
url := fmt.Sprintf("gemini://%s/robots.txt", key)
robotsContent, err := ConnectAndGetData(url)
if err != nil {
logging.LogDebug("robots.txt error %s", err)
return []string{}
return []string{}, err
}
robotsData, err := processData(robotsContent)
s, err := snapshot.SnapshotFromURL(url, true)
if err != nil {
return []string{}, nil
}
s, err = processData(*s, robotsContent)
if err != nil {
logging.LogDebug("robots.txt error %s", err)
return []string{}
return []string{}, nil
}
if robotsData.ResponseCode != 20 {
logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
return []string{}
if s.ResponseCode.ValueOrZero() != 20 {
logging.LogDebug("robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
return []string{}, nil
}
// Some return text/plain, others text/gemini.
// According to spec, the first is correct,
// however let's be lenient
var data string
switch {
case robotsData.MimeType == "text/plain":
data = string(robotsData.Data)
case robotsData.MimeType == "text/gemini":
data = robotsData.GemText
case s.MimeType.ValueOrZero() == "text/plain":
data = string(s.Data.ValueOrZero())
case s.MimeType.ValueOrZero() == "text/gemini":
data = s.GemText.ValueOrZero()
default:
return []string{}
return []string{}, nil
}
entries = ParseRobotsTxt(data, key)
return entries
return entries, nil
}
// RobotMatch checks if the snapshot URL matches
// a robots.txt allow rule.
func RobotMatch(u string) bool {
url, err := common.ParseURL(u, "")
func RobotMatch(u string) (bool, error) {
url, err := geminiUrl.ParseURL(u, "", true)
if err != nil {
return false
return false, err
}
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
logging.LogDebug("Checking robots.txt cache for %s", key)
var disallowedURLs []string
cacheEntries, ok := RobotsCache.Load(key)
if !ok {
// First time check, populate robot cache
disallowedURLs = populateBlacklist(key)
logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
disallowedURLs, err := populateRobotsCache(key)
if err != nil {
return false, err
}
if len(disallowedURLs) > 0 {
logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
}
} else {
disallowedURLs, _ = cacheEntries.([]string)
}
return isURLblocked(disallowedURLs, url.Full)
return isURLblocked(disallowedURLs, url.Full), nil
}
func isURLblocked(disallowedURLs []string, input string) bool {

View File

@@ -1,344 +0,0 @@
package gemini
import (
	"database/sql"
	"errors"
	"fmt"
	"strings"
	"time"

	"gemini-grc/common"
	_db "gemini-grc/db"
	"gemini-grc/logging"
	"gemini-grc/util"

	"github.com/guregu/null/v5"
	"github.com/jmoiron/sqlx"
)
// SpawnWorkers starts numOfWorkers crawler goroutines that share db.
//
// It (re)creates the package-level statusChan with one buffer slot per
// worker and starts PrintWorkerStatus on it. Every worker goroutine loops
// forever calling RunWorkerWithTx; nothing in this function stops them.
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
	logging.LogInfo("Spawning %d workers", numOfWorkers)
	statusChan = make(chan WorkerStatus, numOfWorkers)
	go PrintWorkerStatus(numOfWorkers, statusChan)

	// workerLoop is the body of one worker goroutine.
	workerLoop := func(workerID int) {
		// Random start-up delay so the workers do not all begin at once.
		startupDelay := time.Duration(util.SecureRandomInt(10)) * time.Second
		time.Sleep(startupDelay)
		for {
			RunWorkerWithTx(workerID, db, nil)
		}
	}

	for workerID := 0; workerID < numOfWorkers; workerID++ {
		go workerLoop(workerID)
	}
}
// RunWorkerWithTx performs one round of work for workerID inside a single
// database transaction.
//
// It reports "Starting up" on statusChan, begins a transaction on db, hands
// it to runWorker together with url (nil means "pick URLs from the
// database"), commits, and reports "Done" on return.
//
// Failure handling:
//   - failing to begin the transaction panics;
//   - a commit failure that _db.IsDeadlockError recognises is logged, the
//     worker sleeps 10 seconds, attempts a rollback and returns;
//   - any other commit failure panics.
func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) {
	statusChan <- WorkerStatus{
		id:     workerID,
		status: "Starting up",
	}
	defer func() {
		statusChan <- WorkerStatus{
			id:     workerID,
			status: "Done",
		}
	}()

	tx, err := db.Beginx()
	if err != nil {
		panic(fmt.Sprintf("Failed to begin transaction: %v", err))
	}

	runWorker(workerID, tx, url)

	logging.LogDebug("[%d] Committing transaction", workerID)
	err = tx.Commit()
	if err == nil {
		logging.LogDebug("[%d] Worker done!", workerID)
		return
	}

	// %w is only meaningful to fmt.Errorf; use %v when formatting for a log.
	logging.LogError("[%d] Failed to commit transaction: %v", workerID, err)
	if !_db.IsDeadlockError(err) {
		panic(fmt.Sprintf("[%d] Failed to commit transaction: %v", workerID, err))
	}

	logging.LogError("[%d] Deadlock detected. Rolling back", workerID)
	time.Sleep(10 * time.Second)
	// database/sql marks a transaction as finished once Commit has been
	// called, even when Commit fails, so Rollback reports sql.ErrTxDone here.
	// That is not a failure worth crashing for; anything else still is.
	if rollbackErr := tx.Rollback(); rollbackErr != nil && !errors.Is(rollbackErr, sql.ErrTxDone) {
		panic(fmt.Sprintf("[%d] Failed to roll back transaction: %v", workerID, rollbackErr))
	}
}
// runWorker visits a batch of URLs within the transaction tx.
//
// When url is nil the batch comes from _db.GetURLsToVisit; an error there
// panics, and an empty batch makes the worker sleep for one minute and
// return. When url is non-nil it is parsed with common.ParseURL and becomes
// the only entry of the batch; an unparsable URL is logged and skipped.
//
// Each URL is handed to workOnUrl. Any error workOnUrl returns is treated as
// unexpected: it is logged and passed to util.PrintStackAndPanic.
func runWorker(workerID int, tx *sqlx.Tx, url *string) {
	var urls []string

	if url == nil {
		// If not given a specific URL, get some to visit from the db.
		statusChan <- WorkerStatus{
			id:     workerID,
			status: "Getting URLs",
		}
		var err error
		urls, err = _db.GetURLsToVisit(tx)
		if err != nil {
			// %w is only meaningful to fmt.Errorf; use %v for log output.
			logging.LogError("[%d] GeminiError retrieving snapshot: %v", workerID, err)
			panic("This should never happen")
		}
		if len(urls) == 0 {
			logging.LogInfo("[%d] No URLs to visit.", workerID)
			time.Sleep(1 * time.Minute)
			return
		}
	} else {
		geminiURL, err := common.ParseURL(*url, "")
		if err != nil {
			logging.LogError("Invalid URL given: %s", *url)
			return
		}
		urls = []string{geminiURL.String()}
	}

	// Start visiting URLs.
	total := len(urls)
	for i, u := range urls {
		logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, u)
		// We differentiate between errors:
		// unexpected errors are the ones returned from workOnUrl, and they
		// should never happen, so we panic. Expected errors are stored as
		// strings within the snapshot.
		if err := workOnUrl(workerID, tx, u); err != nil {
			logging.LogError("[%d] Unexpected GeminiError %v while visiting %s", workerID, err, u)
			util.PrintStackAndPanic(err)
		}
		logging.LogDebug("[%d] Done %d/%d.", workerID, i+1, total)
	}
}
// workOnUrl visits a URL and stores the result.
//
// Unexpected errors are returned to the caller. Expected errors (those for
// which common.IsKnownError reports true, a robots.txt disallow, or a DNS
// resolution failure) are stored as text in the snapshot's Error field and
// the snapshot is persisted with _db.OverwriteSnapshot.
//
// Steps, in order:
//  1. reject an empty URL; silently skip blacklisted URLs;
//  2. record a robots.txt disallow as the snapshot error and stop;
//  3. resolve the host's IPs; a failure is recorded as the snapshot error;
//  4. wait while another worker holds any of those IPs in the pool, giving
//     up (returning nil without storing anything) after two 2-second waits;
//  5. add the IPs to the pool, visit the URL, handle redirects, extract and
//     store links for text/gemini pages, and overwrite the snapshot.
//
// The IPs are removed from the pool about one second after the function
// returns, from a separate goroutine.
func workOnUrl(workerID int, tx *sqlx.Tx, url string) (err error) {
	if url == "" {
		return fmt.Errorf("nil URL given")
	}

	if IsBlacklisted(url) {
		logging.LogDebug("[%d] URL matches Blacklist, ignoring %s", workerID, url)
		return nil
	}

	s := common.SnapshotFromURL(url)

	// If URL matches a robots.txt disallow line,
	// add it as an error so next time it won't be
	// crawled.
	if RobotMatch(url) {
		s.Error = null.StringFrom(common.ErrGeminiRobotsDisallowed.Error())
		err = _db.OverwriteSnapshot(workerID, tx, s)
		if err != nil {
			return fmt.Errorf("[%d] %w", workerID, err)
		}
		return nil
	}

	// Resolve IP address via DNS
	statusChan <- WorkerStatus{
		id:     workerID,
		status: fmt.Sprintf("Resolving %s", url),
	}
	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
		s.Error = null.StringFrom(err.Error())
		err = _db.OverwriteSnapshot(workerID, tx, s)
		if err != nil {
			return fmt.Errorf("[%d] %w", workerID, err)
		}
		return nil
	}

	// The attempt counter must live outside the loop: when it was declared
	// inside, it was reset on every iteration and the give-up condition
	// below could never become true, so the worker waited forever.
	count := 1
	for isAnotherWorkerVisitingHost(workerID, IPs) {
		logging.LogDebug("[%d] Another worker is visiting this host, waiting", workerID)
		statusChan <- WorkerStatus{
			id:     workerID,
			status: fmt.Sprintf("Waiting to grab lock for host %s", s.Host),
		}
		time.Sleep(2 * time.Second) // Avoid flood-retrying
		count++
		if count == 3 {
			// Give up on this URL for now; nothing is stored.
			return nil
		}
	}

	statusChan <- WorkerStatus{
		id:     workerID,
		status: fmt.Sprintf("Adding to pool %s", url),
	}
	AddIPsToPool(IPs)
	// After finishing, remove the host IPs from
	// the connections pool, with a small delay
	// to avoid potentially hitting the same IP quickly.
	defer func() {
		go func() {
			time.Sleep(1 * time.Second)
			statusChan <- WorkerStatus{
				id:     workerID,
				status: fmt.Sprintf("Removing from pool %s", url),
			}
			RemoveIPsFromPool(IPs)
		}()
	}()

	statusChan <- WorkerStatus{
		id:     workerID,
		status: fmt.Sprintf("Visiting %s", url),
	}

	err = Visit(s)
	if err != nil {
		if !common.IsKnownError(err) {
			// %w is only meaningful to fmt.Errorf; use %v for log output.
			logging.LogError("[%d] Unknown error visiting %s: %v", workerID, url, err)
			return err
		}
		s.Error = null.StringFrom(err.Error())
		// Check if error is redirection, and handle it.
		// Read the message from the value errors.As extracted: a direct
		// type assertion on err panics when the GeminiError is wrapped.
		var geminiErr *common.GeminiError
		if errors.As(err, &geminiErr) && geminiErr.Msg == "redirect" {
			err = handleRedirection(workerID, tx, s)
			if err != nil {
				if !common.IsKnownError(err) {
					return err
				}
				s.Error = null.StringFrom(err.Error())
			}
		}
	}

	// If this is a gemini page, parse possible links inside
	if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
		links := GetPageLinks(s.URL, s.GemText.String)
		if len(links) > 0 {
			logging.LogDebug("[%d] Found %d links", workerID, len(links))
			s.Links = null.ValueFrom(links)
			err = storeLinks(tx, s)
			if err != nil {
				return err
			}
		}
	} else {
		logging.LogDebug("[%d] Not text/gemini, so not looking for page links", workerID)
	}

	// The result line is logged whether or not the overwrite succeeded.
	err = _db.OverwriteSnapshot(workerID, tx, s)
	logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
	return err
}
// isAnotherWorkerVisitingHost reports whether any address in IPs is
// currently present in IPPool.IPs. The lookup is done while holding the
// pool's read lock. workerID is used only for the debug log line.
func isAnotherWorkerVisitingHost(workerID int, IPs []string) bool {
	IPPool.Lock.RLock()
	defer IPPool.Lock.RUnlock()

	logging.LogDebug("[%d] Checking pool for IPs", workerID)
	for idx := range IPs {
		if _, inPool := IPPool.IPs[IPs[idx]]; inPool {
			return true
		}
	}
	return false
}
// storeLinks saves the links found on snapshot s as new snapshots.
//
// Only links accepted by shouldPersistURL are kept. Each becomes a snapshot
// holding the link URL, its hostname and the current time, and the batch is
// handed to _db.SaveLinksToDBinBatches. Nothing is written when s.Links is
// not valid or no link qualifies.
func storeLinks(tx *sqlx.Tx, s *common.Snapshot) error {
	if !s.Links.Valid {
		return nil
	}

	var pending []*common.Snapshot
	for _, target := range s.Links.ValueOrZero() {
		if !shouldPersistURL(&target) {
			continue
		}
		pending = append(pending, &common.Snapshot{
			URL:       target,
			Host:      target.Hostname,
			Timestamp: null.TimeFrom(time.Now()),
		})
	}

	if len(pending) == 0 {
		return nil
	}
	return _db.SaveLinksToDBinBatches(tx, pending)
}
// shouldPersistURL reports whether u should be saved in the database.
// Only URLs whose string form starts with "gemini://" qualify.
func shouldPersistURL(u *common.URL) bool {
	const geminiScheme = "gemini://"
	full := u.String()
	return strings.HasPrefix(full, geminiScheme)
}
// haveWeVisitedURL reports whether u is already known to the database,
// looking first in the urls table and then in the snapshots table.
//
// It returns true as soon as either table has a row whose url column equals
// u.String(). Query failures are returned wrapped in common.ErrDatabase.
func haveWeVisitedURL(tx *sqlx.Tx, u *common.URL) (bool, error) {
	target := u.String()
	// The snapshots query filters on the plain column name: the previous
	// "snapshot.url" referred to a table name that is not in the FROM clause.
	queries := []string{
		`SELECT TRUE FROM urls WHERE url=$1 LIMIT 1`,
		`SELECT TRUE FROM snapshots WHERE url=$1 LIMIT 1`,
	}
	for _, query := range queries {
		// sqlx's Select requires a slice destination; an empty result
		// simply leaves the slice empty instead of producing an error.
		var rows []bool
		if err := tx.Select(&rows, query, target); err != nil {
			return false, fmt.Errorf("%w: %w", common.ErrDatabase, err)
		}
		if len(rows) > 0 {
			return true, nil
		}
	}
	return false, nil
}
// handleRedirection saves the redirect target of snapshot s as a new
// snapshot.
//
// The target is obtained from extractRedirectTarget, which is given the
// snapshot's URL and its stored error text. A failure there is returned
// unchanged (and additionally logged at debug level when it is
// common.ErrGeminiRedirect). Targets rejected by shouldPersistURL are
// ignored; accepted ones are stored with _db.SaveSnapshotIfNew.
func handleRedirection(workerID int, tx *sqlx.Tx, s *common.Snapshot) error {
	target, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
	if err != nil {
		if errors.Is(err, common.ErrGeminiRedirect) {
			logging.LogDebug("[%d] %s", workerID, err)
		}
		return err
	}
	logging.LogDebug("[%d] Page redirects to %s", workerID, target)

	if !shouldPersistURL(target) {
		return nil
	}

	// Insert fresh snapshot with new URL
	redirected := &common.Snapshot{
		URL:       *target,
		Host:      target.Hostname,
		Timestamp: null.TimeFrom(time.Now()),
	}
	logging.LogDebug("[%d] Saving redirection URL %s", workerID, redirected.URL.String())
	return _db.SaveSnapshotIfNew(tx, redirected)
}
// GetSnapshotFromURL loads the snapshot stored for url using transaction tx.
//
// The query is limited to one row, so the returned slice holds at most one
// element; it is empty when no snapshot matches. A query error is returned
// as-is together with a nil slice.
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]common.Snapshot, error) {
	const query = `
SELECT *
FROM snapshots
WHERE url=$1
LIMIT 1
`
	var found []common.Snapshot
	if err := tx.Select(&found, query, url); err != nil {
		return nil, err
	}
	return found, nil
}