Lots of features, first version that reliably crawls Geminispace.

- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host
- [x] URL Blacklist
- [x] Save image/* and text/* files
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
.

.

.
This commit is contained in:
2024-10-21 20:04:09 +03:00
parent 212345764b
commit cd60c1363b
37 changed files with 1231 additions and 323 deletions

18
gemini/blacklist.go Normal file
View File

@@ -0,0 +1,18 @@
package gemini
import "strings"
// Blacklist points to the list of blacklisted URL prefixes. It is nil until
// InBlacklist loads it from blacklist.txt on first use.
var Blacklist *[]string
// InBlacklist reports whether the snapshot's URL starts with any of the
// prefixes listed in blacklist.txt.
//
// The list is loaded through ReadLines on the first call and cached in
// Blacklist. Entries are trimmed and blank entries are skipped: the empty
// string is a prefix of every string, so a stray empty line in the file would
// otherwise blacklist every URL.
//
// NOTE(review): the lazy load is not synchronized. If this is called from
// several goroutines at once, guard the initialization (e.g. with sync.Once).
func InBlacklist(s *Snapshot) bool {
	if Blacklist == nil {
		data := ReadLines("blacklist.txt")
		Blacklist = &data
	}
	target := s.URL.String()
	for _, entry := range *Blacklist {
		prefix := strings.TrimSpace(entry)
		if prefix == "" {
			continue
		}
		if strings.HasPrefix(target, prefix) {
			return true
		}
	}
	return false
}

32
gemini/connectionPool.go Normal file
View File

@@ -0,0 +1,32 @@
package gemini
import (
"gemini-grc/logging"
)
// IpPool is the package-wide IpAddressPool, created with an empty IPs map.
// AddIPsToPool and RemoveIPsFromPool update it under its lock.
var IpPool IpAddressPool = IpAddressPool{IPs: make(map[string]int)}
// AddIPsToPool increments the counter of every given IP in IpPool while
// holding the pool's write lock.
func AddIPsToPool(IPs []string) {
	IpPool.Lock.Lock()
	defer IpPool.Lock.Unlock()

	for i := 0; i < len(IPs); i++ {
		addr := IPs[i]
		logging.LogDebug("Adding %s to pool", addr)
		IpPool.IPs[addr] = IpPool.IPs[addr] + 1
	}
}
// RemoveIPsFromPool decrements the counter of every given IP that is present
// in IpPool, deleting the entry when its counter is exactly 1. IPs that are
// not in the pool are ignored. The pool's write lock is held throughout.
func RemoveIPsFromPool(IPs []string) {
	IpPool.Lock.Lock()
	defer IpPool.Lock.Unlock()

	for _, addr := range IPs {
		count, present := IpPool.IPs[addr]
		if !present {
			continue
		}
		logging.LogDebug("Removing %s from pool", addr)
		if count == 1 {
			delete(IpPool.IPs, addr)
			continue
		}
		IpPool.IPs[addr] = count - 1
	}
}

113
gemini/files.go Normal file
View File

@@ -0,0 +1,113 @@
package gemini
import (
"fmt"
"gemini-grc/logging"
"net/url"
"os"
"path"
"path/filepath"
"strings"
)
// sanitizePath encodes invalid filesystem characters using URL encoding and
// returns a relative path (a leading '/' is dropped by filepath.Join).
// Example:
// /example/path/to/page?query=param&another=value
// would become
// example/path/to/page%3Fquery%3Dparam%26another%3Dvalue
//
// Each '/'-separated component is percent-decoded first, so that it is not
// double-encoded, and then re-encoded. A component that decodes to ".."
// (for example "%2e%2e") is written as "%2E%2E": left as-is it would be
// collapsed by filepath.Join and let the result climb out of the directory
// it is later joined to.
func sanitizePath(p string) string {
	components := strings.Split(p, "/")
	for i, component := range components {
		// Decode any existing percent-encoding; fall back to the raw
		// component when it is not validly encoded.
		decoded, err := url.PathUnescape(component)
		if err != nil {
			decoded = component
		}
		// QueryEscape encodes spaces as '+'; use '%20' instead. A literal
		// '+' has already become "%2B" at this point, so it is unaffected.
		encoded := strings.ReplaceAll(url.QueryEscape(decoded), "+", "%20")
		if encoded == ".." {
			encoded = "%2E%2E"
		}
		components[i] = encoded
	}
	return filepath.Join(components...)
}
// calcFilePath constructs a safe file path from the root path and URL path.
// It URL-encodes invalid filesystem characters to ensure the path is valid.
//
// The URL path is cleaned first; if any component of the cleaned path is
// exactly "..", an error is returned. Only whole components are checked, so
// names that merely contain two dots (e.g. "notes..old.gmi") are accepted.
func calcFilePath(rootPath, urlPath string) (string, error) {
	// Normalize the URL path
	cleanPath := filepath.Clean(urlPath)

	// Safe check to prevent directory traversal
	for _, component := range strings.Split(filepath.ToSlash(cleanPath), "/") {
		if component == ".." {
			return "", fmt.Errorf("Invalid URL path: contains directory traversal")
		}
	}

	// Sanitize the path by encoding invalid characters, then anchor it
	// under the root path.
	safePath := sanitizePath(cleanPath)
	return filepath.Join(rootPath, safePath), nil
}
// SaveToFile writes the snapshot's content to a file under
// rootPath/<hostname>/<url path> and closes done when it has finished.
//
// An empty path, or a path ending in '/', is stored as index.gmi inside the
// corresponding directory. text/gemini snapshots are written from GemText and
// everything else from Data, matching where processResponse stores each kind
// of body. Failures are logged, not returned; done is closed on every exit
// path so a caller waiting on it is never left blocked.
func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
	defer close(done)

	parentPath := path.Join(rootPath, s.URL.Hostname)
	urlPath := s.URL.Path
	// If path is empty, add `index.gmi` as the file to save
	if urlPath == "" || urlPath == "." {
		urlPath = "index.gmi"
	}
	// If path ends with '/' then add index.gmi for the
	// directory to be created.
	if strings.HasSuffix(urlPath, "/") {
		urlPath += "index.gmi"
	}

	finalPath, err := calcFilePath(parentPath, urlPath)
	if err != nil {
		logging.LogError("Error saving %s: %v", s.URL, err)
		return
	}

	// Ensure the directory exists
	dir := filepath.Dir(finalPath)
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		logging.LogError("Failed to create directory: %v", err)
		return
	}

	if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
		err = os.WriteFile(finalPath, []byte(s.GemText.String), 0666)
	} else {
		err = os.WriteFile(finalPath, s.Data.V, 0666)
	}
	if err != nil {
		logging.LogError("Error saving %s: %v", s.URL.Full, err)
	}
}
// ReadLines reads the file at path and returns its content split on '\n'.
// A single trailing empty line (present when the file ends with a newline)
// is dropped. It panics if the file cannot be read.
func ReadLines(path string) []string {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		panic(fmt.Sprintf("Failed to read blacklist file: %s", readErr))
	}

	entries := strings.Split(string(raw), "\n")
	if last := len(entries) - 1; entries[last] == "" {
		entries = entries[:last]
	}

	logging.LogInfo("Loaded %d blacklist URLs", len(entries))
	return entries
}

186
gemini/gemini.go Normal file
View File

@@ -0,0 +1,186 @@
package gemini
import (
"errors"
"fmt"
"gemini-grc/logging"
"net/url"
go_url "net/url"
"regexp"
"strconv"
"strings"
)
// isGeminiURL reports whether url parses successfully and starts with the
// "gemini://" prefix. A parse failure is logged as a warning.
func isGeminiURL(url string) bool {
	if _, parseErr := go_url.Parse(url); parseErr != nil {
		logging.LogWarn("[%s] Invalid URL: %v", url, parseErr)
		return false
	}
	return strings.HasPrefix(url, "gemini://")
}
// parseLinks sends the full URL of every gemini:// link of the snapshot to
// queue. Each send runs in its own goroutine, so this function does not
// block on the channel.
func parseLinks(s Snapshot, queue chan string) {
	links := *s.Links
	for i := range links {
		candidate := links[i]
		if !strings.HasPrefix(candidate.Full, "gemini://") {
			continue
		}
		go func(target GeminiUrl) {
			queue <- target.Full
		}(candidate)
	}
}
// checkGeminiStatusCode maps a Gemini status code to an error. Only code 20
// yields nil; codes in the 1x, 3x, 4x, 5x and 6x ranges produce an error
// naming the category, and anything else is reported as unexpected.
func checkGeminiStatusCode(code int) error {
	if code == 20 {
		return nil
	}
	// Integer division by ten selects the category: it is 1 exactly for
	// 10..19, 3 for 30..39 and so on; negative and single-digit codes
	// never hit a case.
	switch code / 10 {
	case 1:
		return fmt.Errorf("Gemini response %d needs data input", code)
	case 3:
		return fmt.Errorf("Gemini response %d redirect", code)
	case 4:
		return fmt.Errorf("Gemini response %d server error", code)
	case 5:
		return fmt.Errorf("Gemini response %d server permanent error", code)
	case 6:
		return fmt.Errorf("Gemini response %d certificate error", code)
	}
	return fmt.Errorf("Unexpected/unhandled Gemini response %d", code)
}
// ProcessGemini extracts the link lines from the snapshot's GemText,
// converts each one to an absolute URL and appends the parsed result to
// snapshot.Links. Lines that cannot be normalized or parsed are logged as
// warnings and skipped. The same snapshot pointer is returned.
func ProcessGemini(snapshot *Snapshot) *Snapshot {
	pageURL := snapshot.URL.String()

	linkLines := ExtractLinkLines(snapshot.GemText.String)
	logging.LogDebug("[%s] Found %d links", pageURL, len(linkLines))

	for _, line := range linkLines {
		absolute, descr, normErr := NormalizeLink(line, pageURL)
		if normErr != nil {
			logging.LogWarn("Cannot normalize URL in line '%s': %v", line, normErr)
			continue
		}
		parsed, parseErr := ParseUrl(absolute, descr)
		if parseErr != nil {
			logging.LogWarn("Cannot parse URL in link '%s': %v", line, parseErr)
			continue
		}
		if snapshot.Links != nil {
			*snapshot.Links = append(*snapshot.Links, *parsed)
			continue
		}
		// First link found: allocate the list.
		snapshot.Links = &LinkList{*parsed}
	}
	return snapshot
}
// ParseUrl parses input into a GeminiUrl carrying the given description.
// When the URL has no explicit port, 1965 is used. An error is returned if
// the URL cannot be parsed or its port is not a number.
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
	parsed, err := url.Parse(input)
	if err != nil {
		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
	}

	portText := parsed.Port()
	if portText == "" {
		portText = "1965"
	}
	portNumber, err := strconv.Atoi(portText)
	if err != nil {
		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
	}

	result := GeminiUrl{
		Protocol: parsed.Scheme,
		Hostname: parsed.Hostname(),
		Port:     portNumber,
		Path:     parsed.Path,
		Descr:    descr,
		Full:     parsed.String(),
	}
	return &result, nil
}
// linkLineRe matches a single Gemtext link line: "=>" followed by at least
// one space or tab. It is compiled once instead of on every call.
var linkLineRe = regexp.MustCompile(`^=>[ \t]+.*`)

// ExtractLinkLines takes a Gemtext document as a string and returns all
// lines that are link lines, in document order (nil when there are none).
//
// Lines starting with "```" toggle preformatted mode; while it is on, lines
// are plain text and are never reported as links, even if they begin
// with "=>".
func ExtractLinkLines(gemtext string) []string {
	var links []string
	preformatted := false
	for _, line := range strings.Split(gemtext, "\n") {
		if strings.HasPrefix(line, "```") {
			preformatted = !preformatted
			continue
		}
		if preformatted {
			continue
		}
		if match := linkLineRe.FindString(line); match != "" {
			links = append(links, match)
		}
	}
	return links
}
// linkPartsRe splits a link line into its URL (group 1) and the optional
// remainder of the line (group 2). It is compiled once instead of on every
// call.
var linkPartsRe = regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)

// NormalizeLink takes a single link line and the current URL, and returns
// the link's URL converted to an absolute URL together with its description.
//
// Relative URLs are resolved against currentURL; absolute URLs are returned
// as parsed. The description is the rest of the line with surrounding
// whitespace removed (this also drops a trailing '\r' from CRLF documents);
// it is empty when the line has no description. An error is returned when
// currentURL is invalid, the line is not a link line, or the link's URL
// contains invalid percent-encoding or cannot be parsed.
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
	baseURL, err := url.Parse(currentURL)
	if err != nil {
		return "", "", fmt.Errorf("Invalid current URL: %w", err)
	}

	matches := linkPartsRe.FindStringSubmatch(linkLine)
	if matches == nil {
		return "", "", fmt.Errorf("Not a link line: %v", linkLine)
	}
	target := matches[1]

	// Validation only: reject URLs with malformed percent-escapes.
	if _, err := url.QueryUnescape(target); err != nil {
		return "", "", fmt.Errorf("Error decoding URL: %w", err)
	}

	parsedURL, err := url.Parse(target)
	if err != nil {
		return "", "", fmt.Errorf("Invalid URL '%s': %w", target, err)
	}
	// Resolve relative URLs against the base URL
	if !parsedURL.IsAbs() {
		parsedURL = baseURL.ResolveReference(parsedURL)
	}

	return parsedURL.String(), strings.TrimSpace(matches[2]), nil
}
// leadingDigitsRe captures one or two digits at the very start of a string.
// It is compiled once instead of on every call.
var leadingDigitsRe = regexp.MustCompile(`^(\d{1,2})`)

// ParseFirstTwoDigits takes a string and returns the first one or two digits as an int.
// If no valid digits are found, it returns an error.
func ParseFirstTwoDigits(input string) (int, error) {
	matches := leadingDigitsRe.FindStringSubmatch(input)
	if matches == nil {
		return 0, errors.New("no digits found at the beginning of the string")
	}
	value, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0, fmt.Errorf("failed to convert matched digits to int: %w", err)
	}
	return value, nil
}

57
gemini/gemini_url.go Normal file
View File

@@ -0,0 +1,57 @@
package gemini
import (
"encoding/json"
"fmt"
"gemini-grc/logging"
)
// GeminiUrl is a parsed URL plus an optional link description. ParseUrl
// builds it from a URL string. Every field is serialized to JSON under the
// key in its tag and omitted when it holds its zero value.
type GeminiUrl struct {
// Protocol is the URL scheme.
Protocol string `json:"protocol,omitempty"`
// Hostname is the host without the port.
Hostname string `json:"hostname,omitempty"`
// Port is the numeric port; ParseUrl uses 1965 when the URL has none.
Port int `json:"port,omitempty"`
// Path is the URL path.
Path string `json:"path,omitempty"`
// Descr is the description that accompanied the link, if any.
Descr string `json:"descr,omitempty"`
// Full is the complete URL string; String returns it.
Full string `json:"full,omitempty"`
}
// Scan populates g from a database value. A nil value resets g to the zero
// GeminiUrl. A string or []byte value is parsed with ParseUrl (with an empty
// description); []byte is accepted as well as string because a driver may
// deliver text in either form. Any other type is an error.
func (g *GeminiUrl) Scan(value interface{}) error {
	var raw string
	switch v := value.(type) {
	case nil:
		// Clear the fields in the current GeminiUrl object (not the pointer itself)
		*g = GeminiUrl{}
		return nil
	case string:
		raw = v
	case []byte:
		raw = string(v)
	default:
		return fmt.Errorf("failed to scan GeminiUrl: expected string, got %T", value)
	}

	parsedUrl, err := ParseUrl(raw, "")
	if err != nil {
		return err
	}
	*g = *parsedUrl
	return nil
}
// String returns the complete URL as stored in the Full field.
func (u GeminiUrl) String() string {
	full := u.Full
	return full
}
// GeminiUrltoJSON serializes g to a JSON string. A marshalling failure is
// logged and whatever json.Marshal returned is still converted and returned.
func GeminiUrltoJSON(g GeminiUrl) string {
	encoded, marshalErr := json.Marshal(g)
	if marshalErr != nil {
		logging.LogError("Error serializing to JSON: %w", marshalErr)
	}
	return string(encoded)
}
// GeminiUrlFromJSON decodes a GeminiUrl from its JSON representation. A
// decoding failure is logged and the (possibly partially filled) value is
// returned anyway.
func GeminiUrlFromJSON(input string) GeminiUrl {
	decoded := GeminiUrl{}
	if unmarshalErr := json.Unmarshal([]byte(input), &decoded); unmarshalErr != nil {
		logging.LogError("Error deserializing from JSON: %w", unmarshalErr)
	}
	return decoded
}

54
gemini/ip-address-pool.go Normal file
View File

@@ -0,0 +1,54 @@
package gemini
import "sync"
// IpAddressPool is used to limit requests per IP address. It maps an IP
// address to its number of active connections.
//
// The methods below take Lock themselves and lazily allocate IPs, so a
// zero-value IpAddressPool is ready to use. Code that touches IPs directly
// must hold Lock.
type IpAddressPool struct {
	IPs  map[string]int
	Lock sync.RWMutex
}

// ensureMap allocates the IPs map if needed. The caller must hold the write
// lock.
func (p *IpAddressPool) ensureMap() {
	if p.IPs == nil {
		p.IPs = make(map[string]int)
	}
}

// Set stores value as the counter for key.
func (p *IpAddressPool) Set(key string, value int) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	p.ensureMap()
	p.IPs[key] = value
}

// Get returns the counter for key, or 0 when key is not in the pool.
func (p *IpAddressPool) Get(key string) int {
	p.Lock.RLock()
	defer p.Lock.RUnlock()
	// Reading a missing key (or a nil map) yields the zero value.
	return p.IPs[key]
}

// Delete removes key from the pool; it is a no-op when key is absent.
func (p *IpAddressPool) Delete(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	delete(p.IPs, key)
}

// Incr adds one to the counter for key, starting from 0 when key is absent.
func (p *IpAddressPool) Incr(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	p.ensureMap()
	p.IPs[key]++
}

// Decr subtracts one from the counter for key and removes the key when the
// counter reaches 0. It is a no-op when key is absent.
func (p *IpAddressPool) Decr(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	val, ok := p.IPs[key]
	if !ok {
		return
	}
	if next := val - 1; next == 0 {
		delete(p.IPs, key)
	} else {
		p.IPs[key] = next
	}
}

192
gemini/network.go Normal file
View File

@@ -0,0 +1,192 @@
package gemini
import (
"crypto/tls"
"fmt"
"gemini-grc/config"
"io"
"net"
"regexp"
"slices"
"strconv"
"time"
"github.com/guregu/null/v5"
)
// getHostIPAddresses resolves hostname with net.LookupHost and returns the
// resulting addresses, or the lookup error.
//
// It only performs the lookup: it neither reads nor modifies IpPool, so it
// takes no lock. Deciding whether a host is already being visited is left to
// the caller.
func getHostIPAddresses(hostname string) ([]string, error) {
	addrs, err := net.LookupHost(hostname)
	if err != nil {
		return nil, err
	}
	return addrs, nil
}
// Visit connects to the snapshot's URL using the Gemini protocol and stores
// the outcome in the snapshot: the parsed response on success, or a message
// in s.Error on failure. It never returns an error itself.
//
// The connection is made to s.Host on s.URL.Port, with
// config.CONFIG.ResponseTimeout seconds as both the dial timeout and the
// read/write deadline. Reading stops with an error as soon as the response
// grows beyond config.CONFIG.MaxResponseSize bytes.
func Visit(s *Snapshot) {
	timeout := time.Duration(config.CONFIG.ResponseTimeout) * time.Second

	// Establish the underlying TCP connection. JoinHostPort brackets IPv6
	// literals, which a plain "%s:%d" would turn into an invalid address.
	host := net.JoinHostPort(s.Host, strconv.Itoa(s.URL.Port))
	dialer := &net.Dialer{
		Timeout:   timeout, // Set the overall connection timeout
		KeepAlive: 30 * time.Second,
	}
	conn, err := dialer.Dial("tcp", host)
	if err != nil {
		s.Error = null.StringFrom(fmt.Sprintf("TCP connection failed: %v", err))
		return
	}
	// Make sure we always close the connection. A close failure is only
	// recorded when nothing else went wrong, so it cannot hide the real
	// cause of a failed visit.
	defer func() {
		if closeErr := conn.Close(); closeErr != nil && !s.Error.Valid {
			s.Error = null.StringFrom(fmt.Sprintf("Error closing connection: %s", closeErr))
		}
	}()

	// Set read and write timeouts on the TCP connection.
	if err := conn.SetDeadline(time.Now().Add(timeout)); err != nil {
		s.Error = null.StringFrom(fmt.Sprintf("Error setting connection deadline: %s", err))
		return
	}

	// Perform the TLS handshake
	tlsConfig := &tls.Config{
		InsecureSkipVerify: true,           // Accept all TLS certs, even if insecure.
		ServerName:         s.URL.Hostname, // SNI
		// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
	}
	tlsConn := tls.Client(conn, tlsConfig)
	if err := tlsConn.Handshake(); err != nil {
		s.Error = null.StringFrom(fmt.Sprintf("TLS handshake error: %v", err))
		return
	}

	// Send Gemini request to trigger server response.
	if _, err := tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", s.URL.String()))); err != nil {
		s.Error = null.StringFrom(fmt.Sprintf("Error sending network request: %s", err))
		return
	}

	// We read `buf`-sized chunks and add data to `data`.
	buf := make([]byte, 4096)
	var data []byte
	for {
		n, err := tlsConn.Read(buf)
		if n > 0 {
			data = append(data, buf[:n]...)
		}
		if len(data) > config.CONFIG.MaxResponseSize {
			// Stop right away: continuing would keep downloading a
			// response we are going to discard, and the size error would
			// later be replaced by a header-parsing error.
			s.Error = null.StringFrom(fmt.Sprintf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize))
			return
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			s.Error = null.StringFrom(fmt.Sprintf("Network error: %s", err))
			return
		}
	}

	// Great, response data received.
	if err := processResponse(s, data); err != nil {
		s.Error = null.StringFrom(err.Error())
	}
}
// processResponse fills the snapshot from a raw Gemini response: response
// code, mime type and lang (optional) from the header line, and the body.
//
// An error is returned, with the snapshot left untouched, when the header
// cannot be split off or the status code is not accepted by
// checkGeminiStatusCode. A text/gemini body is converted to valid UTF-8 and
// stored in GemText; any other body is stored unmodified in Data.
func processResponse(snapshot *Snapshot, data []byte) error {
	header, body, err := getHeadersAndData(data)
	if err != nil {
		return err
	}

	code, mimeType, lang := getMimeTypeAndLang(header)
	if statusErr := checkGeminiStatusCode(code); statusErr != nil {
		return statusErr
	}

	snapshot.ResponseCode = null.IntFrom(int64(code))
	snapshot.MimeType = null.StringFrom(mimeType)
	snapshot.Lang = null.StringFrom(lang)

	if mimeType != "text/gemini" {
		snapshot.Data = null.ValueFrom(body)
		return nil
	}

	text, err := EnsureValidUTF8(body)
	if err != nil {
		return fmt.Errorf("UTF-8 error: %w", err)
	}
	snapshot.GemText = null.StringFrom(text)
	return nil
}
// getHeadersAndData splits a raw response into its header, which is the
// first line and should contain the response code, mimeType and language,
// and the rest of the data.
//
// The line ends at the first '\n'; a '\r' directly before it (the CRLF
// terminator) is not part of the returned header. An error is returned when
// data contains no '\n' at all.
func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) {
	firstLineEnds := slices.Index(data, '\n')
	if firstLineEnds == -1 {
		return "", nil, fmt.Errorf("Could not parse response header")
	}
	header := data[:firstLineEnds]
	if n := len(header); n > 0 && header[n-1] == '\r' {
		header = header[:n-1]
	}
	return string(header), data[firstLineEnds+1:], nil
}
// Header parsing patterns, compiled once instead of on every call.
var (
	// geminiHeaderRe matches "<code> <type>/<subtype>" followed by any number
	// of key=value parameters separated by ';' and/or whitespace. Group 3
	// holds the raw parameter text.
	geminiHeaderRe = regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9._+\-]+/[a-zA-Z0-9._+\-/]+)((?:[;\s]+[^;\s=]+=[^;\s]*)*)[;\s]*$`)
	// geminiStatusRe matches only the leading status code.
	geminiStatusRe = regexp.MustCompile(`^(\d+)\s+`)
	// geminiLangRe finds the lang parameter inside the parameter text.
	geminiLangRe = regexp.MustCompile(`(?i)(?:^|[;\s])lang=([a-zA-Z0-9-]+)`)
)

// getMimeTypeAndLang parses code, mime type and language
// from a Gemini header.
// Examples:
// `20 text/gemini lang=en` (code, mimetype, lang)
// `20 text/gemini; charset=utf-8; lang=en` (code, mimetype, lang)
// `20 text/gemini` (code, mimetype)
// `31 gemini://redirected.to/other/site` (code)
//
// Parameters other than lang (such as charset) are accepted and ignored, so
// they no longer make the mime type come back empty. When the text after the
// code is not a mime type, only the code is returned. When no code can be
// parsed at all, the result is (0, "", "").
func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string) {
	if m := geminiHeaderRe.FindStringSubmatch(headers); m != nil {
		parsed, err := strconv.Atoi(m[1])
		if err != nil {
			return 0, "", ""
		}
		if lm := geminiLangRe.FindStringSubmatch(m[3]); lm != nil {
			lang = lm[1]
		}
		return parsed, m[2], lang
	}

	// Try to get code at least.
	m := geminiStatusRe.FindStringSubmatch(headers)
	if m == nil {
		return 0, "", ""
	}
	parsed, err := strconv.Atoi(m[1])
	if err != nil {
		return 0, "", ""
	}
	return parsed, "", ""
}

48
gemini/network_test.go Normal file
View File

@@ -0,0 +1,48 @@
package gemini
import (
"testing"
)
// A header with only code and mime type yields an empty lang.
// Input: `20 text/gemini`
func TestGetMimeTypeAndLang1(t *testing.T) {
	gotCode, gotMime, gotLang := getMimeTypeAndLang("20 text/gemini")
	matches := gotCode == 20 && gotMime == "text/gemini" && gotLang == ""
	if !matches {
		t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", gotCode, gotMime, gotLang)
	}
}
// A trailing newline after the mime type does not affect the result.
func TestGetMimeTypeAndLang11(t *testing.T) {
	gotCode, gotMime, gotLang := getMimeTypeAndLang("20 text/gemini\n")
	matches := gotCode == 20 && gotMime == "text/gemini" && gotLang == ""
	if !matches {
		t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", gotCode, gotMime, gotLang)
	}
}
// A header with code, mime type and lang yields all three values.
func TestGetTypeAndLang2(t *testing.T) {
	gotCode, gotMime, gotLang := getMimeTypeAndLang("20 text/gemini lang=en")
	matches := gotCode == 20 && gotMime == "text/gemini" && gotLang == "en"
	if !matches {
		t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", gotCode, gotMime, gotLang)
	}
}
// A redirect header yields only the code: the redirect target is not a mime
// type.
func TestGetMimeTypeAndLang3(t *testing.T) {
	code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page")
	if code != 31 || mimeType != "" || lang != "" {
		t.Errorf("Expected (31, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}
// A header without a leading status code yields all zero values.
func TestGetMimeTypeAndLang4(t *testing.T) {
	gotCode, gotMime, gotLang := getMimeTypeAndLang("aaafdasdasd")
	matches := gotCode == 0 && gotMime == "" && gotLang == ""
	if !matches {
		t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", gotCode, gotMime, gotLang)
	}
}
// An empty header yields all zero values.
func TestGetMimeTypeAndLang5(t *testing.T) {
	gotCode, gotMime, gotLang := getMimeTypeAndLang("")
	matches := gotCode == 0 && gotMime == "" && gotLang == ""
	if !matches {
		t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", gotCode, gotMime, gotLang)
	}
}

72
gemini/persistence.go Normal file
View File

@@ -0,0 +1,72 @@
package gemini
import (
"fmt"
"gemini-grc/logging"
"os"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx"
)
// ConnectToDB opens a PostgreSQL connection pool through the pgx driver and
// verifies it with a ping. Connection settings are read from the PG_USER,
// PG_PASSWORD, PG_HOST, PG_PORT and PG_DATABASE environment variables. The
// pool is limited to 20 open connections.
//
// It panics when the pool cannot be opened or the ping fails. The panic
// messages name only host, port and database; the connection string itself
// is never printed because it contains the password.
//
// NOTE(review): user and password are inserted into the URL unescaped;
// values containing characters such as '@' or '/' would need escaping.
func ConnectToDB() *sqlx.DB {
	host := os.Getenv("PG_HOST")
	port := os.Getenv("PG_PORT")
	database := os.Getenv("PG_DATABASE")
	connStr := fmt.Sprintf("postgres://%s:%s@%s:%s/%s",
		os.Getenv("PG_USER"),
		os.Getenv("PG_PASSWORD"),
		host,
		port,
		database,
	)

	// Create a connection pool
	db, err := sqlx.Open("pgx", connStr)
	if err != nil {
		panic(fmt.Sprintf("Unable to connect to database %s at %s:%s: %v\n", database, host, port, err))
	}
	db.SetMaxOpenConns(20)

	if err := db.Ping(); err != nil {
		panic(fmt.Sprintf("Unable to ping database: %v\n", err))
	}
	logging.LogDebug("Connected to database")
	return db
}
// SaveSnapshotToDB inserts the snapshot into the snapshots table within tx,
// overwriting every column of an existing row that has the same uid. A
// failure is logged and returned wrapped as a "DB error".
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
	const upsert = `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO UPDATE SET
url = EXCLUDED.url,
host = EXCLUDED.host,
timestamp = EXCLUDED.timestamp,
mimetype = EXCLUDED.mimetype,
data = EXCLUDED.data,
gemtext = EXCLUDED.gemtext,
links = EXCLUDED.links,
lang = EXCLUDED.lang,
response_code = EXCLUDED.response_code,
error = EXCLUDED.error
`
	if _, execErr := tx.NamedExec(upsert, s); execErr != nil {
		logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, execErr)
		return fmt.Errorf("DB error: %w", execErr) // Return the error instead of panicking
	}
	return nil
}
// SaveLinkToDB inserts the snapshot into the snapshots table within tx and
// leaves an existing row with the same uid untouched. A failure is logged
// and returned wrapped as a "DB error".
func SaveLinkToDB(tx *sqlx.Tx, s *Snapshot) error {
	const insert = `
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (uid) DO NOTHING
`
	if _, execErr := tx.NamedExec(insert, s); execErr != nil {
		logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, execErr)
		return fmt.Errorf("DB error: %w", execErr) // Return the error instead of panicking
	}
	return nil
}

33
gemini/processing.go Normal file
View File

@@ -0,0 +1,33 @@
package gemini
import (
"bytes"
"fmt"
"io"
"unicode/utf8"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/transform"
)
// EnsureValidUTF8 returns input as a valid UTF-8 string.
//
// NULL bytes are removed first. If the remainder is already valid UTF-8 it is
// returned unchanged. Otherwise each fallback encoding is tried in order and
// the first successful conversion is returned; an error is returned only
// when every conversion fails.
//
// NOTE(review): the ISO-8859-1 decoder presumably accepts every byte value,
// in which case the later entries are never reached — confirm the intended
// order of the fallbacks.
func EnsureValidUTF8(input []byte) (string, error) {
	// Remove NULL byte 0x00
	inputNoNull := bytes.ReplaceAll(input, []byte{0}, nil)
	if utf8.Valid(inputNoNull) {
		return string(inputNoNull), nil
	}

	encodings := []transform.Transformer{
		charmap.ISO8859_1.NewDecoder(),   // First try ISO8859-1
		charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc
		// TODO: Try more encodings?
	}
	var lastErr error
	for _, encoding := range encodings {
		reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
		result, err := io.ReadAll(reader)
		if err != nil {
			// Remember the failure and fall through to the next encoding.
			lastErr = err
			continue
		}
		return string(result), nil
	}
	return "", fmt.Errorf("UTF-8 error: %w", lastErr)
}

13
gemini/processing_test.go Normal file
View File

@@ -0,0 +1,13 @@
package gemini
import "testing"
// Make sure NULL bytes are removed and no error is reported for input that
// is otherwise valid UTF-8.
func TestEnsureValidUTF8(t *testing.T) {
	// Create a string with a null byte
	strWithNull := "Hello" + string('\x00') + "world"
	result, err := EnsureValidUTF8([]byte(strWithNull))
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if result != "Helloworld" {
		t.Errorf("Expected string without NULL byte, got %s", result)
	}
}

74
gemini/snapshot.go Normal file
View File

@@ -0,0 +1,74 @@
package gemini
import (
"database/sql/driver"
"encoding/json"
"fmt"
"gemini-grc/logging"
"strings"
"github.com/guregu/null/v5"
)
// LinkList is the list of links found in a snapshot. It is stored in the
// database as JSON through the Value and Scan methods.
type LinkList []GeminiUrl

// Value returns the list encoded as JSON.
func (l LinkList) Value() (driver.Value, error) {
	return json.Marshal(l)
}

// Scan populates the list from a database value. A nil value sets the list
// to nil; a []byte or string value is decoded as JSON (both are accepted
// because a driver may deliver JSON in either form). Any other type is an
// error.
func (l *LinkList) Scan(value interface{}) error {
	switch v := value.(type) {
	case nil:
		*l = nil
		return nil
	case []byte:
		return json.Unmarshal(v, l)
	case string:
		return json.Unmarshal([]byte(v), l)
	default:
		return fmt.Errorf("failed to scan LinkList: expected []byte or string, got %T", value)
	}
}
// Snapshot is one row of the snapshots table: a URL together with the result
// of visiting it, if any. The db tags name the table columns used by the
// queries in this package; the json tags name the keys used by
// SnapshotToJSON and SnapshotFromJSON. The null.* fields are unset until a
// value has been stored in them.
type Snapshot struct {
ID int `db:"id" json:"id,omitempty"`
UID string `db:"uid" json:"uid,omitempty"`
URL GeminiUrl `db:"url" json:"url,omitempty"`
Host string `db:"host" json:"host,omitempty"`
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
Links *LinkList `db:"links" json:"links,omitempty"`
Lang null.String `db:"lang" json:"lang,omitempty"`
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
}
// SnapshotToJSON serializes g to tab-indented JSON. A marshalling failure is
// logged and whatever json.MarshalIndent returned is still converted and
// returned.
func SnapshotToJSON(g Snapshot) string {
	encoded, marshalErr := json.MarshalIndent(g, "", "\t")
	if marshalErr != nil {
		logging.LogError("Error serializing to JSON: %w", marshalErr)
	}
	return string(encoded)
}
// SnapshotFromJSON decodes a Snapshot from its JSON representation. A
// decoding failure is logged and the (possibly partially filled) value is
// returned anyway.
func SnapshotFromJSON(input string) Snapshot {
	decoded := Snapshot{}
	if unmarshalErr := json.Unmarshal([]byte(input), &decoded); unmarshalErr != nil {
		logging.LogError("Error deserializing from JSON: %w", unmarshalErr)
	}
	return decoded
}
// ShouldPersistSnapshot reports whether the snapshot has a mime type that is
// stored: text/gemini, or anything starting with "image/" or "text/". A
// snapshot without a mime type is never persisted.
func ShouldPersistSnapshot(result *Snapshot) bool {
	if !result.MimeType.Valid {
		return false
	}
	mime := result.MimeType.String
	switch {
	case mime == "text/gemini",
		strings.HasPrefix(mime, "image/"),
		strings.HasPrefix(mime, "text/"):
		return true
	}
	return false
}

218
gemini/worker.go Normal file
View File

@@ -0,0 +1,218 @@
package gemini
import (
"database/sql"
"fmt"
"gemini-grc/config"
"gemini-grc/logging"
"gemini-grc/uid"
"runtime/debug"
"strings"
"time"
"github.com/guregu/null/v5"
"github.com/jmoiron/sqlx"
)
// SpawnWorkers starts numOfWorkers goroutines. Each one calls runWorker with
// its own index in an endless loop; the function itself returns immediately.
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
	logging.LogInfo("Spawning %d workers", numOfWorkers)

	loop := func(workerID int) {
		for {
			runWorker(workerID, db)
		}
	}
	for id := 0; id < numOfWorkers; id++ {
		go loop(id)
	}
}
// printPoolIPs prints the current content of IpPool to standard output. The
// pool's read lock is held while printing because other goroutines modify
// the map.
func printPoolIPs() {
	IpPool.Lock.RLock()
	defer IpPool.Lock.RUnlock()
	fmt.Printf("%v", IpPool.IPs)
}
// reserveIPs registers all given IPs in IpPool unless at least one of them
// is already present, and reports whether the reservation was made. Check
// and registration happen under one write lock, so two workers can never
// both reserve the same IP.
func reserveIPs(IPs []string) bool {
	IpPool.Lock.Lock()
	defer IpPool.Lock.Unlock()

	for _, ip := range IPs {
		if _, busy := IpPool.IPs[ip]; busy {
			return false
		}
	}
	for _, ip := range IPs {
		logging.LogDebug("Adding %s to pool", ip)
		IpPool.IPs[ip]++
	}
	return true
}

// workOnSnapshot visits the snapshot's URL, processes the response and
// stores the snapshot and any newly found links through tx.
//
// A DNS failure is recorded on the snapshot and saved. If another worker
// currently holds one of the host's IPs in IpPool, the snapshot is left
// untouched and nil is returned after a short pause so it can be picked up
// again later. The host's IPs stay reserved until 5 seconds after the visit
// has finished. Only database failures are returned; they are wrapped with
// the worker id.
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	// Wrap errors with more info.
	defer func() {
		if err != nil {
			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
		}
	}()

	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
		s.Error = null.StringFrom("DNS Resolve error")
		if err = SaveSnapshotToDB(tx, s); err != nil {
			return fmt.Errorf("[%d] DB Error: %w", id, err)
		}
		return nil
	}

	// If the host's ip is in the pool, stop
	// and add the url in the queue later.
	logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL)
	if !reserveIPs(IPs) {
		logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL)
		time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
		return nil
	}

	target := s.URL.String()
	logging.LogDebug("[%d] Dialing %s", id, target)
	Visit(s)
	logging.LogDebug("[%d] Finished dialing.", id)

	// Release the host's IPs after a delay, without blocking this worker.
	go func() {
		time.Sleep(5 * time.Second)
		RemoveIPsFromPool(IPs)
	}()

	if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
		logging.LogDebug("[%d] [%s] Processing", id, target)
		s = ProcessGemini(s)
	}

	logging.LogDebug("[%d] Saving", id)
	if err = SaveSnapshotToDB(tx, s); err != nil {
		return fmt.Errorf("[%d] DB Error: %w", id, err)
	}

	// Store links
	if s.Links != nil {
		for _, link := range *s.Links {
			newSnapshot := Snapshot{UID: uid.UID(), URL: link, Host: link.Hostname, Timestamp: null.TimeFrom(time.Now())}
			if shouldPersistURL(tx, link) {
				logging.LogDebug("[%d] Saving link %s", id, link)
				if err = SaveLinkToDB(tx, &newSnapshot); err != nil {
					return fmt.Errorf("[%d] DB Error: %w", id, err)
				}
			}
		}
	}
	return nil
}
// shouldPersistURL reports whether u is a gemini:// URL that has no row in
// the snapshots table yet. A query failure is printed and treated as "do not
// persist".
func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool {
	if !strings.HasPrefix(u.String(), "gemini://") {
		return false
	}

	var alreadyStored bool
	lookup := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)`
	if queryErr := tx.Get(&alreadyStored, lookup, u.String()); queryErr != nil {
		fmt.Println("Error executing query:", queryErr)
		return false
	}
	return !alreadyStored
}
// GetRandomSnapshot selects one random snapshot that has neither a response
// code nor an error, locking the row and skipping rows locked by others.
// It returns (nil, nil) when no such row exists.
func GetRandomSnapshot(tx *sqlx.Tx) (*Snapshot, error) {
	query := `SELECT * FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
ORDER BY RANDOM()
LIMIT 1
FOR UPDATE SKIP LOCKED`
	// AND (timestamp < NOW() - INTERVAL '1 day' OR timestamp IS NULL)

	var picked Snapshot
	switch queryErr := tx.Get(&picked, query); {
	case queryErr == nil:
		return &picked, nil
	case queryErr == sql.ErrNoRows:
		// No rows were found: not an error for the caller.
		return nil, nil
	default:
		return nil, queryErr
	}
}
// GetRandomSnapshots selects up to config.CONFIG.WorkerBatchSize random
// snapshots that have neither a response code nor an error, locking the rows
// and skipping rows locked by others.
func GetRandomSnapshots(tx *sqlx.Tx) ([]Snapshot, error) {
	query := `
SELECT * FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
ORDER BY RANDOM()
LIMIT $1
FOR UPDATE SKIP LOCKED
`
	var batch []Snapshot
	if queryErr := tx.Select(&batch, query, config.CONFIG.WorkerBatchSize); queryErr != nil {
		return nil, queryErr
	}
	return batch, nil
}
// GetRandomSnapshotsDistinctHosts selects up to
// config.CONFIG.WorkerBatchSize snapshots that have neither a response code
// nor an error, at most one per host (a random one for each host). Unlike
// GetRandomSnapshots, this query does not lock the selected rows.
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
	query := `
SELECT DISTINCT ON (host) *
FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
ORDER BY host, RANDOM()
LIMIT $1
`
	var batch []Snapshot
	if queryErr := tx.Select(&batch, query, config.CONFIG.WorkerBatchSize); queryErr != nil {
		return nil, queryErr
	}
	return batch, nil
}
// runWorker performs one unit of work for worker id: it opens a transaction,
// fetches a batch of unvisited snapshots (one per host), works on each of
// them and commits.
//
// When the transaction cannot be started or the batch cannot be fetched it
// waits 10 seconds and returns; when there is nothing to visit it waits one
// minute. Blacklisted URLs are not visited: they are stored with an error so
// they are not selected again. A database failure while working on a
// snapshot still panics (see TODO below).
func runWorker(id int, db *sqlx.DB) {
	// Start the transaction. Without one there is nothing we can do, and
	// using the nil tx below would crash.
	tx, err := db.Beginx()
	if err != nil {
		logging.LogError("[%d] Failed to begin transaction: %v", id, err)
		time.Sleep(10 * time.Second)
		return
	}
	defer func() {
		if err := tx.Commit(); err != nil {
			logging.LogError("[%d] Failed to commit transaction: %v", id, err)
			// Best effort: the transaction is already finished or
			// broken, so the rollback result adds no information.
			_ = tx.Rollback()
		}
	}()

	snapshots, err := GetRandomSnapshotsDistinctHosts(tx)
	if err != nil {
		logging.LogError("[%d] Error retrieving snapshot: %v", id, err)
		time.Sleep(10 * time.Second)
		return
	} else if len(snapshots) == 0 {
		logging.LogInfo("[%d] No remaining snapshots to visit.", id)
		time.Sleep(1 * time.Minute)
		return
	}

	// TODO Remove panic and gracefully handle/log error
	fail := func(s *Snapshot, err error) {
		logging.LogError("[%d] [%s] Error %v", id, s.URL, err)
		fmt.Printf("Error %s Stack trace:\n%s", err, debug.Stack())
		panic("ERROR encountered")
	}

	total := len(snapshots)
	for i := range snapshots {
		s := &snapshots[i]

		if InBlacklist(s) {
			logging.LogWarn("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
			// Record the decision; otherwise the row keeps matching the
			// "unvisited" query and is selected over and over.
			s.Error = null.StringFrom("URL is blacklisted")
			if err := SaveSnapshotToDB(tx, s); err != nil {
				fail(s, err)
			}
			continue
		}

		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
		if err := workOnSnapshot(id, tx, s); err != nil {
			fail(s, err)
		}
		if s.Error.Valid {
			logging.LogWarn("[%d] [%s] Error: %v", id, s.URL, s.Error.String)
		}
		logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
	}
	logging.LogInfo("[%d] Worker done.", id)
}