Lots of features, first version that reliably crawls Geminispace.
- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host
- [x] URL Blacklist
- [x] Save image/* and text/* files
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
. . .
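For context, a minimal sketch of how the pieces added below could be wired together. This is not part of the commit; the `main` package and the worker count are assumptions, while `ConnectToDB` and `SpawnWorkers` come from `gemini/persistence.go` and `gemini/worker.go`.

```go
package main

import "gemini-grc/gemini"

func main() {
	// ConnectToDB reads PG_USER, PG_PASSWORD, PG_HOST, PG_PORT and PG_DATABASE
	// from the environment (see gemini/persistence.go).
	db := gemini.ConnectToDB()
	// Worker count is hypothetical here; the real value comes from the config package.
	gemini.SpawnWorkers(4, db)
	select {} // block forever while the workers crawl
}
```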
gemini/blacklist.go (new file, 18 lines)
@@ -0,0 +1,18 @@
package gemini

import "strings"

var Blacklist *[]string

func InBlacklist(s *Snapshot) bool {
	if Blacklist == nil {
		data := ReadLines("blacklist.txt")
		Blacklist = &data
	}
	for _, l := range *Blacklist {
		if strings.HasPrefix(s.URL.String(), l) {
			return true
		}
	}
	return false
}
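A hedged usage sketch: `ReadLines` loads `blacklist.txt` as one URL prefix per line, and `InBlacklist` matches snapshots against those prefixes. The URL and file contents below are made-up examples, and the sketch assumes a `blacklist.txt` exists next to the binary (`ReadLines` panics otherwise).

```go
// blacklist.txt might contain, one prefix per line:
//   gemini://example.org/spam/
//   gemini://flooder.example.net/
u, _ := ParseUrl("gemini://example.org/spam/page.gmi", "")
s := &Snapshot{URL: *u}
blocked := InBlacklist(s) // true if blacklist.txt lists the prefix "gemini://example.org/spam/"
_ = blocked
```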
gemini/connectionPool.go (new file, 32 lines)
@@ -0,0 +1,32 @@
package gemini

import (
	"gemini-grc/logging"
)

var IpPool IpAddressPool = IpAddressPool{IPs: make(map[string]int)}

func AddIPsToPool(IPs []string) {
	IpPool.Lock.Lock()
	for _, ip := range IPs {
		logging.LogDebug("Adding %s to pool", ip)
		IpPool.IPs[ip]++
	}
	IpPool.Lock.Unlock()
}

func RemoveIPsFromPool(IPs []string) {
	IpPool.Lock.Lock()
	for _, ip := range IPs {
		_, ok := IpPool.IPs[ip]
		if ok {
			logging.LogDebug("Removing %s from pool", ip)
			if IpPool.IPs[ip] == 1 {
				delete(IpPool.IPs, ip)
			} else {
				IpPool.IPs[ip]--
			}
		}
	}
	IpPool.Lock.Unlock()
}
gemini/files.go (new file, 113 lines)
@@ -0,0 +1,113 @@
package gemini

import (
	"fmt"
	"gemini-grc/logging"
	"net/url"
	"os"
	"path"
	"path/filepath"
	"strings"
)

// sanitizePath encodes invalid filesystem characters using URL encoding.
// Example:
// /example/path/to/page?query=param&another=value
// would become
// example/path/to/page%3Fquery%3Dparam%26another%3Dvalue
func sanitizePath(p string) string {
	// Split the path into its components
	components := strings.Split(p, "/")

	// Encode each component separately
	for i, component := range components {
		// Decode any existing percent-encoded characters
		decodedComponent, err := url.PathUnescape(component)
		if err != nil {
			decodedComponent = component // Fallback to original if unescape fails
		}

		// Encode the component to escape invalid filesystem characters
		encodedComponent := url.QueryEscape(decodedComponent)

		// Replace '+' (from QueryEscape) with '%20' to handle spaces correctly
		encodedComponent = strings.ReplaceAll(encodedComponent, "+", "%20")

		components[i] = encodedComponent
	}

	// Rejoin the components into a sanitized path
	safe := filepath.Join(components...)

	return safe
}

// calcFilePath constructs a safe file path from the root path and URL path.
// It URL-encodes invalid filesystem characters to ensure the path is valid.
func calcFilePath(rootPath, urlPath string) (string, error) {
	// Normalize the URL path
	cleanPath := filepath.Clean(urlPath)

	// Safety check to prevent directory traversal
	if strings.Contains(cleanPath, "..") {
		return "", fmt.Errorf("Invalid URL path: contains directory traversal")
	}

	// Sanitize the path by encoding invalid characters
	safePath := sanitizePath(cleanPath)

	// Join the root path and the sanitized URL path
	finalPath := filepath.Join(rootPath, safePath)

	return finalPath, nil
}

func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
	parentPath := path.Join(rootPath, s.URL.Hostname)
	urlPath := s.URL.Path
	// If path is empty, add `index.gmi` as the file to save
	if urlPath == "" || urlPath == "." {
		urlPath = "index.gmi"
	}
	// If path ends with '/' then add index.gmi for the
	// directory to be created.
	if strings.HasSuffix(urlPath, "/") {
		urlPath = strings.Join([]string{urlPath, "index.gmi"}, "")
	}

	finalPath, err := calcFilePath(parentPath, urlPath)
	if err != nil {
		logging.LogError("Error saving %s: %v", s.URL, err)
		return
	}
	// Ensure the directory exists
	dir := filepath.Dir(finalPath)
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		logging.LogError("Failed to create directory: %v", err)
		return
	}
	// text/gemini content lives in GemText; everything else is raw bytes in Data.
	if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
		err = os.WriteFile(finalPath, []byte(s.GemText.String), 0666)
	} else {
		err = os.WriteFile(finalPath, s.Data.V, 0666)
	}
	if err != nil {
		logging.LogError("Error saving %s: %v", s.URL.Full, err)
	}
	close(done)
}

func ReadLines(path string) []string {
	data, err := os.ReadFile(path)
	if err != nil {
		panic(fmt.Sprintf("Failed to read blacklist file: %s", err))
	}
	lines := strings.Split(string(data), "\n")
	// Remove last line if empty
	// (happens when file ends with '\n')
	if lines[len(lines)-1] == "" {
		lines = lines[:len(lines)-1]
	}
	logging.LogInfo("Loaded %d blacklist URLs", len(lines))
	return lines
}
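Illustrative expectations for the path helpers above, based on the behaviour of `calcFilePath` and `sanitizePath`; the root path and URL paths are placeholders, not values from this commit.

```go
p, err := calcFilePath("snapshots/example.org", "/page?query=1")
// p == "snapshots/example.org/page%3Fquery%3D1", err == nil

_, err = calcFilePath("snapshots/example.org", "../secret")
// err != nil: paths still containing ".." after Clean are rejected
```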
gemini/gemini.go (new file, 186 lines)
@@ -0,0 +1,186 @@
package gemini

import (
	"errors"
	"fmt"
	"gemini-grc/logging"
	"net/url"
	"regexp"
	"strconv"
	"strings"
)

func isGeminiURL(input string) bool {
	_, err := url.Parse(input)
	if err != nil {
		logging.LogWarn("[%s] Invalid URL: %v", input, err)
		return false
	}
	return strings.HasPrefix(input, "gemini://")
}

func parseLinks(s Snapshot, queue chan string) {
	for _, link := range *s.Links {
		if strings.HasPrefix(link.Full, "gemini://") {
			go func(link GeminiUrl) {
				// fmt.Printf("LINK: %s\n", link)
				queue <- link.Full
			}(link)
		}
	}
}

func checkGeminiStatusCode(code int) error {
	switch {
	case code == 20:
		return nil
	case code >= 10 && code < 20:
		return fmt.Errorf("Gemini response %d needs data input", code)
	case code >= 30 && code < 40:
		return fmt.Errorf("Gemini response %d redirect", code)
	case code >= 40 && code < 50:
		return fmt.Errorf("Gemini response %d server error", code)
	case code >= 50 && code < 60:
		return fmt.Errorf("Gemini response %d server permanent error", code)
	case code >= 60 && code < 70:
		return fmt.Errorf("Gemini response %d certificate error", code)
	default:
		return fmt.Errorf("Unexpected/unhandled Gemini response %d", code)
	}
}

func ProcessGemini(snapshot *Snapshot) *Snapshot {
	// Grab link lines
	linkLines := ExtractLinkLines(snapshot.GemText.String)
	logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines))

	// Normalize URLs in links, and store them in snapshot
	for _, line := range linkLines {
		normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String())
		if err != nil {
			logging.LogWarn("Cannot normalize URL in line '%s': %v", line, err)
			continue
		}
		geminiUrl, err := ParseUrl(normalizedLink, descr)
		if err != nil {
			logging.LogWarn("Cannot parse URL in link '%s': %v", line, err)
			continue
		}
		if snapshot.Links == nil {
			snapshot.Links = &LinkList{*geminiUrl}
		} else {
			*snapshot.Links = append(*snapshot.Links, *geminiUrl)
		}
	}
	return snapshot
}

func ParseUrl(input string, descr string) (*GeminiUrl, error) {
	u, err := url.Parse(input)
	if err != nil {
		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
	}
	protocol := u.Scheme
	hostname := u.Hostname()
	str_port := u.Port()
	path := u.Path
	if str_port == "" {
		str_port = "1965"
	}
	port, err := strconv.Atoi(str_port)
	if err != nil {
		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
	}
	return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
}

// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
func ExtractLinkLines(gemtext string) []string {
	// Define the regular expression pattern to match link lines
	re := regexp.MustCompile(`(?m)^=>[ \t]+.*`)

	// Find all matches using the regular expression
	matches := re.FindAllString(gemtext, -1)

	return matches
}

// NormalizeLink takes a single link line and the current URL,
// and returns the URL converted to an absolute URL
// along with its description.
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
	// Parse the current URL
	baseURL, err := url.Parse(currentURL)
	if err != nil {
		return "", "", fmt.Errorf("Invalid current URL: %v", err)
	}

	// Regular expression to extract the URL part from a link line
	re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)

	// Use regex to extract the URL and the rest of the line
	matches := re.FindStringSubmatch(linkLine)
	if len(matches) == 0 {
		// If the line doesn't match the expected format, return an error
		return "", "", fmt.Errorf("Not a link line: %v", linkLine)
	}

	originalURLStr := matches[1]
	_, err = url.QueryUnescape(originalURLStr)
	if err != nil {
		return "", "", fmt.Errorf("Error decoding URL: %w", err)
	}

	restOfLine := ""
	if len(matches) > 2 {
		restOfLine = matches[2]
	}

	// Parse the URL from the link line
	parsedURL, err := url.Parse(originalURLStr)
	if err != nil {
		// If URL parsing fails, return an error
		return "", "", fmt.Errorf("Invalid URL '%s': %v", originalURLStr, err)
	}

	// Resolve relative URLs against the base URL
	if !parsedURL.IsAbs() {
		parsedURL = baseURL.ResolveReference(parsedURL)
	}

	// Construct the canonicalized link line
	canonicalURLStr := parsedURL.String()

	// Remove usual first space from URL description:
	// => URL description
	//        ^^^^^^^^^^^
	if len(restOfLine) > 0 && restOfLine[0] == ' ' {
		restOfLine = restOfLine[1:]
	}

	return canonicalURLStr, restOfLine, nil
	// canonicalizedLine := fmt.Sprintf("=> %s%s", canonicalURLStr, restOfLine)
	// return canonicalizedLine, nil
}

// ParseFirstTwoDigits takes a string and returns the first one or two digits as an int.
// If no valid digits are found, it returns an error.
func ParseFirstTwoDigits(input string) (int, error) {
	// Define the regular expression pattern to match one or two leading digits
	re := regexp.MustCompile(`^(\d{1,2})`)

	// Find the first match in the string
	matches := re.FindStringSubmatch(input)
	if len(matches) == 0 {
		return 0, errors.New("no digits found at the beginning of the string")
	}

	// Parse the captured match as an integer
	code, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0, fmt.Errorf("failed to convert matched digits to int: %v", err)
	}

	return code, nil
}
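A short sketch of how the link helpers above compose; the gemtext and URLs are made-up examples, not data from this commit.

```go
gemtext := "=> /docs/faq.gmi FAQ\n=> gemini://other.example/ Other capsule\n"
for _, line := range ExtractLinkLines(gemtext) {
	link, descr, err := NormalizeLink(line, "gemini://example.org/index.gmi")
	if err != nil {
		continue
	}
	// The first line resolves to "gemini://example.org/docs/faq.gmi" with description "FAQ".
	u, _ := ParseUrl(link, descr) // u.Port defaults to 1965 when the URL carries no port
	_ = u
}
```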
gemini/gemini_url.go (new file, 57 lines)
@@ -0,0 +1,57 @@
package gemini

import (
	"encoding/json"
	"fmt"
	"gemini-grc/logging"
)

type GeminiUrl struct {
	Protocol string `json:"protocol,omitempty"`
	Hostname string `json:"hostname,omitempty"`
	Port     int    `json:"port,omitempty"`
	Path     string `json:"path,omitempty"`
	Descr    string `json:"descr,omitempty"`
	Full     string `json:"full,omitempty"`
}

func (g *GeminiUrl) Scan(value interface{}) error {
	if value == nil {
		// Clear the fields in the current GeminiUrl object (not the pointer itself)
		*g = GeminiUrl{}
		return nil
	}
	b, ok := value.(string)
	if !ok {
		return fmt.Errorf("failed to scan GeminiUrl: expected string, got %T", value)
	}
	parsedUrl, err := ParseUrl(b, "")
	if err != nil {
		return err
	}
	*g = *parsedUrl
	return nil
}

func (u GeminiUrl) String() string {
	return u.Full
	// return fmt.Sprintf("%s://%s:%d%s", u.Protocol, u.Hostname, u.Port, u.Path)
}

func GeminiUrltoJSON(g GeminiUrl) string {
	// Serialize the GeminiUrl struct to JSON
	jsonData, err := json.Marshal(g)
	if err != nil {
		logging.LogError("Error serializing to JSON: %v", err)
	}
	return string(jsonData)
}

func GeminiUrlFromJSON(input string) GeminiUrl {
	var geminiUrl GeminiUrl
	err := json.Unmarshal([]byte(input), &geminiUrl)
	if err != nil {
		logging.LogError("Error deserializing from JSON: %v", err)
	}
	return geminiUrl
}
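A hedged round-trip sketch for the JSON helpers; the URL is a placeholder, and the exact output depends on which fields are set since every field is tagged `omitempty`.

```go
g := GeminiUrl{Protocol: "gemini", Hostname: "example.org", Port: 1965, Path: "/", Full: "gemini://example.org/"}
s := GeminiUrltoJSON(g)
// s is roughly: {"protocol":"gemini","hostname":"example.org","port":1965,"path":"/","full":"gemini://example.org/"}
back := GeminiUrlFromJSON(s)
// back.Hostname == "example.org", back.Port == 1965
```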
gemini/ip-address-pool.go (new file, 54 lines)
@@ -0,0 +1,54 @@
package gemini

import "sync"

// IpAddressPool is used to limit requests per IP address.
// It maps an IP address to its number of active connections.
type IpAddressPool struct {
	IPs  map[string]int
	Lock sync.RWMutex
}

func (p *IpAddressPool) Set(key string, value int) {
	p.Lock.Lock()         // Lock for writing
	defer p.Lock.Unlock() // Ensure mutex is unlocked after the write
	p.IPs[key] = value
}

func (p *IpAddressPool) Get(key string) int {
	p.Lock.RLock()         // Lock for reading
	defer p.Lock.RUnlock() // Ensure mutex is unlocked after reading
	if value, ok := p.IPs[key]; !ok {
		return 0
	} else {
		return value
	}
}

func (p *IpAddressPool) Delete(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	delete(p.IPs, key)
}

func (p *IpAddressPool) Incr(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	if _, ok := p.IPs[key]; !ok {
		p.IPs[key] = 1
	} else {
		p.IPs[key] = p.IPs[key] + 1
	}
}

func (p *IpAddressPool) Decr(key string) {
	p.Lock.Lock()
	defer p.Lock.Unlock()
	if val, ok := p.IPs[key]; ok {
		p.IPs[key] = val - 1
		if p.IPs[key] == 0 {
			delete(p.IPs, key)
		}
	}
}
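A usage sketch of the per-IP counters above; the address is a placeholder.

```go
pool := IpAddressPool{IPs: make(map[string]int)}
pool.Incr("203.0.113.7")     // first connection to this address
pool.Incr("203.0.113.7")     // second connection
n := pool.Get("203.0.113.7") // n == 2
pool.Decr("203.0.113.7")
pool.Decr("203.0.113.7")     // count reaches 0 and the entry is deleted
_ = n
```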
gemini/network.go (new file, 192 lines)
@@ -0,0 +1,192 @@
package gemini

import (
	"crypto/tls"
	"fmt"
	"gemini-grc/config"
	"io"
	"net"
	"regexp"
	"slices"
	"strconv"
	"time"

	"github.com/guregu/null/v5"
)

// Resolve the URL hostname and
// check if we already have an open
// connection to this host.
// If we can connect, return a list
// of the resolved IPs.
func getHostIPAddresses(hostname string) ([]string, error) {
	addrs, err := net.LookupHost(hostname)
	if err != nil {
		return nil, err
	}
	IpPool.Lock.RLock()
	defer func() {
		IpPool.Lock.RUnlock()
	}()
	return addrs, nil
}

// Connect to given URL, using the Gemini protocol.
// Return a Snapshot with the data or the error.
// Any errors are stored within the snapshot.
func Visit(s *Snapshot) {
	// Establish the underlying TCP connection.
	host := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
	dialer := &net.Dialer{
		Timeout:   time.Duration(config.CONFIG.ResponseTimeout) * time.Second, // Set the overall connection timeout
		KeepAlive: 30 * time.Second,
	}
	conn, err := dialer.Dial("tcp", host)
	if err != nil {
		s.Error = null.StringFrom(fmt.Sprintf("TCP connection failed: %v", err))
		return
	}
	// Make sure we always close the connection.
	defer func() {
		err := conn.Close()
		if err != nil {
			s.Error = null.StringFrom(fmt.Sprintf("Error closing connection: %s", err))
		}
	}()

	// Set read and write timeouts on the TCP connection.
	err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
	if err != nil {
		s.Error = null.StringFrom(fmt.Sprintf("Error setting connection deadline: %s", err))
		return
	}
	err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
	if err != nil {
		s.Error = null.StringFrom(fmt.Sprintf("Error setting connection deadline: %s", err))
		return
	}

	// Perform the TLS handshake
	tlsConfig := &tls.Config{
		InsecureSkipVerify: true,           // Accept all TLS certs, even if insecure.
		ServerName:         s.URL.Hostname, // SNI
		// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning: breaks a lot of sites.
	}
	tlsConn := tls.Client(conn, tlsConfig)
	if err := tlsConn.Handshake(); err != nil {
		s.Error = null.StringFrom(fmt.Sprintf("TLS handshake error: %v", err))
		return
	}

	// We read `buf`-sized chunks and add data to `data`.
	buf := make([]byte, 4096)
	var data []byte

	// Send Gemini request to trigger server response.
	_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", s.URL.String())))
	if err != nil {
		s.Error = null.StringFrom(fmt.Sprintf("Error sending network request: %s", err))
		return
	}
	// Read response bytes in len(buf) byte chunks
	for {
		n, err := tlsConn.Read(buf)
		if n > 0 {
			data = append(data, buf[:n]...)
		}
		if len(data) > config.CONFIG.MaxResponseSize {
			data = []byte{}
			s.Error = null.StringFrom(fmt.Sprintf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize))
		}
		if err != nil {
			if err == io.EOF {
				break
			} else {
				s.Error = null.StringFrom(fmt.Sprintf("Network error: %s", err))
				return
			}
		}
	}
	// Great, response data received.
	err = processResponse(s, data)
	if err != nil {
		s.Error = null.StringFrom(err.Error())
	}
}

// Update given snapshot with the
// Gemini header data: response code,
// mime type and lang (optional)
func processResponse(snapshot *Snapshot, data []byte) error {
	headers, body, err := getHeadersAndData(data)
	if err != nil {
		return err
	}
	code, mimeType, lang := getMimeTypeAndLang(headers)
	geminiError := checkGeminiStatusCode(code)
	if geminiError != nil {
		return geminiError
	}
	snapshot.ResponseCode = null.IntFrom(int64(code))
	snapshot.MimeType = null.StringFrom(mimeType)
	snapshot.Lang = null.StringFrom(lang)
	// If we've got a Gemini document, populate
	// `GemText` field, otherwise raw data goes to `Data`.
	if mimeType == "text/gemini" {
		validBody, err := EnsureValidUTF8(body)
		if err != nil {
			return fmt.Errorf("UTF-8 error: %w", err)
		}
		snapshot.GemText = null.StringFrom(validBody)
	} else {
		snapshot.Data = null.ValueFrom(body)
	}
	return nil
}

// Checks for a Gemini header, which is
// basically the first line of the response
// and should contain the response code,
// mimeType and language.
func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) {
	firstLineEnds := slices.Index(data, '\n')
	if firstLineEnds == -1 {
		return "", nil, fmt.Errorf("Could not parse response header")
	}
	firstLine = string(data[:firstLineEnds])
	rest = data[firstLineEnds+1:]
	return firstLine, rest, nil
}

// Parses code, mime type and language
// from a Gemini header.
// Examples:
// `20 text/gemini lang=en` (code, mimetype, lang)
// `20 text/gemini` (code, mimetype)
// `31 gemini://redirected.to/other/site` (code)
func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string) {
	// Regex that parses code, mimetype & lang
	re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`)
	matches := re.FindStringSubmatch(headers)
	if matches == nil || len(matches) <= 1 {
		// Try to get code at least.
		re := regexp.MustCompile(`^(\d+)\s+`)
		matches := re.FindStringSubmatch(headers)
		if matches == nil || len(matches) <= 1 {
			return 0, "", ""
		}
		code, err := strconv.Atoi(matches[1])
		if err != nil {
			return 0, "", ""
		}
		return code, "", ""
	}
	code, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0, "", ""
	}
	mimeType = matches[2]
	lang = matches[4]
	return code, mimeType, lang
}
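A minimal sketch of fetching a single URL with `Visit`. It assumes `config.CONFIG` has been initialized (it supplies `ResponseTimeout` and `MaxResponseSize`), and the URL is a placeholder.

```go
u, _ := ParseUrl("gemini://example.org/", "")
s := &Snapshot{URL: *u, Host: u.Hostname}
Visit(s)
// On success, s.ResponseCode, s.MimeType and s.GemText (or s.Data) are populated;
// on failure, s.Error carries the error message.
```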
gemini/network_test.go (new file, 48 lines)
@@ -0,0 +1,48 @@
package gemini

import (
	"testing"
)

// Test for input: `20 text/gemini`
func TestGetMimeTypeAndLang1(t *testing.T) {
	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini")
	if code != 20 || mimeType != "text/gemini" || lang != "" {
		t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}

func TestGetMimeTypeAndLang11(t *testing.T) {
	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n")
	if code != 20 || mimeType != "text/gemini" || lang != "" {
		t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}

func TestGetMimeTypeAndLang2(t *testing.T) {
	code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
	if code != 20 || mimeType != "text/gemini" || lang != "en" {
		t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}

func TestGetMimeTypeAndLang3(t *testing.T) {
	code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page")
	if code != 31 || mimeType != "" || lang != "" {
		t.Errorf("Expected (31, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}

func TestGetMimeTypeAndLang4(t *testing.T) {
	code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd")
	if code != 0 || mimeType != "" || lang != "" {
		t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}

func TestGetMimeTypeAndLang5(t *testing.T) {
	code, mimeType, lang := getMimeTypeAndLang("")
	if code != 0 || mimeType != "" || lang != "" {
		t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
	}
}
gemini/persistence.go (new file, 72 lines)
@@ -0,0 +1,72 @@
package gemini

import (
	"fmt"
	"gemini-grc/logging"
	"os"

	_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
	"github.com/jmoiron/sqlx"
)

func ConnectToDB() *sqlx.DB {
	connStr := fmt.Sprintf("postgres://%s:%s@%s:%s/%s",
		os.Getenv("PG_USER"),
		os.Getenv("PG_PASSWORD"),
		os.Getenv("PG_HOST"),
		os.Getenv("PG_PORT"),
		os.Getenv("PG_DATABASE"),
	)

	// Create a connection pool
	db, err := sqlx.Open("pgx", connStr)
	if err != nil {
		panic(fmt.Sprintf("Unable to connect to database with URL %s: %v\n", connStr, err))
	}
	db.SetMaxOpenConns(20)
	err = db.Ping()
	if err != nil {
		panic(fmt.Sprintf("Unable to ping database: %v\n", err))
	}

	logging.LogDebug("Connected to database")
	return db
}

func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
	query := `
        INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
        VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
        ON CONFLICT (uid) DO UPDATE SET
            url = EXCLUDED.url,
            host = EXCLUDED.host,
            timestamp = EXCLUDED.timestamp,
            mimetype = EXCLUDED.mimetype,
            data = EXCLUDED.data,
            gemtext = EXCLUDED.gemtext,
            links = EXCLUDED.links,
            lang = EXCLUDED.lang,
            response_code = EXCLUDED.response_code,
            error = EXCLUDED.error
    `
	_, err := tx.NamedExec(query, s)
	if err != nil {
		logging.LogError("[%s] [%s] Error upserting snapshot: %v", s.URL, s.MimeType.String, err)
		return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
	}
	return nil
}

func SaveLinkToDB(tx *sqlx.Tx, s *Snapshot) error {
	query := `
        INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
        VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
        ON CONFLICT (uid) DO NOTHING
    `
	_, err := tx.NamedExec(query, s)
	if err != nil {
		logging.LogError("[%s] [%s] Error inserting link snapshot: %v", s.URL, s.MimeType.String, err)
		return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
	}
	return nil
}
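`ConnectToDB` builds its connection string from the `PG_*` environment variables, and the save helpers expect an open `sqlx` transaction. A hedged sketch (the URL, UID and environment values are placeholders):

```go
// Expected environment (placeholder values):
//   PG_USER=gemini PG_PASSWORD=secret PG_HOST=localhost PG_PORT=5432 PG_DATABASE=gemini
db := ConnectToDB()
tx, err := db.Beginx()
if err != nil {
	panic(err)
}
u, _ := ParseUrl("gemini://example.org/", "")
s := &Snapshot{UID: "example-uid", URL: *u, Host: u.Hostname}
if err := SaveSnapshotToDB(tx, s); err != nil {
	_ = tx.Rollback()
} else {
	_ = tx.Commit()
}
```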
gemini/processing.go (new file, 33 lines)
@@ -0,0 +1,33 @@
package gemini

import (
	"bytes"
	"fmt"
	"io"
	"unicode/utf8"

	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/transform"
)

func EnsureValidUTF8(input []byte) (string, error) {
	// Remove NULL byte 0x00
	inputNoNull := bytes.ReplaceAll(input, []byte{0}, nil)
	isValidUTF8 := utf8.Valid(inputNoNull)
	if !isValidUTF8 {
		encodings := []transform.Transformer{
			charmap.ISO8859_1.NewDecoder(),   // First try ISO8859-1
			charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc
			// TODO: Try more encodings?
		}
		for _, encoding := range encodings {
			reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
			result, err := io.ReadAll(reader)
			if err != nil {
				return "", fmt.Errorf("UTF-8 error: %w", err)
			}
			return string(result), nil
		}
	}
	return string(inputNoNull), nil
}
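An illustrative call, assuming the input is ISO 8859-1 encoded (the lone byte 0xE9 is not valid UTF-8 on its own):

```go
raw := []byte{'c', 'a', 'f', 0xE9} // "café" in ISO 8859-1
s, err := EnsureValidUTF8(raw)
// s == "café", err == nil: the ISO 8859-1 decoder maps 0xE9 to 'é'
```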
gemini/processing_test.go (new file, 13 lines)
@@ -0,0 +1,13 @@
package gemini

import "testing"

// Make sure NULL bytes are removed
func TestEnsureValidUTF8(t *testing.T) {
	// Create a string with a null byte
	strWithNull := "Hello" + string('\x00') + "world"
	result, _ := EnsureValidUTF8([]byte(strWithNull))
	if result != "Helloworld" {
		t.Errorf("Expected string without NULL byte, got %s", result)
	}
}
gemini/snapshot.go (new file, 74 lines)
@@ -0,0 +1,74 @@
package gemini

import (
	"database/sql/driver"
	"encoding/json"
	"fmt"
	"gemini-grc/logging"
	"strings"

	"github.com/guregu/null/v5"
)

type LinkList []GeminiUrl

func (l LinkList) Value() (driver.Value, error) {
	return json.Marshal(l)
}

func (l *LinkList) Scan(value interface{}) error {
	if value == nil {
		*l = nil
		return nil
	}
	b, ok := value.([]byte) // Type assertion! Converts to []byte
	if !ok {
		return fmt.Errorf("failed to scan LinkList: expected []byte, got %T", value)
	}
	return json.Unmarshal(b, l)
}

type Snapshot struct {
	ID           int                `db:"id" json:"id,omitempty"`
	UID          string             `db:"uid" json:"uid,omitempty"`
	URL          GeminiUrl          `db:"url" json:"url,omitempty"`
	Host         string             `db:"host" json:"host,omitempty"`
	Timestamp    null.Time          `db:"timestamp" json:"timestamp,omitempty"`
	MimeType     null.String        `db:"mimetype" json:"mimetype,omitempty"`
	Data         null.Value[[]byte] `db:"data" json:"data,omitempty"`       // For non text/gemini files.
	GemText      null.String        `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
	Links        *LinkList          `db:"links" json:"links,omitempty"`
	Lang         null.String        `db:"lang" json:"lang,omitempty"`
	ResponseCode null.Int           `db:"response_code" json:"code,omitempty"` // Gemini response status code.
	Error        null.String        `db:"error" json:"error,omitempty"`        // On network errors only
}

func SnapshotToJSON(g Snapshot) string {
	// Serialize the Snapshot struct to JSON
	jsonData, err := json.MarshalIndent(g, "", "\t")
	if err != nil {
		logging.LogError("Error serializing to JSON: %v", err)
	}
	return string(jsonData)
}

func SnapshotFromJSON(input string) Snapshot {
	var snapshot Snapshot
	err := json.Unmarshal([]byte(input), &snapshot)
	if err != nil {
		logging.LogError("Error deserializing from JSON: %v", err)
	}
	return snapshot
}

func ShouldPersistSnapshot(result *Snapshot) bool {
	if !result.MimeType.Valid {
		return false
	}
	if result.MimeType.String == "text/gemini" ||
		strings.HasPrefix(result.MimeType.String, "image/") ||
		strings.HasPrefix(result.MimeType.String, "text/") {
		return true
	}
	return false
}
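A quick sketch of the persistence filter; the MIME types are examples.

```go
img := Snapshot{MimeType: null.StringFrom("image/png")}
pdf := Snapshot{MimeType: null.StringFrom("application/pdf")}
_ = ShouldPersistSnapshot(&img) // true: image/* (and text/*) content is stored
_ = ShouldPersistSnapshot(&pdf) // false: other MIME types are skipped
```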
gemini/worker.go (new file, 218 lines)
@@ -0,0 +1,218 @@
package gemini

import (
	"database/sql"
	"fmt"
	"gemini-grc/config"
	"gemini-grc/logging"
	"gemini-grc/uid"
	"runtime/debug"
	"strings"
	"time"

	"github.com/guregu/null/v5"
	"github.com/jmoiron/sqlx"
)

func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
	logging.LogInfo("Spawning %d workers", numOfWorkers)
	for i := 0; i < numOfWorkers; i++ {
		go func(i int) {
			for {
				runWorker(i, db)
			}
		}(i)
	}
}

func printPoolIPs() {
	fmt.Printf("%v", IpPool.IPs)
}

func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	// Wrap errors with more info.
	defer func() {
		if err != nil {
			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
		}
	}()

	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
		s.Error = null.StringFrom("DNS Resolve error")
		err = SaveSnapshotToDB(tx, s)
		if err != nil {
			return fmt.Errorf("[%d] DB Error: %w", id, err)
		}
		return nil
	}

	// If the host's IP is in the pool, stop
	// and add the URL to the queue later.
	IpPool.Lock.RLock()
	logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL)
	for _, ip := range IPs {
		_, ok := IpPool.IPs[ip]
		if ok {
			logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL)
			IpPool.Lock.RUnlock()
			time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
			return nil
		}
	}
	IpPool.Lock.RUnlock()

	AddIPsToPool(IPs)

	url := s.URL.String()
	logging.LogDebug("[%d] Dialing %s", id, url)
	Visit(s)
	logging.LogDebug("[%d] Finished dialing.", id)

	go func() {
		time.Sleep(5 * time.Second)
		RemoveIPsFromPool(IPs)
	}()

	if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
		logging.LogDebug("[%d] [%s] Processing", id, url)
		s = ProcessGemini(s)
	}
	logging.LogDebug("[%d] Saving", id)
	err = SaveSnapshotToDB(tx, s)
	if err != nil {
		return fmt.Errorf("[%d] DB Error: %w", id, err)
	}

	// Store links
	if s.Links != nil {
		for _, link := range *s.Links {
			newSnapshot := Snapshot{UID: uid.UID(), URL: link, Host: link.Hostname, Timestamp: null.TimeFrom(time.Now())}
			if shouldPersistURL(tx, link) {
				logging.LogDebug("[%d] Saving link %s", id, link)
				err = SaveLinkToDB(tx, &newSnapshot)
				if err != nil {
					return fmt.Errorf("[%d] DB Error: %w", id, err)
				}
			}
		}
	}
	return nil
}

func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool {
	if !strings.HasPrefix(u.String(), "gemini://") {
		return false
	}
	query := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)`
	var exists bool
	err := tx.Get(&exists, query, u.String())
	if err != nil {
		fmt.Println("Error executing query:", err)
		return false
	}
	return !exists
}

// Select a random snapshot.
func GetRandomSnapshot(tx *sqlx.Tx) (*Snapshot, error) {
	query := `SELECT * FROM snapshots
        WHERE response_code IS NULL
        AND error IS NULL
        ORDER BY RANDOM()
        LIMIT 1
        FOR UPDATE SKIP LOCKED`
	// AND (timestamp < NOW() - INTERVAL '1 day' OR timestamp IS NULL)
	var snapshot Snapshot
	err := tx.Get(&snapshot, query)
	if err != nil {
		if err == sql.ErrNoRows {
			// Handle the case where no rows were found
			return nil, nil
		}
		// Handle other potential errors
		return nil, err
	}
	return &snapshot, nil
}

func GetRandomSnapshots(tx *sqlx.Tx) ([]Snapshot, error) {
	query := `
        SELECT * FROM snapshots
        WHERE response_code IS NULL
        AND error IS NULL
        ORDER BY RANDOM()
        LIMIT $1
        FOR UPDATE SKIP LOCKED
    `
	var snapshots []Snapshot
	err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
	if err != nil {
		return nil, err
	}
	return snapshots, nil
}

func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
	query := `
        SELECT DISTINCT ON (host) *
        FROM snapshots
        WHERE response_code IS NULL
        AND error IS NULL
        ORDER BY host, RANDOM()
        LIMIT $1
    `
	var snapshots []Snapshot
	err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
	if err != nil {
		return nil, err
	}
	return snapshots, nil
}

func runWorker(id int, db *sqlx.DB) {
	// Start the transaction
	tx, err := db.Beginx()
	if err != nil {
		logging.LogError("Failed to begin transaction: %v", err)
		return
	}

	defer func() {
		err = tx.Commit()
		if err != nil {
			logging.LogError("[%d] Failed to commit transaction: %v", id, err)
			_ = tx.Rollback()
		}
	}()

	snapshots, err := GetRandomSnapshotsDistinctHosts(tx)

	if err != nil {
		logging.LogError("[%d] Error retrieving snapshot: %v", id, err)
		time.Sleep(10 * time.Second)
		return
	} else if len(snapshots) == 0 {
		logging.LogInfo("[%d] No remaining snapshots to visit.", id)
		time.Sleep(1 * time.Minute)
		return
	}
	total := len(snapshots)
	for i, s := range snapshots {
		if InBlacklist(&s) {
			logging.LogWarn("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
			continue
		}
		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
		err = workOnSnapshot(id, tx, &s)
		if err != nil {
			logging.LogError("[%d] [%s] Error %v", id, s.URL, err)
			// TODO Remove panic and gracefully handle/log error
			fmt.Printf("Error %s Stack trace:\n%s", err, debug.Stack())
			panic("ERROR encountered")
		}
		if s.Error.Valid {
			logging.LogWarn("[%d] [%s] Error: %s", id, s.URL, s.Error.String)
		}
		logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
	}
	logging.LogInfo("[%d] Worker done.", id)
}