.
This commit is contained in:
51
gemini/blacklist.go
Normal file
51
gemini/blacklist.go
Normal file
@@ -0,0 +1,51 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/logging"
|
||||
)
|
||||
|
||||
// Blacklist points to the list of blacklisted host entries. It is nil until
// LoadBlacklist has run; LoadBlacklist stores an empty list when the
// blacklist file cannot be read.
var Blacklist *[]string //nolint:gochecknoglobals
|
||||
|
||||
func LoadBlacklist() {
|
||||
if Blacklist == nil {
|
||||
data, err := os.ReadFile(config.CONFIG.BlacklistPath)
|
||||
|
||||
if err != nil {
|
||||
Blacklist = &[]string{}
|
||||
logging.LogWarn("Could not load Blacklist file: %v", err)
|
||||
return
|
||||
}
|
||||
lines := strings.Split(string(data), "\n")
|
||||
|
||||
// Ignore lines starting with '#' (comments)
|
||||
filteredLines := func() []string {
|
||||
out := make([]string, 0, len(lines))
|
||||
for _, line := range lines {
|
||||
if !strings.HasPrefix(line, "#") {
|
||||
out = append(out, line)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}()
|
||||
|
||||
if len(lines) > 0 {
|
||||
Blacklist = &filteredLines
|
||||
logging.LogInfo("Blacklist has %d entries", len(*Blacklist))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func IsBlacklisted(url URL) bool {
|
||||
hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
|
||||
for _, v := range *Blacklist {
|
||||
if v == url.Hostname || v == hostWithPort {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -4,11 +4,11 @@ import (
|
||||
"gemini-grc/logging"
|
||||
)
|
||||
|
||||
var IpPool IpAddressPool = IpAddressPool{IPs: make(map[string]int)}
|
||||
var IpPool = IpAddressPool{IPs: make(map[string]int)}
|
||||
|
||||
func AddIPsToPool(IPs []string) {
|
||||
func AddIPsToPool(ips []string) {
|
||||
IpPool.Lock.Lock()
|
||||
for _, ip := range IPs {
|
||||
for _, ip := range ips {
|
||||
logging.LogDebug("Adding %s to pool", ip)
|
||||
IpPool.IPs[ip]++
|
||||
}
|
||||
|
||||
100
gemini/errors.go
Normal file
100
gemini/errors.go
Normal file
@@ -0,0 +1,100 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// ErrGeminiStatusCode is an error describing a Gemini response status code.
type ErrGeminiStatusCode struct {
	// Msg is the status category text chosen by NewErrGeminiStatusCode.
	Msg string
	// Code is the numeric status code.
	Code int
	// Header is the header string passed to NewErrGeminiStatusCode.
	Header string
}

// Error formats the error as "<Msg>: <Header>".
func (e *ErrGeminiStatusCode) Error() string {
	return fmt.Sprintf("%s: %s", e.Msg, e.Header)
}

// NewErrGeminiStatusCode builds an *ErrGeminiStatusCode for the given status
// code and header. Msg is set from the code's tens digit, using the category
// names of the Gemini specification:
//
//	1x needs input
//	3x redirect
//	4x temporary failure
//	5x permanent failure
//	6x client certificate required
//
// Any other code (including 2x) yields "unexpected status code".
func NewErrGeminiStatusCode(code int, header string) error {
	var msg string
	switch {
	case code >= 10 && code < 20:
		msg = "needs input"
	case code >= 30 && code < 40:
		msg = "redirect"
	case code >= 40 && code < 50:
		// 4x is a temporary failure; "bad request" is the single code 59.
		msg = "temporary failure"
	case code >= 50 && code < 60:
		msg = "permanent failure"
	case code >= 60 && code < 70:
		msg = "client certificate required"
	default:
		msg = "unexpected status code"
	}
	return &ErrGeminiStatusCode{
		Msg:    msg,
		Code:   code,
		Header: header,
	}
}
|
||||
|
||||
// Sentinel errors for Gemini protocol and robots.txt handling.
var (
	ErrGemini                 = errors.New("gemini general error")
	ErrGeminiRobotsParse      = errors.New("gemini robots.txt parse error")
	ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
	ErrGeminiResponseHeader   = errors.New("gemini response header error")
	ErrGeminiLinkLineParse    = errors.New("gemini link line parse error")
)

// Sentinel errors for URL and text parsing.
var (
	ErrURLParse  = errors.New("URL parse error")
	ErrURLDecode = errors.New("URL decode error")
	ErrUTF8Parse = errors.New("UTF-8 parse error")
	ErrTextParse = errors.New("text parse error")
)

// Sentinel errors for the network layer.
var (
	ErrNetwork                        = errors.New("network error")
	ErrNetworkDNS                     = errors.New("network DNS error")
	ErrNetworkTLS                     = errors.New("network TLS error")
	ErrNetworkSetConnectionDeadline   = errors.New("network error - cannot set connection deadline")
	ErrNetworkCannotWrite             = errors.New("network error - cannot write")
	ErrNetworkResponseSizeExceededMax = errors.New("network error - response size exceeded maximum size")
)

// ErrDatabase is the sentinel error for database failures.
var ErrDatabase = errors.New("database error")
|
||||
|
||||
// We could have used a map for speed, but
|
||||
// we would lose ability to check wrapped
|
||||
// errors via errors.Is().
|
||||
|
||||
var KnownErrors = []error{
|
||||
ErrGemini,
|
||||
ErrGeminiLinkLineParse,
|
||||
ErrGeminiRobotsParse,
|
||||
ErrGeminiRobotsDisallowed,
|
||||
ErrGeminiResponseHeader,
|
||||
|
||||
ErrURLParse,
|
||||
ErrURLDecode,
|
||||
ErrUTF8Parse,
|
||||
ErrTextParse,
|
||||
|
||||
ErrNetwork,
|
||||
ErrNetworkDNS,
|
||||
ErrNetworkTLS,
|
||||
ErrNetworkSetConnectionDeadline,
|
||||
ErrNetworkCannotWrite,
|
||||
ErrNetworkResponseSizeExceededMax,
|
||||
|
||||
ErrDatabase,
|
||||
}
|
||||
|
||||
func IsKnownError(err error) bool {
|
||||
var errGeminiStatusCode *ErrGeminiStatusCode
|
||||
if errors.As(err, &errGeminiStatusCode) {
|
||||
return true
|
||||
}
|
||||
for _, known := range KnownErrors {
|
||||
if errors.Is(err, known) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -2,12 +2,13 @@ package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gemini-grc/logging"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"gemini-grc/logging"
|
||||
)
|
||||
|
||||
// sanitizePath encodes invalid filesystem characters using URL encoding.
|
||||
@@ -87,9 +88,9 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
|
||||
return
|
||||
}
|
||||
if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
|
||||
err = os.WriteFile(finalPath, (*s).Data.V, 0666)
|
||||
err = os.WriteFile(finalPath, (*s).Data.V, 0o666)
|
||||
} else {
|
||||
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0666)
|
||||
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
|
||||
}
|
||||
if err != nil {
|
||||
logging.LogError("Error saving %s: %w", s.URL.Full, err)
|
||||
|
||||
@@ -1,32 +1,13 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"gemini-grc/logging"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
func checkGeminiStatusCode(code int) error {
|
||||
switch {
|
||||
case code == 20:
|
||||
return nil
|
||||
case code >= 10 && code < 20:
|
||||
return fmt.Errorf("gemini response %d needs data input", code)
|
||||
case code >= 30 && code < 40:
|
||||
return fmt.Errorf("gemini response %d redirect", code)
|
||||
case code >= 40 && code < 50:
|
||||
return fmt.Errorf("gemini response %d server error", code)
|
||||
case code >= 50 && code < 60:
|
||||
return fmt.Errorf("gemini response %d server permanent error", code)
|
||||
case code >= 60 && code < 70:
|
||||
return fmt.Errorf("gemini response %d certificate error", code)
|
||||
default:
|
||||
return fmt.Errorf("unexpected/unhandled Gemini response %d", code)
|
||||
}
|
||||
}
|
||||
"gemini-grc/logging"
|
||||
)
|
||||
|
||||
func ProcessGemini(snapshot *Snapshot) *Snapshot {
|
||||
// Grab link lines
|
||||
@@ -40,7 +21,7 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot {
|
||||
logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
|
||||
continue
|
||||
}
|
||||
geminiUrl, err := ParseUrl(normalizedLink, descr)
|
||||
geminiUrl, err := ParseURL(normalizedLink, descr)
|
||||
if err != nil {
|
||||
logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
|
||||
continue
|
||||
@@ -54,25 +35,6 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot {
|
||||
return snapshot
|
||||
}
|
||||
|
||||
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
|
||||
u, err := url.Parse(input)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
|
||||
}
|
||||
protocol := u.Scheme
|
||||
hostname := u.Hostname()
|
||||
strPort := u.Port()
|
||||
path := u.Path
|
||||
if strPort == "" {
|
||||
strPort = "1965"
|
||||
}
|
||||
port, err := strconv.Atoi(strPort)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing URL %s: %w", input, err)
|
||||
}
|
||||
return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
|
||||
}
|
||||
|
||||
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
|
||||
func ExtractLinkLines(gemtext string) []string {
|
||||
// Define the regular expression pattern to match link lines
|
||||
@@ -87,11 +49,11 @@ func ExtractLinkLines(gemtext string) []string {
|
||||
// NormalizeLink takes a single link line and the current URL,
|
||||
// return the URL converted to an absolute URL
|
||||
// and its description.
|
||||
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
|
||||
func NormalizeLink(linkLine string, currentURL string) (string, string, error) {
|
||||
// Parse the current URL
|
||||
baseURL, err := url.Parse(currentURL)
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("invalid current URL: %v", err)
|
||||
return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
|
||||
}
|
||||
|
||||
// Regular expression to extract the URL part from a link line
|
||||
@@ -101,13 +63,13 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
|
||||
matches := re.FindStringSubmatch(linkLine)
|
||||
if len(matches) == 0 {
|
||||
// If the line doesn't match the expected format, return it unchanged
|
||||
return "", "", fmt.Errorf("not a link line: %v", linkLine)
|
||||
return "", "", fmt.Errorf("%w for link line %s", ErrGeminiLinkLineParse, linkLine)
|
||||
}
|
||||
|
||||
originalURLStr := matches[1]
|
||||
_, err = url.QueryUnescape(originalURLStr)
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("error decoding URL: %w", err)
|
||||
return "", "", fmt.Errorf("%w: %w", ErrURLDecode, err)
|
||||
}
|
||||
|
||||
restOfLine := ""
|
||||
@@ -119,7 +81,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
|
||||
parsedURL, err := url.Parse(originalURLStr)
|
||||
if err != nil {
|
||||
// If URL parsing fails, return an error
|
||||
return "", "", fmt.Errorf("invalid URL '%s': %v", originalURLStr, err)
|
||||
return "", "", fmt.Errorf("%w: %w", ErrURLParse, err)
|
||||
}
|
||||
|
||||
// Resolve relative URLs against the base URL
|
||||
@@ -151,13 +113,13 @@ func ParseFirstTwoDigits(input string) (int, error) {
|
||||
// Find the first match in the string
|
||||
matches := re.FindStringSubmatch(input)
|
||||
if len(matches) == 0 {
|
||||
return 0, errors.New("no digits found at the beginning of the string")
|
||||
return 0, fmt.Errorf("%w", ErrGeminiResponseHeader)
|
||||
}
|
||||
|
||||
// Parse the captured match as an integer
|
||||
snapshot, err := strconv.Atoi(matches[1])
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to convert matched digits to int: %v", err)
|
||||
return 0, fmt.Errorf("%w: %w", ErrTextParse, err)
|
||||
}
|
||||
|
||||
return snapshot, nil
|
||||
|
||||
@@ -34,7 +34,7 @@ func (u *URL) Scan(value interface{}) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (u *URL) String() string {
|
||||
func (u URL) String() string {
|
||||
return u.Full
|
||||
}
|
||||
|
||||
@@ -62,7 +62,8 @@ func ParseURL(input string, descr string) (*URL, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: Input %s Error %w", ErrURLParse, input, err)
|
||||
}
|
||||
return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
|
||||
full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, path)
|
||||
return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: full}, nil
|
||||
}
|
||||
|
||||
//func GeminiUrltoJSON(g URL) string {
|
||||
|
||||
@@ -2,25 +2,27 @@ package gemini
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"errors"
|
||||
"fmt"
|
||||
"gemini-grc/config"
|
||||
"io"
|
||||
"net"
|
||||
go_url "net/url"
|
||||
gourl "net/url"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"gemini-grc/config"
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
type GeminiPageData struct {
|
||||
ResponseCode int
|
||||
MimeType string
|
||||
Lang string
|
||||
GemText string
|
||||
Data []byte
|
||||
type PageData struct {
|
||||
ResponseCode int
|
||||
ResponseHeader string
|
||||
MimeType string
|
||||
Lang string
|
||||
GemText string
|
||||
Data []byte
|
||||
}
|
||||
|
||||
// Resolve the URL hostname and
|
||||
@@ -31,7 +33,7 @@ type GeminiPageData struct {
|
||||
func getHostIPAddresses(hostname string) ([]string, error) {
|
||||
addrs, err := net.LookupHost(hostname)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("%w:%w", ErrNetworkDNS, err)
|
||||
}
|
||||
IpPool.Lock.RLock()
|
||||
defer func() {
|
||||
@@ -41,12 +43,12 @@ func getHostIPAddresses(hostname string) ([]string, error) {
|
||||
}
|
||||
|
||||
func ConnectAndGetData(url string) ([]byte, error) {
|
||||
parsedUrl, err := go_url.Parse(url)
|
||||
parsedURL, err := gourl.Parse(url)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Could not parse URL, error %w", err)
|
||||
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
|
||||
}
|
||||
hostname := parsedUrl.Hostname()
|
||||
port := parsedUrl.Port()
|
||||
hostname := parsedURL.Hostname()
|
||||
port := parsedURL.Port()
|
||||
if port == "" {
|
||||
port = "1965"
|
||||
}
|
||||
@@ -58,34 +60,34 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
||||
}
|
||||
conn, err := dialer.Dial("tcp", host)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("TCP connection failed: %w", err)
|
||||
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
|
||||
}
|
||||
// Make sure we always close the connection.
|
||||
defer func() {
|
||||
// No need to handle error:
|
||||
// Connection will timeout eventually if still open somehow.
|
||||
conn.Close()
|
||||
// Connection will time out eventually if still open somehow.
|
||||
_ = conn.Close()
|
||||
}()
|
||||
|
||||
// Set read and write timeouts on the TCP connection.
|
||||
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error setting connection deadline: %w", err)
|
||||
return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
|
||||
}
|
||||
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error setting connection deadline: %w", err)
|
||||
return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
|
||||
}
|
||||
|
||||
// Perform the TLS handshake
|
||||
tlsConfig := &tls.Config{
|
||||
InsecureSkipVerify: true, // Accept all TLS certs, even if insecure.
|
||||
ServerName: parsedUrl.Hostname(), // SNI should not include port
|
||||
InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
|
||||
ServerName: parsedURL.Hostname(), // SNI should not include port
|
||||
// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
|
||||
}
|
||||
tlsConn := tls.Client(conn, tlsConfig)
|
||||
if err := tlsConn.Handshake(); err != nil {
|
||||
return nil, fmt.Errorf("TLS handshake error: %w", err)
|
||||
return nil, fmt.Errorf("%w: %w", ErrNetworkTLS, err)
|
||||
}
|
||||
|
||||
// We read `buf`-sized chunks and add data to `data`.
|
||||
@@ -95,7 +97,7 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
||||
// Send Gemini request to trigger server response.
|
||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url)))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error sending network request: %w", err)
|
||||
return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
|
||||
}
|
||||
// Read response bytes in len(buf) byte chunks
|
||||
for {
|
||||
@@ -104,68 +106,72 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
||||
data = append(data, buf[:n]...)
|
||||
}
|
||||
if len(data) > config.CONFIG.MaxResponseSize {
|
||||
data = []byte{}
|
||||
return nil, fmt.Errorf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize)
|
||||
return nil, fmt.Errorf("%w: %v", ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
|
||||
}
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
} else {
|
||||
return nil, fmt.Errorf("Network error: %s", err)
|
||||
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Connect to given URL, using the Gemini protocol.
|
||||
// Mutate given Snapshot with the data or the error.
|
||||
func Visit(s *Snapshot) {
|
||||
// Visit given URL, using the Gemini protocol.
|
||||
// Mutates given Snapshot with the data.
|
||||
func Visit(s *Snapshot) error {
|
||||
data, err := ConnectAndGetData(s.URL.String())
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
return
|
||||
return err
|
||||
}
|
||||
pageData, err := processData(data)
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
return
|
||||
return err
|
||||
}
|
||||
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
|
||||
s.MimeType = null.StringFrom(pageData.MimeType)
|
||||
s.Lang = null.StringFrom(pageData.Lang)
|
||||
if pageData.GemText != "" {
|
||||
s.GemText = null.StringFrom(string(pageData.GemText))
|
||||
s.GemText = null.StringFrom(pageData.GemText)
|
||||
}
|
||||
if pageData.Data != nil {
|
||||
s.Data = null.ValueFrom(pageData.Data)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Update given snapshot with the
|
||||
// Gemini header data: response code,
|
||||
// mime type and lang (optional)
|
||||
func processData(data []byte) (*GeminiPageData, error) {
|
||||
headers, body, err := getHeadersAndData(data)
|
||||
func processData(data []byte) (*PageData, error) {
|
||||
header, body, err := getHeadersAndData(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
code, mimeType, lang := getMimeTypeAndLang(headers)
|
||||
geminiError := checkGeminiStatusCode(code)
|
||||
code, mimeType, lang := getMimeTypeAndLang(header)
|
||||
var geminiError error
|
||||
if code != 20 {
|
||||
geminiError = NewErrGeminiStatusCode(code, header)
|
||||
}
|
||||
fmt.Printf("%v\n", header)
|
||||
|
||||
if geminiError != nil {
|
||||
return nil, geminiError
|
||||
}
|
||||
pageData := GeminiPageData{
|
||||
ResponseCode: code,
|
||||
MimeType: mimeType,
|
||||
Lang: lang,
|
||||
pageData := PageData{
|
||||
ResponseCode: code,
|
||||
ResponseHeader: header,
|
||||
MimeType: mimeType,
|
||||
Lang: lang,
|
||||
}
|
||||
// If we've got a Gemini document, populate
|
||||
// `GemText` field, otherwise raw data goes to `Data`.
|
||||
if mimeType == "text/gemini" {
|
||||
validBody, err := EnsureValidUTF8(body)
|
||||
validBody, err := BytesToValidUTF8(body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("UTF-8 error: %w", err)
|
||||
return nil, fmt.Errorf("%w: %w", ErrUTF8Parse, err)
|
||||
}
|
||||
pageData.GemText = validBody
|
||||
} else {
|
||||
@@ -178,14 +184,14 @@ func processData(data []byte) (*GeminiPageData, error) {
|
||||
// basically the first line of the response
|
||||
// and should contain the response code,
|
||||
// mimeType and language.
|
||||
func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) {
|
||||
func getHeadersAndData(data []byte) (string, []byte, error) {
|
||||
firstLineEnds := slices.Index(data, '\n')
|
||||
if firstLineEnds == -1 {
|
||||
return "", nil, fmt.Errorf("Could not parse response header")
|
||||
return "", nil, ErrGeminiResponseHeader
|
||||
}
|
||||
firstLine = string(data[:firstLineEnds])
|
||||
rest = data[firstLineEnds+1:]
|
||||
return string(firstLine), rest, nil
|
||||
firstLine := string(data[:firstLineEnds])
|
||||
rest := data[firstLineEnds+1:]
|
||||
return firstLine, rest, nil
|
||||
}
|
||||
|
||||
// Parses code, mime type and language
|
||||
@@ -194,7 +200,7 @@ func getHeadersAndData(data []byte) (firstLine string, rest []byte, err error) {
|
||||
// `20 text/gemini lang=en` (code, mimetype, lang)
|
||||
// `20 text/gemini` (code, mimetype)
|
||||
// `31 gemini://redirected.to/other/site` (code)
|
||||
func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string) {
|
||||
func getMimeTypeAndLang(headers string) (int, string, string) {
|
||||
// Regex that parses code, mimetype & lang
|
||||
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`)
|
||||
matches := re.FindStringSubmatch(headers)
|
||||
@@ -215,7 +221,7 @@ func getMimeTypeAndLang(headers string) (code int, mimeType string, lang string)
|
||||
if err != nil {
|
||||
return 0, "", ""
|
||||
}
|
||||
mimeType = matches[2]
|
||||
lang = matches[4]
|
||||
mimeType := matches[2]
|
||||
lang := matches[4]
|
||||
return code, mimeType, lang
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
|
||||
// Test for input: `20 text/gemini`
|
||||
func TestGetMimeTypeAndLang1(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini")
|
||||
if code != 20 || mimeType != "text/gemini" || lang != "" {
|
||||
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||
@@ -13,6 +14,7 @@ func TestGetMimeTypeAndLang1(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestGetMimeTypeAndLang11(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini\n")
|
||||
if code != 20 || mimeType != "text/gemini" || lang != "" {
|
||||
t.Errorf("Expected (20, 'text/gemini', ''), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||
@@ -20,6 +22,7 @@ func TestGetMimeTypeAndLang11(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestGetTypeAndLang2(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
|
||||
if code != 20 || mimeType != "text/gemini" || lang != "en" {
|
||||
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||
@@ -27,6 +30,7 @@ func TestGetTypeAndLang2(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestGetMimeTypeAndLang3(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("31 gemini://redirect.to/page")
|
||||
if code != 31 || mimeType != "" || lang != "" {
|
||||
t.Errorf("Expected (20, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||
@@ -34,6 +38,7 @@ func TestGetMimeTypeAndLang3(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestGetMimeTypeAndLang4(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("aaafdasdasd")
|
||||
if code != 0 || mimeType != "" || lang != "" {
|
||||
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||
@@ -41,6 +46,7 @@ func TestGetMimeTypeAndLang4(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestGetMimeTypeAndLang5(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("")
|
||||
if code != 0 || mimeType != "" || lang != "" {
|
||||
t.Errorf("Expected (0, '', ''), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||
|
||||
@@ -2,9 +2,9 @@ package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gemini-grc/logging"
|
||||
"os"
|
||||
|
||||
"gemini-grc/logging"
|
||||
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
@@ -33,11 +33,26 @@ func ConnectToDB() *sqlx.DB {
|
||||
return db
|
||||
}
|
||||
|
||||
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
|
||||
func SaveSnapshotToDBIfNotExists(tx *sqlx.Tx, s *Snapshot) error {
|
||||
query := `
|
||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
ON CONFLICT (uid) DO UPDATE SET
|
||||
ON CONFLICT (url) DO NOTHING
|
||||
`
|
||||
_, err := tx.NamedExec(query, s)
|
||||
if err != nil {
|
||||
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
|
||||
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
|
||||
fmt.Printf("%+v", s)
|
||||
query := `
|
||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
ON CONFLICT (url) DO UPDATE SET
|
||||
url = EXCLUDED.url,
|
||||
host = EXCLUDED.host,
|
||||
timestamp = EXCLUDED.timestamp,
|
||||
@@ -64,7 +79,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
||||
query := `
|
||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
ON CONFLICT (uid) DO NOTHING
|
||||
ON CONFLICT (url) DO NOTHING
|
||||
`
|
||||
|
||||
for i := 0; i < len(snapshots); i += batchSize {
|
||||
@@ -89,7 +104,7 @@ func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
||||
query := `
|
||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
ON CONFLICT (uid) DO NOTHING
|
||||
ON CONFLICT (url) DO NOTHING
|
||||
`
|
||||
_, err := tx.NamedExec(query, snapshots)
|
||||
if err != nil {
|
||||
|
||||
@@ -2,33 +2,58 @@ package gemini
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
"golang.org/x/text/encoding/japanese"
|
||||
"golang.org/x/text/encoding/korean"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrInputTooLarge = errors.New("input too large")
|
||||
ErrUTF8Conversion = errors.New("UTF-8 conversion error")
|
||||
)
|
||||
|
||||
func BytesToValidUTF8(input []byte) (string, error) {
|
||||
if len(input) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
const maxSize = 10 * 1024 * 1024 // 10MB
|
||||
if len(input) > maxSize {
|
||||
return "", fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize)
|
||||
}
|
||||
// Remove NULL byte 0x00 (ReplaceAll accepts slices)
|
||||
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
|
||||
isValidUTF8 := utf8.Valid(inputNoNull)
|
||||
if isValidUTF8 {
|
||||
if utf8.Valid(inputNoNull) {
|
||||
return string(inputNoNull), nil
|
||||
}
|
||||
encodings := []transform.Transformer{
|
||||
charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1
|
||||
charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc
|
||||
// TODO: Try more encodings?
|
||||
charmap.ISO8859_1.NewDecoder(),
|
||||
charmap.ISO8859_7.NewDecoder(),
|
||||
charmap.Windows1250.NewDecoder(), // Central European
|
||||
charmap.Windows1251.NewDecoder(), // Cyrillic
|
||||
charmap.Windows1252.NewDecoder(),
|
||||
charmap.Windows1256.NewDecoder(), // Arabic
|
||||
japanese.EUCJP.NewDecoder(), // Japanese
|
||||
korean.EUCKR.NewDecoder(), // Korean
|
||||
}
|
||||
// First successful conversion wins.
|
||||
var lastErr error
|
||||
for _, encoding := range encodings {
|
||||
reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
|
||||
result, err := io.ReadAll(reader)
|
||||
if err == nil {
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
if utf8.Valid(result) {
|
||||
return string(result), nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("UTF-8 error: %w", err)
|
||||
|
||||
return "", fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr)
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import "testing"
|
||||
|
||||
// Make sure NULL bytes are removed
|
||||
func TestEnsureValidUTF8(t *testing.T) {
|
||||
t.Parallel()
|
||||
// Create a string with a null byte
|
||||
strWithNull := "Hello" + string('\x00') + "world"
|
||||
result, _ := BytesToValidUTF8([]byte(strWithNull))
|
||||
|
||||
@@ -2,16 +2,18 @@ package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gemini-grc/logging"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"gemini-grc/logging"
|
||||
)
|
||||
|
||||
// key: "host:port" (string)
|
||||
// value:
|
||||
// empty []string if no robots data, or
|
||||
// list of URL prefixes ([]string) in robots
|
||||
var RobotsCache sync.Map
|
||||
// RobotsCache is a map of blocked URLs
|
||||
// key: URL
|
||||
// value: []string list of disallowed URLs
|
||||
// If a key has no blocked URLs, an empty
|
||||
// list is stored for caching.
|
||||
var RobotsCache sync.Map //nolint:gochecknoglobals
|
||||
|
||||
func populateBlacklist(key string) (entries []string) {
|
||||
// We either store an empty list when
|
||||
@@ -40,43 +42,40 @@ func populateBlacklist(key string) (entries []string) {
|
||||
// According to spec, the first is correct,
|
||||
// however let's be lenient
|
||||
var data string
|
||||
if robotsData.MimeType == "text/plain" {
|
||||
switch {
|
||||
case robotsData.MimeType == "text/plain":
|
||||
data = string(robotsData.Data)
|
||||
} else if robotsData.MimeType == "text/gemini" {
|
||||
case robotsData.MimeType == "text/gemini":
|
||||
data = robotsData.GemText
|
||||
} else {
|
||||
default:
|
||||
return []string{}
|
||||
}
|
||||
entries = ParseRobotsTxt(string(data), key)
|
||||
entries = ParseRobotsTxt(data, key)
|
||||
return entries
|
||||
}
|
||||
|
||||
// Check if the snapshot URL matches
|
||||
// RobotMatch checks if the snapshot URL matches
|
||||
// a robots.txt allow rule.
|
||||
func RobotMatch(s *Snapshot) bool {
|
||||
logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
|
||||
key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
|
||||
v, ok := RobotsCache.Load(key)
|
||||
func RobotMatch(url URL) bool {
|
||||
logging.LogDebug("Checking robots.txt cache for %s", url.String())
|
||||
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
|
||||
var disallowedURLs []string
|
||||
cacheEntries, ok := RobotsCache.Load(key)
|
||||
if !ok {
|
||||
// First time check, populate robot cache
|
||||
logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
|
||||
disallowedURLs := populateBlacklist(key)
|
||||
for _, url := range disallowedURLs {
|
||||
if strings.HasPrefix(s.URL.String(), url) {
|
||||
logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
|
||||
return true
|
||||
}
|
||||
}
|
||||
disallowedURLs = populateBlacklist(key)
|
||||
logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
|
||||
} else {
|
||||
if len(v.([]string)) == 0 {
|
||||
logging.LogDebug("No robots.txt or no rules, allowed")
|
||||
return false
|
||||
}
|
||||
for _, url := range v.([]string) {
|
||||
if strings.HasPrefix(s.URL.String(), url) {
|
||||
logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
|
||||
return true
|
||||
}
|
||||
disallowedURLs, _ = cacheEntries.([]string)
|
||||
}
|
||||
return isURLblocked(disallowedURLs, url.Full)
|
||||
}
|
||||
|
||||
func isURLblocked(disallowedURLs []string, input string) bool {
|
||||
for _, url := range disallowedURLs {
|
||||
if strings.HasPrefix(strings.ToLower(input), url) {
|
||||
logging.LogDebug("robots.txt match: %s matches %s", input, url)
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
|
||||
@@ -5,7 +5,7 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Takes robots.txt content and a host, and
|
||||
// ParseRobotsTxt takes robots.txt content and a host, and
|
||||
// returns a list of full URLs that shouldn't
|
||||
// be visited.
|
||||
// TODO Also take into account the user agent?
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseRobotsTxt(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := `User-agent: *
|
||||
Disallow: /cgi-bin/wp.cgi/view
|
||||
Disallow: /cgi-bin/wp.cgi/media
|
||||
@@ -26,6 +27,7 @@ Disallow: /admin/`
|
||||
}
|
||||
|
||||
func TestParseRobotsTxtEmpty(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := ``
|
||||
|
||||
result := ParseRobotsTxt(input, "example.com")
|
||||
@@ -34,3 +36,20 @@ func TestParseRobotsTxtEmpty(t *testing.T) {
|
||||
t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsURLblocked(t *testing.T) {
|
||||
t.Parallel()
|
||||
disallowedURLs := []string{
|
||||
"gemini://example.com/cgi-bin/wp.cgi/view",
|
||||
"gemini://example.com/cgi-bin/wp.cgi/media",
|
||||
"gemini://example.com/admin/",
|
||||
}
|
||||
url := "gemini://example.com/admin/index.html"
|
||||
if !isURLblocked(disallowedURLs, url) {
|
||||
t.Errorf("Expected %s to be blocked", url)
|
||||
}
|
||||
url = "gemini://example1.com/admin/index.html"
|
||||
if isURLblocked(disallowedURLs, url) {
|
||||
t.Errorf("expected %s to not be blocked", url)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,15 +4,13 @@ import (
|
||||
"database/sql/driver"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"gemini-grc/logging"
|
||||
"strings"
|
||||
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
type LinkList []GeminiUrl
|
||||
type LinkList []URL
|
||||
|
||||
func (l LinkList) Value() (driver.Value, error) {
|
||||
func (l *LinkList) Value() (driver.Value, error) {
|
||||
return json.Marshal(l)
|
||||
}
|
||||
|
||||
@@ -31,7 +29,7 @@ func (l *LinkList) Scan(value interface{}) error {
|
||||
type Snapshot struct {
|
||||
ID int `db:"id" json:"id,omitempty"`
|
||||
UID string `db:"uid" json:"uid,omitempty"`
|
||||
URL GeminiUrl `db:"url" json:"url,omitempty"`
|
||||
URL URL `db:"url" json:"url,omitempty"`
|
||||
Host string `db:"host" json:"host,omitempty"`
|
||||
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
|
||||
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
|
||||
@@ -43,32 +41,32 @@ type Snapshot struct {
|
||||
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
|
||||
}
|
||||
|
||||
func SnapshotToJSON(g Snapshot) string {
|
||||
// Serialize the Person struct to JSON
|
||||
jsonData, err := json.MarshalIndent(g, "", "\t")
|
||||
if err != nil {
|
||||
logging.LogError("Error serializing to JSON: %w", err)
|
||||
}
|
||||
return string(jsonData)
|
||||
}
|
||||
|
||||
func SnapshotFromJSON(input string) Snapshot {
|
||||
var snapshot Snapshot
|
||||
err := json.Unmarshal([]byte(input), &snapshot)
|
||||
if err != nil {
|
||||
logging.LogError("Error deserializing from JSON: %w", err)
|
||||
}
|
||||
return snapshot
|
||||
}
|
||||
|
||||
func ShouldPersistSnapshot(result *Snapshot) bool {
|
||||
if !result.MimeType.Valid {
|
||||
return false
|
||||
}
|
||||
if result.MimeType.String == "text/gemini" ||
|
||||
strings.HasPrefix(result.MimeType.String, "image/") ||
|
||||
strings.HasPrefix(result.MimeType.String, "text/") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
//func SnapshotToJSON(g Snapshot) string {
|
||||
// // Serialize the Snapshot struct to JSON
|
||||
// jsonData, err := json.MarshalIndent(g, "", "\t")
|
||||
// if err != nil {
|
||||
// logging.LogError("Error serializing to JSON: %w", err)
|
||||
// }
|
||||
// return string(jsonData)
|
||||
//}
|
||||
//
|
||||
//func SnapshotFromJSON(input string) Snapshot {
|
||||
// var snapshot Snapshot
|
||||
// err := json.Unmarshal([]byte(input), &snapshot)
|
||||
// if err != nil {
|
||||
// logging.LogError("Error deserializing from JSON: %w", err)
|
||||
// }
|
||||
// return snapshot
|
||||
//}
|
||||
//
|
||||
//func ShouldPersistSnapshot(result *Snapshot) bool {
|
||||
// if !result.MimeType.Valid {
|
||||
// return false
|
||||
// }
|
||||
// if result.MimeType.String == "text/gemini" ||
|
||||
// strings.HasPrefix(result.MimeType.String, "image/") ||
|
||||
// strings.HasPrefix(result.MimeType.String, "text/") {
|
||||
// return true
|
||||
// }
|
||||
// return false
|
||||
//}
|
||||
|
||||
126
gemini/worker.go
126
gemini/worker.go
@@ -1,30 +1,32 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/logging"
|
||||
"gemini-grc/uid"
|
||||
"gemini-grc/util"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/guregu/null/v5"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
|
||||
logging.LogInfo("Spawning %d workers", numOfWorkers)
|
||||
for i := 0; i < numOfWorkers; i++ {
|
||||
for i := range numOfWorkers {
|
||||
go func(i int) {
|
||||
for {
|
||||
runWorker(i, db)
|
||||
RunWorker(i, db, nil)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
}
|
||||
|
||||
func runWorker(id int, db *sqlx.DB) {
|
||||
func RunWorker(id int, db *sqlx.DB, url *string) {
|
||||
// Start the DB transaction
|
||||
tx, err := db.Beginx()
|
||||
if err != nil {
|
||||
@@ -42,38 +44,85 @@ func runWorker(id int, db *sqlx.DB) {
|
||||
}
|
||||
}()
|
||||
|
||||
snapshots, err := GetRandomSnapshotsDistinctHosts(tx)
|
||||
var snapshots []Snapshot
|
||||
|
||||
if url == nil {
|
||||
snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
|
||||
} else {
|
||||
snapshots, err = GetSnapshotFromURL(tx, *url)
|
||||
if len(snapshots) == 0 {
|
||||
snapshotURL, err := ParseURL(*url, "")
|
||||
if err != nil {
|
||||
panic("Invalid URL: " + *url)
|
||||
}
|
||||
snapshots = []Snapshot{{
|
||||
UID: uid.UID(),
|
||||
URL: *snapshotURL,
|
||||
Host: snapshotURL.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logging.LogError("[%d] Error retrieving snapshot: %w", id, err)
|
||||
time.Sleep(10 * time.Second)
|
||||
return
|
||||
} else if len(snapshots) == 0 {
|
||||
logging.LogInfo("[%d] No remaining snapshots to visit.", id)
|
||||
logging.LogInfo("[%d] No snapshots to visit.", id)
|
||||
time.Sleep(1 * time.Minute)
|
||||
return
|
||||
}
|
||||
total := len(snapshots)
|
||||
for i, s := range snapshots {
|
||||
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
|
||||
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
|
||||
err = workOnSnapshot(id, tx, &s)
|
||||
if err != nil {
|
||||
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL, err)
|
||||
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL.String(), err)
|
||||
util.PrintStackAndPanic(err)
|
||||
}
|
||||
if s.Error.Valid {
|
||||
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL, s.Error.String)
|
||||
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL.String(), s.Error.String)
|
||||
}
|
||||
logging.LogDebug("[%d] Done %d/%d.", id, i, total)
|
||||
logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
|
||||
}
|
||||
logging.LogInfo("[%d] Worker done.", id)
|
||||
}
|
||||
|
||||
func handleRedirection(tx *sqlx.Tx, s *Snapshot) error {
|
||||
re := regexp.MustCompile(`gemini://\S+`)
|
||||
matches := re.FindStringSubmatch(s.Error.ValueOrZero())
|
||||
if len(matches) == 1 {
|
||||
newURL := matches[0]
|
||||
logging.LogDebug("Page redirects to %s", newURL)
|
||||
_url, err := ParseURL(newURL, "")
|
||||
// Insert fresh snapshot with new URL
|
||||
if err == nil {
|
||||
snapshot := &Snapshot{
|
||||
UID: uid.UID(),
|
||||
URL: *_url,
|
||||
Host: _url.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}
|
||||
err := SaveSnapshotToDBIfNotExists(tx, snapshot)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
if IsBlacklisted(s.URL) {
|
||||
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
|
||||
return nil
|
||||
}
|
||||
|
||||
// If URL matches a robots.txt disallow line,
|
||||
// add it as an error so next time it won't be
|
||||
// crawled.
|
||||
if RobotMatch(s) {
|
||||
if RobotMatch(s.URL) {
|
||||
s.Error = null.StringFrom("robots.txt disallow match")
|
||||
err = SaveSnapshotToDB(tx, s)
|
||||
if err != nil {
|
||||
@@ -92,14 +141,19 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
return nil
|
||||
}
|
||||
|
||||
defer func() {
|
||||
time.Sleep(5 * time.Second)
|
||||
RemoveIPsFromPool(IPs)
|
||||
}()
|
||||
|
||||
// If the host's ip is in the connections pool,
|
||||
// stop and add the url in the queue later.
|
||||
IpPool.Lock.RLock()
|
||||
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL)
|
||||
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
|
||||
for _, ip := range IPs {
|
||||
_, ok := IpPool.IPs[ip]
|
||||
if ok {
|
||||
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL)
|
||||
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
|
||||
IpPool.Lock.RUnlock()
|
||||
time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
|
||||
return nil
|
||||
@@ -111,15 +165,26 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
|
||||
url := s.URL.String()
|
||||
logging.LogDebug("[%d] Dialing %s", id, url)
|
||||
Visit(s)
|
||||
err = Visit(s)
|
||||
if err != nil {
|
||||
if !IsKnownError(err) {
|
||||
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
|
||||
if config.CONFIG.PanicOnUnexpectedError {
|
||||
util.PrintStackAndPanic(err)
|
||||
}
|
||||
} else {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
}
|
||||
if errors.As(err, new(*ErrGeminiStatusCode)) {
|
||||
err = handleRedirection(tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
logging.LogDebug("[%d] Finished dialing.", id)
|
||||
|
||||
go func() {
|
||||
time.Sleep(5 * time.Second)
|
||||
RemoveIPsFromPool(IPs)
|
||||
}()
|
||||
|
||||
if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
|
||||
if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
|
||||
logging.LogDebug("[%d] [%s] Processing", id, url)
|
||||
s = ProcessGemini(s)
|
||||
}
|
||||
@@ -158,7 +223,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
}
|
||||
|
||||
// Should we save the given URL for crawling?
|
||||
func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool {
|
||||
func shouldPersistURL(tx *sqlx.Tx, u URL) bool {
|
||||
if !strings.HasPrefix(u.String(), "gemini://") {
|
||||
return false
|
||||
}
|
||||
@@ -205,3 +270,18 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
||||
}
|
||||
return snapshots, nil
|
||||
}
|
||||
|
||||
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) {
|
||||
query := `
|
||||
SELECT *
|
||||
FROM snapshots
|
||||
WHERE url=$1
|
||||
LIMIT 1
|
||||
`
|
||||
var snapshots []Snapshot
|
||||
err := tx.Select(&snapshots, query, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return snapshots, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user