Better error handling, many fixes all around
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,6 +1,8 @@
|
||||
.idea
|
||||
.goroot
|
||||
**/.#*
|
||||
**/*~
|
||||
/.go
|
||||
/cmd
|
||||
/db/initdb.sql
|
||||
/db/*sh
|
||||
|
||||
@@ -5,13 +5,13 @@ import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
type ErrGeminiStatusCode struct {
|
||||
type GeminiError struct {
|
||||
Msg string
|
||||
Code int
|
||||
Header string
|
||||
}
|
||||
|
||||
func (e *ErrGeminiStatusCode) Error() string {
|
||||
func (e *GeminiError) Error() string {
|
||||
return fmt.Sprintf("%s: %s", e.Msg, e.Header)
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ func NewErrGeminiStatusCode(code int, header string) error {
|
||||
default:
|
||||
msg = "unexpected status code"
|
||||
}
|
||||
return &ErrGeminiStatusCode{
|
||||
return &GeminiError{
|
||||
Msg: msg,
|
||||
Code: code,
|
||||
Header: header,
|
||||
@@ -39,7 +39,6 @@ func NewErrGeminiStatusCode(code int, header string) error {
|
||||
}
|
||||
|
||||
var (
|
||||
ErrGemini = errors.New("gemini general error")
|
||||
ErrGeminiRobotsParse = errors.New("gemini robots.txt parse error")
|
||||
ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
|
||||
ErrGeminiResponseHeader = errors.New("gemini response header error")
|
||||
@@ -64,8 +63,10 @@ var (
|
||||
// we would lose ability to check wrapped
|
||||
// errors via errors.Is().
|
||||
|
||||
var KnownErrors = []error{
|
||||
ErrGemini,
|
||||
var errGemini *GeminiError
|
||||
|
||||
var knownErrors = []error{ //nolint:gochecknoglobals
|
||||
errGemini,
|
||||
ErrGeminiLinkLineParse,
|
||||
ErrGeminiRobotsParse,
|
||||
ErrGeminiRobotsDisallowed,
|
||||
@@ -87,14 +88,14 @@ var KnownErrors = []error{
|
||||
}
|
||||
|
||||
func IsKnownError(err error) bool {
|
||||
var errGeminiStatusCode *ErrGeminiStatusCode
|
||||
if errors.As(err, &errGeminiStatusCode) {
|
||||
return true
|
||||
}
|
||||
for _, known := range KnownErrors {
|
||||
for _, known := range knownErrors {
|
||||
if errors.Is(err, known) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
// Check for wrapped errors as well
|
||||
if errors.As(err, new(*GeminiError)) {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
24
gemini/errors_test.go
Normal file
24
gemini/errors_test.go
Normal file
@@ -0,0 +1,24 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestErrGemini(t *testing.T) {
|
||||
t.Parallel()
|
||||
err := NewErrGeminiStatusCode(50, "50 server error")
|
||||
if !errors.As(err, new(*GeminiError)) {
|
||||
t.Errorf("TestErrGemini fail")
|
||||
}
|
||||
}
|
||||
|
||||
func TestErrGeminiWrapped(t *testing.T) {
|
||||
t.Parallel()
|
||||
err := NewErrGeminiStatusCode(50, "50 server error")
|
||||
errWrapped := fmt.Errorf("%w wrapped", err)
|
||||
if !errors.As(errWrapped, new(*GeminiError)) {
|
||||
t.Errorf("TestErrGeminiWrapped fail")
|
||||
}
|
||||
}
|
||||
@@ -78,7 +78,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
|
||||
|
||||
finalPath, err := calcFilePath(parentPath, urlPath)
|
||||
if err != nil {
|
||||
logging.LogError("Error saving %s: %w", s.URL, err)
|
||||
logging.LogError("GeminiError saving %s: %w", s.URL, err)
|
||||
return
|
||||
}
|
||||
// Ensure the directory exists
|
||||
@@ -93,7 +93,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
|
||||
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
|
||||
}
|
||||
if err != nil {
|
||||
logging.LogError("Error saving %s: %w", s.URL.Full, err)
|
||||
logging.LogError("GeminiError saving %s: %w", s.URL.Full, err)
|
||||
}
|
||||
close(done)
|
||||
}
|
||||
|
||||
@@ -9,14 +9,16 @@ import (
|
||||
"gemini-grc/logging"
|
||||
)
|
||||
|
||||
func ProcessGemini(snapshot *Snapshot) *Snapshot {
|
||||
func GetPageLinks(currentURL URL, gemtext string) LinkList {
|
||||
// Grab link lines
|
||||
linkLines := ExtractLinkLines(snapshot.GemText.String)
|
||||
logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines))
|
||||
|
||||
linkLines := ExtractLinkLines(gemtext)
|
||||
if len(linkLines) == 0 {
|
||||
return nil
|
||||
}
|
||||
var linkURLs LinkList
|
||||
// Normalize URLs in links, and store them in snapshot
|
||||
for _, line := range linkLines {
|
||||
normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String())
|
||||
normalizedLink, descr, err := NormalizeLink(line, currentURL.String())
|
||||
if err != nil {
|
||||
logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
|
||||
continue
|
||||
@@ -26,13 +28,10 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot {
|
||||
logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
|
||||
continue
|
||||
}
|
||||
if snapshot.Links == nil {
|
||||
snapshot.Links = &LinkList{*geminiUrl}
|
||||
} else {
|
||||
*snapshot.Links = append(*snapshot.Links, *geminiUrl)
|
||||
}
|
||||
logging.LogDebug(geminiUrl.String())
|
||||
linkURLs = append(linkURLs, *geminiUrl)
|
||||
}
|
||||
return snapshot
|
||||
return linkURLs
|
||||
}
|
||||
|
||||
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
|
||||
@@ -124,3 +123,22 @@ func ParseFirstTwoDigits(input string) (int, error) {
|
||||
|
||||
return snapshot, nil
|
||||
}
|
||||
|
||||
// extractRedirectTarget returns the redirection
|
||||
// URL by parsing the header (or error message)
|
||||
func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
|
||||
// \d+ - matches one or more digits
|
||||
// \s+ - matches one or more whitespace
|
||||
// ([^\r]+) - captures everything until it hits a \r (or end of string)
|
||||
pattern := `\d+\s+([^\r]+)`
|
||||
re := regexp.MustCompile(pattern)
|
||||
matches := re.FindStringSubmatch(input)
|
||||
if len(matches) < 2 {
|
||||
return nil, fmt.Errorf("%w: Cannot find redirect target from header %s", ErrGeminiResponseHeader, input)
|
||||
}
|
||||
newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: Cannot find redirect target from header: %w", ErrGeminiResponseHeader, err)
|
||||
}
|
||||
return newURL, nil
|
||||
}
|
||||
|
||||
47
gemini/gemini_test.go
Normal file
47
gemini/gemini_test.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestExtractRedirectTargetFullURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
||||
input := "redirect: 31 gemini://target.gr"
|
||||
result, err := extractRedirectTarget(*currentURL, input)
|
||||
if err != nil || (result.String() != "gemini://target.gr:1965") {
|
||||
t.Errorf("fail: %s", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
||||
input := "redirect: 31 /a/b"
|
||||
result, err := extractRedirectTarget(*currentURL, input)
|
||||
if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
|
||||
t.Errorf("fail: %s", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://nox.im:1965", "")
|
||||
input := "redirect: 31 ./"
|
||||
result, err := extractRedirectTarget(*currentURL, input)
|
||||
if err != nil || (result.String() != "gemini://nox.im:1965/") {
|
||||
t.Errorf("fail: %s", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractRedirectTargetWrong(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
||||
input := "redirect: 31 fsdsdf"
|
||||
result, err := extractRedirectTarget(*currentURL, input)
|
||||
fmt.Println(err)
|
||||
if result != nil || err == nil {
|
||||
t.Errorf("fail: result should be nil, err is %s", err)
|
||||
}
|
||||
}
|
||||
@@ -3,8 +3,11 @@ package gemini
|
||||
import (
|
||||
"database/sql/driver"
|
||||
"fmt"
|
||||
"gemini-grc/logging"
|
||||
"net/url"
|
||||
"path"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type URL struct {
|
||||
@@ -38,7 +41,13 @@ func (u URL) String() string {
|
||||
return u.Full
|
||||
}
|
||||
|
||||
// Value implements the driver.Valuer interface
|
||||
func (u URL) StringNoDefaultPort() string {
|
||||
if u.Port == 1965 {
|
||||
return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
|
||||
}
|
||||
return u.Full
|
||||
}
|
||||
|
||||
func (u URL) Value() (driver.Value, error) {
|
||||
if u.Full == "" {
|
||||
return nil, nil
|
||||
@@ -65,3 +74,24 @@ func ParseURL(input string, descr string) (*URL, error) {
|
||||
full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, path)
|
||||
return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: full}, nil
|
||||
}
|
||||
|
||||
func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
|
||||
logging.LogDebug("Calculating redirect URL. Current %s header string %s", currentURL, input)
|
||||
// If URL is absolute, return just it
|
||||
if strings.Contains(input, "://") {
|
||||
return ParseURL(input, "")
|
||||
}
|
||||
// input is a path. Clean it and construct
|
||||
// new path
|
||||
var newPath string
|
||||
// Handle weird cases found in the wild
|
||||
if strings.HasPrefix(input, "/") {
|
||||
newPath = path.Clean(input)
|
||||
} else if input == "./" || input == "." {
|
||||
newPath = path.Join(currentURL.Path, "/")
|
||||
} else {
|
||||
newPath = path.Join(currentURL.Path, path.Clean(input))
|
||||
}
|
||||
strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath)
|
||||
return ParseURL(strURL, "")
|
||||
}
|
||||
|
||||
103
gemini/gemini_url_test.go
Normal file
103
gemini/gemini_url_test.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
|
||||
parsed, err := ParseURL(input, "")
|
||||
value, _ := parsed.Value()
|
||||
if err != nil || !(value == "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162") {
|
||||
t.Errorf("fail: %s", parsed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL := URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b",
|
||||
Descr: "Nothing",
|
||||
Full: "gemini://smol.gr:1965/a/b",
|
||||
}
|
||||
input := "gemini://a.b/c"
|
||||
output, err := DeriveAbsoluteURL(currentURL, input)
|
||||
if err != nil {
|
||||
t.Errorf("fail: %v", err)
|
||||
}
|
||||
expected := &URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "a.b",
|
||||
Port: 1965,
|
||||
Path: "/c",
|
||||
Descr: "",
|
||||
Full: "gemini://a.b:1965/c",
|
||||
}
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL := URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b",
|
||||
Descr: "Nothing",
|
||||
Full: "gemini://smol.gr:1965/a/b",
|
||||
}
|
||||
input := "/c"
|
||||
output, err := DeriveAbsoluteURL(currentURL, input)
|
||||
if err != nil {
|
||||
t.Errorf("fail: %v", err)
|
||||
}
|
||||
expected := &URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/c",
|
||||
Descr: "",
|
||||
Full: "gemini://smol.gr:1965/c",
|
||||
}
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL := URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b",
|
||||
Descr: "Nothing",
|
||||
Full: "gemini://smol.gr:1965/a/b",
|
||||
}
|
||||
input := "c/d"
|
||||
output, err := DeriveAbsoluteURL(currentURL, input)
|
||||
if err != nil {
|
||||
t.Errorf("fail: %v", err)
|
||||
}
|
||||
expected := &URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b/c/d",
|
||||
Descr: "",
|
||||
Full: "gemini://smol.gr:1965/a/b/c/d",
|
||||
}
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
@@ -4,15 +4,18 @@ import (
|
||||
"crypto/tls"
|
||||
"errors"
|
||||
"fmt"
|
||||
"gemini-grc/logging"
|
||||
"io"
|
||||
"net"
|
||||
gourl "net/url"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gemini-grc/config"
|
||||
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
@@ -55,8 +58,7 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
||||
host := fmt.Sprintf("%s:%s", hostname, port)
|
||||
// Establish the underlying TCP connection.
|
||||
dialer := &net.Dialer{
|
||||
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
|
||||
KeepAlive: 10 * time.Second,
|
||||
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
|
||||
}
|
||||
conn, err := dialer.Dial("tcp", host)
|
||||
if err != nil {
|
||||
@@ -95,7 +97,11 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
||||
var data []byte
|
||||
|
||||
// Send Gemini request to trigger server response.
|
||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url)))
|
||||
// Fix for stupid server bug:
|
||||
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
|
||||
// when the port is 1965 and is still specified explicitely in the URL.
|
||||
_url, _ := ParseURL(url, "")
|
||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
|
||||
}
|
||||
@@ -111,9 +117,8 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
||||
if err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
} else {
|
||||
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
|
||||
}
|
||||
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
|
||||
}
|
||||
}
|
||||
return data, nil
|
||||
@@ -121,7 +126,19 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
||||
|
||||
// Visit given URL, using the Gemini protocol.
|
||||
// Mutates given Snapshot with the data.
|
||||
func Visit(s *Snapshot) error {
|
||||
// In case of error, we store the error string
|
||||
// inside snapshot and return the error.
|
||||
func Visit(s *Snapshot) (err error) {
|
||||
// Don't forget to also store error
|
||||
// response code (if we have one)
|
||||
defer func() {
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
if errors.As(err, new(*GeminiError)) {
|
||||
s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code))
|
||||
}
|
||||
}
|
||||
}()
|
||||
data, err := ConnectAndGetData(s.URL.String())
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -130,6 +147,9 @@ func Visit(s *Snapshot) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
//marshalled, _ := json.MarshalIndent(pageData, "", " ")
|
||||
//fmt.Printf("%s\n", marshalled)
|
||||
s.Header = null.StringFrom(pageData.ResponseHeader)
|
||||
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
|
||||
s.MimeType = null.StringFrom(pageData.MimeType)
|
||||
s.Lang = null.StringFrom(pageData.Lang)
|
||||
@@ -142,24 +162,21 @@ func Visit(s *Snapshot) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Update given snapshot with the
|
||||
// Gemini header data: response code,
|
||||
// mime type and lang (optional)
|
||||
// processData returne results from
|
||||
// parsing Gemini header data:
|
||||
// Code, mime type and lang (optional)
|
||||
// Returns error if header was invalid
|
||||
func processData(data []byte) (*PageData, error) {
|
||||
header, body, err := getHeadersAndData(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
code, mimeType, lang := getMimeTypeAndLang(header)
|
||||
var geminiError error
|
||||
logging.LogDebug("Header: %s", strings.TrimSpace(header))
|
||||
if code != 20 {
|
||||
geminiError = NewErrGeminiStatusCode(code, header)
|
||||
return nil, NewErrGeminiStatusCode(code, header)
|
||||
}
|
||||
fmt.Printf("%v\n", header)
|
||||
|
||||
if geminiError != nil {
|
||||
return nil, geminiError
|
||||
}
|
||||
pageData := PageData{
|
||||
ResponseCode: code,
|
||||
ResponseHeader: header,
|
||||
@@ -201,11 +218,11 @@ func getHeadersAndData(data []byte) (string, []byte, error) {
|
||||
// `20 text/gemini` (code, mimetype)
|
||||
// `31 gemini://redirected.to/other/site` (code)
|
||||
func getMimeTypeAndLang(headers string) (int, string, string) {
|
||||
// Regex that parses code, mimetype & lang
|
||||
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`)
|
||||
// Regex that parses code, mimetype & optional charset/lang parameters
|
||||
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`)
|
||||
matches := re.FindStringSubmatch(headers)
|
||||
if matches == nil || len(matches) <= 1 {
|
||||
// Try to get code at least.
|
||||
// Try to get code at least
|
||||
re := regexp.MustCompile(`^(\d+)\s+`)
|
||||
matches := re.FindStringSubmatch(headers)
|
||||
if matches == nil || len(matches) <= 1 {
|
||||
@@ -222,6 +239,6 @@ func getMimeTypeAndLang(headers string) (int, string, string) {
|
||||
return 0, "", ""
|
||||
}
|
||||
mimeType := matches[2]
|
||||
lang := matches[4]
|
||||
return code, mimeType, lang
|
||||
param := matches[3] // This will capture either charset or lang value
|
||||
return code, mimeType, param
|
||||
}
|
||||
|
||||
@@ -21,7 +21,31 @@ func TestGetMimeTypeAndLang11(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMimeTypeAndLang12(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8")
|
||||
if code != 20 || mimeType != "text/plain" || lang != "utf-8" {
|
||||
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMimeTypeAndLang13(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8")
|
||||
if code != 20 || mimeType != "text/gemini" || lang != "utf-8" {
|
||||
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetTypeAndLang2(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini charset=en")
|
||||
if code != 20 || mimeType != "text/gemini" || lang != "en" {
|
||||
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetTypeAndLang21(t *testing.T) {
|
||||
t.Parallel()
|
||||
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
|
||||
if code != 20 || mimeType != "text/gemini" || lang != "en" {
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"gemini-grc/config"
|
||||
"os"
|
||||
|
||||
"gemini-grc/logging"
|
||||
|
||||
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
@@ -33,25 +36,40 @@ func ConnectToDB() *sqlx.DB {
|
||||
return db
|
||||
}
|
||||
|
||||
func SaveSnapshotToDBIfNotExists(tx *sqlx.Tx, s *Snapshot) error {
|
||||
func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error {
|
||||
marshalled, err := json.MarshalIndent(s, "", " ")
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("JSON serialization error for %v", s))
|
||||
}
|
||||
if config.CONFIG.DryRun {
|
||||
logging.LogDebug("Would insert (if new) snapshot %s", marshalled)
|
||||
return nil
|
||||
}
|
||||
query := `
|
||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
ON CONFLICT (url) DO NOTHING
|
||||
`
|
||||
_, err := tx.NamedExec(query, s)
|
||||
_, err = tx.NamedExec(query, s)
|
||||
if err != nil {
|
||||
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
|
||||
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
|
||||
return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
|
||||
fmt.Printf("%+v", s)
|
||||
func UpsertSnapshot(id int, tx *sqlx.Tx, s *Snapshot) error {
|
||||
marshalled, err := json.MarshalIndent(s, "", " ")
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("JSON serialization error for %v", s))
|
||||
}
|
||||
if config.CONFIG.DryRun {
|
||||
logging.LogDebug("[%d] Would upsert snapshot %s", id, marshalled)
|
||||
return nil
|
||||
}
|
||||
|
||||
query := `
|
||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
ON CONFLICT (url) DO UPDATE SET
|
||||
url = EXCLUDED.url,
|
||||
host = EXCLUDED.host,
|
||||
@@ -62,23 +80,29 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
|
||||
links = EXCLUDED.links,
|
||||
lang = EXCLUDED.lang,
|
||||
response_code = EXCLUDED.response_code,
|
||||
error = EXCLUDED.error
|
||||
`
|
||||
_, err := tx.NamedExec(query, s)
|
||||
error = EXCLUDED.error`
|
||||
_, err = tx.NamedExec(query, s)
|
||||
//if err != nil {
|
||||
// logging.LogError("[%s] GeminiError upserting snapshot: %w", s.URL, err)
|
||||
// panic("This shouldn't happen")
|
||||
//}
|
||||
if err != nil {
|
||||
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
|
||||
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
|
||||
return fmt.Errorf("[%s] GeminiError upserting snapshot: %w", s.URL, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
||||
if config.CONFIG.DryRun {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe
|
||||
const batchSize = 5000
|
||||
|
||||
query := `
|
||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
ON CONFLICT (url) DO NOTHING
|
||||
`
|
||||
|
||||
@@ -92,7 +116,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
||||
|
||||
_, err := tx.NamedExec(query, batch)
|
||||
if err != nil {
|
||||
logging.LogError("Error batch inserting snapshots: %w", err)
|
||||
logging.LogError("GeminiError batch inserting snapshots: %w", err)
|
||||
return fmt.Errorf("DB error: %w", err)
|
||||
}
|
||||
}
|
||||
@@ -101,14 +125,17 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
||||
}
|
||||
|
||||
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
||||
if config.CONFIG.DryRun {
|
||||
return nil
|
||||
}
|
||||
query := `
|
||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
ON CONFLICT (url) DO NOTHING
|
||||
`
|
||||
_, err := tx.NamedExec(query, snapshots)
|
||||
if err != nil {
|
||||
logging.LogError("Error batch inserting snapshots: %w", err)
|
||||
logging.LogError("GeminiError batch inserting snapshots: %w", err)
|
||||
return fmt.Errorf("DB error: %w", err)
|
||||
}
|
||||
return nil
|
||||
|
||||
@@ -57,8 +57,8 @@ func populateBlacklist(key string) (entries []string) {
|
||||
// RobotMatch checks if the snapshot URL matches
|
||||
// a robots.txt allow rule.
|
||||
func RobotMatch(url URL) bool {
|
||||
logging.LogDebug("Checking robots.txt cache for %s", url.String())
|
||||
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
|
||||
logging.LogDebug("Checking robots.txt cache for %s", key)
|
||||
var disallowedURLs []string
|
||||
cacheEntries, ok := RobotsCache.Load(key)
|
||||
if !ok {
|
||||
|
||||
@@ -27,46 +27,17 @@ func (l *LinkList) Scan(value interface{}) error {
|
||||
}
|
||||
|
||||
type Snapshot struct {
|
||||
ID int `db:"id" json:"id,omitempty"`
|
||||
UID string `db:"uid" json:"uid,omitempty"`
|
||||
URL URL `db:"url" json:"url,omitempty"`
|
||||
Host string `db:"host" json:"host,omitempty"`
|
||||
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
|
||||
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
|
||||
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
|
||||
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
|
||||
Links *LinkList `db:"links" json:"links,omitempty"`
|
||||
Lang null.String `db:"lang" json:"lang,omitempty"`
|
||||
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
|
||||
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
|
||||
ID int `db:"id" json:"id,omitempty"`
|
||||
//UID string `db:"uid" json:"uid,omitempty"`
|
||||
URL URL `db:"url" json:"url,omitempty"`
|
||||
Host string `db:"host" json:"host,omitempty"`
|
||||
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
|
||||
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
|
||||
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
|
||||
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
|
||||
Header null.String `db:"header" json:"header,omitempty"` // Response header.
|
||||
Links null.Value[LinkList] `db:"links" json:"links,omitempty"`
|
||||
Lang null.String `db:"lang" json:"lang,omitempty"`
|
||||
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
|
||||
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
|
||||
}
|
||||
|
||||
//func SnapshotToJSON(g Snapshot) string {
|
||||
// // Serialize the Person struct to JSON
|
||||
// jsonData, err := json.MarshalIndent(g, "", "\t")
|
||||
// if err != nil {
|
||||
// logging.LogError("Error serializing to JSON: %w", err)
|
||||
// }
|
||||
// return string(jsonData)
|
||||
//}
|
||||
//
|
||||
//func SnapshotFromJSON(input string) Snapshot {
|
||||
// var snapshot Snapshot
|
||||
// err := json.Unmarshal([]byte(input), &snapshot)
|
||||
// if err != nil {
|
||||
// logging.LogError("Error deserializing from JSON: %w", err)
|
||||
// }
|
||||
// return snapshot
|
||||
//}
|
||||
//
|
||||
//func ShouldPersistSnapshot(result *Snapshot) bool {
|
||||
// if !result.MimeType.Valid {
|
||||
// return false
|
||||
// }
|
||||
// if result.MimeType.String == "text/gemini" ||
|
||||
// strings.HasPrefix(result.MimeType.String, "image/") ||
|
||||
// strings.HasPrefix(result.MimeType.String, "text/") {
|
||||
// return true
|
||||
// }
|
||||
// return false
|
||||
//}
|
||||
|
||||
250
gemini/worker.go
250
gemini/worker.go
@@ -3,14 +3,13 @@ package gemini
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/logging"
|
||||
"gemini-grc/uid"
|
||||
"gemini-grc/util"
|
||||
|
||||
"github.com/guregu/null/v5"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
@@ -27,12 +26,13 @@ func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
|
||||
}
|
||||
|
||||
func RunWorker(id int, db *sqlx.DB, url *string) {
|
||||
// Start the DB transaction
|
||||
// Each worker runs within a DB transaction.
|
||||
tx, err := db.Beginx()
|
||||
if err != nil {
|
||||
logging.LogError("Failed to begin transaction: %w", err)
|
||||
}
|
||||
|
||||
// Commit/rollback at the end
|
||||
defer func() {
|
||||
err = tx.Commit()
|
||||
if err != nil {
|
||||
@@ -46,73 +46,57 @@ func RunWorker(id int, db *sqlx.DB, url *string) {
|
||||
|
||||
var snapshots []Snapshot
|
||||
|
||||
// If not given a specific URL,
|
||||
// get some random ones to visit from DB.
|
||||
if url == nil {
|
||||
snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
|
||||
} else {
|
||||
snapshots, err = GetSnapshotFromURL(tx, *url)
|
||||
if len(snapshots) == 0 {
|
||||
snapshotURL, err := ParseURL(*url, "")
|
||||
if err != nil {
|
||||
panic("Invalid URL: " + *url)
|
||||
}
|
||||
snapshots = []Snapshot{{
|
||||
UID: uid.UID(),
|
||||
URL: *snapshotURL,
|
||||
Host: snapshotURL.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}}
|
||||
if err != nil {
|
||||
logging.LogError("[%d] GeminiError retrieving snapshot: %w", id, err)
|
||||
panic("This should never happen")
|
||||
} else if len(snapshots) == 0 {
|
||||
logging.LogInfo("[%d] No snapshots to visit.", id)
|
||||
time.Sleep(1 * time.Minute)
|
||||
return
|
||||
}
|
||||
} else {
|
||||
snapshotURL, err := ParseURL(*url, "")
|
||||
if err != nil {
|
||||
logging.LogError("Invalid URL given: " + *url)
|
||||
return
|
||||
|
||||
}
|
||||
snapshots = []Snapshot{{
|
||||
//UID: uid.UID(),
|
||||
URL: *snapshotURL,
|
||||
Host: snapshotURL.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logging.LogError("[%d] Error retrieving snapshot: %w", id, err)
|
||||
time.Sleep(10 * time.Second)
|
||||
return
|
||||
} else if len(snapshots) == 0 {
|
||||
logging.LogInfo("[%d] No snapshots to visit.", id)
|
||||
time.Sleep(1 * time.Minute)
|
||||
return
|
||||
}
|
||||
// Start visiting URLs.
|
||||
total := len(snapshots)
|
||||
for i, s := range snapshots {
|
||||
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
|
||||
// We differentiate between errors:
|
||||
// Unexpected errors are the ones returned from the following function.
|
||||
// If an error is unexpected (which should never happen) we panic.
|
||||
// Expected errors are stored as strings within the snapshot,
|
||||
// so that they can also be stored in DB.
|
||||
err = workOnSnapshot(id, tx, &s)
|
||||
if err != nil {
|
||||
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL.String(), err)
|
||||
logging.LogError("[%d] [%s] Unexpected GeminiError %w", id, s.URL.String(), err)
|
||||
util.PrintStackAndPanic(err)
|
||||
}
|
||||
if s.Error.Valid {
|
||||
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL.String(), s.Error.String)
|
||||
logging.LogWarn("[%d] Error: %v", id, s.Error.String)
|
||||
}
|
||||
logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
|
||||
}
|
||||
logging.LogInfo("[%d] Worker done.", id)
|
||||
}
|
||||
|
||||
func handleRedirection(tx *sqlx.Tx, s *Snapshot) error {
|
||||
re := regexp.MustCompile(`gemini://\S+`)
|
||||
matches := re.FindStringSubmatch(s.Error.ValueOrZero())
|
||||
if len(matches) == 1 {
|
||||
newURL := matches[0]
|
||||
logging.LogDebug("Page redirects to %s", newURL)
|
||||
_url, err := ParseURL(newURL, "")
|
||||
// Insert fresh snapshot with new URL
|
||||
if err == nil {
|
||||
snapshot := &Snapshot{
|
||||
UID: uid.UID(),
|
||||
URL: *_url,
|
||||
Host: _url.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}
|
||||
err := SaveSnapshotToDBIfNotExists(tx, snapshot)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// workOnSnapshot visits a URL and stores the result.
|
||||
// errors should be returned only if they are unexpected.
|
||||
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
if IsBlacklisted(s.URL) {
|
||||
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
|
||||
@@ -123,31 +107,26 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
// add it as an error so next time it won't be
|
||||
// crawled.
|
||||
if RobotMatch(s.URL) {
|
||||
s.Error = null.StringFrom("robots.txt disallow match")
|
||||
err = SaveSnapshotToDB(tx, s)
|
||||
s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
|
||||
err = UpsertSnapshot(id, tx, s)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
return fmt.Errorf("[%d] %w", id, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Resolve IP address via DNS
|
||||
IPs, err := getHostIPAddresses(s.Host)
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom("DNS Resolve error")
|
||||
err = SaveSnapshotToDB(tx, s)
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
err = UpsertSnapshot(id, tx, s)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
return fmt.Errorf("[%d] %w", id, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
defer func() {
|
||||
time.Sleep(5 * time.Second)
|
||||
RemoveIPsFromPool(IPs)
|
||||
}()
|
||||
|
||||
// If the host's ip is in the connections pool,
|
||||
// stop and add the url in the queue later.
|
||||
// If the host's ip is in the connections pool we stop
|
||||
IpPool.Lock.RLock()
|
||||
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
|
||||
for _, ip := range IPs {
|
||||
@@ -155,7 +134,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
if ok {
|
||||
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
|
||||
IpPool.Lock.RUnlock()
|
||||
time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
|
||||
time.Sleep(1 * time.Second) // Avoid flood-retrying
|
||||
return nil
|
||||
}
|
||||
}
|
||||
@@ -163,84 +142,115 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
|
||||
AddIPsToPool(IPs)
|
||||
|
||||
// After finishing, remove the host IPs from
|
||||
// the connections pool, with a small delay
|
||||
// to avoid potentially hitting the same IP quickly.
|
||||
defer func() {
|
||||
time.Sleep(5 * time.Second)
|
||||
RemoveIPsFromPool(IPs)
|
||||
}()
|
||||
|
||||
url := s.URL.String()
|
||||
logging.LogDebug("[%d] Dialing %s", id, url)
|
||||
|
||||
err = Visit(s)
|
||||
if err != nil {
|
||||
if !IsKnownError(err) {
|
||||
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
|
||||
if config.CONFIG.PanicOnUnexpectedError {
|
||||
util.PrintStackAndPanic(err)
|
||||
}
|
||||
} else {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
return err
|
||||
}
|
||||
if errors.As(err, new(*ErrGeminiStatusCode)) {
|
||||
err = handleRedirection(tx, s)
|
||||
// Check if error is redirection, and handle it
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
if errors.As(err, new(*GeminiError)) &&
|
||||
err.(*GeminiError).Msg == "redirect" {
|
||||
err = handleRedirection(id, tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
logging.LogDebug("[%d] Finished dialing.", id)
|
||||
logging.LogInfo("[%d] Done, response code %d.", id, s.ResponseCode.ValueOrZero())
|
||||
|
||||
// If this is a gemini page, parse possible links inside
|
||||
if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
|
||||
logging.LogDebug("[%d] [%s] Processing", id, url)
|
||||
s = ProcessGemini(s)
|
||||
links := GetPageLinks(s.URL, s.GemText.String)
|
||||
logging.LogDebug("[%d] Found %d links", id, len(links))
|
||||
if len(links) > 0 {
|
||||
s.Links = null.ValueFrom(links)
|
||||
}
|
||||
} else {
|
||||
logging.LogDebug("[%d] Not looking for page links", id)
|
||||
}
|
||||
logging.LogDebug("[%d] Saving", id)
|
||||
err = SaveSnapshotToDB(tx, s)
|
||||
|
||||
err = UpsertSnapshot(id, tx, s)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
return err
|
||||
}
|
||||
|
||||
// Store links in batch
|
||||
if s.Links != nil {
|
||||
var batchSnapshots []*Snapshot
|
||||
timestamp := null.TimeFrom(time.Now())
|
||||
err = storeLinks(tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, link := range *s.Links {
|
||||
if shouldPersistURL(tx, link) {
|
||||
func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
|
||||
if s.Links.Valid {
|
||||
var batchSnapshots []*Snapshot
|
||||
for _, link := range s.Links.ValueOrZero() {
|
||||
if shouldPersistURL(link) {
|
||||
newSnapshot := &Snapshot{
|
||||
UID: uid.UID(),
|
||||
//UID: uid.UID(),
|
||||
URL: link,
|
||||
Host: link.Hostname,
|
||||
Timestamp: timestamp,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}
|
||||
batchSnapshots = append(batchSnapshots, newSnapshot)
|
||||
}
|
||||
}
|
||||
|
||||
if len(batchSnapshots) > 0 {
|
||||
logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
|
||||
err = SaveLinksToDBinBatches(tx, batchSnapshots)
|
||||
err := SaveLinksToDBinBatches(tx, batchSnapshots)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Should we save the given URL for crawling?
|
||||
func shouldPersistURL(tx *sqlx.Tx, u URL) bool {
|
||||
if !strings.HasPrefix(u.String(), "gemini://") {
|
||||
return false
|
||||
}
|
||||
query := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)`
|
||||
var exists bool
|
||||
err := tx.Get(&exists, query, u.String())
|
||||
// shouldPersistURL returns true if we
|
||||
// should save the URL in the DB.
|
||||
// Only gemini:// urls are saved.
|
||||
func shouldPersistURL(u URL) bool {
|
||||
return strings.HasPrefix(u.String(), "gemini://")
|
||||
}
|
||||
|
||||
func handleRedirection(id int, tx *sqlx.Tx, s *Snapshot) error {
|
||||
newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
|
||||
if err != nil {
|
||||
fmt.Println("Error executing query:", err)
|
||||
return false
|
||||
return err
|
||||
}
|
||||
return !exists
|
||||
logging.LogDebug("[%d] Page redirects to %s", id, newURL)
|
||||
// Insert fresh snapshot with new URL
|
||||
snapshot := &Snapshot{
|
||||
//UID: uid.UID(),
|
||||
URL: *newURL,
|
||||
Host: newURL.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}
|
||||
logging.LogDebug("[%d] Saving empty snapshot for %s", id, snapshot.URL.String())
|
||||
err = SaveSnapshotIfNew(tx, snapshot)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
||||
// Old, unoptimized query
|
||||
//
|
||||
// query := `
|
||||
|
||||
// query := `
|
||||
// SELECT DISTINCT ON (host) *
|
||||
// FROM snapshots
|
||||
// WHERE response_code IS NULL
|
||||
@@ -249,20 +259,28 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
||||
// LIMIT $1
|
||||
// `
|
||||
query := `
|
||||
WITH RankedSnapshots AS (
|
||||
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
|
||||
links, lang, response_code, error,
|
||||
ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
|
||||
FROM snapshots
|
||||
WHERE response_code IS NULL
|
||||
AND error IS NULL
|
||||
)
|
||||
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
|
||||
links, lang, response_code, error
|
||||
FROM RankedSnapshots
|
||||
WHERE rn = 1
|
||||
LIMIT $1
|
||||
`
|
||||
SELECT *
|
||||
FROM snapshots
|
||||
WHERE response_code IS NULL
|
||||
AND error IS NULL
|
||||
ORDER BY RANDOM()
|
||||
LIMIT $1
|
||||
`
|
||||
//query := `
|
||||
// WITH RankedSnapshots AS (
|
||||
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
|
||||
// links, lang, response_code, error,
|
||||
// ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
|
||||
// FROM snapshots
|
||||
// WHERE response_code IS NULL
|
||||
// AND error IS NULL
|
||||
// )
|
||||
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
|
||||
// links, lang, response_code, error
|
||||
// FROM RankedSnapshots
|
||||
// WHERE rn = 1
|
||||
// LIMIT $1
|
||||
//`
|
||||
var snapshots []Snapshot
|
||||
err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
|
||||
if err != nil {
|
||||
|
||||
21
main.go
21
main.go
@@ -1,16 +1,16 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/gemini"
|
||||
"gemini-grc/http"
|
||||
"gemini-grc/logging"
|
||||
"github.com/jmoiron/sqlx"
|
||||
"github.com/rs/zerolog"
|
||||
zlog "github.com/rs/zerolog/log"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -26,9 +26,10 @@ func main() {
|
||||
|
||||
func runApp() error {
|
||||
logging.LogInfo("Starting up. Press Ctrl+C to exit")
|
||||
sigs := make(chan os.Signal, 1)
|
||||
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
|
||||
signals := make(chan os.Signal, 1)
|
||||
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
|
||||
|
||||
server := http.CreateServer("localhost:8899")
|
||||
db := gemini.ConnectToDB()
|
||||
|
||||
// !!! DANGER !!!
|
||||
@@ -44,7 +45,8 @@ func runApp() error {
|
||||
}(db)
|
||||
|
||||
gemini.LoadBlacklist()
|
||||
|
||||
// If there's an argument, assume it's a URL
|
||||
// to visit and ignore database state.
|
||||
if len(os.Args) > 1 {
|
||||
url := os.Args[1]
|
||||
go gemini.RunWorker(0, db, &url)
|
||||
@@ -52,7 +54,10 @@ func runApp() error {
|
||||
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
|
||||
}
|
||||
|
||||
<-sigs
|
||||
<-signals
|
||||
if err := server.Close(); err != nil {
|
||||
logging.LogError("GeminiError during server shutdown: %s", err)
|
||||
}
|
||||
logging.LogInfo("Received SIGINT or SIGTERM signal, exiting")
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user