Better error handling, many fixes all around
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,6 +1,8 @@
|
|||||||
.idea
|
.idea
|
||||||
|
.goroot
|
||||||
**/.#*
|
**/.#*
|
||||||
**/*~
|
**/*~
|
||||||
|
/.go
|
||||||
/cmd
|
/cmd
|
||||||
/db/initdb.sql
|
/db/initdb.sql
|
||||||
/db/*sh
|
/db/*sh
|
||||||
|
|||||||
@@ -5,13 +5,13 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ErrGeminiStatusCode struct {
|
type GeminiError struct {
|
||||||
Msg string
|
Msg string
|
||||||
Code int
|
Code int
|
||||||
Header string
|
Header string
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *ErrGeminiStatusCode) Error() string {
|
func (e *GeminiError) Error() string {
|
||||||
return fmt.Sprintf("%s: %s", e.Msg, e.Header)
|
return fmt.Sprintf("%s: %s", e.Msg, e.Header)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -31,7 +31,7 @@ func NewErrGeminiStatusCode(code int, header string) error {
|
|||||||
default:
|
default:
|
||||||
msg = "unexpected status code"
|
msg = "unexpected status code"
|
||||||
}
|
}
|
||||||
return &ErrGeminiStatusCode{
|
return &GeminiError{
|
||||||
Msg: msg,
|
Msg: msg,
|
||||||
Code: code,
|
Code: code,
|
||||||
Header: header,
|
Header: header,
|
||||||
@@ -39,7 +39,6 @@ func NewErrGeminiStatusCode(code int, header string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
ErrGemini = errors.New("gemini general error")
|
|
||||||
ErrGeminiRobotsParse = errors.New("gemini robots.txt parse error")
|
ErrGeminiRobotsParse = errors.New("gemini robots.txt parse error")
|
||||||
ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
|
ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
|
||||||
ErrGeminiResponseHeader = errors.New("gemini response header error")
|
ErrGeminiResponseHeader = errors.New("gemini response header error")
|
||||||
@@ -64,8 +63,10 @@ var (
|
|||||||
// we would lose ability to check wrapped
|
// we would lose ability to check wrapped
|
||||||
// errors via errors.Is().
|
// errors via errors.Is().
|
||||||
|
|
||||||
var KnownErrors = []error{
|
var errGemini *GeminiError
|
||||||
ErrGemini,
|
|
||||||
|
var knownErrors = []error{ //nolint:gochecknoglobals
|
||||||
|
errGemini,
|
||||||
ErrGeminiLinkLineParse,
|
ErrGeminiLinkLineParse,
|
||||||
ErrGeminiRobotsParse,
|
ErrGeminiRobotsParse,
|
||||||
ErrGeminiRobotsDisallowed,
|
ErrGeminiRobotsDisallowed,
|
||||||
@@ -87,14 +88,14 @@ var KnownErrors = []error{
|
|||||||
}
|
}
|
||||||
|
|
||||||
func IsKnownError(err error) bool {
|
func IsKnownError(err error) bool {
|
||||||
var errGeminiStatusCode *ErrGeminiStatusCode
|
for _, known := range knownErrors {
|
||||||
if errors.As(err, &errGeminiStatusCode) {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
for _, known := range KnownErrors {
|
|
||||||
if errors.Is(err, known) {
|
if errors.Is(err, known) {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Check for wrapped errors as well
|
||||||
|
if errors.As(err, new(*GeminiError)) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
24
gemini/errors_test.go
Normal file
24
gemini/errors_test.go
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
package gemini
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestErrGemini(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
err := NewErrGeminiStatusCode(50, "50 server error")
|
||||||
|
if !errors.As(err, new(*GeminiError)) {
|
||||||
|
t.Errorf("TestErrGemini fail")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestErrGeminiWrapped(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
err := NewErrGeminiStatusCode(50, "50 server error")
|
||||||
|
errWrapped := fmt.Errorf("%w wrapped", err)
|
||||||
|
if !errors.As(errWrapped, new(*GeminiError)) {
|
||||||
|
t.Errorf("TestErrGeminiWrapped fail")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -78,7 +78,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
|
|||||||
|
|
||||||
finalPath, err := calcFilePath(parentPath, urlPath)
|
finalPath, err := calcFilePath(parentPath, urlPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("Error saving %s: %w", s.URL, err)
|
logging.LogError("GeminiError saving %s: %w", s.URL, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Ensure the directory exists
|
// Ensure the directory exists
|
||||||
@@ -93,7 +93,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
|
|||||||
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
|
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666)
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("Error saving %s: %w", s.URL.Full, err)
|
logging.LogError("GeminiError saving %s: %w", s.URL.Full, err)
|
||||||
}
|
}
|
||||||
close(done)
|
close(done)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,14 +9,16 @@ import (
|
|||||||
"gemini-grc/logging"
|
"gemini-grc/logging"
|
||||||
)
|
)
|
||||||
|
|
||||||
func ProcessGemini(snapshot *Snapshot) *Snapshot {
|
func GetPageLinks(currentURL URL, gemtext string) LinkList {
|
||||||
// Grab link lines
|
// Grab link lines
|
||||||
linkLines := ExtractLinkLines(snapshot.GemText.String)
|
linkLines := ExtractLinkLines(gemtext)
|
||||||
logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines))
|
if len(linkLines) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var linkURLs LinkList
|
||||||
// Normalize URLs in links, and store them in snapshot
|
// Normalize URLs in links, and store them in snapshot
|
||||||
for _, line := range linkLines {
|
for _, line := range linkLines {
|
||||||
normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String())
|
normalizedLink, descr, err := NormalizeLink(line, currentURL.String())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
|
logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err)
|
||||||
continue
|
continue
|
||||||
@@ -26,13 +28,10 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot {
|
|||||||
logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
|
logging.LogDebug("Cannot parse URL in link '%s': %v", line, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if snapshot.Links == nil {
|
logging.LogDebug(geminiUrl.String())
|
||||||
snapshot.Links = &LinkList{*geminiUrl}
|
linkURLs = append(linkURLs, *geminiUrl)
|
||||||
} else {
|
|
||||||
*snapshot.Links = append(*snapshot.Links, *geminiUrl)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return snapshot
|
return linkURLs
|
||||||
}
|
}
|
||||||
|
|
||||||
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
|
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
|
||||||
@@ -124,3 +123,22 @@ func ParseFirstTwoDigits(input string) (int, error) {
|
|||||||
|
|
||||||
return snapshot, nil
|
return snapshot, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// extractRedirectTarget returns the redirection
|
||||||
|
// URL by parsing the header (or error message)
|
||||||
|
func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
|
||||||
|
// \d+ - matches one or more digits
|
||||||
|
// \s+ - matches one or more whitespace
|
||||||
|
// ([^\r]+) - captures everything until it hits a \r (or end of string)
|
||||||
|
pattern := `\d+\s+([^\r]+)`
|
||||||
|
re := regexp.MustCompile(pattern)
|
||||||
|
matches := re.FindStringSubmatch(input)
|
||||||
|
if len(matches) < 2 {
|
||||||
|
return nil, fmt.Errorf("%w: Cannot find redirect target from header %s", ErrGeminiResponseHeader, input)
|
||||||
|
}
|
||||||
|
newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("%w: Cannot find redirect target from header: %w", ErrGeminiResponseHeader, err)
|
||||||
|
}
|
||||||
|
return newURL, nil
|
||||||
|
}
|
||||||
|
|||||||
47
gemini/gemini_test.go
Normal file
47
gemini/gemini_test.go
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
package gemini
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestExtractRedirectTargetFullURL(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
||||||
|
input := "redirect: 31 gemini://target.gr"
|
||||||
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
|
if err != nil || (result.String() != "gemini://target.gr:1965") {
|
||||||
|
t.Errorf("fail: %s", result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
||||||
|
input := "redirect: 31 /a/b"
|
||||||
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
|
if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
|
||||||
|
t.Errorf("fail: %s", result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
currentURL, _ := ParseURL("gemini://nox.im:1965", "")
|
||||||
|
input := "redirect: 31 ./"
|
||||||
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
|
if err != nil || (result.String() != "gemini://nox.im:1965/") {
|
||||||
|
t.Errorf("fail: %s", result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExtractRedirectTargetWrong(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
||||||
|
input := "redirect: 31 fsdsdf"
|
||||||
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
|
fmt.Println(err)
|
||||||
|
if result != nil || err == nil {
|
||||||
|
t.Errorf("fail: result should be nil, err is %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -3,8 +3,11 @@ package gemini
|
|||||||
import (
|
import (
|
||||||
"database/sql/driver"
|
"database/sql/driver"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/logging"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
"path"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
type URL struct {
|
type URL struct {
|
||||||
@@ -38,7 +41,13 @@ func (u URL) String() string {
|
|||||||
return u.Full
|
return u.Full
|
||||||
}
|
}
|
||||||
|
|
||||||
// Value implements the driver.Valuer interface
|
func (u URL) StringNoDefaultPort() string {
|
||||||
|
if u.Port == 1965 {
|
||||||
|
return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
|
||||||
|
}
|
||||||
|
return u.Full
|
||||||
|
}
|
||||||
|
|
||||||
func (u URL) Value() (driver.Value, error) {
|
func (u URL) Value() (driver.Value, error) {
|
||||||
if u.Full == "" {
|
if u.Full == "" {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
@@ -65,3 +74,24 @@ func ParseURL(input string, descr string) (*URL, error) {
|
|||||||
full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, path)
|
full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, path)
|
||||||
return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: full}, nil
|
return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: full}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
|
||||||
|
logging.LogDebug("Calculating redirect URL. Current %s header string %s", currentURL, input)
|
||||||
|
// If URL is absolute, return just it
|
||||||
|
if strings.Contains(input, "://") {
|
||||||
|
return ParseURL(input, "")
|
||||||
|
}
|
||||||
|
// input is a path. Clean it and construct
|
||||||
|
// new path
|
||||||
|
var newPath string
|
||||||
|
// Handle weird cases found in the wild
|
||||||
|
if strings.HasPrefix(input, "/") {
|
||||||
|
newPath = path.Clean(input)
|
||||||
|
} else if input == "./" || input == "." {
|
||||||
|
newPath = path.Join(currentURL.Path, "/")
|
||||||
|
} else {
|
||||||
|
newPath = path.Join(currentURL.Path, path.Clean(input))
|
||||||
|
}
|
||||||
|
strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath)
|
||||||
|
return ParseURL(strURL, "")
|
||||||
|
}
|
||||||
|
|||||||
103
gemini/gemini_url_test.go
Normal file
103
gemini/gemini_url_test.go
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
package gemini
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseURL(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
|
||||||
|
parsed, err := ParseURL(input, "")
|
||||||
|
value, _ := parsed.Value()
|
||||||
|
if err != nil || !(value == "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162") {
|
||||||
|
t.Errorf("fail: %s", parsed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
currentURL := URL{
|
||||||
|
Protocol: "gemini",
|
||||||
|
Hostname: "smol.gr",
|
||||||
|
Port: 1965,
|
||||||
|
Path: "/a/b",
|
||||||
|
Descr: "Nothing",
|
||||||
|
Full: "gemini://smol.gr:1965/a/b",
|
||||||
|
}
|
||||||
|
input := "gemini://a.b/c"
|
||||||
|
output, err := DeriveAbsoluteURL(currentURL, input)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("fail: %v", err)
|
||||||
|
}
|
||||||
|
expected := &URL{
|
||||||
|
Protocol: "gemini",
|
||||||
|
Hostname: "a.b",
|
||||||
|
Port: 1965,
|
||||||
|
Path: "/c",
|
||||||
|
Descr: "",
|
||||||
|
Full: "gemini://a.b:1965/c",
|
||||||
|
}
|
||||||
|
pass := reflect.DeepEqual(output, expected)
|
||||||
|
if !pass {
|
||||||
|
t.Errorf("fail: %#v != %#v", output, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
currentURL := URL{
|
||||||
|
Protocol: "gemini",
|
||||||
|
Hostname: "smol.gr",
|
||||||
|
Port: 1965,
|
||||||
|
Path: "/a/b",
|
||||||
|
Descr: "Nothing",
|
||||||
|
Full: "gemini://smol.gr:1965/a/b",
|
||||||
|
}
|
||||||
|
input := "/c"
|
||||||
|
output, err := DeriveAbsoluteURL(currentURL, input)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("fail: %v", err)
|
||||||
|
}
|
||||||
|
expected := &URL{
|
||||||
|
Protocol: "gemini",
|
||||||
|
Hostname: "smol.gr",
|
||||||
|
Port: 1965,
|
||||||
|
Path: "/c",
|
||||||
|
Descr: "",
|
||||||
|
Full: "gemini://smol.gr:1965/c",
|
||||||
|
}
|
||||||
|
pass := reflect.DeepEqual(output, expected)
|
||||||
|
if !pass {
|
||||||
|
t.Errorf("fail: %#v != %#v", output, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
currentURL := URL{
|
||||||
|
Protocol: "gemini",
|
||||||
|
Hostname: "smol.gr",
|
||||||
|
Port: 1965,
|
||||||
|
Path: "/a/b",
|
||||||
|
Descr: "Nothing",
|
||||||
|
Full: "gemini://smol.gr:1965/a/b",
|
||||||
|
}
|
||||||
|
input := "c/d"
|
||||||
|
output, err := DeriveAbsoluteURL(currentURL, input)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("fail: %v", err)
|
||||||
|
}
|
||||||
|
expected := &URL{
|
||||||
|
Protocol: "gemini",
|
||||||
|
Hostname: "smol.gr",
|
||||||
|
Port: 1965,
|
||||||
|
Path: "/a/b/c/d",
|
||||||
|
Descr: "",
|
||||||
|
Full: "gemini://smol.gr:1965/a/b/c/d",
|
||||||
|
}
|
||||||
|
pass := reflect.DeepEqual(output, expected)
|
||||||
|
if !pass {
|
||||||
|
t.Errorf("fail: %#v != %#v", output, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -4,15 +4,18 @@ import (
|
|||||||
"crypto/tls"
|
"crypto/tls"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/logging"
|
||||||
"io"
|
"io"
|
||||||
"net"
|
"net"
|
||||||
gourl "net/url"
|
gourl "net/url"
|
||||||
"regexp"
|
"regexp"
|
||||||
"slices"
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"gemini-grc/config"
|
"gemini-grc/config"
|
||||||
|
|
||||||
"github.com/guregu/null/v5"
|
"github.com/guregu/null/v5"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -55,8 +58,7 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
host := fmt.Sprintf("%s:%s", hostname, port)
|
host := fmt.Sprintf("%s:%s", hostname, port)
|
||||||
// Establish the underlying TCP connection.
|
// Establish the underlying TCP connection.
|
||||||
dialer := &net.Dialer{
|
dialer := &net.Dialer{
|
||||||
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
|
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
|
||||||
KeepAlive: 10 * time.Second,
|
|
||||||
}
|
}
|
||||||
conn, err := dialer.Dial("tcp", host)
|
conn, err := dialer.Dial("tcp", host)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -95,7 +97,11 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
var data []byte
|
var data []byte
|
||||||
|
|
||||||
// Send Gemini request to trigger server response.
|
// Send Gemini request to trigger server response.
|
||||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url)))
|
// Fix for stupid server bug:
|
||||||
|
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
|
||||||
|
// when the port is 1965 and is still specified explicitely in the URL.
|
||||||
|
_url, _ := ParseURL(url, "")
|
||||||
|
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
|
return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
|
||||||
}
|
}
|
||||||
@@ -111,9 +117,8 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
if errors.Is(err, io.EOF) {
|
if errors.Is(err, io.EOF) {
|
||||||
break
|
break
|
||||||
} else {
|
|
||||||
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
|
|
||||||
}
|
}
|
||||||
|
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return data, nil
|
return data, nil
|
||||||
@@ -121,7 +126,19 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
|
|
||||||
// Visit given URL, using the Gemini protocol.
|
// Visit given URL, using the Gemini protocol.
|
||||||
// Mutates given Snapshot with the data.
|
// Mutates given Snapshot with the data.
|
||||||
func Visit(s *Snapshot) error {
|
// In case of error, we store the error string
|
||||||
|
// inside snapshot and return the error.
|
||||||
|
func Visit(s *Snapshot) (err error) {
|
||||||
|
// Don't forget to also store error
|
||||||
|
// response code (if we have one)
|
||||||
|
defer func() {
|
||||||
|
if err != nil {
|
||||||
|
s.Error = null.StringFrom(err.Error())
|
||||||
|
if errors.As(err, new(*GeminiError)) {
|
||||||
|
s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
data, err := ConnectAndGetData(s.URL.String())
|
data, err := ConnectAndGetData(s.URL.String())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -130,6 +147,9 @@ func Visit(s *Snapshot) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
//marshalled, _ := json.MarshalIndent(pageData, "", " ")
|
||||||
|
//fmt.Printf("%s\n", marshalled)
|
||||||
|
s.Header = null.StringFrom(pageData.ResponseHeader)
|
||||||
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
|
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
|
||||||
s.MimeType = null.StringFrom(pageData.MimeType)
|
s.MimeType = null.StringFrom(pageData.MimeType)
|
||||||
s.Lang = null.StringFrom(pageData.Lang)
|
s.Lang = null.StringFrom(pageData.Lang)
|
||||||
@@ -142,24 +162,21 @@ func Visit(s *Snapshot) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update given snapshot with the
|
// processData returne results from
|
||||||
// Gemini header data: response code,
|
// parsing Gemini header data:
|
||||||
// mime type and lang (optional)
|
// Code, mime type and lang (optional)
|
||||||
|
// Returns error if header was invalid
|
||||||
func processData(data []byte) (*PageData, error) {
|
func processData(data []byte) (*PageData, error) {
|
||||||
header, body, err := getHeadersAndData(data)
|
header, body, err := getHeadersAndData(data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
code, mimeType, lang := getMimeTypeAndLang(header)
|
code, mimeType, lang := getMimeTypeAndLang(header)
|
||||||
var geminiError error
|
logging.LogDebug("Header: %s", strings.TrimSpace(header))
|
||||||
if code != 20 {
|
if code != 20 {
|
||||||
geminiError = NewErrGeminiStatusCode(code, header)
|
return nil, NewErrGeminiStatusCode(code, header)
|
||||||
}
|
}
|
||||||
fmt.Printf("%v\n", header)
|
|
||||||
|
|
||||||
if geminiError != nil {
|
|
||||||
return nil, geminiError
|
|
||||||
}
|
|
||||||
pageData := PageData{
|
pageData := PageData{
|
||||||
ResponseCode: code,
|
ResponseCode: code,
|
||||||
ResponseHeader: header,
|
ResponseHeader: header,
|
||||||
@@ -201,11 +218,11 @@ func getHeadersAndData(data []byte) (string, []byte, error) {
|
|||||||
// `20 text/gemini` (code, mimetype)
|
// `20 text/gemini` (code, mimetype)
|
||||||
// `31 gemini://redirected.to/other/site` (code)
|
// `31 gemini://redirected.to/other/site` (code)
|
||||||
func getMimeTypeAndLang(headers string) (int, string, string) {
|
func getMimeTypeAndLang(headers string) (int, string, string) {
|
||||||
// Regex that parses code, mimetype & lang
|
// Regex that parses code, mimetype & optional charset/lang parameters
|
||||||
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`)
|
re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`)
|
||||||
matches := re.FindStringSubmatch(headers)
|
matches := re.FindStringSubmatch(headers)
|
||||||
if matches == nil || len(matches) <= 1 {
|
if matches == nil || len(matches) <= 1 {
|
||||||
// Try to get code at least.
|
// Try to get code at least
|
||||||
re := regexp.MustCompile(`^(\d+)\s+`)
|
re := regexp.MustCompile(`^(\d+)\s+`)
|
||||||
matches := re.FindStringSubmatch(headers)
|
matches := re.FindStringSubmatch(headers)
|
||||||
if matches == nil || len(matches) <= 1 {
|
if matches == nil || len(matches) <= 1 {
|
||||||
@@ -222,6 +239,6 @@ func getMimeTypeAndLang(headers string) (int, string, string) {
|
|||||||
return 0, "", ""
|
return 0, "", ""
|
||||||
}
|
}
|
||||||
mimeType := matches[2]
|
mimeType := matches[2]
|
||||||
lang := matches[4]
|
param := matches[3] // This will capture either charset or lang value
|
||||||
return code, mimeType, lang
|
return code, mimeType, param
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -21,7 +21,31 @@ func TestGetMimeTypeAndLang11(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGetMimeTypeAndLang12(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8")
|
||||||
|
if code != 20 || mimeType != "text/plain" || lang != "utf-8" {
|
||||||
|
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGetMimeTypeAndLang13(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8")
|
||||||
|
if code != 20 || mimeType != "text/gemini" || lang != "utf-8" {
|
||||||
|
t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestGetTypeAndLang2(t *testing.T) {
|
func TestGetTypeAndLang2(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini charset=en")
|
||||||
|
if code != 20 || mimeType != "text/gemini" || lang != "en" {
|
||||||
|
t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGetTypeAndLang21(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
|
code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en")
|
||||||
if code != 20 || mimeType != "text/gemini" || lang != "en" {
|
if code != 20 || mimeType != "text/gemini" || lang != "en" {
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
package gemini
|
package gemini
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/config"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
"gemini-grc/logging"
|
"gemini-grc/logging"
|
||||||
|
|
||||||
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
|
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
|
||||||
"github.com/jmoiron/sqlx"
|
"github.com/jmoiron/sqlx"
|
||||||
)
|
)
|
||||||
@@ -33,25 +36,40 @@ func ConnectToDB() *sqlx.DB {
|
|||||||
return db
|
return db
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveSnapshotToDBIfNotExists(tx *sqlx.Tx, s *Snapshot) error {
|
func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error {
|
||||||
|
marshalled, err := json.MarshalIndent(s, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Sprintf("JSON serialization error for %v", s))
|
||||||
|
}
|
||||||
|
if config.CONFIG.DryRun {
|
||||||
|
logging.LogDebug("Would insert (if new) snapshot %s", marshalled)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
query := `
|
query := `
|
||||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||||
ON CONFLICT (url) DO NOTHING
|
ON CONFLICT (url) DO NOTHING
|
||||||
`
|
`
|
||||||
_, err := tx.NamedExec(query, s)
|
_, err = tx.NamedExec(query, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
|
return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err)
|
||||||
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
|
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
|
func UpsertSnapshot(id int, tx *sqlx.Tx, s *Snapshot) error {
|
||||||
fmt.Printf("%+v", s)
|
marshalled, err := json.MarshalIndent(s, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Sprintf("JSON serialization error for %v", s))
|
||||||
|
}
|
||||||
|
if config.CONFIG.DryRun {
|
||||||
|
logging.LogDebug("[%d] Would upsert snapshot %s", id, marshalled)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
query := `
|
query := `
|
||||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||||
ON CONFLICT (url) DO UPDATE SET
|
ON CONFLICT (url) DO UPDATE SET
|
||||||
url = EXCLUDED.url,
|
url = EXCLUDED.url,
|
||||||
host = EXCLUDED.host,
|
host = EXCLUDED.host,
|
||||||
@@ -62,23 +80,29 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
|
|||||||
links = EXCLUDED.links,
|
links = EXCLUDED.links,
|
||||||
lang = EXCLUDED.lang,
|
lang = EXCLUDED.lang,
|
||||||
response_code = EXCLUDED.response_code,
|
response_code = EXCLUDED.response_code,
|
||||||
error = EXCLUDED.error
|
error = EXCLUDED.error`
|
||||||
`
|
_, err = tx.NamedExec(query, s)
|
||||||
_, err := tx.NamedExec(query, s)
|
//if err != nil {
|
||||||
|
// logging.LogError("[%s] GeminiError upserting snapshot: %w", s.URL, err)
|
||||||
|
// panic("This shouldn't happen")
|
||||||
|
//}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
|
return fmt.Errorf("[%s] GeminiError upserting snapshot: %w", s.URL, err)
|
||||||
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
|
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
||||||
|
if config.CONFIG.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe
|
// Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe
|
||||||
const batchSize = 5000
|
const batchSize = 5000
|
||||||
|
|
||||||
query := `
|
query := `
|
||||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||||
ON CONFLICT (url) DO NOTHING
|
ON CONFLICT (url) DO NOTHING
|
||||||
`
|
`
|
||||||
|
|
||||||
@@ -92,7 +116,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
|||||||
|
|
||||||
_, err := tx.NamedExec(query, batch)
|
_, err := tx.NamedExec(query, batch)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("Error batch inserting snapshots: %w", err)
|
logging.LogError("GeminiError batch inserting snapshots: %w", err)
|
||||||
return fmt.Errorf("DB error: %w", err)
|
return fmt.Errorf("DB error: %w", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -101,14 +125,17 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
||||||
|
if config.CONFIG.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
query := `
|
query := `
|
||||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||||
ON CONFLICT (url) DO NOTHING
|
ON CONFLICT (url) DO NOTHING
|
||||||
`
|
`
|
||||||
_, err := tx.NamedExec(query, snapshots)
|
_, err := tx.NamedExec(query, snapshots)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("Error batch inserting snapshots: %w", err)
|
logging.LogError("GeminiError batch inserting snapshots: %w", err)
|
||||||
return fmt.Errorf("DB error: %w", err)
|
return fmt.Errorf("DB error: %w", err)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -57,8 +57,8 @@ func populateBlacklist(key string) (entries []string) {
|
|||||||
// RobotMatch checks if the snapshot URL matches
|
// RobotMatch checks if the snapshot URL matches
|
||||||
// a robots.txt allow rule.
|
// a robots.txt allow rule.
|
||||||
func RobotMatch(url URL) bool {
|
func RobotMatch(url URL) bool {
|
||||||
logging.LogDebug("Checking robots.txt cache for %s", url.String())
|
|
||||||
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
|
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
|
||||||
|
logging.LogDebug("Checking robots.txt cache for %s", key)
|
||||||
var disallowedURLs []string
|
var disallowedURLs []string
|
||||||
cacheEntries, ok := RobotsCache.Load(key)
|
cacheEntries, ok := RobotsCache.Load(key)
|
||||||
if !ok {
|
if !ok {
|
||||||
|
|||||||
@@ -27,46 +27,17 @@ func (l *LinkList) Scan(value interface{}) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Snapshot struct {
|
type Snapshot struct {
|
||||||
ID int `db:"id" json:"id,omitempty"`
|
ID int `db:"id" json:"id,omitempty"`
|
||||||
UID string `db:"uid" json:"uid,omitempty"`
|
//UID string `db:"uid" json:"uid,omitempty"`
|
||||||
URL URL `db:"url" json:"url,omitempty"`
|
URL URL `db:"url" json:"url,omitempty"`
|
||||||
Host string `db:"host" json:"host,omitempty"`
|
Host string `db:"host" json:"host,omitempty"`
|
||||||
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
|
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
|
||||||
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
|
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
|
||||||
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
|
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
|
||||||
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
|
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
|
||||||
Links *LinkList `db:"links" json:"links,omitempty"`
|
Header null.String `db:"header" json:"header,omitempty"` // Response header.
|
||||||
Lang null.String `db:"lang" json:"lang,omitempty"`
|
Links null.Value[LinkList] `db:"links" json:"links,omitempty"`
|
||||||
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
|
Lang null.String `db:"lang" json:"lang,omitempty"`
|
||||||
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
|
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
|
||||||
|
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
|
||||||
}
|
}
|
||||||
|
|
||||||
//func SnapshotToJSON(g Snapshot) string {
|
|
||||||
// // Serialize the Person struct to JSON
|
|
||||||
// jsonData, err := json.MarshalIndent(g, "", "\t")
|
|
||||||
// if err != nil {
|
|
||||||
// logging.LogError("Error serializing to JSON: %w", err)
|
|
||||||
// }
|
|
||||||
// return string(jsonData)
|
|
||||||
//}
|
|
||||||
//
|
|
||||||
//func SnapshotFromJSON(input string) Snapshot {
|
|
||||||
// var snapshot Snapshot
|
|
||||||
// err := json.Unmarshal([]byte(input), &snapshot)
|
|
||||||
// if err != nil {
|
|
||||||
// logging.LogError("Error deserializing from JSON: %w", err)
|
|
||||||
// }
|
|
||||||
// return snapshot
|
|
||||||
//}
|
|
||||||
//
|
|
||||||
//func ShouldPersistSnapshot(result *Snapshot) bool {
|
|
||||||
// if !result.MimeType.Valid {
|
|
||||||
// return false
|
|
||||||
// }
|
|
||||||
// if result.MimeType.String == "text/gemini" ||
|
|
||||||
// strings.HasPrefix(result.MimeType.String, "image/") ||
|
|
||||||
// strings.HasPrefix(result.MimeType.String, "text/") {
|
|
||||||
// return true
|
|
||||||
// }
|
|
||||||
// return false
|
|
||||||
//}
|
|
||||||
|
|||||||
250
gemini/worker.go
250
gemini/worker.go
@@ -3,14 +3,13 @@ package gemini
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"regexp"
|
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"gemini-grc/config"
|
"gemini-grc/config"
|
||||||
"gemini-grc/logging"
|
"gemini-grc/logging"
|
||||||
"gemini-grc/uid"
|
|
||||||
"gemini-grc/util"
|
"gemini-grc/util"
|
||||||
|
|
||||||
"github.com/guregu/null/v5"
|
"github.com/guregu/null/v5"
|
||||||
"github.com/jmoiron/sqlx"
|
"github.com/jmoiron/sqlx"
|
||||||
)
|
)
|
||||||
@@ -27,12 +26,13 @@ func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func RunWorker(id int, db *sqlx.DB, url *string) {
|
func RunWorker(id int, db *sqlx.DB, url *string) {
|
||||||
// Start the DB transaction
|
// Each worker runs within a DB transaction.
|
||||||
tx, err := db.Beginx()
|
tx, err := db.Beginx()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("Failed to begin transaction: %w", err)
|
logging.LogError("Failed to begin transaction: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Commit/rollback at the end
|
||||||
defer func() {
|
defer func() {
|
||||||
err = tx.Commit()
|
err = tx.Commit()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -46,73 +46,57 @@ func RunWorker(id int, db *sqlx.DB, url *string) {
|
|||||||
|
|
||||||
var snapshots []Snapshot
|
var snapshots []Snapshot
|
||||||
|
|
||||||
|
// If not given a specific URL,
|
||||||
|
// get some random ones to visit from DB.
|
||||||
if url == nil {
|
if url == nil {
|
||||||
snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
|
snapshots, err = GetRandomSnapshotsDistinctHosts(tx)
|
||||||
} else {
|
if err != nil {
|
||||||
snapshots, err = GetSnapshotFromURL(tx, *url)
|
logging.LogError("[%d] GeminiError retrieving snapshot: %w", id, err)
|
||||||
if len(snapshots) == 0 {
|
panic("This should never happen")
|
||||||
snapshotURL, err := ParseURL(*url, "")
|
} else if len(snapshots) == 0 {
|
||||||
if err != nil {
|
logging.LogInfo("[%d] No snapshots to visit.", id)
|
||||||
panic("Invalid URL: " + *url)
|
time.Sleep(1 * time.Minute)
|
||||||
}
|
return
|
||||||
snapshots = []Snapshot{{
|
|
||||||
UID: uid.UID(),
|
|
||||||
URL: *snapshotURL,
|
|
||||||
Host: snapshotURL.Hostname,
|
|
||||||
Timestamp: null.TimeFrom(time.Now()),
|
|
||||||
}}
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
snapshotURL, err := ParseURL(*url, "")
|
||||||
|
if err != nil {
|
||||||
|
logging.LogError("Invalid URL given: " + *url)
|
||||||
|
return
|
||||||
|
|
||||||
|
}
|
||||||
|
snapshots = []Snapshot{{
|
||||||
|
//UID: uid.UID(),
|
||||||
|
URL: *snapshotURL,
|
||||||
|
Host: snapshotURL.Hostname,
|
||||||
|
Timestamp: null.TimeFrom(time.Now()),
|
||||||
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
if err != nil {
|
// Start visiting URLs.
|
||||||
logging.LogError("[%d] Error retrieving snapshot: %w", id, err)
|
|
||||||
time.Sleep(10 * time.Second)
|
|
||||||
return
|
|
||||||
} else if len(snapshots) == 0 {
|
|
||||||
logging.LogInfo("[%d] No snapshots to visit.", id)
|
|
||||||
time.Sleep(1 * time.Minute)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
total := len(snapshots)
|
total := len(snapshots)
|
||||||
for i, s := range snapshots {
|
for i, s := range snapshots {
|
||||||
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
|
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String())
|
||||||
|
// We differentiate between errors:
|
||||||
|
// Unexpected errors are the ones returned from the following function.
|
||||||
|
// If an error is unexpected (which should never happen) we panic.
|
||||||
|
// Expected errors are stored as strings within the snapshot,
|
||||||
|
// so that they can also be stored in DB.
|
||||||
err = workOnSnapshot(id, tx, &s)
|
err = workOnSnapshot(id, tx, &s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL.String(), err)
|
logging.LogError("[%d] [%s] Unexpected GeminiError %w", id, s.URL.String(), err)
|
||||||
util.PrintStackAndPanic(err)
|
util.PrintStackAndPanic(err)
|
||||||
}
|
}
|
||||||
if s.Error.Valid {
|
if s.Error.Valid {
|
||||||
logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL.String(), s.Error.String)
|
logging.LogWarn("[%d] Error: %v", id, s.Error.String)
|
||||||
}
|
}
|
||||||
logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
|
logging.LogDebug("[%d] Done %d/%d.", id, i+1, total)
|
||||||
}
|
}
|
||||||
logging.LogInfo("[%d] Worker done.", id)
|
logging.LogInfo("[%d] Worker done.", id)
|
||||||
}
|
}
|
||||||
|
|
||||||
func handleRedirection(tx *sqlx.Tx, s *Snapshot) error {
|
// workOnSnapshot visits a URL and stores the result.
|
||||||
re := regexp.MustCompile(`gemini://\S+`)
|
// errors should be returned only if they are unexpected.
|
||||||
matches := re.FindStringSubmatch(s.Error.ValueOrZero())
|
|
||||||
if len(matches) == 1 {
|
|
||||||
newURL := matches[0]
|
|
||||||
logging.LogDebug("Page redirects to %s", newURL)
|
|
||||||
_url, err := ParseURL(newURL, "")
|
|
||||||
// Insert fresh snapshot with new URL
|
|
||||||
if err == nil {
|
|
||||||
snapshot := &Snapshot{
|
|
||||||
UID: uid.UID(),
|
|
||||||
URL: *_url,
|
|
||||||
Host: _url.Hostname,
|
|
||||||
Timestamp: null.TimeFrom(time.Now()),
|
|
||||||
}
|
|
||||||
err := SaveSnapshotToDBIfNotExists(tx, snapshot)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||||
if IsBlacklisted(s.URL) {
|
if IsBlacklisted(s.URL) {
|
||||||
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
|
logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String())
|
||||||
@@ -123,31 +107,26 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
// add it as an error so next time it won't be
|
// add it as an error so next time it won't be
|
||||||
// crawled.
|
// crawled.
|
||||||
if RobotMatch(s.URL) {
|
if RobotMatch(s.URL) {
|
||||||
s.Error = null.StringFrom("robots.txt disallow match")
|
s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
|
||||||
err = SaveSnapshotToDB(tx, s)
|
err = UpsertSnapshot(id, tx, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
return fmt.Errorf("[%d] %w", id, err)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Resolve IP address via DNS
|
||||||
IPs, err := getHostIPAddresses(s.Host)
|
IPs, err := getHostIPAddresses(s.Host)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.Error = null.StringFrom("DNS Resolve error")
|
s.Error = null.StringFrom(err.Error())
|
||||||
err = SaveSnapshotToDB(tx, s)
|
err = UpsertSnapshot(id, tx, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
return fmt.Errorf("[%d] %w", id, err)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
defer func() {
|
// If the host's ip is in the connections pool we stop
|
||||||
time.Sleep(5 * time.Second)
|
|
||||||
RemoveIPsFromPool(IPs)
|
|
||||||
}()
|
|
||||||
|
|
||||||
// If the host's ip is in the connections pool,
|
|
||||||
// stop and add the url in the queue later.
|
|
||||||
IpPool.Lock.RLock()
|
IpPool.Lock.RLock()
|
||||||
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
|
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String())
|
||||||
for _, ip := range IPs {
|
for _, ip := range IPs {
|
||||||
@@ -155,7 +134,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
if ok {
|
if ok {
|
||||||
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
|
logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String())
|
||||||
IpPool.Lock.RUnlock()
|
IpPool.Lock.RUnlock()
|
||||||
time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain
|
time.Sleep(1 * time.Second) // Avoid flood-retrying
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -163,84 +142,115 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
|
|
||||||
AddIPsToPool(IPs)
|
AddIPsToPool(IPs)
|
||||||
|
|
||||||
|
// After finishing, remove the host IPs from
|
||||||
|
// the connections pool, with a small delay
|
||||||
|
// to avoid potentially hitting the same IP quickly.
|
||||||
|
defer func() {
|
||||||
|
time.Sleep(5 * time.Second)
|
||||||
|
RemoveIPsFromPool(IPs)
|
||||||
|
}()
|
||||||
|
|
||||||
url := s.URL.String()
|
url := s.URL.String()
|
||||||
logging.LogDebug("[%d] Dialing %s", id, url)
|
logging.LogDebug("[%d] Dialing %s", id, url)
|
||||||
|
|
||||||
err = Visit(s)
|
err = Visit(s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if !IsKnownError(err) {
|
if !IsKnownError(err) {
|
||||||
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
|
logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err)
|
||||||
if config.CONFIG.PanicOnUnexpectedError {
|
return err
|
||||||
util.PrintStackAndPanic(err)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
s.Error = null.StringFrom(err.Error())
|
|
||||||
}
|
}
|
||||||
if errors.As(err, new(*ErrGeminiStatusCode)) {
|
// Check if error is redirection, and handle it
|
||||||
err = handleRedirection(tx, s)
|
s.Error = null.StringFrom(err.Error())
|
||||||
|
if errors.As(err, new(*GeminiError)) &&
|
||||||
|
err.(*GeminiError).Msg == "redirect" {
|
||||||
|
err = handleRedirection(id, tx, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
logging.LogDebug("[%d] Finished dialing.", id)
|
logging.LogInfo("[%d] Done, response code %d.", id, s.ResponseCode.ValueOrZero())
|
||||||
|
|
||||||
|
// If this is a gemini page, parse possible links inside
|
||||||
if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
|
if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
|
||||||
logging.LogDebug("[%d] [%s] Processing", id, url)
|
links := GetPageLinks(s.URL, s.GemText.String)
|
||||||
s = ProcessGemini(s)
|
logging.LogDebug("[%d] Found %d links", id, len(links))
|
||||||
|
if len(links) > 0 {
|
||||||
|
s.Links = null.ValueFrom(links)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
logging.LogDebug("[%d] Not looking for page links", id)
|
||||||
}
|
}
|
||||||
logging.LogDebug("[%d] Saving", id)
|
|
||||||
err = SaveSnapshotToDB(tx, s)
|
err = UpsertSnapshot(id, tx, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store links in batch
|
err = storeLinks(tx, s)
|
||||||
if s.Links != nil {
|
if err != nil {
|
||||||
var batchSnapshots []*Snapshot
|
return err
|
||||||
timestamp := null.TimeFrom(time.Now())
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
for _, link := range *s.Links {
|
func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
|
||||||
if shouldPersistURL(tx, link) {
|
if s.Links.Valid {
|
||||||
|
var batchSnapshots []*Snapshot
|
||||||
|
for _, link := range s.Links.ValueOrZero() {
|
||||||
|
if shouldPersistURL(link) {
|
||||||
newSnapshot := &Snapshot{
|
newSnapshot := &Snapshot{
|
||||||
UID: uid.UID(),
|
//UID: uid.UID(),
|
||||||
URL: link,
|
URL: link,
|
||||||
Host: link.Hostname,
|
Host: link.Hostname,
|
||||||
Timestamp: timestamp,
|
Timestamp: null.TimeFrom(time.Now()),
|
||||||
}
|
}
|
||||||
batchSnapshots = append(batchSnapshots, newSnapshot)
|
batchSnapshots = append(batchSnapshots, newSnapshot)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(batchSnapshots) > 0 {
|
if len(batchSnapshots) > 0 {
|
||||||
logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
|
err := SaveLinksToDBinBatches(tx, batchSnapshots)
|
||||||
err = SaveLinksToDBinBatches(tx, batchSnapshots)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Should we save the given URL for crawling?
|
// shouldPersistURL returns true if we
|
||||||
func shouldPersistURL(tx *sqlx.Tx, u URL) bool {
|
// should save the URL in the DB.
|
||||||
if !strings.HasPrefix(u.String(), "gemini://") {
|
// Only gemini:// urls are saved.
|
||||||
return false
|
func shouldPersistURL(u URL) bool {
|
||||||
}
|
return strings.HasPrefix(u.String(), "gemini://")
|
||||||
query := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)`
|
}
|
||||||
var exists bool
|
|
||||||
err := tx.Get(&exists, query, u.String())
|
func handleRedirection(id int, tx *sqlx.Tx, s *Snapshot) error {
|
||||||
|
newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Println("Error executing query:", err)
|
return err
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
return !exists
|
logging.LogDebug("[%d] Page redirects to %s", id, newURL)
|
||||||
|
// Insert fresh snapshot with new URL
|
||||||
|
snapshot := &Snapshot{
|
||||||
|
//UID: uid.UID(),
|
||||||
|
URL: *newURL,
|
||||||
|
Host: newURL.Hostname,
|
||||||
|
Timestamp: null.TimeFrom(time.Now()),
|
||||||
|
}
|
||||||
|
logging.LogDebug("[%d] Saving empty snapshot for %s", id, snapshot.URL.String())
|
||||||
|
err = SaveSnapshotIfNew(tx, snapshot)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
||||||
// Old, unoptimized query
|
// Old, unoptimized query
|
||||||
//
|
|
||||||
// query := `
|
// query := `
|
||||||
// SELECT DISTINCT ON (host) *
|
// SELECT DISTINCT ON (host) *
|
||||||
// FROM snapshots
|
// FROM snapshots
|
||||||
// WHERE response_code IS NULL
|
// WHERE response_code IS NULL
|
||||||
@@ -249,20 +259,28 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
|||||||
// LIMIT $1
|
// LIMIT $1
|
||||||
// `
|
// `
|
||||||
query := `
|
query := `
|
||||||
WITH RankedSnapshots AS (
|
SELECT *
|
||||||
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
|
FROM snapshots
|
||||||
links, lang, response_code, error,
|
WHERE response_code IS NULL
|
||||||
ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
|
AND error IS NULL
|
||||||
FROM snapshots
|
ORDER BY RANDOM()
|
||||||
WHERE response_code IS NULL
|
LIMIT $1
|
||||||
AND error IS NULL
|
`
|
||||||
)
|
//query := `
|
||||||
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
|
// WITH RankedSnapshots AS (
|
||||||
links, lang, response_code, error
|
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
|
||||||
FROM RankedSnapshots
|
// links, lang, response_code, error,
|
||||||
WHERE rn = 1
|
// ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
|
||||||
LIMIT $1
|
// FROM snapshots
|
||||||
`
|
// WHERE response_code IS NULL
|
||||||
|
// AND error IS NULL
|
||||||
|
// )
|
||||||
|
// SELECT id, url, host, timestamp, mimetype, data, gemtext,
|
||||||
|
// links, lang, response_code, error
|
||||||
|
// FROM RankedSnapshots
|
||||||
|
// WHERE rn = 1
|
||||||
|
// LIMIT $1
|
||||||
|
//`
|
||||||
var snapshots []Snapshot
|
var snapshots []Snapshot
|
||||||
err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
|
err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
21
main.go
21
main.go
@@ -1,16 +1,16 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"os"
|
|
||||||
"os/signal"
|
|
||||||
"syscall"
|
|
||||||
|
|
||||||
"gemini-grc/config"
|
"gemini-grc/config"
|
||||||
"gemini-grc/gemini"
|
"gemini-grc/gemini"
|
||||||
|
"gemini-grc/http"
|
||||||
"gemini-grc/logging"
|
"gemini-grc/logging"
|
||||||
"github.com/jmoiron/sqlx"
|
"github.com/jmoiron/sqlx"
|
||||||
"github.com/rs/zerolog"
|
"github.com/rs/zerolog"
|
||||||
zlog "github.com/rs/zerolog/log"
|
zlog "github.com/rs/zerolog/log"
|
||||||
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"syscall"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -26,9 +26,10 @@ func main() {
|
|||||||
|
|
||||||
func runApp() error {
|
func runApp() error {
|
||||||
logging.LogInfo("Starting up. Press Ctrl+C to exit")
|
logging.LogInfo("Starting up. Press Ctrl+C to exit")
|
||||||
sigs := make(chan os.Signal, 1)
|
signals := make(chan os.Signal, 1)
|
||||||
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
|
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
|
||||||
|
server := http.CreateServer("localhost:8899")
|
||||||
db := gemini.ConnectToDB()
|
db := gemini.ConnectToDB()
|
||||||
|
|
||||||
// !!! DANGER !!!
|
// !!! DANGER !!!
|
||||||
@@ -44,7 +45,8 @@ func runApp() error {
|
|||||||
}(db)
|
}(db)
|
||||||
|
|
||||||
gemini.LoadBlacklist()
|
gemini.LoadBlacklist()
|
||||||
|
// If there's an argument, assume it's a URL
|
||||||
|
// to visit and ignore database state.
|
||||||
if len(os.Args) > 1 {
|
if len(os.Args) > 1 {
|
||||||
url := os.Args[1]
|
url := os.Args[1]
|
||||||
go gemini.RunWorker(0, db, &url)
|
go gemini.RunWorker(0, db, &url)
|
||||||
@@ -52,7 +54,10 @@ func runApp() error {
|
|||||||
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
|
go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db)
|
||||||
}
|
}
|
||||||
|
|
||||||
<-sigs
|
<-signals
|
||||||
|
if err := server.Close(); err != nil {
|
||||||
|
logging.LogError("GeminiError during server shutdown: %s", err)
|
||||||
|
}
|
||||||
logging.LogInfo("Received SIGINT or SIGTERM signal, exiting")
|
logging.LogInfo("Received SIGINT or SIGTERM signal, exiting")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user