Reorganize code for more granular imports
This commit is contained in:
@@ -1,252 +0,0 @@
|
||||
package common_test
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"gemini-grc/common"
|
||||
)
|
||||
|
||||
func TestParseURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
|
||||
parsed, err := common.ParseURL(input, "", true)
|
||||
value, _ := parsed.Value()
|
||||
if err != nil || !(value == "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162") {
|
||||
t.Errorf("fail: %s", parsed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL := common.URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b",
|
||||
Descr: "Nothing",
|
||||
Full: "gemini://smol.gr:1965/a/b",
|
||||
}
|
||||
input := "gemini://a.b/c"
|
||||
output, err := common.DeriveAbsoluteURL(currentURL, input)
|
||||
if err != nil {
|
||||
t.Errorf("fail: %v", err)
|
||||
}
|
||||
expected := &common.URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "a.b",
|
||||
Port: 1965,
|
||||
Path: "/c",
|
||||
Descr: "",
|
||||
Full: "gemini://a.b:1965/c",
|
||||
}
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL := common.URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b",
|
||||
Descr: "Nothing",
|
||||
Full: "gemini://smol.gr:1965/a/b",
|
||||
}
|
||||
input := "/c"
|
||||
output, err := common.DeriveAbsoluteURL(currentURL, input)
|
||||
if err != nil {
|
||||
t.Errorf("fail: %v", err)
|
||||
}
|
||||
expected := &common.URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/c",
|
||||
Descr: "",
|
||||
Full: "gemini://smol.gr:1965/c",
|
||||
}
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL := common.URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b",
|
||||
Descr: "Nothing",
|
||||
Full: "gemini://smol.gr:1965/a/b",
|
||||
}
|
||||
input := "c/d"
|
||||
output, err := common.DeriveAbsoluteURL(currentURL, input)
|
||||
if err != nil {
|
||||
t.Errorf("fail: %v", err)
|
||||
}
|
||||
expected := &common.URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b/c/d",
|
||||
Descr: "",
|
||||
Full: "gemini://smol.gr:1965/a/b/c/d",
|
||||
}
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeURLSlash(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://uscoffings.net/retro-computing/magazines/"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := input
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeURLNoSlash(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://uscoffings.net/retro-computing/magazines"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := input
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeMultiSlash(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://uscoffings.net/retro-computing/////////a///magazines"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := "gemini://uscoffings.net/retro-computing/a/magazines"
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeTrailingSlash(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://uscoffings.net/"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := "gemini://uscoffings.net/"
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNoTrailingSlash(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://uscoffings.net"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := "gemini://uscoffings.net"
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeTrailingSlashPath(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://uscoffings.net/a/"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := "gemini://uscoffings.net/a/"
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNoTrailingSlashPath(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://uscoffings.net/a"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := "gemini://uscoffings.net/a"
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeDot(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://uscoffings.net/retro-computing/./././////a///magazines"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := "gemini://uscoffings.net/retro-computing/a/magazines"
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePort(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://uscoffings.net:1965/a"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := "gemini://uscoffings.net/a"
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := "gemini://chat.gemini.lehmann.cx:11965/"
|
||||
normalized, _ := common.NormalizeURL(input)
|
||||
output := normalized.String()
|
||||
expected := "gemini://chat.gemini.lehmann.cx:11965/"
|
||||
pass := reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
|
||||
input = "gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c"
|
||||
normalized, _ = common.NormalizeURL(input)
|
||||
output = normalized.String()
|
||||
expected = "gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c"
|
||||
pass = reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
|
||||
input = "gemini://chat.gemini.lehmann.cx:11965/index#1"
|
||||
normalized, _ = common.NormalizeURL(input)
|
||||
output = normalized.String()
|
||||
expected = "gemini://chat.gemini.lehmann.cx:11965/index#1"
|
||||
pass = reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
|
||||
input = "gemini://gemi.dev/cgi-bin/xkcd.cgi?1494"
|
||||
normalized, _ = common.NormalizeURL(input)
|
||||
output = normalized.String()
|
||||
expected = "gemini://gemi.dev/cgi-bin/xkcd.cgi?1494"
|
||||
pass = reflect.DeepEqual(output, expected)
|
||||
if !pass {
|
||||
t.Errorf("fail: %#v != %#v", output, expected)
|
||||
}
|
||||
}
|
||||
27
common/linkList/linkList.go
Normal file
27
common/linkList/linkList.go
Normal file
@@ -0,0 +1,27 @@
|
||||
package linkList
|
||||
|
||||
import (
|
||||
"database/sql/driver"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"gemini-grc/common/url"
|
||||
)
|
||||
|
||||
// LinkList is a slice of url.URL values. Its Value and Scan methods
// encode it to and decode it from JSON for database storage.
type LinkList []url.URL
|
||||
|
||||
func (l *LinkList) Value() (driver.Value, error) {
|
||||
return json.Marshal(l)
|
||||
}
|
||||
|
||||
func (l *LinkList) Scan(value interface{}) error {
|
||||
if value == nil {
|
||||
*l = nil
|
||||
return nil
|
||||
}
|
||||
b, ok := value.([]byte) // Type assertion! Converts to []byte
|
||||
if !ok {
|
||||
return fmt.Errorf("failed to scan LinkList: expected []byte, got %T", value)
|
||||
}
|
||||
return json.Unmarshal(b, l)
|
||||
}
|
||||
13
common/shared.go
Normal file
13
common/shared.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package common
|
||||
|
||||
var (
|
||||
StatusChan chan WorkerStatus
|
||||
// ErrorsChan accepts errors from workers.
|
||||
// In case of fatal error, gracefully
|
||||
// exits the application.
|
||||
ErrorsChan chan error
|
||||
)
|
||||
|
||||
const VERSION string = "0.0.1"
|
||||
|
||||
const CtxKeyLogger string = "CtxKeyLogger"
|
||||
@@ -1,56 +0,0 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"database/sql/driver"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
// LinkList is a slice of URL values. Its Value and Scan methods
// encode it to and decode it from JSON for database storage.
type LinkList []URL
|
||||
|
||||
func (l *LinkList) Value() (driver.Value, error) {
|
||||
return json.Marshal(l)
|
||||
}
|
||||
|
||||
func (l *LinkList) Scan(value interface{}) error {
|
||||
if value == nil {
|
||||
*l = nil
|
||||
return nil
|
||||
}
|
||||
b, ok := value.([]byte) // Type assertion! Converts to []byte
|
||||
if !ok {
|
||||
return fmt.Errorf("failed to scan LinkList: expected []byte, got %T", value)
|
||||
}
|
||||
return json.Unmarshal(b, l)
|
||||
}
|
||||
|
||||
type Snapshot struct {
|
||||
ID int `db:"id" json:"id,omitempty"`
|
||||
URL URL `db:"url" json:"url,omitempty"`
|
||||
Host string `db:"host" json:"host,omitempty"`
|
||||
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
|
||||
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
|
||||
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
|
||||
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
|
||||
Header null.String `db:"header" json:"header,omitempty"` // Response header.
|
||||
Links null.Value[LinkList] `db:"links" json:"links,omitempty"`
|
||||
Lang null.String `db:"lang" json:"lang,omitempty"`
|
||||
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
|
||||
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
|
||||
}
|
||||
|
||||
func SnapshotFromURL(u string) *Snapshot {
|
||||
url, err := ParseURL(u, "")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
newSnapshot := Snapshot{
|
||||
URL: *url,
|
||||
Host: url.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}
|
||||
return &newSnapshot
|
||||
}
|
||||
38
common/snapshot/snapshot.go
Normal file
38
common/snapshot/snapshot.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package snapshot
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"gemini-grc/common/linkList"
|
||||
commonUrl "gemini-grc/common/url"
|
||||
"gemini-grc/errors"
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
type Snapshot struct {
|
||||
ID int `db:"ID" json:"ID,omitempty"`
|
||||
URL commonUrl.URL `db:"url" json:"url,omitempty"`
|
||||
Host string `db:"host" json:"host,omitempty"`
|
||||
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
|
||||
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
|
||||
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
|
||||
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
|
||||
Header null.String `db:"header" json:"header,omitempty"` // Response header.
|
||||
Links null.Value[linkList.LinkList] `db:"links" json:"links,omitempty"`
|
||||
Lang null.String `db:"lang" json:"lang,omitempty"`
|
||||
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response Status code.
|
||||
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
|
||||
}
|
||||
|
||||
func SnapshotFromURL(u string, normalize bool) (*Snapshot, error) {
|
||||
url, err := commonUrl.ParseURL(u, "", normalize)
|
||||
if err != nil {
|
||||
return nil, errors.NewError(err)
|
||||
}
|
||||
newSnapshot := Snapshot{
|
||||
URL: *url,
|
||||
Host: url.Hostname,
|
||||
Timestamp: null.TimeFrom(time.Now()),
|
||||
}
|
||||
return &newSnapshot, nil
|
||||
}
|
||||
@@ -1,12 +1,15 @@
|
||||
package common
|
||||
package url
|
||||
|
||||
import (
|
||||
"database/sql/driver"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"path"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"gemini-grc/errors"
|
||||
)
|
||||
|
||||
type URL struct {
|
||||
@@ -26,11 +29,10 @@ func (u *URL) Scan(value interface{}) error {
|
||||
}
|
||||
b, ok := value.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("%w: expected string, got %T", ErrDatabaseScan, value)
|
||||
return errors.NewFatalError(fmt.Errorf("database scan error: expected string, got %T", value))
|
||||
}
|
||||
parsedURL, err := ParseURL(b, "", false)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("%w: failed to scan GeminiUrl %s: %v", ErrDatabaseScan, b, err)
|
||||
return err
|
||||
}
|
||||
*u = *parsedURL
|
||||
@@ -42,8 +44,14 @@ func (u URL) String() string {
|
||||
}
|
||||
|
||||
func (u URL) StringNoDefaultPort() string {
|
||||
if u.Port == 1965 {
|
||||
return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
|
||||
if IsGeminiUrl(u.String()) {
|
||||
if u.Port == 1965 {
|
||||
return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
|
||||
}
|
||||
} else {
|
||||
if u.Port == 70 {
|
||||
return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
|
||||
}
|
||||
}
|
||||
return u.Full
|
||||
}
|
||||
@@ -55,30 +63,43 @@ func (u URL) Value() (driver.Value, error) {
|
||||
return u.Full, nil
|
||||
}
|
||||
|
||||
// IsGeminiUrl reports whether url begins with the "gemini://" scheme prefix.
// The comparison is case-sensitive.
func IsGeminiUrl(url string) bool {
	const geminiPrefix = "gemini://"
	return strings.HasPrefix(url, geminiPrefix)
}
|
||||
|
||||
// IsGopherURL reports whether s begins with the "gopher://" scheme prefix.
// The comparison is case-sensitive.
func IsGopherURL(s string) bool {
	const gopherPrefix = "gopher://"
	return strings.HasPrefix(s, gopherPrefix)
}
|
||||
|
||||
func ParseURL(input string, descr string, normalize bool) (*URL, error) {
|
||||
var u *url.URL
|
||||
var err error
|
||||
if normalize {
|
||||
u, err = NormalizeURL(input)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
u, err = url.Parse(input)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: Input %s URL Parse Error: %w", ErrURLParse, input, err)
|
||||
}
|
||||
if u.Scheme != "gemini" {
|
||||
return nil, fmt.Errorf("%w: URL scheme '%s' is not supported", ErrURLNotGemini, u.Scheme)
|
||||
if err != nil {
|
||||
return nil, errors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input))
|
||||
}
|
||||
}
|
||||
protocol := u.Scheme
|
||||
hostname := u.Hostname()
|
||||
strPort := u.Port()
|
||||
// urlPath := u.EscapedPath()
|
||||
urlPath := u.Path
|
||||
if strPort == "" {
|
||||
strPort = "1965"
|
||||
if u.Scheme == "gemini" {
|
||||
strPort = "1965" // default Gemini port
|
||||
} else {
|
||||
strPort = "70" // default Gopher port
|
||||
}
|
||||
}
|
||||
port, err := strconv.Atoi(strPort)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err)
|
||||
return nil, errors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input))
|
||||
}
|
||||
full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, urlPath)
|
||||
// full field should also contain query params and url fragments
|
||||
@@ -113,7 +134,7 @@ func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
|
||||
return ParseURL(strURL, "", true)
|
||||
}
|
||||
|
||||
// NormalizeURL takes a URL string and returns a normalized version.
|
||||
// NormalizeURL takes a URL string and returns a normalized version
|
||||
// Normalized meaning:
|
||||
// - Path normalization (removing redundant slashes, . and .. segments)
|
||||
// - Proper escaping of special characters
|
||||
@@ -124,7 +145,13 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
|
||||
// Parse the URL
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
|
||||
return nil, errors.NewError(fmt.Errorf("error normalizing URL: %w: %s", err, rawURL))
|
||||
}
|
||||
if u.Scheme == "" {
|
||||
return nil, errors.NewError(fmt.Errorf("error normalizing URL: No scheme: %s", rawURL))
|
||||
}
|
||||
if u.Host == "" {
|
||||
return nil, errors.NewError(fmt.Errorf("error normalizing URL: No host: %s", rawURL))
|
||||
}
|
||||
|
||||
// Convert scheme to lowercase
|
||||
@@ -135,7 +162,7 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
|
||||
u.Host = strings.ToLower(u.Host)
|
||||
}
|
||||
|
||||
// Remove default ports
|
||||
// remove default ports
|
||||
if u.Port() != "" {
|
||||
switch {
|
||||
case u.Scheme == "http" && u.Port() == "80":
|
||||
@@ -144,6 +171,8 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
|
||||
u.Host = u.Hostname()
|
||||
case u.Scheme == "gemini" && u.Port() == "1965":
|
||||
u.Host = u.Hostname()
|
||||
case u.Scheme == "gopher" && u.Port() == "70":
|
||||
u.Host = u.Hostname()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,7 +181,7 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
|
||||
// Check if there was a trailing slash before cleaning
|
||||
hadTrailingSlash := strings.HasSuffix(u.Path, "/")
|
||||
|
||||
u.Path = path.Clean(u.Path)
|
||||
u.Path = path.Clean(u.EscapedPath())
|
||||
// If path was "/", path.Clean() will return "."
|
||||
if u.Path == "." {
|
||||
u.Path = "/"
|
||||
@@ -162,20 +191,25 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
|
||||
}
|
||||
}
|
||||
|
||||
// Properly escape the path
|
||||
// First split on '/' to avoid escaping them
|
||||
// Properly escape the path, but only for unescaped parts
|
||||
parts := strings.Split(u.Path, "/")
|
||||
for i, part := range parts {
|
||||
parts[i] = url.PathEscape(part)
|
||||
// Try to unescape to check if it's already escaped
|
||||
unescaped, err := url.PathUnescape(part)
|
||||
if err != nil || unescaped == part {
|
||||
// Part is not escaped, so escape it
|
||||
parts[i] = url.PathEscape(part)
|
||||
}
|
||||
// If already escaped, leave as is
|
||||
}
|
||||
u.Path = strings.Join(parts, "/")
|
||||
|
||||
// Remove trailing fragment if empty
|
||||
// remove trailing fragment if empty
|
||||
if u.Fragment == "" {
|
||||
u.Fragment = ""
|
||||
}
|
||||
|
||||
// Remove trailing query if empty
|
||||
// remove trailing query if empty
|
||||
if u.RawQuery == "" {
|
||||
u.RawQuery = ""
|
||||
}
|
||||
@@ -188,7 +222,7 @@ func EscapeURL(input string) string {
|
||||
if strings.Contains(input, "%") && !strings.Contains(input, "% ") {
|
||||
return input
|
||||
}
|
||||
// Split URL into parts (protocol, host, path)
|
||||
// Split URL into parts (protocol, host, p)
|
||||
parts := strings.SplitN(input, "://", 2)
|
||||
if len(parts) != 2 {
|
||||
return input
|
||||
@@ -202,18 +236,50 @@ func EscapeURL(input string) string {
|
||||
return input
|
||||
}
|
||||
|
||||
// Split host and path
|
||||
// Split host and p
|
||||
parts = strings.SplitN(remainder, "/", 2)
|
||||
host := parts[0]
|
||||
if len(parts) == 1 {
|
||||
return protocol + "://" + host
|
||||
}
|
||||
|
||||
path := parts[1]
|
||||
|
||||
// Escape the path portion
|
||||
escapedPath := url.PathEscape(path)
|
||||
escapedPath := url.PathEscape(parts[1])
|
||||
|
||||
// Reconstruct the URL
|
||||
return protocol + "://" + host + "/" + escapedPath
|
||||
}
|
||||
|
||||
// TrimTrailingPathSlash removes one trailing slash from path. An empty path
// and the root path "/" both yield "/", so the result is never empty.
func TrimTrailingPathSlash(path string) string {
	trimmed := strings.TrimSuffix(path, "/")
	if trimmed == "" {
		// Covers both the empty path and a path that was just "/".
		return "/"
	}
	return trimmed
}
|
||||
|
||||
// ExtractRedirectTargetFromHeader returns the redirection
|
||||
// URL by parsing the header (or error message)
|
||||
func ExtractRedirectTargetFromHeader(currentURL URL, input string) (*URL, error) {
|
||||
// \d+ - matches one or more digits
|
||||
// \s+ - matches one or more whitespace
|
||||
// ([^\r]+) - captures everything until it hits a \r (or end of string)
|
||||
pattern := `\d+\s+([^\r]+)`
|
||||
re := regexp.MustCompile(pattern)
|
||||
matches := re.FindStringSubmatch(input)
|
||||
if len(matches) < 2 {
|
||||
return nil, errors.NewError(fmt.Errorf("error extracting redirect target from string %s", input))
|
||||
}
|
||||
newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return newURL, nil
|
||||
}
|
||||
420
common/url/url_test.go
Normal file
420
common/url/url_test.go
Normal file
@@ -0,0 +1,420 @@
|
||||
package url
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestURLOperations(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("ParseURL", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
base string
|
||||
absolute bool
|
||||
want string
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "parse CGI URL",
|
||||
input: "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162",
|
||||
base: "",
|
||||
absolute: true,
|
||||
want: "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
tt := tt
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
parsed, err := ParseURL(tt.input, tt.base, tt.absolute)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("ParseURL() error = %v, wantErr %v", err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
if !tt.wantErr {
|
||||
value, _ := parsed.Value()
|
||||
if value != tt.want {
|
||||
t.Errorf("ParseURL() = %v, want %v", value, tt.want)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("DeriveAbsoluteURL", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
baseURL := URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b",
|
||||
Descr: "Nothing",
|
||||
Full: "gemini://smol.gr:1965/a/b",
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
current URL
|
||||
input string
|
||||
expected *URL
|
||||
}{
|
||||
{
|
||||
name: "absolute URL input",
|
||||
current: baseURL,
|
||||
input: "gemini://a.b/c",
|
||||
expected: &URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "a.b",
|
||||
Port: 1965,
|
||||
Path: "/c",
|
||||
Full: "gemini://a.b:1965/c",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "absolute path input",
|
||||
current: baseURL,
|
||||
input: "/c",
|
||||
expected: &URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/c",
|
||||
Full: "gemini://smol.gr:1965/c",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "relative path input",
|
||||
current: baseURL,
|
||||
input: "c/d",
|
||||
expected: &URL{
|
||||
Protocol: "gemini",
|
||||
Hostname: "smol.gr",
|
||||
Port: 1965,
|
||||
Path: "/a/b/c/d",
|
||||
Full: "gemini://smol.gr:1965/a/b/c/d",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
tt := tt
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
output, err := DeriveAbsoluteURL(tt.current, tt.input)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !reflect.DeepEqual(output, tt.expected) {
|
||||
t.Errorf("got %#v, want %#v", output, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("NormalizeURL", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "with trailing slash",
|
||||
input: "gemini://uscoffings.net/retro-computing/magazines/",
|
||||
expected: "gemini://uscoffings.net/retro-computing/magazines/",
|
||||
},
|
||||
{
|
||||
name: "without trailing slash",
|
||||
input: "gemini://uscoffings.net/retro-computing/magazines",
|
||||
expected: "gemini://uscoffings.net/retro-computing/magazines",
|
||||
},
|
||||
{
|
||||
name: "multiple slashes",
|
||||
input: "gemini://uscoffings.net/retro-computing/////////a///magazines",
|
||||
expected: "gemini://uscoffings.net/retro-computing/a/magazines",
|
||||
},
|
||||
{
|
||||
name: "root with trailing slash",
|
||||
input: "gemini://uscoffings.net/",
|
||||
expected: "gemini://uscoffings.net/",
|
||||
},
|
||||
{
|
||||
name: "root without trailing slash",
|
||||
input: "gemini://uscoffings.net",
|
||||
expected: "gemini://uscoffings.net",
|
||||
},
|
||||
{
|
||||
name: "path with trailing slash",
|
||||
input: "gemini://uscoffings.net/a/",
|
||||
expected: "gemini://uscoffings.net/a/",
|
||||
},
|
||||
{
|
||||
name: "path without trailing slash",
|
||||
input: "gemini://uscoffings.net/a",
|
||||
expected: "gemini://uscoffings.net/a",
|
||||
},
|
||||
{
|
||||
name: "with dot segments",
|
||||
input: "gemini://uscoffings.net/retro-computing/./././////a///magazines",
|
||||
expected: "gemini://uscoffings.net/retro-computing/a/magazines",
|
||||
},
|
||||
{
|
||||
name: "with default port",
|
||||
input: "gemini://uscoffings.net:1965/a",
|
||||
expected: "gemini://uscoffings.net/a",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
tt := tt
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
normalized, err := NormalizeURL(tt.input)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
output := normalized.String()
|
||||
if output != tt.expected {
|
||||
t.Errorf("got %#v, want %#v", output, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestNormalizeURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "URL with non-default port",
|
||||
input: "gemini://chat.gemini.lehmann.cx:11965/",
|
||||
expected: "gemini://chat.gemini.lehmann.cx:11965/",
|
||||
},
|
||||
{
|
||||
name: "URL with query parameters",
|
||||
input: "gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c",
|
||||
expected: "gemini://chat.gemini.lehmann.cx:11965/index?a=1&b=c",
|
||||
},
|
||||
{
|
||||
name: "URL with fragment",
|
||||
input: "gemini://chat.gemini.lehmann.cx:11965/index#1",
|
||||
expected: "gemini://chat.gemini.lehmann.cx:11965/index#1",
|
||||
},
|
||||
{
|
||||
name: "URL with CGI script and query",
|
||||
input: "gemini://gemi.dev/cgi-bin/xkcd.cgi?1494",
|
||||
expected: "gemini://gemi.dev/cgi-bin/xkcd.cgi?1494",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
tt := tt // capture range variable for parallel testing
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
normalized, err := NormalizeURL(tt.input)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
output := normalized.String()
|
||||
if output != tt.expected {
|
||||
t.Errorf("got %#v, want %#v", output, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePath(t *testing.T) {
|
||||
t.Parallel()
|
||||
tests := []struct {
|
||||
name string
|
||||
input string // URL string to parse
|
||||
expected string // Expected normalized path
|
||||
}{
|
||||
// Basic cases
|
||||
{
|
||||
name: "empty_path",
|
||||
input: "http://example.com",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "root_path",
|
||||
input: "http://example.com/",
|
||||
expected: "/",
|
||||
},
|
||||
{
|
||||
name: "single_trailing_slash",
|
||||
input: "http://example.com/test/",
|
||||
expected: "/test/",
|
||||
},
|
||||
{
|
||||
name: "no_trailing_slash",
|
||||
input: "http://example.com/test",
|
||||
expected: "/test",
|
||||
},
|
||||
|
||||
// Edge cases with slashes
|
||||
{
|
||||
name: "multiple_trailing_slashes",
|
||||
input: "http://example.com/test//",
|
||||
expected: "/test/",
|
||||
},
|
||||
{
|
||||
name: "multiple_consecutive_slashes",
|
||||
input: "http://example.com//test//",
|
||||
expected: "/test/",
|
||||
},
|
||||
{
|
||||
name: "only_slashes",
|
||||
input: "http://example.com////",
|
||||
expected: "/",
|
||||
},
|
||||
|
||||
// Encoded characters
|
||||
{
|
||||
name: "encoded_spaces",
|
||||
input: "http://example.com/foo%20bar/",
|
||||
expected: "/foo%20bar/",
|
||||
},
|
||||
{
|
||||
name: "encoded_special_chars",
|
||||
input: "http://example.com/foo%2Fbar/",
|
||||
expected: "/foo%2Fbar/",
|
||||
},
|
||||
|
||||
// Query parameters and fragments
|
||||
{
|
||||
name: "with_query_parameters",
|
||||
input: "http://example.com/path?query=param",
|
||||
expected: "/path",
|
||||
},
|
||||
{
|
||||
name: "with_fragment",
|
||||
input: "http://example.com/path#fragment",
|
||||
expected: "/path",
|
||||
},
|
||||
{
|
||||
name: "with_both_query_and_fragment",
|
||||
input: "http://example.com/path?query=param#fragment",
|
||||
expected: "/path",
|
||||
},
|
||||
|
||||
// Unicode paths
|
||||
{
|
||||
name: "unicode_characters",
|
||||
input: "http://example.com/über/path/",
|
||||
expected: "/%C3%BCber/path/",
|
||||
},
|
||||
{
|
||||
name: "unicode_encoded",
|
||||
input: "http://example.com/%C3%BCber/path/",
|
||||
expected: "/%C3%BCber/path/",
|
||||
},
|
||||
|
||||
// Weird but valid cases
|
||||
{
|
||||
name: "dot_in_path",
|
||||
input: "http://example.com/./path/",
|
||||
expected: "/path/",
|
||||
},
|
||||
{
|
||||
name: "double_dot_in_path",
|
||||
input: "http://example.com/../path/",
|
||||
expected: "/path/",
|
||||
},
|
||||
{
|
||||
name: "mixed_case",
|
||||
input: "http://example.com/PaTh/",
|
||||
expected: "/PaTh/",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
u, err := ParseURL(tt.input, "", true)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse URL %q: %v", tt.input, err)
|
||||
}
|
||||
|
||||
result := u.Path
|
||||
if result != tt.expected {
|
||||
t.Errorf("Input: %s\nExpected: %q\nGot: %q",
|
||||
u.Path, tt.expected, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractRedirectTargetFullURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://smol.gr", "", true)
|
||||
input := "redirect: 31 gemini://target.gr"
|
||||
result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
|
||||
expected := "gemini://target.gr:1965"
|
||||
if err != nil || (result.String() != expected) {
|
||||
t.Errorf("fail: Expected %s got %s", expected, result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://smol.gr", "", true)
|
||||
input := "redirect: 31 gemini://target.gr/"
|
||||
result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
|
||||
expected := "gemini://target.gr:1965/"
|
||||
if err != nil || (result.String() != expected) {
|
||||
t.Errorf("fail: Expected %s got %s", expected, result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://smol.gr", "", true)
|
||||
input := "redirect: 31 /a/b"
|
||||
result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
|
||||
if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
|
||||
t.Errorf("fail: %s", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://nox.im:1965", "", true)
|
||||
input := "redirect: 31 ./"
|
||||
result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
|
||||
if err != nil || (result.String() != "gemini://nox.im:1965/") {
|
||||
t.Errorf("fail: %s", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://status.zvava.org:1965", "", true)
|
||||
input := "redirect: 31 index.gmi"
|
||||
result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
|
||||
if err != nil || (result.String() != "gemini://status.zvava.org:1965/index.gmi") {
|
||||
t.Errorf("fail: %s", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractRedirectTargetWrong(t *testing.T) {
|
||||
t.Parallel()
|
||||
currentURL, _ := ParseURL("gemini://smol.gr", "", true)
|
||||
input := "redirect: 31"
|
||||
result, err := ExtractRedirectTargetFromHeader(*currentURL, input)
|
||||
if result != nil || err == nil {
|
||||
t.Errorf("fail: result should be nil, err is %s", err)
|
||||
}
|
||||
}
|
||||
320
common/worker.go
Normal file
320
common/worker.go
Normal file
@@ -0,0 +1,320 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"gemini-grc/common/blackList"
|
||||
errors2 "gemini-grc/common/errors"
|
||||
"gemini-grc/common/snapshot"
|
||||
url2 "gemini-grc/common/url"
|
||||
_db "gemini-grc/db"
|
||||
"gemini-grc/errors"
|
||||
"gemini-grc/gemini"
|
||||
"gemini-grc/gopher"
|
||||
"gemini-grc/hostPool"
|
||||
"gemini-grc/logging"
|
||||
"github.com/guregu/null/v5"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
func CrawlOneURL(db *sqlx.DB, url *string) error {
|
||||
parsedURL, err := url2.ParseURL(*url, "", true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if !url2.IsGeminiUrl(parsedURL.String()) && !url2.IsGopherURL(parsedURL.String()) {
|
||||
return errors.NewError(fmt.Errorf("error parsing URL: not a Gemini or Gopher URL: %s", parsedURL.String()))
|
||||
}
|
||||
|
||||
tx, err := db.Beginx()
|
||||
if err != nil {
|
||||
return errors.NewFatalError(err)
|
||||
}
|
||||
|
||||
err = _db.InsertURL(tx, parsedURL.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = workOnUrl(0, tx, parsedURL.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = tx.Commit()
|
||||
if err != nil {
|
||||
//if _db.IsDeadlockError(err) {
|
||||
// logging.LogError("Deadlock detected. Rolling back")
|
||||
// time.Sleep(time.Duration(10) * time.Second)
|
||||
// err := tx.Rollback()
|
||||
// return errors.NewFatalError(err)
|
||||
//}
|
||||
return errors.NewFatalError(err)
|
||||
}
|
||||
logging.LogInfo("Done")
|
||||
return nil
|
||||
}
|
||||
|
||||
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
|
||||
logging.LogInfo("Spawning %d workers", numOfWorkers)
|
||||
go PrintWorkerStatus(numOfWorkers, StatusChan)
|
||||
|
||||
for i := range numOfWorkers {
|
||||
go func(i int) {
|
||||
UpdateWorkerStatus(i, "Waiting to start")
|
||||
// Jitter to avoid starting everything at the same time
|
||||
time.Sleep(time.Duration(i+2) * time.Second)
|
||||
for {
|
||||
// TODO: Use cancellable context with tx, logger & worker ID.
|
||||
// ctx := context.WithCancel()
|
||||
// ctx = context.WithValue(ctx, common.CtxKeyLogger, &RequestLogger{r: r})
|
||||
RunWorkerWithTx(i, db)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
}
|
||||
|
||||
func RunWorkerWithTx(workerID int, db *sqlx.DB) {
|
||||
defer func() {
|
||||
UpdateWorkerStatus(workerID, "Done")
|
||||
}()
|
||||
|
||||
tx, err := db.Beginx()
|
||||
if err != nil {
|
||||
ErrorsChan <- err
|
||||
return
|
||||
}
|
||||
|
||||
err = runWorker(workerID, tx)
|
||||
if err != nil {
|
||||
// TODO: Rollback in this case?
|
||||
ErrorsChan <- err
|
||||
return
|
||||
}
|
||||
|
||||
logging.LogDebug("[%3d] Committing transaction", workerID)
|
||||
err = tx.Commit()
|
||||
// On deadlock errors, rollback and return, otherwise panic.
|
||||
if err != nil {
|
||||
logging.LogError("[%3d] Failed to commit transaction: %w", workerID, err)
|
||||
if _db.IsDeadlockError(err) {
|
||||
logging.LogError("[%3d] Deadlock detected. Rolling back", workerID)
|
||||
time.Sleep(time.Duration(10) * time.Second)
|
||||
err := tx.Rollback()
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("[%3d] Failed to roll back transaction: %v", workerID, err))
|
||||
}
|
||||
return
|
||||
}
|
||||
panic(fmt.Sprintf("[%3d] Failed to commit transaction: %v", workerID, err))
|
||||
}
|
||||
logging.LogDebug("[%3d] Worker done!", workerID)
|
||||
}
|
||||
|
||||
func runWorker(workerID int, tx *sqlx.Tx) error {
|
||||
var urls []string
|
||||
var err error
|
||||
|
||||
UpdateWorkerStatus(workerID, "Getting URLs from DB")
|
||||
urls, err = _db.GetRandomUrls(tx)
|
||||
// urls, err = _db.GetRandomUrlsWithBasePath(tx)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if len(urls) == 0 {
|
||||
logging.LogInfo("[%3d] No URLs to visit, sleeping...", workerID)
|
||||
UpdateWorkerStatus(workerID, "No URLs to visit, sleeping...")
|
||||
time.Sleep(1 * time.Minute)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Start visiting URLs.
|
||||
total := len(urls)
|
||||
for i, u := range urls {
|
||||
logging.LogInfo("[%3d] Starting %d/%d %s", workerID, i+1, total, u)
|
||||
UpdateWorkerStatus(workerID, fmt.Sprintf("Starting %d/%d %s", i+1, total, u))
|
||||
err := workOnUrl(workerID, tx, u)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
logging.LogDebug("[%3d] Done %d/%d.", workerID, i+1, total)
|
||||
UpdateWorkerStatus(workerID, fmt.Sprintf("Done %d/%d %s", i+1, total, u))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// workOnUrl visits a URL and stores the result.
|
||||
// unexpected errors are returned.
|
||||
// expected errors are stored within the snapshot.
|
||||
func workOnUrl(workerID int, tx *sqlx.Tx, url string) (err error) {
|
||||
s, err := snapshot.SnapshotFromURL(url, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
isGemini := url2.IsGeminiUrl(s.URL.String())
|
||||
isGopher := url2.IsGopherURL(s.URL.String())
|
||||
if !isGemini && !isGopher {
|
||||
return errors.NewError(fmt.Errorf("not a Gopher or Gemini URL: %s", s.URL.String()))
|
||||
}
|
||||
|
||||
if blackList.IsBlacklisted(s.URL.String()) {
|
||||
logging.LogInfo("[%3d] URL matches blacklist, ignoring", workerID)
|
||||
s.Error = null.StringFrom(errors2.ErrBlacklistMatch.Error())
|
||||
return saveSnapshotAndRemoveURL(tx, s)
|
||||
}
|
||||
|
||||
if isGemini {
|
||||
// If URL matches a robots.txt disallow line,
|
||||
// add it as an error and remove url
|
||||
robotMatch, err := gemini.RobotMatch(s.URL.String())
|
||||
if err != nil {
|
||||
// robotMatch returns only network errors!
|
||||
// we stop because we don't want to hit
|
||||
// the server with another request on this case.
|
||||
return err
|
||||
}
|
||||
if robotMatch {
|
||||
logging.LogInfo("[%3d] URL matches robots.txt, ignoring", workerID)
|
||||
s.Error = null.StringFrom(errors2.ErrRobotsMatch.Error())
|
||||
return saveSnapshotAndRemoveURL(tx, s)
|
||||
}
|
||||
}
|
||||
|
||||
logging.LogDebug("[%3d] Adding to pool %s", workerID, s.URL.String())
|
||||
UpdateWorkerStatus(workerID, fmt.Sprintf("Adding to pool %s", s.URL.String()))
|
||||
hostPool.AddHostToHostPool(s.Host)
|
||||
defer func(s string) {
|
||||
hostPool.RemoveHostFromPool(s)
|
||||
}(s.Host)
|
||||
|
||||
logging.LogDebug("[%3d] Visiting %s", workerID, s.URL.String())
|
||||
UpdateWorkerStatus(workerID, fmt.Sprintf("Visiting %s", s.URL.String()))
|
||||
|
||||
if isGopher {
|
||||
s, err = gopher.Visit(s.URL.String())
|
||||
} else {
|
||||
s, err = gemini.Visit(s.URL.String())
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Handle Gemini redirection.
|
||||
if isGemini &&
|
||||
s.ResponseCode.ValueOrZero() >= 30 &&
|
||||
s.ResponseCode.ValueOrZero() < 40 {
|
||||
err = handleRedirection(workerID, tx, s)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while handling redirection: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Store links
|
||||
if len(s.Links.ValueOrZero()) > 0 {
|
||||
logging.LogDebug("[%3d] Found %d links", workerID, len(s.Links.ValueOrZero()))
|
||||
err = storeLinks(tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
|
||||
return saveSnapshotAndRemoveURL(tx, s)
|
||||
}
|
||||
|
||||
func storeLinks(tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
if s.Links.Valid { //nolint:nestif
|
||||
for _, link := range s.Links.ValueOrZero() {
|
||||
if shouldPersistURL(&link) {
|
||||
visited, err := haveWeVisitedURL(tx, link.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !visited {
|
||||
err := _db.InsertURL(tx, link.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
logging.LogDebug("Link already persisted: %s", link.Full)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func saveSnapshotAndRemoveURL(tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
err := _db.OverwriteSnapshot(tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = _db.DeleteURL(tx, s.URL.String())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// shouldPersistURL returns true if we
|
||||
// should save the URL in the _db.
|
||||
// Only gemini:// urls are saved.
|
||||
func shouldPersistURL(u *url2.URL) bool {
|
||||
return url2.IsGeminiUrl(u.String()) || url2.IsGopherURL(u.String())
|
||||
}
|
||||
|
||||
func haveWeVisitedURL(tx *sqlx.Tx, u string) (bool, error) {
|
||||
var result []bool
|
||||
err := tx.Select(&result, `SELECT TRUE FROM urls WHERE url=$1`, u)
|
||||
if err != nil {
|
||||
return false, errors.NewFatalError(fmt.Errorf("database error: %w", err))
|
||||
}
|
||||
if len(result) > 0 {
|
||||
return result[0], nil
|
||||
}
|
||||
err = tx.Select(&result, `SELECT TRUE FROM snapshots WHERE snapshots.url=$1`, u)
|
||||
if err != nil {
|
||||
return false, errors.NewFatalError(fmt.Errorf("database error: %w", err))
|
||||
}
|
||||
if len(result) > 0 {
|
||||
return result[0], nil
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// handleRedirection saves redirection URL.
|
||||
func handleRedirection(workerID int, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
newURL, err := url2.ExtractRedirectTargetFromHeader(s.URL, s.Error.ValueOrZero())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
logging.LogDebug("[%3d] Page redirects to %s", workerID, newURL)
|
||||
|
||||
haveWeVisited, _ := haveWeVisitedURL(tx, newURL.String())
|
||||
if shouldPersistURL(newURL) && !haveWeVisited {
|
||||
err = _db.InsertURL(tx, newURL.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
logging.LogDebug("[%3d] Saved redirection URL %s", workerID, newURL.String())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]snapshot.Snapshot, error) {
|
||||
query := `
|
||||
SELECT *
|
||||
FROM snapshots
|
||||
WHERE url=$1
|
||||
LIMIT 1
|
||||
`
|
||||
var snapshots []snapshot.Snapshot
|
||||
err := tx.Select(&snapshots, query, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return snapshots, nil
|
||||
}
|
||||
@@ -1,19 +1,35 @@
|
||||
package gemini
|
||||
package common
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"gemini-grc/config"
|
||||
)
|
||||
|
||||
type WorkerStatus struct {
|
||||
id int
|
||||
status string
|
||||
ID int
|
||||
Status string
|
||||
}
|
||||
|
||||
var statusChan chan WorkerStatus
|
||||
func UpdateWorkerStatus(workerID int, status string) {
|
||||
if !config.GetConfig().PrintWorkerStatus {
|
||||
return
|
||||
}
|
||||
if config.CONFIG.NumOfWorkers > 1 {
|
||||
StatusChan <- WorkerStatus{
|
||||
ID: workerID,
|
||||
Status: status,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
|
||||
// Create a slice to store current status of each worker
|
||||
if !config.GetConfig().PrintWorkerStatus {
|
||||
return
|
||||
}
|
||||
|
||||
// Create a slice to store current Status of each worker
|
||||
statuses := make([]string, totalWorkers)
|
||||
|
||||
// Initialize empty statuses
|
||||
@@ -32,14 +48,14 @@ func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
|
||||
}
|
||||
fmt.Print(output.String())
|
||||
|
||||
// Continuously receive status updates
|
||||
// Continuously receive Status updates
|
||||
for update := range statusChan {
|
||||
if update.id >= totalWorkers {
|
||||
if update.ID >= totalWorkers {
|
||||
continue
|
||||
}
|
||||
|
||||
// Update the status
|
||||
statuses[update.id] = update.status
|
||||
// Update the Status
|
||||
statuses[update.ID] = update.Status
|
||||
|
||||
// Build the complete output string
|
||||
output.Reset()
|
||||
@@ -48,7 +64,7 @@ func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
|
||||
output.WriteString(fmt.Sprintf("[%2d] %.100s\n", i, status))
|
||||
}
|
||||
|
||||
// Print the entire status
|
||||
// Print the entire Status
|
||||
fmt.Print(output.String())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user