Break up common functions and small refactor.

This commit is contained in:
2025-01-04 15:31:26 +02:00
parent b78fe00221
commit 4e6fad873b
16 changed files with 259 additions and 177 deletions

View File

@@ -2,6 +2,7 @@ package gemini
import (
"fmt"
"gemini-grc/common"
"os"
"strings"
@@ -39,7 +40,11 @@ func LoadBlacklist() {
}
}
func IsBlacklisted(url URL) bool {
func IsBlacklisted(u string) bool {
url, err := common.ParseURL(u, "")
if err != nil {
return false
}
hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
for _, v := range *Blacklist {
if v == url.Hostname || v == hostWithPort {

View File

@@ -1,176 +0,0 @@
package gemini
import (
"encoding/json"
"errors"
"fmt"
"os"
"strconv"
"gemini-grc/config"
"gemini-grc/logging"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
"github.com/jmoiron/sqlx"
"github.com/lib/pq"
)
// ConnectToDB opens a PostgreSQL connection pool through the "pgx"
// database/sql driver and returns it once a ping has succeeded.
//
// Connection parameters are read from the PG_USER, PG_PASSWORD, PG_HOST,
// PG_PORT and PG_DATABASE environment variables. The maximum number of open
// connections is read from PG_MAX_OPEN_CONNECTIONS, which must be an integer.
//
// Every failure panics. Panic messages identify the target with the password
// masked, so credentials do not end up in logs or crash reports.
func ConnectToDB() *sqlx.DB {
	user := os.Getenv("PG_USER")
	host := os.Getenv("PG_HOST")
	port := os.Getenv("PG_PORT")
	database := os.Getenv("PG_DATABASE")

	connStr := fmt.Sprintf("postgres://%s:%s@%s:%s/%s", //nolint:nosprintfhostport
		user,
		os.Getenv("PG_PASSWORD"),
		host,
		port,
		database,
	)
	// Same URL with the password masked; this is the only form that may be
	// shown in diagnostics.
	redacted := fmt.Sprintf("postgres://%s:***@%s:%s/%s", //nolint:nosprintfhostport
		user, host, port, database)

	// Create a connection pool
	db, err := sqlx.Open("pgx", connStr)
	if err != nil {
		panic(fmt.Sprintf("Unable to connect to database with URL %s: %v\n", redacted, err))
	}

	// TODO move PG_MAX_OPEN_CONNECTIONS to config env variables
	maxConnections, err := strconv.Atoi(os.Getenv("PG_MAX_OPEN_CONNECTIONS"))
	if err != nil {
		panic(fmt.Sprintf("Unable to set max DB connections: %s\n", err))
	}
	db.SetMaxOpenConns(maxConnections)

	// Ping verifies that the database is actually reachable.
	if err = db.Ping(); err != nil {
		panic(fmt.Sprintf("Unable to ping database: %v\n", err))
	}

	logging.LogDebug("Connected to database")
	return db
}
// isDeadlockError reports whether err, or any error it wraps, is a
// PostgreSQL deadlock error (SQLSTATE 40P01).
//
// The pool in this file is opened with the "pgx" driver, whose errors are not
// *pq.Error values. In addition to the lib/pq check, any error in the chain
// that exposes SQLState() string is therefore inspected as well.
// NOTE(review): pgconn.PgError is expected to provide SQLState(); confirm
// against the pgx version in go.mod.
func isDeadlockError(err error) bool {
	const deadlockDetected = "40P01" // PostgreSQL deadlock error code

	var pqErr *pq.Error
	if errors.As(err, &pqErr) {
		return pqErr.Code == deadlockDetected
	}

	var stateErr interface{ SQLState() string }
	if errors.As(err, &stateErr) {
		return stateErr.SQLState() == deadlockDetected
	}
	return false
}
// GetSnapshotsToVisit runs SQL_SELECT_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS inside
// tx, limited to config.CONFIG.WorkerBatchSize rows, and returns the selected
// snapshots. A query failure is returned wrapped in ErrDatabase.
func GetSnapshotsToVisit(tx *sqlx.Tx) ([]Snapshot, error) {
	var pending []Snapshot
	if err := tx.Select(&pending, SQL_SELECT_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS, config.CONFIG.WorkerBatchSize); err != nil {
		return nil, fmt.Errorf("%w: %w", ErrDatabase, err)
	}
	return pending, nil
}
// SaveSnapshotIfNew inserts s with SQL_INSERT_SNAPSHOT_IF_NEW inside tx; the
// statement leaves an existing row with the same URL untouched.
//
// When config.CONFIG.DryRun is set nothing is written: the snapshot is
// serialized to JSON and logged at debug level instead. A serialization
// failure in that mode panics.
func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error {
	if !config.CONFIG.DryRun {
		if _, err := tx.NamedExec(SQL_INSERT_SNAPSHOT_IF_NEW, s); err != nil {
			return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err)
		}
		return nil
	}

	// Dry run: only report what would have been inserted.
	asJSON, jsonErr := json.MarshalIndent(s, "", " ")
	if jsonErr != nil {
		panic(fmt.Sprintf("JSON serialization error for %v", s))
	}
	logging.LogDebug("Would insert (if new) snapshot %s", asJSON)
	return nil
}
// UpsertSnapshot runs SQL_UPSERT_SNAPSHOT for s inside tx. If the statement
// returns a row, the returned id is stored in s.ID.
//
// workedID is only used to tag error messages.
// NOTE(review): presumably the ID of the calling worker — confirm.
//
// Every returned error wraps ErrDatabase. Unlike SaveSnapshotIfNew this
// function does not consult config.CONFIG.DryRun (that handling was disabled).
func UpsertSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) {
	rows, err := tx.NamedQuery(SQL_UPSERT_SNAPSHOT, s)
	if err != nil {
		return fmt.Errorf("[%d] %w while upserting snapshot: %w", workedID, ErrDatabase, err)
	}
	defer func() {
		// Surface a Close failure only when nothing else went wrong, so that
		// it never masks the primary error.
		if closeErr := rows.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, ErrDatabase, closeErr)
		}
	}()

	if rows.Next() {
		var returnedID int
		if err = rows.Scan(&returnedID); err != nil {
			return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, ErrDatabase, err)
		}
		s.ID = returnedID
	}
	// Next returns false both at the end of the result set and on failure;
	// Err tells the two apart.
	if err = rows.Err(); err != nil {
		return fmt.Errorf("[%d] %w while reading upsert result: %w", workedID, ErrDatabase, err)
	}
	return nil
}
// UpdateSnapshot runs SQL_UPDATE_SNAPSHOT for s inside tx, updating the row
// whose id equals s.ID. If the statement returns a row, the returned id is
// written back to s.ID.
//
// workedID is only used to tag error messages.
// NOTE(review): presumably the ID of the calling worker — confirm.
//
// Every returned error wraps ErrDatabase. config.CONFIG.DryRun is not
// consulted (that handling was disabled).
func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) {
	rows, err := tx.NamedQuery(SQL_UPDATE_SNAPSHOT, s)
	if err != nil {
		return fmt.Errorf("[%d] %w while updating snapshot: %w", workedID, ErrDatabase, err)
	}
	defer func() {
		// Surface a Close failure only when nothing else went wrong, so that
		// it never masks the primary error.
		if closeErr := rows.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, ErrDatabase, closeErr)
		}
	}()

	if rows.Next() {
		var returnedID int
		if err = rows.Scan(&returnedID); err != nil {
			return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, ErrDatabase, err)
		}
		s.ID = returnedID
	}
	// Next returns false both at the end of the result set and on failure;
	// Err tells the two apart.
	if err = rows.Err(); err != nil {
		return fmt.Errorf("[%d] %w while reading update result: %w", workedID, ErrDatabase, err)
	}
	return nil
}
// SaveLinksToDBinBatches inserts snapshots with SQL_INSERT_SNAPSHOT_IF_NEW
// inside tx, sending at most 5000 snapshots per statement. It stops at the
// first failing batch and returns that error wrapped in ErrDatabase.
//
// Nothing is written when config.CONFIG.DryRun is set.
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
	if config.CONFIG.DryRun {
		return nil
	}

	const batchSize = 5000
	for remaining := snapshots; len(remaining) > 0; {
		// Take the next chunk off the front of what is left.
		n := len(remaining)
		if n > batchSize {
			n = batchSize
		}
		chunk := remaining[:n]
		remaining = remaining[n:]

		if _, err := tx.NamedExec(SQL_INSERT_SNAPSHOT_IF_NEW, chunk); err != nil {
			return fmt.Errorf("%w: While saving links in batches: %w", ErrDatabase, err)
		}
	}
	return nil
}
// SaveLinksToDB inserts all snapshots with a single
// SQL_INSERT_SNAPSHOT_IF_NEW statement inside tx. On failure the error is
// logged and returned wrapped as "DB error: ...".
//
// Nothing is written when config.CONFIG.DryRun is set.
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
	if config.CONFIG.DryRun {
		return nil
	}
	if _, err := tx.NamedExec(SQL_INSERT_SNAPSHOT_IF_NEW, snapshots); err != nil {
		// %w is only understood by fmt.Errorf; in a log format it renders as
		// "%!w(...)", so %v is used here.
		// NOTE(review): assumes logging.LogError formats printf-style like
		// logging.LogDebug — confirm.
		logging.LogError("GeminiError batch inserting snapshots: %v", err)
		return fmt.Errorf("DB error: %w", err)
	}
	return nil
}

View File

@@ -1,78 +0,0 @@
package gemini
// SQL statements used by the snapshot persistence functions.
// "Unvisited" below means rows whose response_code and error are both NULL.
const (
// SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS selects up to $1 unvisited snapshots
// in random order, locking them FOR UPDATE and skipping rows that are
// already locked.
SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS = `
SELECT *
FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
ORDER BY RANDOM()
FOR UPDATE SKIP LOCKED
LIMIT $1
`
// SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS is like
// SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS, but restricted to the unvisited
// snapshot with the lowest id for each host, so at most one row per host
// is returned.
SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS = `
SELECT *
FROM snapshots s
WHERE response_code IS NULL
AND error IS NULL
AND s.id IN (
SELECT MIN(id)
FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
GROUP BY host
)
ORDER BY RANDOM()
FOR UPDATE SKIP LOCKED
LIMIT $1
`
// SQL_SELECT_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS is the same per-host selection
// as above without the random ordering.
SQL_SELECT_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS = `
SELECT *
FROM snapshots s
WHERE response_code IS NULL
AND error IS NULL
AND s.id IN (
SELECT MIN(id)
FROM snapshots
WHERE response_code IS NULL
AND error IS NULL
GROUP BY host
)
FOR UPDATE SKIP LOCKED
LIMIT $1
`
// SQL_INSERT_SNAPSHOT_IF_NEW inserts a snapshot from named parameters and
// does nothing when a row with the same url already exists.
SQL_INSERT_SNAPSHOT_IF_NEW = `
INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (url) DO NOTHING
`
// SQL_UPSERT_SNAPSHOT inserts a snapshot from named parameters or, when a row
// with the same url exists, overwrites all of its listed columns. It returns
// the row id.
SQL_UPSERT_SNAPSHOT = `INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
ON CONFLICT (url) DO UPDATE SET
url = EXCLUDED.url,
host = EXCLUDED.host,
timestamp = EXCLUDED.timestamp,
mimetype = EXCLUDED.mimetype,
data = EXCLUDED.data,
gemtext = EXCLUDED.gemtext,
links = EXCLUDED.links,
lang = EXCLUDED.lang,
response_code = EXCLUDED.response_code,
error = EXCLUDED.error
RETURNING id
`
// SQL_UPDATE_SNAPSHOT overwrites the listed columns of the row whose id
// equals :id and returns that id.
SQL_UPDATE_SNAPSHOT = `UPDATE snapshots
SET url = :url,
host = :host,
timestamp = :timestamp,
mimetype = :mimetype,
data = :data,
gemtext = :gemtext,
links = :links,
lang = :lang,
response_code = :response_code,
error = :error
WHERE id = :id
RETURNING id
`
)

View File

@@ -1,100 +0,0 @@
package gemini
import (
"errors"
"fmt"
)
// GeminiError is an error that carries the status code and header line of a
// Gemini response together with a short category message.
type GeminiError struct {
	// Msg is a short, human-readable category for the error.
	Msg string
	// Code is the numeric status code.
	Code int
	// Header is the response header text the error was built from.
	Header string
}

// Error implements the error interface and renders "<Msg>: <Header>".
func (ge *GeminiError) Error() string {
	category, header := ge.Msg, ge.Header
	return fmt.Sprintf("%s: %s", category, header)
}
// NewErrGeminiStatusCode builds a *GeminiError for the given status code and
// response header. The message is chosen from the code's tens digit:
//
//	10-19 "needs input"    30-39 "redirect"     40-49 "bad request"
//	50-59 "server error"   60-69 "TLS error"
//
// Any other code (including the 20-29 range) yields "unexpected status code".
func NewErrGeminiStatusCode(code int, header string) error {
	category := "unexpected status code"
	// Integer division truncates toward zero, so negative codes and codes
	// below 10 never hit one of the cases.
	switch code / 10 {
	case 1:
		category = "needs input"
	case 3:
		category = "redirect"
	case 4:
		category = "bad request"
	case 5:
		category = "server error"
	case 6:
		category = "TLS error"
	}
	return &GeminiError{Msg: category, Code: code, Header: header}
}
// Sentinel errors. Callers wrap them with fmt.Errorf("%w ...") and test for
// them with errors.Is.
var (
// Gemini protocol and content errors.
ErrGeminiRobotsParse = errors.New("gemini robots.txt parse error")
ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed")
ErrGeminiResponseHeader = errors.New("gemini response header error")
ErrGeminiRedirect = errors.New("gemini redirection error")
ErrGeminiLinkLineParse = errors.New("gemini link line parse error")
// URL handling errors.
ErrURLParse = errors.New("URL parse error")
ErrURLNotGemini = errors.New("not a Gemini URL")
ErrURLDecode = errors.New("URL decode error")
// Text decoding errors.
ErrUTF8Parse = errors.New("UTF-8 parse error")
ErrTextParse = errors.New("text parse error")
// Network errors.
ErrNetwork = errors.New("network error")
ErrNetworkDNS = errors.New("network DNS error")
ErrNetworkTLS = errors.New("network TLS error")
ErrNetworkSetConnectionDeadline = errors.New("network error - cannot set connection deadline")
ErrNetworkCannotWrite = errors.New("network error - cannot write")
ErrNetworkResponseSizeExceededMax = errors.New("network error - response size exceeded maximum size")
// Database errors.
ErrDatabase = errors.New("database error")
)
// We could have used a map for speed, but
// we would lose ability to check wrapped
// errors via errors.Is().

// errGemini is a nil *GeminiError.
// NOTE(review): as an errors.Is target a nil pointer only equals another nil
// *GeminiError, so this entry looks inert; IsKnownError detects GeminiError
// values through errors.As instead. Confirm before removing.
var errGemini *GeminiError
// knownErrors lists the errors that IsKnownError recognizes via errors.Is.
// NOTE(review): ErrURLNotGemini is declared alongside these sentinels but is
// not listed here — confirm whether that omission is intentional.
var knownErrors = []error{ //nolint:gochecknoglobals
errGemini,
ErrGeminiLinkLineParse,
ErrGeminiRobotsParse,
ErrGeminiRobotsDisallowed,
ErrGeminiResponseHeader,
ErrGeminiRedirect,
ErrURLParse,
ErrURLDecode,
ErrUTF8Parse,
ErrTextParse,
ErrNetwork,
ErrNetworkDNS,
ErrNetworkTLS,
ErrNetworkSetConnectionDeadline,
ErrNetworkCannotWrite,
ErrNetworkResponseSizeExceededMax,
ErrDatabase,
}
// IsKnownError reports whether err is, or wraps, one of the errors listed in
// knownErrors, or has a *GeminiError anywhere in its chain.
func IsKnownError(err error) bool {
	for i := range knownErrors {
		if errors.Is(err, knownErrors[i]) {
			return true
		}
	}
	// Not a listed sentinel; fall back to the typed Gemini error.
	var geminiErr *GeminiError
	return errors.As(err, &geminiErr)
}

View File

@@ -1,24 +0,0 @@
package gemini
import (
"errors"
"fmt"
"testing"
)
func TestErrGemini(t *testing.T) {
t.Parallel()
err := NewErrGeminiStatusCode(50, "50 server error")
if !errors.As(err, new(*GeminiError)) {
t.Errorf("TestErrGemini fail")
}
}
// TestErrGeminiWrapped checks that a *GeminiError is still found by errors.As
// after being wrapped with fmt.Errorf and %w.
func TestErrGeminiWrapped(t *testing.T) {
	t.Parallel()
	wrapped := fmt.Errorf("%w wrapped", NewErrGeminiStatusCode(50, "50 server error"))
	var target *GeminiError
	if !errors.As(wrapped, &target) {
		t.Errorf("TestErrGeminiWrapped fail")
	}
}

View File

@@ -2,6 +2,7 @@ package gemini
import (
"fmt"
"gemini-grc/common"
"net/url"
"os"
"path"
@@ -63,7 +64,7 @@ func calcFilePath(rootPath, urlPath string) (string, error) {
return finalPath, nil
}
func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
func SaveToFile(rootPath string, s *common.Snapshot, done chan struct{}) {
parentPath := path.Join(rootPath, s.URL.Hostname)
urlPath := s.URL.Path
// If path is empty, add `index.gmi` as the file to save

View File

@@ -2,6 +2,7 @@ package gemini
import (
"fmt"
"gemini-grc/common"
"net/url"
"regexp"
"strconv"
@@ -9,18 +10,18 @@ import (
"gemini-grc/logging"
)
func GetPageLinks(currentURL URL, gemtext string) LinkList {
func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList {
// Grab link lines
linkLines := ExtractLinkLines(gemtext)
if len(linkLines) == 0 {
return nil
}
var linkURLs LinkList
var linkURLs common.LinkList
// Normalize URLs in links, and store them in snapshot
for _, line := range linkLines {
linkURL, err := NormalizeLink(line, currentURL.String())
if err != nil {
logging.LogDebug("%s: %s", ErrGeminiLinkLineParse, err)
logging.LogDebug("%s: %s", common.ErrGeminiLinkLineParse, err)
continue
}
linkURLs = append(linkURLs, *linkURL)
@@ -42,11 +43,11 @@ func ExtractLinkLines(gemtext string) []string {
// NormalizeLink takes a single link line and the current URL,
// return the URL converted to an absolute URL
// and its description.
func NormalizeLink(linkLine string, currentURL string) (*URL, error) {
func NormalizeLink(linkLine string, currentURL string) (*common.URL, error) {
// Parse the current URL
baseURL, err := url.Parse(currentURL)
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
}
// Regular expression to extract the URL part from a link line
@@ -56,13 +57,13 @@ func NormalizeLink(linkLine string, currentURL string) (*URL, error) {
matches := re.FindStringSubmatch(linkLine)
if len(matches) == 0 {
// If the line doesn't match the expected format, return it unchanged
return nil, fmt.Errorf("%w for link line %s", ErrGeminiLinkLineParse, linkLine)
return nil, fmt.Errorf("%w for link line %s", common.ErrGeminiLinkLineParse, linkLine)
}
originalURLStr := matches[1]
_, err = url.QueryUnescape(originalURLStr)
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrURLDecode, err)
return nil, fmt.Errorf("%w: %w", common.ErrURLDecode, err)
}
restOfLine := ""
@@ -74,7 +75,7 @@ func NormalizeLink(linkLine string, currentURL string) (*URL, error) {
parsedURL, err := url.Parse(originalURLStr)
if err != nil {
// If URL parsing fails, return an error
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
}
// Resolve relative URLs against the base URL
@@ -89,10 +90,10 @@ func NormalizeLink(linkLine string, currentURL string) (*URL, error) {
restOfLine = restOfLine[1:]
}
finalURL, err := ParseURL(parsedURL.String(), restOfLine)
finalURL, err := common.ParseURL(parsedURL.String(), restOfLine)
if err != nil {
// If URL parsing fails, return an error
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
}
return finalURL, nil
@@ -107,13 +108,13 @@ func ParseFirstTwoDigits(input string) (int, error) {
// Find the first match in the string
matches := re.FindStringSubmatch(input)
if len(matches) == 0 {
return 0, fmt.Errorf("%w", ErrGeminiResponseHeader)
return 0, fmt.Errorf("%w", common.ErrGeminiResponseHeader)
}
// Parse the captured match as an integer
snapshot, err := strconv.Atoi(matches[1])
if err != nil {
return 0, fmt.Errorf("%w: %w", ErrTextParse, err)
return 0, fmt.Errorf("%w: %w", common.ErrTextParse, err)
}
return snapshot, nil
@@ -121,7 +122,7 @@ func ParseFirstTwoDigits(input string) (int, error) {
// extractRedirectTarget returns the redirection
// URL by parsing the header (or error message)
func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
func extractRedirectTarget(currentURL common.URL, input string) (*common.URL, error) {
// \d+ - matches one or more digits
// \s+ - matches one or more whitespace
// ([^\r]+) - captures everything until it hits a \r (or end of string)
@@ -129,11 +130,11 @@ func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
re := regexp.MustCompile(pattern)
matches := re.FindStringSubmatch(input)
if len(matches) < 2 {
return nil, fmt.Errorf("%w: %s", ErrGeminiRedirect, input)
return nil, fmt.Errorf("%w: %s", common.ErrGeminiRedirect, input)
}
newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
newURL, err := common.DeriveAbsoluteURL(currentURL, matches[1])
if err != nil {
return nil, fmt.Errorf("%w: %w: %s", ErrGeminiRedirect, err, input)
return nil, fmt.Errorf("%w: %w: %s", common.ErrGeminiRedirect, err, input)
}
return newURL, nil
}

View File

@@ -1,10 +1,13 @@
package gemini
import "testing"
import (
"gemini-grc/common"
"testing"
)
func TestExtractRedirectTargetFullURL(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://smol.gr", "")
currentURL, _ := common.ParseURL("gemini://smol.gr", "")
input := "redirect: 31 gemini://target.gr"
result, err := extractRedirectTarget(*currentURL, input)
expected := "gemini://target.gr:1965"
@@ -15,7 +18,7 @@ func TestExtractRedirectTargetFullURL(t *testing.T) {
func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://smol.gr", "")
currentURL, _ := common.ParseURL("gemini://smol.gr", "")
input := "redirect: 31 gemini://target.gr/"
result, err := extractRedirectTarget(*currentURL, input)
expected := "gemini://target.gr:1965/"
@@ -26,7 +29,7 @@ func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://smol.gr", "")
currentURL, _ := common.ParseURL("gemini://smol.gr", "")
input := "redirect: 31 /a/b"
result, err := extractRedirectTarget(*currentURL, input)
if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
@@ -36,7 +39,7 @@ func TestExtractRedirectTargetRelativeURL(t *testing.T) {
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://nox.im:1965", "")
currentURL, _ := common.ParseURL("gemini://nox.im:1965", "")
input := "redirect: 31 ./"
result, err := extractRedirectTarget(*currentURL, input)
if err != nil || (result.String() != "gemini://nox.im:1965/") {
@@ -46,7 +49,7 @@ func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://status.zvava.org:1965", "")
currentURL, _ := common.ParseURL("gemini://status.zvava.org:1965", "")
input := "redirect: 31 index.gmi"
result, err := extractRedirectTarget(*currentURL, input)
if err != nil || (result.String() != "gemini://status.zvava.org:1965/index.gmi") {
@@ -56,7 +59,7 @@ func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
func TestExtractRedirectTargetWrong(t *testing.T) {
t.Parallel()
currentURL, _ := ParseURL("gemini://smol.gr", "")
currentURL, _ := common.ParseURL("gemini://smol.gr", "")
input := "redirect: 31"
result, err := extractRedirectTarget(*currentURL, input)
if result != nil || err == nil {

View File

@@ -1,229 +0,0 @@
package gemini
import (
"database/sql/driver"
"fmt"
"net/url"
"path"
"strconv"
"strings"
)
// URL is a parsed Gemini URL as produced by ParseURL and
// ParseURLNoNormalize. It is stored in the database as its Full string
// (see Scan and Value).
type URL struct {
// Protocol is the URL scheme; the parsers only accept "gemini".
Protocol string `json:"protocol,omitempty"`
// Hostname is the host without the port.
Hostname string `json:"hostname,omitempty"`
// Port is the numeric port; the parsers default it to 1965 when the input has none.
Port int `json:"port,omitempty"`
// Path is the path component as returned by the parser.
Path string `json:"path,omitempty"`
// Descr is the free-text description passed to the parser.
Descr string `json:"descr,omitempty"`
// Full is the string form "protocol://hostname:port/path", always with an explicit port.
Full string `json:"full,omitempty"`
}
// Scan implements sql.Scanner. It fills u from a database value:
//
//   - nil resets u to the zero URL;
//   - a string or []byte is parsed with ParseURLNoNormalize (drivers may hand
//     text columns over in either form);
//   - any other type is an error.
//
// A parse failure is returned wrapped, so callers can still match the
// underlying error with errors.Is / errors.As.
func (u *URL) Scan(value interface{}) error {
	var raw string
	switch v := value.(type) {
	case nil:
		// Clear the fields in the current URL object (not the pointer itself)
		*u = URL{}
		return nil
	case string:
		raw = v
	case []byte:
		raw = string(v)
	default:
		return fmt.Errorf("failed to scan GeminiUrl: expected string, got %T", value)
	}

	parsedURL, err := ParseURLNoNormalize(raw, "")
	if err != nil {
		return fmt.Errorf("failed to scan GeminiUrl %s: %w", raw, err)
	}
	*u = *parsedURL
	return nil
}
// String returns the full URL text, including the explicit port.
func (u URL) String() string {
	return u.Full
}

// StringNoDefaultPort returns the URL text with the port left out when it is
// 1965; for any other port the full text is returned unchanged.
func (u URL) StringNoDefaultPort() string {
	if u.Port != 1965 {
		return u.Full
	}
	return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path)
}

// Value implements driver.Valuer: the URL is stored as its full text, and an
// empty URL is stored as NULL.
func (u URL) Value() (driver.Value, error) {
	var stored driver.Value
	if u.Full != "" {
		stored = u.Full
	}
	return stored, nil
}
// ParseURLNoNormalize parses input as a Gemini URL exactly as written (no
// normalization) and attaches descr as its description.
//
// It returns an error wrapping ErrURLParse when input cannot be parsed or has
// an invalid port, and one wrapping ErrURLNotGemini when the scheme is not
// "gemini".
func ParseURLNoNormalize(input string, descr string) (*URL, error) {
	u, err := url.Parse(input)
	if err != nil {
		return nil, fmt.Errorf("%w: Input %s URL Parse Error: %w", ErrURLParse, input, err)
	}
	return newGeminiURL(u, input, descr)
}

// ParseURL is like ParseURLNoNormalize but first runs input through
// NormalizeURL.
func ParseURL(input string, descr string) (*URL, error) {
	u, err := NormalizeURL(input)
	if err != nil {
		return nil, fmt.Errorf("%w: Input %s URL Parse Error: %w", ErrURLParse, input, err)
	}
	return newGeminiURL(u, input, descr)
}

// newGeminiURL converts an already parsed URL into a URL value. input is the
// original text and is only used in error messages.
//
// Only scheme, hostname, port and path are carried over; a missing port
// defaults to 1965. Query and fragment are not part of the result.
func newGeminiURL(u *url.URL, input string, descr string) (*URL, error) {
	if u.Scheme != "gemini" {
		return nil, fmt.Errorf("%w: URL scheme '%s' is not supported", ErrURLNotGemini, u.Scheme)
	}
	protocol := u.Scheme
	hostname := u.Hostname()
	urlPath := u.Path

	strPort := u.Port()
	if strPort == "" {
		strPort = "1965"
	}
	port, err := strconv.Atoi(strPort)
	if err != nil {
		return nil, fmt.Errorf("%w: Input %s GeminiError %w", ErrURLParse, input, err)
	}

	full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, urlPath)
	return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: urlPath, Descr: descr, Full: full}, nil
}
// DeriveAbsoluteURL turns input, which may be relative, into an absolute URL
// based on currentURL. Its main use is computing the target of a redirect
// from a response header.
//
// Input containing "://" is treated as already absolute and parsed as is.
// Otherwise scheme, host and port come from currentURL and the path is built
// as follows:
//   - input starting with "/" replaces the path (after cleaning);
//   - "." and "./" keep the current path;
//   - anything else is appended to the current path.
func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) {
	if strings.Contains(input, "://") {
		return ParseURL(input, "")
	}

	// Relative input: work out the new path. These cases cover oddities
	// found in the wild.
	var resolved string
	switch {
	case strings.HasPrefix(input, "/"):
		resolved = path.Clean(input)
	case input == "." || input == "./":
		resolved = path.Join(currentURL.Path, "/")
	default:
		resolved = path.Join(currentURL.Path, "/", path.Clean(input))
	}

	return ParseURL(
		fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, resolved),
		"",
	)
}
// NormalizeURL parses rawURL and returns a normalized version of it.
// Normalization consists of:
//   - lowercasing scheme and host;
//   - dropping the port when it is the scheme's default (http 80, https 443,
//     gemini 1965);
//   - cleaning the path (duplicate slashes, "." and ".." segments) while
//     keeping a trailing slash; an empty path stays empty;
//   - percent-escaping every path segment;
//   - dropping a bare trailing "?" (an empty fragment is already dropped by
//     the parser).
//
// Note that the escaped path is stored in the returned URL's Path field.
// A parse failure is returned wrapped in ErrURLParse.
func NormalizeURL(rawURL string) (*url.URL, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
	}

	u.Scheme = strings.ToLower(u.Scheme)
	u.Host = strings.ToLower(u.Host)

	// Remove default ports. Trimming the ":port" suffix (rather than taking
	// Hostname()) keeps the brackets of IPv6 literals such as "[::1]".
	if port := u.Port(); port != "" {
		isDefault := (u.Scheme == "http" && port == "80") ||
			(u.Scheme == "https" && port == "443") ||
			(u.Scheme == "gemini" && port == "1965")
		if isDefault {
			u.Host = strings.TrimSuffix(u.Host, ":"+port)
		}
	}

	// Handle path normalization while preserving trailing slash
	if u.Path != "" {
		hadTrailingSlash := strings.HasSuffix(u.Path, "/")
		u.Path = path.Clean(u.Path)
		switch {
		case u.Path == ".":
			// path.Clean returns "." when nothing is left of the path.
			u.Path = "/"
		case hadTrailingSlash && u.Path != "/":
			// Restore the trailing slash that Clean removed.
			u.Path += "/"
		}
	}

	// Escape each segment separately so the "/" separators stay intact.
	segments := strings.Split(u.Path, "/")
	for i, segment := range segments {
		segments[i] = url.PathEscape(segment)
	}
	u.Path = strings.Join(segments, "/")

	// Remove trailing query if empty: without this a bare "?" would be
	// re-emitted by String().
	if u.RawQuery == "" {
		u.ForceQuery = false
	}
	return u, nil
}
// EscapeURL percent-escapes the path of a "scheme://host/path" string and
// returns the result. Each path segment is escaped on its own, so the "/"
// separators are preserved.
//
// The input is returned unchanged when it:
//   - looks escaped already (contains "%" but no "% ");
//   - has no "://" separator;
//   - has no path, or only the root path "/".
//
// NOTE(review): everything after the host is treated as path, so a "?" in a
// query string is escaped too — confirm callers only pass query-less URLs.
func EscapeURL(input string) string {
	// Only escape if not already escaped
	if strings.Contains(input, "%") && !strings.Contains(input, "% ") {
		return input
	}

	protocol, remainder, found := strings.Cut(input, "://")
	if !found {
		return input
	}

	host, rawPath, hasPath := strings.Cut(remainder, "/")
	if !hasPath {
		return protocol + "://" + host
	}
	if rawPath == "" {
		// URL ends with just the root slash
		return input
	}

	// Escape segment by segment; escaping the whole path at once would turn
	// the separators into %2F.
	segments := strings.Split(rawPath, "/")
	for i, segment := range segments {
		segments[i] = url.PathEscape(segment)
	}
	return protocol + "://" + host + "/" + strings.Join(segments, "/")
}

View File

@@ -1,223 +0,0 @@
package gemini
import (
"reflect"
"testing"
)
// TestParseURL checks that ParseURL adds the default port 1965 and that the
// stored database value is the full URL text.
func TestParseURL(t *testing.T) {
	t.Parallel()
	input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162"
	parsed, err := ParseURL(input, "")
	// Check the error first: parsed is nil on failure and must not be used.
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	value, err := parsed.Value()
	if err != nil {
		t.Fatalf("fail: %v", err)
	}
	if value != "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162" {
		t.Errorf("fail: %s", parsed)
	}
}
// TestDeriveAbsoluteURL_abs_url_input checks that an absolute URL as input
// replaces the current URL entirely.
func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) {
	t.Parallel()
	base := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	want := &URL{
		Protocol: "gemini",
		Hostname: "a.b",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://a.b:1965/c",
	}

	got, err := DeriveAbsoluteURL(base, "gemini://a.b/c")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestDeriveAbsoluteURL_abs_path_input checks that an absolute path as input
// replaces the path of the current URL while host and port are kept.
func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) {
	t.Parallel()
	base := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	want := &URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/c",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/c",
	}

	got, err := DeriveAbsoluteURL(base, "/c")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestDeriveAbsoluteURL_rel_path_input checks that a relative path as input
// is appended to the path of the current URL.
func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) {
	t.Parallel()
	base := URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b",
		Descr:    "Nothing",
		Full:     "gemini://smol.gr:1965/a/b",
	}
	want := &URL{
		Protocol: "gemini",
		Hostname: "smol.gr",
		Port:     1965,
		Path:     "/a/b/c/d",
		Descr:    "",
		Full:     "gemini://smol.gr:1965/a/b/c/d",
	}

	got, err := DeriveAbsoluteURL(base, "c/d")
	if err != nil {
		t.Errorf("fail: %v", err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeURLSlash checks that a trailing slash after a non-root path
// is kept.
func TestNormalizeURLSlash(t *testing.T) {
	t.Parallel()
	const input = "gemini://uscoffings.net/retro-computing/magazines/"
	normalized, _ := NormalizeURL(input)
	if got, want := normalized.String(), input; !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeURLNoSlash checks that no trailing slash is added to a
// non-root path.
func TestNormalizeURLNoSlash(t *testing.T) {
	t.Parallel()
	const input = "gemini://uscoffings.net/retro-computing/magazines"
	normalized, _ := NormalizeURL(input)
	if got, want := normalized.String(), input; !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeMultiSlash checks that runs of slashes collapse into one.
func TestNormalizeMultiSlash(t *testing.T) {
	t.Parallel()
	normalized, _ := NormalizeURL("gemini://uscoffings.net/retro-computing/////////a///magazines")
	got := normalized.String()
	want := "gemini://uscoffings.net/retro-computing/a/magazines"
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeTrailingSlash checks that the root path "/" is left as is.
func TestNormalizeTrailingSlash(t *testing.T) {
	t.Parallel()
	normalized, _ := NormalizeURL("gemini://uscoffings.net/")
	got := normalized.String()
	want := "gemini://uscoffings.net/"
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeNoTrailingSlash checks that a URL without any path is left
// without one.
func TestNormalizeNoTrailingSlash(t *testing.T) {
	t.Parallel()
	normalized, _ := NormalizeURL("gemini://uscoffings.net")
	got := normalized.String()
	want := "gemini://uscoffings.net"
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeTrailingSlashPath checks that "/a/" keeps its trailing slash.
func TestNormalizeTrailingSlashPath(t *testing.T) {
	t.Parallel()
	normalized, _ := NormalizeURL("gemini://uscoffings.net/a/")
	got := normalized.String()
	want := "gemini://uscoffings.net/a/"
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeNoTrailingSlashPath checks that "/a" stays without a trailing
// slash.
func TestNormalizeNoTrailingSlashPath(t *testing.T) {
	t.Parallel()
	normalized, _ := NormalizeURL("gemini://uscoffings.net/a")
	got := normalized.String()
	want := "gemini://uscoffings.net/a"
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeDot checks that "." segments and repeated slashes are removed
// from the path.
func TestNormalizeDot(t *testing.T) {
	t.Parallel()
	normalized, _ := NormalizeURL("gemini://uscoffings.net/retro-computing/./././////a///magazines")
	got := normalized.String()
	want := "gemini://uscoffings.net/retro-computing/a/magazines"
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizePort checks that the default Gemini port 1965 is removed.
func TestNormalizePort(t *testing.T) {
	t.Parallel()
	normalized, _ := NormalizeURL("gemini://uscoffings.net:1965/a")
	got := normalized.String()
	want := "gemini://uscoffings.net/a"
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}
// TestNormalizeURL checks that a non-default port is kept.
func TestNormalizeURL(t *testing.T) {
	t.Parallel()
	normalized, _ := NormalizeURL("gemini://chat.gemini.lehmann.cx:11965/")
	got := normalized.String()
	want := "gemini://chat.gemini.lehmann.cx:11965/"
	if !reflect.DeepEqual(got, want) {
		t.Errorf("fail: %#v != %#v", got, want)
	}
}

View File

@@ -4,6 +4,7 @@ import (
"crypto/tls"
"errors"
"fmt"
"gemini-grc/common"
"io"
"net"
gourl "net/url"
@@ -35,7 +36,7 @@ type PageData struct {
func getHostIPAddresses(hostname string) ([]string, error) {
addrs, err := net.LookupHost(hostname)
if err != nil {
return nil, fmt.Errorf("%w:%w", ErrNetworkDNS, err)
return nil, fmt.Errorf("%w:%w", common.ErrNetworkDNS, err)
}
IPPool.Lock.RLock()
defer func() {
@@ -47,7 +48,7 @@ func getHostIPAddresses(hostname string) ([]string, error) {
func ConnectAndGetData(url string) ([]byte, error) {
parsedURL, err := gourl.Parse(url)
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
}
hostname := parsedURL.Hostname()
port := parsedURL.Port()
@@ -61,7 +62,7 @@ func ConnectAndGetData(url string) ([]byte, error) {
}
conn, err := dialer.Dial("tcp", host)
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err)
}
// Make sure we always close the connection.
defer func() {
@@ -73,11 +74,11 @@ func ConnectAndGetData(url string) ([]byte, error) {
// Set read and write timeouts on the TCP connection.
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err)
}
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err)
}
// Perform the TLS handshake
@@ -88,7 +89,7 @@ func ConnectAndGetData(url string) ([]byte, error) {
}
tlsConn := tls.Client(conn, tlsConfig)
if err := tlsConn.Handshake(); err != nil {
return nil, fmt.Errorf("%w: %w", ErrNetworkTLS, err)
return nil, fmt.Errorf("%w: %w", common.ErrNetworkTLS, err)
}
// We read `buf`-sized chunks and add data to `data`.
@@ -99,10 +100,10 @@ func ConnectAndGetData(url string) ([]byte, error) {
// Fix for stupid server bug:
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
// when the port is 1965 and is still specified explicitly in the URL.
_url, _ := ParseURL(url, "")
_url, _ := common.ParseURL(url, "")
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
return nil, fmt.Errorf("%w: %w", common.ErrNetworkCannotWrite, err)
}
// Read response bytes in len(buf) byte chunks
for {
@@ -111,13 +112,13 @@ func ConnectAndGetData(url string) ([]byte, error) {
data = append(data, buf[:n]...)
}
if len(data) > config.CONFIG.MaxResponseSize {
return nil, fmt.Errorf("%w: %v", ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
return nil, fmt.Errorf("%w: %v", common.ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
}
if err != nil {
if errors.Is(err, io.EOF) {
break
}
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err)
}
}
return data, nil
@@ -127,16 +128,16 @@ func ConnectAndGetData(url string) ([]byte, error) {
// Mutates given Snapshot with the data.
// In case of error, we store the error string
// inside snapshot and return the error.
func Visit(s *Snapshot) (err error) {
func Visit(s *common.Snapshot) (err error) {
// Don't forget to also store error
// response code (if we have one)
// and header
defer func() {
if err != nil {
s.Error = null.StringFrom(err.Error())
if errors.As(err, new(*GeminiError)) {
s.Header = null.StringFrom(err.(*GeminiError).Header)
s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code))
if errors.As(err, new(*common.GeminiError)) {
s.Header = null.StringFrom(err.(*common.GeminiError).Header)
s.ResponseCode = null.IntFrom(int64(err.(*common.GeminiError).Code))
}
}
}()
@@ -174,7 +175,7 @@ func processData(data []byte) (*PageData, error) {
code, mimeType, lang := getMimeTypeAndLang(header)
logging.LogDebug("Header: %s", strings.TrimSpace(header))
if code != 20 {
return nil, NewErrGeminiStatusCode(code, header)
return nil, common.NewErrGeminiStatusCode(code, header)
}
pageData := PageData{
@@ -188,7 +189,7 @@ func processData(data []byte) (*PageData, error) {
if mimeType == "text/gemini" {
validBody, err := BytesToValidUTF8(body)
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrUTF8Parse, err)
return nil, fmt.Errorf("%w: %w", common.ErrUTF8Parse, err)
}
pageData.GemText = validBody
} else {
@@ -204,7 +205,7 @@ func processData(data []byte) (*PageData, error) {
func getHeadersAndData(data []byte) (string, []byte, error) {
firstLineEnds := slices.Index(data, '\n')
if firstLineEnds == -1 {
return "", nil, ErrGeminiResponseHeader
return "", nil, common.ErrGeminiResponseHeader
}
firstLine := string(data[:firstLineEnds])
rest := data[firstLineEnds+1:]

View File

@@ -2,6 +2,7 @@ package gemini
import (
"fmt"
"gemini-grc/common"
"strings"
"sync"
@@ -56,7 +57,11 @@ func populateBlacklist(key string) (entries []string) {
// RobotMatch checks if the snapshot URL matches
// a robots.txt allow rule.
func RobotMatch(url URL) bool {
func RobotMatch(u string) bool {
url, err := common.ParseURL(u, "")
if err != nil {
return false
}
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
logging.LogDebug("Checking robots.txt cache for %s", key)
var disallowedURLs []string

View File

@@ -1,42 +0,0 @@
package gemini
import (
"database/sql/driver"
"encoding/json"
"fmt"
"github.com/guregu/null/v5"
)
// LinkList is a list of URLs that is persisted as a single JSON document.
type LinkList []URL

// Value implements driver.Valuer by encoding the list as JSON.
func (l *LinkList) Value() (driver.Value, error) {
	encoded, err := json.Marshal(l)
	return encoded, err
}
// Scan implements sql.Scanner. A nil database value resets the list to nil,
// a []byte value is decoded as JSON into the list, and any other type is
// rejected with an error naming the type that was received.
func (l *LinkList) Scan(value interface{}) error {
	switch raw := value.(type) {
	case nil:
		*l = nil
		return nil
	case []byte:
		return json.Unmarshal(raw, l)
	default:
		return fmt.Errorf("failed to scan LinkList: expected []byte, got %T", value)
	}
}
// Snapshot holds the stored result of visiting one URL.
// The `db` tags name the columns each field maps to and the `json` tags
// name the keys used when the struct is serialized; note that ResponseCode
// is exposed as "code" in JSON but stored as "response_code".
// Nullable columns use the guregu/null wrapper types.
type Snapshot struct {
ID int `db:"id" json:"id,omitempty"`
URL URL `db:"url" json:"url,omitempty"`
Host string `db:"host" json:"host,omitempty"`
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
MimeType null.String `db:"mimetype" json:"mimetype,omitempty"`
Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files.
GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files.
Header null.String `db:"header" json:"header,omitempty"` // Response header.
Links null.Value[LinkList] `db:"links" json:"links,omitempty"`
Lang null.String `db:"lang" json:"lang,omitempty"`
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
Error null.String `db:"error" json:"error,omitempty"` // Error text; callers set it for visit errors (network, Gemini status) and robots.txt disallow.
}

View File

@@ -3,6 +3,8 @@ package gemini
import (
"errors"
"fmt"
"gemini-grc/common"
_db "gemini-grc/db"
"strings"
"time"
@@ -12,54 +14,6 @@ import (
"github.com/jmoiron/sqlx"
)
type WorkerStatus struct {
id int
status string
}
func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
// Create a slice to store current status of each worker
statuses := make([]string, totalWorkers)
// Initialize empty statuses
for i := range statuses {
statuses[i] = ""
}
// Initial print
var output strings.Builder
// \033[H moves the cursor to the top left corner of the screen
// (ie, the first column of the first row in the screen).
// \033[J clears the part of the screen from the cursor to the end of the screen.
output.WriteString("\033[H\033[J") // Clear screen and move cursor to top
for i := range statuses {
output.WriteString(fmt.Sprintf("[%2d] \n", i))
}
fmt.Print(output.String())
// Continuously receive status updates
for update := range statusChan {
if update.id >= totalWorkers {
continue
}
// Update the status
statuses[update.id] = update.status
// Build the complete output string
output.Reset()
output.WriteString("\033[H\033[J") // Clear screen and move cursor to top
for i, status := range statuses {
output.WriteString(fmt.Sprintf("[%2d] %.100s\n", i, status))
}
// Print the entire status
fmt.Print(output.String())
}
}
var statusChan chan WorkerStatus
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
logging.LogInfo("Spawning %d workers", numOfWorkers)
statusChan = make(chan WorkerStatus, numOfWorkers)
@@ -97,7 +51,7 @@ func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) {
// On deadlock errors, rollback and return, otherwise panic.
if err != nil {
logging.LogError("[%d] Failed to commit transaction: %w", workerID, err)
if isDeadlockError(err) {
if _db.IsDeadlockError(err) {
logging.LogError("[%d] Deadlock detected. Rolling back", workerID)
time.Sleep(time.Duration(10) * time.Second)
err := tx.Rollback()
@@ -112,78 +66,72 @@ func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) {
}
func runWorker(workerID int, tx *sqlx.Tx, url *string) {
var snapshots []Snapshot
var urls []string
var err error
// If not given a specific URL,
// get some random ones to visit from DB.
// get some random ones to visit from db.
if url == nil {
statusChan <- WorkerStatus{
id: workerID,
status: "Getting snapshots",
status: "Getting URLs",
}
snapshots, err = GetSnapshotsToVisit(tx)
urls, err = _db.GetURLsToVisit(tx)
if err != nil {
logging.LogError("[%d] GeminiError retrieving snapshot: %w", workerID, err)
panic("This should never happen")
} else if len(snapshots) == 0 {
logging.LogInfo("[%d] No snapshots to visit.", workerID)
} else if len(urls) == 0 {
logging.LogInfo("[%d] No URLs to visit.", workerID)
time.Sleep(1 * time.Minute)
return
}
} else {
snapshotURL, err := ParseURL(*url, "")
geminiURL, err := common.ParseURL(*url, "")
if err != nil {
logging.LogError("Invalid URL given: %s", *url)
return
}
snapshots = []Snapshot{{
// UID: uid.UID(),
URL: *snapshotURL,
Host: snapshotURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}}
urls = []string{geminiURL.String()}
}
total := len(snapshots)
for i, s := range snapshots {
logging.LogDebug("[%d] Snapshot %d/%d: %s", workerID, i+1, total, s.URL.String())
}
// Start visiting URLs.
for i, s := range snapshots {
logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, s.URL.String())
total := len(urls)
for i, u := range urls {
logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, u)
// We differentiate between errors:
// Unexpected errors are the ones returned from the following function.
// If an error is unexpected (which should never happen) we panic.
// Expected errors are stored as strings within the snapshot,
// so that they can also be stored in DB.
err := workOnSnapshot(workerID, tx, &s)
// Expected errors are stored as strings within the snapshot.
err := workOnUrl(workerID, tx, u)
if err != nil {
logging.LogError("[%d] [%s] Unexpected GeminiError %w", workerID, s.URL.String(), err)
logging.LogError("[%d] Unexpected GeminiError %w while visiting %s", workerID, err, u)
util.PrintStackAndPanic(err)
}
if s.Error.Valid {
logging.LogDebug("[%d] Error: %v", workerID, s.Error.String)
}
logging.LogDebug("[%d] Done %d/%d.", workerID, i+1, total)
}
}
// workOnSnapshot visits a URL and stores the result.
// workOnUrl visits a URL and stores the result.
// unexpected errors are returned.
// expected errors are stored within the snapshot.
func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
if IsBlacklisted(s.URL) {
logging.LogDebug("[%d] URL matches Blacklist, ignoring %s", workerID, s.URL.String())
func workOnUrl(workerID int, tx *sqlx.Tx, url string) (err error) {
if url == "" {
return fmt.Errorf("nil URL given")
}
if IsBlacklisted(url) {
logging.LogDebug("[%d] URL matches Blacklist, ignoring %s", workerID, url)
return nil
}
s := common.SnapshotFromURL(url)
// If URL matches a robots.txt disallow line,
// add it as an error so next time it won't be
// crawled.
if RobotMatch(s.URL) {
s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
err = UpsertSnapshot(workerID, tx, s)
if RobotMatch(url) {
s.Error = null.StringFrom(common.ErrGeminiRobotsDisallowed.Error())
err = _db.OverwriteSnapshot(workerID, tx, s)
if err != nil {
return fmt.Errorf("[%d] %w", workerID, err)
}
@@ -191,10 +139,14 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
}
// Resolve IP address via DNS
statusChan <- WorkerStatus{
id: workerID,
status: fmt.Sprintf("Resolving %s", url),
}
IPs, err := getHostIPAddresses(s.Host)
if err != nil {
s.Error = null.StringFrom(err.Error())
err = UpsertSnapshot(workerID, tx, s)
err = _db.OverwriteSnapshot(workerID, tx, s)
if err != nil {
return fmt.Errorf("[%d] %w", workerID, err)
}
@@ -209,7 +161,7 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
id: workerID,
status: fmt.Sprintf("Waiting to grab lock for host %s", s.Host),
}
time.Sleep(1 * time.Second) // Avoid flood-retrying
time.Sleep(2 * time.Second) // Avoid flood-retrying
count++
if count == 3 {
return
@@ -219,6 +171,10 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
}
}
statusChan <- WorkerStatus{
id: workerID,
status: fmt.Sprintf("Adding to pool %s", url),
}
AddIPsToPool(IPs)
// After finishing, remove the host IPs from
// the connections pool, with a small delay
@@ -226,28 +182,32 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
defer func() {
go func() {
time.Sleep(1 * time.Second)
statusChan <- WorkerStatus{
id: workerID,
status: fmt.Sprintf("Removing from pool %s", url),
}
RemoveIPsFromPool(IPs)
}()
}()
statusChan <- WorkerStatus{
id: workerID,
status: fmt.Sprintf("Visiting %s", s.URL.String()),
status: fmt.Sprintf("Visiting %s", url),
}
err = Visit(s)
if err != nil {
if !IsKnownError(err) {
logging.LogError("[%d] Unknown error visiting %s: %w", workerID, s.URL.String(), err)
if !common.IsKnownError(err) {
logging.LogError("[%d] Unknown error visiting %s: %w", workerID, url, err)
return err
}
s.Error = null.StringFrom(err.Error())
// Check if error is redirection, and handle it
if errors.As(err, new(*GeminiError)) &&
err.(*GeminiError).Msg == "redirect" {
if errors.As(err, new(*common.GeminiError)) &&
err.(*common.GeminiError).Msg == "redirect" {
err = handleRedirection(workerID, tx, s)
if err != nil {
if IsKnownError(err) {
if common.IsKnownError(err) {
s.Error = null.StringFrom(err.Error())
} else {
return err
@@ -270,7 +230,7 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
logging.LogDebug("[%d] Not text/gemini, so not looking for page links", workerID)
}
err = UpsertSnapshot(workerID, tx, s)
err = _db.OverwriteSnapshot(workerID, tx, s)
logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
if err != nil {
return err
@@ -294,12 +254,12 @@ func isAnotherWorkerVisitingHost(workerID int, IPs []string) bool {
return false
}
func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
func storeLinks(tx *sqlx.Tx, s *common.Snapshot) error {
if s.Links.Valid {
var batchSnapshots []*Snapshot
var batchSnapshots []*common.Snapshot
for _, link := range s.Links.ValueOrZero() {
if shouldPersistURL(&link) {
newSnapshot := &Snapshot{
newSnapshot := &common.Snapshot{
URL: link,
Host: link.Hostname,
Timestamp: null.TimeFrom(time.Now()),
@@ -309,7 +269,7 @@ func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
}
if len(batchSnapshots) > 0 {
err := SaveLinksToDBinBatches(tx, batchSnapshots)
err := _db.SaveLinksToDBinBatches(tx, batchSnapshots)
if err != nil {
return err
}
@@ -319,17 +279,33 @@ func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
}
// shouldPersistURL returns true if we
// should save the URL in the DB.
// should save the URL in the DB.
// Only gemini:// urls are saved.
func shouldPersistURL(u *URL) bool {
func shouldPersistURL(u *common.URL) bool {
return strings.HasPrefix(u.String(), "gemini://")
}
// haveWeVisitedURL reports whether the given URL is already known to the
// database, i.e. whether it has a row in the `urls` table or, failing that,
// in the `snapshots` table.
//
// Each lookup is a `SELECT EXISTS(...)` read with tx.Get: EXISTS always
// yields exactly one boolean row, so a missing URL comes back as false
// rather than as a "no rows" error. (sqlx's Select only accepts a slice
// destination and therefore cannot scan into a single bool.)
//
// Any database failure is returned wrapped in common.ErrDatabase.
func haveWeVisitedURL(tx *sqlx.Tx, u *common.URL) (bool, error) {
	target := u.String()
	lookups := []string{
		`SELECT EXISTS(SELECT 1 FROM urls WHERE url=$1)`,
		`SELECT EXISTS(SELECT 1 FROM snapshots WHERE url=$1)`,
	}
	for _, query := range lookups {
		var found bool
		if err := tx.Get(&found, query, target); err != nil {
			return false, fmt.Errorf("%w: %w", common.ErrDatabase, err)
		}
		if found {
			return true, nil
		}
	}
	return false, nil
}
// handleRedirection saves redirect URL as new snapshot
func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error {
func handleRedirection(workerID int, tx *sqlx.Tx, s *common.Snapshot) error {
newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
if err != nil {
if errors.Is(err, ErrGeminiRedirect) {
if errors.Is(err, common.ErrGeminiRedirect) {
logging.LogDebug("[%d] %s", workerID, err)
}
return err
@@ -337,14 +313,14 @@ func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error {
logging.LogDebug("[%d] Page redirects to %s", workerID, newURL)
// Insert fresh snapshot with new URL
if shouldPersistURL(newURL) {
snapshot := &Snapshot{
snapshot := &common.Snapshot{
// UID: uid.UID(),
URL: *newURL,
Host: newURL.Hostname,
Timestamp: null.TimeFrom(time.Now()),
}
logging.LogDebug("[%d] Saving redirection URL %s", workerID, snapshot.URL.String())
err = SaveSnapshotIfNew(tx, snapshot)
err = _db.SaveSnapshotIfNew(tx, snapshot)
if err != nil {
return err
}
@@ -352,14 +328,14 @@ func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error {
return nil
}
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) {
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]common.Snapshot, error) {
query := `
SELECT *
FROM snapshots
WHERE url=$1
LIMIT 1
`
var snapshots []Snapshot
var snapshots []common.Snapshot
err := tx.Select(&snapshots, query, url)
if err != nil {
return nil, err

54
gemini/workerStatus.go Normal file
View File

@@ -0,0 +1,54 @@
package gemini
import (
"fmt"
"strings"
)
// WorkerStatus is a single progress update emitted by a worker.
type WorkerStatus struct {
	id     int    // index of the worker's line in the status display
	status string // text to show on that line
}

// statusChan is the package-level channel on which status updates are sent.
var statusChan chan WorkerStatus

// PrintWorkerStatus renders a live status table on standard output, one line
// per worker, and redraws it every time an update arrives on statusChan.
//
// totalWorkers is the number of lines in the table. Updates whose id falls
// outside [0, totalWorkers) are ignored instead of indexing out of range.
// The function blocks until statusChan is closed.
func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
	// make zero-fills the slice, so every worker starts with an empty status.
	statuses := make([]string, totalWorkers)

	var output strings.Builder
	render := func() {
		output.Reset()
		// \033[H moves the cursor to the top left corner of the screen;
		// \033[J clears from the cursor to the end of the screen.
		output.WriteString("\033[H\033[J")
		for i, status := range statuses {
			// %.100s truncates long statuses to 100 characters.
			fmt.Fprintf(&output, "[%2d] %.100s\n", i, status)
		}
		fmt.Print(output.String())
	}

	// Initial print: an empty table.
	render()

	// Continuously receive status updates.
	for update := range statusChan {
		if update.id < 0 || update.id >= totalWorkers {
			continue
		}
		statuses[update.id] = update.status
		render()
	}
}