Break up common functions and small refactor.
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
package gemini
|
package common
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package gemini
|
package common
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package gemini
|
package common
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"database/sql/driver"
|
"database/sql/driver"
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package gemini
|
package common
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"reflect"
|
"reflect"
|
||||||
@@ -1,9 +1,10 @@
|
|||||||
package gemini
|
package common
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"database/sql/driver"
|
"database/sql/driver"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/guregu/null/v5"
|
"github.com/guregu/null/v5"
|
||||||
)
|
)
|
||||||
@@ -40,3 +41,16 @@ type Snapshot struct {
|
|||||||
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
|
ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code.
|
||||||
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
|
Error null.String `db:"error" json:"error,omitempty"` // On network errors only
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SnapshotFromURL(u string) *Snapshot {
|
||||||
|
url, err := ParseURL(u, "")
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
newSnapshot := Snapshot{
|
||||||
|
URL: *url,
|
||||||
|
Host: url.Hostname,
|
||||||
|
Timestamp: null.TimeFrom(time.Now()),
|
||||||
|
}
|
||||||
|
return &newSnapshot
|
||||||
|
}
|
||||||
@@ -1,9 +1,10 @@
|
|||||||
package gemini
|
package db
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/common"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
|
||||||
@@ -43,8 +44,8 @@ func ConnectToDB() *sqlx.DB {
|
|||||||
return db
|
return db
|
||||||
}
|
}
|
||||||
|
|
||||||
// isDeadlockError checks if the error is a PostgreSQL deadlock error
|
// IsDeadlockError checks if the error is a PostgreSQL deadlock error
|
||||||
func isDeadlockError(err error) bool {
|
func IsDeadlockError(err error) bool {
|
||||||
var pqErr *pq.Error
|
var pqErr *pq.Error
|
||||||
if errors.As(err, &pqErr) {
|
if errors.As(err, &pqErr) {
|
||||||
return pqErr.Code == "40P01" // PostgreSQL deadlock error code
|
return pqErr.Code == "40P01" // PostgreSQL deadlock error code
|
||||||
@@ -52,16 +53,25 @@ func isDeadlockError(err error) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetSnapshotsToVisit(tx *sqlx.Tx) ([]Snapshot, error) {
|
func GetURLsToVisit(tx *sqlx.Tx) ([]string, error) {
|
||||||
var snapshots []Snapshot
|
var urls []string
|
||||||
err := tx.Select(&snapshots, SQL_SELECT_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS, config.CONFIG.WorkerBatchSize)
|
err := tx.Select(&urls, SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS, config.CONFIG.WorkerBatchSize)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrDatabase, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrDatabase, err)
|
||||||
}
|
}
|
||||||
return snapshots, nil
|
return urls, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error {
|
func InsertURL(tx *sqlx.Tx, url string) error {
|
||||||
|
query := SQL_INSERT_URL
|
||||||
|
_, err := tx.NamedExec(query, url)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("%w inserting URL: %w", common.ErrDatabase, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaveSnapshotIfNew(tx *sqlx.Tx, s *common.Snapshot) error {
|
||||||
if config.CONFIG.DryRun {
|
if config.CONFIG.DryRun {
|
||||||
marshalled, err := json.MarshalIndent(s, "", " ")
|
marshalled, err := json.MarshalIndent(s, "", " ")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -78,7 +88,7 @@ func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func UpsertSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
func OverwriteSnapshot(workedID int, tx *sqlx.Tx, s *common.Snapshot) (err error) {
|
||||||
// if config.CONFIG.DryRun {
|
// if config.CONFIG.DryRun {
|
||||||
//marshalled, err := json.MarshalIndent(s, "", " ")
|
//marshalled, err := json.MarshalIndent(s, "", " ")
|
||||||
//if err != nil {
|
//if err != nil {
|
||||||
@@ -90,19 +100,19 @@ func UpsertSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
query := SQL_UPSERT_SNAPSHOT
|
query := SQL_UPSERT_SNAPSHOT
|
||||||
rows, err := tx.NamedQuery(query, s)
|
rows, err := tx.NamedQuery(query, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] %w while upserting snapshot: %w", workedID, ErrDatabase, err)
|
return fmt.Errorf("[%d] %w while upserting snapshot: %w", workedID, common.ErrDatabase, err)
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
_err := rows.Close()
|
_err := rows.Close()
|
||||||
if _err != nil {
|
if _err != nil {
|
||||||
err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, ErrDatabase, _err)
|
err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, common.ErrDatabase, _err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
if rows.Next() {
|
if rows.Next() {
|
||||||
var returnedID int
|
var returnedID int
|
||||||
err = rows.Scan(&returnedID)
|
err = rows.Scan(&returnedID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, ErrDatabase, err)
|
return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, common.ErrDatabase, err)
|
||||||
}
|
}
|
||||||
s.ID = returnedID
|
s.ID = returnedID
|
||||||
// logging.LogDebug("[%d] Upserted snapshot with ID %d", workedID, returnedID)
|
// logging.LogDebug("[%d] Upserted snapshot with ID %d", workedID, returnedID)
|
||||||
@@ -110,7 +120,7 @@ func UpsertSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *common.Snapshot) (err error) {
|
||||||
// if config.CONFIG.DryRun {
|
// if config.CONFIG.DryRun {
|
||||||
//marshalled, err := json.MarshalIndent(s, "", " ")
|
//marshalled, err := json.MarshalIndent(s, "", " ")
|
||||||
//if err != nil {
|
//if err != nil {
|
||||||
@@ -122,19 +132,19 @@ func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
query := SQL_UPDATE_SNAPSHOT
|
query := SQL_UPDATE_SNAPSHOT
|
||||||
rows, err := tx.NamedQuery(query, s)
|
rows, err := tx.NamedQuery(query, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] %w while updating snapshot: %w", workedID, ErrDatabase, err)
|
return fmt.Errorf("[%d] %w while updating snapshot: %w", workedID, common.ErrDatabase, err)
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
_err := rows.Close()
|
_err := rows.Close()
|
||||||
if _err != nil {
|
if _err != nil {
|
||||||
err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, ErrDatabase, _err)
|
err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, common.ErrDatabase, _err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
if rows.Next() {
|
if rows.Next() {
|
||||||
var returnedID int
|
var returnedID int
|
||||||
err = rows.Scan(&returnedID)
|
err = rows.Scan(&returnedID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, ErrDatabase, err)
|
return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, common.ErrDatabase, err)
|
||||||
}
|
}
|
||||||
s.ID = returnedID
|
s.ID = returnedID
|
||||||
// logging.LogDebug("[%d] Updated snapshot with ID %d", workedID, returnedID)
|
// logging.LogDebug("[%d] Updated snapshot with ID %d", workedID, returnedID)
|
||||||
@@ -142,7 +152,7 @@ func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*common.Snapshot) error {
|
||||||
if config.CONFIG.DryRun {
|
if config.CONFIG.DryRun {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -156,13 +166,13 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
|||||||
batch := snapshots[i:end]
|
batch := snapshots[i:end]
|
||||||
_, err := tx.NamedExec(query, batch)
|
_, err := tx.NamedExec(query, batch)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%w: While saving links in batches: %w", ErrDatabase, err)
|
return fmt.Errorf("%w: While saving links in batches: %w", common.ErrDatabase, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*common.Snapshot) error {
|
||||||
if config.CONFIG.DryRun {
|
if config.CONFIG.DryRun {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package gemini
|
package db
|
||||||
|
|
||||||
const (
|
const (
|
||||||
SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS = `
|
SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS = `
|
||||||
@@ -10,6 +10,16 @@ ORDER BY RANDOM()
|
|||||||
FOR UPDATE SKIP LOCKED
|
FOR UPDATE SKIP LOCKED
|
||||||
LIMIT $1
|
LIMIT $1
|
||||||
`
|
`
|
||||||
|
SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS = `
|
||||||
|
SELECT url
|
||||||
|
FROM urls u
|
||||||
|
WHERE u.id IN (
|
||||||
|
SELECT MIN(id)
|
||||||
|
FROM urls
|
||||||
|
GROUP BY host
|
||||||
|
)
|
||||||
|
LIMIT $1
|
||||||
|
`
|
||||||
SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS = `
|
SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS = `
|
||||||
SELECT *
|
SELECT *
|
||||||
FROM snapshots s
|
FROM snapshots s
|
||||||
@@ -75,4 +85,9 @@ error = :error
|
|||||||
WHERE id = :id
|
WHERE id = :id
|
||||||
RETURNING id
|
RETURNING id
|
||||||
`
|
`
|
||||||
|
SQL_INSERT_URL = `
|
||||||
|
INSERT INTO urls (url, host, timestamp)
|
||||||
|
VALUES (:url, :host, :timestamp)
|
||||||
|
ON CONFLICT (url) DO NOTHING
|
||||||
|
`
|
||||||
)
|
)
|
||||||
@@ -2,6 +2,7 @@ package gemini
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/common"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -39,7 +40,11 @@ func LoadBlacklist() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func IsBlacklisted(url URL) bool {
|
func IsBlacklisted(u string) bool {
|
||||||
|
url, err := common.ParseURL(u, "")
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
|
hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port)
|
||||||
for _, v := range *Blacklist {
|
for _, v := range *Blacklist {
|
||||||
if v == url.Hostname || v == hostWithPort {
|
if v == url.Hostname || v == hostWithPort {
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package gemini
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/common"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
@@ -63,7 +64,7 @@ func calcFilePath(rootPath, urlPath string) (string, error) {
|
|||||||
return finalPath, nil
|
return finalPath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
|
func SaveToFile(rootPath string, s *common.Snapshot, done chan struct{}) {
|
||||||
parentPath := path.Join(rootPath, s.URL.Hostname)
|
parentPath := path.Join(rootPath, s.URL.Hostname)
|
||||||
urlPath := s.URL.Path
|
urlPath := s.URL.Path
|
||||||
// If path is empty, add `index.gmi` as the file to save
|
// If path is empty, add `index.gmi` as the file to save
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package gemini
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/common"
|
||||||
"net/url"
|
"net/url"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
@@ -9,18 +10,18 @@ import (
|
|||||||
"gemini-grc/logging"
|
"gemini-grc/logging"
|
||||||
)
|
)
|
||||||
|
|
||||||
func GetPageLinks(currentURL URL, gemtext string) LinkList {
|
func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList {
|
||||||
// Grab link lines
|
// Grab link lines
|
||||||
linkLines := ExtractLinkLines(gemtext)
|
linkLines := ExtractLinkLines(gemtext)
|
||||||
if len(linkLines) == 0 {
|
if len(linkLines) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
var linkURLs LinkList
|
var linkURLs common.LinkList
|
||||||
// Normalize URLs in links, and store them in snapshot
|
// Normalize URLs in links, and store them in snapshot
|
||||||
for _, line := range linkLines {
|
for _, line := range linkLines {
|
||||||
linkURL, err := NormalizeLink(line, currentURL.String())
|
linkURL, err := NormalizeLink(line, currentURL.String())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogDebug("%s: %s", ErrGeminiLinkLineParse, err)
|
logging.LogDebug("%s: %s", common.ErrGeminiLinkLineParse, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
linkURLs = append(linkURLs, *linkURL)
|
linkURLs = append(linkURLs, *linkURL)
|
||||||
@@ -42,11 +43,11 @@ func ExtractLinkLines(gemtext string) []string {
|
|||||||
// NormalizeLink takes a single link line and the current URL,
|
// NormalizeLink takes a single link line and the current URL,
|
||||||
// return the URL converted to an absolute URL
|
// return the URL converted to an absolute URL
|
||||||
// and its description.
|
// and its description.
|
||||||
func NormalizeLink(linkLine string, currentURL string) (*URL, error) {
|
func NormalizeLink(linkLine string, currentURL string) (*common.URL, error) {
|
||||||
// Parse the current URL
|
// Parse the current URL
|
||||||
baseURL, err := url.Parse(currentURL)
|
baseURL, err := url.Parse(currentURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Regular expression to extract the URL part from a link line
|
// Regular expression to extract the URL part from a link line
|
||||||
@@ -56,13 +57,13 @@ func NormalizeLink(linkLine string, currentURL string) (*URL, error) {
|
|||||||
matches := re.FindStringSubmatch(linkLine)
|
matches := re.FindStringSubmatch(linkLine)
|
||||||
if len(matches) == 0 {
|
if len(matches) == 0 {
|
||||||
// If the line doesn't match the expected format, return it unchanged
|
// If the line doesn't match the expected format, return it unchanged
|
||||||
return nil, fmt.Errorf("%w for link line %s", ErrGeminiLinkLineParse, linkLine)
|
return nil, fmt.Errorf("%w for link line %s", common.ErrGeminiLinkLineParse, linkLine)
|
||||||
}
|
}
|
||||||
|
|
||||||
originalURLStr := matches[1]
|
originalURLStr := matches[1]
|
||||||
_, err = url.QueryUnescape(originalURLStr)
|
_, err = url.QueryUnescape(originalURLStr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrURLDecode, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrURLDecode, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
restOfLine := ""
|
restOfLine := ""
|
||||||
@@ -74,7 +75,7 @@ func NormalizeLink(linkLine string, currentURL string) (*URL, error) {
|
|||||||
parsedURL, err := url.Parse(originalURLStr)
|
parsedURL, err := url.Parse(originalURLStr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// If URL parsing fails, return an error
|
// If URL parsing fails, return an error
|
||||||
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Resolve relative URLs against the base URL
|
// Resolve relative URLs against the base URL
|
||||||
@@ -89,10 +90,10 @@ func NormalizeLink(linkLine string, currentURL string) (*URL, error) {
|
|||||||
restOfLine = restOfLine[1:]
|
restOfLine = restOfLine[1:]
|
||||||
}
|
}
|
||||||
|
|
||||||
finalURL, err := ParseURL(parsedURL.String(), restOfLine)
|
finalURL, err := common.ParseURL(parsedURL.String(), restOfLine)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// If URL parsing fails, return an error
|
// If URL parsing fails, return an error
|
||||||
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return finalURL, nil
|
return finalURL, nil
|
||||||
@@ -107,13 +108,13 @@ func ParseFirstTwoDigits(input string) (int, error) {
|
|||||||
// Find the first match in the string
|
// Find the first match in the string
|
||||||
matches := re.FindStringSubmatch(input)
|
matches := re.FindStringSubmatch(input)
|
||||||
if len(matches) == 0 {
|
if len(matches) == 0 {
|
||||||
return 0, fmt.Errorf("%w", ErrGeminiResponseHeader)
|
return 0, fmt.Errorf("%w", common.ErrGeminiResponseHeader)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse the captured match as an integer
|
// Parse the captured match as an integer
|
||||||
snapshot, err := strconv.Atoi(matches[1])
|
snapshot, err := strconv.Atoi(matches[1])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, fmt.Errorf("%w: %w", ErrTextParse, err)
|
return 0, fmt.Errorf("%w: %w", common.ErrTextParse, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return snapshot, nil
|
return snapshot, nil
|
||||||
@@ -121,7 +122,7 @@ func ParseFirstTwoDigits(input string) (int, error) {
|
|||||||
|
|
||||||
// extractRedirectTarget returns the redirection
|
// extractRedirectTarget returns the redirection
|
||||||
// URL by parsing the header (or error message)
|
// URL by parsing the header (or error message)
|
||||||
func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
|
func extractRedirectTarget(currentURL common.URL, input string) (*common.URL, error) {
|
||||||
// \d+ - matches one or more digits
|
// \d+ - matches one or more digits
|
||||||
// \s+ - matches one or more whitespace
|
// \s+ - matches one or more whitespace
|
||||||
// ([^\r]+) - captures everything until it hits a \r (or end of string)
|
// ([^\r]+) - captures everything until it hits a \r (or end of string)
|
||||||
@@ -129,11 +130,11 @@ func extractRedirectTarget(currentURL URL, input string) (*URL, error) {
|
|||||||
re := regexp.MustCompile(pattern)
|
re := regexp.MustCompile(pattern)
|
||||||
matches := re.FindStringSubmatch(input)
|
matches := re.FindStringSubmatch(input)
|
||||||
if len(matches) < 2 {
|
if len(matches) < 2 {
|
||||||
return nil, fmt.Errorf("%w: %s", ErrGeminiRedirect, input)
|
return nil, fmt.Errorf("%w: %s", common.ErrGeminiRedirect, input)
|
||||||
}
|
}
|
||||||
newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
|
newURL, err := common.DeriveAbsoluteURL(currentURL, matches[1])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w: %s", ErrGeminiRedirect, err, input)
|
return nil, fmt.Errorf("%w: %w: %s", common.ErrGeminiRedirect, err, input)
|
||||||
}
|
}
|
||||||
return newURL, nil
|
return newURL, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
package gemini
|
package gemini
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"gemini-grc/common"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestExtractRedirectTargetFullURL(t *testing.T) {
|
func TestExtractRedirectTargetFullURL(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
currentURL, _ := common.ParseURL("gemini://smol.gr", "")
|
||||||
input := "redirect: 31 gemini://target.gr"
|
input := "redirect: 31 gemini://target.gr"
|
||||||
result, err := extractRedirectTarget(*currentURL, input)
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
expected := "gemini://target.gr:1965"
|
expected := "gemini://target.gr:1965"
|
||||||
@@ -15,7 +18,7 @@ func TestExtractRedirectTargetFullURL(t *testing.T) {
|
|||||||
|
|
||||||
func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
|
func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
currentURL, _ := common.ParseURL("gemini://smol.gr", "")
|
||||||
input := "redirect: 31 gemini://target.gr/"
|
input := "redirect: 31 gemini://target.gr/"
|
||||||
result, err := extractRedirectTarget(*currentURL, input)
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
expected := "gemini://target.gr:1965/"
|
expected := "gemini://target.gr:1965/"
|
||||||
@@ -26,7 +29,7 @@ func TestExtractRedirectTargetFullURLSlash(t *testing.T) {
|
|||||||
|
|
||||||
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
|
func TestExtractRedirectTargetRelativeURL(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
currentURL, _ := common.ParseURL("gemini://smol.gr", "")
|
||||||
input := "redirect: 31 /a/b"
|
input := "redirect: 31 /a/b"
|
||||||
result, err := extractRedirectTarget(*currentURL, input)
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
|
if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") {
|
||||||
@@ -36,7 +39,7 @@ func TestExtractRedirectTargetRelativeURL(t *testing.T) {
|
|||||||
|
|
||||||
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
|
func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
currentURL, _ := ParseURL("gemini://nox.im:1965", "")
|
currentURL, _ := common.ParseURL("gemini://nox.im:1965", "")
|
||||||
input := "redirect: 31 ./"
|
input := "redirect: 31 ./"
|
||||||
result, err := extractRedirectTarget(*currentURL, input)
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
if err != nil || (result.String() != "gemini://nox.im:1965/") {
|
if err != nil || (result.String() != "gemini://nox.im:1965/") {
|
||||||
@@ -46,7 +49,7 @@ func TestExtractRedirectTargetRelativeURL2(t *testing.T) {
|
|||||||
|
|
||||||
func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
|
func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
currentURL, _ := ParseURL("gemini://status.zvava.org:1965", "")
|
currentURL, _ := common.ParseURL("gemini://status.zvava.org:1965", "")
|
||||||
input := "redirect: 31 index.gmi"
|
input := "redirect: 31 index.gmi"
|
||||||
result, err := extractRedirectTarget(*currentURL, input)
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
if err != nil || (result.String() != "gemini://status.zvava.org:1965/index.gmi") {
|
if err != nil || (result.String() != "gemini://status.zvava.org:1965/index.gmi") {
|
||||||
@@ -56,7 +59,7 @@ func TestExtractRedirectTargetRelativeURL3(t *testing.T) {
|
|||||||
|
|
||||||
func TestExtractRedirectTargetWrong(t *testing.T) {
|
func TestExtractRedirectTargetWrong(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
currentURL, _ := ParseURL("gemini://smol.gr", "")
|
currentURL, _ := common.ParseURL("gemini://smol.gr", "")
|
||||||
input := "redirect: 31"
|
input := "redirect: 31"
|
||||||
result, err := extractRedirectTarget(*currentURL, input)
|
result, err := extractRedirectTarget(*currentURL, input)
|
||||||
if result != nil || err == nil {
|
if result != nil || err == nil {
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"crypto/tls"
|
"crypto/tls"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/common"
|
||||||
"io"
|
"io"
|
||||||
"net"
|
"net"
|
||||||
gourl "net/url"
|
gourl "net/url"
|
||||||
@@ -35,7 +36,7 @@ type PageData struct {
|
|||||||
func getHostIPAddresses(hostname string) ([]string, error) {
|
func getHostIPAddresses(hostname string) ([]string, error) {
|
||||||
addrs, err := net.LookupHost(hostname)
|
addrs, err := net.LookupHost(hostname)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w:%w", ErrNetworkDNS, err)
|
return nil, fmt.Errorf("%w:%w", common.ErrNetworkDNS, err)
|
||||||
}
|
}
|
||||||
IPPool.Lock.RLock()
|
IPPool.Lock.RLock()
|
||||||
defer func() {
|
defer func() {
|
||||||
@@ -47,7 +48,7 @@ func getHostIPAddresses(hostname string) ([]string, error) {
|
|||||||
func ConnectAndGetData(url string) ([]byte, error) {
|
func ConnectAndGetData(url string) ([]byte, error) {
|
||||||
parsedURL, err := gourl.Parse(url)
|
parsedURL, err := gourl.Parse(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrURLParse, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
|
||||||
}
|
}
|
||||||
hostname := parsedURL.Hostname()
|
hostname := parsedURL.Hostname()
|
||||||
port := parsedURL.Port()
|
port := parsedURL.Port()
|
||||||
@@ -61,7 +62,7 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
}
|
}
|
||||||
conn, err := dialer.Dial("tcp", host)
|
conn, err := dialer.Dial("tcp", host)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err)
|
||||||
}
|
}
|
||||||
// Make sure we always close the connection.
|
// Make sure we always close the connection.
|
||||||
defer func() {
|
defer func() {
|
||||||
@@ -73,11 +74,11 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
// Set read and write timeouts on the TCP connection.
|
// Set read and write timeouts on the TCP connection.
|
||||||
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
|
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err)
|
||||||
}
|
}
|
||||||
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
|
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Perform the TLS handshake
|
// Perform the TLS handshake
|
||||||
@@ -88,7 +89,7 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
}
|
}
|
||||||
tlsConn := tls.Client(conn, tlsConfig)
|
tlsConn := tls.Client(conn, tlsConfig)
|
||||||
if err := tlsConn.Handshake(); err != nil {
|
if err := tlsConn.Handshake(); err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrNetworkTLS, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrNetworkTLS, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We read `buf`-sized chunks and add data to `data`.
|
// We read `buf`-sized chunks and add data to `data`.
|
||||||
@@ -99,10 +100,10 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
// Fix for stupid server bug:
|
// Fix for stupid server bug:
|
||||||
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
|
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
|
||||||
// when the port is 1965 and is still specified explicitly in the URL.
|
// when the port is 1965 and is still specified explicitly in the URL.
|
||||||
_url, _ := ParseURL(url, "")
|
_url, _ := common.ParseURL(url, "")
|
||||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
|
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort())))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrNetworkCannotWrite, err)
|
||||||
}
|
}
|
||||||
// Read response bytes in len(buf) byte chunks
|
// Read response bytes in len(buf) byte chunks
|
||||||
for {
|
for {
|
||||||
@@ -111,13 +112,13 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
data = append(data, buf[:n]...)
|
data = append(data, buf[:n]...)
|
||||||
}
|
}
|
||||||
if len(data) > config.CONFIG.MaxResponseSize {
|
if len(data) > config.CONFIG.MaxResponseSize {
|
||||||
return nil, fmt.Errorf("%w: %v", ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
|
return nil, fmt.Errorf("%w: %v", common.ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize)
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if errors.Is(err, io.EOF) {
|
if errors.Is(err, io.EOF) {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
return nil, fmt.Errorf("%w: %w", ErrNetwork, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return data, nil
|
return data, nil
|
||||||
@@ -127,16 +128,16 @@ func ConnectAndGetData(url string) ([]byte, error) {
|
|||||||
// Mutates given Snapshot with the data.
|
// Mutates given Snapshot with the data.
|
||||||
// In case of error, we store the error string
|
// In case of error, we store the error string
|
||||||
// inside snapshot and return the error.
|
// inside snapshot and return the error.
|
||||||
func Visit(s *Snapshot) (err error) {
|
func Visit(s *common.Snapshot) (err error) {
|
||||||
// Don't forget to also store error
|
// Don't forget to also store error
|
||||||
// response code (if we have one)
|
// response code (if we have one)
|
||||||
// and header
|
// and header
|
||||||
defer func() {
|
defer func() {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.Error = null.StringFrom(err.Error())
|
s.Error = null.StringFrom(err.Error())
|
||||||
if errors.As(err, new(*GeminiError)) {
|
if errors.As(err, new(*common.GeminiError)) {
|
||||||
s.Header = null.StringFrom(err.(*GeminiError).Header)
|
s.Header = null.StringFrom(err.(*common.GeminiError).Header)
|
||||||
s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code))
|
s.ResponseCode = null.IntFrom(int64(err.(*common.GeminiError).Code))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
@@ -174,7 +175,7 @@ func processData(data []byte) (*PageData, error) {
|
|||||||
code, mimeType, lang := getMimeTypeAndLang(header)
|
code, mimeType, lang := getMimeTypeAndLang(header)
|
||||||
logging.LogDebug("Header: %s", strings.TrimSpace(header))
|
logging.LogDebug("Header: %s", strings.TrimSpace(header))
|
||||||
if code != 20 {
|
if code != 20 {
|
||||||
return nil, NewErrGeminiStatusCode(code, header)
|
return nil, common.NewErrGeminiStatusCode(code, header)
|
||||||
}
|
}
|
||||||
|
|
||||||
pageData := PageData{
|
pageData := PageData{
|
||||||
@@ -188,7 +189,7 @@ func processData(data []byte) (*PageData, error) {
|
|||||||
if mimeType == "text/gemini" {
|
if mimeType == "text/gemini" {
|
||||||
validBody, err := BytesToValidUTF8(body)
|
validBody, err := BytesToValidUTF8(body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%w: %w", ErrUTF8Parse, err)
|
return nil, fmt.Errorf("%w: %w", common.ErrUTF8Parse, err)
|
||||||
}
|
}
|
||||||
pageData.GemText = validBody
|
pageData.GemText = validBody
|
||||||
} else {
|
} else {
|
||||||
@@ -204,7 +205,7 @@ func processData(data []byte) (*PageData, error) {
|
|||||||
func getHeadersAndData(data []byte) (string, []byte, error) {
|
func getHeadersAndData(data []byte) (string, []byte, error) {
|
||||||
firstLineEnds := slices.Index(data, '\n')
|
firstLineEnds := slices.Index(data, '\n')
|
||||||
if firstLineEnds == -1 {
|
if firstLineEnds == -1 {
|
||||||
return "", nil, ErrGeminiResponseHeader
|
return "", nil, common.ErrGeminiResponseHeader
|
||||||
}
|
}
|
||||||
firstLine := string(data[:firstLineEnds])
|
firstLine := string(data[:firstLineEnds])
|
||||||
rest := data[firstLineEnds+1:]
|
rest := data[firstLineEnds+1:]
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package gemini
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/common"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
@@ -56,7 +57,11 @@ func populateBlacklist(key string) (entries []string) {
|
|||||||
|
|
||||||
// RobotMatch checks if the snapshot URL matches
|
// RobotMatch checks if the snapshot URL matches
|
||||||
// a robots.txt allow rule.
|
// a robots.txt allow rule.
|
||||||
func RobotMatch(url URL) bool {
|
func RobotMatch(u string) bool {
|
||||||
|
url, err := common.ParseURL(u, "")
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
|
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
|
||||||
logging.LogDebug("Checking robots.txt cache for %s", key)
|
logging.LogDebug("Checking robots.txt cache for %s", key)
|
||||||
var disallowedURLs []string
|
var disallowedURLs []string
|
||||||
|
|||||||
182
gemini/worker.go
182
gemini/worker.go
@@ -3,6 +3,8 @@ package gemini
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"gemini-grc/common"
|
||||||
|
_db "gemini-grc/db"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -12,54 +14,6 @@ import (
|
|||||||
"github.com/jmoiron/sqlx"
|
"github.com/jmoiron/sqlx"
|
||||||
)
|
)
|
||||||
|
|
||||||
type WorkerStatus struct {
|
|
||||||
id int
|
|
||||||
status string
|
|
||||||
}
|
|
||||||
|
|
||||||
func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
|
|
||||||
// Create a slice to store current status of each worker
|
|
||||||
statuses := make([]string, totalWorkers)
|
|
||||||
|
|
||||||
// Initialize empty statuses
|
|
||||||
for i := range statuses {
|
|
||||||
statuses[i] = ""
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initial print
|
|
||||||
var output strings.Builder
|
|
||||||
// \033[H moves the cursor to the top left corner of the screen
|
|
||||||
// (ie, the first column of the first row in the screen).
|
|
||||||
// \033[J clears the part of the screen from the cursor to the end of the screen.
|
|
||||||
output.WriteString("\033[H\033[J") // Clear screen and move cursor to top
|
|
||||||
for i := range statuses {
|
|
||||||
output.WriteString(fmt.Sprintf("[%2d] \n", i))
|
|
||||||
}
|
|
||||||
fmt.Print(output.String())
|
|
||||||
|
|
||||||
// Continuously receive status updates
|
|
||||||
for update := range statusChan {
|
|
||||||
if update.id >= totalWorkers {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update the status
|
|
||||||
statuses[update.id] = update.status
|
|
||||||
|
|
||||||
// Build the complete output string
|
|
||||||
output.Reset()
|
|
||||||
output.WriteString("\033[H\033[J") // Clear screen and move cursor to top
|
|
||||||
for i, status := range statuses {
|
|
||||||
output.WriteString(fmt.Sprintf("[%2d] %.100s\n", i, status))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Print the entire status
|
|
||||||
fmt.Print(output.String())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var statusChan chan WorkerStatus
|
|
||||||
|
|
||||||
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
|
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
|
||||||
logging.LogInfo("Spawning %d workers", numOfWorkers)
|
logging.LogInfo("Spawning %d workers", numOfWorkers)
|
||||||
statusChan = make(chan WorkerStatus, numOfWorkers)
|
statusChan = make(chan WorkerStatus, numOfWorkers)
|
||||||
@@ -97,7 +51,7 @@ func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) {
|
|||||||
// On deadlock errors, rollback and return, otherwise panic.
|
// On deadlock errors, rollback and return, otherwise panic.
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("[%d] Failed to commit transaction: %w", workerID, err)
|
logging.LogError("[%d] Failed to commit transaction: %w", workerID, err)
|
||||||
if isDeadlockError(err) {
|
if _db.IsDeadlockError(err) {
|
||||||
logging.LogError("[%d] Deadlock detected. Rolling back", workerID)
|
logging.LogError("[%d] Deadlock detected. Rolling back", workerID)
|
||||||
time.Sleep(time.Duration(10) * time.Second)
|
time.Sleep(time.Duration(10) * time.Second)
|
||||||
err := tx.Rollback()
|
err := tx.Rollback()
|
||||||
@@ -112,78 +66,72 @@ func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func runWorker(workerID int, tx *sqlx.Tx, url *string) {
|
func runWorker(workerID int, tx *sqlx.Tx, url *string) {
|
||||||
var snapshots []Snapshot
|
var urls []string
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
// If not given a specific URL,
|
// If not given a specific URL,
|
||||||
// get some random ones to visit from DB.
|
// get some random ones to visit from db.
|
||||||
if url == nil {
|
if url == nil {
|
||||||
statusChan <- WorkerStatus{
|
statusChan <- WorkerStatus{
|
||||||
id: workerID,
|
id: workerID,
|
||||||
status: "Getting snapshots",
|
status: "Getting URLs",
|
||||||
}
|
}
|
||||||
snapshots, err = GetSnapshotsToVisit(tx)
|
urls, err = _db.GetURLsToVisit(tx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("[%d] GeminiError retrieving snapshot: %w", workerID, err)
|
logging.LogError("[%d] GeminiError retrieving snapshot: %w", workerID, err)
|
||||||
panic("This should never happen")
|
panic("This should never happen")
|
||||||
} else if len(snapshots) == 0 {
|
} else if len(urls) == 0 {
|
||||||
logging.LogInfo("[%d] No snapshots to visit.", workerID)
|
logging.LogInfo("[%d] No URLs to visit.", workerID)
|
||||||
time.Sleep(1 * time.Minute)
|
time.Sleep(1 * time.Minute)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
snapshotURL, err := ParseURL(*url, "")
|
geminiURL, err := common.ParseURL(*url, "")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("Invalid URL given: %s", *url)
|
logging.LogError("Invalid URL given: %s", *url)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
snapshots = []Snapshot{{
|
urls = []string{geminiURL.String()}
|
||||||
// UID: uid.UID(),
|
|
||||||
URL: *snapshotURL,
|
|
||||||
Host: snapshotURL.Hostname,
|
|
||||||
Timestamp: null.TimeFrom(time.Now()),
|
|
||||||
}}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
total := len(snapshots)
|
|
||||||
for i, s := range snapshots {
|
|
||||||
logging.LogDebug("[%d] Snapshot %d/%d: %s", workerID, i+1, total, s.URL.String())
|
|
||||||
}
|
|
||||||
// Start visiting URLs.
|
// Start visiting URLs.
|
||||||
for i, s := range snapshots {
|
total := len(urls)
|
||||||
logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, s.URL.String())
|
for i, u := range urls {
|
||||||
|
logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, u)
|
||||||
// We differentiate between errors:
|
// We differentiate between errors:
|
||||||
// Unexpected errors are the ones returned from the following function.
|
// Unexpected errors are the ones returned from the following function.
|
||||||
// If an error is unexpected (which should never happen) we panic.
|
// If an error is unexpected (which should never happen) we panic.
|
||||||
// Expected errors are stored as strings within the snapshot,
|
// Expected errors are stored as strings within the snapshot.
|
||||||
// so that they can also be stored in DB.
|
err := workOnUrl(workerID, tx, u)
|
||||||
err := workOnSnapshot(workerID, tx, &s)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logging.LogError("[%d] [%s] Unexpected GeminiError %w", workerID, s.URL.String(), err)
|
logging.LogError("[%d] Unexpected GeminiError %w while visiting %s", workerID, err, u)
|
||||||
util.PrintStackAndPanic(err)
|
util.PrintStackAndPanic(err)
|
||||||
}
|
}
|
||||||
if s.Error.Valid {
|
|
||||||
logging.LogDebug("[%d] Error: %v", workerID, s.Error.String)
|
|
||||||
}
|
|
||||||
logging.LogDebug("[%d] Done %d/%d.", workerID, i+1, total)
|
logging.LogDebug("[%d] Done %d/%d.", workerID, i+1, total)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// workOnSnapshot visits a URL and stores the result.
|
// workOnUrl visits a URL and stores the result.
|
||||||
// unexpected errors are returned.
|
// unexpected errors are returned.
|
||||||
// expected errors are stored within the snapshot.
|
// expected errors are stored within the snapshot.
|
||||||
func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
func workOnUrl(workerID int, tx *sqlx.Tx, url string) (err error) {
|
||||||
if IsBlacklisted(s.URL) {
|
if url == "" {
|
||||||
logging.LogDebug("[%d] URL matches Blacklist, ignoring %s", workerID, s.URL.String())
|
return fmt.Errorf("nil URL given")
|
||||||
|
}
|
||||||
|
|
||||||
|
if IsBlacklisted(url) {
|
||||||
|
logging.LogDebug("[%d] URL matches Blacklist, ignoring %s", workerID, url)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s := common.SnapshotFromURL(url)
|
||||||
|
|
||||||
// If URL matches a robots.txt disallow line,
|
// If URL matches a robots.txt disallow line,
|
||||||
// add it as an error so next time it won't be
|
// add it as an error so next time it won't be
|
||||||
// crawled.
|
// crawled.
|
||||||
if RobotMatch(s.URL) {
|
if RobotMatch(url) {
|
||||||
s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
|
s.Error = null.StringFrom(common.ErrGeminiRobotsDisallowed.Error())
|
||||||
err = UpsertSnapshot(workerID, tx, s)
|
err = _db.OverwriteSnapshot(workerID, tx, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] %w", workerID, err)
|
return fmt.Errorf("[%d] %w", workerID, err)
|
||||||
}
|
}
|
||||||
@@ -191,10 +139,14 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Resolve IP address via DNS
|
// Resolve IP address via DNS
|
||||||
|
statusChan <- WorkerStatus{
|
||||||
|
id: workerID,
|
||||||
|
status: fmt.Sprintf("Resolving %s", url),
|
||||||
|
}
|
||||||
IPs, err := getHostIPAddresses(s.Host)
|
IPs, err := getHostIPAddresses(s.Host)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.Error = null.StringFrom(err.Error())
|
s.Error = null.StringFrom(err.Error())
|
||||||
err = UpsertSnapshot(workerID, tx, s)
|
err = _db.OverwriteSnapshot(workerID, tx, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[%d] %w", workerID, err)
|
return fmt.Errorf("[%d] %w", workerID, err)
|
||||||
}
|
}
|
||||||
@@ -209,7 +161,7 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
id: workerID,
|
id: workerID,
|
||||||
status: fmt.Sprintf("Waiting to grab lock for host %s", s.Host),
|
status: fmt.Sprintf("Waiting to grab lock for host %s", s.Host),
|
||||||
}
|
}
|
||||||
time.Sleep(1 * time.Second) // Avoid flood-retrying
|
time.Sleep(2 * time.Second) // Avoid flood-retrying
|
||||||
count++
|
count++
|
||||||
if count == 3 {
|
if count == 3 {
|
||||||
return
|
return
|
||||||
@@ -219,6 +171,10 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
statusChan <- WorkerStatus{
|
||||||
|
id: workerID,
|
||||||
|
status: fmt.Sprintf("Adding to pool %s", url),
|
||||||
|
}
|
||||||
AddIPsToPool(IPs)
|
AddIPsToPool(IPs)
|
||||||
// After finishing, remove the host IPs from
|
// After finishing, remove the host IPs from
|
||||||
// the connections pool, with a small delay
|
// the connections pool, with a small delay
|
||||||
@@ -226,28 +182,32 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
defer func() {
|
defer func() {
|
||||||
go func() {
|
go func() {
|
||||||
time.Sleep(1 * time.Second)
|
time.Sleep(1 * time.Second)
|
||||||
|
statusChan <- WorkerStatus{
|
||||||
|
id: workerID,
|
||||||
|
status: fmt.Sprintf("Removing from pool %s", url),
|
||||||
|
}
|
||||||
RemoveIPsFromPool(IPs)
|
RemoveIPsFromPool(IPs)
|
||||||
}()
|
}()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
statusChan <- WorkerStatus{
|
statusChan <- WorkerStatus{
|
||||||
id: workerID,
|
id: workerID,
|
||||||
status: fmt.Sprintf("Visiting %s", s.URL.String()),
|
status: fmt.Sprintf("Visiting %s", url),
|
||||||
}
|
}
|
||||||
|
|
||||||
err = Visit(s)
|
err = Visit(s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if !IsKnownError(err) {
|
if !common.IsKnownError(err) {
|
||||||
logging.LogError("[%d] Unknown error visiting %s: %w", workerID, s.URL.String(), err)
|
logging.LogError("[%d] Unknown error visiting %s: %w", workerID, url, err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
s.Error = null.StringFrom(err.Error())
|
s.Error = null.StringFrom(err.Error())
|
||||||
// Check if error is redirection, and handle it
|
// Check if error is redirection, and handle it
|
||||||
if errors.As(err, new(*GeminiError)) &&
|
if errors.As(err, new(*common.GeminiError)) &&
|
||||||
err.(*GeminiError).Msg == "redirect" {
|
err.(*common.GeminiError).Msg == "redirect" {
|
||||||
err = handleRedirection(workerID, tx, s)
|
err = handleRedirection(workerID, tx, s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if IsKnownError(err) {
|
if common.IsKnownError(err) {
|
||||||
s.Error = null.StringFrom(err.Error())
|
s.Error = null.StringFrom(err.Error())
|
||||||
} else {
|
} else {
|
||||||
return err
|
return err
|
||||||
@@ -270,7 +230,7 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
logging.LogDebug("[%d] Not text/gemini, so not looking for page links", workerID)
|
logging.LogDebug("[%d] Not text/gemini, so not looking for page links", workerID)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = UpsertSnapshot(workerID, tx, s)
|
err = _db.OverwriteSnapshot(workerID, tx, s)
|
||||||
logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
|
logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -294,12 +254,12 @@ func isAnotherWorkerVisitingHost(workerID int, IPs []string) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
|
func storeLinks(tx *sqlx.Tx, s *common.Snapshot) error {
|
||||||
if s.Links.Valid {
|
if s.Links.Valid {
|
||||||
var batchSnapshots []*Snapshot
|
var batchSnapshots []*common.Snapshot
|
||||||
for _, link := range s.Links.ValueOrZero() {
|
for _, link := range s.Links.ValueOrZero() {
|
||||||
if shouldPersistURL(&link) {
|
if shouldPersistURL(&link) {
|
||||||
newSnapshot := &Snapshot{
|
newSnapshot := &common.Snapshot{
|
||||||
URL: link,
|
URL: link,
|
||||||
Host: link.Hostname,
|
Host: link.Hostname,
|
||||||
Timestamp: null.TimeFrom(time.Now()),
|
Timestamp: null.TimeFrom(time.Now()),
|
||||||
@@ -309,7 +269,7 @@ func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(batchSnapshots) > 0 {
|
if len(batchSnapshots) > 0 {
|
||||||
err := SaveLinksToDBinBatches(tx, batchSnapshots)
|
err := _db.SaveLinksToDBinBatches(tx, batchSnapshots)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -319,17 +279,33 @@ func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// shouldPersistURL returns true if we
|
// shouldPersistURL returns true if we
|
||||||
// should save the URL in the DB.
|
// should save the URL in the _db.
|
||||||
// Only gemini:// urls are saved.
|
// Only gemini:// urls are saved.
|
||||||
func shouldPersistURL(u *URL) bool {
|
func shouldPersistURL(u *common.URL) bool {
|
||||||
return strings.HasPrefix(u.String(), "gemini://")
|
return strings.HasPrefix(u.String(), "gemini://")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func haveWeVisitedURL(tx *sqlx.Tx, u *common.URL) (bool, error) {
|
||||||
|
var result bool
|
||||||
|
err := tx.Select(&result, `SELECT TRUE FROM urls WHERE url=$1`, u.String())
|
||||||
|
if err != nil {
|
||||||
|
return false, fmt.Errorf("%w: %w", common.ErrDatabase, err)
|
||||||
|
}
|
||||||
|
if result {
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
err = tx.Select(&result, `SELECT TRUE FROM snapshots WHERE snapshot.url=$1`, u.String())
|
||||||
|
if err != nil {
|
||||||
|
return false, fmt.Errorf("%w: %w", common.ErrDatabase, err)
|
||||||
|
}
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
// handleRedirection saves redirect URL as new snapshot
|
// handleRedirection saves redirect URL as new snapshot
|
||||||
func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error {
|
func handleRedirection(workerID int, tx *sqlx.Tx, s *common.Snapshot) error {
|
||||||
newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
|
newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if errors.Is(err, ErrGeminiRedirect) {
|
if errors.Is(err, common.ErrGeminiRedirect) {
|
||||||
logging.LogDebug("[%d] %s", workerID, err)
|
logging.LogDebug("[%d] %s", workerID, err)
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
@@ -337,14 +313,14 @@ func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error {
|
|||||||
logging.LogDebug("[%d] Page redirects to %s", workerID, newURL)
|
logging.LogDebug("[%d] Page redirects to %s", workerID, newURL)
|
||||||
// Insert fresh snapshot with new URL
|
// Insert fresh snapshot with new URL
|
||||||
if shouldPersistURL(newURL) {
|
if shouldPersistURL(newURL) {
|
||||||
snapshot := &Snapshot{
|
snapshot := &common.Snapshot{
|
||||||
// UID: uid.UID(),
|
// UID: uid.UID(),
|
||||||
URL: *newURL,
|
URL: *newURL,
|
||||||
Host: newURL.Hostname,
|
Host: newURL.Hostname,
|
||||||
Timestamp: null.TimeFrom(time.Now()),
|
Timestamp: null.TimeFrom(time.Now()),
|
||||||
}
|
}
|
||||||
logging.LogDebug("[%d] Saving redirection URL %s", workerID, snapshot.URL.String())
|
logging.LogDebug("[%d] Saving redirection URL %s", workerID, snapshot.URL.String())
|
||||||
err = SaveSnapshotIfNew(tx, snapshot)
|
err = _db.SaveSnapshotIfNew(tx, snapshot)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -352,14 +328,14 @@ func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) {
|
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]common.Snapshot, error) {
|
||||||
query := `
|
query := `
|
||||||
SELECT *
|
SELECT *
|
||||||
FROM snapshots
|
FROM snapshots
|
||||||
WHERE url=$1
|
WHERE url=$1
|
||||||
LIMIT 1
|
LIMIT 1
|
||||||
`
|
`
|
||||||
var snapshots []Snapshot
|
var snapshots []common.Snapshot
|
||||||
err := tx.Select(&snapshots, query, url)
|
err := tx.Select(&snapshots, query, url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|||||||
54
gemini/workerStatus.go
Normal file
54
gemini/workerStatus.go
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
package gemini
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// WorkerStatus is a single status report emitted by a worker.
type WorkerStatus struct {
	id     int    // index of the reporting worker; selects the status line to update
	status string // free-form text describing what the worker is doing
}

// statusChan is the package-level channel on which workers publish their
// status updates.
var statusChan chan WorkerStatus

// PrintWorkerStatus renders a live, full-screen table with one line per
// worker and redraws it every time an update arrives on statusChan.
//
// totalWorkers is the number of status lines to display. Updates whose id
// falls outside [0, totalWorkers) are ignored. Each status is truncated to
// 100 characters when printed. The function blocks until statusChan is
// closed.
//
// Note that the statusChan parameter shadows the package-level variable of
// the same name; only the channel passed in is read.
func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
	// Current status of each worker; all start out empty.
	statuses := make([]string, totalWorkers)

	var output strings.Builder

	// render redraws the whole table from the current statuses.
	render := func() {
		output.Reset()
		// \033[H moves the cursor to the top left corner of the screen
		// (ie, the first column of the first row in the screen).
		// \033[J clears the part of the screen from the cursor to the end of the screen.
		output.WriteString("\033[H\033[J")
		for i, status := range statuses {
			fmt.Fprintf(&output, "[%2d] %.100s\n", i, status)
		}
		fmt.Print(output.String())
	}

	// Initial print.
	render()

	// Continuously receive status updates.
	for update := range statusChan {
		// Ignore updates from unknown workers instead of indexing
		// out of range (negative ids included).
		if update.id < 0 || update.id >= totalWorkers {
			continue
		}
		statuses[update.id] = update.status
		render()
	}
}
|
||||||
7
main.go
7
main.go
@@ -1,6 +1,7 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
main2 "gemini-grc/db"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
"syscall"
|
"syscall"
|
||||||
@@ -29,11 +30,7 @@ func runApp() error {
|
|||||||
signals := make(chan os.Signal, 1)
|
signals := make(chan os.Signal, 1)
|
||||||
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
|
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
|
||||||
db := gemini.ConnectToDB()
|
db := main2.ConnectToDB()
|
||||||
|
|
||||||
// !!! DANGER !!!
|
|
||||||
// Removes all rows and adds some seed URLs.
|
|
||||||
// populateDB(db)
|
|
||||||
|
|
||||||
defer func(db *sqlx.DB) {
|
defer func(db *sqlx.DB) {
|
||||||
err := db.Close()
|
err := db.Close()
|
||||||
|
|||||||
Reference in New Issue
Block a user