diff --git a/gemini/errors.go b/common/errors.go similarity index 99% rename from gemini/errors.go rename to common/errors.go index 5ae014c..243e2fa 100644 --- a/gemini/errors.go +++ b/common/errors.go @@ -1,4 +1,4 @@ -package gemini +package common import ( "errors" diff --git a/gemini/errors_test.go b/common/errors_test.go similarity index 96% rename from gemini/errors_test.go rename to common/errors_test.go index e9ba76f..c756cb9 100644 --- a/gemini/errors_test.go +++ b/common/errors_test.go @@ -1,4 +1,4 @@ -package gemini +package common import ( "errors" diff --git a/gemini/gemini_url.go b/common/gemini_url.go similarity index 99% rename from gemini/gemini_url.go rename to common/gemini_url.go index 0ab39d9..6dffb96 100644 --- a/gemini/gemini_url.go +++ b/common/gemini_url.go @@ -1,4 +1,4 @@ -package gemini +package common import ( "database/sql/driver" diff --git a/gemini/gemini_url_test.go b/common/gemini_url_test.go similarity index 99% rename from gemini/gemini_url_test.go rename to common/gemini_url_test.go index 36c4662..b45a55d 100644 --- a/gemini/gemini_url_test.go +++ b/common/gemini_url_test.go @@ -1,4 +1,4 @@ -package gemini +package common import ( "reflect" diff --git a/gemini/snapshot.go b/common/snapshot.go similarity index 84% rename from gemini/snapshot.go rename to common/snapshot.go index bbd1a75..810fb15 100644 --- a/gemini/snapshot.go +++ b/common/snapshot.go @@ -1,9 +1,10 @@ -package gemini +package common import ( "database/sql/driver" "encoding/json" "fmt" + "time" "github.com/guregu/null/v5" ) @@ -40,3 +41,16 @@ type Snapshot struct { ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code. Error null.String `db:"error" json:"error,omitempty"` // On network errors only } + +func SnapshotFromURL(u string) *Snapshot { + url, err := ParseURL(u, "") + if err != nil { + return nil + } + newSnapshot := Snapshot{ + URL: *url, + Host: url.Hostname, + Timestamp: null.TimeFrom(time.Now()), + } + return &newSnapshot +} diff --git a/gemini/db.go b/db/db.go similarity index 75% rename from gemini/db.go rename to db/db.go index d35a3f5..7885b4d 100644 --- a/gemini/db.go +++ b/db/db.go @@ -1,9 +1,10 @@ -package gemini +package db import ( "encoding/json" "errors" "fmt" + "gemini-grc/common" "os" "strconv" @@ -43,8 +44,8 @@ func ConnectToDB() *sqlx.DB { return db } -// isDeadlockError checks if the error is a PostgreSQL deadlock error -func isDeadlockError(err error) bool { +// IsDeadlockError checks if the error is a PostgreSQL deadlock error +func IsDeadlockError(err error) bool { var pqErr *pq.Error if errors.As(err, &pqErr) { return pqErr.Code == "40P01" // PostgreSQL deadlock error code @@ -52,16 +53,25 @@ func isDeadlockError(err error) bool { return false } -func GetSnapshotsToVisit(tx *sqlx.Tx) ([]Snapshot, error) { - var snapshots []Snapshot - err := tx.Select(&snapshots, SQL_SELECT_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS, config.CONFIG.WorkerBatchSize) +func GetURLsToVisit(tx *sqlx.Tx) ([]string, error) { + var urls []string + err := tx.Select(&urls, SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS, config.CONFIG.WorkerBatchSize) if err != nil { - return nil, fmt.Errorf("%w: %w", ErrDatabase, err) + return nil, fmt.Errorf("%w: %w", common.ErrDatabase, err) } - return snapshots, nil + return urls, nil } -func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error { +func InsertURL(tx *sqlx.Tx, url string) error { + query := SQL_INSERT_URL + _, err := tx.NamedExec(query, url) + if err != nil { + return fmt.Errorf("%w inserting URL: %w", common.ErrDatabase, err) + } + return nil +} + +func SaveSnapshotIfNew(tx *sqlx.Tx, s *common.Snapshot) error { if config.CONFIG.DryRun { marshalled, err := json.MarshalIndent(s, "", " ") if err != nil { @@ -78,7 +88,7 @@ func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error { return nil } -func UpsertSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) { +func OverwriteSnapshot(workedID int, tx *sqlx.Tx, s *common.Snapshot) (err error) { // if config.CONFIG.DryRun { //marshalled, err := json.MarshalIndent(s, "", " ") //if err != nil { @@ -90,19 +100,19 @@ func UpsertSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) { query := SQL_UPSERT_SNAPSHOT rows, err := tx.NamedQuery(query, s) if err != nil { - return fmt.Errorf("[%d] %w while upserting snapshot: %w", workedID, ErrDatabase, err) + return fmt.Errorf("[%d] %w while upserting snapshot: %w", workedID, common.ErrDatabase, err) } defer func() { _err := rows.Close() if _err != nil { - err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, ErrDatabase, _err) + err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, common.ErrDatabase, _err) } }() if rows.Next() { var returnedID int err = rows.Scan(&returnedID) if err != nil { - return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, ErrDatabase, err) + return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, common.ErrDatabase, err) } s.ID = returnedID // logging.LogDebug("[%d] Upserted snapshot with ID %d", workedID, returnedID) @@ -110,7 +120,7 @@ func UpsertSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) { return nil } -func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) { +func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *common.Snapshot) (err error) { // if config.CONFIG.DryRun { //marshalled, err := json.MarshalIndent(s, "", " ") //if err != nil { @@ -122,19 +132,19 @@ func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) { query := SQL_UPDATE_SNAPSHOT rows, err := tx.NamedQuery(query, s) if err != nil { - return fmt.Errorf("[%d] %w while updating snapshot: %w", workedID, ErrDatabase, err) + return fmt.Errorf("[%d] %w while updating snapshot: %w", workedID, common.ErrDatabase, err) } defer func() { _err := rows.Close() if _err != nil { - err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, ErrDatabase, _err) + err = fmt.Errorf("[%d] %w error closing rows: %w", workedID, common.ErrDatabase, _err) } }() if rows.Next() { var returnedID int err = rows.Scan(&returnedID) if err != nil { - return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, ErrDatabase, err) + return fmt.Errorf("[%d] %w error scanning returned id: %w", workedID, common.ErrDatabase, err) } s.ID = returnedID // logging.LogDebug("[%d] Updated snapshot with ID %d", workedID, returnedID) @@ -142,7 +152,7 @@ func UpdateSnapshot(workedID int, tx *sqlx.Tx, s *Snapshot) (err error) { return nil } -func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error { +func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*common.Snapshot) error { if config.CONFIG.DryRun { return nil } @@ -156,13 +166,13 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error { batch := snapshots[i:end] _, err := tx.NamedExec(query, batch) if err != nil { - return fmt.Errorf("%w: While saving links in batches: %w", ErrDatabase, err) + return fmt.Errorf("%w: While saving links in batches: %w", common.ErrDatabase, err) } } return nil } -func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error { +func SaveLinksToDB(tx *sqlx.Tx, snapshots []*common.Snapshot) error { if config.CONFIG.DryRun { return nil } diff --git a/gemini/db_queries.go b/db/db_queries.go similarity index 86% rename from gemini/db_queries.go rename to db/db_queries.go index 4ef07a5..bb2d238 100644 --- a/gemini/db_queries.go +++ b/db/db_queries.go @@ -1,4 +1,4 @@ -package gemini +package db const ( SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS = ` @@ -10,6 +10,16 @@ ORDER BY RANDOM() FOR UPDATE SKIP LOCKED LIMIT $1 ` + SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS = ` +SELECT url +FROM urls u +WHERE u.id IN ( + SELECT MIN(id) + FROM urls + GROUP BY host +) +LIMIT $1 +` SQL_SELECT_RANDOM_UNVISITED_SNAPSHOTS_UNIQUE_HOSTS = ` SELECT * FROM snapshots s @@ -75,4 +85,9 @@ error = :error WHERE id = :id RETURNING id ` + SQL_INSERT_URL = ` + INSERT INTO urls (url, host, timestamp) + VALUES (:url, :host, :timestamp) + ON CONFLICT (url) DO NOTHING + ` ) diff --git a/gemini/blacklist.go b/gemini/blacklist.go index 6b4148c..5fa9ad5 100644 --- a/gemini/blacklist.go +++ b/gemini/blacklist.go @@ -2,6 +2,7 @@ package gemini import ( "fmt" + "gemini-grc/common" "os" "strings" @@ -39,7 +40,11 @@ func LoadBlacklist() { } } -func IsBlacklisted(url URL) bool { +func IsBlacklisted(u string) bool { + url, err := common.ParseURL(u, "") + if err != nil { + return false + } hostWithPort := fmt.Sprintf("%s:%d", url.Hostname, url.Port) for _, v := range *Blacklist { if v == url.Hostname || v == hostWithPort { diff --git a/gemini/files.go b/gemini/files.go index 84b3012..27a9fc0 100644 --- a/gemini/files.go +++ b/gemini/files.go @@ -2,6 +2,7 @@ package gemini import ( "fmt" + "gemini-grc/common" "net/url" "os" "path" @@ -63,7 +64,7 @@ func calcFilePath(rootPath, urlPath string) (string, error) { return finalPath, nil } -func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) { +func SaveToFile(rootPath string, s *common.Snapshot, done chan struct{}) { parentPath := path.Join(rootPath, s.URL.Hostname) urlPath := s.URL.Path // If path is empty, add `index.gmi` as the file to save diff --git a/gemini/gemini.go b/gemini/gemini.go index ab10222..eceefe2 100644 --- a/gemini/gemini.go +++ b/gemini/gemini.go @@ -2,6 +2,7 @@ package gemini import ( "fmt" + "gemini-grc/common" "net/url" "regexp" "strconv" @@ -9,18 +10,18 @@ import ( "gemini-grc/logging" ) -func GetPageLinks(currentURL URL, gemtext string) LinkList { +func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList { // Grab link lines linkLines := ExtractLinkLines(gemtext) if len(linkLines) == 0 { return nil } - var linkURLs LinkList + var linkURLs common.LinkList // Normalize URLs in links, and store them in snapshot for _, line := range linkLines { linkURL, err := NormalizeLink(line, currentURL.String()) if err != nil { - logging.LogDebug("%s: %s", ErrGeminiLinkLineParse, err) + logging.LogDebug("%s: %s", common.ErrGeminiLinkLineParse, err) continue } linkURLs = append(linkURLs, *linkURL) @@ -42,11 +43,11 @@ func ExtractLinkLines(gemtext string) []string { // NormalizeLink takes a single link line and the current URL, // return the URL converted to an absolute URL // and its description. -func NormalizeLink(linkLine string, currentURL string) (*URL, error) { +func NormalizeLink(linkLine string, currentURL string) (*common.URL, error) { // Parse the current URL baseURL, err := url.Parse(currentURL) if err != nil { - return nil, fmt.Errorf("%w: %w", ErrURLParse, err) + return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) } // Regular expression to extract the URL part from a link line @@ -56,13 +57,13 @@ func NormalizeLink(linkLine string, currentURL string) (*URL, error) { matches := re.FindStringSubmatch(linkLine) if len(matches) == 0 { // If the line doesn't match the expected format, return it unchanged - return nil, fmt.Errorf("%w for link line %s", ErrGeminiLinkLineParse, linkLine) + return nil, fmt.Errorf("%w for link line %s", common.ErrGeminiLinkLineParse, linkLine) } originalURLStr := matches[1] _, err = url.QueryUnescape(originalURLStr) if err != nil { - return nil, fmt.Errorf("%w: %w", ErrURLDecode, err) + return nil, fmt.Errorf("%w: %w", common.ErrURLDecode, err) } restOfLine := "" @@ -74,7 +75,7 @@ func NormalizeLink(linkLine string, currentURL string) (*URL, error) { parsedURL, err := url.Parse(originalURLStr) if err != nil { // If URL parsing fails, return an error - return nil, fmt.Errorf("%w: %w", ErrURLParse, err) + return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) } // Resolve relative URLs against the base URL @@ -89,10 +90,10 @@ func NormalizeLink(linkLine string, currentURL string) (*URL, error) { restOfLine = restOfLine[1:] } - finalURL, err := ParseURL(parsedURL.String(), restOfLine) + finalURL, err := common.ParseURL(parsedURL.String(), restOfLine) if err != nil { // If URL parsing fails, return an error - return nil, fmt.Errorf("%w: %w", ErrURLParse, err) + return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) } return finalURL, nil @@ -107,13 +108,13 @@ func ParseFirstTwoDigits(input string) (int, error) { // Find the first match in the string matches := re.FindStringSubmatch(input) if len(matches) == 0 { - return 0, fmt.Errorf("%w", ErrGeminiResponseHeader) + return 0, fmt.Errorf("%w", common.ErrGeminiResponseHeader) } // Parse the captured match as an integer snapshot, err := strconv.Atoi(matches[1]) if err != nil { - return 0, fmt.Errorf("%w: %w", ErrTextParse, err) + return 0, fmt.Errorf("%w: %w", common.ErrTextParse, err) } return snapshot, nil @@ -121,7 +122,7 @@ func ParseFirstTwoDigits(input string) (int, error) { // extractRedirectTarget returns the redirection // URL by parsing the header (or error message) -func extractRedirectTarget(currentURL URL, input string) (*URL, error) { +func extractRedirectTarget(currentURL common.URL, input string) (*common.URL, error) { // \d+ - matches one or more digits // \s+ - matches one or more whitespace // ([^\r]+) - captures everything until it hits a \r (or end of string) @@ -129,11 +130,11 @@ func extractRedirectTarget(currentURL URL, input string) (*URL, error) { re := regexp.MustCompile(pattern) matches := re.FindStringSubmatch(input) if len(matches) < 2 { - return nil, fmt.Errorf("%w: %s", ErrGeminiRedirect, input) + return nil, fmt.Errorf("%w: %s", common.ErrGeminiRedirect, input) } - newURL, err := DeriveAbsoluteURL(currentURL, matches[1]) + newURL, err := common.DeriveAbsoluteURL(currentURL, matches[1]) if err != nil { - return nil, fmt.Errorf("%w: %w: %s", ErrGeminiRedirect, err, input) + return nil, fmt.Errorf("%w: %w: %s", common.ErrGeminiRedirect, err, input) } return newURL, nil } diff --git a/gemini/gemini_test.go b/gemini/gemini_test.go index df4a6c5..f6c7b0b 100644 --- a/gemini/gemini_test.go +++ b/gemini/gemini_test.go @@ -1,10 +1,13 @@ package gemini -import "testing" +import ( + "gemini-grc/common" + "testing" +) func TestExtractRedirectTargetFullURL(t *testing.T) { t.Parallel() - currentURL, _ := ParseURL("gemini://smol.gr", "") + currentURL, _ := common.ParseURL("gemini://smol.gr", "") input := "redirect: 31 gemini://target.gr" result, err := extractRedirectTarget(*currentURL, input) expected := "gemini://target.gr:1965" @@ -15,7 +18,7 @@ func TestExtractRedirectTargetFullURL(t *testing.T) { func TestExtractRedirectTargetFullURLSlash(t *testing.T) { t.Parallel() - currentURL, _ := ParseURL("gemini://smol.gr", "") + currentURL, _ := common.ParseURL("gemini://smol.gr", "") input := "redirect: 31 gemini://target.gr/" result, err := extractRedirectTarget(*currentURL, input) expected := "gemini://target.gr:1965/" @@ -26,7 +29,7 @@ func TestExtractRedirectTargetFullURLSlash(t *testing.T) { func TestExtractRedirectTargetRelativeURL(t *testing.T) { t.Parallel() - currentURL, _ := ParseURL("gemini://smol.gr", "") + currentURL, _ := common.ParseURL("gemini://smol.gr", "") input := "redirect: 31 /a/b" result, err := extractRedirectTarget(*currentURL, input) if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") { @@ -36,7 +39,7 @@ func TestExtractRedirectTargetRelativeURL(t *testing.T) { func TestExtractRedirectTargetRelativeURL2(t *testing.T) { t.Parallel() - currentURL, _ := ParseURL("gemini://nox.im:1965", "") + currentURL, _ := common.ParseURL("gemini://nox.im:1965", "") input := "redirect: 31 ./" result, err := extractRedirectTarget(*currentURL, input) if err != nil || (result.String() != "gemini://nox.im:1965/") { @@ -46,7 +49,7 @@ func TestExtractRedirectTargetRelativeURL2(t *testing.T) { func TestExtractRedirectTargetRelativeURL3(t *testing.T) { t.Parallel() - currentURL, _ := ParseURL("gemini://status.zvava.org:1965", "") + currentURL, _ := common.ParseURL("gemini://status.zvava.org:1965", "") input := "redirect: 31 index.gmi" result, err := extractRedirectTarget(*currentURL, input) if err != nil || (result.String() != "gemini://status.zvava.org:1965/index.gmi") { @@ -56,7 +59,7 @@ func TestExtractRedirectTargetRelativeURL3(t *testing.T) { func TestExtractRedirectTargetWrong(t *testing.T) { t.Parallel() - currentURL, _ := ParseURL("gemini://smol.gr", "") + currentURL, _ := common.ParseURL("gemini://smol.gr", "") input := "redirect: 31" result, err := extractRedirectTarget(*currentURL, input) if result != nil || err == nil { diff --git a/gemini/network.go b/gemini/network.go index 74fde37..841d26c 100644 --- a/gemini/network.go +++ b/gemini/network.go @@ -4,6 +4,7 @@ import ( "crypto/tls" "errors" "fmt" + "gemini-grc/common" "io" "net" gourl "net/url" @@ -35,7 +36,7 @@ type PageData struct { func getHostIPAddresses(hostname string) ([]string, error) { addrs, err := net.LookupHost(hostname) if err != nil { - return nil, fmt.Errorf("%w:%w", ErrNetworkDNS, err) + return nil, fmt.Errorf("%w:%w", common.ErrNetworkDNS, err) } IPPool.Lock.RLock() defer func() { @@ -47,7 +48,7 @@ func getHostIPAddresses(hostname string) ([]string, error) { func ConnectAndGetData(url string) ([]byte, error) { parsedURL, err := gourl.Parse(url) if err != nil { - return nil, fmt.Errorf("%w: %w", ErrURLParse, err) + return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) } hostname := parsedURL.Hostname() port := parsedURL.Port() @@ -61,7 +62,7 @@ func ConnectAndGetData(url string) ([]byte, error) { } conn, err := dialer.Dial("tcp", host) if err != nil { - return nil, fmt.Errorf("%w: %w", ErrNetwork, err) + return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err) } // Make sure we always close the connection. defer func() { @@ -73,11 +74,11 @@ func ConnectAndGetData(url string) ([]byte, error) { // Set read and write timeouts on the TCP connection. err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) if err != nil { - return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err) + return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err) } err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second)) if err != nil { - return nil, fmt.Errorf("%w: %w", ErrNetworkSetConnectionDeadline, err) + return nil, fmt.Errorf("%w: %w", common.ErrNetworkSetConnectionDeadline, err) } // Perform the TLS handshake @@ -88,7 +89,7 @@ func ConnectAndGetData(url string) ([]byte, error) { } tlsConn := tls.Client(conn, tlsConfig) if err := tlsConn.Handshake(); err != nil { - return nil, fmt.Errorf("%w: %w", ErrNetworkTLS, err) + return nil, fmt.Errorf("%w: %w", common.ErrNetworkTLS, err) } // We read `buf`-sized chunks and add data to `data`. @@ -99,10 +100,10 @@ func ConnectAndGetData(url string) ([]byte, error) { // Fix for stupid server bug: // Some servers return 'Header: 53 No proxying to other hosts or ports!' // when the port is 1965 and is still specified explicitly in the URL. - _url, _ := ParseURL(url, "") + _url, _ := common.ParseURL(url, "") _, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort()))) if err != nil { - return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err) + return nil, fmt.Errorf("%w: %w", common.ErrNetworkCannotWrite, err) } // Read response bytes in len(buf) byte chunks for { @@ -111,13 +112,13 @@ func ConnectAndGetData(url string) ([]byte, error) { data = append(data, buf[:n]...) } if len(data) > config.CONFIG.MaxResponseSize { - return nil, fmt.Errorf("%w: %v", ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize) + return nil, fmt.Errorf("%w: %v", common.ErrNetworkResponseSizeExceededMax, config.CONFIG.MaxResponseSize) } if err != nil { if errors.Is(err, io.EOF) { break } - return nil, fmt.Errorf("%w: %w", ErrNetwork, err) + return nil, fmt.Errorf("%w: %w", common.ErrNetwork, err) } } return data, nil @@ -127,16 +128,16 @@ func ConnectAndGetData(url string) ([]byte, error) { // Mutates given Snapshot with the data. // In case of error, we store the error string // inside snapshot and return the error. -func Visit(s *Snapshot) (err error) { +func Visit(s *common.Snapshot) (err error) { // Don't forget to also store error // response code (if we have one) // and header defer func() { if err != nil { s.Error = null.StringFrom(err.Error()) - if errors.As(err, new(*GeminiError)) { - s.Header = null.StringFrom(err.(*GeminiError).Header) - s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code)) + if errors.As(err, new(*common.GeminiError)) { + s.Header = null.StringFrom(err.(*common.GeminiError).Header) + s.ResponseCode = null.IntFrom(int64(err.(*common.GeminiError).Code)) } } }() @@ -174,7 +175,7 @@ func processData(data []byte) (*PageData, error) { code, mimeType, lang := getMimeTypeAndLang(header) logging.LogDebug("Header: %s", strings.TrimSpace(header)) if code != 20 { - return nil, NewErrGeminiStatusCode(code, header) + return nil, common.NewErrGeminiStatusCode(code, header) } pageData := PageData{ @@ -188,7 +189,7 @@ func processData(data []byte) (*PageData, error) { if mimeType == "text/gemini" { validBody, err := BytesToValidUTF8(body) if err != nil { - return nil, fmt.Errorf("%w: %w", ErrUTF8Parse, err) + return nil, fmt.Errorf("%w: %w", common.ErrUTF8Parse, err) } pageData.GemText = validBody } else { @@ -204,7 +205,7 @@ func processData(data []byte) (*PageData, error) { func getHeadersAndData(data []byte) (string, []byte, error) { firstLineEnds := slices.Index(data, '\n') if firstLineEnds == -1 { - return "", nil, ErrGeminiResponseHeader + return "", nil, common.ErrGeminiResponseHeader } firstLine := string(data[:firstLineEnds]) rest := data[firstLineEnds+1:] diff --git a/gemini/robotmatch.go b/gemini/robotmatch.go index b786204..52967f4 100644 --- a/gemini/robotmatch.go +++ b/gemini/robotmatch.go @@ -2,6 +2,7 @@ package gemini import ( "fmt" + "gemini-grc/common" "strings" "sync" @@ -56,7 +57,11 @@ func populateBlacklist(key string) (entries []string) { // RobotMatch checks if the snapshot URL matches // a robots.txt allow rule. -func RobotMatch(url URL) bool { +func RobotMatch(u string) bool { + url, err := common.ParseURL(u, "") + if err != nil { + return false + } key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port)) logging.LogDebug("Checking robots.txt cache for %s", key) var disallowedURLs []string diff --git a/gemini/worker.go b/gemini/worker.go index 626d5c9..8c84997 100644 --- a/gemini/worker.go +++ b/gemini/worker.go @@ -3,6 +3,8 @@ package gemini import ( "errors" "fmt" + "gemini-grc/common" + _db "gemini-grc/db" "strings" "time" @@ -12,54 +14,6 @@ import ( "github.com/jmoiron/sqlx" ) -type WorkerStatus struct { - id int - status string -} - -func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) { - // Create a slice to store current status of each worker - statuses := make([]string, totalWorkers) - - // Initialize empty statuses - for i := range statuses { - statuses[i] = "" - } - - // Initial print - var output strings.Builder - // \033[H moves the cursor to the top left corner of the screen - // (ie, the first column of the first row in the screen). - // \033[J clears the part of the screen from the cursor to the end of the screen. - output.WriteString("\033[H\033[J") // Clear screen and move cursor to top - for i := range statuses { - output.WriteString(fmt.Sprintf("[%2d] \n", i)) - } - fmt.Print(output.String()) - - // Continuously receive status updates - for update := range statusChan { - if update.id >= totalWorkers { - continue - } - - // Update the status - statuses[update.id] = update.status - - // Build the complete output string - output.Reset() - output.WriteString("\033[H\033[J") // Clear screen and move cursor to top - for i, status := range statuses { - output.WriteString(fmt.Sprintf("[%2d] %.100s\n", i, status)) - } - - // Print the entire status - fmt.Print(output.String()) - } -} - -var statusChan chan WorkerStatus - func SpawnWorkers(numOfWorkers int, db *sqlx.DB) { logging.LogInfo("Spawning %d workers", numOfWorkers) statusChan = make(chan WorkerStatus, numOfWorkers) @@ -97,7 +51,7 @@ func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) { // On deadlock errors, rollback and return, otherwise panic. if err != nil { logging.LogError("[%d] Failed to commit transaction: %w", workerID, err) - if isDeadlockError(err) { + if _db.IsDeadlockError(err) { logging.LogError("[%d] Deadlock detected. Rolling back", workerID) time.Sleep(time.Duration(10) * time.Second) err := tx.Rollback() @@ -112,78 +66,72 @@ func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) { } func runWorker(workerID int, tx *sqlx.Tx, url *string) { - var snapshots []Snapshot + var urls []string var err error // If not given a specific URL, - // get some random ones to visit from DB. + // get some random ones to visit from db. if url == nil { statusChan <- WorkerStatus{ id: workerID, - status: "Getting snapshots", + status: "Getting URLs", } - snapshots, err = GetSnapshotsToVisit(tx) + urls, err = _db.GetURLsToVisit(tx) if err != nil { logging.LogError("[%d] GeminiError retrieving snapshot: %w", workerID, err) panic("This should never happen") - } else if len(snapshots) == 0 { - logging.LogInfo("[%d] No snapshots to visit.", workerID) + } else if len(urls) == 0 { + logging.LogInfo("[%d] No URLs to visit.", workerID) time.Sleep(1 * time.Minute) return } } else { - snapshotURL, err := ParseURL(*url, "") + geminiURL, err := common.ParseURL(*url, "") if err != nil { logging.LogError("Invalid URL given: %s", *url) return } - snapshots = []Snapshot{{ - // UID: uid.UID(), - URL: *snapshotURL, - Host: snapshotURL.Hostname, - Timestamp: null.TimeFrom(time.Now()), - }} + urls = []string{geminiURL.String()} } - total := len(snapshots) - for i, s := range snapshots { - logging.LogDebug("[%d] Snapshot %d/%d: %s", workerID, i+1, total, s.URL.String()) - } // Start visiting URLs. - for i, s := range snapshots { - logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, s.URL.String()) + total := len(urls) + for i, u := range urls { + logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, u) // We differentiate between errors: // Unexpected errors are the ones returned from the following function. // If an error is unexpected (which should never happen) we panic. - // Expected errors are stored as strings within the snapshot, - // so that they can also be stored in DB. - err := workOnSnapshot(workerID, tx, &s) + // Expected errors are stored as strings within the snapshot. + err := workOnUrl(workerID, tx, u) if err != nil { - logging.LogError("[%d] [%s] Unexpected GeminiError %w", workerID, s.URL.String(), err) + logging.LogError("[%d] Unexpected GeminiError %w while visiting %s", workerID, err, u) util.PrintStackAndPanic(err) } - if s.Error.Valid { - logging.LogDebug("[%d] Error: %v", workerID, s.Error.String) - } logging.LogDebug("[%d] Done %d/%d.", workerID, i+1, total) } } -// workOnSnapshot visits a URL and stores the result. +// workOnUrl visits a URL and stores the result. // unexpected errors are returned. // expected errors are stored within the snapshot. -func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) { - if IsBlacklisted(s.URL) { - logging.LogDebug("[%d] URL matches Blacklist, ignoring %s", workerID, s.URL.String()) +func workOnUrl(workerID int, tx *sqlx.Tx, url string) (err error) { + if url == "" { + return fmt.Errorf("nil URL given") + } + + if IsBlacklisted(url) { + logging.LogDebug("[%d] URL matches Blacklist, ignoring %s", workerID, url) return nil } + s := common.SnapshotFromURL(url) + // If URL matches a robots.txt disallow line, // add it as an error so next time it won't be // crawled. - if RobotMatch(s.URL) { - s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error()) - err = UpsertSnapshot(workerID, tx, s) + if RobotMatch(url) { + s.Error = null.StringFrom(common.ErrGeminiRobotsDisallowed.Error()) + err = _db.OverwriteSnapshot(workerID, tx, s) if err != nil { return fmt.Errorf("[%d] %w", workerID, err) } @@ -191,10 +139,14 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) { } // Resolve IP address via DNS + statusChan <- WorkerStatus{ + id: workerID, + status: fmt.Sprintf("Resolving %s", url), + } IPs, err := getHostIPAddresses(s.Host) if err != nil { s.Error = null.StringFrom(err.Error()) - err = UpsertSnapshot(workerID, tx, s) + err = _db.OverwriteSnapshot(workerID, tx, s) if err != nil { return fmt.Errorf("[%d] %w", workerID, err) } @@ -209,7 +161,7 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) { id: workerID, status: fmt.Sprintf("Waiting to grab lock for host %s", s.Host), } - time.Sleep(1 * time.Second) // Avoid flood-retrying + time.Sleep(2 * time.Second) // Avoid flood-retrying count++ if count == 3 { return @@ -219,6 +171,10 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) { } } + statusChan <- WorkerStatus{ + id: workerID, + status: fmt.Sprintf("Adding to pool %s", url), + } AddIPsToPool(IPs) // After finishing, remove the host IPs from // the connections pool, with a small delay @@ -226,28 +182,32 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) { defer func() { go func() { time.Sleep(1 * time.Second) + statusChan <- WorkerStatus{ + id: workerID, + status: fmt.Sprintf("Removing from pool %s", url), + } RemoveIPsFromPool(IPs) }() }() statusChan <- WorkerStatus{ id: workerID, - status: fmt.Sprintf("Visiting %s", s.URL.String()), + status: fmt.Sprintf("Visiting %s", url), } err = Visit(s) if err != nil { - if !IsKnownError(err) { - logging.LogError("[%d] Unknown error visiting %s: %w", workerID, s.URL.String(), err) + if !common.IsKnownError(err) { + logging.LogError("[%d] Unknown error visiting %s: %w", workerID, url, err) return err } s.Error = null.StringFrom(err.Error()) // Check if error is redirection, and handle it - if errors.As(err, new(*GeminiError)) && - err.(*GeminiError).Msg == "redirect" { + if errors.As(err, new(*common.GeminiError)) && + err.(*common.GeminiError).Msg == "redirect" { err = handleRedirection(workerID, tx, s) if err != nil { - if IsKnownError(err) { + if common.IsKnownError(err) { s.Error = null.StringFrom(err.Error()) } else { return err @@ -270,7 +230,7 @@ func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) { logging.LogDebug("[%d] Not text/gemini, so not looking for page links", workerID) } - err = UpsertSnapshot(workerID, tx, s) + err = _db.OverwriteSnapshot(workerID, tx, s) logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String()) if err != nil { return err @@ -294,12 +254,12 @@ func isAnotherWorkerVisitingHost(workerID int, IPs []string) bool { return false } -func storeLinks(tx *sqlx.Tx, s *Snapshot) error { +func storeLinks(tx *sqlx.Tx, s *common.Snapshot) error { if s.Links.Valid { - var batchSnapshots []*Snapshot + var batchSnapshots []*common.Snapshot for _, link := range s.Links.ValueOrZero() { if shouldPersistURL(&link) { - newSnapshot := &Snapshot{ + newSnapshot := &common.Snapshot{ URL: link, Host: link.Hostname, Timestamp: null.TimeFrom(time.Now()), @@ -309,7 +269,7 @@ func storeLinks(tx *sqlx.Tx, s *Snapshot) error { } if len(batchSnapshots) > 0 { - err := SaveLinksToDBinBatches(tx, batchSnapshots) + err := _db.SaveLinksToDBinBatches(tx, batchSnapshots) if err != nil { return err } @@ -319,17 +279,33 @@ func storeLinks(tx *sqlx.Tx, s *Snapshot) error { } // shouldPersistURL returns true if we -// should save the URL in the DB. +// should save the URL in the _db. // Only gemini:// urls are saved. -func shouldPersistURL(u *URL) bool { +func shouldPersistURL(u *common.URL) bool { return strings.HasPrefix(u.String(), "gemini://") } +func haveWeVisitedURL(tx *sqlx.Tx, u *common.URL) (bool, error) { + var result bool + err := tx.Select(&result, `SELECT TRUE FROM urls WHERE url=$1`, u.String()) + if err != nil { + return false, fmt.Errorf("%w: %w", common.ErrDatabase, err) + } + if result { + return result, nil + } + err = tx.Select(&result, `SELECT TRUE FROM snapshots WHERE snapshot.url=$1`, u.String()) + if err != nil { + return false, fmt.Errorf("%w: %w", common.ErrDatabase, err) + } + return result, nil +} + // handleRedirection saves redirect URL as new snapshot -func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error { +func handleRedirection(workerID int, tx *sqlx.Tx, s *common.Snapshot) error { newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero()) if err != nil { - if errors.Is(err, ErrGeminiRedirect) { + if errors.Is(err, common.ErrGeminiRedirect) { logging.LogDebug("[%d] %s", workerID, err) } return err @@ -337,14 +313,14 @@ func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error { logging.LogDebug("[%d] Page redirects to %s", workerID, newURL) // Insert fresh snapshot with new URL if shouldPersistURL(newURL) { - snapshot := &Snapshot{ + snapshot := &common.Snapshot{ // UID: uid.UID(), URL: *newURL, Host: newURL.Hostname, Timestamp: null.TimeFrom(time.Now()), } logging.LogDebug("[%d] Saving redirection URL %s", workerID, snapshot.URL.String()) - err = SaveSnapshotIfNew(tx, snapshot) + err = _db.SaveSnapshotIfNew(tx, snapshot) if err != nil { return err } @@ -352,14 +328,14 @@ func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error { return nil } -func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) { +func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]common.Snapshot, error) { query := ` SELECT * FROM snapshots WHERE url=$1 LIMIT 1 ` - var snapshots []Snapshot + var snapshots []common.Snapshot err := tx.Select(&snapshots, query, url) if err != nil { return nil, err diff --git a/gemini/workerStatus.go b/gemini/workerStatus.go new file mode 100644 index 0000000..d77b618 --- /dev/null +++ b/gemini/workerStatus.go @@ -0,0 +1,54 @@ +package gemini + +import ( + "fmt" + "strings" +) + +type WorkerStatus struct { + id int + status string +} + +var statusChan chan WorkerStatus + +func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) { + // Create a slice to store current status of each worker + statuses := make([]string, totalWorkers) + + // Initialize empty statuses + for i := range statuses { + statuses[i] = "" + } + + // Initial print + var output strings.Builder + // \033[H moves the cursor to the top left corner of the screen + // (ie, the first column of the first row in the screen). + // \033[J clears the part of the screen from the cursor to the end of the screen. + output.WriteString("\033[H\033[J") // Clear screen and move cursor to top + for i := range statuses { + output.WriteString(fmt.Sprintf("[%2d] \n", i)) + } + fmt.Print(output.String()) + + // Continuously receive status updates + for update := range statusChan { + if update.id >= totalWorkers { + continue + } + + // Update the status + statuses[update.id] = update.status + + // Build the complete output string + output.Reset() + output.WriteString("\033[H\033[J") // Clear screen and move cursor to top + for i, status := range statuses { + output.WriteString(fmt.Sprintf("[%2d] %.100s\n", i, status)) + } + + // Print the entire status + fmt.Print(output.String()) + } +} diff --git a/main.go b/main.go index 2bbaab9..c9e1b28 100644 --- a/main.go +++ b/main.go @@ -1,6 +1,7 @@ package main import ( + main2 "gemini-grc/db" "os" "os/signal" "syscall" @@ -29,11 +30,7 @@ func runApp() error { signals := make(chan os.Signal, 1) signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM) - db := gemini.ConnectToDB() - - // !!! DANGER !!! - // Removes all rows and adds some seed URLs. - // populateDB(db) + db := main2.ConnectToDB() defer func(db *sqlx.DB) { err := db.Close()