diff --git a/.gitignore b/.gitignore index 160e6bc..21c4e54 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ .idea +.goroot **/.#* **/*~ +/.go /cmd /db/initdb.sql /db/*sh diff --git a/gemini/errors.go b/gemini/errors.go index 310377d..e75e40b 100644 --- a/gemini/errors.go +++ b/gemini/errors.go @@ -5,13 +5,13 @@ import ( "fmt" ) -type ErrGeminiStatusCode struct { +type GeminiError struct { Msg string Code int Header string } -func (e *ErrGeminiStatusCode) Error() string { +func (e *GeminiError) Error() string { return fmt.Sprintf("%s: %s", e.Msg, e.Header) } @@ -31,7 +31,7 @@ func NewErrGeminiStatusCode(code int, header string) error { default: msg = "unexpected status code" } - return &ErrGeminiStatusCode{ + return &GeminiError{ Msg: msg, Code: code, Header: header, @@ -39,7 +39,6 @@ func NewErrGeminiStatusCode(code int, header string) error { } var ( - ErrGemini = errors.New("gemini general error") ErrGeminiRobotsParse = errors.New("gemini robots.txt parse error") ErrGeminiRobotsDisallowed = errors.New("gemini robots.txt disallowed") ErrGeminiResponseHeader = errors.New("gemini response header error") @@ -64,8 +63,10 @@ var ( // we would lose ability to check wrapped // errors via errors.Is(). -var KnownErrors = []error{ - ErrGemini, +var errGemini *GeminiError + +var knownErrors = []error{ //nolint:gochecknoglobals + errGemini, ErrGeminiLinkLineParse, ErrGeminiRobotsParse, ErrGeminiRobotsDisallowed, @@ -87,14 +88,14 @@ var KnownErrors = []error{ } func IsKnownError(err error) bool { - var errGeminiStatusCode *ErrGeminiStatusCode - if errors.As(err, &errGeminiStatusCode) { - return true - } - for _, known := range KnownErrors { + for _, known := range knownErrors { if errors.Is(err, known) { return true } } + // Check for wrapped errors as well + if errors.As(err, new(*GeminiError)) { + return true + } return false } diff --git a/gemini/errors_test.go b/gemini/errors_test.go new file mode 100644 index 0000000..e9ba76f --- /dev/null +++ b/gemini/errors_test.go @@ -0,0 +1,24 @@ +package gemini + +import ( + "errors" + "fmt" + "testing" +) + +func TestErrGemini(t *testing.T) { + t.Parallel() + err := NewErrGeminiStatusCode(50, "50 server error") + if !errors.As(err, new(*GeminiError)) { + t.Errorf("TestErrGemini fail") + } +} + +func TestErrGeminiWrapped(t *testing.T) { + t.Parallel() + err := NewErrGeminiStatusCode(50, "50 server error") + errWrapped := fmt.Errorf("%w wrapped", err) + if !errors.As(errWrapped, new(*GeminiError)) { + t.Errorf("TestErrGeminiWrapped fail") + } +} diff --git a/gemini/files.go b/gemini/files.go index 1c685eb..84b3012 100644 --- a/gemini/files.go +++ b/gemini/files.go @@ -78,7 +78,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) { finalPath, err := calcFilePath(parentPath, urlPath) if err != nil { - logging.LogError("Error saving %s: %w", s.URL, err) + logging.LogError("GeminiError saving %s: %w", s.URL, err) return } // Ensure the directory exists @@ -93,7 +93,7 @@ func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) { err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0o666) } if err != nil { - logging.LogError("Error saving %s: %w", s.URL.Full, err) + logging.LogError("GeminiError saving %s: %w", s.URL.Full, err) } close(done) } diff --git a/gemini/gemini.go b/gemini/gemini.go index 57ec82d..998ab36 100644 --- a/gemini/gemini.go +++ b/gemini/gemini.go @@ -9,14 +9,16 @@ import ( "gemini-grc/logging" ) -func ProcessGemini(snapshot *Snapshot) *Snapshot { +func GetPageLinks(currentURL URL, gemtext string) LinkList { // Grab link lines - linkLines := ExtractLinkLines(snapshot.GemText.String) - logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines)) - + linkLines := ExtractLinkLines(gemtext) + if len(linkLines) == 0 { + return nil + } + var linkURLs LinkList // Normalize URLs in links, and store them in snapshot for _, line := range linkLines { - normalizedLink, descr, err := NormalizeLink(line, snapshot.URL.String()) + normalizedLink, descr, err := NormalizeLink(line, currentURL.String()) if err != nil { logging.LogDebug("Cannot normalize URL in line '%s': %v", line, err) continue @@ -26,13 +28,10 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot { logging.LogDebug("Cannot parse URL in link '%s': %v", line, err) continue } - if snapshot.Links == nil { - snapshot.Links = &LinkList{*geminiUrl} - } else { - *snapshot.Links = append(*snapshot.Links, *geminiUrl) - } + logging.LogDebug(geminiUrl.String()) + linkURLs = append(linkURLs, *geminiUrl) } - return snapshot + return linkURLs } // ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines @@ -124,3 +123,22 @@ func ParseFirstTwoDigits(input string) (int, error) { return snapshot, nil } + +// extractRedirectTarget returns the redirection +// URL by parsing the header (or error message) +func extractRedirectTarget(currentURL URL, input string) (*URL, error) { + // \d+ - matches one or more digits + // \s+ - matches one or more whitespace + // ([^\r]+) - captures everything until it hits a \r (or end of string) + pattern := `\d+\s+([^\r]+)` + re := regexp.MustCompile(pattern) + matches := re.FindStringSubmatch(input) + if len(matches) < 2 { + return nil, fmt.Errorf("%w: Cannot find redirect target from header %s", ErrGeminiResponseHeader, input) + } + newURL, err := DeriveAbsoluteURL(currentURL, matches[1]) + if err != nil { + return nil, fmt.Errorf("%w: Cannot find redirect target from header: %w", ErrGeminiResponseHeader, err) + } + return newURL, nil +} diff --git a/gemini/gemini_test.go b/gemini/gemini_test.go new file mode 100644 index 0000000..66c1d26 --- /dev/null +++ b/gemini/gemini_test.go @@ -0,0 +1,47 @@ +package gemini + +import ( + "fmt" + "testing" +) + +func TestExtractRedirectTargetFullURL(t *testing.T) { + t.Parallel() + currentURL, _ := ParseURL("gemini://smol.gr", "") + input := "redirect: 31 gemini://target.gr" + result, err := extractRedirectTarget(*currentURL, input) + if err != nil || (result.String() != "gemini://target.gr:1965") { + t.Errorf("fail: %s", result) + } +} + +func TestExtractRedirectTargetRelativeURL(t *testing.T) { + t.Parallel() + currentURL, _ := ParseURL("gemini://smol.gr", "") + input := "redirect: 31 /a/b" + result, err := extractRedirectTarget(*currentURL, input) + if err != nil || (result.String() != "gemini://smol.gr:1965/a/b") { + t.Errorf("fail: %s", result) + } +} + +func TestExtractRedirectTargetRelativeURL2(t *testing.T) { + t.Parallel() + currentURL, _ := ParseURL("gemini://nox.im:1965", "") + input := "redirect: 31 ./" + result, err := extractRedirectTarget(*currentURL, input) + if err != nil || (result.String() != "gemini://nox.im:1965/") { + t.Errorf("fail: %s", result) + } +} + +func TestExtractRedirectTargetWrong(t *testing.T) { + t.Parallel() + currentURL, _ := ParseURL("gemini://smol.gr", "") + input := "redirect: 31 fsdsdf" + result, err := extractRedirectTarget(*currentURL, input) + fmt.Println(err) + if result != nil || err == nil { + t.Errorf("fail: result should be nil, err is %s", err) + } +} diff --git a/gemini/gemini_url.go b/gemini/gemini_url.go index 5157ffb..ebdf5a7 100644 --- a/gemini/gemini_url.go +++ b/gemini/gemini_url.go @@ -3,8 +3,11 @@ package gemini import ( "database/sql/driver" "fmt" + "gemini-grc/logging" "net/url" + "path" "strconv" + "strings" ) type URL struct { @@ -38,7 +41,13 @@ func (u URL) String() string { return u.Full } -// Value implements the driver.Valuer interface +func (u URL) StringNoDefaultPort() string { + if u.Port == 1965 { + return fmt.Sprintf("%s://%s%s", u.Protocol, u.Hostname, u.Path) + } + return u.Full +} + func (u URL) Value() (driver.Value, error) { if u.Full == "" { return nil, nil @@ -65,3 +74,24 @@ func ParseURL(input string, descr string) (*URL, error) { full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, path) return &URL{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: full}, nil } + +func DeriveAbsoluteURL(currentURL URL, input string) (*URL, error) { + logging.LogDebug("Calculating redirect URL. Current %s header string %s", currentURL, input) + // If URL is absolute, return just it + if strings.Contains(input, "://") { + return ParseURL(input, "") + } + // input is a path. Clean it and construct + // new path + var newPath string + // Handle weird cases found in the wild + if strings.HasPrefix(input, "/") { + newPath = path.Clean(input) + } else if input == "./" || input == "." { + newPath = path.Join(currentURL.Path, "/") + } else { + newPath = path.Join(currentURL.Path, path.Clean(input)) + } + strURL := fmt.Sprintf("%s://%s:%d%s", currentURL.Protocol, currentURL.Hostname, currentURL.Port, newPath) + return ParseURL(strURL, "") +} diff --git a/gemini/gemini_url_test.go b/gemini/gemini_url_test.go new file mode 100644 index 0000000..a9fd24d --- /dev/null +++ b/gemini/gemini_url_test.go @@ -0,0 +1,103 @@ +package gemini + +import ( + "reflect" + "testing" +) + +func TestParseURL(t *testing.T) { + t.Parallel() + input := "gemini://caolan.uk/cgi-bin/weather.py/wxfcs/3162" + parsed, err := ParseURL(input, "") + value, _ := parsed.Value() + if err != nil || !(value == "gemini://caolan.uk:1965/cgi-bin/weather.py/wxfcs/3162") { + t.Errorf("fail: %s", parsed) + } +} + +func TestDeriveAbsoluteURL_abs_url_input(t *testing.T) { + t.Parallel() + currentURL := URL{ + Protocol: "gemini", + Hostname: "smol.gr", + Port: 1965, + Path: "/a/b", + Descr: "Nothing", + Full: "gemini://smol.gr:1965/a/b", + } + input := "gemini://a.b/c" + output, err := DeriveAbsoluteURL(currentURL, input) + if err != nil { + t.Errorf("fail: %v", err) + } + expected := &URL{ + Protocol: "gemini", + Hostname: "a.b", + Port: 1965, + Path: "/c", + Descr: "", + Full: "gemini://a.b:1965/c", + } + pass := reflect.DeepEqual(output, expected) + if !pass { + t.Errorf("fail: %#v != %#v", output, expected) + } +} + +func TestDeriveAbsoluteURL_abs_path_input(t *testing.T) { + t.Parallel() + currentURL := URL{ + Protocol: "gemini", + Hostname: "smol.gr", + Port: 1965, + Path: "/a/b", + Descr: "Nothing", + Full: "gemini://smol.gr:1965/a/b", + } + input := "/c" + output, err := DeriveAbsoluteURL(currentURL, input) + if err != nil { + t.Errorf("fail: %v", err) + } + expected := &URL{ + Protocol: "gemini", + Hostname: "smol.gr", + Port: 1965, + Path: "/c", + Descr: "", + Full: "gemini://smol.gr:1965/c", + } + pass := reflect.DeepEqual(output, expected) + if !pass { + t.Errorf("fail: %#v != %#v", output, expected) + } +} + +func TestDeriveAbsoluteURL_rel_path_input(t *testing.T) { + t.Parallel() + currentURL := URL{ + Protocol: "gemini", + Hostname: "smol.gr", + Port: 1965, + Path: "/a/b", + Descr: "Nothing", + Full: "gemini://smol.gr:1965/a/b", + } + input := "c/d" + output, err := DeriveAbsoluteURL(currentURL, input) + if err != nil { + t.Errorf("fail: %v", err) + } + expected := &URL{ + Protocol: "gemini", + Hostname: "smol.gr", + Port: 1965, + Path: "/a/b/c/d", + Descr: "", + Full: "gemini://smol.gr:1965/a/b/c/d", + } + pass := reflect.DeepEqual(output, expected) + if !pass { + t.Errorf("fail: %#v != %#v", output, expected) + } +} diff --git a/gemini/network.go b/gemini/network.go index 57506ba..9a16e14 100644 --- a/gemini/network.go +++ b/gemini/network.go @@ -4,15 +4,18 @@ import ( "crypto/tls" "errors" "fmt" + "gemini-grc/logging" "io" "net" gourl "net/url" "regexp" "slices" "strconv" + "strings" "time" "gemini-grc/config" + "github.com/guregu/null/v5" ) @@ -55,8 +58,7 @@ func ConnectAndGetData(url string) ([]byte, error) { host := fmt.Sprintf("%s:%s", hostname, port) // Establish the underlying TCP connection. dialer := &net.Dialer{ - Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second, - KeepAlive: 10 * time.Second, + Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second, } conn, err := dialer.Dial("tcp", host) if err != nil { @@ -95,7 +97,11 @@ func ConnectAndGetData(url string) ([]byte, error) { var data []byte // Send Gemini request to trigger server response. - _, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url))) + // Fix for stupid server bug: + // Some servers return 'Header: 53 No proxying to other hosts or ports!' + // when the port is 1965 and is still specified explicitely in the URL. + _url, _ := ParseURL(url, "") + _, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", _url.StringNoDefaultPort()))) if err != nil { return nil, fmt.Errorf("%w: %w", ErrNetworkCannotWrite, err) } @@ -111,9 +117,8 @@ func ConnectAndGetData(url string) ([]byte, error) { if err != nil { if errors.Is(err, io.EOF) { break - } else { - return nil, fmt.Errorf("%w: %w", ErrNetwork, err) } + return nil, fmt.Errorf("%w: %w", ErrNetwork, err) } } return data, nil @@ -121,7 +126,19 @@ func ConnectAndGetData(url string) ([]byte, error) { // Visit given URL, using the Gemini protocol. // Mutates given Snapshot with the data. -func Visit(s *Snapshot) error { +// In case of error, we store the error string +// inside snapshot and return the error. +func Visit(s *Snapshot) (err error) { + // Don't forget to also store error + // response code (if we have one) + defer func() { + if err != nil { + s.Error = null.StringFrom(err.Error()) + if errors.As(err, new(*GeminiError)) { + s.ResponseCode = null.IntFrom(int64(err.(*GeminiError).Code)) + } + } + }() data, err := ConnectAndGetData(s.URL.String()) if err != nil { return err @@ -130,6 +147,9 @@ func Visit(s *Snapshot) error { if err != nil { return err } + //marshalled, _ := json.MarshalIndent(pageData, "", " ") + //fmt.Printf("%s\n", marshalled) + s.Header = null.StringFrom(pageData.ResponseHeader) s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode)) s.MimeType = null.StringFrom(pageData.MimeType) s.Lang = null.StringFrom(pageData.Lang) @@ -142,24 +162,21 @@ func Visit(s *Snapshot) error { return nil } -// Update given snapshot with the -// Gemini header data: response code, -// mime type and lang (optional) +// processData returne results from +// parsing Gemini header data: +// Code, mime type and lang (optional) +// Returns error if header was invalid func processData(data []byte) (*PageData, error) { header, body, err := getHeadersAndData(data) if err != nil { return nil, err } code, mimeType, lang := getMimeTypeAndLang(header) - var geminiError error + logging.LogDebug("Header: %s", strings.TrimSpace(header)) if code != 20 { - geminiError = NewErrGeminiStatusCode(code, header) + return nil, NewErrGeminiStatusCode(code, header) } - fmt.Printf("%v\n", header) - if geminiError != nil { - return nil, geminiError - } pageData := PageData{ ResponseCode: code, ResponseHeader: header, @@ -201,11 +218,11 @@ func getHeadersAndData(data []byte) (string, []byte, error) { // `20 text/gemini` (code, mimetype) // `31 gemini://redirected.to/other/site` (code) func getMimeTypeAndLang(headers string) (int, string, string) { - // Regex that parses code, mimetype & lang - re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(lang=([a-zA-Z0-9-]+)))?\s*$`) + // Regex that parses code, mimetype & optional charset/lang parameters + re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)(?:[;\s]+(?:(?:charset|lang)=([a-zA-Z0-9-]+)))?\s*$`) matches := re.FindStringSubmatch(headers) if matches == nil || len(matches) <= 1 { - // Try to get code at least. + // Try to get code at least re := regexp.MustCompile(`^(\d+)\s+`) matches := re.FindStringSubmatch(headers) if matches == nil || len(matches) <= 1 { @@ -222,6 +239,6 @@ func getMimeTypeAndLang(headers string) (int, string, string) { return 0, "", "" } mimeType := matches[2] - lang := matches[4] - return code, mimeType, lang + param := matches[3] // This will capture either charset or lang value + return code, mimeType, param } diff --git a/gemini/network_test.go b/gemini/network_test.go index 79cc71e..81202db 100644 --- a/gemini/network_test.go +++ b/gemini/network_test.go @@ -21,7 +21,31 @@ func TestGetMimeTypeAndLang11(t *testing.T) { } } +func TestGetMimeTypeAndLang12(t *testing.T) { + t.Parallel() + code, mimeType, lang := getMimeTypeAndLang("20 text/plain; charset=utf-8") + if code != 20 || mimeType != "text/plain" || lang != "utf-8" { + t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang) + } +} + +func TestGetMimeTypeAndLang13(t *testing.T) { + t.Parallel() + code, mimeType, lang := getMimeTypeAndLang("20 text/gemini; charset=utf-8") + if code != 20 || mimeType != "text/gemini" || lang != "utf-8" { + t.Errorf("Expected (20, 'text/plain', ''), got (%d, '%s', '%s')", code, mimeType, lang) + } +} + func TestGetTypeAndLang2(t *testing.T) { + t.Parallel() + code, mimeType, lang := getMimeTypeAndLang("20 text/gemini charset=en") + if code != 20 || mimeType != "text/gemini" || lang != "en" { + t.Errorf("Expected (20, 'text/gemini', 'en'), got (%d, '%s', '%s')", code, mimeType, lang) + } +} + +func TestGetTypeAndLang21(t *testing.T) { t.Parallel() code, mimeType, lang := getMimeTypeAndLang("20 text/gemini lang=en") if code != 20 || mimeType != "text/gemini" || lang != "en" { diff --git a/gemini/persistence.go b/gemini/persistence.go index 778873a..5c0e377 100644 --- a/gemini/persistence.go +++ b/gemini/persistence.go @@ -1,10 +1,13 @@ package gemini import ( + "encoding/json" "fmt" + "gemini-grc/config" "os" "gemini-grc/logging" + _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL "github.com/jmoiron/sqlx" ) @@ -33,25 +36,40 @@ func ConnectToDB() *sqlx.DB { return db } -func SaveSnapshotToDBIfNotExists(tx *sqlx.Tx, s *Snapshot) error { +func SaveSnapshotIfNew(tx *sqlx.Tx, s *Snapshot) error { + marshalled, err := json.MarshalIndent(s, "", " ") + if err != nil { + panic(fmt.Sprintf("JSON serialization error for %v", s)) + } + if config.CONFIG.DryRun { + logging.LogDebug("Would insert (if new) snapshot %s", marshalled) + return nil + } query := ` - INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) - VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) + INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) + VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) ON CONFLICT (url) DO NOTHING ` - _, err := tx.NamedExec(query, s) + _, err = tx.NamedExec(query, s) if err != nil { - logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err) - return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking + return fmt.Errorf("[%s] GeminiError inserting snapshot: %w", s.URL, err) } return nil } -func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error { - fmt.Printf("%+v", s) +func UpsertSnapshot(id int, tx *sqlx.Tx, s *Snapshot) error { + marshalled, err := json.MarshalIndent(s, "", " ") + if err != nil { + panic(fmt.Sprintf("JSON serialization error for %v", s)) + } + if config.CONFIG.DryRun { + logging.LogDebug("[%d] Would upsert snapshot %s", id, marshalled) + return nil + } + query := ` - INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) - VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) +INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) + VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) ON CONFLICT (url) DO UPDATE SET url = EXCLUDED.url, host = EXCLUDED.host, @@ -62,23 +80,29 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error { links = EXCLUDED.links, lang = EXCLUDED.lang, response_code = EXCLUDED.response_code, - error = EXCLUDED.error - ` - _, err := tx.NamedExec(query, s) + error = EXCLUDED.error` + _, err = tx.NamedExec(query, s) + //if err != nil { + // logging.LogError("[%s] GeminiError upserting snapshot: %w", s.URL, err) + // panic("This shouldn't happen") + //} if err != nil { - logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err) - return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking + return fmt.Errorf("[%s] GeminiError upserting snapshot: %w", s.URL, err) } return nil } func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error { + if config.CONFIG.DryRun { + return nil + } + // Approximately 5,957 rows maximum (65535/11 parameters), use 5000 to be safe const batchSize = 5000 query := ` - INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) - VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) + INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) + VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) ON CONFLICT (url) DO NOTHING ` @@ -92,7 +116,7 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error { _, err := tx.NamedExec(query, batch) if err != nil { - logging.LogError("Error batch inserting snapshots: %w", err) + logging.LogError("GeminiError batch inserting snapshots: %w", err) return fmt.Errorf("DB error: %w", err) } } @@ -101,14 +125,17 @@ func SaveLinksToDBinBatches(tx *sqlx.Tx, snapshots []*Snapshot) error { } func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error { + if config.CONFIG.DryRun { + return nil + } query := ` - INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) - VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) + INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error) + VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error) ON CONFLICT (url) DO NOTHING ` _, err := tx.NamedExec(query, snapshots) if err != nil { - logging.LogError("Error batch inserting snapshots: %w", err) + logging.LogError("GeminiError batch inserting snapshots: %w", err) return fmt.Errorf("DB error: %w", err) } return nil diff --git a/gemini/robotmatch.go b/gemini/robotmatch.go index bd1f8ad..b786204 100644 --- a/gemini/robotmatch.go +++ b/gemini/robotmatch.go @@ -57,8 +57,8 @@ func populateBlacklist(key string) (entries []string) { // RobotMatch checks if the snapshot URL matches // a robots.txt allow rule. func RobotMatch(url URL) bool { - logging.LogDebug("Checking robots.txt cache for %s", url.String()) key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port)) + logging.LogDebug("Checking robots.txt cache for %s", key) var disallowedURLs []string cacheEntries, ok := RobotsCache.Load(key) if !ok { diff --git a/gemini/snapshot.go b/gemini/snapshot.go index 9d8ec59..1acf78d 100644 --- a/gemini/snapshot.go +++ b/gemini/snapshot.go @@ -27,46 +27,17 @@ func (l *LinkList) Scan(value interface{}) error { } type Snapshot struct { - ID int `db:"id" json:"id,omitempty"` - UID string `db:"uid" json:"uid,omitempty"` - URL URL `db:"url" json:"url,omitempty"` - Host string `db:"host" json:"host,omitempty"` - Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"` - MimeType null.String `db:"mimetype" json:"mimetype,omitempty"` - Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files. - GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files. - Links *LinkList `db:"links" json:"links,omitempty"` - Lang null.String `db:"lang" json:"lang,omitempty"` - ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code. - Error null.String `db:"error" json:"error,omitempty"` // On network errors only + ID int `db:"id" json:"id,omitempty"` + //UID string `db:"uid" json:"uid,omitempty"` + URL URL `db:"url" json:"url,omitempty"` + Host string `db:"host" json:"host,omitempty"` + Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"` + MimeType null.String `db:"mimetype" json:"mimetype,omitempty"` + Data null.Value[[]byte] `db:"data" json:"data,omitempty"` // For non text/gemini files. + GemText null.String `db:"gemtext" json:"gemtext,omitempty"` // For text/gemini files. + Header null.String `db:"header" json:"header,omitempty"` // Response header. + Links null.Value[LinkList] `db:"links" json:"links,omitempty"` + Lang null.String `db:"lang" json:"lang,omitempty"` + ResponseCode null.Int `db:"response_code" json:"code,omitempty"` // Gemini response status code. + Error null.String `db:"error" json:"error,omitempty"` // On network errors only } - -//func SnapshotToJSON(g Snapshot) string { -// // Serialize the Person struct to JSON -// jsonData, err := json.MarshalIndent(g, "", "\t") -// if err != nil { -// logging.LogError("Error serializing to JSON: %w", err) -// } -// return string(jsonData) -//} -// -//func SnapshotFromJSON(input string) Snapshot { -// var snapshot Snapshot -// err := json.Unmarshal([]byte(input), &snapshot) -// if err != nil { -// logging.LogError("Error deserializing from JSON: %w", err) -// } -// return snapshot -//} -// -//func ShouldPersistSnapshot(result *Snapshot) bool { -// if !result.MimeType.Valid { -// return false -// } -// if result.MimeType.String == "text/gemini" || -// strings.HasPrefix(result.MimeType.String, "image/") || -// strings.HasPrefix(result.MimeType.String, "text/") { -// return true -// } -// return false -//} diff --git a/gemini/worker.go b/gemini/worker.go index 1d21fe5..4137b75 100644 --- a/gemini/worker.go +++ b/gemini/worker.go @@ -3,14 +3,13 @@ package gemini import ( "errors" "fmt" - "regexp" "strings" "time" "gemini-grc/config" "gemini-grc/logging" - "gemini-grc/uid" "gemini-grc/util" + "github.com/guregu/null/v5" "github.com/jmoiron/sqlx" ) @@ -27,12 +26,13 @@ func SpawnWorkers(numOfWorkers int, db *sqlx.DB) { } func RunWorker(id int, db *sqlx.DB, url *string) { - // Start the DB transaction + // Each worker runs within a DB transaction. tx, err := db.Beginx() if err != nil { logging.LogError("Failed to begin transaction: %w", err) } + // Commit/rollback at the end defer func() { err = tx.Commit() if err != nil { @@ -46,73 +46,57 @@ func RunWorker(id int, db *sqlx.DB, url *string) { var snapshots []Snapshot + // If not given a specific URL, + // get some random ones to visit from DB. if url == nil { snapshots, err = GetRandomSnapshotsDistinctHosts(tx) - } else { - snapshots, err = GetSnapshotFromURL(tx, *url) - if len(snapshots) == 0 { - snapshotURL, err := ParseURL(*url, "") - if err != nil { - panic("Invalid URL: " + *url) - } - snapshots = []Snapshot{{ - UID: uid.UID(), - URL: *snapshotURL, - Host: snapshotURL.Hostname, - Timestamp: null.TimeFrom(time.Now()), - }} + if err != nil { + logging.LogError("[%d] GeminiError retrieving snapshot: %w", id, err) + panic("This should never happen") + } else if len(snapshots) == 0 { + logging.LogInfo("[%d] No snapshots to visit.", id) + time.Sleep(1 * time.Minute) + return } + } else { + snapshotURL, err := ParseURL(*url, "") + if err != nil { + logging.LogError("Invalid URL given: " + *url) + return + + } + snapshots = []Snapshot{{ + //UID: uid.UID(), + URL: *snapshotURL, + Host: snapshotURL.Hostname, + Timestamp: null.TimeFrom(time.Now()), + }} } - if err != nil { - logging.LogError("[%d] Error retrieving snapshot: %w", id, err) - time.Sleep(10 * time.Second) - return - } else if len(snapshots) == 0 { - logging.LogInfo("[%d] No snapshots to visit.", id) - time.Sleep(1 * time.Minute) - return - } + // Start visiting URLs. total := len(snapshots) for i, s := range snapshots { logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL.String()) + // We differentiate between errors: + // Unexpected errors are the ones returned from the following function. + // If an error is unexpected (which should never happen) we panic. + // Expected errors are stored as strings within the snapshot, + // so that they can also be stored in DB. err = workOnSnapshot(id, tx, &s) if err != nil { - logging.LogError("[%d] [%s] Unexpected Error %w", id, s.URL.String(), err) + logging.LogError("[%d] [%s] Unexpected GeminiError %w", id, s.URL.String(), err) util.PrintStackAndPanic(err) } if s.Error.Valid { - logging.LogWarn("[%d] [%s] Worker Error: %v", id, s.URL.String(), s.Error.String) + logging.LogWarn("[%d] Error: %v", id, s.Error.String) } logging.LogDebug("[%d] Done %d/%d.", id, i+1, total) } logging.LogInfo("[%d] Worker done.", id) } -func handleRedirection(tx *sqlx.Tx, s *Snapshot) error { - re := regexp.MustCompile(`gemini://\S+`) - matches := re.FindStringSubmatch(s.Error.ValueOrZero()) - if len(matches) == 1 { - newURL := matches[0] - logging.LogDebug("Page redirects to %s", newURL) - _url, err := ParseURL(newURL, "") - // Insert fresh snapshot with new URL - if err == nil { - snapshot := &Snapshot{ - UID: uid.UID(), - URL: *_url, - Host: _url.Hostname, - Timestamp: null.TimeFrom(time.Now()), - } - err := SaveSnapshotToDBIfNotExists(tx, snapshot) - if err != nil { - return err - } - } - } - return nil -} - +// workOnSnapshot visits a URL and stores the result. +// errors should be returned only if they are unexpected. func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { if IsBlacklisted(s.URL) { logging.LogInfo("[%d] URL matches Blacklist, ignoring %s", id, s.URL.String()) @@ -123,31 +107,26 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { // add it as an error so next time it won't be // crawled. if RobotMatch(s.URL) { - s.Error = null.StringFrom("robots.txt disallow match") - err = SaveSnapshotToDB(tx, s) + s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error()) + err = UpsertSnapshot(id, tx, s) if err != nil { - return fmt.Errorf("[%d] DB Error: %w", id, err) + return fmt.Errorf("[%d] %w", id, err) } return nil } + // Resolve IP address via DNS IPs, err := getHostIPAddresses(s.Host) if err != nil { - s.Error = null.StringFrom("DNS Resolve error") - err = SaveSnapshotToDB(tx, s) + s.Error = null.StringFrom(err.Error()) + err = UpsertSnapshot(id, tx, s) if err != nil { - return fmt.Errorf("[%d] DB Error: %w", id, err) + return fmt.Errorf("[%d] %w", id, err) } return nil } - defer func() { - time.Sleep(5 * time.Second) - RemoveIPsFromPool(IPs) - }() - - // If the host's ip is in the connections pool, - // stop and add the url in the queue later. + // If the host's ip is in the connections pool we stop IpPool.Lock.RLock() logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL.String()) for _, ip := range IPs { @@ -155,7 +134,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { if ok { logging.LogDebug("[%d] Another worker is visiting this host: %s", id, s.URL.String()) IpPool.Lock.RUnlock() - time.Sleep(1 * time.Second) // Avoid flood-retrying when few URLs remain + time.Sleep(1 * time.Second) // Avoid flood-retrying return nil } } @@ -163,84 +142,115 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) { AddIPsToPool(IPs) + // After finishing, remove the host IPs from + // the connections pool, with a small delay + // to avoid potentially hitting the same IP quickly. + defer func() { + time.Sleep(5 * time.Second) + RemoveIPsFromPool(IPs) + }() + url := s.URL.String() logging.LogDebug("[%d] Dialing %s", id, url) + err = Visit(s) if err != nil { if !IsKnownError(err) { logging.LogError("[%d] Unknown error visiting %s: %w", id, url, err) - if config.CONFIG.PanicOnUnexpectedError { - util.PrintStackAndPanic(err) - } - } else { - s.Error = null.StringFrom(err.Error()) + return err } - if errors.As(err, new(*ErrGeminiStatusCode)) { - err = handleRedirection(tx, s) + // Check if error is redirection, and handle it + s.Error = null.StringFrom(err.Error()) + if errors.As(err, new(*GeminiError)) && + err.(*GeminiError).Msg == "redirect" { + err = handleRedirection(id, tx, s) if err != nil { return err } } } - logging.LogDebug("[%d] Finished dialing.", id) + logging.LogInfo("[%d] Done, response code %d.", id, s.ResponseCode.ValueOrZero()) + // If this is a gemini page, parse possible links inside if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" { - logging.LogDebug("[%d] [%s] Processing", id, url) - s = ProcessGemini(s) + links := GetPageLinks(s.URL, s.GemText.String) + logging.LogDebug("[%d] Found %d links", id, len(links)) + if len(links) > 0 { + s.Links = null.ValueFrom(links) + } + } else { + logging.LogDebug("[%d] Not looking for page links", id) } - logging.LogDebug("[%d] Saving", id) - err = SaveSnapshotToDB(tx, s) + + err = UpsertSnapshot(id, tx, s) if err != nil { - return fmt.Errorf("[%d] DB Error: %w", id, err) + return err } - // Store links in batch - if s.Links != nil { - var batchSnapshots []*Snapshot - timestamp := null.TimeFrom(time.Now()) + err = storeLinks(tx, s) + if err != nil { + return err + } + return nil +} - for _, link := range *s.Links { - if shouldPersistURL(tx, link) { +func storeLinks(tx *sqlx.Tx, s *Snapshot) error { + if s.Links.Valid { + var batchSnapshots []*Snapshot + for _, link := range s.Links.ValueOrZero() { + if shouldPersistURL(link) { newSnapshot := &Snapshot{ - UID: uid.UID(), + //UID: uid.UID(), URL: link, Host: link.Hostname, - Timestamp: timestamp, + Timestamp: null.TimeFrom(time.Now()), } batchSnapshots = append(batchSnapshots, newSnapshot) } } if len(batchSnapshots) > 0 { - logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots)) - err = SaveLinksToDBinBatches(tx, batchSnapshots) + err := SaveLinksToDBinBatches(tx, batchSnapshots) if err != nil { - return fmt.Errorf("[%d] DB Error: %w", id, err) + return err } } } return nil } -// Should we save the given URL for crawling? -func shouldPersistURL(tx *sqlx.Tx, u URL) bool { - if !strings.HasPrefix(u.String(), "gemini://") { - return false - } - query := `SELECT EXISTS(SELECT 1 FROM snapshots WHERE URL=$1)` - var exists bool - err := tx.Get(&exists, query, u.String()) +// shouldPersistURL returns true if we +// should save the URL in the DB. +// Only gemini:// urls are saved. +func shouldPersistURL(u URL) bool { + return strings.HasPrefix(u.String(), "gemini://") +} + +func handleRedirection(id int, tx *sqlx.Tx, s *Snapshot) error { + newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero()) if err != nil { - fmt.Println("Error executing query:", err) - return false + return err } - return !exists + logging.LogDebug("[%d] Page redirects to %s", id, newURL) + // Insert fresh snapshot with new URL + snapshot := &Snapshot{ + //UID: uid.UID(), + URL: *newURL, + Host: newURL.Hostname, + Timestamp: null.TimeFrom(time.Now()), + } + logging.LogDebug("[%d] Saving empty snapshot for %s", id, snapshot.URL.String()) + err = SaveSnapshotIfNew(tx, snapshot) + if err != nil { + return err + } + return nil } func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) { // Old, unoptimized query - // - // query := ` + + // query := ` // SELECT DISTINCT ON (host) * // FROM snapshots // WHERE response_code IS NULL @@ -249,20 +259,28 @@ func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) { // LIMIT $1 // ` query := ` - WITH RankedSnapshots AS ( - SELECT id, uid, url, host, timestamp, mimetype, data, gemtext, - links, lang, response_code, error, - ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn - FROM snapshots - WHERE response_code IS NULL - AND error IS NULL - ) - SELECT id, uid, url, host, timestamp, mimetype, data, gemtext, - links, lang, response_code, error - FROM RankedSnapshots - WHERE rn = 1 - LIMIT $1 - ` + SELECT * + FROM snapshots + WHERE response_code IS NULL + AND error IS NULL + ORDER BY RANDOM() + LIMIT $1 + ` + //query := ` + // WITH RankedSnapshots AS ( + // SELECT id, url, host, timestamp, mimetype, data, gemtext, + // links, lang, response_code, error, + // ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn + // FROM snapshots + // WHERE response_code IS NULL + // AND error IS NULL + // ) + // SELECT id, url, host, timestamp, mimetype, data, gemtext, + // links, lang, response_code, error + // FROM RankedSnapshots + // WHERE rn = 1 + // LIMIT $1 + //` var snapshots []Snapshot err := tx.Select(&snapshots, query, config.CONFIG.WorkerBatchSize) if err != nil { diff --git a/main.go b/main.go index 532aa41..3e695ab 100644 --- a/main.go +++ b/main.go @@ -1,16 +1,16 @@ package main import ( - "os" - "os/signal" - "syscall" - "gemini-grc/config" "gemini-grc/gemini" + "gemini-grc/http" "gemini-grc/logging" "github.com/jmoiron/sqlx" "github.com/rs/zerolog" zlog "github.com/rs/zerolog/log" + "os" + "os/signal" + "syscall" ) func main() { @@ -26,9 +26,10 @@ func main() { func runApp() error { logging.LogInfo("Starting up. Press Ctrl+C to exit") - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + signals := make(chan os.Signal, 1) + signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM) + server := http.CreateServer("localhost:8899") db := gemini.ConnectToDB() // !!! DANGER !!! @@ -44,7 +45,8 @@ func runApp() error { }(db) gemini.LoadBlacklist() - + // If there's an argument, assume it's a URL + // to visit and ignore database state. if len(os.Args) > 1 { url := os.Args[1] go gemini.RunWorker(0, db, &url) @@ -52,7 +54,10 @@ func runApp() error { go gemini.SpawnWorkers(config.CONFIG.NumOfWorkers, db) } - <-sigs + <-signals + if err := server.Close(); err != nil { + logging.LogError("GeminiError during server shutdown: %s", err) + } logging.LogInfo("Received SIGINT or SIGTERM signal, exiting") return nil }