From 74e9327b0bd3a4fda3807e1c1a4fcf123b7eeb05 Mon Sep 17 00:00:00 2001 From: antanst Date: Mon, 7 Oct 2024 13:36:20 +0300 Subject: [PATCH] Persist pages to file system --- .gitignore | 2 ++ fs.go | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++ gemini-url.go | 37 ++++++++++++++++++++++ gemini.go | 45 ++++++++++++++++----------- go.mod | 6 +++- go.sum | 2 ++ main.go | 35 +++++++++++++-------- network.go | 31 ++++++++++++------- snapshot.go | 42 +++++++++++++++++++++++++ types.go | 25 --------------- uid.go | 14 +++++++++ 11 files changed, 256 insertions(+), 69 deletions(-) create mode 100644 fs.go create mode 100644 gemini-url.go create mode 100644 snapshot.go delete mode 100644 types.go create mode 100644 uid.go diff --git a/.gitignore b/.gitignore index 485dee6..2e05024 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ .idea + +/gemini-grc diff --git a/fs.go b/fs.go new file mode 100644 index 0000000..2f38034 --- /dev/null +++ b/fs.go @@ -0,0 +1,86 @@ +package main + +import ( + "fmt" + "net/url" + "os" + "path/filepath" + "strings" +) + +// sanitizePath encodes invalid filesystem characters using URL encoding. +// Example: +// /example/path/to/page?query=param&another=value +// would become +// example/path/to/page%3Fquery%3Dparam%26another%3Dvalue +func sanitizePath(p string) string { + // Split the path into its components + components := strings.Split(p, "/") + + // Encode each component separately + for i, component := range components { + // Decode any existing percent-encoded characters + decodedComponent, err := url.PathUnescape(component) + if err != nil { + decodedComponent = component // Fallback to original if unescape fails + } + + // Encode the component to escape invalid filesystem characters + encodedComponent := url.QueryEscape(decodedComponent) + + // Replace '+' (from QueryEscape) with '%20' to handle spaces correctly + encodedComponent = strings.ReplaceAll(encodedComponent, "+", "%20") + + components[i] = encodedComponent + } + + // Rejoin the components into a sanitized path + safe := filepath.Join(components...) + + return safe +} + +// getFilePath constructs a safe file path from the root path and URL path. +// It URL-encodes invalid filesystem characters to ensure the path is valid. +func calcFilePath(rootPath, urlPath string) (string, error) { + // Normalize the URL path + cleanPath := filepath.Clean(urlPath) + + fmt.Printf("%s %s\n", urlPath, cleanPath) + // Safe check to prevent directory traversal + if strings.Contains(cleanPath, "..") { + return "", fmt.Errorf("invalid URL path: contains directory traversal") + } + + // Sanitize the path by encoding invalid characters + safePath := sanitizePath(cleanPath) + + // Join the root path and the sanitized URL path + finalPath := filepath.Join(rootPath, safePath) + + // Ensure the directory exists + dir := filepath.Dir(finalPath) + if err := os.MkdirAll(dir, os.ModePerm); err != nil { + return "", fmt.Errorf("failed to create directories: %v", err) + } + + return finalPath, nil +} + +func SaveResult(rootPath string, s *Snapshot) { + urlPath := s.Url.Path + if urlPath == "" || urlPath == "/" { + urlPath = fmt.Sprintf("%s/index.gmi", s.Url.Hostname) + } + filepath, err := calcFilePath(rootPath, urlPath) + if err != nil { + LogError("Error saving %s: %w", s.Url, err) + return + } + // err = os.WriteFile(filepath, []byte(SnapshotToJSON(*s)), 0666) + err = os.WriteFile(filepath, []byte((*s).Data), 0666) + if err != nil { + LogError("Error saving %s: %w", s.Url.Full, err) + } + LogInfo("[%s] Saved to %s", s.Url.Full, filepath) +} diff --git a/gemini-url.go b/gemini-url.go new file mode 100644 index 0000000..41f9a1b --- /dev/null +++ b/gemini-url.go @@ -0,0 +1,37 @@ +package main + +import ( + "encoding/json" +) + +type GeminiUrl struct { + Protocol string `json:"protocol,omitempty"` + Hostname string `json:"hostname,omitempty"` + Port int `json:"port,omitempty"` + Path string `json:"path,omitempty"` + Descr string `json:"descr,omitempty"` + Full string `json:"full,omitempty"` +} + +func (u GeminiUrl) String() string { + return u.Full + // return fmt.Sprintf("%s://%s:%d%s", u.Protocol, u.Hostname, u.Port, u.Path) +} + +func GeminiUrltoJSON(g GeminiUrl) string { + // Serialize the Person struct to JSON + jsonData, err := json.Marshal(g) + if err != nil { + LogError("Error serializing to JSON: %w", err) + } + return string(jsonData) +} + +func GeminiUrlFromJSON(input string) GeminiUrl { + var geminiUrl GeminiUrl + err := json.Unmarshal([]byte(input), &geminiUrl) + if err != nil { + LogError("Error deserializing from JSON: %w", err) + } + return geminiUrl +} diff --git a/gemini.go b/gemini.go index 313c8c5..647d03d 100644 --- a/gemini.go +++ b/gemini.go @@ -8,34 +8,34 @@ import ( "strconv" ) -func Process(result *Result) *Result { - LogInfo("[%s] Processing data", result.url.String()) - code, err := ParseFirstTwoDigits(result.data) +func Process(snapshot *Snapshot) *Snapshot { + LogInfo("[%s] Processing data", snapshot.Url.String()) + code, err := ParseFirstTwoDigits(snapshot.Data) if err != nil { - result.error = fmt.Errorf("[%s] Invalid gemini response code", result.url.String()) - return result + snapshot.Error = fmt.Errorf("[%s] Invalid gemini response code", snapshot.Url.String()) + return snapshot } if code != 20 { - result.error = fmt.Errorf("[%s] Gemini response code != 20, skipping", result.url.String()) - return result + snapshot.Error = fmt.Errorf("[%s] Gemini response code != 20, skipping", snapshot.Url.String()) + return snapshot } // Grab link lines - linkLines := ExtractLinkLines(result.data) - LogDebug("[%s] Found %d links", result.url.String(), len(linkLines)) - // Normalize URLs in links, and store them in result + linkLines := ExtractLinkLines(snapshot.Data) + LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines)) + // Normalize URLs in links, and store them in snapshot for _, line := range linkLines { - normalizedLink, descr, error := NormalizeLink(line, result.url.String()) + normalizedLink, descr, error := NormalizeLink(line, snapshot.Url.String()) if error != nil { - LogError("[%s] Invalid link URL %w", result.url.String(), error) + LogError("[%s] Invalid link URL %w", snapshot.Url.String(), error) continue } geminiUrl, error := ParseUrl(normalizedLink, descr) if error != nil { - LogError("[%s] Unparseable gemini link %w", result.url.String(), error) + LogError("[%s] Unparseable gemini link %w", snapshot.Url.String(), error) } - result.links = append(result.links, *geminiUrl) + snapshot.Links = append(snapshot.Links, *geminiUrl) } - return result + return snapshot } func ParseUrl(input string, descr string) (*GeminiUrl, error) { @@ -46,6 +46,7 @@ func ParseUrl(input string, descr string) (*GeminiUrl, error) { protocol := u.Scheme hostname := u.Hostname() str_port := u.Port() + path := u.Path if str_port == "" { str_port = "1965" } @@ -53,7 +54,7 @@ func ParseUrl(input string, descr string) (*GeminiUrl, error) { if err != nil { return nil, fmt.Errorf("Error parsing URL %s: %w", input, err) } - return &GeminiUrl{protocol: protocol, hostname: hostname, port: port, path: u.Path, descr: descr}, nil + return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil } // ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines @@ -107,6 +108,14 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin // Construct the canonicalized link line canonicalURLStr := parsedURL.String() + + // Remove usual first space from URL description: + // => URL description + // ^^^^^^^^^^^^ + if restOfLine[0] == ' ' { + restOfLine = restOfLine[1:] + } + return canonicalURLStr, restOfLine, nil // canonicalizedLine := fmt.Sprintf("=> %s%s", canonicalURLStr, restOfLine) // return canonicalizedLine, nil @@ -125,10 +134,10 @@ func ParseFirstTwoDigits(input string) (int, error) { } // Parse the captured match as an integer - result, err := strconv.Atoi(matches[1]) + snapshot, err := strconv.Atoi(matches[1]) if err != nil { return 0, fmt.Errorf("failed to convert matched digits to int: %v", err) } - return result, nil + return snapshot, nil } diff --git a/go.mod b/go.mod index b4c0810..1ddc2f2 100644 --- a/go.mod +++ b/go.mod @@ -2,9 +2,13 @@ module gemini-grc go 1.23.1 +require ( + github.com/jaevor/go-nanoid v1.4.0 + github.com/rs/zerolog v1.33.0 +) + require ( github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect - github.com/rs/zerolog v1.33.0 // indirect golang.org/x/sys v0.25.0 // indirect ) diff --git a/go.sum b/go.sum index b52b48f..bd70927 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/jaevor/go-nanoid v1.4.0 h1:mPz0oi3CrQyEtRxeRq927HHtZCJAAtZ7zdy7vOkrvWs= +github.com/jaevor/go-nanoid v1.4.0/go.mod h1:GIpPtsvl3eSBsjjIEFQdzzgpi50+Bo1Luk+aYlbJzlc= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= diff --git a/main.go b/main.go index 09a4566..1c846d9 100644 --- a/main.go +++ b/main.go @@ -8,19 +8,22 @@ import ( zlog "github.com/rs/zerolog/log" ) +const ROOTPATH string = "./a" + func main() { zerolog.TimeFieldFormat = zerolog.TimeFormatUnix - // zerolog.SetGlobalLevel(zerolog.DebugLevel) - zerolog.SetGlobalLevel(zerolog.InfoLevel) - zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr}) + zerolog.SetGlobalLevel(zerolog.DebugLevel) + //zerolog.SetGlobalLevel(zerolog.InfoLevel) + zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"}) if err := runApp(); err != nil { - LogError("Application error: %v", err) + LogError("Application error: %w", err) os.Exit(1) } } func runApp() error { - urls := []string{"gemini://smol.gr"} //, "gemini://gmi.noulin.neta/", "gemini://in.gr:443"} + //urls := []string{"gemini://smol.gr"} + urls := []string{"gemini://smol.gr", "gemini://gmi.noulin.net/"} queue := make(chan string) done := make(chan struct{}) @@ -45,7 +48,7 @@ func runApp() error { func crawler(queue <-chan string, done chan struct{}) { // Start processing results. - results := make(chan Result) + results := make(chan Snapshot) resultsDone := make(chan struct{}) go resultsHandler(results, resultsDone) @@ -68,7 +71,7 @@ func crawler(queue <-chan string, done chan struct{}) { LogInfo("All workers have finished") // Nobody left to send to results, so we - // close it, and the ResultsProcessor can + // close it, and the SnapshotsProcessor can // finish close(results) <-resultsDone @@ -76,28 +79,34 @@ func crawler(queue <-chan string, done chan struct{}) { close(done) } -func resultsHandler(results <-chan Result, done chan struct{}) { +func resultsHandler(results <-chan Snapshot, done chan struct{}) { for result := range results { - if result.error != nil { - LogError("%w", result.error) + if result.Error != nil { + LogError("[%s] %w", result.Url, result.Error) } else { - LogInfo("[%s] Done. Result: %#v", result.url, result) + LogInfo("[%s] Done", result.Url) + // fmt.Printf(SnapshotToJSON(result)) } } LogInfo("All results have been processed") close(done) } -func worker(queue <-chan string, results chan Result) { +func worker(queue <-chan string, results chan Snapshot) { for url := range queue { result := Visit(url) // If we encountered an error when // visiting, skip processing - if result.error != nil { + if result.Error != nil { results <- *result continue } result = Process(result) + if result.Error != nil { + results <- *result + continue + } + SaveResult(ROOTPATH, result) results <- *result } } diff --git a/network.go b/network.go index 7702abc..9e31cd9 100644 --- a/network.go +++ b/network.go @@ -7,35 +7,42 @@ import ( "time" ) -func Visit(url string) (result *Result) { - result = &Result{} +func Visit(url string) (result *Snapshot) { + result = &Snapshot{Timestamp: time.Now(), UID: UID()} // Wrap error with additional information defer func() { - if result.error != nil { - result.error = fmt.Errorf("[%s] Error: %w", result.url, result.error) + if result.Error != nil { + result.Error = fmt.Errorf("[%s] Error: %w", result.Url, result.Error) } }() geminiUrl, err := ParseUrl(url, "") if err != nil { - result.error = err + result.Error = err return result } - result.url = *geminiUrl + result.Url = *geminiUrl - LogInfo("[%s] Dialing", geminiUrl.String()) + LogInfo("[%s] Dialing", geminiUrl) // Establish a TLS connection tlsConfig := &tls.Config{ InsecureSkipVerify: true, } - conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.hostname, geminiUrl.port), tlsConfig) + conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.Hostname, geminiUrl.Port), tlsConfig) if err != nil { - result.error = err + result.Error = err return result } - defer conn.Close() + // Defer properly: Also handle possible + // error of conn.Close() + defer func() { + err := conn.Close() + if err != nil { + result.Error = fmt.Errorf("[%s] Closing connection error, ignoring: %w", result.Url.String(), err) + } + }() // Read data from the connection conn.SetReadDeadline(time.Now().Add(5 * time.Second)) @@ -53,7 +60,7 @@ func Visit(url string) (result *Result) { if err == io.EOF { break } else { - result.error = err + result.Error = err return result } } @@ -61,6 +68,6 @@ func Visit(url string) (result *Result) { LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data)) // time.Sleep(time.Duration(time.Second * 2)) // LogDebug("[%s] Visitor finished", geminiUrl.String()) - result.data = string(data) + result.Data = string(data) return result } diff --git a/snapshot.go b/snapshot.go new file mode 100644 index 0000000..fb1f32e --- /dev/null +++ b/snapshot.go @@ -0,0 +1,42 @@ +package main + +import ( + "encoding/json" + "fmt" + "time" +) + +type Snapshot struct { + Url GeminiUrl `json:"url,omitempty"` + Timestamp time.Time `json:"timestamp,omitempty"` + Data string `json:"data,omitempty"` + Links []GeminiUrl `json:"links,omitempty"` + Code int `json:"code,omitempty"` + Error error `json:"error,omitempty"` + UID string `json:"uid,omitempty"` +} + +func (u Snapshot) String() string { + return fmt.Sprintf( + "[%s] %s %s %s %d %s", + u.UID, u.Url, u.Timestamp, u.Links, u.Code, u.Error, + ) +} + +func SnapshotToJSON(g Snapshot) string { + // Serialize the Person struct to JSON + jsonData, err := json.Marshal(g) + if err != nil { + LogError("Error serializing to JSON: %w", err) + } + return string(jsonData) +} + +func SnapshotFromJSON(input string) Snapshot { + var snapshot Snapshot + err := json.Unmarshal([]byte(input), &snapshot) + if err != nil { + LogError("Error deserializing from JSON: %w", err) + } + return snapshot +} diff --git a/types.go b/types.go deleted file mode 100644 index 5bcc108..0000000 --- a/types.go +++ /dev/null @@ -1,25 +0,0 @@ -package main - -import ( - "fmt" -) - -type GeminiUrl struct { - protocol string - hostname string - port int - path string - descr string -} - -func (self GeminiUrl) String() string { - return fmt.Sprintf("%s://%s:%d%s", self.protocol, self.hostname, self.port, self.path) -} - -type Result struct { - url GeminiUrl - data string - links []GeminiUrl - code int - error error -} diff --git a/uid.go b/uid.go new file mode 100644 index 0000000..0df6238 --- /dev/null +++ b/uid.go @@ -0,0 +1,14 @@ +package main + +import ( + nanoid "github.com/jaevor/go-nanoid" +) + +func UID() string { + // Missing o,O and l + uid, err := nanoid.CustomASCII("abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789", 18) + if err != nil { + panic(err) + } + return uid() +}