Persist pages to file system

This commit is contained in:
2024-10-07 13:36:20 +03:00
parent 74be6b4d0d
commit 74e9327b0b
11 changed files with 256 additions and 69 deletions

2
.gitignore vendored
View File

@@ -1 +1,3 @@
.idea .idea
/gemini-grc

86
fs.go Normal file
View File

@@ -0,0 +1,86 @@
package main
import (
"fmt"
"net/url"
"os"
"path/filepath"
"strings"
)
// sanitizePath converts a slash-separated URL path into a relative
// filesystem path whose components contain only filesystem-safe characters.
//
// Each component is percent-decoded (falling back to the raw text when the
// escape sequence is malformed) and then re-encoded with URL query escaping,
// with spaces written as "%20" instead of "+". Empty components (leading,
// trailing or doubled slashes) are dropped by filepath.Join, so the result
// never starts with a path separator.
//
// A component that decodes to ".." is written as "%2E%2E". Without this, an
// input such as "a/%2e%2e/b" would decode to "a/../b" and filepath.Join would
// silently resolve it, letting the result climb out of whatever directory it
// is later joined to.
//
// Example:
//
//	/example/path/to/page?query=param&another=value
//
// becomes
//
//	example/path/to/page%3Fquery%3Dparam%26another%3Dvalue
func sanitizePath(p string) string {
	components := strings.Split(p, "/")
	for i, component := range components {
		// Decode any existing percent-encoding so it is not encoded twice.
		decoded, err := url.PathUnescape(component)
		if err != nil {
			decoded = component // Malformed escape: keep the raw text.
		}

		// QueryEscape leaves dots untouched, so a parent-directory
		// reference has to be neutralized explicitly.
		if decoded == ".." {
			components[i] = "%2E%2E"
			continue
		}

		// QueryEscape writes spaces as '+'; a literal '+' has already become
		// "%2B", so every remaining '+' stands for a space.
		encoded := url.QueryEscape(decoded)
		components[i] = strings.ReplaceAll(encoded, "+", "%20")
	}
	return filepath.Join(components...)
}
// calcFilePath maps a URL path to a file path below rootPath and makes sure
// the file's parent directory exists.
//
// The URL path is normalized, checked for parent-directory components,
// passed through sanitizePath to encode characters that are unsafe in file
// names, and joined onto rootPath. The parent directories of the resulting
// path are created as a side effect.
//
// It returns the final path, or an error when the URL path would resolve
// outside rootPath or the directories cannot be created.
func calcFilePath(rootPath, urlPath string) (string, error) {
	// Normalize the URL path. ToSlash keeps the component split below (and
	// sanitizePath, which splits on "/") correct on systems whose path
	// separator is not a slash.
	cleanPath := filepath.ToSlash(filepath.Clean(urlPath))

	// Reject traversal per component: a plain substring test would also
	// refuse harmless names such as "notes..old.gmi".
	for _, component := range strings.Split(cleanPath, "/") {
		if component == ".." {
			return "", fmt.Errorf("invalid URL path: contains directory traversal")
		}
	}

	// Sanitize the path by encoding invalid characters, then anchor it at
	// the root directory.
	safePath := sanitizePath(cleanPath)
	finalPath := filepath.Join(rootPath, safePath)

	// sanitizePath percent-decodes each component, so a ".." can still
	// appear after the check above. Verify that the joined path really
	// stays below rootPath before touching the file system.
	rel, err := filepath.Rel(rootPath, finalPath)
	if err != nil || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
		return "", fmt.Errorf("invalid URL path: contains directory traversal")
	}

	// Ensure the directory exists.
	dir := filepath.Dir(finalPath)
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		return "", fmt.Errorf("failed to create directories: %w", err)
	}
	return finalPath, nil
}
// SaveResult writes the raw data of snapshot s to a file below rootPath.
//
// The file location is <hostname>/<URL path>, so pages that share a path on
// different hosts do not overwrite each other. An empty path, or one ending
// in "/", is stored as "index.gmi" inside the corresponding directory, which
// keeps that directory usable for pages below it.
//
// Failures are reported through LogError and are not returned to the caller.
func SaveResult(rootPath string, s *Snapshot) {
	pagePath := s.Url.Path
	if pagePath == "" || strings.HasSuffix(pagePath, "/") {
		pagePath = strings.TrimSuffix(pagePath, "/") + "/index.gmi"
	}
	urlPath := s.Url.Hostname + "/" + strings.TrimPrefix(pagePath, "/")

	// Named destPath rather than filepath so the path/filepath package is
	// not shadowed inside this function.
	destPath, err := calcFilePath(rootPath, urlPath)
	if err != nil {
		LogError("Error saving %s: %w", s.Url, err)
		return
	}

	// err = os.WriteFile(destPath, []byte(SnapshotToJSON(*s)), 0666)
	if err := os.WriteFile(destPath, []byte(s.Data), 0666); err != nil {
		LogError("Error saving %s: %w", s.Url.Full, err)
		// Stop here: the success message below must not follow a failed write.
		return
	}
	LogInfo("[%s] Saved to %s", s.Url.Full, destPath)
}

37
gemini-url.go Normal file
View File

@@ -0,0 +1,37 @@
package main
import (
"encoding/json"
)
// GeminiUrl holds the components of a parsed URL plus the description of the
// link it was taken from. Every field is tagged omitempty, so zero-valued
// fields are left out of the JSON encoding.
type GeminiUrl struct {
// Protocol is the URL scheme.
Protocol string `json:"protocol,omitempty"`
// Hostname is the host part of the URL, without the port.
Hostname string `json:"hostname,omitempty"`
// Port is the numeric port.
Port int `json:"port,omitempty"`
// Path is the path component of the URL.
Path string `json:"path,omitempty"`
// Descr is the human-readable link description, if any.
Descr string `json:"descr,omitempty"`
// Full is the complete URL as a single string; String returns it verbatim.
Full string `json:"full,omitempty"`
}
// String implements fmt.Stringer by returning the stored Full field as is.
// The URL is not rebuilt from its components, so the result is empty when
// Full has not been set. The commented-out line below is an alternative that
// composes the URL from the individual fields.
func (u GeminiUrl) String() string {
return u.Full
// return fmt.Sprintf("%s://%s:%d%s", u.Protocol, u.Hostname, u.Port, u.Path)
}
// GeminiUrltoJSON encodes g as a JSON object and returns the encoded text.
// If encoding fails, the error is reported through LogError and the empty
// string is returned.
func GeminiUrltoJSON(g GeminiUrl) string {
	encoded, marshalErr := json.Marshal(g)
	if marshalErr != nil {
		LogError("Error serializing to JSON: %w", marshalErr)
		// json.Marshal yields no bytes on failure, so the result is empty.
		return ""
	}
	return string(encoded)
}
// GeminiUrlFromJSON decodes a JSON object into a GeminiUrl.
// A decoding failure is reported through LogError; the value is then returned
// in whatever state json.Unmarshal left it.
func GeminiUrlFromJSON(input string) GeminiUrl {
	var parsed GeminiUrl
	if decodeErr := json.Unmarshal([]byte(input), &parsed); decodeErr != nil {
		LogError("Error deserializing from JSON: %w", decodeErr)
	}
	return parsed
}

View File

@@ -8,34 +8,34 @@ import (
"strconv" "strconv"
) )
func Process(result *Result) *Result { func Process(snapshot *Snapshot) *Snapshot {
LogInfo("[%s] Processing data", result.url.String()) LogInfo("[%s] Processing data", snapshot.Url.String())
code, err := ParseFirstTwoDigits(result.data) code, err := ParseFirstTwoDigits(snapshot.Data)
if err != nil { if err != nil {
result.error = fmt.Errorf("[%s] Invalid gemini response code", result.url.String()) snapshot.Error = fmt.Errorf("[%s] Invalid gemini response code", snapshot.Url.String())
return result return snapshot
} }
if code != 20 { if code != 20 {
result.error = fmt.Errorf("[%s] Gemini response code != 20, skipping", result.url.String()) snapshot.Error = fmt.Errorf("[%s] Gemini response code != 20, skipping", snapshot.Url.String())
return result return snapshot
} }
// Grab link lines // Grab link lines
linkLines := ExtractLinkLines(result.data) linkLines := ExtractLinkLines(snapshot.Data)
LogDebug("[%s] Found %d links", result.url.String(), len(linkLines)) LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines))
// Normalize URLs in links, and store them in result // Normalize URLs in links, and store them in snapshot
for _, line := range linkLines { for _, line := range linkLines {
normalizedLink, descr, error := NormalizeLink(line, result.url.String()) normalizedLink, descr, error := NormalizeLink(line, snapshot.Url.String())
if error != nil { if error != nil {
LogError("[%s] Invalid link URL %w", result.url.String(), error) LogError("[%s] Invalid link URL %w", snapshot.Url.String(), error)
continue continue
} }
geminiUrl, error := ParseUrl(normalizedLink, descr) geminiUrl, error := ParseUrl(normalizedLink, descr)
if error != nil { if error != nil {
LogError("[%s] Unparseable gemini link %w", result.url.String(), error) LogError("[%s] Unparseable gemini link %w", snapshot.Url.String(), error)
} }
result.links = append(result.links, *geminiUrl) snapshot.Links = append(snapshot.Links, *geminiUrl)
} }
return result return snapshot
} }
func ParseUrl(input string, descr string) (*GeminiUrl, error) { func ParseUrl(input string, descr string) (*GeminiUrl, error) {
@@ -46,6 +46,7 @@ func ParseUrl(input string, descr string) (*GeminiUrl, error) {
protocol := u.Scheme protocol := u.Scheme
hostname := u.Hostname() hostname := u.Hostname()
str_port := u.Port() str_port := u.Port()
path := u.Path
if str_port == "" { if str_port == "" {
str_port = "1965" str_port = "1965"
} }
@@ -53,7 +54,7 @@ func ParseUrl(input string, descr string) (*GeminiUrl, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("Error parsing URL %s: %w", input, err) return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
} }
return &GeminiUrl{protocol: protocol, hostname: hostname, port: port, path: u.Path, descr: descr}, nil return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
} }
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines // ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
@@ -107,6 +108,14 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
// Construct the canonicalized link line // Construct the canonicalized link line
canonicalURLStr := parsedURL.String() canonicalURLStr := parsedURL.String()
// Remove usual first space from URL description:
// => URL description
// ^^^^^^^^^^^^
if restOfLine[0] == ' ' {
restOfLine = restOfLine[1:]
}
return canonicalURLStr, restOfLine, nil return canonicalURLStr, restOfLine, nil
// canonicalizedLine := fmt.Sprintf("=> %s%s", canonicalURLStr, restOfLine) // canonicalizedLine := fmt.Sprintf("=> %s%s", canonicalURLStr, restOfLine)
// return canonicalizedLine, nil // return canonicalizedLine, nil
@@ -125,10 +134,10 @@ func ParseFirstTwoDigits(input string) (int, error) {
} }
// Parse the captured match as an integer // Parse the captured match as an integer
result, err := strconv.Atoi(matches[1]) snapshot, err := strconv.Atoi(matches[1])
if err != nil { if err != nil {
return 0, fmt.Errorf("failed to convert matched digits to int: %v", err) return 0, fmt.Errorf("failed to convert matched digits to int: %v", err)
} }
return result, nil return snapshot, nil
} }

6
go.mod
View File

@@ -2,9 +2,13 @@ module gemini-grc
go 1.23.1 go 1.23.1
require (
github.com/jaevor/go-nanoid v1.4.0
github.com/rs/zerolog v1.33.0
)
require ( require (
github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-isatty v0.0.20 // indirect
github.com/rs/zerolog v1.33.0 // indirect
golang.org/x/sys v0.25.0 // indirect golang.org/x/sys v0.25.0 // indirect
) )

2
go.sum
View File

@@ -1,5 +1,7 @@
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/jaevor/go-nanoid v1.4.0 h1:mPz0oi3CrQyEtRxeRq927HHtZCJAAtZ7zdy7vOkrvWs=
github.com/jaevor/go-nanoid v1.4.0/go.mod h1:GIpPtsvl3eSBsjjIEFQdzzgpi50+Bo1Luk+aYlbJzlc=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=

35
main.go
View File

@@ -8,19 +8,22 @@ import (
zlog "github.com/rs/zerolog/log" zlog "github.com/rs/zerolog/log"
) )
const ROOTPATH string = "./a"
func main() { func main() {
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
// zerolog.SetGlobalLevel(zerolog.DebugLevel) zerolog.SetGlobalLevel(zerolog.DebugLevel)
zerolog.SetGlobalLevel(zerolog.InfoLevel) //zerolog.SetGlobalLevel(zerolog.InfoLevel)
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr}) zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
if err := runApp(); err != nil { if err := runApp(); err != nil {
LogError("Application error: %v", err) LogError("Application error: %w", err)
os.Exit(1) os.Exit(1)
} }
} }
func runApp() error { func runApp() error {
urls := []string{"gemini://smol.gr"} //, "gemini://gmi.noulin.neta/", "gemini://in.gr:443"} //urls := []string{"gemini://smol.gr"}
urls := []string{"gemini://smol.gr", "gemini://gmi.noulin.net/"}
queue := make(chan string) queue := make(chan string)
done := make(chan struct{}) done := make(chan struct{})
@@ -45,7 +48,7 @@ func runApp() error {
func crawler(queue <-chan string, done chan struct{}) { func crawler(queue <-chan string, done chan struct{}) {
// Start processing results. // Start processing results.
results := make(chan Result) results := make(chan Snapshot)
resultsDone := make(chan struct{}) resultsDone := make(chan struct{})
go resultsHandler(results, resultsDone) go resultsHandler(results, resultsDone)
@@ -68,7 +71,7 @@ func crawler(queue <-chan string, done chan struct{}) {
LogInfo("All workers have finished") LogInfo("All workers have finished")
// Nobody left to send to results, so we // Nobody left to send to results, so we
// close it, and the ResultsProcessor can // close it, and the SnapshotsProcessor can
// finish // finish
close(results) close(results)
<-resultsDone <-resultsDone
@@ -76,28 +79,34 @@ func crawler(queue <-chan string, done chan struct{}) {
close(done) close(done)
} }
func resultsHandler(results <-chan Result, done chan struct{}) { func resultsHandler(results <-chan Snapshot, done chan struct{}) {
for result := range results { for result := range results {
if result.error != nil { if result.Error != nil {
LogError("%w", result.error) LogError("[%s] %w", result.Url, result.Error)
} else { } else {
LogInfo("[%s] Done. Result: %#v", result.url, result) LogInfo("[%s] Done", result.Url)
// fmt.Printf(SnapshotToJSON(result))
} }
} }
LogInfo("All results have been processed") LogInfo("All results have been processed")
close(done) close(done)
} }
func worker(queue <-chan string, results chan Result) { func worker(queue <-chan string, results chan Snapshot) {
for url := range queue { for url := range queue {
result := Visit(url) result := Visit(url)
// If we encountered an error when // If we encountered an error when
// visiting, skip processing // visiting, skip processing
if result.error != nil { if result.Error != nil {
results <- *result results <- *result
continue continue
} }
result = Process(result) result = Process(result)
if result.Error != nil {
results <- *result
continue
}
SaveResult(ROOTPATH, result)
results <- *result results <- *result
} }
} }

View File

@@ -7,35 +7,42 @@ import (
"time" "time"
) )
func Visit(url string) (result *Result) { func Visit(url string) (result *Snapshot) {
result = &Result{} result = &Snapshot{Timestamp: time.Now(), UID: UID()}
// Wrap error with additional information // Wrap error with additional information
defer func() { defer func() {
if result.error != nil { if result.Error != nil {
result.error = fmt.Errorf("[%s] Error: %w", result.url, result.error) result.Error = fmt.Errorf("[%s] Error: %w", result.Url, result.Error)
} }
}() }()
geminiUrl, err := ParseUrl(url, "") geminiUrl, err := ParseUrl(url, "")
if err != nil { if err != nil {
result.error = err result.Error = err
return result return result
} }
result.url = *geminiUrl result.Url = *geminiUrl
LogInfo("[%s] Dialing", geminiUrl.String()) LogInfo("[%s] Dialing", geminiUrl)
// Establish a TLS connection // Establish a TLS connection
tlsConfig := &tls.Config{ tlsConfig := &tls.Config{
InsecureSkipVerify: true, InsecureSkipVerify: true,
} }
conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.hostname, geminiUrl.port), tlsConfig) conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.Hostname, geminiUrl.Port), tlsConfig)
if err != nil { if err != nil {
result.error = err result.Error = err
return result return result
} }
defer conn.Close() // Defer properly: Also handle possible
// error of conn.Close()
defer func() {
err := conn.Close()
if err != nil {
result.Error = fmt.Errorf("[%s] Closing connection error, ignoring: %w", result.Url.String(), err)
}
}()
// Read data from the connection // Read data from the connection
conn.SetReadDeadline(time.Now().Add(5 * time.Second)) conn.SetReadDeadline(time.Now().Add(5 * time.Second))
@@ -53,7 +60,7 @@ func Visit(url string) (result *Result) {
if err == io.EOF { if err == io.EOF {
break break
} else { } else {
result.error = err result.Error = err
return result return result
} }
} }
@@ -61,6 +68,6 @@ func Visit(url string) (result *Result) {
LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data)) LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data))
// time.Sleep(time.Duration(time.Second * 2)) // time.Sleep(time.Duration(time.Second * 2))
// LogDebug("[%s] Visitor finished", geminiUrl.String()) // LogDebug("[%s] Visitor finished", geminiUrl.String())
result.data = string(data) result.Data = string(data)
return result return result
} }

42
snapshot.go Normal file
View File

@@ -0,0 +1,42 @@
package main
import (
"encoding/json"
"fmt"
"time"
)
// Snapshot is the record produced for one visited URL: the URL itself, the
// data that was received, the links found in it, and any error encountered.
type Snapshot struct {
// Url is the URL this snapshot belongs to.
Url GeminiUrl `json:"url,omitempty"`
// Timestamp records when the snapshot was created.
// NOTE(review): encoding/json does not treat struct values as empty, so
// omitempty has no effect on a time.Time and this field is always emitted.
Timestamp time.Time `json:"timestamp,omitempty"`
// Data is the response content as a string.
Data string `json:"data,omitempty"`
// Links are the URLs extracted from Data.
Links []GeminiUrl `json:"links,omitempty"`
// Code is a numeric status; presumably the Gemini response code — confirm
// where it is assigned.
Code int `json:"code,omitempty"`
// Error holds the failure, if any, that occurred while producing the snapshot.
// NOTE(review): encoding/json has no special handling for the error
// interface; errors built with fmt.Errorf expose no exported fields, and
// decoding into an interface-typed field is unlikely to work. Verify that
// SnapshotToJSON / SnapshotFromJSON round-trip a snapshot with Error set.
Error error `json:"error,omitempty"`
// UID is the identifier assigned to this snapshot.
UID string `json:"uid,omitempty"`
}
// String implements fmt.Stringer with a one-line summary of the snapshot:
// UID, URL, timestamp, links, code and error, in that order. Data is left out.
//
// The error is formatted with %v rather than %s: a nil error under %s renders
// as "%!s(<nil>)", which would appear in the summary of every successful
// snapshot. With %v a nil error prints as "<nil>" and a non-nil error prints
// its message exactly as before.
func (s Snapshot) String() string {
	return fmt.Sprintf(
		"[%s] %s %s %s %d %v",
		s.UID, s.Url, s.Timestamp, s.Links, s.Code, s.Error,
	)
}
// SnapshotToJSON encodes g as a JSON object and returns the encoded text.
// If encoding fails, the error is reported through LogError and the empty
// string is returned.
func SnapshotToJSON(g Snapshot) string {
	encoded, marshalErr := json.Marshal(g)
	if marshalErr != nil {
		LogError("Error serializing to JSON: %w", marshalErr)
		// json.Marshal yields no bytes on failure, so the result is empty.
		return ""
	}
	return string(encoded)
}
// SnapshotFromJSON decodes a JSON object into a Snapshot.
// A decoding failure is reported through LogError; the value is then returned
// in whatever state json.Unmarshal left it.
func SnapshotFromJSON(input string) Snapshot {
	var decoded Snapshot
	if decodeErr := json.Unmarshal([]byte(input), &decoded); decodeErr != nil {
		LogError("Error deserializing from JSON: %w", decodeErr)
	}
	return decoded
}

View File

@@ -1,25 +0,0 @@
package main
import (
"fmt"
)
type GeminiUrl struct {
protocol string
hostname string
port int
path string
descr string
}
func (self GeminiUrl) String() string {
return fmt.Sprintf("%s://%s:%d%s", self.protocol, self.hostname, self.port, self.path)
}
type Result struct {
url GeminiUrl
data string
links []GeminiUrl
code int
error error
}

14
uid.go Normal file
View File

@@ -0,0 +1,14 @@
package main
import (
nanoid "github.com/jaevor/go-nanoid"
)
// UID returns a new identifier produced by a nanoid generator configured for
// 18 characters drawn from a custom ASCII alphabet. It panics if the
// generator cannot be constructed.
func UID() string {
	// Letters and digits, leaving out lowercase 'l', lowercase 'o' and
	// uppercase 'O'.
	const alphabet = "abcdefghijkmnpqrstuvwxyzABCDEFGHIJKLMNPQRSTUVWXYZ0123456789"
	const length = 18

	generate, err := nanoid.CustomASCII(alphabet, length)
	if err != nil {
		panic(err)
	}
	return generate()
}