Persist pages to file system
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1 +1,3 @@
|
||||
.idea
|
||||
|
||||
/gemini-grc
|
||||
|
||||
86
fs.go
Normal file
86
fs.go
Normal file
@@ -0,0 +1,86 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// sanitizePath converts a slash-separated URL path into a filesystem path
// whose elements contain only URL-safe characters.
//
// Each element is percent-decoded (falling back to the raw text when its
// escapes are malformed) and then re-encoded with url.QueryEscape, with
// spaces written as "%20" instead of "+". The elements are rejoined with
// filepath.Join, which drops empty elements such as the one produced by a
// leading "/".
//
// An element that is, or decodes to, ".." (for example "%2e%2e") is written
// as "%2E%2E". Left alone, filepath.Join would resolve it and the result
// could climb out of whatever directory it is later joined to.
//
// Example:
//
//	/example/path/to/page?query=param&another=value
//
// becomes
//
//	example/path/to/page%3Fquery%3Dparam%26another%3Dvalue
func sanitizePath(p string) string {
	parts := strings.Split(p, "/")

	for i, part := range parts {
		// Decode first so already-escaped input is not escaped twice.
		decoded, err := url.PathUnescape(part)
		if err != nil {
			decoded = part // malformed escape: treat the text literally
		}

		// QueryEscape leaves dots untouched, so a parent-directory element
		// must be neutralised explicitly.
		if decoded == ".." {
			parts[i] = "%2E%2E"
			continue
		}

		// QueryEscape writes a space as '+'; a literal '+' is already %2B at
		// this point, so every remaining '+' stands for a space.
		parts[i] = strings.ReplaceAll(url.QueryEscape(decoded), "+", "%20")
	}

	return filepath.Join(parts...)
}
|
||||
|
||||
// getFilePath constructs a safe file path from the root path and URL path.
|
||||
// It URL-encodes invalid filesystem characters to ensure the path is valid.
|
||||
func calcFilePath(rootPath, urlPath string) (string, error) {
|
||||
// Normalize the URL path
|
||||
cleanPath := filepath.Clean(urlPath)
|
||||
|
||||
fmt.Printf("%s %s\n", urlPath, cleanPath)
|
||||
// Safe check to prevent directory traversal
|
||||
if strings.Contains(cleanPath, "..") {
|
||||
return "", fmt.Errorf("invalid URL path: contains directory traversal")
|
||||
}
|
||||
|
||||
// Sanitize the path by encoding invalid characters
|
||||
safePath := sanitizePath(cleanPath)
|
||||
|
||||
// Join the root path and the sanitized URL path
|
||||
finalPath := filepath.Join(rootPath, safePath)
|
||||
|
||||
// Ensure the directory exists
|
||||
dir := filepath.Dir(finalPath)
|
||||
if err := os.MkdirAll(dir, os.ModePerm); err != nil {
|
||||
return "", fmt.Errorf("failed to create directories: %v", err)
|
||||
}
|
||||
|
||||
return finalPath, nil
|
||||
}
|
||||
|
||||
func SaveResult(rootPath string, s *Snapshot) {
|
||||
urlPath := s.Url.Path
|
||||
if urlPath == "" || urlPath == "/" {
|
||||
urlPath = fmt.Sprintf("%s/index.gmi", s.Url.Hostname)
|
||||
}
|
||||
filepath, err := calcFilePath(rootPath, urlPath)
|
||||
if err != nil {
|
||||
LogError("Error saving %s: %w", s.Url, err)
|
||||
return
|
||||
}
|
||||
// err = os.WriteFile(filepath, []byte(SnapshotToJSON(*s)), 0666)
|
||||
err = os.WriteFile(filepath, []byte((*s).Data), 0666)
|
||||
if err != nil {
|
||||
LogError("Error saving %s: %w", s.Url.Full, err)
|
||||
}
|
||||
LogInfo("[%s] Saved to %s", s.Url.Full, filepath)
|
||||
}
|
||||
37
gemini-url.go
Normal file
37
gemini-url.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
)
|
||||
|
||||
// GeminiUrl holds the parts of a parsed URL together with an optional link
// description. All fields are exported so the value can be serialised with
// encoding/json; empty fields are omitted from the output.
type GeminiUrl struct {
|
||||
// Protocol is the URL scheme.
Protocol string `json:"protocol,omitempty"`
|
||||
// Hostname is the host part of the URL, without the port.
Hostname string `json:"hostname,omitempty"`
|
||||
// Port is the port number; ParseUrl falls back to 1965 when the URL has none.
Port int `json:"port,omitempty"`
|
||||
// Path is the path part of the URL.
Path string `json:"path,omitempty"`
|
||||
// Descr is the description passed to ParseUrl alongside the URL.
Descr string `json:"descr,omitempty"`
|
||||
// Full is the complete URL in string form; it is what String returns.
Full string `json:"full,omitempty"`
|
||||
}
|
||||
|
||||
// String returns the Full field, so a GeminiUrl can be passed directly to
// %s-style formatting.
func (u GeminiUrl) String() string {
|
||||
return u.Full
|
||||
// Alternative kept for reference: rebuild the URL from its parts.
// return fmt.Sprintf("%s://%s:%d%s", u.Protocol, u.Hostname, u.Port, u.Path)
|
||||
}
|
||||
|
||||
// GeminiUrltoJSON serialises g to a JSON string using json.Marshal.
// A marshalling error is reported through LogError and is not returned; the
// result is then built from whatever bytes Marshal produced.
func GeminiUrltoJSON(g GeminiUrl) string {
|
||||
// Serialize the GeminiUrl struct to JSON
|
||||
jsonData, err := json.Marshal(g)
|
||||
if err != nil {
|
||||
LogError("Error serializing to JSON: %w", err)
|
||||
}
|
||||
return string(jsonData)
|
||||
}
|
||||
|
||||
// GeminiUrlFromJSON decodes input, a JSON document, into a GeminiUrl using
// json.Unmarshal. A decoding error is reported through LogError and is not
// returned; the value is returned in whatever state Unmarshal left it.
func GeminiUrlFromJSON(input string) GeminiUrl {
|
||||
var geminiUrl GeminiUrl
|
||||
err := json.Unmarshal([]byte(input), &geminiUrl)
|
||||
if err != nil {
|
||||
LogError("Error deserializing from JSON: %w", err)
|
||||
}
|
||||
return geminiUrl
|
||||
}
|
||||
45
gemini.go
45
gemini.go
@@ -8,34 +8,34 @@ import (
|
||||
"strconv"
|
||||
)
|
||||
|
||||
func Process(result *Result) *Result {
|
||||
LogInfo("[%s] Processing data", result.url.String())
|
||||
code, err := ParseFirstTwoDigits(result.data)
|
||||
func Process(snapshot *Snapshot) *Snapshot {
|
||||
LogInfo("[%s] Processing data", snapshot.Url.String())
|
||||
code, err := ParseFirstTwoDigits(snapshot.Data)
|
||||
if err != nil {
|
||||
result.error = fmt.Errorf("[%s] Invalid gemini response code", result.url.String())
|
||||
return result
|
||||
snapshot.Error = fmt.Errorf("[%s] Invalid gemini response code", snapshot.Url.String())
|
||||
return snapshot
|
||||
}
|
||||
if code != 20 {
|
||||
result.error = fmt.Errorf("[%s] Gemini response code != 20, skipping", result.url.String())
|
||||
return result
|
||||
snapshot.Error = fmt.Errorf("[%s] Gemini response code != 20, skipping", snapshot.Url.String())
|
||||
return snapshot
|
||||
}
|
||||
// Grab link lines
|
||||
linkLines := ExtractLinkLines(result.data)
|
||||
LogDebug("[%s] Found %d links", result.url.String(), len(linkLines))
|
||||
// Normalize URLs in links, and store them in result
|
||||
linkLines := ExtractLinkLines(snapshot.Data)
|
||||
LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines))
|
||||
// Normalize URLs in links, and store them in snapshot
|
||||
for _, line := range linkLines {
|
||||
normalizedLink, descr, error := NormalizeLink(line, result.url.String())
|
||||
normalizedLink, descr, error := NormalizeLink(line, snapshot.Url.String())
|
||||
if error != nil {
|
||||
LogError("[%s] Invalid link URL %w", result.url.String(), error)
|
||||
LogError("[%s] Invalid link URL %w", snapshot.Url.String(), error)
|
||||
continue
|
||||
}
|
||||
geminiUrl, error := ParseUrl(normalizedLink, descr)
|
||||
if error != nil {
|
||||
LogError("[%s] Unparseable gemini link %w", result.url.String(), error)
|
||||
LogError("[%s] Unparseable gemini link %w", snapshot.Url.String(), error)
|
||||
}
|
||||
result.links = append(result.links, *geminiUrl)
|
||||
snapshot.Links = append(snapshot.Links, *geminiUrl)
|
||||
}
|
||||
return result
|
||||
return snapshot
|
||||
}
|
||||
|
||||
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
|
||||
@@ -46,6 +46,7 @@ func ParseUrl(input string, descr string) (*GeminiUrl, error) {
|
||||
protocol := u.Scheme
|
||||
hostname := u.Hostname()
|
||||
str_port := u.Port()
|
||||
path := u.Path
|
||||
if str_port == "" {
|
||||
str_port = "1965"
|
||||
}
|
||||
@@ -53,7 +54,7 @@ func ParseUrl(input string, descr string) (*GeminiUrl, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
|
||||
}
|
||||
return &GeminiUrl{protocol: protocol, hostname: hostname, port: port, path: u.Path, descr: descr}, nil
|
||||
return &GeminiUrl{Protocol: protocol, Hostname: hostname, Port: port, Path: path, Descr: descr, Full: u.String()}, nil
|
||||
}
|
||||
|
||||
// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
|
||||
@@ -107,6 +108,14 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
|
||||
|
||||
// Construct the canonicalized link line
|
||||
canonicalURLStr := parsedURL.String()
|
||||
|
||||
// Remove usual first space from URL description:
|
||||
// => URL description
|
||||
// ^^^^^^^^^^^^
|
||||
if restOfLine[0] == ' ' {
|
||||
restOfLine = restOfLine[1:]
|
||||
}
|
||||
|
||||
return canonicalURLStr, restOfLine, nil
|
||||
// canonicalizedLine := fmt.Sprintf("=> %s%s", canonicalURLStr, restOfLine)
|
||||
// return canonicalizedLine, nil
|
||||
@@ -125,10 +134,10 @@ func ParseFirstTwoDigits(input string) (int, error) {
|
||||
}
|
||||
|
||||
// Parse the captured match as an integer
|
||||
result, err := strconv.Atoi(matches[1])
|
||||
snapshot, err := strconv.Atoi(matches[1])
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to convert matched digits to int: %v", err)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
return snapshot, nil
|
||||
}
|
||||
|
||||
6
go.mod
6
go.mod
@@ -2,9 +2,13 @@ module gemini-grc
|
||||
|
||||
go 1.23.1
|
||||
|
||||
require (
|
||||
github.com/jaevor/go-nanoid v1.4.0
|
||||
github.com/rs/zerolog v1.33.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/rs/zerolog v1.33.0 // indirect
|
||||
golang.org/x/sys v0.25.0 // indirect
|
||||
)
|
||||
|
||||
2
go.sum
2
go.sum
@@ -1,5 +1,7 @@
|
||||
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
||||
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
|
||||
github.com/jaevor/go-nanoid v1.4.0 h1:mPz0oi3CrQyEtRxeRq927HHtZCJAAtZ7zdy7vOkrvWs=
|
||||
github.com/jaevor/go-nanoid v1.4.0/go.mod h1:GIpPtsvl3eSBsjjIEFQdzzgpi50+Bo1Luk+aYlbJzlc=
|
||||
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
|
||||
35
main.go
35
main.go
@@ -8,19 +8,22 @@ import (
|
||||
zlog "github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
const ROOTPATH string = "./a"
|
||||
|
||||
func main() {
|
||||
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
|
||||
// zerolog.SetGlobalLevel(zerolog.DebugLevel)
|
||||
zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
||||
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr})
|
||||
zerolog.SetGlobalLevel(zerolog.DebugLevel)
|
||||
//zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
||||
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
|
||||
if err := runApp(); err != nil {
|
||||
LogError("Application error: %v", err)
|
||||
LogError("Application error: %w", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
func runApp() error {
|
||||
urls := []string{"gemini://smol.gr"} //, "gemini://gmi.noulin.neta/", "gemini://in.gr:443"}
|
||||
//urls := []string{"gemini://smol.gr"}
|
||||
urls := []string{"gemini://smol.gr", "gemini://gmi.noulin.net/"}
|
||||
|
||||
queue := make(chan string)
|
||||
done := make(chan struct{})
|
||||
@@ -45,7 +48,7 @@ func runApp() error {
|
||||
|
||||
func crawler(queue <-chan string, done chan struct{}) {
|
||||
// Start processing results.
|
||||
results := make(chan Result)
|
||||
results := make(chan Snapshot)
|
||||
resultsDone := make(chan struct{})
|
||||
go resultsHandler(results, resultsDone)
|
||||
|
||||
@@ -68,7 +71,7 @@ func crawler(queue <-chan string, done chan struct{}) {
|
||||
LogInfo("All workers have finished")
|
||||
|
||||
// Nobody left to send to results, so we
|
||||
// close it, and the ResultsProcessor can
|
||||
// close it, and the SnapshotsProcessor can
|
||||
// finish
|
||||
close(results)
|
||||
<-resultsDone
|
||||
@@ -76,28 +79,34 @@ func crawler(queue <-chan string, done chan struct{}) {
|
||||
close(done)
|
||||
}
|
||||
|
||||
func resultsHandler(results <-chan Result, done chan struct{}) {
|
||||
func resultsHandler(results <-chan Snapshot, done chan struct{}) {
|
||||
for result := range results {
|
||||
if result.error != nil {
|
||||
LogError("%w", result.error)
|
||||
if result.Error != nil {
|
||||
LogError("[%s] %w", result.Url, result.Error)
|
||||
} else {
|
||||
LogInfo("[%s] Done. Result: %#v", result.url, result)
|
||||
LogInfo("[%s] Done", result.Url)
|
||||
// fmt.Printf(SnapshotToJSON(result))
|
||||
}
|
||||
}
|
||||
LogInfo("All results have been processed")
|
||||
close(done)
|
||||
}
|
||||
|
||||
func worker(queue <-chan string, results chan Result) {
|
||||
func worker(queue <-chan string, results chan Snapshot) {
|
||||
for url := range queue {
|
||||
result := Visit(url)
|
||||
// If we encountered an error when
|
||||
// visiting, skip processing
|
||||
if result.error != nil {
|
||||
if result.Error != nil {
|
||||
results <- *result
|
||||
continue
|
||||
}
|
||||
result = Process(result)
|
||||
if result.Error != nil {
|
||||
results <- *result
|
||||
continue
|
||||
}
|
||||
SaveResult(ROOTPATH, result)
|
||||
results <- *result
|
||||
}
|
||||
}
|
||||
|
||||
31
network.go
31
network.go
@@ -7,35 +7,42 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
func Visit(url string) (result *Result) {
|
||||
result = &Result{}
|
||||
func Visit(url string) (result *Snapshot) {
|
||||
result = &Snapshot{Timestamp: time.Now(), UID: UID()}
|
||||
|
||||
// Wrap error with additional information
|
||||
defer func() {
|
||||
if result.error != nil {
|
||||
result.error = fmt.Errorf("[%s] Error: %w", result.url, result.error)
|
||||
if result.Error != nil {
|
||||
result.Error = fmt.Errorf("[%s] Error: %w", result.Url, result.Error)
|
||||
}
|
||||
}()
|
||||
|
||||
geminiUrl, err := ParseUrl(url, "")
|
||||
if err != nil {
|
||||
result.error = err
|
||||
result.Error = err
|
||||
return result
|
||||
}
|
||||
result.url = *geminiUrl
|
||||
result.Url = *geminiUrl
|
||||
|
||||
LogInfo("[%s] Dialing", geminiUrl.String())
|
||||
LogInfo("[%s] Dialing", geminiUrl)
|
||||
|
||||
// Establish a TLS connection
|
||||
tlsConfig := &tls.Config{
|
||||
InsecureSkipVerify: true,
|
||||
}
|
||||
conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.hostname, geminiUrl.port), tlsConfig)
|
||||
conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.Hostname, geminiUrl.Port), tlsConfig)
|
||||
if err != nil {
|
||||
result.error = err
|
||||
result.Error = err
|
||||
return result
|
||||
}
|
||||
defer conn.Close()
|
||||
// Defer properly: Also handle possible
|
||||
// error of conn.Close()
|
||||
defer func() {
|
||||
err := conn.Close()
|
||||
if err != nil {
|
||||
result.Error = fmt.Errorf("[%s] Closing connection error, ignoring: %w", result.Url.String(), err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Read data from the connection
|
||||
conn.SetReadDeadline(time.Now().Add(5 * time.Second))
|
||||
@@ -53,7 +60,7 @@ func Visit(url string) (result *Result) {
|
||||
if err == io.EOF {
|
||||
break
|
||||
} else {
|
||||
result.error = err
|
||||
result.Error = err
|
||||
return result
|
||||
}
|
||||
}
|
||||
@@ -61,6 +68,6 @@ func Visit(url string) (result *Result) {
|
||||
LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data))
|
||||
// time.Sleep(time.Duration(time.Second * 2))
|
||||
// LogDebug("[%s] Visitor finished", geminiUrl.String())
|
||||
result.data = string(data)
|
||||
result.Data = string(data)
|
||||
return result
|
||||
}
|
||||
|
||||
42
snapshot.go
Normal file
42
snapshot.go
Normal file
@@ -0,0 +1,42 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Snapshot struct {
|
||||
Url GeminiUrl `json:"url,omitempty"`
|
||||
Timestamp time.Time `json:"timestamp,omitempty"`
|
||||
Data string `json:"data,omitempty"`
|
||||
Links []GeminiUrl `json:"links,omitempty"`
|
||||
Code int `json:"code,omitempty"`
|
||||
Error error `json:"error,omitempty"`
|
||||
UID string `json:"uid,omitempty"`
|
||||
}
|
||||
|
||||
func (u Snapshot) String() string {
|
||||
return fmt.Sprintf(
|
||||
"[%s] %s %s %s %d %s",
|
||||
u.UID, u.Url, u.Timestamp, u.Links, u.Code, u.Error,
|
||||
)
|
||||
}
|
||||
|
||||
// SnapshotToJSON serialises g to a JSON string using json.Marshal.
// A marshalling error is reported through LogError and is not returned; the
// result is then built from whatever bytes Marshal produced.
func SnapshotToJSON(g Snapshot) string {
|
||||
// Serialize the Snapshot struct to JSON
|
||||
jsonData, err := json.Marshal(g)
|
||||
if err != nil {
|
||||
LogError("Error serializing to JSON: %w", err)
|
||||
}
|
||||
return string(jsonData)
|
||||
}
|
||||
|
||||
// SnapshotFromJSON decodes input, a JSON document, into a Snapshot using
// json.Unmarshal. A decoding error is reported through LogError and is not
// returned; the value is returned in whatever state Unmarshal left it.
func SnapshotFromJSON(input string) Snapshot {
|
||||
var snapshot Snapshot
|
||||
err := json.Unmarshal([]byte(input), &snapshot)
|
||||
if err != nil {
|
||||
LogError("Error deserializing from JSON: %w", err)
|
||||
}
|
||||
return snapshot
|
||||
}
|
||||
25
types.go
25
types.go
@@ -1,25 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
type GeminiUrl struct {
|
||||
protocol string
|
||||
hostname string
|
||||
port int
|
||||
path string
|
||||
descr string
|
||||
}
|
||||
|
||||
func (self GeminiUrl) String() string {
|
||||
return fmt.Sprintf("%s://%s:%d%s", self.protocol, self.hostname, self.port, self.path)
|
||||
}
|
||||
|
||||
type Result struct {
|
||||
url GeminiUrl
|
||||
data string
|
||||
links []GeminiUrl
|
||||
code int
|
||||
error error
|
||||
}
|
||||
Reference in New Issue
Block a user