From 91f8e69fdf7629973d69a16ee8a01350eb98fa4e Mon Sep 17 00:00:00 2001 From: antanst Date: Tue, 8 Oct 2024 18:16:47 +0300 Subject: [PATCH] Work on header parsing & saving other files --- .gitignore | 2 ++ config.go | 17 +++++++++--- fs.go | 33 ----------------------- gemini.go | 78 +++++++++++++++++++++++++++++++++++++++++++---------- main.go | 58 +++++++++++++++++++++++++-------------- network.go | 13 ++++++--- snapshot.go | 17 +++++++----- 7 files changed, 138 insertions(+), 80 deletions(-) diff --git a/.gitignore b/.gitignore index d3939fc..7318b3a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ .idea +**/.#* +**/*~ /run.sh /gemini-grc /snaps diff --git a/config.go b/config.go index e5235b6..58e94ca 100644 --- a/config.go +++ b/config.go @@ -9,9 +9,10 @@ import ( ) type Config struct { - logLevel zerolog.Level - rootPath string - numOfWorkers int + logLevel zerolog.Level + rootPath string + numOfWorkers int + maxResponseSize int } func getConfig() *Config { @@ -20,6 +21,7 @@ func getConfig() *Config { "LOG_LEVEL", "ROOT_PATH", "NUM_OF_WORKERS", + "MAX_RESPONSE_SIZE", } { if env, ok := os.LookupEnv(envVar); !ok { fmt.Fprintf(os.Stderr, "Missing env var %s\n", envVar) @@ -48,6 +50,15 @@ func getConfig() *Config { config.numOfWorkers = numOfWorkers } } + case "MAX_RESPONSE_SIZE": + { + if maxResponseSize, err := strconv.Atoi(env); err != nil { + fmt.Fprintf(os.Stderr, "Invalid MAX_RESPONSE_SIZE value\n") + os.Exit(1) + } else { + config.maxResponseSize = maxResponseSize + } + } } } } diff --git a/fs.go b/fs.go index be89ecc..ce82157 100644 --- a/fs.go +++ b/fs.go @@ -3,8 +3,6 @@ package main import ( "fmt" "net/url" - "os" - "path" "path/filepath" "strings" ) @@ -60,34 +58,3 @@ func calcFilePath(rootPath, urlPath string) (string, error) { return finalPath, nil } - -func SaveResult(rootPath string, s *Snapshot) { - parentPath := path.Join(rootPath, s.Url.Hostname) - urlPath := s.Url.Path - // If path is empty, add `index.gmi` as the file to save - if urlPath == "" || urlPath == "." { - urlPath = fmt.Sprintf("index.gmi") - } - // If path ends with '/' then add index.gmi for the - // directory to be created. - if strings.HasSuffix(urlPath, "/") { - urlPath = strings.Join([]string{urlPath, "index.gmi"}, "") - } - - finalPath, err := calcFilePath(parentPath, urlPath) - if err != nil { - LogError("Error saving %s: %w", s.Url, err) - return - } - // Ensure the directory exists - dir := filepath.Dir(finalPath) - if err := os.MkdirAll(dir, os.ModePerm); err != nil { - LogError("Failed to create directory: %w", err) - return - } - err = os.WriteFile(finalPath, []byte((*s).Data), 0666) - if err != nil { - LogError("Error saving %s: %w", s.Url.Full, err) - } - LogInfo("[%s] Saved to %s", s.Url.Full, finalPath) -} diff --git a/gemini.go b/gemini.go index b9a489a..4c86918 100644 --- a/gemini.go +++ b/gemini.go @@ -4,12 +4,15 @@ import ( "errors" "fmt" "net/url" + "os" + "path" + "path/filepath" "regexp" "strconv" "strings" ) -func checkStatusCode(code int) error { +func checkGeminiStatusCode(code int) error { switch { case code == 20: return nil @@ -28,20 +31,36 @@ func checkStatusCode(code int) error { } } -func Process(snapshot *Snapshot) *Snapshot { - LogDebug("[%s] Processing snapshot", snapshot.Url.String()) +func parseHeaders(data string) (string, string) { + re := regexp.MustCompile(`^\d+\s+([a-zA-Z0-9/\-+]+)[;\s]+(lang=([a-zA-Z0-9-]+))?`) + matches := re.FindStringSubmatch(data) + if matches == nil || len(matches) <= 1 { + return "", "" + } + return matches[1], matches[3] +} + +func ProcessHeaders(snapshot *Snapshot) *Snapshot { + LogDebug("[%s] Processing snapshot", snapshot.URL.String()) + mimetype, lang := parseHeaders(snapshot.Data) + if mimetype != "" { + snapshot.MimeType = mimetype + } + if lang != "" { + snapshot.Lang = lang + } + return snapshot +} + +func ProcessGemini(snapshot *Snapshot) *Snapshot { code, err := ParseFirstTwoDigits(snapshot.Data) if err != nil { - snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.Url.String()) - return snapshot - } - err = checkStatusCode(code) - if err != nil { - snapshot.Error = fmt.Errorf("[%s] Gemini response code error, skipping. %w", snapshot.Url.String(), err) + snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.URL.String()) return snapshot } + snapshot.ResponseCode = code - // Remove response code from body (first line) + // Remove response headers from body (first line) index := strings.Index(snapshot.Data, "\n") if index != -1 { snapshot.Data = snapshot.Data[index+1:] @@ -49,23 +68,54 @@ func Process(snapshot *Snapshot) *Snapshot { // Grab any link lines linkLines := ExtractLinkLines(snapshot.Data) - LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines)) + LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines)) + // Normalize URLs in links, and store them in snapshot for _, line := range linkLines { - normalizedLink, descr, error := NormalizeLink(line, snapshot.Url.String()) + normalizedLink, descr, error := NormalizeLink(line, snapshot.URL.String()) if error != nil { - LogError("[%s] Invalid link URL %w", snapshot.Url.String(), error) + LogError("[%s] Invalid link URL %w", snapshot.URL.String(), error) continue } geminiUrl, error := ParseUrl(normalizedLink, descr) if error != nil { - LogError("[%s] Unparseable gemini link %w", snapshot.Url.String(), error) + LogError("[%s] Unparseable gemini link %w", snapshot.URL.String(), error) } snapshot.Links = append(snapshot.Links, *geminiUrl) } return snapshot } +func SaveResult(rootPath string, s *Snapshot) { + parentPath := path.Join(rootPath, s.URL.Hostname) + urlPath := s.URL.Path + // If path is empty, add `index.gmi` as the file to save + if urlPath == "" || urlPath == "." { + urlPath = fmt.Sprintf("index.gmi") + } + // If path ends with '/' then add index.gmi for the + // directory to be created. + if strings.HasSuffix(urlPath, "/") { + urlPath = strings.Join([]string{urlPath, "index.gmi"}, "") + } + + finalPath, err := calcFilePath(parentPath, urlPath) + if err != nil { + LogError("Error saving %s: %w", s.URL, err) + return + } + // Ensure the directory exists + dir := filepath.Dir(finalPath) + if err := os.MkdirAll(dir, os.ModePerm); err != nil { + LogError("Failed to create directory: %w", err) + return + } + err = os.WriteFile(finalPath, []byte((*s).Data), 0666) + if err != nil { + LogError("Error saving %s: %w", s.URL.Full, err) + } +} + func ParseUrl(input string, descr string) (*GeminiUrl, error) { u, err := url.Parse(input) if err != nil { diff --git a/main.go b/main.go index fdb3577..9d012ef 100644 --- a/main.go +++ b/main.go @@ -10,28 +10,30 @@ import ( zlog "github.com/rs/zerolog/log" ) +var CONFIG Config + func main() { - config := *getConfig() + CONFIG = *getConfig() zerolog.TimeFieldFormat = zerolog.TimeFormatUnix - zerolog.SetGlobalLevel(config.logLevel) + zerolog.SetGlobalLevel(CONFIG.logLevel) zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"}) - if err := runApp(&config); err != nil { + if err := runApp(); err != nil { LogError("Application error: %w", err) os.Exit(1) } } -func runApp(config *Config) error { +func runApp() error { // urls := []string{"gemini://smol.gr"} urls := []string{"gemini://gmi.noulin.net/", "gemini://warmedal.se/~antenna/"} - queue := make(chan string, 10000) + queue := make(chan string, 1000) results := make(chan Snapshot, 100) done := make(chan struct{}) go spawnStats(queue, results) go resultsHandler(queue, results) - spawnWorkers(config, queue, results) + spawnWorkers(CONFIG.numOfWorkers, queue, results) for _, url := range urls { queue <- url @@ -44,18 +46,17 @@ func spawnStats(queue chan string, results chan Snapshot) { ticker := time.NewTicker(time.Duration(time.Second * 10)) defer ticker.Stop() for range ticker.C { - LogInfo("Queue length: %d\n", len(queue)) - LogInfo("Results length: %d\n", len(results)) + LogInfo("Queue length: %d", len(queue)) + LogInfo("Results length: %d", len(results)) } } -func spawnWorkers(config *Config, queue <-chan string, results chan Snapshot) { - workers := config.numOfWorkers - LogInfo("Spawning %d workers", workers) +func spawnWorkers(numOfWorkers int, queue <-chan string, results chan Snapshot) { + LogInfo("Spawning %d workers", numOfWorkers) // Start worker goroutines - for i := 0; i < workers; i++ { + for i := 0; i < numOfWorkers; i++ { go func(i int) { - worker(i, config.rootPath, queue, results) + worker(i, queue, results) }(i) } } @@ -63,9 +64,9 @@ func spawnWorkers(config *Config, queue <-chan string, results chan Snapshot) { func resultsHandler(queue chan string, results <-chan Snapshot) { for result := range results { if result.Error != nil { - LogError("[%s] %w", result.Url, result.Error) + LogError("[%s] %w", result.URL, result.Error) } else { - LogDebug("[%s] Done", result.Url) + LogDebug("[%s] Done", result.URL) for _, link := range result.Links { if strings.HasPrefix(link.Full, "gemini://") { go func(link GeminiUrl) { @@ -74,12 +75,15 @@ func resultsHandler(queue chan string, results <-chan Snapshot) { }(link) } } - // fmt.Printf(SnapshotToJSON(result)) + // if result.MimeType == "text/gemini" { + // result.Data = "" + // fmt.Printf(SnapshotToJSON(result)) + // } } } } -func worker(id int, rootPath string, queue <-chan string, results chan Snapshot) { +func worker(id int, queue <-chan string, results chan Snapshot) { for url := range queue { LogDebug("Worker %d visiting %s", id, url) result := Visit(url) @@ -90,14 +94,28 @@ func worker(id int, rootPath string, queue <-chan string, results chan Snapshot) continue } LogDebug("Worker %d processing %s", id, url) - result = Process(result) + result = ProcessHeaders(result) if result.Error != nil { results <- *result continue } - LogDebug("Worker %d saving %s", id, url) - SaveResult(rootPath, result) + if result.MimeType == "text/gemini" { + result = ProcessGemini(result) + } + if shouldPersist(result) { + LogInfo("Worker %d saving %s", id, url) + SaveResult(CONFIG.rootPath, result) + } results <- *result time.Sleep(time.Duration(rand.IntN(5)) * time.Second) } } + +func shouldPersist(result *Snapshot) bool { + if result.MimeType == "text/gemini" || + strings.HasPrefix(result.MimeType, "image/") || + strings.HasPrefix(result.MimeType, "text/") { + return true + } + return false +} diff --git a/network.go b/network.go index 6f2dd1b..ac3f2f3 100644 --- a/network.go +++ b/network.go @@ -13,7 +13,7 @@ func Visit(url string) (result *Snapshot) { // Wrap error with additional information defer func() { if result.Error != nil { - result.Error = fmt.Errorf("[%s] Error: %w", result.Url, result.Error) + result.Error = fmt.Errorf("[%s] Error: %w", result.URL, result.Error) } }() @@ -22,9 +22,9 @@ func Visit(url string) (result *Snapshot) { result.Error = err return result } - result.Url = *geminiUrl + result.URL = *geminiUrl - LogInfo("[%s] Connecting", geminiUrl) + LogDebug("[%s] Connecting", geminiUrl) // Establish a TLS connection tlsConfig := &tls.Config{ @@ -40,11 +40,12 @@ func Visit(url string) (result *Snapshot) { defer func() { err := conn.Close() if err != nil { - result.Error = fmt.Errorf("[%s] Closing connection error, ignoring: %w", result.Url.String(), err) + result.Error = fmt.Errorf("[%s] Closing connection error, ignoring: %w", result.URL.String(), err) } }() // Read data from the connection + // TODO make timeout configurable conn.SetReadDeadline(time.Now().Add(5 * time.Second)) buf := make([]byte, 1024) var data []byte @@ -56,6 +57,10 @@ func Visit(url string) (result *Snapshot) { if n > 0 { data = append(data, buf[:n]...) } + if len(data) > CONFIG.maxResponseSize { + result.Error = fmt.Errorf("Response size exceeded maximum of %d bytes", CONFIG.maxResponseSize) + return result + } if err != nil { if err == io.EOF { break diff --git a/snapshot.go b/snapshot.go index fb1f32e..3fa7627 100644 --- a/snapshot.go +++ b/snapshot.go @@ -7,19 +7,24 @@ import ( ) type Snapshot struct { - Url GeminiUrl `json:"url,omitempty"` + UID string `json:"uid,omitempty"` + URL GeminiUrl `json:"url,omitempty"` Timestamp time.Time `json:"timestamp,omitempty"` + MimeType string `json:"mimetype,omitempty"` Data string `json:"data,omitempty"` Links []GeminiUrl `json:"links,omitempty"` - Code int `json:"code,omitempty"` - Error error `json:"error,omitempty"` - UID string `json:"uid,omitempty"` + Lang string `json:"lang,omitempty"` + // Gemini status code + ResponseCode int `json:"code,omitempty"` + // On network errors, for Gemini server errors + // we have ResponseCode above. + Error error `json:"error,omitempty"` } func (u Snapshot) String() string { return fmt.Sprintf( - "[%s] %s %s %s %d %s", - u.UID, u.Url, u.Timestamp, u.Links, u.Code, u.Error, + "[%s] %s %s %s %d %s %s %s", + u.UID, u.URL, u.Timestamp, u.Links, u.ResponseCode, u.MimeType, u.Lang, u.Error, ) }