From 8278f2b204b01396879a2657b79ff2590b8ec8f1 Mon Sep 17 00:00:00 2001 From: antanst Date: Wed, 9 Oct 2024 13:31:49 +0300 Subject: [PATCH] Proper mimetype parsing, refactoring --- config.go | 11 +++++++ fs.go | 32 +++++++++++++++++++ gemini.go | 72 ++---------------------------------------- go.mod | 2 ++ go.sum | 6 ++++ main.go | 42 ++++++++++++++++--------- network.go | 91 ++++++++++++++++++++++++++++++++++++++--------------- snapshot.go | 3 +- 8 files changed, 147 insertions(+), 112 deletions(-) diff --git a/config.go b/config.go index 58e94ca..3051bfa 100644 --- a/config.go +++ b/config.go @@ -13,6 +13,7 @@ type Config struct { rootPath string numOfWorkers int maxResponseSize int + responseTimeout int } func getConfig() *Config { @@ -22,6 +23,7 @@ func getConfig() *Config { "ROOT_PATH", "NUM_OF_WORKERS", "MAX_RESPONSE_SIZE", + "RESPONSE_TIMEOUT", } { if env, ok := os.LookupEnv(envVar); !ok { fmt.Fprintf(os.Stderr, "Missing env var %s\n", envVar) @@ -59,6 +61,15 @@ func getConfig() *Config { config.maxResponseSize = maxResponseSize } } + case "RESPONSE_TIMEOUT": + { + if val, err := strconv.Atoi(env); err != nil { + fmt.Fprintf(os.Stderr, "Invalid RESPONSE_TIMEOUT value\n") + os.Exit(1) + } else { + config.responseTimeout = val + } + } } } } diff --git a/fs.go b/fs.go index ce82157..24c2ea4 100644 --- a/fs.go +++ b/fs.go @@ -3,6 +3,8 @@ package main import ( "fmt" "net/url" + "os" + "path" "path/filepath" "strings" ) @@ -58,3 +60,33 @@ func calcFilePath(rootPath, urlPath string) (string, error) { return finalPath, nil } + +func SaveSnapshot(rootPath string, s *Snapshot) { + parentPath := path.Join(rootPath, s.URL.Hostname) + urlPath := s.URL.Path + // If path is empty, add `index.gmi` as the file to save + if urlPath == "" || urlPath == "." { + urlPath = fmt.Sprintf("index.gmi") + } + // If path ends with '/' then add index.gmi for the + // directory to be created. + if strings.HasSuffix(urlPath, "/") { + urlPath = strings.Join([]string{urlPath, "index.gmi"}, "") + } + + finalPath, err := calcFilePath(parentPath, urlPath) + if err != nil { + LogError("Error saving %s: %w", s.URL, err) + return + } + // Ensure the directory exists + dir := filepath.Dir(finalPath) + if err := os.MkdirAll(dir, os.ModePerm); err != nil { + LogError("Failed to create directory: %w", err) + return + } + err = os.WriteFile(finalPath, []byte((*s).Data), 0666) + if err != nil { + LogError("Error saving %s: %w", s.URL.Full, err) + } +} diff --git a/gemini.go b/gemini.go index 4c86918..e15b21b 100644 --- a/gemini.go +++ b/gemini.go @@ -4,12 +4,8 @@ import ( "errors" "fmt" "net/url" - "os" - "path" - "path/filepath" "regexp" "strconv" - "strings" ) func checkGeminiStatusCode(code int) error { @@ -31,43 +27,9 @@ func checkGeminiStatusCode(code int) error { } } -func parseHeaders(data string) (string, string) { - re := regexp.MustCompile(`^\d+\s+([a-zA-Z0-9/\-+]+)[;\s]+(lang=([a-zA-Z0-9-]+))?`) - matches := re.FindStringSubmatch(data) - if matches == nil || len(matches) <= 1 { - return "", "" - } - return matches[1], matches[3] -} - -func ProcessHeaders(snapshot *Snapshot) *Snapshot { - LogDebug("[%s] Processing snapshot", snapshot.URL.String()) - mimetype, lang := parseHeaders(snapshot.Data) - if mimetype != "" { - snapshot.MimeType = mimetype - } - if lang != "" { - snapshot.Lang = lang - } - return snapshot -} - func ProcessGemini(snapshot *Snapshot) *Snapshot { - code, err := ParseFirstTwoDigits(snapshot.Data) - if err != nil { - snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.URL.String()) - return snapshot - } - snapshot.ResponseCode = code - - // Remove response headers from body (first line) - index := strings.Index(snapshot.Data, "\n") - if index != -1 { - snapshot.Data = snapshot.Data[index+1:] - } - - // Grab any link lines - linkLines := ExtractLinkLines(snapshot.Data) + // Grab link lines + linkLines := ExtractLinkLines(snapshot.GemText) LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines)) // Normalize URLs in links, and store them in snapshot @@ -86,36 +48,6 @@ func ProcessGemini(snapshot *Snapshot) *Snapshot { return snapshot } -func SaveResult(rootPath string, s *Snapshot) { - parentPath := path.Join(rootPath, s.URL.Hostname) - urlPath := s.URL.Path - // If path is empty, add `index.gmi` as the file to save - if urlPath == "" || urlPath == "." { - urlPath = fmt.Sprintf("index.gmi") - } - // If path ends with '/' then add index.gmi for the - // directory to be created. - if strings.HasSuffix(urlPath, "/") { - urlPath = strings.Join([]string{urlPath, "index.gmi"}, "") - } - - finalPath, err := calcFilePath(parentPath, urlPath) - if err != nil { - LogError("Error saving %s: %w", s.URL, err) - return - } - // Ensure the directory exists - dir := filepath.Dir(finalPath) - if err := os.MkdirAll(dir, os.ModePerm); err != nil { - LogError("Failed to create directory: %w", err) - return - } - err = os.WriteFile(finalPath, []byte((*s).Data), 0666) - if err != nil { - LogError("Error saving %s: %w", s.URL.Full, err) - } -} - func ParseUrl(input string, descr string) (*GeminiUrl, error) { u, err := url.Parse(input) if err != nil { diff --git a/go.mod b/go.mod index 1ddc2f2..467f629 100644 --- a/go.mod +++ b/go.mod @@ -10,5 +10,7 @@ require ( require ( github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect + golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 // indirect + golang.org/x/net v0.27.0 // indirect golang.org/x/sys v0.25.0 // indirect ) diff --git a/go.sum b/go.sum index bd70927..f5cf7b2 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,6 @@ github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/gabriel-vasile/mimetype v1.4.5 h1:J7wGKdGu33ocBOhGy0z653k/lFKLFDPJMG8Gql0kxn4= +github.com/gabriel-vasile/mimetype v1.4.5/go.mod h1:ibHel+/kbxn9x2407k1izTA1S81ku1z/DlgOW2QE0M4= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/jaevor/go-nanoid v1.4.0 h1:mPz0oi3CrQyEtRxeRq927HHtZCJAAtZ7zdy7vOkrvWs= github.com/jaevor/go-nanoid v1.4.0/go.mod h1:GIpPtsvl3eSBsjjIEFQdzzgpi50+Bo1Luk+aYlbJzlc= @@ -12,6 +14,10 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= +golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 h1:1wqE9dj9NpSm04INVsJhhEUzhuDVjbcyKH91sVyPATw= +golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8= +golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= +golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/main.go b/main.go index 9d012ef..0853fb9 100644 --- a/main.go +++ b/main.go @@ -1,7 +1,6 @@ package main import ( - "math/rand/v2" "os" "strings" "time" @@ -25,13 +24,17 @@ func main() { func runApp() error { // urls := []string{"gemini://smol.gr"} - urls := []string{"gemini://gmi.noulin.net/", "gemini://warmedal.se/~antenna/"} + // urls := []string{"gemini://gemini.circumlunar.space/users/solderpunk/gemlog/orphans-of-netscape.gmi"} // Test 31 redirect + // urls := []string{"gemini://zaibatsu.circumlunar.space/~solderpunk/gemlog/orphans-of-netscape.gmi"} + // urls := []string{"gemini://farcaster.net/berlin/dared.jpg"} + // urls := []string{"gemini://smol.gr/media/amstrad_cpc_6128.jpg", "https://go.dev/blog/go-brand/Go-Logo/PNG/Go-Logo_Blue.png"} + urls := []string{"gemini://tlgs.one/", "gemini://gmi.noulin.net/", "gemini://warmedal.se/~antenna/"} queue := make(chan string, 1000) results := make(chan Snapshot, 100) done := make(chan struct{}) - go spawnStats(queue, results) + go spawnStatsReport(queue, results) go resultsHandler(queue, results) spawnWorkers(CONFIG.numOfWorkers, queue, results) @@ -42,7 +45,7 @@ func runApp() error { return nil } -func spawnStats(queue chan string, results chan Snapshot) { +func spawnStatsReport(queue chan string, results chan Snapshot) { ticker := time.NewTicker(time.Duration(time.Second * 10)) defer ticker.Stop() for range ticker.C { @@ -53,7 +56,6 @@ func spawnStats(queue chan string, results chan Snapshot) { func spawnWorkers(numOfWorkers int, queue <-chan string, results chan Snapshot) { LogInfo("Spawning %d workers", numOfWorkers) - // Start worker goroutines for i := 0; i < numOfWorkers; i++ { go func(i int) { worker(i, queue, results) @@ -85,8 +87,16 @@ func resultsHandler(queue chan string, results <-chan Snapshot) { func worker(id int, queue <-chan string, results chan Snapshot) { for url := range queue { - LogDebug("Worker %d visiting %s", id, url) - result := Visit(url) + if !shouldVisit(url) { + LogInfo("Skipping %s", url) + continue + } + LogInfo("Worker %d visiting %s", id, url) + result, err := Visit(url) + if err != nil { + LogError("[%s] %w", url, err) + continue + } // If we encountered an error when // visiting, skip processing if result.Error != nil { @@ -94,23 +104,25 @@ func worker(id int, queue <-chan string, results chan Snapshot) { continue } LogDebug("Worker %d processing %s", id, url) - result = ProcessHeaders(result) - if result.Error != nil { - results <- *result - continue - } if result.MimeType == "text/gemini" { result = ProcessGemini(result) } if shouldPersist(result) { - LogInfo("Worker %d saving %s", id, url) - SaveResult(CONFIG.rootPath, result) + LogDebug("Worker %d saving %s", id, url) + SaveSnapshot(CONFIG.rootPath, result) } results <- *result - time.Sleep(time.Duration(rand.IntN(5)) * time.Second) + // time.Sleep(time.Duration(rand.IntN(5)) * time.Second) } } +func shouldVisit(url string) bool { + if !strings.HasPrefix(url, "gemini://") { + return false + } + return true +} + func shouldPersist(result *Snapshot) bool { if result.MimeType == "text/gemini" || strings.HasPrefix(result.MimeType, "image/") || diff --git a/network.go b/network.go index ac3f2f3..9274667 100644 --- a/network.go +++ b/network.go @@ -4,25 +4,21 @@ import ( "crypto/tls" "fmt" "io" + "regexp" + "slices" + "strconv" "time" ) -func Visit(url string) (result *Snapshot) { - result = &Snapshot{Timestamp: time.Now(), UID: UID()} - - // Wrap error with additional information - defer func() { - if result.Error != nil { - result.Error = fmt.Errorf("[%s] Error: %w", result.URL, result.Error) - } - }() +func Visit(url string) (snapshot *Snapshot, err error) { + snapshot = &Snapshot{Timestamp: time.Now(), UID: UID()} geminiUrl, err := ParseUrl(url, "") if err != nil { - result.Error = err - return result + snapshot.Error = fmt.Errorf("[%s] %w", url, err) + return snapshot, nil } - result.URL = *geminiUrl + snapshot.URL = *geminiUrl LogDebug("[%s] Connecting", geminiUrl) @@ -32,25 +28,29 @@ func Visit(url string) (result *Snapshot) { } conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.Hostname, geminiUrl.Port), tlsConfig) if err != nil { - result.Error = err - return result + snapshot.Error = err + return snapshot, nil } // Defer properly: Also handle possible // error of conn.Close() defer func() { err := conn.Close() if err != nil { - result.Error = fmt.Errorf("[%s] Closing connection error, ignoring: %w", result.URL.String(), err) + snapshot.Error = fmt.Errorf("[%s] Closing connection error, ignoring: %w", snapshot.URL.String(), err) } }() // Read data from the connection - // TODO make timeout configurable - conn.SetReadDeadline(time.Now().Add(5 * time.Second)) - buf := make([]byte, 1024) + conn.SetReadDeadline(time.Now().Add(time.Duration(CONFIG.responseTimeout) * time.Second)) + buf := make([]byte, 4096) var data []byte + // Write Gemini request to get response. + // paths := []string{"/", ".", ""} + // if slices.Contains(paths, geminiUrl.Path) || strings.HasSuffix(geminiUrl.Path, "gmi") { conn.Write([]byte(fmt.Sprintf("%s\r\n", geminiUrl.String()))) + // } + // Read response bytes in len(buf) byte chunks for { n, err := conn.Read(buf) @@ -58,21 +58,60 @@ func Visit(url string) (result *Snapshot) { data = append(data, buf[:n]...) } if len(data) > CONFIG.maxResponseSize { - result.Error = fmt.Errorf("Response size exceeded maximum of %d bytes", CONFIG.maxResponseSize) - return result + snapshot.Error = fmt.Errorf("[%s] Response size exceeded maximum of %d bytes", url, CONFIG.maxResponseSize) + return snapshot, nil } if err != nil { if err == io.EOF { break } else { - result.Error = err - return result + snapshot.Error = fmt.Errorf("[%s] %w", url, err) + return snapshot, nil } } } LogDebug("[%s] Received %d bytes", geminiUrl.String(), len(data)) - // time.Sleep(time.Duration(time.Second * 2)) - // LogDebug("[%s] Visitor finished", geminiUrl.String()) - result.Data = string(data) - return result + err = processResponse(snapshot, data) + if err != nil { + snapshot.Error = fmt.Errorf("%w", err) + } + return snapshot, nil +} + +func processResponse(snapshot *Snapshot, data []byte) error { + headers, body, err := getHeadersAndData(data) + if err != nil { + return err + } + code, mimeType, lang := getMimeTypeAndLang(headers) + snapshot.ResponseCode, snapshot.MimeType, snapshot.Lang, snapshot.Data = code, mimeType, lang, body + if mimeType == "text/gemini" { + snapshot.GemText = string(body) + } + return nil +} + +func getHeadersAndData(data []byte) (string, []byte, error) { + firstLineEnds := slices.Index(data, '\n') + if firstLineEnds == -1 { + return "", nil, fmt.Errorf("Could not parse response header") + } + firstLine := data[:firstLineEnds] + rest := data[firstLineEnds+1:] + return string(firstLine), rest, nil +} + +func getMimeTypeAndLang(headers string) (int, string, string) { + re := regexp.MustCompile(`^(\d+)\s+([a-zA-Z0-9/\-+]+)[;\s]+(lang=([a-zA-Z0-9-]+))?`) + matches := re.FindStringSubmatch(headers) + if matches == nil || len(matches) <= 1 { + return 0, "", "" + } + code, err := strconv.Atoi(matches[1]) + if err != nil { + return 0, "", "" + } + mimeType := matches[2] + lang := matches[4] + return code, mimeType, lang } diff --git a/snapshot.go b/snapshot.go index 3fa7627..ad94368 100644 --- a/snapshot.go +++ b/snapshot.go @@ -11,7 +11,8 @@ type Snapshot struct { URL GeminiUrl `json:"url,omitempty"` Timestamp time.Time `json:"timestamp,omitempty"` MimeType string `json:"mimetype,omitempty"` - Data string `json:"data,omitempty"` + Data []byte `json:"data,omitempty"` + GemText string `json:"gemtext,omitempty"` Links []GeminiUrl `json:"links,omitempty"` Lang string `json:"lang,omitempty"` // Gemini status code