diff --git a/.gitignore b/.gitignore index 2e05024..d3939fc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .idea - +/run.sh /gemini-grc +/snaps diff --git a/config.go b/config.go new file mode 100644 index 0000000..e5235b6 --- /dev/null +++ b/config.go @@ -0,0 +1,55 @@ +package main + +import ( + "fmt" + "os" + "strconv" + + "github.com/rs/zerolog" +) + +type Config struct { + logLevel zerolog.Level + rootPath string + numOfWorkers int +} + +func getConfig() *Config { + var config Config + for _, envVar := range []string{ + "LOG_LEVEL", + "ROOT_PATH", + "NUM_OF_WORKERS", + } { + if env, ok := os.LookupEnv(envVar); !ok { + fmt.Fprintf(os.Stderr, "Missing env var %s\n", envVar) + os.Exit(1) + } else { + switch envVar { + case "LOG_LEVEL": + { + logLevel, err := zerolog.ParseLevel(env) + if err != nil { + fmt.Fprintf(os.Stderr, "Invalid LOG_LEVEL value\n") + os.Exit(1) + } + config.logLevel = logLevel + } + case "ROOT_PATH": + { + config.rootPath = env + } + case "NUM_OF_WORKERS": + { + if numOfWorkers, err := strconv.Atoi(env); err != nil { + fmt.Fprintf(os.Stderr, "Invalid NUM_OF_WORKERS value\n") + os.Exit(1) + } else { + config.numOfWorkers = numOfWorkers + } + } + } + } + } + return &config +} diff --git a/fs.go b/fs.go index 2f38034..be89ecc 100644 --- a/fs.go +++ b/fs.go @@ -4,6 +4,7 @@ import ( "fmt" "net/url" "os" + "path" "path/filepath" "strings" ) @@ -46,10 +47,9 @@ func calcFilePath(rootPath, urlPath string) (string, error) { // Normalize the URL path cleanPath := filepath.Clean(urlPath) - fmt.Printf("%s %s\n", urlPath, cleanPath) // Safe check to prevent directory traversal if strings.Contains(cleanPath, "..") { - return "", fmt.Errorf("invalid URL path: contains directory traversal") + return "", fmt.Errorf("Invalid URL path: contains directory traversal") } // Sanitize the path by encoding invalid characters @@ -58,29 +58,36 @@ func calcFilePath(rootPath, urlPath string) (string, error) { // Join the root path and the sanitized URL path finalPath := filepath.Join(rootPath, safePath) - // Ensure the directory exists - dir := filepath.Dir(finalPath) - if err := os.MkdirAll(dir, os.ModePerm); err != nil { - return "", fmt.Errorf("failed to create directories: %v", err) - } - return finalPath, nil } func SaveResult(rootPath string, s *Snapshot) { + parentPath := path.Join(rootPath, s.Url.Hostname) urlPath := s.Url.Path - if urlPath == "" || urlPath == "/" { - urlPath = fmt.Sprintf("%s/index.gmi", s.Url.Hostname) + // If path is empty, add `index.gmi` as the file to save + if urlPath == "" || urlPath == "." { + urlPath = fmt.Sprintf("index.gmi") } - filepath, err := calcFilePath(rootPath, urlPath) + // If path ends with '/' then add index.gmi for the + // directory to be created. + if strings.HasSuffix(urlPath, "/") { + urlPath = strings.Join([]string{urlPath, "index.gmi"}, "") + } + + finalPath, err := calcFilePath(parentPath, urlPath) if err != nil { LogError("Error saving %s: %w", s.Url, err) return } - // err = os.WriteFile(filepath, []byte(SnapshotToJSON(*s)), 0666) - err = os.WriteFile(filepath, []byte((*s).Data), 0666) + // Ensure the directory exists + dir := filepath.Dir(finalPath) + if err := os.MkdirAll(dir, os.ModePerm); err != nil { + LogError("Failed to create directory: %w", err) + return + } + err = os.WriteFile(finalPath, []byte((*s).Data), 0666) if err != nil { LogError("Error saving %s: %w", s.Url.Full, err) } - LogInfo("[%s] Saved to %s", s.Url.Full, filepath) + LogInfo("[%s] Saved to %s", s.Url.Full, finalPath) } diff --git a/gemini.go b/gemini.go index 647d03d..b9a489a 100644 --- a/gemini.go +++ b/gemini.go @@ -6,20 +6,48 @@ import ( "net/url" "regexp" "strconv" + "strings" ) +func checkStatusCode(code int) error { + switch { + case code == 20: + return nil + case code >= 10 && code < 20: + return fmt.Errorf("Gemini response %d needs data input", code) + case code >= 30 && code < 40: + return fmt.Errorf("Gemini response %d redirect", code) + case code >= 40 && code < 50: + return fmt.Errorf("Gemini response %d server error", code) + case code >= 50 && code < 60: + return fmt.Errorf("Gemini response %d server permanent error", code) + case code >= 60 && code < 70: + return fmt.Errorf("Gemini response %d certificate error", code) + default: + return fmt.Errorf("Unexpected/unhandled Gemini response %d", code) + } +} + func Process(snapshot *Snapshot) *Snapshot { - LogInfo("[%s] Processing data", snapshot.Url.String()) + LogDebug("[%s] Processing snapshot", snapshot.Url.String()) code, err := ParseFirstTwoDigits(snapshot.Data) if err != nil { - snapshot.Error = fmt.Errorf("[%s] Invalid gemini response code", snapshot.Url.String()) + snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.Url.String()) return snapshot } - if code != 20 { - snapshot.Error = fmt.Errorf("[%s] Gemini response code != 20, skipping", snapshot.Url.String()) + err = checkStatusCode(code) + if err != nil { + snapshot.Error = fmt.Errorf("[%s] Gemini response code error, skipping. %w", snapshot.Url.String(), err) return snapshot } - // Grab link lines + + // Remove response code from body (first line) + index := strings.Index(snapshot.Data, "\n") + if index != -1 { + snapshot.Data = snapshot.Data[index+1:] + } + + // Grab any link lines linkLines := ExtractLinkLines(snapshot.Data) LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines)) // Normalize URLs in links, and store them in snapshot @@ -112,7 +140,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin // Remove usual first space from URL description: // => URL description // ^^^^^^^^^^^^ - if restOfLine[0] == ' ' { + if len(restOfLine) > 0 && restOfLine[0] == ' ' { restOfLine = restOfLine[1:] } diff --git a/gemini-url.go b/gemini_url.go similarity index 100% rename from gemini-url.go rename to gemini_url.go diff --git a/logging.go b/logging.go index 0e2d3f7..0a54e83 100644 --- a/logging.go +++ b/logging.go @@ -13,6 +13,11 @@ func LogDebug(format string, args ...interface{}) { func LogInfo(format string, args ...interface{}) { zlog.Info().Msg(fmt.Sprintf(format, args...)) } + +func LogWarn(format string, args ...interface{}) { + zlog.Warn().Msg(fmt.Sprintf(format, args...)) +} + func LogError(format string, args ...interface{}) { zlog.Error().Err(fmt.Errorf(format, args...)).Msg("") } diff --git a/main.go b/main.go index 1c846d9..fdb3577 100644 --- a/main.go +++ b/main.go @@ -1,99 +1,87 @@ package main import ( + "math/rand/v2" "os" - "sync" + "strings" + "time" "github.com/rs/zerolog" zlog "github.com/rs/zerolog/log" ) -const ROOTPATH string = "./a" - func main() { + config := *getConfig() zerolog.TimeFieldFormat = zerolog.TimeFormatUnix - zerolog.SetGlobalLevel(zerolog.DebugLevel) - //zerolog.SetGlobalLevel(zerolog.InfoLevel) + zerolog.SetGlobalLevel(config.logLevel) zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"}) - if err := runApp(); err != nil { + if err := runApp(&config); err != nil { LogError("Application error: %w", err) os.Exit(1) } } -func runApp() error { - //urls := []string{"gemini://smol.gr"} - urls := []string{"gemini://smol.gr", "gemini://gmi.noulin.net/"} +func runApp(config *Config) error { + // urls := []string{"gemini://smol.gr"} + urls := []string{"gemini://gmi.noulin.net/", "gemini://warmedal.se/~antenna/"} - queue := make(chan string) + queue := make(chan string, 10000) + results := make(chan Snapshot, 100) done := make(chan struct{}) - // Start the crawler. - go crawler(queue, done) + go spawnStats(queue, results) + go resultsHandler(queue, results) + spawnWorkers(config, queue, results) - // Send URLs to the queue for _, url := range urls { - // Send URL to queue; blocks until crawler receives it queue <- url } - - // All URLs have been sent and received - // because queue is unbuffered; safe to close the queue - close(queue) - - // Wait until crawler signals finish <-done return nil } -func crawler(queue <-chan string, done chan struct{}) { - // Start processing results. - results := make(chan Snapshot) - resultsDone := make(chan struct{}) - go resultsHandler(results, resultsDone) - - // Create workers that consume the queue channel, - // and send their result to results channel. - workers := 3 - LogInfo("Spawning %d workers", workers) - var wg sync.WaitGroup - // Start worker goroutines - for range workers { - wg.Add(1) - go func() { - worker(queue, results) - wg.Done() - }() +func spawnStats(queue chan string, results chan Snapshot) { + ticker := time.NewTicker(time.Duration(time.Second * 10)) + defer ticker.Stop() + for range ticker.C { + LogInfo("Queue length: %d\n", len(queue)) + LogInfo("Results length: %d\n", len(results)) } - - // Wait until all workers have finished. - wg.Wait() - LogInfo("All workers have finished") - - // Nobody left to send to results, so we - // close it, and the SnapshotsProcessor can - // finish - close(results) - <-resultsDone - - close(done) } -func resultsHandler(results <-chan Snapshot, done chan struct{}) { +func spawnWorkers(config *Config, queue <-chan string, results chan Snapshot) { + workers := config.numOfWorkers + LogInfo("Spawning %d workers", workers) + // Start worker goroutines + for i := 0; i < workers; i++ { + go func(i int) { + worker(i, config.rootPath, queue, results) + }(i) + } +} + +func resultsHandler(queue chan string, results <-chan Snapshot) { for result := range results { if result.Error != nil { LogError("[%s] %w", result.Url, result.Error) } else { - LogInfo("[%s] Done", result.Url) + LogDebug("[%s] Done", result.Url) + for _, link := range result.Links { + if strings.HasPrefix(link.Full, "gemini://") { + go func(link GeminiUrl) { + queue <- link.Full + // fmt.Printf("Sent %s to queue\n", link.Full) + }(link) + } + } // fmt.Printf(SnapshotToJSON(result)) } } - LogInfo("All results have been processed") - close(done) } -func worker(queue <-chan string, results chan Snapshot) { +func worker(id int, rootPath string, queue <-chan string, results chan Snapshot) { for url := range queue { + LogDebug("Worker %d visiting %s", id, url) result := Visit(url) // If we encountered an error when // visiting, skip processing @@ -101,12 +89,15 @@ func worker(queue <-chan string, results chan Snapshot) { results <- *result continue } + LogDebug("Worker %d processing %s", id, url) result = Process(result) if result.Error != nil { results <- *result continue } - SaveResult(ROOTPATH, result) + LogDebug("Worker %d saving %s", id, url) + SaveResult(rootPath, result) results <- *result + time.Sleep(time.Duration(rand.IntN(5)) * time.Second) } } diff --git a/network.go b/network.go index 9e31cd9..6f2dd1b 100644 --- a/network.go +++ b/network.go @@ -24,7 +24,7 @@ func Visit(url string) (result *Snapshot) { } result.Url = *geminiUrl - LogInfo("[%s] Dialing", geminiUrl) + LogInfo("[%s] Connecting", geminiUrl) // Establish a TLS connection tlsConfig := &tls.Config{ @@ -65,7 +65,7 @@ func Visit(url string) (result *Snapshot) { } } } - LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data)) + LogDebug("[%s] Received %d bytes", geminiUrl.String(), len(data)) // time.Sleep(time.Duration(time.Second * 2)) // LogDebug("[%s] Visitor finished", geminiUrl.String()) result.Data = string(data)