Add configuration via env vars

This commit is contained in:
2024-10-08 12:42:08 +03:00
parent 74e9327b0b
commit c3d6481de0
8 changed files with 166 additions and 79 deletions

3
.gitignore vendored
View File

@@ -1,3 +1,4 @@
.idea .idea
/run.sh
/gemini-grc /gemini-grc
/snaps

55
config.go Normal file
View File

@@ -0,0 +1,55 @@
package main
import (
"fmt"
"os"
"strconv"
"github.com/rs/zerolog"
)
// Config holds the runtime settings that getConfig reads from the
// process environment.
type Config struct {
// logLevel is the zerolog level parsed from the LOG_LEVEL variable.
logLevel zerolog.Level
// rootPath is the raw, unmodified value of the ROOT_PATH variable.
rootPath string
// numOfWorkers is the integer parsed from the NUM_OF_WORKERS variable.
numOfWorkers int
}
// getConfig builds the application configuration from environment variables.
//
// It reads, in this order, LOG_LEVEL (parsed with zerolog.ParseLevel),
// ROOT_PATH (stored verbatim) and NUM_OF_WORKERS (a base-10 integer that
// must be at least 1). All three variables are mandatory: when one is unset,
// or holds a value that cannot be accepted, a message is written to stderr
// and the process exits with status 1. The function therefore only returns
// when every field of the Config has been assigned from its variable.
func getConfig() *Config {
	// requireEnv returns the value of a mandatory variable, or terminates
	// the process when the variable is not present in the environment.
	requireEnv := func(name string) string {
		value, ok := os.LookupEnv(name)
		if !ok {
			fmt.Fprintf(os.Stderr, "Missing env var %s\n", name)
			os.Exit(1)
		}
		return value
	}
	// reject reports an unusable value for the named variable and terminates.
	reject := func(name string) {
		fmt.Fprintf(os.Stderr, "Invalid %s value\n", name)
		os.Exit(1)
	}

	var config Config

	logLevel, err := zerolog.ParseLevel(requireEnv("LOG_LEVEL"))
	if err != nil {
		reject("LOG_LEVEL")
	}
	config.logLevel = logLevel

	config.rootPath = requireEnv("ROOT_PATH")

	numOfWorkers, err := strconv.Atoi(requireEnv("NUM_OF_WORKERS"))
	// One goroutine is started per configured worker, so a count below 1
	// would start none and queued URLs would never be consumed. Treat such
	// a value as invalid instead of letting the program idle forever.
	if err != nil || numOfWorkers < 1 {
		reject("NUM_OF_WORKERS")
	}
	config.numOfWorkers = numOfWorkers

	return &config
}

35
fs.go
View File

@@ -4,6 +4,7 @@ import (
"fmt" "fmt"
"net/url" "net/url"
"os" "os"
"path"
"path/filepath" "path/filepath"
"strings" "strings"
) )
@@ -46,10 +47,9 @@ func calcFilePath(rootPath, urlPath string) (string, error) {
// Normalize the URL path // Normalize the URL path
cleanPath := filepath.Clean(urlPath) cleanPath := filepath.Clean(urlPath)
fmt.Printf("%s %s\n", urlPath, cleanPath)
// Safe check to prevent directory traversal // Safe check to prevent directory traversal
if strings.Contains(cleanPath, "..") { if strings.Contains(cleanPath, "..") {
return "", fmt.Errorf("invalid URL path: contains directory traversal") return "", fmt.Errorf("Invalid URL path: contains directory traversal")
} }
// Sanitize the path by encoding invalid characters // Sanitize the path by encoding invalid characters
@@ -58,29 +58,36 @@ func calcFilePath(rootPath, urlPath string) (string, error) {
// Join the root path and the sanitized URL path // Join the root path and the sanitized URL path
finalPath := filepath.Join(rootPath, safePath) finalPath := filepath.Join(rootPath, safePath)
// Ensure the directory exists
dir := filepath.Dir(finalPath)
if err := os.MkdirAll(dir, os.ModePerm); err != nil {
return "", fmt.Errorf("failed to create directories: %v", err)
}
return finalPath, nil return finalPath, nil
} }
func SaveResult(rootPath string, s *Snapshot) { func SaveResult(rootPath string, s *Snapshot) {
parentPath := path.Join(rootPath, s.Url.Hostname)
urlPath := s.Url.Path urlPath := s.Url.Path
if urlPath == "" || urlPath == "/" { // If path is empty, add `index.gmi` as the file to save
urlPath = fmt.Sprintf("%s/index.gmi", s.Url.Hostname) if urlPath == "" || urlPath == "." {
urlPath = fmt.Sprintf("index.gmi")
} }
filepath, err := calcFilePath(rootPath, urlPath) // If path ends with '/' then add index.gmi for the
// directory to be created.
if strings.HasSuffix(urlPath, "/") {
urlPath = strings.Join([]string{urlPath, "index.gmi"}, "")
}
finalPath, err := calcFilePath(parentPath, urlPath)
if err != nil { if err != nil {
LogError("Error saving %s: %w", s.Url, err) LogError("Error saving %s: %w", s.Url, err)
return return
} }
// err = os.WriteFile(filepath, []byte(SnapshotToJSON(*s)), 0666) // Ensure the directory exists
err = os.WriteFile(filepath, []byte((*s).Data), 0666) dir := filepath.Dir(finalPath)
if err := os.MkdirAll(dir, os.ModePerm); err != nil {
LogError("Failed to create directory: %w", err)
return
}
err = os.WriteFile(finalPath, []byte((*s).Data), 0666)
if err != nil { if err != nil {
LogError("Error saving %s: %w", s.Url.Full, err) LogError("Error saving %s: %w", s.Url.Full, err)
} }
LogInfo("[%s] Saved to %s", s.Url.Full, filepath) LogInfo("[%s] Saved to %s", s.Url.Full, finalPath)
} }

View File

@@ -6,20 +6,48 @@ import (
"net/url" "net/url"
"regexp" "regexp"
"strconv" "strconv"
"strings"
) )
// checkStatusCode converts a two-digit Gemini response code into an error.
//
// Exactly one value, 20, is treated as success and yields nil. Every other
// code produces an error whose text names the code and the category implied
// by its leading digit (1x input, 3x redirect, 4x server error, 5x permanent
// server error, 6x certificate error). Anything else, including the
// remaining 2x codes and values outside 10-69, is reported as
// unexpected/unhandled.
func checkStatusCode(code int) error {
	if code == 20 {
		return nil
	}

	// Integer division isolates the leading digit; values below 10
	// (including negatives) and values of 70 or more fall through to the
	// generic error after the switch.
	switch code / 10 {
	case 1:
		return fmt.Errorf("Gemini response %d needs data input", code)
	case 3:
		return fmt.Errorf("Gemini response %d redirect", code)
	case 4:
		return fmt.Errorf("Gemini response %d server error", code)
	case 5:
		return fmt.Errorf("Gemini response %d server permanent error", code)
	case 6:
		return fmt.Errorf("Gemini response %d certificate error", code)
	}

	return fmt.Errorf("Unexpected/unhandled Gemini response %d", code)
}
func Process(snapshot *Snapshot) *Snapshot { func Process(snapshot *Snapshot) *Snapshot {
LogInfo("[%s] Processing data", snapshot.Url.String()) LogDebug("[%s] Processing snapshot", snapshot.Url.String())
code, err := ParseFirstTwoDigits(snapshot.Data) code, err := ParseFirstTwoDigits(snapshot.Data)
if err != nil { if err != nil {
snapshot.Error = fmt.Errorf("[%s] Invalid gemini response code", snapshot.Url.String()) snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.Url.String())
return snapshot return snapshot
} }
if code != 20 { err = checkStatusCode(code)
snapshot.Error = fmt.Errorf("[%s] Gemini response code != 20, skipping", snapshot.Url.String()) if err != nil {
snapshot.Error = fmt.Errorf("[%s] Gemini response code error, skipping. %w", snapshot.Url.String(), err)
return snapshot return snapshot
} }
// Grab link lines
// Remove response code from body (first line)
index := strings.Index(snapshot.Data, "\n")
if index != -1 {
snapshot.Data = snapshot.Data[index+1:]
}
// Grab any link lines
linkLines := ExtractLinkLines(snapshot.Data) linkLines := ExtractLinkLines(snapshot.Data)
LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines)) LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines))
// Normalize URLs in links, and store them in snapshot // Normalize URLs in links, and store them in snapshot
@@ -112,7 +140,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
// Remove usual first space from URL description: // Remove usual first space from URL description:
// => URL description // => URL description
// ^^^^^^^^^^^^ // ^^^^^^^^^^^^
if restOfLine[0] == ' ' { if len(restOfLine) > 0 && restOfLine[0] == ' ' {
restOfLine = restOfLine[1:] restOfLine = restOfLine[1:]
} }

View File

@@ -13,6 +13,11 @@ func LogDebug(format string, args ...interface{}) {
func LogInfo(format string, args ...interface{}) { func LogInfo(format string, args ...interface{}) {
zlog.Info().Msg(fmt.Sprintf(format, args...)) zlog.Info().Msg(fmt.Sprintf(format, args...))
} }
// LogWarn formats its arguments with fmt.Sprintf and emits the resulting
// text as a warning-level event on the global zerolog logger.
func LogWarn(format string, args ...interface{}) {
	message := fmt.Sprintf(format, args...)
	zlog.Warn().Msg(message)
}
func LogError(format string, args ...interface{}) { func LogError(format string, args ...interface{}) {
zlog.Error().Err(fmt.Errorf(format, args...)).Msg("") zlog.Error().Err(fmt.Errorf(format, args...)).Msg("")
} }

103
main.go
View File

@@ -1,99 +1,87 @@
package main package main
import ( import (
"math/rand/v2"
"os" "os"
"sync" "strings"
"time"
"github.com/rs/zerolog" "github.com/rs/zerolog"
zlog "github.com/rs/zerolog/log" zlog "github.com/rs/zerolog/log"
) )
const ROOTPATH string = "./a"
func main() { func main() {
config := *getConfig()
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
zerolog.SetGlobalLevel(zerolog.DebugLevel) zerolog.SetGlobalLevel(config.logLevel)
//zerolog.SetGlobalLevel(zerolog.InfoLevel)
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"}) zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
if err := runApp(); err != nil { if err := runApp(&config); err != nil {
LogError("Application error: %w", err) LogError("Application error: %w", err)
os.Exit(1) os.Exit(1)
} }
} }
func runApp() error { func runApp(config *Config) error {
//urls := []string{"gemini://smol.gr"} // urls := []string{"gemini://smol.gr"}
urls := []string{"gemini://smol.gr", "gemini://gmi.noulin.net/"} urls := []string{"gemini://gmi.noulin.net/", "gemini://warmedal.se/~antenna/"}
queue := make(chan string) queue := make(chan string, 10000)
results := make(chan Snapshot, 100)
done := make(chan struct{}) done := make(chan struct{})
// Start the crawler. go spawnStats(queue, results)
go crawler(queue, done) go resultsHandler(queue, results)
spawnWorkers(config, queue, results)
// Send URLs to the queue
for _, url := range urls { for _, url := range urls {
// Send URL to queue; blocks until crawler receives it
queue <- url queue <- url
} }
// All URLs have been sent and received
// because queue is unbuffered; safe to close the queue
close(queue)
// Wait until crawler signals finish
<-done <-done
return nil return nil
} }
func crawler(queue <-chan string, done chan struct{}) { func spawnStats(queue chan string, results chan Snapshot) {
// Start processing results. ticker := time.NewTicker(time.Duration(time.Second * 10))
results := make(chan Snapshot) defer ticker.Stop()
resultsDone := make(chan struct{}) for range ticker.C {
go resultsHandler(results, resultsDone) LogInfo("Queue length: %d\n", len(queue))
LogInfo("Results length: %d\n", len(results))
// Create workers that consume the queue channel,
// and send their result to results channel.
workers := 3
LogInfo("Spawning %d workers", workers)
var wg sync.WaitGroup
// Start worker goroutines
for range workers {
wg.Add(1)
go func() {
worker(queue, results)
wg.Done()
}()
} }
// Wait until all workers have finished.
wg.Wait()
LogInfo("All workers have finished")
// Nobody left to send to results, so we
// close it, and the SnapshotsProcessor can
// finish
close(results)
<-resultsDone
close(done)
} }
func resultsHandler(results <-chan Snapshot, done chan struct{}) { func spawnWorkers(config *Config, queue <-chan string, results chan Snapshot) {
workers := config.numOfWorkers
LogInfo("Spawning %d workers", workers)
// Start worker goroutines
for i := 0; i < workers; i++ {
go func(i int) {
worker(i, config.rootPath, queue, results)
}(i)
}
}
func resultsHandler(queue chan string, results <-chan Snapshot) {
for result := range results { for result := range results {
if result.Error != nil { if result.Error != nil {
LogError("[%s] %w", result.Url, result.Error) LogError("[%s] %w", result.Url, result.Error)
} else { } else {
LogInfo("[%s] Done", result.Url) LogDebug("[%s] Done", result.Url)
for _, link := range result.Links {
if strings.HasPrefix(link.Full, "gemini://") {
go func(link GeminiUrl) {
queue <- link.Full
// fmt.Printf("Sent %s to queue\n", link.Full)
}(link)
}
}
// fmt.Printf(SnapshotToJSON(result)) // fmt.Printf(SnapshotToJSON(result))
} }
} }
LogInfo("All results have been processed")
close(done)
} }
func worker(queue <-chan string, results chan Snapshot) { func worker(id int, rootPath string, queue <-chan string, results chan Snapshot) {
for url := range queue { for url := range queue {
LogDebug("Worker %d visiting %s", id, url)
result := Visit(url) result := Visit(url)
// If we encountered an error when // If we encountered an error when
// visiting, skip processing // visiting, skip processing
@@ -101,12 +89,15 @@ func worker(queue <-chan string, results chan Snapshot) {
results <- *result results <- *result
continue continue
} }
LogDebug("Worker %d processing %s", id, url)
result = Process(result) result = Process(result)
if result.Error != nil { if result.Error != nil {
results <- *result results <- *result
continue continue
} }
SaveResult(ROOTPATH, result) LogDebug("Worker %d saving %s", id, url)
SaveResult(rootPath, result)
results <- *result results <- *result
time.Sleep(time.Duration(rand.IntN(5)) * time.Second)
} }
} }

View File

@@ -24,7 +24,7 @@ func Visit(url string) (result *Snapshot) {
} }
result.Url = *geminiUrl result.Url = *geminiUrl
LogInfo("[%s] Dialing", geminiUrl) LogInfo("[%s] Connecting", geminiUrl)
// Establish a TLS connection // Establish a TLS connection
tlsConfig := &tls.Config{ tlsConfig := &tls.Config{
@@ -65,7 +65,7 @@ func Visit(url string) (result *Snapshot) {
} }
} }
} }
LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data)) LogDebug("[%s] Received %d bytes", geminiUrl.String(), len(data))
// time.Sleep(time.Duration(time.Second * 2)) // time.Sleep(time.Duration(time.Second * 2))
// LogDebug("[%s] Visitor finished", geminiUrl.String()) // LogDebug("[%s] Visitor finished", geminiUrl.String())
result.Data = string(data) result.Data = string(data)