Add configuration via env vars
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,3 +1,4 @@
|
|||||||
.idea
|
.idea
|
||||||
|
/run.sh
|
||||||
/gemini-grc
|
/gemini-grc
|
||||||
|
/snaps
|
||||||
|
|||||||
55
config.go
Normal file
55
config.go
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/rs/zerolog"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Config struct {
|
||||||
|
logLevel zerolog.Level
|
||||||
|
rootPath string
|
||||||
|
numOfWorkers int
|
||||||
|
}
|
||||||
|
|
||||||
|
func getConfig() *Config {
|
||||||
|
var config Config
|
||||||
|
for _, envVar := range []string{
|
||||||
|
"LOG_LEVEL",
|
||||||
|
"ROOT_PATH",
|
||||||
|
"NUM_OF_WORKERS",
|
||||||
|
} {
|
||||||
|
if env, ok := os.LookupEnv(envVar); !ok {
|
||||||
|
fmt.Fprintf(os.Stderr, "Missing env var %s\n", envVar)
|
||||||
|
os.Exit(1)
|
||||||
|
} else {
|
||||||
|
switch envVar {
|
||||||
|
case "LOG_LEVEL":
|
||||||
|
{
|
||||||
|
logLevel, err := zerolog.ParseLevel(env)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Invalid LOG_LEVEL value\n")
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
config.logLevel = logLevel
|
||||||
|
}
|
||||||
|
case "ROOT_PATH":
|
||||||
|
{
|
||||||
|
config.rootPath = env
|
||||||
|
}
|
||||||
|
case "NUM_OF_WORKERS":
|
||||||
|
{
|
||||||
|
if numOfWorkers, err := strconv.Atoi(env); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Invalid NUM_OF_WORKERS value\n")
|
||||||
|
os.Exit(1)
|
||||||
|
} else {
|
||||||
|
config.numOfWorkers = numOfWorkers
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return &config
|
||||||
|
}
|
||||||
35
fs.go
35
fs.go
@@ -4,6 +4,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
|
"path"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
@@ -46,10 +47,9 @@ func calcFilePath(rootPath, urlPath string) (string, error) {
|
|||||||
// Normalize the URL path
|
// Normalize the URL path
|
||||||
cleanPath := filepath.Clean(urlPath)
|
cleanPath := filepath.Clean(urlPath)
|
||||||
|
|
||||||
fmt.Printf("%s %s\n", urlPath, cleanPath)
|
|
||||||
// Safe check to prevent directory traversal
|
// Safe check to prevent directory traversal
|
||||||
if strings.Contains(cleanPath, "..") {
|
if strings.Contains(cleanPath, "..") {
|
||||||
return "", fmt.Errorf("invalid URL path: contains directory traversal")
|
return "", fmt.Errorf("Invalid URL path: contains directory traversal")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sanitize the path by encoding invalid characters
|
// Sanitize the path by encoding invalid characters
|
||||||
@@ -58,29 +58,36 @@ func calcFilePath(rootPath, urlPath string) (string, error) {
|
|||||||
// Join the root path and the sanitized URL path
|
// Join the root path and the sanitized URL path
|
||||||
finalPath := filepath.Join(rootPath, safePath)
|
finalPath := filepath.Join(rootPath, safePath)
|
||||||
|
|
||||||
// Ensure the directory exists
|
|
||||||
dir := filepath.Dir(finalPath)
|
|
||||||
if err := os.MkdirAll(dir, os.ModePerm); err != nil {
|
|
||||||
return "", fmt.Errorf("failed to create directories: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return finalPath, nil
|
return finalPath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveResult(rootPath string, s *Snapshot) {
|
func SaveResult(rootPath string, s *Snapshot) {
|
||||||
|
parentPath := path.Join(rootPath, s.Url.Hostname)
|
||||||
urlPath := s.Url.Path
|
urlPath := s.Url.Path
|
||||||
if urlPath == "" || urlPath == "/" {
|
// If path is empty, add `index.gmi` as the file to save
|
||||||
urlPath = fmt.Sprintf("%s/index.gmi", s.Url.Hostname)
|
if urlPath == "" || urlPath == "." {
|
||||||
|
urlPath = fmt.Sprintf("index.gmi")
|
||||||
}
|
}
|
||||||
filepath, err := calcFilePath(rootPath, urlPath)
|
// If path ends with '/' then add index.gmi for the
|
||||||
|
// directory to be created.
|
||||||
|
if strings.HasSuffix(urlPath, "/") {
|
||||||
|
urlPath = strings.Join([]string{urlPath, "index.gmi"}, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
finalPath, err := calcFilePath(parentPath, urlPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
LogError("Error saving %s: %w", s.Url, err)
|
LogError("Error saving %s: %w", s.Url, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// err = os.WriteFile(filepath, []byte(SnapshotToJSON(*s)), 0666)
|
// Ensure the directory exists
|
||||||
err = os.WriteFile(filepath, []byte((*s).Data), 0666)
|
dir := filepath.Dir(finalPath)
|
||||||
|
if err := os.MkdirAll(dir, os.ModePerm); err != nil {
|
||||||
|
LogError("Failed to create directory: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
err = os.WriteFile(finalPath, []byte((*s).Data), 0666)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
LogError("Error saving %s: %w", s.Url.Full, err)
|
LogError("Error saving %s: %w", s.Url.Full, err)
|
||||||
}
|
}
|
||||||
LogInfo("[%s] Saved to %s", s.Url.Full, filepath)
|
LogInfo("[%s] Saved to %s", s.Url.Full, finalPath)
|
||||||
}
|
}
|
||||||
|
|||||||
40
gemini.go
40
gemini.go
@@ -6,20 +6,48 @@ import (
|
|||||||
"net/url"
|
"net/url"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func checkStatusCode(code int) error {
|
||||||
|
switch {
|
||||||
|
case code == 20:
|
||||||
|
return nil
|
||||||
|
case code >= 10 && code < 20:
|
||||||
|
return fmt.Errorf("Gemini response %d needs data input", code)
|
||||||
|
case code >= 30 && code < 40:
|
||||||
|
return fmt.Errorf("Gemini response %d redirect", code)
|
||||||
|
case code >= 40 && code < 50:
|
||||||
|
return fmt.Errorf("Gemini response %d server error", code)
|
||||||
|
case code >= 50 && code < 60:
|
||||||
|
return fmt.Errorf("Gemini response %d server permanent error", code)
|
||||||
|
case code >= 60 && code < 70:
|
||||||
|
return fmt.Errorf("Gemini response %d certificate error", code)
|
||||||
|
default:
|
||||||
|
return fmt.Errorf("Unexpected/unhandled Gemini response %d", code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func Process(snapshot *Snapshot) *Snapshot {
|
func Process(snapshot *Snapshot) *Snapshot {
|
||||||
LogInfo("[%s] Processing data", snapshot.Url.String())
|
LogDebug("[%s] Processing snapshot", snapshot.Url.String())
|
||||||
code, err := ParseFirstTwoDigits(snapshot.Data)
|
code, err := ParseFirstTwoDigits(snapshot.Data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
snapshot.Error = fmt.Errorf("[%s] Invalid gemini response code", snapshot.Url.String())
|
snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.Url.String())
|
||||||
return snapshot
|
return snapshot
|
||||||
}
|
}
|
||||||
if code != 20 {
|
err = checkStatusCode(code)
|
||||||
snapshot.Error = fmt.Errorf("[%s] Gemini response code != 20, skipping", snapshot.Url.String())
|
if err != nil {
|
||||||
|
snapshot.Error = fmt.Errorf("[%s] Gemini response code error, skipping. %w", snapshot.Url.String(), err)
|
||||||
return snapshot
|
return snapshot
|
||||||
}
|
}
|
||||||
// Grab link lines
|
|
||||||
|
// Remove response code from body (first line)
|
||||||
|
index := strings.Index(snapshot.Data, "\n")
|
||||||
|
if index != -1 {
|
||||||
|
snapshot.Data = snapshot.Data[index+1:]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Grab any link lines
|
||||||
linkLines := ExtractLinkLines(snapshot.Data)
|
linkLines := ExtractLinkLines(snapshot.Data)
|
||||||
LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines))
|
LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines))
|
||||||
// Normalize URLs in links, and store them in snapshot
|
// Normalize URLs in links, and store them in snapshot
|
||||||
@@ -112,7 +140,7 @@ func NormalizeLink(linkLine string, currentURL string) (link string, descr strin
|
|||||||
// Remove usual first space from URL description:
|
// Remove usual first space from URL description:
|
||||||
// => URL description
|
// => URL description
|
||||||
// ^^^^^^^^^^^^
|
// ^^^^^^^^^^^^
|
||||||
if restOfLine[0] == ' ' {
|
if len(restOfLine) > 0 && restOfLine[0] == ' ' {
|
||||||
restOfLine = restOfLine[1:]
|
restOfLine = restOfLine[1:]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -13,6 +13,11 @@ func LogDebug(format string, args ...interface{}) {
|
|||||||
func LogInfo(format string, args ...interface{}) {
|
func LogInfo(format string, args ...interface{}) {
|
||||||
zlog.Info().Msg(fmt.Sprintf(format, args...))
|
zlog.Info().Msg(fmt.Sprintf(format, args...))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func LogWarn(format string, args ...interface{}) {
|
||||||
|
zlog.Warn().Msg(fmt.Sprintf(format, args...))
|
||||||
|
}
|
||||||
|
|
||||||
func LogError(format string, args ...interface{}) {
|
func LogError(format string, args ...interface{}) {
|
||||||
zlog.Error().Err(fmt.Errorf(format, args...)).Msg("")
|
zlog.Error().Err(fmt.Errorf(format, args...)).Msg("")
|
||||||
}
|
}
|
||||||
|
|||||||
103
main.go
103
main.go
@@ -1,99 +1,87 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"math/rand/v2"
|
||||||
"os"
|
"os"
|
||||||
"sync"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/rs/zerolog"
|
"github.com/rs/zerolog"
|
||||||
zlog "github.com/rs/zerolog/log"
|
zlog "github.com/rs/zerolog/log"
|
||||||
)
|
)
|
||||||
|
|
||||||
const ROOTPATH string = "./a"
|
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
config := *getConfig()
|
||||||
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
|
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
|
||||||
zerolog.SetGlobalLevel(zerolog.DebugLevel)
|
zerolog.SetGlobalLevel(config.logLevel)
|
||||||
//zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
|
||||||
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
|
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
|
||||||
if err := runApp(); err != nil {
|
if err := runApp(&config); err != nil {
|
||||||
LogError("Application error: %w", err)
|
LogError("Application error: %w", err)
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runApp() error {
|
func runApp(config *Config) error {
|
||||||
//urls := []string{"gemini://smol.gr"}
|
// urls := []string{"gemini://smol.gr"}
|
||||||
urls := []string{"gemini://smol.gr", "gemini://gmi.noulin.net/"}
|
urls := []string{"gemini://gmi.noulin.net/", "gemini://warmedal.se/~antenna/"}
|
||||||
|
|
||||||
queue := make(chan string)
|
queue := make(chan string, 10000)
|
||||||
|
results := make(chan Snapshot, 100)
|
||||||
done := make(chan struct{})
|
done := make(chan struct{})
|
||||||
|
|
||||||
// Start the crawler.
|
go spawnStats(queue, results)
|
||||||
go crawler(queue, done)
|
go resultsHandler(queue, results)
|
||||||
|
spawnWorkers(config, queue, results)
|
||||||
|
|
||||||
// Send URLs to the queue
|
|
||||||
for _, url := range urls {
|
for _, url := range urls {
|
||||||
// Send URL to queue; blocks until crawler receives it
|
|
||||||
queue <- url
|
queue <- url
|
||||||
}
|
}
|
||||||
|
|
||||||
// All URLs have been sent and received
|
|
||||||
// because queue is unbuffered; safe to close the queue
|
|
||||||
close(queue)
|
|
||||||
|
|
||||||
// Wait until crawler signals finish
|
|
||||||
<-done
|
<-done
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func crawler(queue <-chan string, done chan struct{}) {
|
func spawnStats(queue chan string, results chan Snapshot) {
|
||||||
// Start processing results.
|
ticker := time.NewTicker(time.Duration(time.Second * 10))
|
||||||
results := make(chan Snapshot)
|
defer ticker.Stop()
|
||||||
resultsDone := make(chan struct{})
|
for range ticker.C {
|
||||||
go resultsHandler(results, resultsDone)
|
LogInfo("Queue length: %d\n", len(queue))
|
||||||
|
LogInfo("Results length: %d\n", len(results))
|
||||||
// Create workers that consume the queue channel,
|
|
||||||
// and send their result to results channel.
|
|
||||||
workers := 3
|
|
||||||
LogInfo("Spawning %d workers", workers)
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
// Start worker goroutines
|
|
||||||
for range workers {
|
|
||||||
wg.Add(1)
|
|
||||||
go func() {
|
|
||||||
worker(queue, results)
|
|
||||||
wg.Done()
|
|
||||||
}()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait until all workers have finished.
|
|
||||||
wg.Wait()
|
|
||||||
LogInfo("All workers have finished")
|
|
||||||
|
|
||||||
// Nobody left to send to results, so we
|
|
||||||
// close it, and the SnapshotsProcessor can
|
|
||||||
// finish
|
|
||||||
close(results)
|
|
||||||
<-resultsDone
|
|
||||||
|
|
||||||
close(done)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func resultsHandler(results <-chan Snapshot, done chan struct{}) {
|
func spawnWorkers(config *Config, queue <-chan string, results chan Snapshot) {
|
||||||
|
workers := config.numOfWorkers
|
||||||
|
LogInfo("Spawning %d workers", workers)
|
||||||
|
// Start worker goroutines
|
||||||
|
for i := 0; i < workers; i++ {
|
||||||
|
go func(i int) {
|
||||||
|
worker(i, config.rootPath, queue, results)
|
||||||
|
}(i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func resultsHandler(queue chan string, results <-chan Snapshot) {
|
||||||
for result := range results {
|
for result := range results {
|
||||||
if result.Error != nil {
|
if result.Error != nil {
|
||||||
LogError("[%s] %w", result.Url, result.Error)
|
LogError("[%s] %w", result.Url, result.Error)
|
||||||
} else {
|
} else {
|
||||||
LogInfo("[%s] Done", result.Url)
|
LogDebug("[%s] Done", result.Url)
|
||||||
|
for _, link := range result.Links {
|
||||||
|
if strings.HasPrefix(link.Full, "gemini://") {
|
||||||
|
go func(link GeminiUrl) {
|
||||||
|
queue <- link.Full
|
||||||
|
// fmt.Printf("Sent %s to queue\n", link.Full)
|
||||||
|
}(link)
|
||||||
|
}
|
||||||
|
}
|
||||||
// fmt.Printf(SnapshotToJSON(result))
|
// fmt.Printf(SnapshotToJSON(result))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LogInfo("All results have been processed")
|
|
||||||
close(done)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func worker(queue <-chan string, results chan Snapshot) {
|
func worker(id int, rootPath string, queue <-chan string, results chan Snapshot) {
|
||||||
for url := range queue {
|
for url := range queue {
|
||||||
|
LogDebug("Worker %d visiting %s", id, url)
|
||||||
result := Visit(url)
|
result := Visit(url)
|
||||||
// If we encountered an error when
|
// If we encountered an error when
|
||||||
// visiting, skip processing
|
// visiting, skip processing
|
||||||
@@ -101,12 +89,15 @@ func worker(queue <-chan string, results chan Snapshot) {
|
|||||||
results <- *result
|
results <- *result
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
LogDebug("Worker %d processing %s", id, url)
|
||||||
result = Process(result)
|
result = Process(result)
|
||||||
if result.Error != nil {
|
if result.Error != nil {
|
||||||
results <- *result
|
results <- *result
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
SaveResult(ROOTPATH, result)
|
LogDebug("Worker %d saving %s", id, url)
|
||||||
|
SaveResult(rootPath, result)
|
||||||
results <- *result
|
results <- *result
|
||||||
|
time.Sleep(time.Duration(rand.IntN(5)) * time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ func Visit(url string) (result *Snapshot) {
|
|||||||
}
|
}
|
||||||
result.Url = *geminiUrl
|
result.Url = *geminiUrl
|
||||||
|
|
||||||
LogInfo("[%s] Dialing", geminiUrl)
|
LogInfo("[%s] Connecting", geminiUrl)
|
||||||
|
|
||||||
// Establish a TLS connection
|
// Establish a TLS connection
|
||||||
tlsConfig := &tls.Config{
|
tlsConfig := &tls.Config{
|
||||||
@@ -65,7 +65,7 @@ func Visit(url string) (result *Snapshot) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data))
|
LogDebug("[%s] Received %d bytes", geminiUrl.String(), len(data))
|
||||||
// time.Sleep(time.Duration(time.Second * 2))
|
// time.Sleep(time.Duration(time.Second * 2))
|
||||||
// LogDebug("[%s] Visitor finished", geminiUrl.String())
|
// LogDebug("[%s] Visitor finished", geminiUrl.String())
|
||||||
result.Data = string(data)
|
result.Data = string(data)
|
||||||
|
|||||||
Reference in New Issue
Block a user