Work on header parsing & saving other files

This commit is contained in:
2024-10-08 18:16:47 +03:00
parent 7e849feffe
commit 91f8e69fdf
7 changed files with 138 additions and 80 deletions

58
main.go
View File

@@ -10,28 +10,30 @@ import (
zlog "github.com/rs/zerolog/log"
)
var CONFIG Config
func main() {
config := *getConfig()
CONFIG = *getConfig()
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
zerolog.SetGlobalLevel(config.logLevel)
zerolog.SetGlobalLevel(CONFIG.logLevel)
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
if err := runApp(&config); err != nil {
if err := runApp(); err != nil {
LogError("Application error: %w", err)
os.Exit(1)
}
}
func runApp(config *Config) error {
func runApp() error {
// urls := []string{"gemini://smol.gr"}
urls := []string{"gemini://gmi.noulin.net/", "gemini://warmedal.se/~antenna/"}
queue := make(chan string, 10000)
queue := make(chan string, 1000)
results := make(chan Snapshot, 100)
done := make(chan struct{})
go spawnStats(queue, results)
go resultsHandler(queue, results)
spawnWorkers(config, queue, results)
spawnWorkers(CONFIG.numOfWorkers, queue, results)
for _, url := range urls {
queue <- url
@@ -44,18 +46,17 @@ func spawnStats(queue chan string, results chan Snapshot) {
ticker := time.NewTicker(time.Duration(time.Second * 10))
defer ticker.Stop()
for range ticker.C {
LogInfo("Queue length: %d\n", len(queue))
LogInfo("Results length: %d\n", len(results))
LogInfo("Queue length: %d", len(queue))
LogInfo("Results length: %d", len(results))
}
}
func spawnWorkers(config *Config, queue <-chan string, results chan Snapshot) {
workers := config.numOfWorkers
LogInfo("Spawning %d workers", workers)
func spawnWorkers(numOfWorkers int, queue <-chan string, results chan Snapshot) {
LogInfo("Spawning %d workers", numOfWorkers)
// Start worker goroutines
for i := 0; i < workers; i++ {
for i := 0; i < numOfWorkers; i++ {
go func(i int) {
worker(i, config.rootPath, queue, results)
worker(i, queue, results)
}(i)
}
}
@@ -63,9 +64,9 @@ func spawnWorkers(config *Config, queue <-chan string, results chan Snapshot) {
func resultsHandler(queue chan string, results <-chan Snapshot) {
for result := range results {
if result.Error != nil {
LogError("[%s] %w", result.Url, result.Error)
LogError("[%s] %w", result.URL, result.Error)
} else {
LogDebug("[%s] Done", result.Url)
LogDebug("[%s] Done", result.URL)
for _, link := range result.Links {
if strings.HasPrefix(link.Full, "gemini://") {
go func(link GeminiUrl) {
@@ -74,12 +75,15 @@ func resultsHandler(queue chan string, results <-chan Snapshot) {
}(link)
}
}
// fmt.Printf(SnapshotToJSON(result))
// if result.MimeType == "text/gemini" {
// result.Data = ""
// fmt.Printf(SnapshotToJSON(result))
// }
}
}
}
func worker(id int, rootPath string, queue <-chan string, results chan Snapshot) {
func worker(id int, queue <-chan string, results chan Snapshot) {
for url := range queue {
LogDebug("Worker %d visiting %s", id, url)
result := Visit(url)
@@ -90,14 +94,28 @@ func worker(id int, rootPath string, queue <-chan string, results chan Snapshot)
continue
}
LogDebug("Worker %d processing %s", id, url)
result = Process(result)
result = ProcessHeaders(result)
if result.Error != nil {
results <- *result
continue
}
LogDebug("Worker %d saving %s", id, url)
SaveResult(rootPath, result)
if result.MimeType == "text/gemini" {
result = ProcessGemini(result)
}
if shouldPersist(result) {
LogInfo("Worker %d saving %s", id, url)
SaveResult(CONFIG.rootPath, result)
}
results <- *result
time.Sleep(time.Duration(rand.IntN(5)) * time.Second)
}
}
func shouldPersist(result *Snapshot) bool {
if result.MimeType == "text/gemini" ||
strings.HasPrefix(result.MimeType, "image/") ||
strings.HasPrefix(result.MimeType, "text/") {
return true
}
return false
}