Work on header parsing & saving other files
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,4 +1,6 @@
|
|||||||
.idea
|
.idea
|
||||||
|
**/.#*
|
||||||
|
**/*~
|
||||||
/run.sh
|
/run.sh
|
||||||
/gemini-grc
|
/gemini-grc
|
||||||
/snaps
|
/snaps
|
||||||
|
|||||||
11
config.go
11
config.go
@@ -12,6 +12,7 @@ type Config struct {
|
|||||||
logLevel zerolog.Level
|
logLevel zerolog.Level
|
||||||
rootPath string
|
rootPath string
|
||||||
numOfWorkers int
|
numOfWorkers int
|
||||||
|
maxResponseSize int
|
||||||
}
|
}
|
||||||
|
|
||||||
func getConfig() *Config {
|
func getConfig() *Config {
|
||||||
@@ -20,6 +21,7 @@ func getConfig() *Config {
|
|||||||
"LOG_LEVEL",
|
"LOG_LEVEL",
|
||||||
"ROOT_PATH",
|
"ROOT_PATH",
|
||||||
"NUM_OF_WORKERS",
|
"NUM_OF_WORKERS",
|
||||||
|
"MAX_RESPONSE_SIZE",
|
||||||
} {
|
} {
|
||||||
if env, ok := os.LookupEnv(envVar); !ok {
|
if env, ok := os.LookupEnv(envVar); !ok {
|
||||||
fmt.Fprintf(os.Stderr, "Missing env var %s\n", envVar)
|
fmt.Fprintf(os.Stderr, "Missing env var %s\n", envVar)
|
||||||
@@ -48,6 +50,15 @@ func getConfig() *Config {
|
|||||||
config.numOfWorkers = numOfWorkers
|
config.numOfWorkers = numOfWorkers
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
case "MAX_RESPONSE_SIZE":
|
||||||
|
{
|
||||||
|
if maxResponseSize, err := strconv.Atoi(env); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Invalid MAX_RESPONSE_SIZE value\n")
|
||||||
|
os.Exit(1)
|
||||||
|
} else {
|
||||||
|
config.maxResponseSize = maxResponseSize
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
33
fs.go
33
fs.go
@@ -3,8 +3,6 @@ package main
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
|
||||||
"path"
|
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
@@ -60,34 +58,3 @@ func calcFilePath(rootPath, urlPath string) (string, error) {
|
|||||||
|
|
||||||
return finalPath, nil
|
return finalPath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func SaveResult(rootPath string, s *Snapshot) {
|
|
||||||
parentPath := path.Join(rootPath, s.Url.Hostname)
|
|
||||||
urlPath := s.Url.Path
|
|
||||||
// If path is empty, add `index.gmi` as the file to save
|
|
||||||
if urlPath == "" || urlPath == "." {
|
|
||||||
urlPath = fmt.Sprintf("index.gmi")
|
|
||||||
}
|
|
||||||
// If path ends with '/' then add index.gmi for the
|
|
||||||
// directory to be created.
|
|
||||||
if strings.HasSuffix(urlPath, "/") {
|
|
||||||
urlPath = strings.Join([]string{urlPath, "index.gmi"}, "")
|
|
||||||
}
|
|
||||||
|
|
||||||
finalPath, err := calcFilePath(parentPath, urlPath)
|
|
||||||
if err != nil {
|
|
||||||
LogError("Error saving %s: %w", s.Url, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Ensure the directory exists
|
|
||||||
dir := filepath.Dir(finalPath)
|
|
||||||
if err := os.MkdirAll(dir, os.ModePerm); err != nil {
|
|
||||||
LogError("Failed to create directory: %w", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
err = os.WriteFile(finalPath, []byte((*s).Data), 0666)
|
|
||||||
if err != nil {
|
|
||||||
LogError("Error saving %s: %w", s.Url.Full, err)
|
|
||||||
}
|
|
||||||
LogInfo("[%s] Saved to %s", s.Url.Full, finalPath)
|
|
||||||
}
|
|
||||||
|
|||||||
78
gemini.go
78
gemini.go
@@ -4,12 +4,15 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
"os"
|
||||||
|
"path"
|
||||||
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
func checkStatusCode(code int) error {
|
func checkGeminiStatusCode(code int) error {
|
||||||
switch {
|
switch {
|
||||||
case code == 20:
|
case code == 20:
|
||||||
return nil
|
return nil
|
||||||
@@ -28,20 +31,36 @@ func checkStatusCode(code int) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func Process(snapshot *Snapshot) *Snapshot {
|
// Patterns used by parseHeaders. They are compiled once at package
// initialisation instead of on every call.
var (
	// geminiHeaderMimeRe matches "<digits><whitespace><mime type>" at the
	// start of a header line. The MIME type must be followed by ';',
	// whitespace or the end of the line.
	geminiHeaderMimeRe = regexp.MustCompile(`^\d+\s+([a-zA-Z0-9/\-+._]+)(?:[;\s]|$)`)
	// geminiHeaderLangRe matches a "lang=<value>" parameter anywhere in the
	// parameter section that follows the MIME type.
	geminiHeaderLangRe = regexp.MustCompile(`(?:^|[;\s])lang=([a-zA-Z0-9-]+)`)
)

// parseHeaders extracts the MIME type and the optional "lang" parameter from
// the response header, i.e. the first line of data.
//
// Only the first line is inspected, so text in the response body can never be
// mistaken for a header parameter. The "lang" parameter is found regardless of
// its position among the parameters (for example after "charset=utf-8").
//
// The status code itself is not interpreted here: whatever token follows the
// leading digits is returned as the MIME type.
//
// It returns two empty strings when the first line does not look like
// "<digits> <mime type>". The second result is empty when no "lang" parameter
// is present.
func parseHeaders(data string) (string, string) {
	// Restrict matching to the header line; tolerate both CRLF and bare LF.
	header, _, _ := strings.Cut(data, "\n")
	header = strings.TrimSuffix(header, "\r")

	loc := geminiHeaderMimeRe.FindStringSubmatchIndex(header)
	if loc == nil {
		return "", ""
	}
	mimetype := header[loc[2]:loc[3]]

	// Look for lang= only in what follows the MIME type.
	lang := ""
	if m := geminiHeaderLangRe.FindStringSubmatch(header[loc[3]:]); m != nil {
		lang = m[1]
	}
	return mimetype, lang
}
|
||||||
|
|
||||||
|
func ProcessHeaders(snapshot *Snapshot) *Snapshot {
|
||||||
|
LogDebug("[%s] Processing snapshot", snapshot.URL.String())
|
||||||
|
mimetype, lang := parseHeaders(snapshot.Data)
|
||||||
|
if mimetype != "" {
|
||||||
|
snapshot.MimeType = mimetype
|
||||||
|
}
|
||||||
|
if lang != "" {
|
||||||
|
snapshot.Lang = lang
|
||||||
|
}
|
||||||
|
return snapshot
|
||||||
|
}
|
||||||
|
|
||||||
|
func ProcessGemini(snapshot *Snapshot) *Snapshot {
|
||||||
code, err := ParseFirstTwoDigits(snapshot.Data)
|
code, err := ParseFirstTwoDigits(snapshot.Data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.Url.String())
|
snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.URL.String())
|
||||||
return snapshot
|
|
||||||
}
|
|
||||||
err = checkStatusCode(code)
|
|
||||||
if err != nil {
|
|
||||||
snapshot.Error = fmt.Errorf("[%s] Gemini response code error, skipping. %w", snapshot.Url.String(), err)
|
|
||||||
return snapshot
|
return snapshot
|
||||||
}
|
}
|
||||||
|
snapshot.ResponseCode = code
|
||||||
|
|
||||||
// Remove response code from body (first line)
|
// Remove response headers from body (first line)
|
||||||
index := strings.Index(snapshot.Data, "\n")
|
index := strings.Index(snapshot.Data, "\n")
|
||||||
if index != -1 {
|
if index != -1 {
|
||||||
snapshot.Data = snapshot.Data[index+1:]
|
snapshot.Data = snapshot.Data[index+1:]
|
||||||
@@ -49,23 +68,54 @@ func Process(snapshot *Snapshot) *Snapshot {
|
|||||||
|
|
||||||
// Grab any link lines
|
// Grab any link lines
|
||||||
linkLines := ExtractLinkLines(snapshot.Data)
|
linkLines := ExtractLinkLines(snapshot.Data)
|
||||||
LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines))
|
LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines))
|
||||||
|
|
||||||
// Normalize URLs in links, and store them in snapshot
|
// Normalize URLs in links, and store them in snapshot
|
||||||
for _, line := range linkLines {
|
for _, line := range linkLines {
|
||||||
normalizedLink, descr, error := NormalizeLink(line, snapshot.Url.String())
|
normalizedLink, descr, error := NormalizeLink(line, snapshot.URL.String())
|
||||||
if error != nil {
|
if error != nil {
|
||||||
LogError("[%s] Invalid link URL %w", snapshot.Url.String(), error)
|
LogError("[%s] Invalid link URL %w", snapshot.URL.String(), error)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
geminiUrl, error := ParseUrl(normalizedLink, descr)
|
geminiUrl, error := ParseUrl(normalizedLink, descr)
|
||||||
if error != nil {
|
if error != nil {
|
||||||
LogError("[%s] Unparseable gemini link %w", snapshot.Url.String(), error)
|
LogError("[%s] Unparseable gemini link %w", snapshot.URL.String(), error)
|
||||||
}
|
}
|
||||||
snapshot.Links = append(snapshot.Links, *geminiUrl)
|
snapshot.Links = append(snapshot.Links, *geminiUrl)
|
||||||
}
|
}
|
||||||
return snapshot
|
return snapshot
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SaveResult(rootPath string, s *Snapshot) {
|
||||||
|
parentPath := path.Join(rootPath, s.URL.Hostname)
|
||||||
|
urlPath := s.URL.Path
|
||||||
|
// If path is empty, add `index.gmi` as the file to save
|
||||||
|
if urlPath == "" || urlPath == "." {
|
||||||
|
urlPath = fmt.Sprintf("index.gmi")
|
||||||
|
}
|
||||||
|
// If path ends with '/' then add index.gmi for the
|
||||||
|
// directory to be created.
|
||||||
|
if strings.HasSuffix(urlPath, "/") {
|
||||||
|
urlPath = strings.Join([]string{urlPath, "index.gmi"}, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
finalPath, err := calcFilePath(parentPath, urlPath)
|
||||||
|
if err != nil {
|
||||||
|
LogError("Error saving %s: %w", s.URL, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Ensure the directory exists
|
||||||
|
dir := filepath.Dir(finalPath)
|
||||||
|
if err := os.MkdirAll(dir, os.ModePerm); err != nil {
|
||||||
|
LogError("Failed to create directory: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
err = os.WriteFile(finalPath, []byte((*s).Data), 0666)
|
||||||
|
if err != nil {
|
||||||
|
LogError("Error saving %s: %w", s.URL.Full, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
|
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
|
||||||
u, err := url.Parse(input)
|
u, err := url.Parse(input)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
56
main.go
56
main.go
@@ -10,28 +10,30 @@ import (
|
|||||||
zlog "github.com/rs/zerolog/log"
|
zlog "github.com/rs/zerolog/log"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var CONFIG Config
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
config := *getConfig()
|
CONFIG = *getConfig()
|
||||||
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
|
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
|
||||||
zerolog.SetGlobalLevel(config.logLevel)
|
zerolog.SetGlobalLevel(CONFIG.logLevel)
|
||||||
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
|
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: "[2006-01-02 15:04:05]"})
|
||||||
if err := runApp(&config); err != nil {
|
if err := runApp(); err != nil {
|
||||||
LogError("Application error: %w", err)
|
LogError("Application error: %w", err)
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runApp(config *Config) error {
|
func runApp() error {
|
||||||
// urls := []string{"gemini://smol.gr"}
|
// urls := []string{"gemini://smol.gr"}
|
||||||
urls := []string{"gemini://gmi.noulin.net/", "gemini://warmedal.se/~antenna/"}
|
urls := []string{"gemini://gmi.noulin.net/", "gemini://warmedal.se/~antenna/"}
|
||||||
|
|
||||||
queue := make(chan string, 10000)
|
queue := make(chan string, 1000)
|
||||||
results := make(chan Snapshot, 100)
|
results := make(chan Snapshot, 100)
|
||||||
done := make(chan struct{})
|
done := make(chan struct{})
|
||||||
|
|
||||||
go spawnStats(queue, results)
|
go spawnStats(queue, results)
|
||||||
go resultsHandler(queue, results)
|
go resultsHandler(queue, results)
|
||||||
spawnWorkers(config, queue, results)
|
spawnWorkers(CONFIG.numOfWorkers, queue, results)
|
||||||
|
|
||||||
for _, url := range urls {
|
for _, url := range urls {
|
||||||
queue <- url
|
queue <- url
|
||||||
@@ -44,18 +46,17 @@ func spawnStats(queue chan string, results chan Snapshot) {
|
|||||||
ticker := time.NewTicker(time.Duration(time.Second * 10))
|
ticker := time.NewTicker(time.Duration(time.Second * 10))
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
LogInfo("Queue length: %d\n", len(queue))
|
LogInfo("Queue length: %d", len(queue))
|
||||||
LogInfo("Results length: %d\n", len(results))
|
LogInfo("Results length: %d", len(results))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func spawnWorkers(config *Config, queue <-chan string, results chan Snapshot) {
|
func spawnWorkers(numOfWorkers int, queue <-chan string, results chan Snapshot) {
|
||||||
workers := config.numOfWorkers
|
LogInfo("Spawning %d workers", numOfWorkers)
|
||||||
LogInfo("Spawning %d workers", workers)
|
|
||||||
// Start worker goroutines
|
// Start worker goroutines
|
||||||
for i := 0; i < workers; i++ {
|
for i := 0; i < numOfWorkers; i++ {
|
||||||
go func(i int) {
|
go func(i int) {
|
||||||
worker(i, config.rootPath, queue, results)
|
worker(i, queue, results)
|
||||||
}(i)
|
}(i)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -63,9 +64,9 @@ func spawnWorkers(config *Config, queue <-chan string, results chan Snapshot) {
|
|||||||
func resultsHandler(queue chan string, results <-chan Snapshot) {
|
func resultsHandler(queue chan string, results <-chan Snapshot) {
|
||||||
for result := range results {
|
for result := range results {
|
||||||
if result.Error != nil {
|
if result.Error != nil {
|
||||||
LogError("[%s] %w", result.Url, result.Error)
|
LogError("[%s] %w", result.URL, result.Error)
|
||||||
} else {
|
} else {
|
||||||
LogDebug("[%s] Done", result.Url)
|
LogDebug("[%s] Done", result.URL)
|
||||||
for _, link := range result.Links {
|
for _, link := range result.Links {
|
||||||
if strings.HasPrefix(link.Full, "gemini://") {
|
if strings.HasPrefix(link.Full, "gemini://") {
|
||||||
go func(link GeminiUrl) {
|
go func(link GeminiUrl) {
|
||||||
@@ -74,12 +75,15 @@ func resultsHandler(queue chan string, results <-chan Snapshot) {
|
|||||||
}(link)
|
}(link)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// if result.MimeType == "text/gemini" {
|
||||||
|
// result.Data = ""
|
||||||
// fmt.Printf(SnapshotToJSON(result))
|
// fmt.Printf(SnapshotToJSON(result))
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func worker(id int, rootPath string, queue <-chan string, results chan Snapshot) {
|
func worker(id int, queue <-chan string, results chan Snapshot) {
|
||||||
for url := range queue {
|
for url := range queue {
|
||||||
LogDebug("Worker %d visiting %s", id, url)
|
LogDebug("Worker %d visiting %s", id, url)
|
||||||
result := Visit(url)
|
result := Visit(url)
|
||||||
@@ -90,14 +94,28 @@ func worker(id int, rootPath string, queue <-chan string, results chan Snapshot)
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
LogDebug("Worker %d processing %s", id, url)
|
LogDebug("Worker %d processing %s", id, url)
|
||||||
result = Process(result)
|
result = ProcessHeaders(result)
|
||||||
if result.Error != nil {
|
if result.Error != nil {
|
||||||
results <- *result
|
results <- *result
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
LogDebug("Worker %d saving %s", id, url)
|
if result.MimeType == "text/gemini" {
|
||||||
SaveResult(rootPath, result)
|
result = ProcessGemini(result)
|
||||||
|
}
|
||||||
|
if shouldPersist(result) {
|
||||||
|
LogInfo("Worker %d saving %s", id, url)
|
||||||
|
SaveResult(CONFIG.rootPath, result)
|
||||||
|
}
|
||||||
results <- *result
|
results <- *result
|
||||||
time.Sleep(time.Duration(rand.IntN(5)) * time.Second)
|
time.Sleep(time.Duration(rand.IntN(5)) * time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldPersist(result *Snapshot) bool {
|
||||||
|
if result.MimeType == "text/gemini" ||
|
||||||
|
strings.HasPrefix(result.MimeType, "image/") ||
|
||||||
|
strings.HasPrefix(result.MimeType, "text/") {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|||||||
13
network.go
13
network.go
@@ -13,7 +13,7 @@ func Visit(url string) (result *Snapshot) {
|
|||||||
// Wrap error with additional information
|
// Wrap error with additional information
|
||||||
defer func() {
|
defer func() {
|
||||||
if result.Error != nil {
|
if result.Error != nil {
|
||||||
result.Error = fmt.Errorf("[%s] Error: %w", result.Url, result.Error)
|
result.Error = fmt.Errorf("[%s] Error: %w", result.URL, result.Error)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@@ -22,9 +22,9 @@ func Visit(url string) (result *Snapshot) {
|
|||||||
result.Error = err
|
result.Error = err
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
result.Url = *geminiUrl
|
result.URL = *geminiUrl
|
||||||
|
|
||||||
LogInfo("[%s] Connecting", geminiUrl)
|
LogDebug("[%s] Connecting", geminiUrl)
|
||||||
|
|
||||||
// Establish a TLS connection
|
// Establish a TLS connection
|
||||||
tlsConfig := &tls.Config{
|
tlsConfig := &tls.Config{
|
||||||
@@ -40,11 +40,12 @@ func Visit(url string) (result *Snapshot) {
|
|||||||
defer func() {
|
defer func() {
|
||||||
err := conn.Close()
|
err := conn.Close()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
result.Error = fmt.Errorf("[%s] Closing connection error, ignoring: %w", result.Url.String(), err)
|
result.Error = fmt.Errorf("[%s] Closing connection error, ignoring: %w", result.URL.String(), err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// Read data from the connection
|
// Read data from the connection
|
||||||
|
// TODO make timeout configurable
|
||||||
conn.SetReadDeadline(time.Now().Add(5 * time.Second))
|
conn.SetReadDeadline(time.Now().Add(5 * time.Second))
|
||||||
buf := make([]byte, 1024)
|
buf := make([]byte, 1024)
|
||||||
var data []byte
|
var data []byte
|
||||||
@@ -56,6 +57,10 @@ func Visit(url string) (result *Snapshot) {
|
|||||||
if n > 0 {
|
if n > 0 {
|
||||||
data = append(data, buf[:n]...)
|
data = append(data, buf[:n]...)
|
||||||
}
|
}
|
||||||
|
if len(data) > CONFIG.maxResponseSize {
|
||||||
|
result.Error = fmt.Errorf("Response size exceeded maximum of %d bytes", CONFIG.maxResponseSize)
|
||||||
|
return result
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err == io.EOF {
|
if err == io.EOF {
|
||||||
break
|
break
|
||||||
|
|||||||
15
snapshot.go
15
snapshot.go
@@ -7,19 +7,24 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type Snapshot struct {
|
type Snapshot struct {
|
||||||
Url GeminiUrl `json:"url,omitempty"`
|
UID string `json:"uid,omitempty"`
|
||||||
|
URL GeminiUrl `json:"url,omitempty"`
|
||||||
Timestamp time.Time `json:"timestamp,omitempty"`
|
Timestamp time.Time `json:"timestamp,omitempty"`
|
||||||
|
MimeType string `json:"mimetype,omitempty"`
|
||||||
Data string `json:"data,omitempty"`
|
Data string `json:"data,omitempty"`
|
||||||
Links []GeminiUrl `json:"links,omitempty"`
|
Links []GeminiUrl `json:"links,omitempty"`
|
||||||
Code int `json:"code,omitempty"`
|
Lang string `json:"lang,omitempty"`
|
||||||
|
// Gemini status code
|
||||||
|
ResponseCode int `json:"code,omitempty"`
|
||||||
|
// On network errors, for Gemini server errors
|
||||||
|
// we have ResponseCode above.
|
||||||
Error error `json:"error,omitempty"`
|
Error error `json:"error,omitempty"`
|
||||||
UID string `json:"uid,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (u Snapshot) String() string {
|
func (u Snapshot) String() string {
|
||||||
return fmt.Sprintf(
|
return fmt.Sprintf(
|
||||||
"[%s] %s %s %s %d %s",
|
"[%s] %s %s %s %d %s %s %s",
|
||||||
u.UID, u.Url, u.Timestamp, u.Links, u.Code, u.Error,
|
u.UID, u.URL, u.Timestamp, u.Links, u.ResponseCode, u.MimeType, u.Lang, u.Error,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user