Work on header parsing & saving other files

This commit is contained in:
2024-10-08 18:16:47 +03:00
parent 7e849feffe
commit 91f8e69fdf
7 changed files with 138 additions and 80 deletions

View File

@@ -4,12 +4,15 @@ import (
"errors"
"fmt"
"net/url"
"os"
"path"
"path/filepath"
"regexp"
"strconv"
"strings"
)
func checkStatusCode(code int) error {
func checkGeminiStatusCode(code int) error {
switch {
case code == 20:
return nil
@@ -28,20 +31,36 @@ func checkStatusCode(code int) error {
}
}
func Process(snapshot *Snapshot) *Snapshot {
LogDebug("[%s] Processing snapshot", snapshot.Url.String())
func parseHeaders(data string) (string, string) {
re := regexp.MustCompile(`^\d+\s+([a-zA-Z0-9/\-+]+)[;\s]+(lang=([a-zA-Z0-9-]+))?`)
matches := re.FindStringSubmatch(data)
if matches == nil || len(matches) <= 1 {
return "", ""
}
return matches[1], matches[3]
}
func ProcessHeaders(snapshot *Snapshot) *Snapshot {
LogDebug("[%s] Processing snapshot", snapshot.URL.String())
mimetype, lang := parseHeaders(snapshot.Data)
if mimetype != "" {
snapshot.MimeType = mimetype
}
if lang != "" {
snapshot.Lang = lang
}
return snapshot
}
func ProcessGemini(snapshot *Snapshot) *Snapshot {
code, err := ParseFirstTwoDigits(snapshot.Data)
if err != nil {
snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.Url.String())
return snapshot
}
err = checkStatusCode(code)
if err != nil {
snapshot.Error = fmt.Errorf("[%s] Gemini response code error, skipping. %w", snapshot.Url.String(), err)
snapshot.Error = fmt.Errorf("[%s] No/invalid gemini response code", snapshot.URL.String())
return snapshot
}
snapshot.ResponseCode = code
// Remove response code from body (first line)
// Remove response headers from body (first line)
index := strings.Index(snapshot.Data, "\n")
if index != -1 {
snapshot.Data = snapshot.Data[index+1:]
@@ -49,23 +68,54 @@ func Process(snapshot *Snapshot) *Snapshot {
// Grab any link lines
linkLines := ExtractLinkLines(snapshot.Data)
LogDebug("[%s] Found %d links", snapshot.Url.String(), len(linkLines))
LogDebug("[%s] Found %d links", snapshot.URL.String(), len(linkLines))
// Normalize URLs in links, and store them in snapshot
for _, line := range linkLines {
normalizedLink, descr, error := NormalizeLink(line, snapshot.Url.String())
normalizedLink, descr, error := NormalizeLink(line, snapshot.URL.String())
if error != nil {
LogError("[%s] Invalid link URL %w", snapshot.Url.String(), error)
LogError("[%s] Invalid link URL %w", snapshot.URL.String(), error)
continue
}
geminiUrl, error := ParseUrl(normalizedLink, descr)
if error != nil {
LogError("[%s] Unparseable gemini link %w", snapshot.Url.String(), error)
LogError("[%s] Unparseable gemini link %w", snapshot.URL.String(), error)
}
snapshot.Links = append(snapshot.Links, *geminiUrl)
}
return snapshot
}
func SaveResult(rootPath string, s *Snapshot) {
parentPath := path.Join(rootPath, s.URL.Hostname)
urlPath := s.URL.Path
// If path is empty, add `index.gmi` as the file to save
if urlPath == "" || urlPath == "." {
urlPath = fmt.Sprintf("index.gmi")
}
// If path ends with '/' then add index.gmi for the
// directory to be created.
if strings.HasSuffix(urlPath, "/") {
urlPath = strings.Join([]string{urlPath, "index.gmi"}, "")
}
finalPath, err := calcFilePath(parentPath, urlPath)
if err != nil {
LogError("Error saving %s: %w", s.URL, err)
return
}
// Ensure the directory exists
dir := filepath.Dir(finalPath)
if err := os.MkdirAll(dir, os.ModePerm); err != nil {
LogError("Failed to create directory: %w", err)
return
}
err = os.WriteFile(finalPath, []byte((*s).Data), 0666)
if err != nil {
LogError("Error saving %s: %w", s.URL.Full, err)
}
}
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
u, err := url.Parse(input)
if err != nil {