Lots of features, first version that reliably crawls Geminispace.

- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host
- [x] URL Blacklist
- [x] Save image/* and text/* files
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
2024-10-21 20:03:28 +03:00
parent 212345764b
commit fee7d3e01c
37 changed files with 1231 additions and 319 deletions
--- a/gemini/gemini.go
+++ b/gemini/gemini.go
@@ -0,0 +1,186 @@
+package gemini
+
+import (
+	"errors"
+	"fmt"
+	"gemini-grc/logging"
+	"net/url"
+	go_url "net/url"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+func isGeminiURL(url string) bool {
+	_, err := go_url.Parse(url)
+	if err != nil {
+		logging.LogWarn("[%s] Invalid URL: %v", url, err)
+		return false
+	}
+	return strings.HasPrefix(url, "gemini://")
+}
+
+// parseLinks sends the full URL of every link in s that starts with
+// "gemini://" to queue.
+//
+// A snapshot without links (s.Links == nil) is a no-op; dereferencing the
+// nil list would otherwise panic.
+//
+// Each send happens in its own goroutine so this function returns without
+// waiting for a receiver.
+// NOTE(review): those goroutines stay blocked until something receives from
+// queue; confirm the consumer always drains it.
+func parseLinks(s Snapshot, queue chan string) {
+	if s.Links == nil {
+		return
+	}
+	for _, link := range *s.Links {
+		if !strings.HasPrefix(link.Full, "gemini://") {
+			continue
+		}
+		// Pass link as an argument so each goroutine owns its own copy.
+		go func(link GeminiUrl) {
+			queue <- link.Full
+		}(link)
+	}
+}
+
+// checkGeminiStatusCode translates a Gemini response status code into an
+// error. Only the exact code 20 yields nil. Codes in the 1x, 3x, 4x, 5x and
+// 6x ranges produce an error describing that range; every other value
+// (including 21-29) is reported as unexpected/unhandled.
+func checkGeminiStatusCode(code int) error {
+	if code == 20 {
+		return nil
+	}
+	// The tens digit selects the status family. Values below 10, at or
+	// above 70, and the remaining 2x codes all fall through to default.
+	switch code / 10 {
+	case 1:
+		return fmt.Errorf("Gemini response %d needs data input", code)
+	case 3:
+		return fmt.Errorf("Gemini response %d redirect", code)
+	case 4:
+		return fmt.Errorf("Gemini response %d server error", code)
+	case 5:
+		return fmt.Errorf("Gemini response %d server permanent error", code)
+	case 6:
+		return fmt.Errorf("Gemini response %d certificate error", code)
+	default:
+		return fmt.Errorf("Unexpected/unhandled Gemini response %d", code)
+	}
+}
+
+// ProcessGemini collects the link lines of the snapshot's Gemtext body,
+// converts each one to an absolute URL, and appends the parsed result to
+// snapshot.Links. Lines that cannot be normalized or parsed are logged as
+// warnings and skipped. The same snapshot pointer is returned.
+func ProcessGemini(snapshot *Snapshot) *Snapshot {
+	lines := ExtractLinkLines(snapshot.GemText.String)
+	logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(lines))
+
+	for _, rawLine := range lines {
+		absolute, description, err := NormalizeLink(rawLine, snapshot.URL.String())
+		if err != nil {
+			logging.LogWarn("Cannot normalize URL in line '%s': %v", rawLine, err)
+			continue
+		}
+
+		parsed, err := ParseUrl(absolute, description)
+		if err != nil {
+			logging.LogWarn("Cannot parse URL in link '%s': %v", rawLine, err)
+			continue
+		}
+
+		// The list is created lazily, on the first link that parses.
+		if snapshot.Links == nil {
+			snapshot.Links = &LinkList{}
+		}
+		*snapshot.Links = append(*snapshot.Links, *parsed)
+	}
+	return snapshot
+}
+
+// ParseUrl parses input and returns a GeminiUrl holding its scheme,
+// hostname, port, path, the given description and the re-serialized full
+// URL. When input carries no port, 1965 is used. An error is returned if
+// input is not a valid URL or its port is not an integer.
+func ParseUrl(input string, descr string) (*GeminiUrl, error) {
+	parsed, err := url.Parse(input)
+	if err != nil {
+		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
+	}
+
+	// Fall back to the default port when the URL does not name one.
+	portText := parsed.Port()
+	if portText == "" {
+		portText = "1965"
+	}
+	portNumber, err := strconv.Atoi(portText)
+	if err != nil {
+		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
+	}
+
+	result := &GeminiUrl{
+		Protocol: parsed.Scheme,
+		Hostname: parsed.Hostname(),
+		Port:     portNumber,
+		Path:     parsed.Path,
+		Descr:    descr,
+		Full:     parsed.String(),
+	}
+	return result, nil
+}
+
+// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
+func ExtractLinkLines(gemtext string) []string {
+	// Define the regular expression pattern to match link lines
+	re := regexp.MustCompile(`(?m)^=>[ \t]+.*`)
+
+	// Find all matches using the regular expression
+	matches := re.FindAllString(gemtext, -1)
+
+	return matches
+}
+
+// Take a single link line and the current URL,
+// return the URL converted to an absolute URL
+// and its description.
+func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
+	// Parse the current URL
+	baseURL, err := url.Parse(currentURL)
+	if err != nil {
+		return "", "", fmt.Errorf("Invalid current URL: %v", err)
+	}
+
+	// Regular expression to extract the URL part from a link line
+	re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)
+
+	// Use regex to extract the URL and the rest of the line
+	matches := re.FindStringSubmatch(linkLine)
+	if len(matches) == 0 {
+		// If the line doesn't match the expected format, return it unchanged
+		return "", "", fmt.Errorf("Not a link line: %v", linkLine)
+	}
+
+	originalURLStr := matches[1]
+	_, err = url.QueryUnescape(originalURLStr)
+	if err != nil {
+		return "", "", fmt.Errorf("Error decoding URL: %w", err)
+	}
+
+	restOfLine := ""
+	if len(matches) > 2 {
+		restOfLine = matches[2]
+	}
+
+	// Parse the URL from the link line
+	parsedURL, err := url.Parse(originalURLStr)
+	if err != nil {
+		// If URL parsing fails, return an error
+		return "", "", fmt.Errorf("Invalid URL '%s': %v", originalURLStr, err)
+	}
+
+	// Resolve relative URLs against the base URL
+	if !parsedURL.IsAbs() {
+		parsedURL = baseURL.ResolveReference(parsedURL)
+	}
+
+	// Construct the canonicalized link line
+	canonicalURLStr := parsedURL.String()
+
+	// Remove usual first space from URL description:
+	// => URL description
+	//       ^^^^^^^^^^^^
+	if len(restOfLine) > 0 && restOfLine[0] == ' ' {
+		restOfLine = restOfLine[1:]
+	}
+
+	return canonicalURLStr, restOfLine, nil
+	//	canonicalizedLine := fmt.Sprintf("=> %s%s", canonicalURLStr, restOfLine)
+	// return canonicalizedLine, nil
+}
+
+// ParseFirstTwoDigits takes a string and returns the first one or two digits as an int.
+// If no valid digits are found, it returns an error.
+func ParseFirstTwoDigits(input string) (int, error) {
+	// Define the regular expression pattern to match one or two leading digits
+	re := regexp.MustCompile(`^(\d{1,2})`)
+
+	// Find the first match in the string
+	matches := re.FindStringSubmatch(input)
+	if len(matches) == 0 {
+		return 0, errors.New("no digits found at the beginning of the string")
+	}
+
+	// Parse the captured match as an integer
+	snapshot, err := strconv.Atoi(matches[1])
+	if err != nil {
+		return 0, fmt.Errorf("failed to convert matched digits to int: %v", err)
+	}
+
+	return snapshot, nil
+}