Files
gemini-grs/gemini/gemini.go
antanst cd60c1363b Lots of features, first version that reliably crawls Geminispace.
- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host
- [x] URL Blacklist
- [x] Save image/* and text/* files
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
.

.

.
2024-10-21 20:04:09 +03:00

187 lines
5.3 KiB
Go

package gemini
import (
"errors"
"fmt"
"gemini-grc/logging"
"net/url"
go_url "net/url"
"regexp"
"strconv"
"strings"
)
// isGeminiURL reports whether url parses as a URL and starts with the literal
// prefix "gemini://". A string that fails to parse is logged at warning level
// and rejected.
func isGeminiURL(url string) bool {
	// The parameter shadows the "net/url" package name, hence the go_url alias.
	if _, parseErr := go_url.Parse(url); parseErr != nil {
		logging.LogWarn("[%s] Invalid URL: %v", url, parseErr)
		return false
	}
	hasGeminiPrefix := strings.HasPrefix(url, "gemini://")
	return hasGeminiPrefix
}
// parseLinks sends the full URL of every link in s that starts with
// "gemini://" to queue.
//
// Each send runs in its own goroutine, so this function returns without
// waiting for a receiver; a goroutine stays blocked until its value is read
// from queue.
func parseLinks(s Snapshot, queue chan string) {
	// Links is a pointer that is only set once a link has been stored, so a
	// snapshot without links carries nil here and must not be dereferenced.
	if s.Links == nil {
		return
	}
	for _, link := range *s.Links {
		if !strings.HasPrefix(link.Full, "gemini://") {
			continue
		}
		// The link is passed as an argument so every goroutine owns its copy.
		go func(link GeminiUrl) {
			queue <- link.Full
		}(link)
	}
}
// checkGeminiStatusCode maps a Gemini response status code to an error.
//
// Only the exact code 20 is accepted and yields nil. Codes in the 1x, 3x, 4x,
// 5x and 6x ranges produce an error naming their category; every other value
// (including 21-29 and anything outside 10-69) is reported as
// unexpected/unhandled.
func checkGeminiStatusCode(code int) error {
	if code == 20 {
		return nil
	}

	var category string
	// Integer division groups the two-digit codes by their leading digit.
	switch code / 10 {
	case 1:
		category = "needs data input"
	case 3:
		category = "redirect"
	case 4:
		category = "server error"
	case 5:
		category = "server permanent error"
	case 6:
		category = "certificate error"
	default:
		return fmt.Errorf("Unexpected/unhandled Gemini response %d", code)
	}
	return fmt.Errorf("Gemini response %d %s", code, category)
}
// ProcessGemini collects the links found in the snapshot's Gemtext body.
//
// Every link line is converted to an absolute URL (relative to the
// snapshot's own URL), parsed, and appended to snapshot.Links. Lines that
// cannot be normalized or parsed are logged at warning level and skipped.
// The snapshot passed in is modified in place and returned.
func ProcessGemini(snapshot *Snapshot) *Snapshot {
	candidates := ExtractLinkLines(snapshot.GemText.String)
	logging.LogDebug("[%s] Found %d links", snapshot.URL.String(), len(candidates))

	for _, candidate := range candidates {
		absolute, label, normErr := NormalizeLink(candidate, snapshot.URL.String())
		if normErr != nil {
			logging.LogWarn("Cannot normalize URL in line '%s': %v", candidate, normErr)
			continue
		}

		parsed, parseErr := ParseUrl(absolute, label)
		if parseErr != nil {
			logging.LogWarn("Cannot parse URL in link '%s': %v", candidate, parseErr)
			continue
		}

		// The list is created lazily, only once the first usable link shows up.
		if snapshot.Links == nil {
			snapshot.Links = &LinkList{}
		}
		*snapshot.Links = append(*snapshot.Links, *parsed)
	}

	return snapshot
}
// ParseUrl splits input into its components and returns them as a GeminiUrl
// carrying descr as the link description.
//
// When input names no port, 1965 is used. An error is returned if input
// cannot be parsed as a URL or its port is not an integer.
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
	parsed, err := url.Parse(input)
	if err != nil {
		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
	}

	portText := parsed.Port()
	if portText == "" {
		portText = "1965"
	}
	portNumber, err := strconv.Atoi(portText)
	if err != nil {
		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
	}

	result := &GeminiUrl{
		Protocol: parsed.Scheme,
		Hostname: parsed.Hostname(),
		Port:     portNumber,
		Path:     parsed.Path,
		Descr:    descr,
		Full:     parsed.String(),
	}
	return result, nil
}
// linkLinePattern matches every line that begins with "=>". The Gemtext
// specification makes the whitespace between "=>" and the URL optional, so no
// whitespace is required here. It is compiled once at package level rather
// than on every call.
var linkLinePattern = regexp.MustCompile(`(?m)^=>.*`)

// ExtractLinkLines takes a Gemtext document as a string and returns all lines
// that are link lines, i.e. lines starting with "=>", in document order.
// Each returned line excludes its terminating newline. The result is nil when
// the document contains no link lines.
func ExtractLinkLines(gemtext string) []string {
	return linkLinePattern.FindAllString(gemtext, -1)
}
// linkLinePartsPattern splits a link line into its URL (group 1) and the
// remainder of the line (group 2, including its leading whitespace). The
// whitespace after "=>" is optional, as allowed by the Gemtext specification.
// It is compiled once at package level rather than on every call.
var linkLinePartsPattern = regexp.MustCompile(`^=>[ \t]*(\S+)([ \t]+.*)?`)

// NormalizeLink takes a single link line and the URL of the document it was
// found in, and returns the link's URL converted to an absolute URL together
// with its description.
//
// A relative URL is resolved against currentURL; an absolute URL is returned
// as parsed. One leading space is removed from the description. An error is
// returned when currentURL is not a valid URL, when linkLine is not a link
// line, or when the link's URL has invalid percent-escapes or cannot be
// parsed. Underlying errors are wrapped and can be inspected with errors.Is
// and errors.As.
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
	baseURL, err := url.Parse(currentURL)
	if err != nil {
		return "", "", fmt.Errorf("Invalid current URL: %w", err)
	}

	matches := linkLinePartsPattern.FindStringSubmatch(linkLine)
	if matches == nil {
		return "", "", fmt.Errorf("Not a link line: %v", linkLine)
	}
	// A successful match always has both groups; group 2 is "" when absent.
	rawURL, rest := matches[1], matches[2]

	// The decoded value is not needed; the call only rejects URLs with
	// malformed percent-escapes.
	if _, err = url.QueryUnescape(rawURL); err != nil {
		return "", "", fmt.Errorf("Error decoding URL: %w", err)
	}

	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return "", "", fmt.Errorf("Invalid URL '%s': %w", rawURL, err)
	}
	if !parsedURL.IsAbs() {
		parsedURL = baseURL.ResolveReference(parsedURL)
	}

	// Remove the usual single space separating the URL from its description:
	// => URL description
	//       ^
	rest = strings.TrimPrefix(rest, " ")

	return parsedURL.String(), rest, nil
}
// leadingDigitsPattern captures one or two ASCII digits at the very start of
// a string. It is compiled once at package level rather than on every call.
var leadingDigitsPattern = regexp.MustCompile(`^(\d{1,2})`)

// ParseFirstTwoDigits takes a string and returns the first one or two digits
// as an int. Only digits at the very beginning of the string count; anything
// after the second digit is ignored. If the string does not start with a
// digit, it returns an error.
func ParseFirstTwoDigits(input string) (int, error) {
	matches := leadingDigitsPattern.FindStringSubmatch(input)
	if matches == nil {
		return 0, errors.New("no digits found at the beginning of the string")
	}

	value, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0, fmt.Errorf("failed to convert matched digits to int: %w", err)
	}
	return value, nil
}