Break up Gemtext link parsing code and improve tests.

This commit is contained in:
2025-01-16 09:38:28 +02:00
parent 982fb75bd2
commit ea92b90c62
6 changed files with 231 additions and 93 deletions

View File

@@ -2,103 +2,12 @@ package gemini
import (
"fmt"
"gemini-grc/common"
"net/url"
"regexp"
"strconv"
"gemini-grc/logging"
"gemini-grc/common"
)
// GetPageLinks returns every link found in gemtext as an absolute URL,
// resolved against currentURL.
//
// Link lines are located with ExtractLinkLines and converted one at a time
// with NormalizeLink. A line that NormalizeLink rejects is reported through
// logging.LogDebug and skipped, so one malformed link does not discard the
// rest of the page. The result is nil when the document has no link lines
// (or when none of them could be normalized).
func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList {
	lines := ExtractLinkLines(gemtext)
	if len(lines) == 0 {
		return nil
	}

	// The base URL string is the same for every link on the page.
	base := currentURL.String()

	var links common.LinkList
	for i := range lines {
		resolved, err := NormalizeLink(lines[i], base)
		if err != nil {
			logging.LogDebug("%s: %s", common.ErrGeminiLinkLineParse, err)
			continue
		}
		links = append(links, *resolved)
	}
	return links
}
// gemtextLinkLineRe matches one complete Gemtext link line: "=>" at the very
// start of a line, optional spaces/tabs, then everything up to the newline.
// The whitespace after "=>" is optional in the Gemtext grammar, so
// "=>gemini://host/" is a link line too. Compiled once at package scope so
// ExtractLinkLines does not recompile the pattern on every call.
var gemtextLinkLineRe = regexp.MustCompile(`(?m)^=>[ \t]*.*`)

// ExtractLinkLines takes a Gemtext document as a string and returns, in
// document order, every line that is a link line (a line beginning with "=>").
// Each returned string is the full line without its trailing "\n". It returns
// nil when the document contains no link lines.
//
// NOTE(review): matching is purely line-based, so link-looking lines inside a
// preformatted block (between "```" toggle lines) are returned as well, even
// though the Gemtext spec treats those as plain text. Consider filtering them
// if that matters to callers.
func ExtractLinkLines(gemtext string) []string {
	return gemtextLinkLineRe.FindAllString(gemtext, -1)
}
// gemtextLinkPartsRe splits a Gemtext link line into its parts:
//
//	group 1: the URL (first run of non-whitespace after "=>")
//	group 2: the optional description, without the separating spaces/tabs
//
// The whitespace between "=>" and the URL is optional in the Gemtext grammar,
// and the URL/description separator may be any run of spaces or tabs.
// Compiled once at package scope so NormalizeLink does not recompile it on
// every call.
var gemtextLinkPartsRe = regexp.MustCompile(`^=>[ \t]*(\S+)(?:[ \t]+(.*))?`)

// NormalizeLink takes a single Gemtext link line and the URL of the page it
// was found on, and returns the link target converted to an absolute URL.
// The text following the URL on the link line is handed to common.ParseURL
// as its second argument (the link description).
//
// Relative link targets are resolved against currentURL; absolute targets are
// kept as they are.
//
// Errors:
//   - common.ErrURLParse: currentURL or the link target cannot be parsed, or
//     common.ParseURL rejects the resolved URL.
//   - common.ErrGeminiLinkLineParse: linkLine is not of the form
//     "=>[whitespace]URL[whitespace description]".
//   - common.ErrURLDecode: the link target contains an invalid percent-escape.
func NormalizeLink(linkLine string, currentURL string) (*common.URL, error) {
	baseURL, err := url.Parse(currentURL)
	if err != nil {
		return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
	}

	matches := gemtextLinkPartsRe.FindStringSubmatch(linkLine)
	if matches == nil {
		// Not a well-formed link line; report it rather than guessing.
		return nil, fmt.Errorf("%w for link line %s", common.ErrGeminiLinkLineParse, linkLine)
	}
	// On a match FindStringSubmatch always returns one entry per group, so
	// matches[2] exists and is "" when the line has no description.
	rawURL, description := matches[1], matches[2]

	// Validation only: reject targets with malformed percent-escapes. The
	// decoded form is deliberately not used, so the URL keeps its escaping.
	if _, err := url.QueryUnescape(rawURL); err != nil {
		return nil, fmt.Errorf("%w: %w", common.ErrURLDecode, err)
	}

	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
	}

	// Resolve relative references (including scheme-relative ones) against
	// the page URL.
	if !parsedURL.IsAbs() {
		parsedURL = baseURL.ResolveReference(parsedURL)
	}

	finalURL, err := common.ParseURL(parsedURL.String(), description)
	if err != nil {
		return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err)
	}
	return finalURL, nil
}
// ParseFirstTwoDigits takes a string and returns the first one or two digits as an int.
// If no valid digits are found, it returns an error.
func ParseFirstTwoDigits(input string) (int, error) {