From 4a345a176394588d7d23ab11b50eeb480dce9dca Mon Sep 17 00:00:00 2001 From: antanst Date: Thu, 16 Jan 2025 09:38:28 +0200 Subject: [PATCH] Break up Gemtext link parsing code and improve tests. --- gemini/gemini.go | 93 +-------------------------- gemini/geminiLinks.go | 88 ++++++++++++++++++++++++++ gemini/geminiLinks_test.go | 125 +++++++++++++++++++++++++++++++++++++ gemini/gemini_test.go | 3 +- go.mod | 6 ++ go.sum | 9 +++ 6 files changed, 231 insertions(+), 93 deletions(-) create mode 100644 gemini/geminiLinks.go create mode 100644 gemini/geminiLinks_test.go diff --git a/gemini/gemini.go b/gemini/gemini.go index eceefe2..023e0f0 100644 --- a/gemini/gemini.go +++ b/gemini/gemini.go @@ -2,103 +2,12 @@ package gemini import ( "fmt" - "gemini-grc/common" - "net/url" "regexp" "strconv" - "gemini-grc/logging" + "gemini-grc/common" ) -func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList { - // Grab link lines - linkLines := ExtractLinkLines(gemtext) - if len(linkLines) == 0 { - return nil - } - var linkURLs common.LinkList - // Normalize URLs in links, and store them in snapshot - for _, line := range linkLines { - linkURL, err := NormalizeLink(line, currentURL.String()) - if err != nil { - logging.LogDebug("%s: %s", common.ErrGeminiLinkLineParse, err) - continue - } - linkURLs = append(linkURLs, *linkURL) - } - return linkURLs -} - -// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines -func ExtractLinkLines(gemtext string) []string { - // Define the regular expression pattern to match link lines - re := regexp.MustCompile(`(?m)^=>[ \t]+.*`) - - // Find all matches using the regular expression - matches := re.FindAllString(gemtext, -1) - - return matches -} - -// NormalizeLink takes a single link line and the current URL, -// return the URL converted to an absolute URL -// and its description. -func NormalizeLink(linkLine string, currentURL string) (*common.URL, error) { - // Parse the current URL - baseURL, err := url.Parse(currentURL) - if err != nil { - return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) - } - - // Regular expression to extract the URL part from a link line - re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`) - - // Use regex to extract the URL and the rest of the line - matches := re.FindStringSubmatch(linkLine) - if len(matches) == 0 { - // If the line doesn't match the expected format, return it unchanged - return nil, fmt.Errorf("%w for link line %s", common.ErrGeminiLinkLineParse, linkLine) - } - - originalURLStr := matches[1] - _, err = url.QueryUnescape(originalURLStr) - if err != nil { - return nil, fmt.Errorf("%w: %w", common.ErrURLDecode, err) - } - - restOfLine := "" - if len(matches) > 2 { - restOfLine = matches[2] - } - - // Parse the URL from the link line - parsedURL, err := url.Parse(originalURLStr) - if err != nil { - // If URL parsing fails, return an error - return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) - } - - // Resolve relative URLs against the base URL - if !parsedURL.IsAbs() { - parsedURL = baseURL.ResolveReference(parsedURL) - } - - // Remove usual first space from URL description: - // => URL description - // ^^^^^^^^^^^^ - if len(restOfLine) > 0 && restOfLine[0] == ' ' { - restOfLine = restOfLine[1:] - } - - finalURL, err := common.ParseURL(parsedURL.String(), restOfLine) - if err != nil { - // If URL parsing fails, return an error - return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) - } - - return finalURL, nil -} - // ParseFirstTwoDigits takes a string and returns the first one or two digits as an int. // If no valid digits are found, it returns an error. func ParseFirstTwoDigits(input string) (int, error) { diff --git a/gemini/geminiLinks.go b/gemini/geminiLinks.go new file mode 100644 index 0000000..050729a --- /dev/null +++ b/gemini/geminiLinks.go @@ -0,0 +1,88 @@ +package gemini + +import ( + "fmt" + "net/url" + "regexp" + + "gemini-grc/common" + "gemini-grc/logging" + "gemini-grc/util" +) + +func GetPageLinks(currentURL common.URL, gemtext string) common.LinkList { + linkLines := util.GetLinesMatchingRegex(gemtext, `(?m)^=>[ \t]+.*`) + if len(linkLines) == 0 { + return nil + } + var linkURLs common.LinkList + // Normalize URLs in links + for _, line := range linkLines { + linkUrl, err := ParseGeminiLinkLine(line, currentURL.String()) + if err != nil { + logging.LogDebug("%s: %s", common.ErrGeminiLinkLineParse, err) + continue + } + linkURLs = append(linkURLs, *linkUrl) + } + return linkURLs +} + +// ParseGeminiLinkLine takes a single link line and the current URL, +// return the URL converted to an absolute URL +// and its description. +func ParseGeminiLinkLine(linkLine string, currentURL string) (*common.URL, error) { + // Check: currentURL is parseable + baseURL, err := url.Parse(currentURL) + if err != nil { + return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) + } + + // Extract the actual URL and the description + re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`) + matches := re.FindStringSubmatch(linkLine) + if len(matches) == 0 { + // If the line doesn't match the expected format, return it unchanged + return nil, fmt.Errorf("%w could not parse gemini link %s", common.ErrGeminiLinkLineParse, linkLine) + } + + originalURLStr := matches[1] + + // Check: Unescape the URL if escaped + _, err = url.QueryUnescape(originalURLStr) + if err != nil { + return nil, fmt.Errorf("%w: %w", common.ErrURLDecode, err) + } + + description := "" + if len(matches) > 2 { + description = matches[2] + } + + // Parse the URL from the link line + parsedURL, err := url.Parse(originalURLStr) + if err != nil { + // If URL parsing fails, return an error + return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) + } + + // If link URL is relative, resolve full URL + if !parsedURL.IsAbs() { + parsedURL = baseURL.ResolveReference(parsedURL) + } + + // Remove usual first space from URL description: + // => URL description + // ^^^^^^^^^^^^ + if len(description) > 0 && description[0] == ' ' { + description = description[1:] + } + + finalURL, err := common.ParseURL(parsedURL.String(), description) + if err != nil { + // If URL parsing fails, return an error + return nil, fmt.Errorf("%w: %w", common.ErrURLParse, err) + } + + return finalURL, nil +} diff --git a/gemini/geminiLinks_test.go b/gemini/geminiLinks_test.go new file mode 100644 index 0000000..76bc8c9 --- /dev/null +++ b/gemini/geminiLinks_test.go @@ -0,0 +1,125 @@ +package gemini + +import ( + "errors" + "reflect" + "testing" + + "gemini-grc/common" +) + +type TestData struct { + currentURL string + link string + value *common.URL + error error +} + +var data = []TestData{ + { + currentURL: "https://gemini.com/", + link: "https://gemini.com/", + value: nil, + error: common.ErrGeminiLinkLineParse, + }, + { + currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", + link: "=> archive/ Complete Archive", + value: &common.URL{ + Protocol: "gemini", + Hostname: "gemi.dev", + Port: 1965, + Path: "/cgi-bin/xkcd/archive/", + Descr: "Complete Archive", + Full: "gemini://gemi.dev:1965/cgi-bin/xkcd/archive/", + }, + error: nil, + }, + { + currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", + link: "=> /cgi-bin/xkcd.cgi?a=5&b=6 Example", + value: &common.URL{ + Protocol: "gemini", + Hostname: "gemi.dev", + Port: 1965, + Path: "/cgi-bin/xkcd.cgi", + Descr: "Example", + Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?a=5&b=6", + }, + error: nil, + }, + { + currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", + link: "=> /cgi-bin/xkcd.cgi?1494 XKCD 1494: Insurance", + value: &common.URL{ + Protocol: "gemini", + Hostname: "gemi.dev", + Port: 1965, + Path: "/cgi-bin/xkcd.cgi", + Descr: "XKCD 1494: Insurance", + Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494", + }, + error: nil, + }, + { + currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", + link: "=> /cgi-bin/xkcd.cgi?1494#f XKCD 1494: Insurance", + value: &common.URL{ + Protocol: "gemini", + Hostname: "gemi.dev", + Port: 1965, + Path: "/cgi-bin/xkcd.cgi", + Descr: "XKCD 1494: Insurance", + Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?1494#f", + }, + error: nil, + }, + { + currentURL: "gemini://gemi.dev/cgi-bin/xkcd/", + link: "=> /cgi-bin/xkcd.cgi?c=5#d XKCD 1494: Insurance", + value: &common.URL{ + Protocol: "gemini", + Hostname: "gemi.dev", + Port: 1965, + Path: "/cgi-bin/xkcd.cgi", + Descr: "XKCD 1494: Insurance", + Full: "gemini://gemi.dev:1965/cgi-bin/xkcd.cgi?c=5#d", + }, + error: nil, + }, + { + currentURL: "gemini://a.b/c#d", + link: "=> /d/e#f", + value: &common.URL{ + Protocol: "gemini", + Hostname: "a.b", + Port: 1965, + Path: "/d/e", + Descr: "", + Full: "gemini://a.b:1965/d/e#f", + }, + error: nil, + }, +} + +func Test(t *testing.T) { + t.Parallel() + for i, expected := range data { + result, err := ParseGeminiLinkLine(expected.link, expected.currentURL) + if err != nil { //nolint:nestif + if expected.value != nil { + t.Errorf("data[%d]: Expected value %v, got %v", i, nil, expected.value) + } + if !errors.Is(err, common.ErrGeminiLinkLineParse) { + t.Errorf("data[%d]: expected error %v, got %v", i, expected.error, err) + } + } else { + if expected.error != nil { + t.Errorf("data[%d]: Expected error %v, got %v", i, nil, expected.error) + } + if !(reflect.DeepEqual(result, expected.value)) { + t.Errorf("data[%d]: expected %#v, got %#v", i, expected.value, result) + } + } + } +} diff --git a/gemini/gemini_test.go b/gemini/gemini_test.go index f6c7b0b..13aa3c3 100644 --- a/gemini/gemini_test.go +++ b/gemini/gemini_test.go @@ -1,8 +1,9 @@ package gemini import ( - "gemini-grc/common" "testing" + + "gemini-grc/common" ) func TestExtractRedirectTargetFullURL(t *testing.T) { diff --git a/go.mod b/go.mod index 89a5188..3ea7d40 100644 --- a/go.mod +++ b/go.mod @@ -9,16 +9,22 @@ require ( github.com/lib/pq v1.10.9 github.com/matoous/go-nanoid/v2 v2.1.0 github.com/rs/zerolog v1.33.0 + github.com/stretchr/testify v1.9.0 golang.org/x/text v0.19.0 ) require ( + github.com/davecgh/go-spew v1.1.1 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect + github.com/kr/text v0.2.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/rogpeppe/go-internal v1.13.1 // indirect golang.org/x/crypto v0.27.0 // indirect golang.org/x/sync v0.8.0 // indirect golang.org/x/sys v0.25.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 6deda2a..85a1711 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,7 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -19,6 +20,10 @@ github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE= @@ -34,6 +39,8 @@ github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxU github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= @@ -54,6 +61,8 @@ golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=