Update and refactor core functionality
- Update common package utilities - Refactor network code for better error handling - Remove deprecated files and functionality - Enhance blacklist and filtering capabilities - Improve snapshot handling and processing
This commit is contained in:
@@ -9,7 +9,7 @@ import (
|
||||
url2 "gemini-grc/common/url"
|
||||
"gemini-grc/logging"
|
||||
"gemini-grc/util"
|
||||
"github.com/antanst/go_errors"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
)
|
||||
|
||||
func GetPageLinks(currentURL url2.URL, gemtext string) linkList.LinkList {
|
||||
@@ -37,14 +37,14 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*url2.URL, error)
|
||||
// Check: currentURL is parseable
|
||||
baseURL, err := url.Parse(currentURL)
|
||||
if err != nil {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine), 0, "", false)
|
||||
}
|
||||
|
||||
// Extract the actual URL and the description
|
||||
re := regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)
|
||||
matches := re.FindStringSubmatch(linkLine)
|
||||
if len(matches) == 0 {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error parsing link line: no regexp match for line %s", linkLine))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error parsing link line: no regexp match for line %s", linkLine), 0, "", false)
|
||||
}
|
||||
|
||||
originalURLStr := matches[1]
|
||||
@@ -52,7 +52,7 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*url2.URL, error)
|
||||
// Check: Unescape the URL if escaped
|
||||
_, err = url.QueryUnescape(originalURLStr)
|
||||
if err != nil {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine), 0, "", false)
|
||||
}
|
||||
|
||||
description := ""
|
||||
@@ -63,7 +63,7 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*url2.URL, error)
|
||||
// Parse the URL from the link line
|
||||
parsedURL, err := url.Parse(originalURLStr)
|
||||
if err != nil {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine), 0, "", false)
|
||||
}
|
||||
|
||||
// If link URL is relative, resolve full URL
|
||||
@@ -80,7 +80,7 @@ func ParseGeminiLinkLine(linkLine string, currentURL string) (*url2.URL, error)
|
||||
|
||||
finalURL, err := url2.ParseURL(parsedURL.String(), description, true)
|
||||
if err != nil {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error parsing link line: %w input '%s'", err, linkLine), 0, "", false)
|
||||
}
|
||||
|
||||
return finalURL, nil
|
||||
|
||||
@@ -1,166 +1,23 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
stdurl "net/url"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
errors2 "gemini-grc/common/errors"
|
||||
commonErrors "gemini-grc/common/errors"
|
||||
"gemini-grc/common/snapshot"
|
||||
_url "gemini-grc/common/url"
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/logging"
|
||||
"github.com/antanst/go_errors"
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
// Visit given URL, using the Gemini protocol.
|
||||
// Mutates given Snapshot with the data.
|
||||
// In case of error, we store the error string
|
||||
// inside snapshot and return the error.
|
||||
func Visit(url string) (s *snapshot.Snapshot, err error) {
|
||||
s, err = snapshot.SnapshotFromURL(url, true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err != nil {
|
||||
// GeminiError and HostError should
|
||||
// be stored in the snapshot. Other
|
||||
// errors are returned.
|
||||
if errors2.IsHostError(err) {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
err = nil
|
||||
} else if IsGeminiError(err) {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
s.Header = null.StringFrom(go_errors.Unwrap(err).(*GeminiError).Header)
|
||||
s.ResponseCode = null.IntFrom(int64(go_errors.Unwrap(err).(*GeminiError).Code))
|
||||
err = nil
|
||||
} else {
|
||||
s = nil
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
data, err := ConnectAndGetData(s.URL.String())
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
|
||||
s, err = processData(*s, data)
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
|
||||
if isGeminiCapsule(s) {
|
||||
links := GetPageLinks(s.URL, s.GemText.String)
|
||||
if len(links) > 0 {
|
||||
logging.LogDebug("Found %d links", len(links))
|
||||
s.Links = null.ValueFrom(links)
|
||||
}
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func ConnectAndGetData(url string) ([]byte, error) {
|
||||
parsedURL, err := stdurl.Parse(url)
|
||||
if err != nil {
|
||||
return nil, go_errors.NewError(err)
|
||||
}
|
||||
hostname := parsedURL.Hostname()
|
||||
port := parsedURL.Port()
|
||||
if port == "" {
|
||||
port = "1965"
|
||||
}
|
||||
host := fmt.Sprintf("%s:%s", hostname, port)
|
||||
timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
|
||||
// Establish the underlying TCP connection.
|
||||
dialer := &net.Dialer{
|
||||
Timeout: timeoutDuration,
|
||||
}
|
||||
conn, err := dialer.Dial("tcp", host)
|
||||
if err != nil {
|
||||
return nil, errors2.NewHostError(err)
|
||||
}
|
||||
// Make sure we always close the connection.
|
||||
defer func() {
|
||||
_ = conn.Close()
|
||||
}()
|
||||
|
||||
// Set read and write timeouts on the TCP connection.
|
||||
err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, errors2.NewHostError(err)
|
||||
}
|
||||
err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, errors2.NewHostError(err)
|
||||
}
|
||||
|
||||
// Perform the TLS handshake
|
||||
tlsConfig := &tls.Config{
|
||||
InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
|
||||
ServerName: parsedURL.Hostname(), // SNI says we should not include port in hostname
|
||||
// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
|
||||
}
|
||||
tlsConn := tls.Client(conn, tlsConfig)
|
||||
err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, errors2.NewHostError(err)
|
||||
}
|
||||
err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, errors2.NewHostError(err)
|
||||
}
|
||||
err = tlsConn.Handshake()
|
||||
if err != nil {
|
||||
return nil, errors2.NewHostError(err)
|
||||
}
|
||||
|
||||
// We read `buf`-sized chunks and add data to `data`.
|
||||
buf := make([]byte, 4096)
|
||||
var data []byte
|
||||
|
||||
// Send Gemini request to trigger server response.
|
||||
// Fix for stupid server bug:
|
||||
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
|
||||
// when the port is 1965 and is still specified explicitly in the URL.
|
||||
url2, _ := _url.ParseURL(url, "", true)
|
||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort())))
|
||||
if err != nil {
|
||||
return nil, errors2.NewHostError(err)
|
||||
}
|
||||
// Read response bytes in len(buf) byte chunks
|
||||
for {
|
||||
n, err := tlsConn.Read(buf)
|
||||
if n > 0 {
|
||||
data = append(data, buf[:n]...)
|
||||
}
|
||||
if len(data) > config.CONFIG.MaxResponseSize {
|
||||
return nil, errors2.NewHostError(err)
|
||||
}
|
||||
if err != nil {
|
||||
if go_errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
return nil, errors2.NewHostError(err)
|
||||
}
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func processData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
|
||||
// ProcessData processes the raw data from a Gemini response and populates the Snapshot.
|
||||
// This function is exported for use by the robotsMatch package.
|
||||
func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
|
||||
header, body, err := getHeadersAndData(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return &s, err
|
||||
}
|
||||
code, mimeType, lang := getMimeTypeAndLang(header)
|
||||
|
||||
@@ -198,7 +55,7 @@ func processData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
|
||||
func getHeadersAndData(data []byte) (string, []byte, error) {
|
||||
firstLineEnds := slices.Index(data, '\n')
|
||||
if firstLineEnds == -1 {
|
||||
return "", nil, errors2.NewHostError(fmt.Errorf("error parsing header"))
|
||||
return "", nil, commonErrors.NewHostError(fmt.Errorf("error parsing header"))
|
||||
}
|
||||
firstLine := string(data[:firstLineEnds])
|
||||
rest := data[firstLineEnds+1:]
|
||||
@@ -252,4 +109,4 @@ func getMimeTypeAndLang(headers string) (int, string, string) {
|
||||
|
||||
func isGeminiCapsule(s *snapshot.Snapshot) bool {
|
||||
return !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,7 +135,7 @@ func TestProcessData(t *testing.T) {
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
s := snapshot.Snapshot{}
|
||||
result, err := processData(s, test.inputData)
|
||||
result, err := ProcessData(s, test.inputData)
|
||||
|
||||
if test.expectedError && err == nil {
|
||||
t.Errorf("Expected error, got nil")
|
||||
@@ -175,192 +175,3 @@ func TestProcessData(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
//// Mock Gemini server for testing ConnectAndGetData
|
||||
//func mockGeminiServer(response string, delay time.Duration, closeConnection bool) net.Listener {
|
||||
// listener, err := net.Listen("tcp", "127.0.0.1:0") // Bind to a random available port
|
||||
// if err != nil {
|
||||
// panic(fmt.Sprintf("Failed to create mock server: %v", err))
|
||||
// }
|
||||
//
|
||||
// go func() {
|
||||
// conn, err := listener.Accept()
|
||||
// if err != nil {
|
||||
// if !closeConnection { // Don't panic if we closed the connection on purpose
|
||||
// panic(fmt.Sprintf("Failed to accept connection: %v", err))
|
||||
// }
|
||||
// return
|
||||
// }
|
||||
// defer conn.Close()
|
||||
//
|
||||
// time.Sleep(delay) // Simulate network latency
|
||||
//
|
||||
// _, err = conn.Write([]byte(response))
|
||||
// if err != nil && !closeConnection {
|
||||
// panic(fmt.Sprintf("Failed to write response: %v", err))
|
||||
// }
|
||||
// }()
|
||||
//
|
||||
// return listener
|
||||
//}
|
||||
|
||||
// func TestConnectAndGetData(t *testing.T) {
|
||||
// config.CONFIG = config.ConfigStruct{
|
||||
// ResponseTimeout: 5,
|
||||
// MaxResponseSize: 1024 * 1024,
|
||||
// }
|
||||
// tests := []struct {
|
||||
// name string
|
||||
// serverResponse string
|
||||
// serverDelay time.Duration
|
||||
// expectedData []byte
|
||||
// expectedError bool
|
||||
// closeConnection bool
|
||||
// }{
|
||||
// {
|
||||
// name: "Successful response",
|
||||
// serverResponse: "20 text/gemini\r\n# Hello",
|
||||
// expectedData: []byte("20 text/gemini\r\n# Hello"),
|
||||
// expectedError: false,
|
||||
// },
|
||||
// {
|
||||
// name: "Server error",
|
||||
// serverResponse: "50 Server error\r\n",
|
||||
// expectedData: []byte("50 Server error\r\n"),
|
||||
// expectedError: false,
|
||||
// },
|
||||
// {
|
||||
// name: "Timeout",
|
||||
// serverDelay: 6 * time.Second, // Longer than the timeout
|
||||
// expectedError: true,
|
||||
// },
|
||||
// {
|
||||
// name: "Server closes connection",
|
||||
// closeConnection: true,
|
||||
// expectedError: true,
|
||||
// },
|
||||
// }
|
||||
|
||||
// for _, test := range tests {
|
||||
// t.Run(test.name, func(t *testing.T) {
|
||||
// listener := mockGeminiServer(test.serverResponse, test.serverDelay, test.closeConnection)
|
||||
// defer func() {
|
||||
// test.closeConnection = true // Prevent panic in mock server
|
||||
// listener.Close()
|
||||
// }()
|
||||
// addr := listener.Addr().String()
|
||||
// data, err := ConnectAndGetData(fmt.Sprintf("gemini://%s/", addr))
|
||||
|
||||
// if test.expectedError && err == nil {
|
||||
// t.Errorf("Expected error, got nil")
|
||||
// }
|
||||
|
||||
// if !test.expectedError && err != nil {
|
||||
// t.Errorf("Unexpected error: %v", err)
|
||||
// }
|
||||
|
||||
// if !slices.Equal(data, test.expectedData) {
|
||||
// t.Errorf("Expected data '%s', got '%s'", test.expectedData, data)
|
||||
// }
|
||||
// })
|
||||
// }
|
||||
// }
|
||||
|
||||
// func TestVisit(t *testing.T) {
|
||||
// config.CONFIG = config.ConfigStruct{
|
||||
// ResponseTimeout: 5,
|
||||
// MaxResponseSize: 1024 * 1024,
|
||||
// }
|
||||
// tests := []struct {
|
||||
// name string
|
||||
// serverResponse string
|
||||
// expectedCode int
|
||||
// expectedMime string
|
||||
// expectedError bool
|
||||
// expectedLinks []string
|
||||
// }{
|
||||
// {
|
||||
// name: "Successful response",
|
||||
// serverResponse: "20 text/gemini\r\n# Hello\n=> /link1 Link 1\n=> /link2 Link 2",
|
||||
// expectedCode: 20,
|
||||
// expectedMime: "text/gemini",
|
||||
// expectedError: false,
|
||||
// expectedLinks: []string{"gemini://127.0.0.1:1965/link1", "gemini://127.0.0.1:1965/link2"},
|
||||
// },
|
||||
// {
|
||||
// name: "Server error",
|
||||
// serverResponse: "50 Server error\r\n",
|
||||
// expectedCode: 50,
|
||||
// expectedMime: "Server error",
|
||||
// expectedError: false,
|
||||
// expectedLinks: []string{},
|
||||
// },
|
||||
// }
|
||||
|
||||
// for _, test := range tests {
|
||||
// t.Run(test.name, func(t *testing.T) {
|
||||
// listener := mockGeminiServer(test.serverResponse, 0, false)
|
||||
// defer listener.Close()
|
||||
// addr := listener.Addr().String()
|
||||
// snapshot, err := Visit(fmt.Sprintf("gemini://%s/", addr))
|
||||
|
||||
// if test.expectedError && err == nil {
|
||||
// t.Errorf("Expected error, got nil")
|
||||
// }
|
||||
|
||||
// if !test.expectedError && err != nil {
|
||||
// t.Errorf("Unexpected error: %v", err)
|
||||
// }
|
||||
|
||||
// if snapshot.ResponseCode.ValueOrZero() != int64(test.expectedCode) {
|
||||
// t.Errorf("Expected code %d, got %d", test.expectedCode, snapshot.ResponseCode.ValueOrZero())
|
||||
// }
|
||||
|
||||
// if snapshot.MimeType.ValueOrZero() != test.expectedMime {
|
||||
// t.Errorf("Expected mimeType '%s', got '%s'", test.expectedMime, snapshot.MimeType.ValueOrZero())
|
||||
// }
|
||||
|
||||
// if test.expectedLinks != nil {
|
||||
// links, _ := snapshot.Links.Value()
|
||||
|
||||
// if len(links) != len(test.expectedLinks) {
|
||||
// t.Errorf("Expected %d links, got %d", len(test.expectedLinks), len(links))
|
||||
// }
|
||||
// for i, link := range links {
|
||||
// if link != test.expectedLinks[i] {
|
||||
// t.Errorf("Expected link '%s', got '%s'", test.expectedLinks[i], link)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// })
|
||||
// }
|
||||
// }
|
||||
|
||||
func TestVisit_InvalidURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
_, err := Visit("invalid-url")
|
||||
if err == nil {
|
||||
t.Errorf("Expected error for invalid URL, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
//func TestVisit_GeminiError(t *testing.T) {
|
||||
// listener := mockGeminiServer("51 Not Found\r\n", 0, false)
|
||||
// defer listener.Close()
|
||||
// addr := listener.Addr().String()
|
||||
//
|
||||
// s, err := Visit(fmt.Sprintf("gemini://%s/", addr))
|
||||
// if err != nil {
|
||||
// t.Errorf("Unexpected error: %v", err)
|
||||
// }
|
||||
//
|
||||
// expectedError := "51 Not Found"
|
||||
// if s.Error.ValueOrZero() != expectedError {
|
||||
// t.Errorf("Expected error in snapshot: %v, got %v", expectedError, s.Error)
|
||||
// }
|
||||
//
|
||||
// expectedCode := 51
|
||||
// if s.ResponseCode.ValueOrZero() != int64(expectedCode) {
|
||||
// t.Errorf("Expected code %d, got %d", expectedCode, s.ResponseCode.ValueOrZero())
|
||||
// }
|
||||
//}
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/antanst/go_errors"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
"golang.org/x/text/encoding/japanese"
|
||||
"golang.org/x/text/encoding/korean"
|
||||
@@ -25,7 +25,7 @@ func BytesToValidUTF8(input []byte) (string, error) {
|
||||
}
|
||||
const maxSize = 10 * 1024 * 1024 // 10MB
|
||||
if len(input) > maxSize {
|
||||
return "", go_errors.NewError(fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize))
|
||||
return "", xerrors.NewError(fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize), 0, "", false)
|
||||
}
|
||||
// remove NULL byte 0x00 (ReplaceAll accepts slices)
|
||||
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
|
||||
@@ -56,5 +56,5 @@ func BytesToValidUTF8(input []byte) (string, error) {
|
||||
}
|
||||
}
|
||||
|
||||
return "", go_errors.NewError(fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr))
|
||||
return "", xerrors.NewError(fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr), 0, "", false)
|
||||
}
|
||||
|
||||
@@ -1,95 +0,0 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"gemini-grc/common/snapshot"
|
||||
geminiUrl "gemini-grc/common/url"
|
||||
"gemini-grc/logging"
|
||||
)
|
||||
|
||||
// RobotsCache is a map of blocked URLs
|
||||
// key: URL
|
||||
// value: []string list of disallowed URLs
|
||||
// If a key has no blocked URLs, an empty
|
||||
// list is stored for caching.
|
||||
var RobotsCache sync.Map //nolint:gochecknoglobals
|
||||
|
||||
func populateRobotsCache(key string) (entries []string, _err error) {
|
||||
// We either store an empty list when
|
||||
// no rules, or a list of disallowed URLs.
|
||||
// This applies even if we have an error
|
||||
// finding/downloading robots.txt
|
||||
defer func() {
|
||||
RobotsCache.Store(key, entries)
|
||||
}()
|
||||
url := fmt.Sprintf("gemini://%s/robots.txt", key)
|
||||
robotsContent, err := ConnectAndGetData(url)
|
||||
if err != nil {
|
||||
return []string{}, err
|
||||
}
|
||||
s, err := snapshot.SnapshotFromURL(url, true)
|
||||
if err != nil {
|
||||
return []string{}, nil
|
||||
}
|
||||
s, err = processData(*s, robotsContent)
|
||||
if err != nil {
|
||||
logging.LogDebug("robots.txt error %s", err)
|
||||
return []string{}, nil
|
||||
}
|
||||
if s.ResponseCode.ValueOrZero() != 20 {
|
||||
logging.LogDebug("robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
|
||||
return []string{}, nil
|
||||
}
|
||||
// Some return text/plain, others text/gemini.
|
||||
// According to spec, the first is correct,
|
||||
// however let's be lenient
|
||||
var data string
|
||||
switch {
|
||||
case s.MimeType.ValueOrZero() == "text/plain":
|
||||
data = string(s.Data.ValueOrZero())
|
||||
case s.MimeType.ValueOrZero() == "text/gemini":
|
||||
data = s.GemText.ValueOrZero()
|
||||
default:
|
||||
return []string{}, nil
|
||||
}
|
||||
entries = ParseRobotsTxt(data, key)
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// RobotMatch checks if the snapshot URL matches
|
||||
// a robots.txt allow rule.
|
||||
func RobotMatch(u string) (bool, error) {
|
||||
url, err := geminiUrl.ParseURL(u, "", true)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
|
||||
var disallowedURLs []string
|
||||
cacheEntries, ok := RobotsCache.Load(key)
|
||||
if !ok {
|
||||
// First time check, populate robot cache
|
||||
disallowedURLs, err := populateRobotsCache(key)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if len(disallowedURLs) > 0 {
|
||||
logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
|
||||
}
|
||||
} else {
|
||||
disallowedURLs, _ = cacheEntries.([]string)
|
||||
}
|
||||
return isURLblocked(disallowedURLs, url.Full), nil
|
||||
}
|
||||
|
||||
func isURLblocked(disallowedURLs []string, input string) bool {
|
||||
for _, url := range disallowedURLs {
|
||||
if strings.HasPrefix(strings.ToLower(input), url) {
|
||||
logging.LogDebug("robots.txt match: %s matches %s", input, url)
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ParseRobotsTxt takes robots.txt content and a host, and returns the full
// Gemini URLs ("gemini://<host><path>") that should not be visited.
//
// Every "Disallow:" line with a non-empty path contributes one entry,
// regardless of the user agent section it appears in. Directive, path and
// host are all lowercased, because isURLblocked matches entries against a
// lowercased URL; an entry containing uppercase characters could never
// match. Returns nil when there are no rules.
//
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
	const directive = "disallow:"

	host = strings.ToLower(host)
	var disallowedPaths []string
	for _, line := range strings.Split(content, "\n") {
		// TrimSpace also drops a trailing "\r" from CRLF line endings.
		line = strings.ToLower(strings.TrimSpace(line))
		if !strings.HasPrefix(line, directive) {
			continue
		}
		path := strings.TrimSpace(strings.TrimPrefix(line, directive))
		if path == "" {
			// An empty Disallow line blocks nothing.
			continue
		}
		// Construct full Gemini URL
		disallowedPaths = append(disallowedPaths,
			fmt.Sprintf("gemini://%s%s", host, path))
	}
	return disallowedPaths
}
|
||||
@@ -1,55 +0,0 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseRobotsTxt(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := `User-agent: *
|
||||
Disallow: /cgi-bin/wp.cgi/view
|
||||
Disallow: /cgi-bin/wp.cgi/media
|
||||
User-agent: googlebot
|
||||
Disallow: /admin/`
|
||||
|
||||
expected := []string{
|
||||
"gemini://example.com/cgi-bin/wp.cgi/view",
|
||||
"gemini://example.com/cgi-bin/wp.cgi/media",
|
||||
"gemini://example.com/admin/",
|
||||
}
|
||||
|
||||
result := ParseRobotsTxt(input, "example.com")
|
||||
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRobotsTxtEmpty(t *testing.T) {
|
||||
t.Parallel()
|
||||
input := ``
|
||||
|
||||
result := ParseRobotsTxt(input, "example.com")
|
||||
|
||||
if len(result) != 0 {
|
||||
t.Errorf("ParseRobotsTxt() = %v, want empty []string", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsURLblocked(t *testing.T) {
|
||||
t.Parallel()
|
||||
disallowedURLs := []string{
|
||||
"gemini://example.com/cgi-bin/wp.cgi/view",
|
||||
"gemini://example.com/cgi-bin/wp.cgi/media",
|
||||
"gemini://example.com/admin/",
|
||||
}
|
||||
url := "gemini://example.com/admin/index.html"
|
||||
if !isURLblocked(disallowedURLs, url) {
|
||||
t.Errorf("Expected %s to be blocked", url)
|
||||
}
|
||||
url = "gemini://example1.com/admin/index.html"
|
||||
if isURLblocked(disallowedURLs, url) {
|
||||
t.Errorf("expected %s to not be blocked", url)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user