Reorganize code for more granular imports

2025-02-26 10:34:25 +02:00
parent 8350e106d6
commit ca008b0796
23 changed files with 1549 additions and 1232 deletions
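
For callers, the reorganization replaces the single "gemini-grc/common" import with the narrower "gemini-grc/common/snapshot" and "gemini-grc/common/url" packages, and RobotMatch now returns an error instead of silently treating robots.txt failures as "not blocked". A minimal caller-side sketch, using only identifiers that appear in the diff below; the maybeFetch helper is hypothetical and not part of this commit:

package gemini

import (
    "gemini-grc/common/snapshot" // granular import, was "gemini-grc/common"
    geminiUrl "gemini-grc/common/url"
    "gemini-grc/logging"
)

// maybeFetch is a hypothetical caller illustrating the new package
// layout and the error-returning RobotMatch signature.
func maybeFetch(u string) error {
    // URL parsing now lives in the dedicated url package.
    parsed, err := geminiUrl.ParseURL(u, "", true)
    if err != nil {
        return err
    }

    // RobotMatch now propagates robots.txt fetch/parse errors
    // instead of returning a bare bool.
    blocked, err := RobotMatch(u)
    if err != nil {
        return err
    }
    if blocked {
        logging.LogDebug("skipping %s: disallowed by robots.txt", parsed.Full)
        return nil
    }

    // Snapshot construction likewise moved into its own package.
    _, err = snapshot.SnapshotFromURL(parsed.Full, true)
    return err
}

Propagating the error lets callers distinguish "robots.txt disallows this URL" from "robots.txt could not be fetched or parsed", which the old bool-only signature conflated.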


@@ -2,10 +2,11 @@ package gemini
import (
"fmt"
"gemini-grc/common"
"strings"
"sync"
"gemini-grc/common/snapshot"
geminiUrl "gemini-grc/common/url"
"gemini-grc/logging"
)
@@ -16,7 +17,7 @@ import (
// list is stored for caching.
var RobotsCache sync.Map //nolint:gochecknoglobals
-func populateBlacklist(key string) (entries []string) {
+func populateRobotsCache(key string) (entries []string, _err error) {
// We either store an empty list when
// no rules, or a list of disallowed URLs.
// This applies even if we have an error
@@ -27,53 +28,60 @@ func populateBlacklist(key string) (entries []string) {
url := fmt.Sprintf("gemini://%s/robots.txt", key)
robotsContent, err := ConnectAndGetData(url)
if err != nil {
logging.LogDebug("robots.txt error %s", err)
-return []string{}
+return []string{}, err
}
-robotsData, err := processData(robotsContent)
+s, err := snapshot.SnapshotFromURL(url, true)
+if err != nil {
+return []string{}, nil
+}
+s, err = processData(*s, robotsContent)
if err != nil {
logging.LogDebug("robots.txt error %s", err)
-return []string{}
+return []string{}, nil
}
-if robotsData.ResponseCode != 20 {
-logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
-return []string{}
+if s.ResponseCode.ValueOrZero() != 20 {
+logging.LogDebug("robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
+return []string{}, nil
}
// Some return text/plain, others text/gemini.
// According to spec, the first is correct,
// however let's be lenient
var data string
switch {
-case robotsData.MimeType == "text/plain":
-data = string(robotsData.Data)
-case robotsData.MimeType == "text/gemini":
-data = robotsData.GemText
+case s.MimeType.ValueOrZero() == "text/plain":
+data = string(s.Data.ValueOrZero())
+case s.MimeType.ValueOrZero() == "text/gemini":
+data = s.GemText.ValueOrZero()
default:
-return []string{}
+return []string{}, nil
}
entries = ParseRobotsTxt(data, key)
-return entries
+return entries, nil
}
// RobotMatch checks if the snapshot URL matches
// a robots.txt allow rule.
-func RobotMatch(u string) bool {
-url, err := common.ParseURL(u, "")
+func RobotMatch(u string) (bool, error) {
+url, err := geminiUrl.ParseURL(u, "", true)
if err != nil {
-return false
+return false, err
}
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
logging.LogDebug("Checking robots.txt cache for %s", key)
var disallowedURLs []string
cacheEntries, ok := RobotsCache.Load(key)
if !ok {
// First time check, populate robot cache
-disallowedURLs = populateBlacklist(key)
-logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
+disallowedURLs, err := populateRobotsCache(key)
+if err != nil {
+return false, err
+}
+if len(disallowedURLs) > 0 {
+logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
+}
} else {
disallowedURLs, _ = cacheEntries.([]string)
}
-return isURLblocked(disallowedURLs, url.Full)
+return isURLblocked(disallowedURLs, url.Full), nil
}
func isURLblocked(disallowedURLs []string, input string) bool {