Files
gemini-grc/gemini/robotmatch.go

96 lines
2.5 KiB
Go

package gemini
import (
"fmt"
"strings"
"sync"
"gemini-grc/common/snapshot"
geminiUrl "gemini-grc/common/url"
"gemini-grc/logging"
)
// RobotsCache caches the robots.txt rules of each host.
// key: lowercased "host:port" string
// value: []string list of disallowed URL prefixes
// If a host has no disallowed URLs (or its robots.txt
// could not be retrieved), an empty list is stored so
// that the lookup is not repeated.
var RobotsCache sync.Map //nolint:gochecknoglobals
// populateRobotsCache fetches gemini://<key>/robots.txt, parses it and
// returns the resulting list of disallowed entries.
//
// Whatever list is returned is also stored in RobotsCache under key,
// including the empty list returned on every failure path, so a host
// is only queried once.
//
// A non-nil error is returned only when the robots.txt request itself
// fails; every later problem (snapshot creation, response processing,
// non-20 response code, unexpected MIME type) is treated as
// "no rules" and yields an empty list with a nil error.
func populateRobotsCache(key string) (entries []string, _err error) {
	// The deferred store observes the final value of the named
	// result, so every return below is cached.
	defer func() {
		RobotsCache.Store(key, entries)
	}()

	robotsURL := "gemini://" + key + "/robots.txt"

	raw, err := ConnectAndGetData(robotsURL)
	if err != nil {
		return []string{}, err
	}

	snap, err := snapshot.SnapshotFromURL(robotsURL, true)
	if err != nil {
		return []string{}, nil
	}

	if snap, err = processData(*snap, raw); err != nil {
		logging.LogDebug("robots.txt error %s", err)
		return []string{}, nil
	}

	if code := snap.ResponseCode.ValueOrZero(); code != 20 {
		logging.LogDebug("robots.txt error code %d, ignoring", code)
		return []string{}, nil
	}

	// The spec asks for text/plain, but some servers answer with
	// text/gemini; both are accepted.
	var body string
	switch mime := snap.MimeType.ValueOrZero(); mime {
	case "text/plain":
		body = string(snap.Data.ValueOrZero())
	case "text/gemini":
		body = snap.GemText.ValueOrZero()
	default:
		return []string{}, nil
	}

	return ParseRobotsTxt(body, key), nil
}
// RobotMatch reports whether the URL u is blocked by a disallow
// rule of its host's robots.txt.
//
// The rules are looked up in RobotsCache under the lowercased
// "host:port" key; on a cache miss they are fetched (and cached)
// through populateRobotsCache.
//
// It returns (true, nil) when u matches a disallowed entry,
// (false, nil) when it does not, and (false, err) when u cannot be
// parsed or the robots.txt request fails.
func RobotMatch(u string) (bool, error) {
	url, err := geminiUrl.ParseURL(u, "", true)
	if err != nil {
		return false, err
	}
	key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))

	var disallowedURLs []string
	cacheEntries, ok := RobotsCache.Load(key)
	if !ok {
		// First time check, populate robot cache.
		// Plain assignment (not :=) is required here: a short
		// variable declaration would shadow disallowedURLs and the
		// freshly fetched rules would never reach isURLblocked.
		disallowedURLs, err = populateRobotsCache(key)
		if err != nil {
			return false, err
		}
		if len(disallowedURLs) > 0 {
			logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
		}
	} else {
		// A value of an unexpected type leaves the list nil,
		// which is treated as "nothing disallowed".
		disallowedURLs, _ = cacheEntries.([]string)
	}
	return isURLblocked(disallowedURLs, url.Full), nil
}
// isURLblocked reports whether input, compared case-insensitively on
// its side, starts with any of the entries in disallowedURLs.
// The entries themselves are compared as-is (presumably they are
// already lowercased by the robots.txt parser; verify there).
// A nil or empty list never blocks anything.
func isURLblocked(disallowedURLs []string, input string) bool {
	// Lowercase once; the value does not change between iterations.
	lowered := strings.ToLower(input)
	for _, prefix := range disallowedURLs {
		if strings.HasPrefix(lowered, prefix) {
			logging.LogDebug("robots.txt match: %s matches %s", input, prefix)
			return true
		}
	}
	return false
}