2024-11-18 16:28:45 +02:00
parent f0452ff9f7
commit 825c7e3391
34 changed files with 624 additions and 426 deletions


@@ -2,16 +2,18 @@ package gemini
 
 import (
 	"fmt"
-	"gemini-grc/logging"
 	"strings"
 	"sync"
+
+	"gemini-grc/logging"
 )
 
-// key: "host:port" (string)
-// value:
-// empty []string if no robots data, or
-// list of URL prefixes ([]string) in robots
-var RobotsCache sync.Map
+// RobotsCache is a map of blocked URLs
+// key: URL
+// value: []string list of disallowed URLs
+// If a key has no blocked URLs, an empty
+// list is stored for caching.
+var RobotsCache sync.Map //nolint:gochecknoglobals
 
 func populateBlacklist(key string) (entries []string) {
 	// We either store an empty list when
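
Not part of the commit: a minimal standalone sketch of the cache semantics described in the new RobotsCache comment above, i.e. a sync.Map whose value is a []string of disallowed URL prefixes, with an empty slice stored as a negative-cache entry. The "host:port" key format, the port 1965 and the assumption that populateBlacklist performs the Store are inferences from this diff, not confirmed by it.

package main

import (
	"fmt"
	"sync"
)

// robotsCache mirrors the semantics described in the diff:
// key: "host:port", value: []string of disallowed URL prefixes.
// An empty slice means "robots.txt checked, nothing disallowed".
var robotsCache sync.Map

func main() {
	// Hypothetical entries for illustration only.
	robotsCache.Store("example.com:1965", []string{"gemini://example.com/private/"})
	robotsCache.Store("example.org:1965", []string{}) // negative-cache entry

	if v, ok := robotsCache.Load("example.com:1965"); ok {
		prefixes, _ := v.([]string)
		fmt.Println("disallowed prefixes:", prefixes)
	}
}

A sync.Map is presumably used here because multiple crawler workers consult and populate the cache concurrently without extra locking.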
@@ -40,43 +42,40 @@ func populateBlacklist(key string) (entries []string) {
 	// According to spec, the first is correct,
 	// however let's be lenient
 	var data string
-	if robotsData.MimeType == "text/plain" {
+	switch {
+	case robotsData.MimeType == "text/plain":
 		data = string(robotsData.Data)
-	} else if robotsData.MimeType == "text/gemini" {
+	case robotsData.MimeType == "text/gemini":
 		data = robotsData.GemText
-	} else {
+	default:
 		return []string{}
 	}
-	entries = ParseRobotsTxt(string(data), key)
+	entries = ParseRobotsTxt(data, key)
 	return entries
 }
 
-// Check if the snapshot URL matches
+// RobotMatch checks if the snapshot URL matches
 // a robots.txt allow rule.
-func RobotMatch(s *Snapshot) bool {
-	logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
-	key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
-	v, ok := RobotsCache.Load(key)
+func RobotMatch(url URL) bool {
+	logging.LogDebug("Checking robots.txt cache for %s", url.String())
+	key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
+	var disallowedURLs []string
+	cacheEntries, ok := RobotsCache.Load(key)
 	if !ok {
 		// First time check, populate robot cache
-		logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
-		disallowedURLs := populateBlacklist(key)
-		for _, url := range disallowedURLs {
-			if strings.HasPrefix(s.URL.String(), url) {
-				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
-				return true
-			}
-		}
+		disallowedURLs = populateBlacklist(key)
+		logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
 	} else {
-		if len(v.([]string)) == 0 {
-			logging.LogDebug("No robots.txt or no rules, allowed")
-			return false
-		}
-		for _, url := range v.([]string) {
-			if strings.HasPrefix(s.URL.String(), url) {
-				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
-				return true
-			}
-		}
+		disallowedURLs, _ = cacheEntries.([]string)
+	}
+	return isURLblocked(disallowedURLs, url.Full)
+}
+
+func isURLblocked(disallowedURLs []string, input string) bool {
+	for _, url := range disallowedURLs {
+		if strings.HasPrefix(strings.ToLower(input), url) {
+			logging.LogDebug("robots.txt match: %s matches %s", input, url)
+			return true
 		}
 	}
 	return false
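
Not part of the commit: a small runnable sketch of the prefix check that the new isURLblocked performs, with the gemini-grc/logging call dropped so it is self-contained. Since only the input URL is lower-cased before comparison, the disallowed prefixes are assumed to already be stored lower-cased (presumably by ParseRobotsTxt); the example URLs are made up for illustration.

package main

import (
	"fmt"
	"strings"
)

// isURLblocked reports whether input starts with any disallowed prefix,
// lower-casing only the input side of the comparison.
func isURLblocked(disallowedURLs []string, input string) bool {
	for _, prefix := range disallowedURLs {
		if strings.HasPrefix(strings.ToLower(input), prefix) {
			return true
		}
	}
	return false
}

func main() {
	disallowed := []string{"gemini://example.com/cgi-bin/"}
	fmt.Println(isURLblocked(disallowed, "gemini://EXAMPLE.COM/cgi-bin/search")) // true
	fmt.Println(isURLblocked(disallowed, "gemini://example.com/index.gmi"))      // false
}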