@@ -2,16 +2,18 @@ package gemini
 import (
 	"fmt"
+	"gemini-grc/logging"
 	"strings"
 	"sync"
-
-	"gemini-grc/logging"
 )
 
-// key: "host:port" (string)
-// value:
-// empty []string if no robots data, or
-// list of URL prefixes ([]string) in robots
-var RobotsCache sync.Map
+// RobotsCache is a map of blocked URLs
+// key: URL
+// value: []string list of disallowed URLs
+// If a key has no blocked URLs, an empty
+// list is stored for caching.
+var RobotsCache sync.Map //nolint:gochecknoglobals
 
 func populateBlacklist(key string) (entries []string) {
 	// We either store an empty list when
@@ -40,43 +42,40 @@ func populateBlacklist(key string) (entries []string) {
 	// According to spec, the first is correct,
 	// however let's be lenient
 	var data string
-	if robotsData.MimeType == "text/plain" {
+	switch {
+	case robotsData.MimeType == "text/plain":
 		data = string(robotsData.Data)
-	} else if robotsData.MimeType == "text/gemini" {
+	case robotsData.MimeType == "text/gemini":
 		data = robotsData.GemText
-	} else {
+	default:
 		return []string{}
 	}
-	entries = ParseRobotsTxt(string(data), key)
+	entries = ParseRobotsTxt(data, key)
 	return entries
 }
 
-// Check if the snapshot URL matches
+// RobotMatch checks if the snapshot URL matches
 // a robots.txt allow rule.
-func RobotMatch(s *Snapshot) bool {
-	logging.LogDebug("Checking robots.txt cache for %s", s.URL.String())
-	key := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
-	v, ok := RobotsCache.Load(key)
+func RobotMatch(url URL) bool {
+	logging.LogDebug("Checking robots.txt cache for %s", url.String())
+	key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
+	var disallowedURLs []string
+	cacheEntries, ok := RobotsCache.Load(key)
 	if !ok {
 		// First time check, populate robot cache
-		logging.LogDebug("No robots.txt entry, populating cache for %s", s.URL.String())
-		disallowedURLs := populateBlacklist(key)
-		for _, url := range disallowedURLs {
-			if strings.HasPrefix(s.URL.String(), url) {
-				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
-				return true
-			}
-		}
+		disallowedURLs = populateBlacklist(key)
 		logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
 	} else {
-		if len(v.([]string)) == 0 {
-			logging.LogDebug("No robots.txt or no rules, allowed")
-			return false
-		}
-		for _, url := range v.([]string) {
-			if strings.HasPrefix(s.URL.String(), url) {
-				logging.LogDebug("robots.txt match: %s %s", s.URL.String(), url)
-				return true
-			}
+		disallowedURLs, _ = cacheEntries.([]string)
 	}
+	return isURLblocked(disallowedURLs, url.Full)
 }
+
+func isURLblocked(disallowedURLs []string, input string) bool {
+	for _, url := range disallowedURLs {
+		if strings.HasPrefix(strings.ToLower(input), url) {
+			logging.LogDebug("robots.txt match: %s matches %s", input, url)
+			return true
+		}
+	}
+	return false
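A minimal, self-contained sketch of the pattern this commit converges on: a sync.Map keyed by a lowercased "host:port" string, holding the disallowed URL prefixes parsed from robots.txt (an empty slice is cached when a host has no rules), with the prefix check factored into its own helper. The names below (robotsCache, fetchDisallowedPrefixes, robotMatch) and the hard-coded rule are illustrative assumptions rather than the crawler's actual API; in the real code the fetching, parsing and storing appear to be handled by populateBlacklist, while here the Store call is done inline to keep the example runnable.

// Illustrative sketch only; names and rules are assumptions, not the real crawler API.
package main

import (
	"fmt"
	"strings"
	"sync"
)

// robotsCache maps "host:port" -> []string of disallowed URL prefixes.
// An empty slice is stored even when a host has no rules, so later
// lookups hit the cache instead of re-fetching robots.txt.
var robotsCache sync.Map

// fetchDisallowedPrefixes stands in for populateBlacklist: the real code
// would fetch and parse gemini://host/robots.txt here.
func fetchDisallowedPrefixes(key string) []string {
	if strings.HasPrefix(key, "example.org") {
		return []string{"gemini://example.org/private/"}
	}
	return []string{} // no rules: cache the empty slice
}

// isURLBlocked reports whether input starts with any disallowed prefix.
func isURLBlocked(disallowed []string, input string) bool {
	for _, prefix := range disallowed {
		if strings.HasPrefix(strings.ToLower(input), prefix) {
			return true
		}
	}
	return false
}

// robotMatch mirrors the new RobotMatch flow: build the lowercased cache key,
// load or populate the cache, then run a plain prefix check.
func robotMatch(host string, port int, fullURL string) bool {
	key := strings.ToLower(fmt.Sprintf("%s:%d", host, port))
	var disallowed []string
	if cached, ok := robotsCache.Load(key); ok {
		disallowed, _ = cached.([]string)
	} else {
		disallowed = fetchDisallowedPrefixes(key)
		robotsCache.Store(key, disallowed)
	}
	return isURLBlocked(disallowed, fullURL)
}

func main() {
	fmt.Println(robotMatch("example.org", 1965, "gemini://example.org/private/x.gmi")) // true
	fmt.Println(robotMatch("example.org", 1965, "gemini://example.org/index.gmi"))     // false
	fmt.Println(robotMatch("other.host", 1965, "gemini://other.host/anything"))        // false
}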