Reorganize code for more granular imports
@@ -2,10 +2,11 @@ package gemini
 
 import (
 	"fmt"
-	"gemini-grc/common"
 	"strings"
 	"sync"
 
+	"gemini-grc/common/snapshot"
+	geminiUrl "gemini-grc/common/url"
 	"gemini-grc/logging"
 )
 
@@ -16,7 +17,7 @@ import (
 // list is stored for caching.
 var RobotsCache sync.Map //nolint:gochecknoglobals
 
-func populateBlacklist(key string) (entries []string) {
+func populateRobotsCache(key string) (entries []string, _err error) {
 	// We either store an empty list when
 	// no rules, or a list of disallowed URLs.
 	// This applies even if we have an error
@@ -27,53 +28,60 @@ func populateBlacklist(key string) (entries []string) {
 	url := fmt.Sprintf("gemini://%s/robots.txt", key)
 	robotsContent, err := ConnectAndGetData(url)
 	if err != nil {
 		logging.LogDebug("robots.txt error %s", err)
-		return []string{}
+		return []string{}, err
 	}
-	robotsData, err := processData(robotsContent)
+	s, err := snapshot.SnapshotFromURL(url, true)
+	if err != nil {
+		return []string{}, nil
+	}
+	s, err = processData(*s, robotsContent)
 	if err != nil {
 		logging.LogDebug("robots.txt error %s", err)
-		return []string{}
+		return []string{}, nil
 	}
-	if robotsData.ResponseCode != 20 {
-		logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
-		return []string{}
+	if s.ResponseCode.ValueOrZero() != 20 {
+		logging.LogDebug("robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
+		return []string{}, nil
 	}
 	// Some return text/plain, others text/gemini.
 	// According to spec, the first is correct,
 	// however let's be lenient
 	var data string
 	switch {
-	case robotsData.MimeType == "text/plain":
-		data = string(robotsData.Data)
-	case robotsData.MimeType == "text/gemini":
-		data = robotsData.GemText
+	case s.MimeType.ValueOrZero() == "text/plain":
+		data = string(s.Data.ValueOrZero())
+	case s.MimeType.ValueOrZero() == "text/gemini":
+		data = s.GemText.ValueOrZero()
 	default:
-		return []string{}
+		return []string{}, nil
 	}
 	entries = ParseRobotsTxt(data, key)
-	return entries
+	return entries, nil
 }
 
 // RobotMatch checks if the snapshot URL matches
 // a robots.txt allow rule.
-func RobotMatch(u string) bool {
-	url, err := common.ParseURL(u, "")
+func RobotMatch(u string) (bool, error) {
+	url, err := geminiUrl.ParseURL(u, "", true)
 	if err != nil {
-		return false
+		return false, err
 	}
 	key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
 	logging.LogDebug("Checking robots.txt cache for %s", key)
 	var disallowedURLs []string
 	cacheEntries, ok := RobotsCache.Load(key)
 	if !ok {
 		// First time check, populate robot cache
-		disallowedURLs = populateBlacklist(key)
-		logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
+		disallowedURLs, err := populateRobotsCache(key)
+		if err != nil {
+			return false, err
+		}
+		if len(disallowedURLs) > 0 {
+			logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
+		}
 	} else {
 		disallowedURLs, _ = cacheEntries.([]string)
 	}
-	return isURLblocked(disallowedURLs, url.Full)
+	return isURLblocked(disallowedURLs, url.Full), nil
 }
 
 func isURLblocked(disallowedURLs []string, input string) bool {
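
Note: RobotMatch now returns (bool, error) instead of a plain bool, so call sites elsewhere in the crawler have to handle the error explicitly. A minimal sketch of a hypothetical caller written against the new signature (shouldFetch and its skip-on-error policy are illustrative, not part of this commit):

package gemini

// shouldFetch is a hypothetical helper showing how a caller adapts to the
// new RobotMatch signature. Treating an error as "do not fetch" is an
// assumption of this sketch, not something the commit prescribes.
func shouldFetch(u string) bool {
	blocked, err := RobotMatch(u)
	if err != nil {
		// Failures now surface here instead of silently reading as
		// "not blocked"; be conservative and skip the URL.
		return false
	}
	return !blocked
}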
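
The switch from plain fields (robotsData.ResponseCode, robotsData.MimeType) to accessors such as s.ResponseCode.ValueOrZero() suggests the snapshot fields are now nullable wrapper types. A minimal standalone sketch of that pattern, assuming a guregu/null-style package; the wrapper actually used by gemini-grc/common/snapshot is not visible in this diff:

package main

import (
	"fmt"

	"gopkg.in/guregu/null.v4" // assumption: any wrapper with ValueOrZero behaves the same way
)

func main() {
	var code null.Int // declared but never set, i.e. "null"

	// ValueOrZero returns the type's zero value when the field is unset,
	// which is why the diff can compare against 20 without a validity check.
	fmt.Println(code.ValueOrZero()) // 0

	code = null.IntFrom(20)
	fmt.Println(code.ValueOrZero() == 20) // true
}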