Reorganize code for more granular imports
@@ -2,10 +2,11 @@ package gemini
 
 import (
 	"fmt"
-	"gemini-grc/common"
 	"strings"
 	"sync"
 
+	"gemini-grc/common/snapshot"
+	geminiUrl "gemini-grc/common/url"
 	"gemini-grc/logging"
 )
 
@@ -16,7 +17,7 @@ import (
 // list is stored for caching.
 var RobotsCache sync.Map //nolint:gochecknoglobals
 
-func populateBlacklist(key string) (entries []string) {
+func populateRobotsCache(key string) (entries []string, _err error) {
 	// We either store an empty list when
 	// no rules, or a list of disallowed URLs.
 	// This applies even if we have an error
@@ -27,53 +28,60 @@ func populateBlacklist(key string) (entries []string) {
 	url := fmt.Sprintf("gemini://%s/robots.txt", key)
 	robotsContent, err := ConnectAndGetData(url)
 	if err != nil {
 		logging.LogDebug("robots.txt error %s", err)
-		return []string{}
+		return []string{}, err
 	}
-	robotsData, err := processData(robotsContent)
+	s, err := snapshot.SnapshotFromURL(url, true)
+	if err != nil {
+		return []string{}, nil
+	}
+	s, err = processData(*s, robotsContent)
 	if err != nil {
 		logging.LogDebug("robots.txt error %s", err)
-		return []string{}
+		return []string{}, nil
 	}
-	if robotsData.ResponseCode != 20 {
-		logging.LogDebug("robots.txt error code %d, ignoring", robotsData.ResponseCode)
-		return []string{}
+	if s.ResponseCode.ValueOrZero() != 20 {
+		logging.LogDebug("robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
+		return []string{}, nil
 	}
 	// Some return text/plain, others text/gemini.
 	// According to spec, the first is correct,
 	// however let's be lenient
 	var data string
 	switch {
-	case robotsData.MimeType == "text/plain":
-		data = string(robotsData.Data)
-	case robotsData.MimeType == "text/gemini":
-		data = robotsData.GemText
+	case s.MimeType.ValueOrZero() == "text/plain":
+		data = string(s.Data.ValueOrZero())
+	case s.MimeType.ValueOrZero() == "text/gemini":
+		data = s.GemText.ValueOrZero()
 	default:
-		return []string{}
+		return []string{}, nil
 	}
 	entries = ParseRobotsTxt(data, key)
-	return entries
+	return entries, nil
 }
 
 // RobotMatch checks if the snapshot URL matches
 // a robots.txt allow rule.
-func RobotMatch(u string) bool {
-	url, err := common.ParseURL(u, "")
+func RobotMatch(u string) (bool, error) {
+	url, err := geminiUrl.ParseURL(u, "", true)
 	if err != nil {
-		return false
+		return false, err
 	}
 	key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
 	logging.LogDebug("Checking robots.txt cache for %s", key)
 	var disallowedURLs []string
 	cacheEntries, ok := RobotsCache.Load(key)
 	if !ok {
 		// First time check, populate robot cache
-		disallowedURLs = populateBlacklist(key)
-		logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
+		disallowedURLs, err := populateRobotsCache(key)
+		if err != nil {
+			return false, err
+		}
+		if len(disallowedURLs) > 0 {
+			logging.LogDebug("Added to robots.txt cache: %v => %v", key, disallowedURLs)
+		}
 	} else {
 		disallowedURLs, _ = cacheEntries.([]string)
 	}
-	return isURLblocked(disallowedURLs, url.Full)
+	return isURLblocked(disallowedURLs, url.Full), nil
 }
 
 func isURLblocked(disallowedURLs []string, input string) bool {
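
Note: RobotMatch now returns (bool, error) instead of a plain bool, so call sites elsewhere in the crawler have to handle the error explicitly. A minimal sketch of a hypothetical caller written against the new signature (shouldFetch and its skip-on-error policy are illustrative, not part of this commit):

package gemini

// shouldFetch is a hypothetical helper showing how a caller adapts to the
// new RobotMatch signature. Treating an error as "do not fetch" is an
// assumption of this sketch, not something the commit prescribes.
func shouldFetch(u string) bool {
	blocked, err := RobotMatch(u)
	if err != nil {
		// Failures now surface here instead of silently reading as
		// "not blocked"; be conservative and skip the URL.
		return false
	}
	return !blocked
}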
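
The switch from plain fields (robotsData.ResponseCode, robotsData.MimeType) to accessors such as s.ResponseCode.ValueOrZero() suggests the snapshot fields are now nullable wrapper types. A minimal standalone sketch of that pattern, assuming a guregu/null-style package; the wrapper actually used by gemini-grc/common/snapshot is not visible in this diff:

package main

import (
	"fmt"

	"gopkg.in/guregu/null.v4" // assumption: any wrapper with ValueOrZero behaves the same way
)

func main() {
	var code null.Int // declared but never set, i.e. "null"

	// ValueOrZero returns the type's zero value when the field is unset,
	// which is why the diff can compare against 20 without a validity check.
	fmt.Println(code.ValueOrZero()) // 0

	code = null.IntFrom(20)
	fmt.Println(code.ValueOrZero() == 20) // true
}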