Update and refactor core functionality
- Update common package utilities - Refactor network code for better error handling - Remove deprecated files and functionality - Enhance blacklist and filtering capabilities - Improve snapshot handling and processing
This commit is contained in:
@@ -8,45 +8,63 @@ import (
|
||||
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/logging"
|
||||
"github.com/antanst/go_errors"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
)
|
||||
|
||||
var Blacklist []regexp.Regexp //nolint:gochecknoglobals
|
||||
var blacklist []regexp.Regexp //nolint:gochecknoglobals
|
||||
|
||||
func LoadBlacklist() error {
|
||||
if config.CONFIG.BlacklistPath == "" {
|
||||
return nil
|
||||
}
|
||||
if Blacklist == nil {
|
||||
data, err := os.ReadFile(config.CONFIG.BlacklistPath)
|
||||
if err != nil {
|
||||
Blacklist = []regexp.Regexp{}
|
||||
return go_errors.NewError(fmt.Errorf("could not load Blacklist file: %w", err))
|
||||
}
|
||||
func Initialize() error {
|
||||
var err error
|
||||
|
||||
lines := strings.Split(string(data), "\n")
|
||||
|
||||
for _, line := range lines {
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
regex, err := regexp.Compile(line)
|
||||
if err != nil {
|
||||
return go_errors.NewError(fmt.Errorf("could not compile Blacklist line %s: %w", line, err))
|
||||
}
|
||||
Blacklist = append(Blacklist, *regex)
|
||||
|
||||
}
|
||||
|
||||
if len(lines) > 0 {
|
||||
logging.LogInfo("Loaded %d blacklist entries", len(Blacklist))
|
||||
// Initialize blacklist
|
||||
if config.CONFIG.BlacklistPath != "" {
|
||||
if err = loadBlacklist(config.CONFIG.BlacklistPath); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func loadBlacklist(filePath string) error {
|
||||
if blacklist != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
blacklist = []regexp.Regexp{}
|
||||
return xerrors.NewError(fmt.Errorf("could not load blacklist file: %w", err), 0, "", true)
|
||||
}
|
||||
|
||||
lines := strings.Split(string(data), "\n")
|
||||
blacklist = []regexp.Regexp{}
|
||||
|
||||
for _, line := range lines {
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
regex, err := regexp.Compile(line)
|
||||
if err != nil {
|
||||
return xerrors.NewError(fmt.Errorf("could not compile blacklist line %s: %w", line, err), 0, "", true)
|
||||
}
|
||||
blacklist = append(blacklist, *regex)
|
||||
}
|
||||
|
||||
if len(blacklist) > 0 {
|
||||
logging.LogInfo("Loaded %d blacklist entries", len(blacklist))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func Shutdown() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsBlacklisted checks if the URL matches any blacklist pattern
|
||||
func IsBlacklisted(u string) bool {
|
||||
for _, v := range Blacklist {
|
||||
for _, v := range blacklist {
|
||||
if v.MatchString(u) {
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -3,16 +3,17 @@ package blackList
|
||||
import (
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"gemini-grc/config"
|
||||
)
|
||||
|
||||
func TestIsBlacklisted(t *testing.T) {
|
||||
// Save original blacklist to restore after test
|
||||
originalBlacklist := Blacklist
|
||||
// Save original blacklist and whitelist to restore after test
|
||||
originalBlacklist := blacklist
|
||||
defer func() {
|
||||
Blacklist = originalBlacklist
|
||||
blacklist = originalBlacklist
|
||||
}()
|
||||
|
||||
tests := []struct {
|
||||
@@ -24,7 +25,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
{
|
||||
name: "empty blacklist",
|
||||
setup: func() {
|
||||
Blacklist = []regexp.Regexp{}
|
||||
blacklist = []regexp.Regexp{}
|
||||
},
|
||||
url: "https://example.com",
|
||||
expected: false,
|
||||
@@ -33,7 +34,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
name: "exact hostname match",
|
||||
setup: func() {
|
||||
regex, _ := regexp.Compile(`example\.com`)
|
||||
Blacklist = []regexp.Regexp{*regex}
|
||||
blacklist = []regexp.Regexp{*regex}
|
||||
},
|
||||
url: "example.com",
|
||||
expected: true,
|
||||
@@ -42,7 +43,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
name: "hostname in URL match",
|
||||
setup: func() {
|
||||
regex, _ := regexp.Compile(`example\.com`)
|
||||
Blacklist = []regexp.Regexp{*regex}
|
||||
blacklist = []regexp.Regexp{*regex}
|
||||
},
|
||||
url: "https://example.com/path",
|
||||
expected: true,
|
||||
@@ -51,7 +52,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
name: "partial hostname match",
|
||||
setup: func() {
|
||||
regex, _ := regexp.Compile(`example\.com`)
|
||||
Blacklist = []regexp.Regexp{*regex}
|
||||
blacklist = []regexp.Regexp{*regex}
|
||||
},
|
||||
url: "https://safe-example.com",
|
||||
expected: true,
|
||||
@@ -60,7 +61,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
name: "full URL match",
|
||||
setup: func() {
|
||||
regex, _ := regexp.Compile(`https://example\.com/bad-path`)
|
||||
Blacklist = []regexp.Regexp{*regex}
|
||||
blacklist = []regexp.Regexp{*regex}
|
||||
},
|
||||
url: "https://example.com/bad-path",
|
||||
expected: true,
|
||||
@@ -69,7 +70,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
name: "path match",
|
||||
setup: func() {
|
||||
regex, _ := regexp.Compile("/malicious-path")
|
||||
Blacklist = []regexp.Regexp{*regex}
|
||||
blacklist = []regexp.Regexp{*regex}
|
||||
},
|
||||
url: "https://example.com/malicious-path",
|
||||
expected: true,
|
||||
@@ -78,7 +79,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
name: "subdomain match with word boundary",
|
||||
setup: func() {
|
||||
regex, _ := regexp.Compile(`bad\.example\.com`)
|
||||
Blacklist = []regexp.Regexp{*regex}
|
||||
blacklist = []regexp.Regexp{*regex}
|
||||
},
|
||||
url: "https://bad.example.com/path",
|
||||
expected: true,
|
||||
@@ -89,7 +90,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
regex1, _ := regexp.Compile(`badsite\.com`)
|
||||
regex2, _ := regexp.Compile(`malicious\.org`)
|
||||
regex3, _ := regexp.Compile(`example\.com/sensitive`)
|
||||
Blacklist = []regexp.Regexp{*regex1, *regex2, *regex3}
|
||||
blacklist = []regexp.Regexp{*regex1, *regex2, *regex3}
|
||||
},
|
||||
url: "https://example.com/sensitive/data",
|
||||
expected: true,
|
||||
@@ -100,7 +101,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
regex1, _ := regexp.Compile(`badsite\.com`)
|
||||
regex2, _ := regexp.Compile(`malicious\.org`)
|
||||
regex3, _ := regexp.Compile(`example\.com/sensitive`)
|
||||
Blacklist = []regexp.Regexp{*regex1, *regex2, *regex3}
|
||||
blacklist = []regexp.Regexp{*regex1, *regex2, *regex3}
|
||||
},
|
||||
url: "https://example.com/safe/data",
|
||||
expected: false,
|
||||
@@ -109,7 +110,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
name: "pattern with wildcard",
|
||||
setup: func() {
|
||||
regex, _ := regexp.Compile(`.*\.evil\.com`)
|
||||
Blacklist = []regexp.Regexp{*regex}
|
||||
blacklist = []regexp.Regexp{*regex}
|
||||
},
|
||||
url: "https://subdomain.evil.com/path",
|
||||
expected: true,
|
||||
@@ -118,7 +119,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
name: "pattern with special characters",
|
||||
setup: func() {
|
||||
regex, _ := regexp.Compile(`example\.com/path\?id=[0-9]+`)
|
||||
Blacklist = []regexp.Regexp{*regex}
|
||||
blacklist = []regexp.Regexp{*regex}
|
||||
},
|
||||
url: "https://example.com/path?id=12345",
|
||||
expected: true,
|
||||
@@ -127,7 +128,7 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
name: "unicode character support",
|
||||
setup: func() {
|
||||
regex, _ := regexp.Compile(`example\.com/[\p{L}]+`)
|
||||
Blacklist = []regexp.Regexp{*regex}
|
||||
blacklist = []regexp.Regexp{*regex}
|
||||
},
|
||||
url: "https://example.com/café",
|
||||
expected: true,
|
||||
@@ -145,12 +146,88 @@ func TestIsBlacklisted(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadBlacklist(t *testing.T) {
|
||||
// Save original blacklist to restore after test
|
||||
originalBlacklist := Blacklist
|
||||
// TestBlacklistLoading tests that the blacklist loading logic works with a mock blacklist file
|
||||
func TestBlacklistLoading(t *testing.T) {
|
||||
// Save original blacklist and config
|
||||
originalBlacklist := blacklist
|
||||
originalConfigPath := config.CONFIG.BlacklistPath
|
||||
defer func() {
|
||||
Blacklist = originalBlacklist
|
||||
blacklist = originalBlacklist
|
||||
config.CONFIG.BlacklistPath = originalConfigPath
|
||||
}()
|
||||
|
||||
// Create a temporary blacklist file with known patterns
|
||||
tmpFile, err := os.CreateTemp("", "mock-blacklist-*.txt")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create temporary file: %v", err)
|
||||
}
|
||||
defer os.Remove(tmpFile.Name())
|
||||
|
||||
// Write some test patterns to the mock blacklist file
|
||||
mockBlacklistContent := `# Mock blacklist file for testing
|
||||
/git/
|
||||
/.git/
|
||||
/cgit/
|
||||
gemini://git\..*$
|
||||
gemini://.*/git/.*
|
||||
gopher://.*/git/.*
|
||||
.*/(commit|blob|tree)/.*
|
||||
.*/[0-9a-f]{7,40}$
|
||||
`
|
||||
if err := os.WriteFile(tmpFile.Name(), []byte(mockBlacklistContent), 0o644); err != nil {
|
||||
t.Fatalf("Failed to write to temporary file: %v", err)
|
||||
}
|
||||
|
||||
// Configure and load the mock blacklist
|
||||
blacklist = nil
|
||||
config.CONFIG.BlacklistPath = tmpFile.Name()
|
||||
err = Initialize()
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to load mock blacklist: %v", err)
|
||||
}
|
||||
|
||||
// Count the number of non-comment, non-empty lines to verify loading
|
||||
lineCount := 0
|
||||
for _, line := range strings.Split(mockBlacklistContent, "\n") {
|
||||
if line != "" && !strings.HasPrefix(line, "#") {
|
||||
lineCount++
|
||||
}
|
||||
}
|
||||
|
||||
if len(blacklist) != lineCount {
|
||||
t.Errorf("Expected %d patterns to be loaded, got %d", lineCount, len(blacklist))
|
||||
}
|
||||
|
||||
// Verify some sample URLs against our known patterns
|
||||
testURLs := []struct {
|
||||
url string
|
||||
expected bool
|
||||
desc string
|
||||
}{
|
||||
{"gemini://example.com/git/repo", true, "git repository"},
|
||||
{"gemini://git.example.com", true, "git subdomain"},
|
||||
{"gemini://example.com/cgit/repo", true, "cgit repository"},
|
||||
{"gemini://example.com/repo/commit/abc123", true, "git commit"},
|
||||
{"gemini://example.com/123abc7", true, "commit hash at path end"},
|
||||
{"gopher://example.com/1/git/repo", true, "gopher git repository"},
|
||||
{"gemini://example.com/normal/page.gmi", false, "normal gemini page"},
|
||||
{"gemini://example.com/project/123abc", false, "hash not at path end"},
|
||||
}
|
||||
|
||||
for _, tt := range testURLs {
|
||||
result := IsBlacklisted(tt.url)
|
||||
if result != tt.expected {
|
||||
t.Errorf("With mock blacklist, IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadBlacklist(t *testing.T) {
|
||||
// Save original blacklist to restore after test
|
||||
originalBlacklist := blacklist
|
||||
originalConfigPath := config.CONFIG.BlacklistPath
|
||||
defer func() {
|
||||
blacklist = originalBlacklist
|
||||
config.CONFIG.BlacklistPath = originalConfigPath
|
||||
}()
|
||||
|
||||
@@ -161,7 +238,7 @@ func TestLoadBlacklist(t *testing.T) {
|
||||
}
|
||||
defer os.Remove(tmpFile.Name())
|
||||
|
||||
// Test cases for LoadBlacklist
|
||||
// Test cases for Initialize
|
||||
tests := []struct {
|
||||
name string
|
||||
blacklistLines []string
|
||||
@@ -202,7 +279,7 @@ func TestLoadBlacklist(t *testing.T) {
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
// Reset blacklist
|
||||
Blacklist = nil
|
||||
blacklist = nil
|
||||
|
||||
// Set config path
|
||||
config.CONFIG.BlacklistPath = tt.configPath
|
||||
@@ -219,29 +296,186 @@ func TestLoadBlacklist(t *testing.T) {
|
||||
}
|
||||
|
||||
// Call the function
|
||||
err := LoadBlacklist()
|
||||
err := Initialize()
|
||||
|
||||
// Check results
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("LoadBlacklist() error = %v, wantErr %v", err, tt.wantErr)
|
||||
t.Errorf("Initialize() error = %v, wantErr %v", err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
|
||||
if !tt.wantErr && len(Blacklist) != tt.expectedLen {
|
||||
t.Errorf("LoadBlacklist() loaded %d entries, want %d", len(Blacklist), tt.expectedLen)
|
||||
if !tt.wantErr && len(blacklist) != tt.expectedLen {
|
||||
t.Errorf("Initialize() loaded %d entries, want %d", len(blacklist), tt.expectedLen)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestGitPatterns tests the blacklist patterns specifically for Git repositories
|
||||
func TestGitPatterns(t *testing.T) {
|
||||
// Save original blacklist to restore after test
|
||||
originalBlacklist := blacklist
|
||||
defer func() {
|
||||
blacklist = originalBlacklist
|
||||
}()
|
||||
|
||||
// Create patterns similar to those in the blacklist.txt file
|
||||
patterns := []string{
|
||||
"/git/",
|
||||
"/.git/",
|
||||
"/cgit/",
|
||||
"/gitweb/",
|
||||
"/gitea/",
|
||||
"/scm/",
|
||||
".*/(commit|blob|tree|tag|diff|blame|log|raw)/.*",
|
||||
".*/(commits|objects|refs|branches|tags)/.*",
|
||||
".*/[0-9a-f]{7,40}$",
|
||||
"gemini://git\\..*$",
|
||||
"gemini://.*/git/.*",
|
||||
"gemini://.*\\.git/.*",
|
||||
"gopher://.*/git/.*",
|
||||
}
|
||||
|
||||
// Compile and set up the patterns
|
||||
blacklist = []regexp.Regexp{}
|
||||
for _, pattern := range patterns {
|
||||
regex, err := regexp.Compile(pattern)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to compile pattern %q: %v", pattern, err)
|
||||
}
|
||||
blacklist = append(blacklist, *regex)
|
||||
}
|
||||
|
||||
// Test URLs against git-related patterns
|
||||
tests := []struct {
|
||||
url string
|
||||
expected bool
|
||||
desc string
|
||||
}{
|
||||
// Git paths
|
||||
{"gemini://example.com/git/", true, "basic git path"},
|
||||
{"gemini://example.com/.git/", true, "hidden git path"},
|
||||
{"gemini://example.com/cgit/", true, "cgit path"},
|
||||
{"gemini://example.com/gitweb/", true, "gitweb path"},
|
||||
{"gemini://example.com/gitea/", true, "gitea path"},
|
||||
{"gemini://example.com/scm/", true, "scm path"},
|
||||
|
||||
// Git operations
|
||||
{"gemini://example.com/repo/commit/abc123", true, "commit path"},
|
||||
{"gemini://example.com/repo/blob/main/README.md", true, "blob path"},
|
||||
{"gemini://example.com/repo/tree/master", true, "tree path"},
|
||||
{"gemini://example.com/repo/tag/v1.0", true, "tag path"},
|
||||
|
||||
// Git internals
|
||||
{"gemini://example.com/repo/commits/", true, "commits path"},
|
||||
{"gemini://example.com/repo/objects/", true, "objects path"},
|
||||
{"gemini://example.com/repo/refs/heads/main", true, "refs path"},
|
||||
|
||||
// Git hashes
|
||||
{"gemini://example.com/commit/a1b2c3d", true, "short hash"},
|
||||
{"gemini://example.com/commit/a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0", true, "long hash"},
|
||||
|
||||
// Git domains
|
||||
{"gemini://git.example.com/", true, "git subdomain"},
|
||||
{"gemini://example.com/git/repo", true, "git directory"},
|
||||
{"gemini://example.com/project.git/", true, "git extension"},
|
||||
|
||||
// Gopher protocol
|
||||
{"gopher://example.com/1/git/repo", true, "gopher git path"},
|
||||
|
||||
// Non-matching URLs
|
||||
{"gemini://example.com/project/", false, "regular project path"},
|
||||
{"gemini://example.com/blog/", false, "blog path"},
|
||||
{"gemini://example.com/git-guide.gmi", false, "hyphenated word with git"},
|
||||
{"gemini://example.com/digital/", false, "word containing 'git'"},
|
||||
{"gemini://example.com/ab12cd3", true, "short hex string matches commit hash pattern"},
|
||||
{"gemini://example.com/ab12cdz", false, "alphanumeric string with non-hex chars won't match commit hash"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.desc, func(t *testing.T) {
|
||||
result := IsBlacklisted(tt.url)
|
||||
if result != tt.expected {
|
||||
t.Errorf("IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestGeminiGopherPatterns tests the blacklist patterns specific to Gemini and Gopher protocols
|
||||
func TestGeminiGopherPatterns(t *testing.T) {
|
||||
// Save original blacklist to restore after test
|
||||
originalBlacklist := blacklist
|
||||
defer func() {
|
||||
blacklist = originalBlacklist
|
||||
}()
|
||||
|
||||
// Create patterns for Gemini and Gopher
|
||||
patterns := []string{
|
||||
"gemini://badhost\\.com",
|
||||
"gemini://.*/cgi-bin/",
|
||||
"gemini://.*/private/",
|
||||
"gemini://.*\\.evil\\..*",
|
||||
"gopher://badhost\\.org",
|
||||
"gopher://.*/I/onlyfans/",
|
||||
"gopher://.*/[0-9]/(cgi|bin)/",
|
||||
}
|
||||
|
||||
// Compile and set up the patterns
|
||||
blacklist = []regexp.Regexp{}
|
||||
for _, pattern := range patterns {
|
||||
regex, err := regexp.Compile(pattern)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to compile pattern %q: %v", pattern, err)
|
||||
}
|
||||
blacklist = append(blacklist, *regex)
|
||||
}
|
||||
|
||||
// Test URLs against Gemini and Gopher patterns
|
||||
tests := []struct {
|
||||
url string
|
||||
expected bool
|
||||
desc string
|
||||
}{
|
||||
// Gemini URLs
|
||||
{"gemini://badhost.com/", true, "blacklisted gemini host"},
|
||||
{"gemini://badhost.com/page.gmi", true, "blacklisted gemini host with path"},
|
||||
{"gemini://example.com/cgi-bin/script.cgi", true, "gemini cgi-bin path"},
|
||||
{"gemini://example.com/private/docs", true, "gemini private path"},
|
||||
{"gemini://subdomain.evil.org", true, "gemini evil domain pattern"},
|
||||
{"gemini://example.com/public/docs", false, "safe gemini path"},
|
||||
{"gemini://goodhost.com/", false, "safe gemini host"},
|
||||
|
||||
// Gopher URLs
|
||||
{"gopher://badhost.org/1/menu", true, "blacklisted gopher host"},
|
||||
{"gopher://example.org/I/onlyfans/image", true, "gopher onlyfans path"},
|
||||
{"gopher://example.org/1/cgi/script", true, "gopher cgi path"},
|
||||
{"gopher://example.org/1/bin/executable", true, "gopher bin path"},
|
||||
{"gopher://example.org/0/text", false, "safe gopher text"},
|
||||
{"gopher://goodhost.org/1/menu", false, "safe gopher host"},
|
||||
|
||||
// Protocol distinction
|
||||
{"https://badhost.com/", false, "blacklisted host but wrong protocol"},
|
||||
{"http://example.com/cgi-bin/script.cgi", false, "bad path but wrong protocol"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.desc, func(t *testing.T) {
|
||||
result := IsBlacklisted(tt.url)
|
||||
if result != tt.expected {
|
||||
t.Errorf("IsBlacklisted(%q) = %v, want %v", tt.url, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsBlacklistedIntegration tests the integration between LoadBlacklist and IsBlacklisted
|
||||
func TestIsBlacklistedIntegration(t *testing.T) {
|
||||
// Save original blacklist to restore after test
|
||||
originalBlacklist := Blacklist
|
||||
originalConfigPath := config.CONFIG.BlacklistPath
|
||||
originalBlacklist := blacklist
|
||||
originalBlacklistPath := config.CONFIG.BlacklistPath
|
||||
defer func() {
|
||||
Blacklist = originalBlacklist
|
||||
config.CONFIG.BlacklistPath = originalConfigPath
|
||||
blacklist = originalBlacklist
|
||||
config.CONFIG.BlacklistPath = originalBlacklistPath
|
||||
}()
|
||||
|
||||
// Create a temporary blacklist file for testing
|
||||
@@ -264,12 +498,12 @@ malicious\.org
|
||||
}
|
||||
|
||||
// Set up the test
|
||||
Blacklist = nil
|
||||
blacklist = nil
|
||||
config.CONFIG.BlacklistPath = tmpFile.Name()
|
||||
|
||||
// Load the blacklist
|
||||
if err := LoadBlacklist(); err != nil {
|
||||
t.Fatalf("LoadBlacklist() failed: %v", err)
|
||||
if err := Initialize(); err != nil {
|
||||
t.Fatalf("Initialize() failed: %v", err)
|
||||
}
|
||||
|
||||
// Test URLs against the loaded blacklist
|
||||
|
||||
@@ -10,8 +10,15 @@ import (
|
||||
|
||||
type LinkList []url.URL
|
||||
|
||||
func (l *LinkList) Value() (driver.Value, error) {
|
||||
return json.Marshal(l)
|
||||
func (l LinkList) Value() (driver.Value, error) {
|
||||
if len(l) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
data, err := json.Marshal(l)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func (l *LinkList) Scan(value interface{}) error {
|
||||
@@ -19,7 +26,7 @@ func (l *LinkList) Scan(value interface{}) error {
|
||||
*l = nil
|
||||
return nil
|
||||
}
|
||||
b, ok := value.([]byte) // Type assertion! Converts to []byte
|
||||
b, ok := value.([]byte)
|
||||
if !ok {
|
||||
return fmt.Errorf("failed to scan LinkList: expected []byte, got %T", value)
|
||||
}
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
package common
|
||||
|
||||
import "os"
|
||||
|
||||
// FatalErrorsChan accepts errors from workers.
|
||||
// In case of fatal error, gracefully
|
||||
// exits the application.
|
||||
var (
|
||||
StatusChan chan WorkerStatus
|
||||
// ErrorsChan accepts errors from workers.
|
||||
// In case of fatal error, gracefully
|
||||
// exits the application.
|
||||
ErrorsChan chan error
|
||||
FatalErrorsChan chan error
|
||||
SignalsChan chan os.Signal
|
||||
)
|
||||
|
||||
const VERSION string = "0.0.1"
|
||||
|
||||
@@ -5,12 +5,12 @@ import (
|
||||
|
||||
"gemini-grc/common/linkList"
|
||||
commonUrl "gemini-grc/common/url"
|
||||
"github.com/antanst/go_errors"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
type Snapshot struct {
|
||||
ID int `db:"ID" json:"ID,omitempty"`
|
||||
ID int `db:"id" json:"ID,omitempty"`
|
||||
URL commonUrl.URL `db:"url" json:"url,omitempty"`
|
||||
Host string `db:"host" json:"host,omitempty"`
|
||||
Timestamp null.Time `db:"timestamp" json:"timestamp,omitempty"`
|
||||
@@ -27,7 +27,7 @@ type Snapshot struct {
|
||||
func SnapshotFromURL(u string, normalize bool) (*Snapshot, error) {
|
||||
url, err := commonUrl.ParseURL(u, "", normalize)
|
||||
if err != nil {
|
||||
return nil, go_errors.NewError(err)
|
||||
return nil, xerrors.NewError(err, 0, "", false)
|
||||
}
|
||||
newSnapshot := Snapshot{
|
||||
URL: *url,
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/antanst/go_errors"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
)
|
||||
|
||||
type URL struct {
|
||||
@@ -29,7 +29,7 @@ func (u *URL) Scan(value interface{}) error {
|
||||
}
|
||||
b, ok := value.(string)
|
||||
if !ok {
|
||||
return go_errors.NewFatalError(fmt.Errorf("database scan error: expected string, got %T", value))
|
||||
return xerrors.NewError(fmt.Errorf("database scan error: expected string, got %T", value), 0, "", true)
|
||||
}
|
||||
parsedURL, err := ParseURL(b, "", false)
|
||||
if err != nil {
|
||||
@@ -82,7 +82,7 @@ func ParseURL(input string, descr string, normalize bool) (*URL, error) {
|
||||
} else {
|
||||
u, err = url.Parse(input)
|
||||
if err != nil {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input), 0, "", false)
|
||||
}
|
||||
}
|
||||
protocol := u.Scheme
|
||||
@@ -99,7 +99,7 @@ func ParseURL(input string, descr string, normalize bool) (*URL, error) {
|
||||
}
|
||||
port, err := strconv.Atoi(strPort)
|
||||
if err != nil {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error parsing URL: %w: %s", err, input), 0, "", false)
|
||||
}
|
||||
full := fmt.Sprintf("%s://%s:%d%s", protocol, hostname, port, urlPath)
|
||||
// full field should also contain query params and url fragments
|
||||
@@ -145,13 +145,13 @@ func NormalizeURL(rawURL string) (*url.URL, error) {
|
||||
// Parse the URL
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error normalizing URL: %w: %s", err, rawURL))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error normalizing URL: %w: %s", err, rawURL), 0, "", false)
|
||||
}
|
||||
if u.Scheme == "" {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error normalizing URL: No scheme: %s", rawURL))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error normalizing URL: No scheme: %s", rawURL), 0, "", false)
|
||||
}
|
||||
if u.Host == "" {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error normalizing URL: No host: %s", rawURL))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error normalizing URL: No host: %s", rawURL), 0, "", false)
|
||||
}
|
||||
|
||||
// Convert scheme to lowercase
|
||||
@@ -275,7 +275,7 @@ func ExtractRedirectTargetFromHeader(currentURL URL, input string) (*URL, error)
|
||||
re := regexp.MustCompile(pattern)
|
||||
matches := re.FindStringSubmatch(input)
|
||||
if len(matches) < 2 {
|
||||
return nil, go_errors.NewError(fmt.Errorf("error extracting redirect target from string %s", input))
|
||||
return nil, xerrors.NewError(fmt.Errorf("error extracting redirect target from string %s", input), 0, "", false)
|
||||
}
|
||||
newURL, err := DeriveAbsoluteURL(currentURL, matches[1])
|
||||
if err != nil {
|
||||
|
||||
449
common/worker.go
449
common/worker.go
@@ -1,245 +1,271 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"gemini-grc/common/blackList"
|
||||
errors2 "gemini-grc/common/errors"
|
||||
"gemini-grc/common/contextlog"
|
||||
commonErrors "gemini-grc/common/errors"
|
||||
"gemini-grc/common/snapshot"
|
||||
url2 "gemini-grc/common/url"
|
||||
_db "gemini-grc/db"
|
||||
"gemini-grc/common/whiteList"
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/contextutil"
|
||||
gemdb "gemini-grc/db"
|
||||
"gemini-grc/gemini"
|
||||
"gemini-grc/gopher"
|
||||
"gemini-grc/hostPool"
|
||||
"gemini-grc/logging"
|
||||
"github.com/antanst/go_errors"
|
||||
"gemini-grc/robotsMatch"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
"github.com/guregu/null/v5"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
func CrawlOneURL(db *sqlx.DB, url *string) error {
|
||||
parsedURL, err := url2.ParseURL(*url, "", true)
|
||||
func RunWorkerWithTx(workerID int, job string) {
|
||||
// Extract host from URL for the context.
|
||||
parsedURL, err := url2.ParseURL(job, "", true)
|
||||
if err != nil {
|
||||
return err
|
||||
logging.LogInfo("Failed to parse job URL: %s Error: %s", job, err)
|
||||
return
|
||||
}
|
||||
host := parsedURL.Hostname
|
||||
|
||||
if !url2.IsGeminiUrl(parsedURL.String()) && !url2.IsGopherURL(parsedURL.String()) {
|
||||
return go_errors.NewError(fmt.Errorf("error parsing URL: not a Gemini or Gopher URL: %s", parsedURL.String()))
|
||||
}
|
||||
// Create a new worker context
|
||||
baseCtx := context.Background()
|
||||
ctx, cancel := contextutil.NewRequestContext(baseCtx, job, host, workerID)
|
||||
defer cancel() // Ensure the context is cancelled when we're done
|
||||
ctx = contextutil.ContextWithComponent(ctx, "worker")
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting worker for URL %s", job)
|
||||
|
||||
tx, err := db.Beginx()
|
||||
// Create a new db transaction
|
||||
tx, err := gemdb.Database.NewTx(ctx)
|
||||
if err != nil {
|
||||
return go_errors.NewFatalError(err)
|
||||
}
|
||||
|
||||
err = _db.InsertURL(tx, parsedURL.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = workOnUrl(0, tx, parsedURL.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = tx.Commit()
|
||||
if err != nil {
|
||||
//if _db.IsDeadlockError(err) {
|
||||
// logging.LogError("Deadlock detected. Rolling back")
|
||||
// time.Sleep(time.Duration(10) * time.Second)
|
||||
// err := tx.Rollback()
|
||||
// return go_errors.NewFatalError(err)
|
||||
//}
|
||||
return go_errors.NewFatalError(err)
|
||||
}
|
||||
logging.LogInfo("Done")
|
||||
return nil
|
||||
}
|
||||
|
||||
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
|
||||
logging.LogInfo("Spawning %d workers", numOfWorkers)
|
||||
go PrintWorkerStatus(numOfWorkers, StatusChan)
|
||||
|
||||
for i := range numOfWorkers {
|
||||
go func(i int) {
|
||||
UpdateWorkerStatus(i, "Waiting to start")
|
||||
// Jitter to avoid starting everything at the same time
|
||||
time.Sleep(time.Duration(i+2) * time.Second)
|
||||
for {
|
||||
// TODO: Use cancellable context with tx, logger & worker ID.
|
||||
// ctx := context.WithCancel()
|
||||
// ctx = context.WithValue(ctx, common.CtxKeyLogger, &RequestLogger{r: r})
|
||||
RunWorkerWithTx(i, db)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
}
|
||||
|
||||
func RunWorkerWithTx(workerID int, db *sqlx.DB) {
|
||||
defer func() {
|
||||
UpdateWorkerStatus(workerID, "Done")
|
||||
}()
|
||||
|
||||
tx, err := db.Beginx()
|
||||
if err != nil {
|
||||
ErrorsChan <- err
|
||||
FatalErrorsChan <- err
|
||||
return
|
||||
}
|
||||
|
||||
err = runWorker(workerID, tx)
|
||||
err = runWorker(ctx, tx, []string{job})
|
||||
if err != nil {
|
||||
// TODO: Rollback in this case?
|
||||
ErrorsChan <- err
|
||||
return
|
||||
}
|
||||
|
||||
logging.LogDebug("[%3d] Committing transaction", workerID)
|
||||
err = tx.Commit()
|
||||
// On deadlock errors, rollback and return, otherwise panic.
|
||||
if err != nil {
|
||||
logging.LogError("[%3d] Failed to commit transaction: %w", workerID, err)
|
||||
if _db.IsDeadlockError(err) {
|
||||
logging.LogError("[%3d] Deadlock detected. Rolling back", workerID)
|
||||
time.Sleep(time.Duration(10) * time.Second)
|
||||
err := tx.Rollback()
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("[%3d] Failed to roll back transaction: %v", workerID, err))
|
||||
// Handle context cancellation and timeout errors gracefully, instead of treating them as fatal
|
||||
if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker timed out or canceled: %v", err)
|
||||
rollbackErr := SafeRollback(ctx, tx)
|
||||
if rollbackErr != nil {
|
||||
FatalErrorsChan <- rollbackErr
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
panic(fmt.Sprintf("[%3d] Failed to commit transaction: %v", workerID, err))
|
||||
// For other errors, we treat them as fatal.
|
||||
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
|
||||
rollbackErr := SafeRollback(ctx, tx)
|
||||
if rollbackErr != nil {
|
||||
FatalErrorsChan <- rollbackErr
|
||||
}
|
||||
FatalErrorsChan <- err
|
||||
return
|
||||
}
|
||||
logging.LogDebug("[%3d] Worker done!", workerID)
|
||||
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Committing transaction")
|
||||
err = tx.Commit()
|
||||
if err != nil && !errors.Is(err, sql.ErrTxDone) {
|
||||
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to commit transaction: %v", err)
|
||||
if rollbackErr := SafeRollback(ctx, tx); rollbackErr != nil {
|
||||
FatalErrorsChan <- err
|
||||
return
|
||||
}
|
||||
}
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker done!")
|
||||
}
|
||||
|
||||
func runWorker(workerID int, tx *sqlx.Tx) error {
|
||||
var urls []string
|
||||
var err error
|
||||
|
||||
UpdateWorkerStatus(workerID, "Getting URLs from DB")
|
||||
urls, err = _db.GetRandomUrls(tx)
|
||||
// urls, err = _db.GetRandomUrlsWithBasePath(tx)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if len(urls) == 0 {
|
||||
logging.LogInfo("[%3d] No URLs to visit, sleeping...", workerID)
|
||||
UpdateWorkerStatus(workerID, "No URLs to visit, sleeping...")
|
||||
time.Sleep(1 * time.Minute)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Start visiting URLs.
|
||||
total := len(urls)
|
||||
for i, u := range urls {
|
||||
logging.LogInfo("[%3d] Starting %d/%d %s", workerID, i+1, total, u)
|
||||
UpdateWorkerStatus(workerID, fmt.Sprintf("Starting %d/%d %s", i+1, total, u))
|
||||
err := workOnUrl(workerID, tx, u)
|
||||
if err != nil {
|
||||
return err
|
||||
// SafeRollback attempts to roll back a transaction,
|
||||
// handling the case if the tx was already finalized.
|
||||
func SafeRollback(ctx context.Context, tx *sqlx.Tx) error {
|
||||
rollbackErr := tx.Rollback()
|
||||
if rollbackErr != nil {
|
||||
// Check if it's the standard "transaction already finalized" error
|
||||
if errors.Is(rollbackErr, sql.ErrTxDone) {
|
||||
contextlog.LogWarnWithContext(ctx, logging.GetSlogger(), "Rollback failed because transaction is already finalized")
|
||||
return nil
|
||||
}
|
||||
logging.LogDebug("[%3d] Done %d/%d.", workerID, i+1, total)
|
||||
UpdateWorkerStatus(workerID, fmt.Sprintf("Done %d/%d %s", i+1, total, u))
|
||||
// Only panic for other types of rollback failures
|
||||
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to rollback transaction: %v", rollbackErr)
|
||||
return xerrors.NewError(fmt.Errorf("failed to rollback transaction: %w", rollbackErr), 0, "", true)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// workOnUrl visits a URL and stores the result.
|
||||
func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error {
|
||||
total := len(urls)
|
||||
for i, u := range urls {
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting %d/%d %s", i+1, total, u)
|
||||
urlCtx, cancelFunc := context.WithCancel(ctx)
|
||||
err := WorkOnUrl(urlCtx, tx, u)
|
||||
cancelFunc()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Done %d/%d.", i+1, total)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// WorkOnUrl visits a URL and stores the result.
|
||||
// unexpected errors are returned.
|
||||
// expected errors are stored within the snapshot.
|
||||
func workOnUrl(workerID int, tx *sqlx.Tx, url string) (err error) {
|
||||
s, err := snapshot.SnapshotFromURL(url, false)
|
||||
func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
|
||||
// Create a context specifically for this URL with "url" component
|
||||
urlCtx := contextutil.ContextWithComponent(ctx, "url")
|
||||
|
||||
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Processing URL: %s", url)
|
||||
|
||||
s, err := snapshot.SnapshotFromURL(url, true)
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(urlCtx, logging.GetSlogger(), "Failed to parse URL: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
isGemini := url2.IsGeminiUrl(s.URL.String())
|
||||
isGopher := url2.IsGopherURL(s.URL.String())
|
||||
|
||||
if !isGemini && !isGopher {
|
||||
return go_errors.NewError(fmt.Errorf("not a Gopher or Gemini URL: %s", s.URL.String()))
|
||||
return xerrors.NewError(fmt.Errorf("not a Gopher or Gemini URL: %s", s.URL.String()), 0, "", false)
|
||||
}
|
||||
|
||||
if blackList.IsBlacklisted(s.URL.String()) {
|
||||
logging.LogInfo("[%3d] URL matches blacklist, ignoring", workerID)
|
||||
s.Error = null.StringFrom(errors2.ErrBlacklistMatch.Error())
|
||||
return saveSnapshotAndRemoveURL(tx, s)
|
||||
if isGopher && !config.CONFIG.GopherEnable {
|
||||
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Skipping gopher URL (disabled in config)")
|
||||
return nil
|
||||
}
|
||||
|
||||
if isGemini {
|
||||
if url != s.URL.Full {
|
||||
err = gemdb.Database.NormalizeURL(ctx, tx, url, s.URL.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Normalized URL: %s → %s", url, s.URL.Full)
|
||||
url = s.URL.Full
|
||||
}
|
||||
|
||||
// Check if URL is whitelisted
|
||||
isUrlWhitelisted := whiteList.IsWhitelisted(s.URL.String())
|
||||
if isUrlWhitelisted {
|
||||
contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "URL matches whitelist, forcing crawl %s", url)
|
||||
}
|
||||
|
||||
// Only check blacklist if URL is not whitelisted
|
||||
if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) {
|
||||
contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "URL matches blacklist, ignoring %s", url)
|
||||
s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error())
|
||||
return saveSnapshotAndRemoveURL(ctx, tx, s)
|
||||
}
|
||||
|
||||
// Only check robots.txt if URL is not whitelisted and is a Gemini URL
|
||||
var robotMatch bool
|
||||
if !isUrlWhitelisted && isGemini {
|
||||
// If URL matches a robots.txt disallow line,
|
||||
// add it as an error and remove url
|
||||
robotMatch, err := gemini.RobotMatch(s.URL.String())
|
||||
robotMatch, err = robotsMatch.RobotMatch(urlCtx, s.URL.String())
|
||||
if err != nil {
|
||||
// robotMatch returns only network errors!
|
||||
// we stop because we don't want to hit
|
||||
// the server with another request on this case.
|
||||
if commonErrors.IsHostError(err) {
|
||||
return removeURL(ctx, tx, s.URL.String())
|
||||
}
|
||||
return err
|
||||
}
|
||||
if robotMatch {
|
||||
logging.LogInfo("[%3d] URL matches robots.txt, ignoring", workerID)
|
||||
s.Error = null.StringFrom(errors2.ErrRobotsMatch.Error())
|
||||
return saveSnapshotAndRemoveURL(tx, s)
|
||||
contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "URL matches robots.txt, skipping")
|
||||
s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error())
|
||||
return saveSnapshotAndRemoveURL(ctx, tx, s)
|
||||
}
|
||||
}
|
||||
|
||||
logging.LogDebug("[%3d] Adding to pool %s", workerID, s.URL.String())
|
||||
UpdateWorkerStatus(workerID, fmt.Sprintf("Adding to pool %s", s.URL.String()))
|
||||
hostPool.AddHostToHostPool(s.Host)
|
||||
defer func(s string) {
|
||||
hostPool.RemoveHostFromPool(s)
|
||||
}(s.Host)
|
||||
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Adding to host pool")
|
||||
err = hostPool.AddHostToHostPool(urlCtx, s.Host)
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(urlCtx, logging.GetSlogger(), "Failed to add host to pool: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
logging.LogDebug("[%3d] Visiting %s", workerID, s.URL.String())
|
||||
UpdateWorkerStatus(workerID, fmt.Sprintf("Visiting %s", s.URL.String()))
|
||||
defer func(ctx context.Context, host string) {
|
||||
hostPool.RemoveHostFromPool(ctx, host)
|
||||
}(urlCtx, s.Host)
|
||||
|
||||
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Visiting %s", s.URL.String())
|
||||
|
||||
// Use context-aware visits for both protocols
|
||||
if isGopher {
|
||||
s, err = gopher.Visit(s.URL.String())
|
||||
// Use the context-aware version for Gopher visits
|
||||
s, err = gopher.VisitWithContext(urlCtx, s.URL.String())
|
||||
} else {
|
||||
s, err = gemini.Visit(s.URL.String())
|
||||
// Use the context-aware version for Gemini visits
|
||||
s, err = gemini.Visit(urlCtx, s.URL.String())
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(urlCtx, logging.GetSlogger(), "Error visiting URL: %v", err)
|
||||
return err
|
||||
}
|
||||
if s == nil {
|
||||
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "No snapshot returned")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Handle Gemini redirection.
|
||||
if isGemini &&
|
||||
s.ResponseCode.ValueOrZero() >= 30 &&
|
||||
s.ResponseCode.ValueOrZero() < 40 {
|
||||
err = handleRedirection(workerID, tx, s)
|
||||
err = handleRedirection(urlCtx, tx, s)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while handling redirection: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Store links
|
||||
// Check if content is identical to previous snapshot and we should skip further processing
|
||||
if config.CONFIG.SkipIdenticalContent {
|
||||
identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if identical {
|
||||
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Content identical to existing snapshot, skipping")
|
||||
return removeURL(ctx, tx, s.URL.String())
|
||||
}
|
||||
}
|
||||
|
||||
// Process and store links since content has changed
|
||||
if len(s.Links.ValueOrZero()) > 0 {
|
||||
logging.LogDebug("[%3d] Found %d links", workerID, len(s.Links.ValueOrZero()))
|
||||
err = storeLinks(tx, s)
|
||||
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Found %d links", len(s.Links.ValueOrZero()))
|
||||
err = storeLinks(ctx, tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
|
||||
return saveSnapshotAndRemoveURL(tx, s)
|
||||
// Save the snapshot and remove the URL from the queue
|
||||
contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "%2d %s", s.ResponseCode.ValueOrZero(), s.URL.String())
|
||||
return saveSnapshotAndRemoveURL(ctx, tx, s)
|
||||
}
|
||||
|
||||
func storeLinks(tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
// storeLinks checks and stores the snapshot links in the database.
|
||||
func storeLinks(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
if s.Links.Valid { //nolint:nestif
|
||||
for _, link := range s.Links.ValueOrZero() {
|
||||
if shouldPersistURL(&link) {
|
||||
visited, err := haveWeVisitedURL(tx, link.Full)
|
||||
visited, err := haveWeVisitedURL(ctx, tx, link.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !visited {
|
||||
err := _db.InsertURL(tx, link.Full)
|
||||
err := gemdb.Database.InsertURL(ctx, tx, link.Full)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
logging.LogDebug("Link already persisted: %s", link.Full)
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Link already persisted: %s", link.Full)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -247,74 +273,117 @@ func storeLinks(tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func saveSnapshotAndRemoveURL(tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
err := _db.OverwriteSnapshot(tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = _db.DeleteURL(tx, s.URL.String())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
// Context-aware version of removeURL
|
||||
func removeURL(ctx context.Context, tx *sqlx.Tx, url string) error {
|
||||
return gemdb.Database.DeleteURL(ctx, tx, url)
|
||||
}
|
||||
|
||||
// shouldPersistURL returns true if we
|
||||
// should save the URL in the _db.
|
||||
// Only gemini:// urls are saved.
|
||||
// Context-aware version of saveSnapshotAndRemoveURL
|
||||
func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
err := gemdb.Database.SaveSnapshot(ctx, tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return gemdb.Database.DeleteURL(ctx, tx, s.URL.String())
|
||||
}
|
||||
|
||||
// shouldPersistURL returns true given URL is a
|
||||
// non-blacklisted Gemini or Gopher URL.
|
||||
func shouldPersistURL(u *url2.URL) bool {
|
||||
return url2.IsGeminiUrl(u.String()) || url2.IsGopherURL(u.String())
|
||||
if blackList.IsBlacklisted(u.String()) {
|
||||
return false
|
||||
}
|
||||
if config.CONFIG.GopherEnable && url2.IsGopherURL(u.String()) {
|
||||
return true
|
||||
}
|
||||
return url2.IsGeminiUrl(u.String())
|
||||
}
|
||||
|
||||
func haveWeVisitedURL(tx *sqlx.Tx, u string) (bool, error) {
|
||||
func haveWeVisitedURL(ctx context.Context, tx *sqlx.Tx, u string) (bool, error) {
|
||||
var result []bool
|
||||
err := tx.Select(&result, `SELECT TRUE FROM urls WHERE url=$1`, u)
|
||||
|
||||
// Check if the context is cancelled
|
||||
if err := ctx.Err(); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
// Check the urls table which holds the crawl queue.
|
||||
err := tx.SelectContext(ctx, &result, `SELECT TRUE FROM urls WHERE url=$1`, u)
|
||||
if err != nil {
|
||||
return false, go_errors.NewFatalError(fmt.Errorf("database error: %w", err))
|
||||
return false, xerrors.NewError(fmt.Errorf("database error: %w", err), 0, "", true)
|
||||
}
|
||||
if len(result) > 0 {
|
||||
return result[0], nil
|
||||
return false, nil
|
||||
}
|
||||
err = tx.Select(&result, `SELECT TRUE FROM snapshots WHERE snapshots.url=$1`, u)
|
||||
if err != nil {
|
||||
return false, go_errors.NewFatalError(fmt.Errorf("database error: %w", err))
|
||||
}
|
||||
if len(result) > 0 {
|
||||
return result[0], nil
|
||||
|
||||
// If we're skipping URLs based on recent updates, check if this URL has been
|
||||
// crawled within the specified number of days
|
||||
if config.CONFIG.SkipIfUpdatedDays > 0 {
|
||||
var recentSnapshots []bool
|
||||
cutoffDate := time.Now().AddDate(0, 0, -config.CONFIG.SkipIfUpdatedDays)
|
||||
|
||||
// Check if the context is cancelled
|
||||
if err := ctx.Err(); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
err = tx.SelectContext(ctx, &recentSnapshots, `
|
||||
SELECT TRUE FROM snapshots
|
||||
WHERE snapshots.url=$1
|
||||
AND timestamp > $2
|
||||
LIMIT 1`, u, cutoffDate)
|
||||
if err != nil {
|
||||
return false, xerrors.NewError(fmt.Errorf("database error checking recent snapshots: %w", err), 0, "", true)
|
||||
}
|
||||
|
||||
if len(recentSnapshots) > 0 {
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Skipping URL %s (updated within last %d days)", u, config.CONFIG.SkipIfUpdatedDays)
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// handleRedirection saves redirection URL.
|
||||
func handleRedirection(workerID int, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
newURL, err := url2.ExtractRedirectTargetFromHeader(s.URL, s.Error.ValueOrZero())
|
||||
func handleRedirection(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
// Create a context specifically for redirection handling
|
||||
redirectCtx := contextutil.ContextWithComponent(ctx, "redirect")
|
||||
|
||||
// Use the redirectCtx for all operations
|
||||
newURL, err := url2.ExtractRedirectTargetFromHeader(s.URL, s.Header.ValueOrZero())
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(redirectCtx, logging.GetSlogger(), "Failed to extract redirect target: %v", err)
|
||||
return err
|
||||
}
|
||||
contextlog.LogDebugWithContext(redirectCtx, logging.GetSlogger(), "Page redirects to %s", newURL)
|
||||
|
||||
haveWeVisited, err := haveWeVisitedURL(redirectCtx, tx, newURL.String())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
logging.LogDebug("[%3d] Page redirects to %s", workerID, newURL)
|
||||
|
||||
haveWeVisited, _ := haveWeVisitedURL(tx, newURL.String())
|
||||
if shouldPersistURL(newURL) && !haveWeVisited {
|
||||
err = _db.InsertURL(tx, newURL.Full)
|
||||
err = gemdb.Database.InsertURL(redirectCtx, tx, newURL.Full)
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(redirectCtx, logging.GetSlogger(), "Failed to insert redirect URL: %v", err)
|
||||
return err
|
||||
}
|
||||
logging.LogDebug("[%3d] Saved redirection URL %s", workerID, newURL.String())
|
||||
contextlog.LogDebugWithContext(redirectCtx, logging.GetSlogger(), "Saved redirection URL %s", newURL.String())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]snapshot.Snapshot, error) {
|
||||
query := `
|
||||
SELECT *
|
||||
FROM snapshots
|
||||
WHERE url=$1
|
||||
LIMIT 1
|
||||
`
|
||||
var snapshots []snapshot.Snapshot
|
||||
err := tx.Select(&snapshots, query, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return snapshots, nil
|
||||
}
|
||||
//func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]snapshot.Snapshot, error) {
|
||||
// query := `
|
||||
// SELECT *
|
||||
// FROM snapshots
|
||||
// WHERE url=$1
|
||||
// LIMIT 1
|
||||
// `
|
||||
// var snapshots []snapshot.Snapshot
|
||||
// err := tx.Select(&snapshots, query, url)
|
||||
// if err != nil {
|
||||
// return nil, err
|
||||
// }
|
||||
// return snapshots, nil
|
||||
//}
|
||||
|
||||
Reference in New Issue
Block a user