Enhance crawler with seed list and SQL utilities

Add the seedList module for URL initialization and comprehensive SQL utilities for database analysis, and update the project configuration.
This commit is contained in:
antanst
2025-06-16 12:29:33 +03:00
parent 51f94c90b2
commit 330b596497
37 changed files with 742 additions and 682 deletions

View File

@@ -0,0 +1,67 @@
package seedList
import (
"fmt"
"os"
"strings"
"git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors"
)
// seedlist holds the seed URLs read by loadSeedlist. It is nil until the first
// load attempt; after that attempt it is always non-nil (possibly empty), and
// loadSeedlist uses that non-nil state to skip repeated loads.
var seedlist []string //nolint:gochecknoglobals
// Initialize loads the seed list from the fixed relative path "seed_urls.txt".
// It returns the error reported by loadSeedlist, or nil when loading succeeds
// or the list was already loaded.
func Initialize() error {
	return loadSeedlist("seed_urls.txt")
}
// loadSeedlist fills the package-level seedlist from the file at filePath.
//
// The call is a no-op when seedlist is already non-nil. If the file cannot be
// read, seedlist is set to an empty (non-nil) slice and a wrapped error is
// returned; because of the non-nil check, later calls will not retry the read.
//
// The file is split on "\n"; each line is whitespace-trimmed, and blank lines
// and lines starting with "#" are skipped. Remaining lines are stored in file
// order. An info message is logged only when at least one URL was loaded.
func loadSeedlist(filePath string) error {
	if seedlist != nil {
		return nil
	}

	raw, readErr := os.ReadFile(filePath)
	if readErr != nil {
		seedlist = []string{}
		return xerrors.NewError(fmt.Errorf("could not load seedlist file: %w", readErr), 0, "", true)
	}

	urls := []string{}
	for _, entry := range strings.Split(string(raw), "\n") {
		// Keep only entries that are neither blank nor comments.
		if entry = strings.TrimSpace(entry); entry != "" && !strings.HasPrefix(entry, "#") {
			urls = append(urls, entry)
		}
	}
	seedlist = urls

	if count := len(seedlist); count > 0 {
		logging.LogInfo("Loaded %d seed URLs", count)
	}
	return nil
}
// Shutdown performs no work and always returns nil.
// NOTE(review): presumably kept as a lifecycle hook alongside Initialize — confirm.
func Shutdown() error {
	return nil
}
// GetSeedURLs returns the loaded seed URLs as a fresh slice, so callers can
// modify the result without affecting the package-level list. The result is
// never nil: when nothing has been loaded it is an empty slice.
func GetSeedURLs() []string {
	// make yields a non-nil, zero-length slice even when seedlist is nil, and
	// the append stays within the preallocated capacity, so this covers both
	// the "not loaded" case and the copy in a single expression.
	return append(make([]string, 0, len(seedlist)), seedlist...)
}

View File

@@ -0,0 +1,67 @@
package seedList
import (
"os"
"testing"
)
// TestLoadSeedlist checks that loadSeedlist reads URLs from a file, skips
// comment lines, and that GetSeedURLs returns the entries in file order.
func TestLoadSeedlist(t *testing.T) {
	// Seed file mixing comment lines with real entries.
	content := `# Test seed URLs
gemini://example.com/
gemini://test.com/
# Another comment
gemini://demo.org/`

	// The directory from t.TempDir is removed automatically when the test ends,
	// so no explicit os.Remove is needed.
	tmpFile, err := os.CreateTemp(t.TempDir(), "seed_urls_test_*.txt")
	if err != nil {
		t.Fatalf("Failed to create temp file: %v", err)
	}
	if _, err := tmpFile.WriteString(content); err != nil {
		_ = tmpFile.Close() // best effort; the write error is the one to report
		t.Fatalf("Failed to write to temp file: %v", err)
	}
	if err := tmpFile.Close(); err != nil {
		t.Fatalf("Failed to close temp file: %v", err)
	}

	// loadSeedlist does nothing once seedlist is non-nil, so force a fresh load
	// and restore the previous value afterwards so other tests are unaffected.
	originalSeedlist := seedlist
	defer func() { seedlist = originalSeedlist }()
	seedlist = nil

	if err := loadSeedlist(tmpFile.Name()); err != nil {
		t.Fatalf("Failed to load seedlist: %v", err)
	}

	expected := []string{
		"gemini://example.com/",
		"gemini://test.com/",
		"gemini://demo.org/",
	}
	urls := GetSeedURLs()

	// Stop here on a length mismatch: the loop below indexes expected by
	// position and would panic if more URLs were returned than expected.
	if len(urls) != len(expected) {
		t.Fatalf("Expected %d URLs, got %d: %v", len(expected), len(urls), urls)
	}
	for i, url := range urls {
		if url != expected[i] {
			t.Errorf("Expected URL %d to be %s, got %s", i, expected[i], url)
		}
	}
}
// TestGetSeedURLsEmptyList checks that GetSeedURLs yields no URLs while the
// package-level seedlist is nil.
func TestGetSeedURLsEmptyList(t *testing.T) {
	// Swap the global out for the duration of the test and put it back after.
	saved := seedlist
	seedlist = nil
	defer func() { seedlist = saved }()

	if got := GetSeedURLs(); len(got) != 0 {
		t.Errorf("Expected empty list, got %d URLs", len(got))
	}
}