Enhance crawler with seed list and SQL utilities
Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
This commit is contained in:
67
common/seedList/seedlist.go
Normal file
67
common/seedList/seedlist.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package seedList
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"git.antanst.com/antanst/logging"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
)
|
||||
|
||||
// seedlist caches the seed URLs parsed by loadSeedlist. loadSeedlist skips
// loading when it is already non-nil, and GetSeedURLs hands out copies of it.
var seedlist []string //nolint:gochecknoglobals
|
||||
|
||||
func Initialize() error {
|
||||
var err error
|
||||
|
||||
// Initialize seedlist from fixed path
|
||||
if err = loadSeedlist("seed_urls.txt"); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func loadSeedlist(filePath string) error {
|
||||
if seedlist != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
seedlist = []string{}
|
||||
return xerrors.NewError(fmt.Errorf("could not load seedlist file: %w", err), 0, "", true)
|
||||
}
|
||||
|
||||
lines := strings.Split(string(data), "\n")
|
||||
seedlist = []string{}
|
||||
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
seedlist = append(seedlist, line)
|
||||
}
|
||||
|
||||
if len(seedlist) > 0 {
|
||||
logging.LogInfo("Loaded %d seed URLs", len(seedlist))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Shutdown is a no-op: it releases nothing and always returns nil.
// NOTE(review): presumably kept as the counterpart to Initialize — confirm
// against callers.
func Shutdown() error {
	return nil
}
|
||||
|
||||
// GetSeedURLs returns the list of seed URLs
|
||||
func GetSeedURLs() []string {
|
||||
if seedlist == nil {
|
||||
return []string{}
|
||||
}
|
||||
// Return a copy to prevent external modification
|
||||
result := make([]string, len(seedlist))
|
||||
copy(result, seedlist)
|
||||
return result
|
||||
}
|
||||
67
common/seedList/seedlist_test.go
Normal file
67
common/seedList/seedlist_test.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package seedList
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLoadSeedlist(t *testing.T) {
|
||||
// Create a temporary test file
|
||||
content := `# Test seed URLs
|
||||
gemini://example.com/
|
||||
gemini://test.com/
|
||||
|
||||
# Another comment
|
||||
gemini://demo.org/`
|
||||
|
||||
tmpFile, err := os.CreateTemp("", "seed_urls_test_*.txt")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create temp file: %v", err)
|
||||
}
|
||||
defer os.Remove(tmpFile.Name())
|
||||
|
||||
if _, err := tmpFile.WriteString(content); err != nil {
|
||||
t.Fatalf("Failed to write to temp file: %v", err)
|
||||
}
|
||||
tmpFile.Close()
|
||||
|
||||
// Reset global variable for test
|
||||
seedlist = nil
|
||||
|
||||
// Test loading
|
||||
err = loadSeedlist(tmpFile.Name())
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to load seedlist: %v", err)
|
||||
}
|
||||
|
||||
// Verify content
|
||||
expected := []string{
|
||||
"gemini://example.com/",
|
||||
"gemini://test.com/",
|
||||
"gemini://demo.org/",
|
||||
}
|
||||
|
||||
urls := GetSeedURLs()
|
||||
if len(urls) != len(expected) {
|
||||
t.Errorf("Expected %d URLs, got %d", len(expected), len(urls))
|
||||
}
|
||||
|
||||
for i, url := range urls {
|
||||
if url != expected[i] {
|
||||
t.Errorf("Expected URL %d to be %s, got %s", i, expected[i], url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetSeedURLsEmptyList(t *testing.T) {
|
||||
// Reset global variable
|
||||
originalSeedlist := seedlist
|
||||
defer func() { seedlist = originalSeedlist }()
|
||||
|
||||
seedlist = nil
|
||||
|
||||
urls := GetSeedURLs()
|
||||
if len(urls) != 0 {
|
||||
t.Errorf("Expected empty list, got %d URLs", len(urls))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user