Enhance crawler with seed list and SQL utilities

Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
This commit is contained in:
antanst
2025-06-16 12:29:33 +03:00
parent 51f94c90b2
commit 330b596497
37 changed files with 742 additions and 682 deletions

9
.gitignore vendored
View File

@@ -1,5 +1,6 @@
**/.#* **/.#*
**/*~ **/*~
**/.DS_Store
/.idea /.idea
/.goroot /.goroot
/dist/** /dist/**
@@ -18,3 +19,11 @@ run*.sh
/db/sql/** /db/sql/**
**/.claude/settings.local.json **/.claude/settings.local.json
/crawl.sh
/crawler.sh
/get.sh
/snapshot_history.sh
/whitelist.txt
/CLAUDE.md

View File

@@ -19,6 +19,13 @@ CREATE UNIQUE INDEX idx_url_timestamp ON snapshots (url, timestamp);
CREATE INDEX idx_url_latest ON snapshots (url, timestamp DESC); CREATE INDEX idx_url_latest ON snapshots (url, timestamp DESC);
``` ```
## Error handling
- `xerrors` library is used for error creation/wrapping.
- The "Fatal" field is not used, we _always_ panic on fatal errors.
- _All_ internal functions _must_ return `xerror` errors.
- _All_ external errors are wrapped within `xerror` errors.
### Code Changes ### Code Changes
1. **Updated SQL Queries**: 1. **Updated SQL Queries**:

View File

@@ -7,7 +7,7 @@ import (
"strings" "strings"
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/logging" "git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors" "git.antanst.com/antanst/xerrors"
) )

View File

@@ -0,0 +1,67 @@
package seedList
import (
"fmt"
"os"
"strings"
"git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors"
)
var seedlist []string //nolint:gochecknoglobals
func Initialize() error {
var err error
// Initialize seedlist from fixed path
if err = loadSeedlist("seed_urls.txt"); err != nil {
return err
}
return nil
}
func loadSeedlist(filePath string) error {
if seedlist != nil {
return nil
}
data, err := os.ReadFile(filePath)
if err != nil {
seedlist = []string{}
return xerrors.NewError(fmt.Errorf("could not load seedlist file: %w", err), 0, "", true)
}
lines := strings.Split(string(data), "\n")
seedlist = []string{}
for _, line := range lines {
line = strings.TrimSpace(line)
if line == "" || strings.HasPrefix(line, "#") {
continue
}
seedlist = append(seedlist, line)
}
if len(seedlist) > 0 {
logging.LogInfo("Loaded %d seed URLs", len(seedlist))
}
return nil
}
func Shutdown() error {
return nil
}
// GetSeedURLs returns the list of seed URLs
func GetSeedURLs() []string {
if seedlist == nil {
return []string{}
}
// Return a copy to prevent external modification
result := make([]string, len(seedlist))
copy(result, seedlist)
return result
}

View File

@@ -0,0 +1,67 @@
package seedList
import (
"os"
"testing"
)
func TestLoadSeedlist(t *testing.T) {
// Create a temporary test file
content := `# Test seed URLs
gemini://example.com/
gemini://test.com/
# Another comment
gemini://demo.org/`
tmpFile, err := os.CreateTemp("", "seed_urls_test_*.txt")
if err != nil {
t.Fatalf("Failed to create temp file: %v", err)
}
defer os.Remove(tmpFile.Name())
if _, err := tmpFile.WriteString(content); err != nil {
t.Fatalf("Failed to write to temp file: %v", err)
}
tmpFile.Close()
// Reset global variable for test
seedlist = nil
// Test loading
err = loadSeedlist(tmpFile.Name())
if err != nil {
t.Fatalf("Failed to load seedlist: %v", err)
}
// Verify content
expected := []string{
"gemini://example.com/",
"gemini://test.com/",
"gemini://demo.org/",
}
urls := GetSeedURLs()
if len(urls) != len(expected) {
t.Errorf("Expected %d URLs, got %d", len(expected), len(urls))
}
for i, url := range urls {
if url != expected[i] {
t.Errorf("Expected URL %d to be %s, got %s", i, expected[i], url)
}
}
}
func TestGetSeedURLsEmptyList(t *testing.T) {
// Reset global variable
originalSeedlist := seedlist
defer func() { seedlist = originalSeedlist }()
seedlist = nil
urls := GetSeedURLs()
if len(urls) != 0 {
t.Errorf("Expected empty list, got %d URLs", len(urls))
}
}

View File

@@ -27,7 +27,7 @@ type Snapshot struct {
func SnapshotFromURL(u string, normalize bool) (*Snapshot, error) { func SnapshotFromURL(u string, normalize bool) (*Snapshot, error) {
url, err := commonUrl.ParseURL(u, "", normalize) url, err := commonUrl.ParseURL(u, "", normalize)
if err != nil { if err != nil {
return nil, xerrors.NewError(err, 0, "", false) return nil, xerrors.NewSimpleError(err)
} }
newSnapshot := Snapshot{ newSnapshot := Snapshot{
URL: *url, URL: *url,

View File

@@ -117,7 +117,7 @@ func TestURLOperations(t *testing.T) {
} }
}) })
t.Run("NormalizeURL", func(t *testing.T) { t.Run("CheckAndUpdateNormalizedURL", func(t *testing.T) {
t.Parallel() t.Parallel()
tests := []struct { tests := []struct {

View File

@@ -7,7 +7,7 @@ import (
"strings" "strings"
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/logging" "git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors" "git.antanst.com/antanst/xerrors"
) )

View File

@@ -19,8 +19,8 @@ import (
"gemini-grc/gemini" "gemini-grc/gemini"
"gemini-grc/gopher" "gemini-grc/gopher"
"gemini-grc/hostPool" "gemini-grc/hostPool"
"gemini-grc/logging"
"gemini-grc/robotsMatch" "gemini-grc/robotsMatch"
"git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors" "git.antanst.com/antanst/xerrors"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
@@ -30,7 +30,7 @@ func RunWorkerWithTx(workerID int, job string) {
// Extract host from URL for the context. // Extract host from URL for the context.
parsedURL, err := url2.ParseURL(job, "", true) parsedURL, err := url2.ParseURL(job, "", true)
if err != nil { if err != nil {
logging.LogInfo("Failed to parse job URL: %s Error: %s", job, err) logging.LogInfo("Failed to parse URL: %s Error: %s", job, err)
return return
} }
host := parsedURL.Hostname host := parsedURL.Hostname
@@ -38,8 +38,9 @@ func RunWorkerWithTx(workerID int, job string) {
// Create a new worker context // Create a new worker context
baseCtx := context.Background() baseCtx := context.Background()
ctx, cancel := contextutil.NewRequestContext(baseCtx, job, host, workerID) ctx, cancel := contextutil.NewRequestContext(baseCtx, job, host, workerID)
defer cancel() // Ensure the context is cancelled when we're done
ctx = contextutil.ContextWithComponent(ctx, "worker") ctx = contextutil.ContextWithComponent(ctx, "worker")
defer cancel() // Ensure the context is cancelled when we're done
// contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "======================================\n\n")
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting worker for URL %s", job) contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting worker for URL %s", job)
// Create a new db transaction // Create a new db transaction
@@ -51,66 +52,53 @@ func RunWorkerWithTx(workerID int, job string) {
err = runWorker(ctx, tx, []string{job}) err = runWorker(ctx, tx, []string{job})
if err != nil { if err != nil {
// Handle context cancellation and timeout errors gracefully, instead of treating them as fatal // Two cases to handle:
// - context cancellation/timeout errors (log and ignore)
// - fatal errors (log and send to chan)
// non-fatal errors should've been handled within
// the runWorker() function and not bubble up here.
if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) { if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker timed out or canceled: %v", err) contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker timed out or canceled: %v", err)
rollbackErr := SafeRollback(ctx, tx) rollbackErr := gemdb.SafeRollback(ctx, tx)
if rollbackErr != nil { if rollbackErr != nil {
FatalErrorsChan <- rollbackErr FatalErrorsChan <- rollbackErr
return return
} }
return return
} else if xerrors.IsFatal(err) {
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
rollbackErr := gemdb.SafeRollback(ctx, tx)
if rollbackErr != nil {
FatalErrorsChan <- rollbackErr
return
}
FatalErrorsChan <- err
return
} }
// For other errors, we treat them as fatal. panic(err) // We shouldn't reach this point!
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
rollbackErr := SafeRollback(ctx, tx)
if rollbackErr != nil {
FatalErrorsChan <- rollbackErr
}
FatalErrorsChan <- err
return
} }
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Committing transaction")
err = tx.Commit() err = tx.Commit()
if err != nil && !errors.Is(err, sql.ErrTxDone) { if err != nil && !errors.Is(err, sql.ErrTxDone) {
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to commit transaction: %v", err) contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to commit transaction: %v", err)
if rollbackErr := SafeRollback(ctx, tx); rollbackErr != nil { if rollbackErr := gemdb.SafeRollback(ctx, tx); rollbackErr != nil {
FatalErrorsChan <- err FatalErrorsChan <- err
return return
} }
} }
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker done!") contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker done.")
}
// SafeRollback attempts to roll back a transaction,
// handling the case if the tx was already finalized.
func SafeRollback(ctx context.Context, tx *sqlx.Tx) error {
rollbackErr := tx.Rollback()
if rollbackErr != nil {
// Check if it's the standard "transaction already finalized" error
if errors.Is(rollbackErr, sql.ErrTxDone) {
contextlog.LogWarnWithContext(ctx, logging.GetSlogger(), "Rollback failed because transaction is already finalized")
return nil
}
// Only panic for other types of rollback failures
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to rollback transaction: %v", rollbackErr)
return xerrors.NewError(fmt.Errorf("failed to rollback transaction: %w", rollbackErr), 0, "", true)
}
return nil
} }
func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error { func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error {
total := len(urls) for _, u := range urls {
for i, u := range urls { err := WorkOnUrl(ctx, tx, u)
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting %d/%d %s", i+1, total, u)
urlCtx, cancelFunc := context.WithCancel(ctx)
err := WorkOnUrl(urlCtx, tx, u)
cancelFunc()
if err != nil { if err != nil {
return err if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) || xerrors.IsFatal(err) {
return err
}
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
} }
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Done %d/%d.", i+1, total)
} }
return nil return nil
} }
@@ -119,47 +107,44 @@ func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error {
// unexpected errors are returned. // unexpected errors are returned.
// expected errors are stored within the snapshot. // expected errors are stored within the snapshot.
func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) { func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
// Create a context specifically for this URL with "url" component contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker visiting URL %s", url)
urlCtx := contextutil.ContextWithComponent(ctx, "url")
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Processing URL: %s", url)
s, err := snapshot.SnapshotFromURL(url, true) s, err := snapshot.SnapshotFromURL(url, true)
if err != nil { if err != nil {
contextlog.LogErrorWithContext(urlCtx, logging.GetSlogger(), "Failed to parse URL: %v", err) contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to parse URL: %v", err)
return err return err
} }
// We always use the normalized URL
if url != s.URL.Full {
//err = gemdb.Database.CheckAndUpdateNormalizedURL(ctx, tx, url, s.URL.Full)
//if err != nil {
// return err
//}
//contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Normalized URL: %s → %s", url, s.URL.Full)
url = s.URL.Full
}
isGemini := url2.IsGeminiUrl(s.URL.String()) isGemini := url2.IsGeminiUrl(s.URL.String())
isGopher := url2.IsGopherURL(s.URL.String()) isGopher := url2.IsGopherURL(s.URL.String())
if !isGemini && !isGopher { if !isGemini && !isGopher {
return xerrors.NewError(fmt.Errorf("not a Gopher or Gemini URL: %s", s.URL.String()), 0, "", false) return xerrors.NewSimpleError(fmt.Errorf("not a Gopher or Gemini URL: %s", s.URL.String()))
} }
if isGopher && !config.CONFIG.GopherEnable { if isGopher && !config.CONFIG.GopherEnable {
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Skipping gopher URL (disabled in config)") return xerrors.NewSimpleError(fmt.Errorf("gopher disabled, not processing Gopher URL: %s", s.URL.String()))
return nil
}
if url != s.URL.Full {
err = gemdb.Database.NormalizeURL(ctx, tx, url, s.URL.Full)
if err != nil {
return err
}
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Normalized URL: %s → %s", url, s.URL.Full)
url = s.URL.Full
} }
// Check if URL is whitelisted // Check if URL is whitelisted
isUrlWhitelisted := whiteList.IsWhitelisted(s.URL.String()) isUrlWhitelisted := whiteList.IsWhitelisted(s.URL.String())
if isUrlWhitelisted { if isUrlWhitelisted {
contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "URL matches whitelist, forcing crawl %s", url) contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches whitelist, forcing crawl %s", url)
} }
// Only check blacklist if URL is not whitelisted // Only check blacklist if URL is not whitelisted
if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) { if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) {
contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "URL matches blacklist, ignoring %s", url) contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches blacklist, ignoring %s", url)
s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error()) s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error())
return saveSnapshotAndRemoveURL(ctx, tx, s) return saveSnapshotAndRemoveURL(ctx, tx, s)
} }
@@ -169,58 +154,44 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
if !isUrlWhitelisted && isGemini { if !isUrlWhitelisted && isGemini {
// If URL matches a robots.txt disallow line, // If URL matches a robots.txt disallow line,
// add it as an error and remove url // add it as an error and remove url
robotMatch, err = robotsMatch.RobotMatch(urlCtx, s.URL.String()) robotMatch = robotsMatch.RobotMatch(ctx, s.URL.String())
if err != nil {
if commonErrors.IsHostError(err) {
return removeURL(ctx, tx, s.URL.String())
}
return err
}
if robotMatch { if robotMatch {
contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "URL matches robots.txt, skipping") contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches robots.txt, skipping")
s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error()) s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error())
return saveSnapshotAndRemoveURL(ctx, tx, s) return saveSnapshotAndRemoveURL(ctx, tx, s)
} }
} }
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Adding to host pool") err = hostPool.AddHostToHostPool(ctx, s.Host)
err = hostPool.AddHostToHostPool(urlCtx, s.Host)
if err != nil { if err != nil {
contextlog.LogErrorWithContext(urlCtx, logging.GetSlogger(), "Failed to add host to pool: %v", err)
return err return err
} }
defer func(ctx context.Context, host string) { defer func(ctx context.Context, host string) {
hostPool.RemoveHostFromPool(ctx, host) hostPool.RemoveHostFromPool(ctx, host)
}(urlCtx, s.Host) }(ctx, s.Host)
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Visiting %s", s.URL.String()) contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Visiting %s", s.URL.String())
// Use context-aware visits for both protocols // Use context-aware visits for both protocols
if isGopher { if isGopher {
// Use the context-aware version for Gopher visits s, err = gopher.VisitWithContext(ctx, s.URL.String())
s, err = gopher.VisitWithContext(urlCtx, s.URL.String())
} else { } else {
// Use the context-aware version for Gemini visits s, err = gemini.Visit(ctx, s.URL.String())
s, err = gemini.Visit(urlCtx, s.URL.String())
} }
if err != nil { if err != nil {
contextlog.LogErrorWithContext(urlCtx, logging.GetSlogger(), "Error visiting URL: %v", err) contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Error visiting URL: %v", err)
return err return err
} }
if s == nil {
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "No snapshot returned")
return nil
}
// Handle Gemini redirection. // Handle Gemini redirection.
if isGemini && if isGemini &&
s.ResponseCode.ValueOrZero() >= 30 && s.ResponseCode.ValueOrZero() >= 30 &&
s.ResponseCode.ValueOrZero() < 40 { s.ResponseCode.ValueOrZero() < 40 {
err = handleRedirection(urlCtx, tx, s) err = saveRedirectURL(ctx, tx, s)
if err != nil { if err != nil {
return fmt.Errorf("error while handling redirection: %s", err) return xerrors.NewSimpleError(fmt.Errorf("error while handling redirection: %s", err))
} }
} }
@@ -231,14 +202,14 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
return err return err
} }
if identical { if identical {
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Content identical to existing snapshot, skipping") contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, skipping")
return removeURL(ctx, tx, s.URL.String()) return removeURL(ctx, tx, s.URL.String())
} }
} }
// Process and store links since content has changed // Process and store links since content has changed
if len(s.Links.ValueOrZero()) > 0 { if len(s.Links.ValueOrZero()) > 0 {
contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Found %d links", len(s.Links.ValueOrZero())) contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Found %d links", len(s.Links.ValueOrZero()))
err = storeLinks(ctx, tx, s) err = storeLinks(ctx, tx, s)
if err != nil { if err != nil {
return err return err
@@ -246,7 +217,11 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
} }
// Save the snapshot and remove the URL from the queue // Save the snapshot and remove the URL from the queue
contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "%2d %s", s.ResponseCode.ValueOrZero(), s.URL.String()) if s.Error.ValueOrZero() != "" {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
} else {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero())
}
return saveSnapshotAndRemoveURL(ctx, tx, s) return saveSnapshotAndRemoveURL(ctx, tx, s)
} }
@@ -273,12 +248,10 @@ func storeLinks(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
return nil return nil
} }
// Context-aware version of removeURL
func removeURL(ctx context.Context, tx *sqlx.Tx, url string) error { func removeURL(ctx context.Context, tx *sqlx.Tx, url string) error {
return gemdb.Database.DeleteURL(ctx, tx, url) return gemdb.Database.DeleteURL(ctx, tx, url)
} }
// Context-aware version of saveSnapshotAndRemoveURL
func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error { func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
err := gemdb.Database.SaveSnapshot(ctx, tx, s) err := gemdb.Database.SaveSnapshot(ctx, tx, s)
if err != nil { if err != nil {
@@ -304,7 +277,7 @@ func haveWeVisitedURL(ctx context.Context, tx *sqlx.Tx, u string) (bool, error)
// Check if the context is cancelled // Check if the context is cancelled
if err := ctx.Err(); err != nil { if err := ctx.Err(); err != nil {
return false, err return false, xerrors.NewSimpleError(err)
} }
// Check the urls table which holds the crawl queue. // Check the urls table which holds the crawl queue.
@@ -337,7 +310,6 @@ func haveWeVisitedURL(ctx context.Context, tx *sqlx.Tx, u string) (bool, error)
} }
if len(recentSnapshots) > 0 { if len(recentSnapshots) > 0 {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Skipping URL %s (updated within last %d days)", u, config.CONFIG.SkipIfUpdatedDays)
return true, nil return true, nil
} }
} }
@@ -345,30 +317,25 @@ func haveWeVisitedURL(ctx context.Context, tx *sqlx.Tx, u string) (bool, error)
return false, nil return false, nil
} }
func handleRedirection(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error { func saveRedirectURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
// Create a context specifically for redirection handling
redirectCtx := contextutil.ContextWithComponent(ctx, "redirect")
// Use the redirectCtx for all operations
newURL, err := url2.ExtractRedirectTargetFromHeader(s.URL, s.Header.ValueOrZero()) newURL, err := url2.ExtractRedirectTargetFromHeader(s.URL, s.Header.ValueOrZero())
if err != nil { if err != nil {
contextlog.LogErrorWithContext(redirectCtx, logging.GetSlogger(), "Failed to extract redirect target: %v", err) contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to extract redirect target: %v", err)
return err return err
} }
contextlog.LogDebugWithContext(redirectCtx, logging.GetSlogger(), "Page redirects to %s", newURL) contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Page redirects to %s", newURL)
haveWeVisited, err := haveWeVisitedURL(redirectCtx, tx, newURL.String()) haveWeVisited, err := haveWeVisitedURL(ctx, tx, newURL.String())
if err != nil { if err != nil {
return err return err
} }
if shouldPersistURL(newURL) && !haveWeVisited { if shouldPersistURL(newURL) && !haveWeVisited {
err = gemdb.Database.InsertURL(redirectCtx, tx, newURL.Full) err = gemdb.Database.InsertURL(ctx, tx, newURL.Full)
if err != nil { if err != nil {
contextlog.LogErrorWithContext(redirectCtx, logging.GetSlogger(), "Failed to insert redirect URL: %v", err)
return err return err
} }
contextlog.LogDebugWithContext(redirectCtx, logging.GetSlogger(), "Saved redirection URL %s", newURL.String()) contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Saved redirection URL %s", newURL.String())
} }
return nil return nil
} }

View File

@@ -16,7 +16,7 @@ import (
commonUrl "gemini-grc/common/url" commonUrl "gemini-grc/common/url"
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/contextutil" "gemini-grc/contextutil"
"gemini-grc/logging" "git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors" "git.antanst.com/antanst/xerrors"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
@@ -32,7 +32,7 @@ type DbService interface {
// URL methods // URL methods
InsertURL(ctx context.Context, tx *sqlx.Tx, url string) error InsertURL(ctx context.Context, tx *sqlx.Tx, url string) error
NormalizeURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error CheckAndUpdateNormalizedURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error
DeleteURL(ctx context.Context, tx *sqlx.Tx, url string) error DeleteURL(ctx context.Context, tx *sqlx.Tx, url string) error
MarkURLsAsBeingProcessed(ctx context.Context, tx *sqlx.Tx, urls []string) error MarkURLsAsBeingProcessed(ctx context.Context, tx *sqlx.Tx, urls []string) error
GetUrlHosts(ctx context.Context, tx *sqlx.Tx) ([]string, error) GetUrlHosts(ctx context.Context, tx *sqlx.Tx) ([]string, error)
@@ -117,10 +117,13 @@ func (d *DbServiceImpl) Initialize(ctx context.Context) error {
return nil return nil
} }
// Shutdown the database with context
func (d *DbServiceImpl) Shutdown(ctx context.Context) error { func (d *DbServiceImpl) Shutdown(ctx context.Context) error {
dbCtx := contextutil.ContextWithComponent(ctx, "database") dbCtx := contextutil.ContextWithComponent(ctx, "database")
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Shutting down database connection") contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Shutting down database connections")
_, err := d.db.Query("UPDATE urls SET being_processed=false")
if err != nil {
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Unable to update urls table: %v", err)
}
d.mu.Lock() d.mu.Lock()
defer d.mu.Unlock() defer d.mu.Unlock()
@@ -129,12 +132,11 @@ func (d *DbServiceImpl) Shutdown(ctx context.Context) error {
return nil return nil
} }
// Check if the context is cancelled before proceeding
if err := ctx.Err(); err != nil { if err := ctx.Err(); err != nil {
return err return err
} }
err := d.db.Close() err = d.db.Close()
if err != nil { if err != nil {
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Error closing database connection: %v", err) contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Error closing database connection: %v", err)
} else { } else {
@@ -154,7 +156,6 @@ func (d *DbServiceImpl) NewTx(ctx context.Context) (*sqlx.Tx, error) {
return nil, err return nil, err
} }
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Creating new database transaction")
tx, err := d.db.BeginTxx(ctx, nil) tx, err := d.db.BeginTxx(ctx, nil)
if err != nil { if err != nil {
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Failed to create transaction: %v", err) contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Failed to create transaction: %v", err)
@@ -199,7 +200,7 @@ func (d *DbServiceImpl) InsertURL(ctx context.Context, tx *sqlx.Tx, url string)
} }
// NormalizeURL normalizes a URL with context // NormalizeURL normalizes a URL with context
func (d *DbServiceImpl) NormalizeURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error { func (d *DbServiceImpl) CheckAndUpdateNormalizedURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error {
dbCtx := contextutil.ContextWithComponent(ctx, "database") dbCtx := contextutil.ContextWithComponent(ctx, "database")
// Check if URLs are already the same // Check if URLs are already the same
@@ -214,7 +215,6 @@ func (d *DbServiceImpl) NormalizeURL(ctx context.Context, tx *sqlx.Tx, url strin
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Updating normalized URL %s -> %s", url, normalizedURL) contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Updating normalized URL %s -> %s", url, normalizedURL)
// Context-aware implementation
query := SQL_UPDATE_URL query := SQL_UPDATE_URL
a := struct { a := struct {
Url string `db:"Url"` Url string `db:"Url"`
@@ -514,12 +514,13 @@ func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *
return false, err return false, err
} }
// Context-aware implementation // Update: Skipped this because empty pages can be valid
// ex. pages with redirect headers
// Only check for identical content if we have gemtext or data // Only check for identical content if we have gemtext or data
if (!s.GemText.Valid || s.GemText.String == "") && //if (!s.GemText.Valid || s.GemText.String == "") &&
(!s.Data.Valid || len(s.Data.V) == 0) { // (!s.Data.Valid || len(s.Data.V) == 0) {
return false, nil // return false, nil
} //}
// Try to get the latest snapshot for this URL // Try to get the latest snapshot for this URL
latestSnapshot := &snapshot.Snapshot{} latestSnapshot := &snapshot.Snapshot{}
@@ -546,3 +547,20 @@ func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *
return false, nil return false, nil
} }
// SafeRollback attempts to roll back a transaction,
// handling the case if the tx was already finalized.
func SafeRollback(ctx context.Context, tx *sqlx.Tx) error {
rollbackErr := tx.Rollback()
if rollbackErr != nil {
// Check if it's the standard "transaction already finalized" error
if errors.Is(rollbackErr, sql.ErrTxDone) {
contextlog.LogWarnWithContext(ctx, logging.GetSlogger(), "Rollback failed because transaction is already finalized")
return nil
}
// Only return error for other types of rollback failures
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to rollback transaction: %v", rollbackErr)
return xerrors.NewError(fmt.Errorf("failed to rollback transaction: %w", rollbackErr), 0, "", true)
}
return nil
}

View File

@@ -1,16 +0,0 @@
#!/bin/env bash
set -eu
set -o pipefail
# Max response size 10MiB
LOG_LEVEL=debug \
PRINT_WORKER_STATUS=false \
DRY_RUN=false \
NUM_OF_WORKERS=1 \
WORKER_BATCH_SIZE=1 \
BLACKLIST_PATH="$(pwd)/blacklist.txt" \
MAX_RESPONSE_SIZE=10485760 \
RESPONSE_TIMEOUT=10 \
PANIC_ON_UNEXPECTED_ERROR=true \
go run ./bin/gemget/main.go "$@"

View File

@@ -7,8 +7,8 @@ import (
"gemini-grc/common/linkList" "gemini-grc/common/linkList"
url2 "gemini-grc/common/url" url2 "gemini-grc/common/url"
"gemini-grc/logging"
"gemini-grc/util" "gemini-grc/util"
"git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors" "git.antanst.com/antanst/xerrors"
) )

View File

@@ -1,23 +1,214 @@
package gemini package gemini
import ( import (
"context"
"crypto/tls"
"errors"
"fmt" "fmt"
"io"
"net"
stdurl "net/url"
"regexp" "regexp"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
"time"
commonErrors "gemini-grc/common/errors" "gemini-grc/common/contextlog"
"gemini-grc/common/snapshot" "gemini-grc/common/snapshot"
_url "gemini-grc/common/url"
"gemini-grc/config"
"gemini-grc/contextutil"
"git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
) )
// ProcessData processes the raw data from a Gemini response and populates the Snapshot. // Visit visits a given URL using the Gemini protocol,
// and returns a populated snapshot. Any relevant errors
// when visiting the URL are stored in the snapshot;
// an error is returned only when construction of a
// snapshot was not possible (context cancellation errors,
// not a valid URL etc.)
func Visit(ctx context.Context, url string) (s *snapshot.Snapshot, err error) {
geminiCtx := contextutil.ContextWithComponent(ctx, "network")
s, err = snapshot.SnapshotFromURL(url, true)
if err != nil {
return nil, err
}
// Check if the context has been canceled
if err := ctx.Err(); err != nil {
return nil, xerrors.NewSimpleError(err)
}
data, err := ConnectAndGetData(geminiCtx, s.URL.String())
if err != nil {
s.Error = null.StringFrom(err.Error())
return s, nil
}
// Check if the context has been canceled
if err := ctx.Err(); err != nil {
return nil, xerrors.NewSimpleError(err)
}
s = UpdateSnapshotWithData(*s, data)
if !s.Error.Valid &&
s.MimeType.Valid &&
s.MimeType.String == "text/gemini" &&
len(s.GemText.ValueOrZero()) > 0 {
links := GetPageLinks(s.URL, s.GemText.String)
if len(links) > 0 {
s.Links = null.ValueFrom(links)
}
}
return s, nil
}
// ConnectAndGetData is a context-aware version of ConnectAndGetData
// that returns the data from a GET request to a Gemini URL. It uses the context
// for cancellation, timeout, and logging.
func ConnectAndGetData(ctx context.Context, url string) ([]byte, error) {
parsedURL, err := stdurl.Parse(url)
if err != nil {
return nil, xerrors.NewSimpleError(fmt.Errorf("error parsing URL: %w", err))
}
hostname := parsedURL.Hostname()
port := parsedURL.Port()
if port == "" {
port = "1965"
}
host := fmt.Sprintf("%s:%s", hostname, port)
// Check if the context has been canceled before proceeding
if err := ctx.Err(); err != nil {
return nil, err
}
timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
// Establish the underlying TCP connection with context-based cancellation
dialer := &net.Dialer{
Timeout: timeoutDuration,
}
conn, err := dialer.DialContext(ctx, "tcp", host)
if err != nil {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to establish TCP connection: %v", err)
return nil, xerrors.NewSimpleError(err)
}
// Make sure we always close the connection
defer func() {
_ = conn.Close()
}()
err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, xerrors.NewSimpleError(err)
}
err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, xerrors.NewSimpleError(err)
}
// Check if the context has been canceled before proceeding with TLS handshake
if err := ctx.Err(); err != nil {
return nil, err
}
// Perform the TLS handshake
tlsConfig := &tls.Config{
InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
ServerName: parsedURL.Hostname(), // SNI says we should not include port in hostname
}
tlsConn := tls.Client(conn, tlsConfig)
err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, xerrors.NewSimpleError(err)
}
err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, xerrors.NewSimpleError(err)
}
// Check if the context is done before attempting handshake
if err := ctx.Err(); err != nil {
return nil, err
}
// Perform TLS handshake with regular method
// (HandshakeContext is only available in Go 1.17+)
err = tlsConn.Handshake()
if err != nil {
return nil, xerrors.NewSimpleError(err)
}
// Check again if the context is done after handshake
if err := ctx.Err(); err != nil {
return nil, xerrors.NewSimpleError(err)
}
// We read `buf`-sized chunks and add data to `data`
buf := make([]byte, 4096)
var data []byte
// Check if the context has been canceled before sending request
if err := ctx.Err(); err != nil {
return nil, xerrors.NewSimpleError(err)
}
// Send Gemini request to trigger server response
// Fix for stupid server bug:
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
// when the port is 1965 and is still specified explicitly in the URL.
url2, _ := _url.ParseURL(url, "", true)
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort())))
if err != nil {
return nil, xerrors.NewSimpleError(err)
}
// Read response bytes in len(buf) byte chunks
for {
// Check if the context has been canceled before each read
if err := ctx.Err(); err != nil {
return nil, xerrors.NewSimpleError(err)
}
n, err := tlsConn.Read(buf)
if n > 0 {
data = append(data, buf[:n]...)
}
if len(data) > config.CONFIG.MaxResponseSize {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Response too large (max: %d bytes)", config.CONFIG.MaxResponseSize)
return nil, xerrors.NewSimpleError(fmt.Errorf("response too large"))
}
if err != nil {
if errors.Is(err, io.EOF) {
break
}
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Error reading data: %v", err)
return nil, xerrors.NewSimpleError(err)
}
}
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Received %d bytes of data", len(data))
return data, nil
}
// UpdateSnapshotWithData processes the raw data from a Gemini response and populates the Snapshot.
// This function is exported for use by the robotsMatch package. // This function is exported for use by the robotsMatch package.
func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) { func UpdateSnapshotWithData(s snapshot.Snapshot, data []byte) *snapshot.Snapshot {
header, body, err := getHeadersAndData(data) header, body, err := getHeadersAndData(data)
if err != nil { if err != nil {
return &s, err s.Error = null.StringFrom(err.Error())
return &s
} }
code, mimeType, lang := getMimeTypeAndLang(header) code, mimeType, lang := getMimeTypeAndLang(header)
@@ -39,13 +230,14 @@ func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
if mimeType == "text/gemini" { if mimeType == "text/gemini" {
validBody, err := BytesToValidUTF8(body) validBody, err := BytesToValidUTF8(body)
if err != nil { if err != nil {
return nil, err s.Error = null.StringFrom(err.Error())
return &s
} }
s.GemText = null.StringFrom(validBody) s.GemText = null.StringFrom(validBody)
} else { } else {
s.Data = null.ValueFrom(body) s.Data = null.ValueFrom(body)
} }
return &s, nil return &s
} }
// Checks for a Gemini header, which is // Checks for a Gemini header, which is
@@ -55,7 +247,7 @@ func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
func getHeadersAndData(data []byte) (string, []byte, error) { func getHeadersAndData(data []byte) (string, []byte, error) {
firstLineEnds := slices.Index(data, '\n') firstLineEnds := slices.Index(data, '\n')
if firstLineEnds == -1 { if firstLineEnds == -1 {
return "", nil, commonErrors.NewHostError(fmt.Errorf("error parsing header")) return "", nil, xerrors.NewSimpleError(fmt.Errorf("error parsing header"))
} }
firstLine := string(data[:firstLineEnds]) firstLine := string(data[:firstLineEnds])
rest := data[firstLineEnds+1:] rest := data[firstLineEnds+1:]
@@ -106,7 +298,3 @@ func getMimeTypeAndLang(headers string) (int, string, string) {
lang := matches[3] // Will be empty string if no lang parameter was found lang := matches[3] // Will be empty string if no lang parameter was found
return code, mimeType, lang return code, mimeType, lang
} }
func isGeminiCapsule(s *snapshot.Snapshot) bool {
return !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini"
}

View File

@@ -1,276 +0,0 @@
package gemini
import (
"context"
"crypto/tls"
"errors"
"fmt"
"io"
"net"
stdurl "net/url"
"time"
"gemini-grc/common/contextlog"
commonErrors "gemini-grc/common/errors"
"gemini-grc/common/snapshot"
_url "gemini-grc/common/url"
"gemini-grc/config"
"gemini-grc/contextutil"
"gemini-grc/logging"
"git.antanst.com/antanst/xerrors"
"github.com/guregu/null/v5"
)
// Visit visits a given URL using the Gemini protocol.
func Visit(ctx context.Context, url string) (s *snapshot.Snapshot, err error) {
geminiCtx := contextutil.ContextWithComponent(ctx, "gemini")
contextlog.LogDebugWithContext(geminiCtx, logging.GetSlogger(), "Visiting Gemini URL: %s", url)
s, err = snapshot.SnapshotFromURL(url, true)
if err != nil {
contextlog.LogErrorWithContext(geminiCtx, logging.GetSlogger(), "Failed to create snapshot from URL: %v", err)
return nil, err
}
defer func() {
if err == nil {
return
}
// GeminiError and HostError should
// be stored in the snapshot.
if commonErrors.IsHostError(err) {
contextlog.LogInfoWithContext(geminiCtx, logging.GetSlogger(), "Host error: %v", err)
s.Error = null.StringFrom(err.Error())
err = nil
return
} else if IsGeminiError(err) {
contextlog.LogInfoWithContext(geminiCtx, logging.GetSlogger(), "Gemini error: %v", err)
s.Error = null.StringFrom(err.Error())
s.Header = null.StringFrom(errors.Unwrap(err).(*GeminiError).Header)
s.ResponseCode = null.IntFrom(int64(errors.Unwrap(err).(*GeminiError).Code))
err = nil
return
}
}()
// Check if the context has been canceled
if err := ctx.Err(); err != nil {
return s, err
}
data, err := ConnectAndGetDataWithContext(geminiCtx, s.URL.String())
if err != nil {
return s, err
}
// Check if the context has been canceled
if err := ctx.Err(); err != nil {
return s, err
}
s, err = ProcessData(*s, data)
if err != nil {
return s, err
}
if isGeminiCapsule(s) {
links := GetPageLinks(s.URL, s.GemText.String)
if len(links) > 0 {
s.Links = null.ValueFrom(links)
}
}
contextlog.LogDebugWithContext(geminiCtx, logging.GetSlogger(), "Successfully visited URL: %s (Code: %d)", url, s.ResponseCode.ValueOrZero())
return s, nil
}
// ConnectAndGetDataWithContext is a context-aware version of ConnectAndGetData
// that returns the data from a GET request to a Gemini URL. It uses the context
// for cancellation, timeout, and logging.
func ConnectAndGetDataWithContext(ctx context.Context, url string) ([]byte, error) {
// Parse the URL
parsedURL, err := stdurl.Parse(url)
if err != nil {
return nil, xerrors.NewError(fmt.Errorf("error parsing URL: %w", err), 0, "", false)
}
hostname := parsedURL.Hostname()
port := parsedURL.Port()
if port == "" {
port = "1965"
}
host := fmt.Sprintf("%s:%s", hostname, port)
// Check if the context has been canceled before proceeding
if err := ctx.Err(); err != nil {
return nil, err
}
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Connecting to %s", host)
timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
// Establish the underlying TCP connection with context-based cancellation
dialer := &net.Dialer{
Timeout: timeoutDuration,
}
// Use DialContext to allow cancellation via context
conn, err := dialer.DialContext(ctx, "tcp", host)
if err != nil {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to establish TCP connection: %v", err)
return nil, commonErrors.NewHostError(err)
}
// Make sure we always close the connection
defer func() {
_ = conn.Close()
}()
// Set read and write timeouts on the TCP connection
err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, commonErrors.NewHostError(err)
}
err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, commonErrors.NewHostError(err)
}
// Check if the context has been canceled before proceeding with TLS handshake
if err := ctx.Err(); err != nil {
return nil, err
}
// Perform the TLS handshake
tlsConfig := &tls.Config{
InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
ServerName: parsedURL.Hostname(), // SNI says we should not include port in hostname
}
tlsConn := tls.Client(conn, tlsConfig)
err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, commonErrors.NewHostError(err)
}
err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration))
if err != nil {
return nil, commonErrors.NewHostError(err)
}
// Check if the context is done before attempting handshake
if err := ctx.Err(); err != nil {
return nil, err
}
// Perform TLS handshake with regular method
// (HandshakeContext is only available in Go 1.17+)
err = tlsConn.Handshake()
if err != nil {
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "TLS handshake failed: %v", err)
return nil, commonErrors.NewHostError(err)
}
// Check again if the context is done after handshake
if err := ctx.Err(); err != nil {
return nil, err
}
// We read `buf`-sized chunks and add data to `data`
buf := make([]byte, 4096)
var data []byte
// Check if the context has been canceled before sending request
if err := ctx.Err(); err != nil {
return nil, err
}
// Send Gemini request to trigger server response
// Fix for stupid server bug:
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
// when the port is 1965 and is still specified explicitly in the URL.
url2, _ := _url.ParseURL(url, "", true)
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort())))
if err != nil {
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to send request: %v", err)
return nil, commonErrors.NewHostError(err)
}
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Request sent, reading response")
// Read response bytes in len(buf) byte chunks
for {
// Check if the context has been canceled before each read
if err := ctx.Err(); err != nil {
return nil, err
}
n, err := tlsConn.Read(buf)
if n > 0 {
data = append(data, buf[:n]...)
}
if len(data) > config.CONFIG.MaxResponseSize {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Response too large (max: %d bytes)", config.CONFIG.MaxResponseSize)
return nil, commonErrors.NewHostError(fmt.Errorf("response too large"))
}
if err != nil {
if errors.Is(err, io.EOF) {
break
}
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Error reading data: %v", err)
return nil, commonErrors.NewHostError(err)
}
}
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Received %d bytes of data", len(data))
return data, nil
}
// ProcessDataWithContext is a context-aware version of ProcessData that processes
// the raw data from a Gemini response and populates the Snapshot.
func ProcessDataWithContext(ctx context.Context, s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
// Create a processing-specific context with the "process" component
processCtx := contextutil.ContextWithComponent(ctx, "process")
contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Processing Gemini response data (%d bytes)", len(data))
header, body, err := getHeadersAndData(data)
if err != nil {
contextlog.LogErrorWithContext(processCtx, logging.GetSlogger(), "Failed to extract headers: %v", err)
return &s, err
}
code, mimeType, lang := getMimeTypeAndLang(header)
contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Response code: %d, MimeType: %s, Lang: %s", code, mimeType, lang)
if code != 0 {
s.ResponseCode = null.IntFrom(int64(code))
}
if header != "" {
s.Header = null.StringFrom(header)
}
if mimeType != "" {
s.MimeType = null.StringFrom(mimeType)
}
if lang != "" {
s.Lang = null.StringFrom(lang)
}
// If we've got a Gemini document, populate
// `GemText` field, otherwise raw data goes to `Data`.
if mimeType == "text/gemini" {
validBody, err := BytesToValidUTF8(body)
if err != nil {
contextlog.LogErrorWithContext(processCtx, logging.GetSlogger(), "UTF-8 validation failed: %v", err)
return nil, err
}
s.GemText = null.StringFrom(validBody)
contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Processed gemtext content (%d characters)", len(validBody))
} else {
s.Data = null.ValueFrom(body)
contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Stored binary data (%d bytes)", len(body))
}
return &s, nil
}

View File

@@ -135,17 +135,7 @@ func TestProcessData(t *testing.T) {
for _, test := range tests { for _, test := range tests {
t.Run(test.name, func(t *testing.T) { t.Run(test.name, func(t *testing.T) {
s := snapshot.Snapshot{} s := snapshot.Snapshot{}
result, err := ProcessData(s, test.inputData) result := UpdateSnapshotWithData(s, test.inputData)
if test.expectedError && err == nil {
t.Errorf("Expected error, got nil")
return
}
if !test.expectedError && err != nil {
t.Errorf("Unexpected error: %v", err)
return
}
if test.expectedError { if test.expectedError {
return return

View File

@@ -7,6 +7,7 @@ import (
"io" "io"
"unicode/utf8" "unicode/utf8"
"gemini-grc/config"
"git.antanst.com/antanst/xerrors" "git.antanst.com/antanst/xerrors"
"golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese" "golang.org/x/text/encoding/japanese"
@@ -23,11 +24,16 @@ func BytesToValidUTF8(input []byte) (string, error) {
if len(input) == 0 { if len(input) == 0 {
return "", nil return "", nil
} }
const maxSize = 10 * 1024 * 1024 // 10MB
if len(input) > maxSize { maxSize := config.CONFIG.MaxResponseSize
return "", xerrors.NewError(fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize), 0, "", false) if maxSize == 0 {
maxSize = 1024 * 1024 // Default 1MB for tests
} }
// remove NULL byte 0x00 (ReplaceAll accepts slices) if len(input) > maxSize {
return "", xerrors.NewError(fmt.Errorf("BytesToValidUTF8: %w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize), 0, "", false)
}
// Always remove NULL bytes first (before UTF-8 validity check)
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{}) inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
if utf8.Valid(inputNoNull) { if utf8.Valid(inputNoNull) {
return string(inputNoNull), nil return string(inputNoNull), nil
@@ -42,6 +48,8 @@ func BytesToValidUTF8(input []byte) (string, error) {
japanese.EUCJP.NewDecoder(), // Japanese japanese.EUCJP.NewDecoder(), // Japanese
korean.EUCKR.NewDecoder(), // Korean korean.EUCKR.NewDecoder(), // Korean
} }
// Still invalid Unicode. Try some encodings to convert to.
// First successful conversion wins. // First successful conversion wins.
var lastErr error var lastErr error
for _, encoding := range encodings { for _, encoding := range encodings {
@@ -56,5 +64,5 @@ func BytesToValidUTF8(input []byte) (string, error) {
} }
} }
return "", xerrors.NewError(fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr), 0, "", false) return "", xerrors.NewError(fmt.Errorf("BytesToValidUTF8: %w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr), 0, "", false)
} }

5
go.mod
View File

@@ -3,8 +3,9 @@ module gemini-grc
go 1.24.3 go 1.24.3
require ( require (
git.antanst.com/antanst/logging v0.0.1
git.antanst.com/antanst/uid v0.0.1 git.antanst.com/antanst/uid v0.0.1
git.antanst.com/antanst/xerrors v0.0.1 git.antanst.com/antanst/xerrors v0.0.2
github.com/guregu/null/v5 v5.0.0 github.com/guregu/null/v5 v5.0.0
github.com/jackc/pgx/v5 v5.7.2 github.com/jackc/pgx/v5 v5.7.2
github.com/jmoiron/sqlx v1.4.0 github.com/jmoiron/sqlx v1.4.0
@@ -29,3 +30,5 @@ require (
replace git.antanst.com/antanst/xerrors => ../xerrors replace git.antanst.com/antanst/xerrors => ../xerrors
replace git.antanst.com/antanst/uid => ../uid replace git.antanst.com/antanst/uid => ../uid
replace git.antanst.com/antanst/logging => ../logging

View File

@@ -12,7 +12,7 @@ import (
commonErrors "gemini-grc/common/errors" commonErrors "gemini-grc/common/errors"
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/logging" "git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors" "git.antanst.com/antanst/xerrors"
) )

View File

@@ -18,7 +18,7 @@ import (
_url "gemini-grc/common/url" _url "gemini-grc/common/url"
"gemini-grc/config" "gemini-grc/config"
"gemini-grc/contextutil" "gemini-grc/contextutil"
"gemini-grc/logging" "git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors" "git.antanst.com/antanst/xerrors"
"github.com/guregu/null/v5" "github.com/guregu/null/v5"
) )
@@ -30,8 +30,6 @@ func VisitWithContext(ctx context.Context, url string) (*snapshot.Snapshot, erro
// Create a gopher-specific context with the "gopher" component // Create a gopher-specific context with the "gopher" component
gopherCtx := contextutil.ContextWithComponent(ctx, "gopher") gopherCtx := contextutil.ContextWithComponent(ctx, "gopher")
contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Visiting Gopher URL: %s", url)
if !config.CONFIG.GopherEnable { if !config.CONFIG.GopherEnable {
contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Gopher protocol is disabled") contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Gopher protocol is disabled")
return nil, nil return nil, nil

View File

@@ -8,7 +8,8 @@ import (
"gemini-grc/common/contextlog" "gemini-grc/common/contextlog"
"gemini-grc/contextutil" "gemini-grc/contextutil"
"gemini-grc/logging" "git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors"
) )
var hostPool = HostPool{hostnames: make(map[string]struct{})} var hostPool = HostPool{hostnames: make(map[string]struct{})}
@@ -20,18 +21,13 @@ type HostPool struct {
// RemoveHostFromPool removes a host from the pool with context awareness // RemoveHostFromPool removes a host from the pool with context awareness
func RemoveHostFromPool(ctx context.Context, key string) { func RemoveHostFromPool(ctx context.Context, key string) {
// Create a hostPool-specific context
hostCtx := contextutil.ContextWithComponent(ctx, "hostPool") hostCtx := contextutil.ContextWithComponent(ctx, "hostPool")
contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Removing host %s from pool", key)
hostPool.lock.Lock() hostPool.lock.Lock()
delete(hostPool.hostnames, key) delete(hostPool.hostnames, key)
hostPool.lock.Unlock() hostPool.lock.Unlock()
// Add some jitter // Add some jitter
time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond) time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond)
contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Host %s removed from pool", key) contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Host %s removed from pool", key)
} }
@@ -41,18 +37,18 @@ func AddHostToHostPool(ctx context.Context, key string) error {
// Create a hostPool-specific context // Create a hostPool-specific context
hostCtx := contextutil.ContextWithComponent(ctx, "hostPool") hostCtx := contextutil.ContextWithComponent(ctx, "hostPool")
contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Adding host %s to pool", key)
// Use a ticker to periodically check if we can add the host // Use a ticker to periodically check if we can add the host
ticker := time.NewTicker(500 * time.Millisecond) ticker := time.NewTicker(500 * time.Millisecond)
defer ticker.Stop() defer ticker.Stop()
// We continuously poll the pool,
// and if the host isn't already
// there, we add it.
for { for {
// Check if context is done before attempting to acquire lock // Check if context is done before attempting to acquire lock
select { select {
case <-ctx.Done(): case <-ctx.Done():
contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Context canceled while waiting to add host %s", key) return xerrors.NewSimpleError(ctx.Err())
return ctx.Err()
default: default:
// Continue with attempt to add host // Continue with attempt to add host
} }
@@ -67,15 +63,13 @@ func AddHostToHostPool(ctx context.Context, key string) error {
} }
hostPool.lock.Unlock() hostPool.lock.Unlock()
contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Host %s busy, waiting...", key)
// Wait for next tick or context cancellation // Wait for next tick or context cancellation
select { select {
case <-ticker.C: case <-ticker.C:
// Try again on next tick // Try again on next tick
case <-ctx.Done(): case <-ctx.Done():
contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Context canceled while waiting for host %s", key) contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Context canceled while waiting for host %s", key)
return ctx.Err() return xerrors.NewSimpleError(ctx.Err())
} }
} }
} }

View File

@@ -1,188 +0,0 @@
// Package logging provides a simple, structured logging interface using slog.
// It offers colored output for better readability in terminal environments.
package logging
import (
"context"
"fmt"
"io"
"log/slog"
"os"
"strings"
"sync"
)
// Global logger instance.
var slogLogger *slog.Logger
// Current log level - used to filter logs.
var currentLogLevel = slog.LevelInfo
// ANSI color codes for terminal output.
const (
colorReset = "\033[0m"
levelDebug = "\033[37m" // Gray
levelInfo = "\033[32m" // Green
levelWarn = "\033[33m" // Yellow
levelError = "\033[31m" // Red
)
// Standard helper functions for logging
func LogDebug(format string, args ...interface{}) {
if slogLogger != nil {
slogLogger.Debug(fmt.Sprintf(format, args...))
}
}
func LogInfo(format string, args ...interface{}) {
if slogLogger != nil {
slogLogger.Info(fmt.Sprintf(format, args...))
}
}
func LogWarn(format string, args ...interface{}) {
if slogLogger != nil {
slogLogger.Warn(fmt.Sprintf(format, args...))
}
}
func LogError(format string, args ...interface{}) {
if slogLogger != nil {
msg := fmt.Sprintf(format, args...)
slogLogger.Error(msg, slog.String("error", msg))
}
}
// InitSlogger initializes the slog logger with custom handler.
func InitSlogger(level slog.Level) {
// Set the global log level
currentLogLevel = level
// Create the handler with color support
baseHandler := NewColorHandler(os.Stderr)
// Create and set the new logger
slogLogger = slog.New(baseHandler)
// Set as default logger
slog.SetDefault(slogLogger)
// Print a startup message to verify logging is working
slogLogger.Info("Slog initialized", "level", level.String())
}
// GetSlogger returns the current global slog logger instance.
// Can be used by other packages
func GetSlogger() *slog.Logger {
if slogLogger == nil {
return slog.Default()
}
return slogLogger
}
// ColorHandler formats logs with colors for better terminal readability
type ColorHandler struct {
out io.Writer
mu *sync.Mutex
attrs []slog.Attr // Store attributes for this handler
}
// NewColorHandler creates a new handler that writes colored logs to the provided writer
func NewColorHandler(w io.Writer) *ColorHandler {
if w == nil {
w = os.Stderr
}
return &ColorHandler{
out: w,
mu: &sync.Mutex{},
attrs: make([]slog.Attr, 0),
}
}
// Enabled checks if the given log level is enabled
func (h *ColorHandler) Enabled(_ context.Context, level slog.Level) bool {
return level >= currentLogLevel
}
// Handle processes a log record, formatting it with colors
func (h *ColorHandler) Handle(ctx context.Context, r slog.Record) error {
h.mu.Lock()
defer h.mu.Unlock()
// Format time
timeStr := fmt.Sprintf("[%s]", r.Time.Format("2006-01-02 15:04:05"))
// Format level
var levelStr string
switch r.Level {
case slog.LevelDebug:
levelStr = fmt.Sprintf("%sDEBUG%s", levelDebug, colorReset)
case slog.LevelInfo:
levelStr = fmt.Sprintf("%sINFO%s", levelInfo, colorReset)
case slog.LevelWarn:
levelStr = fmt.Sprintf("%sWARN%s", levelWarn, colorReset)
case slog.LevelError:
levelStr = fmt.Sprintf("%sERROR%s", levelError, colorReset)
default:
levelStr = r.Level.String()
}
// Build prefix
prefix := fmt.Sprintf("%s %s ", timeStr, levelStr)
// Format message - we'll collect any special fields separately
attrMap := make(map[string]string)
// First collect attributes from the handler itself
for _, attr := range h.attrs {
attrMap[attr.Key] = attr.Value.String()
}
// Then extract from record attributes, which might override handler attributes
r.Attrs(func(a slog.Attr) bool {
attrMap[a.Key] = a.Value.String()
return true
})
// Format message with attributes on the same line
msg := fmt.Sprintf("%s%s", prefix, r.Message)
// Add attributes to the same line if present
if len(attrMap) > 0 {
// Add a space after the message
msg += " "
// Build attribute string
attrs := make([]string, 0, len(attrMap))
for k, v := range attrMap {
attrs = append(attrs, fmt.Sprintf("%s=%s", k, v))
}
// Join all attributes with spaces
msg += strings.Join(attrs, " ")
}
// Add newline at the end
msg += "\n"
// Write to output
_, err := io.WriteString(h.out, msg)
return err
}
// WithGroup returns a new Handler that inherits from this Handler
func (h *ColorHandler) WithGroup(name string) slog.Handler {
return h // For simplicity, we don't support groups
}
// WithAttrs returns a new Handler whose attributes include both
// the receiver's attributes and the arguments
func (h *ColorHandler) WithAttrs(attrs []slog.Attr) slog.Handler {
// Create a new handler with the same output but additional attributes
newHandler := &ColorHandler{
out: h.out,
mu: h.mu,
attrs: append(append([]slog.Attr{}, h.attrs...), attrs...),
}
return newHandler
}

28
misc/sql/README.md Normal file
View File

@@ -0,0 +1,28 @@
# SQL Queries for Snapshot Analysis
This directory contains SQL queries to analyze snapshot data in the gemini-grc database.
## Usage
You can run these queries directly from psql using the `\i` directive:
```
\i misc/sql/snapshots_per_url.sql
```
## Available Queries
- **snapshots_per_url.sql** - Basic count of snapshots per URL
- **snapshots_date_range.sql** - Shows snapshot count with date range information for each URL
- **host_snapshot_stats.sql** - Groups snapshots by hosts and shows URLs with multiple snapshots
- **content_changes.sql** - Finds URLs with the most content changes between consecutive snapshots
- **snapshot_distribution.sql** - Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots)
- **recent_snapshot_activity.sql** - Shows URLs with most snapshots in the last 7 days
- **storage_efficiency.sql** - Shows potential storage savings from deduplication
- **snapshots_by_timeframe.sql** - Shows snapshot count by timeframe (day, week, month)
## Notes
- These queries are designed to work with PostgreSQL and the gemini-grc database schema
- Some queries may be resource-intensive on large databases
- The results can help optimize storage and understand the effectiveness of the versioned snapshot feature

View File

@@ -0,0 +1,26 @@
-- File: content_changes.sql
-- Finds URLs with the most content changes between consecutive snapshots
-- Usage: \i misc/sql/content_changes.sql
WITH snapshot_changes AS (
SELECT
s1.url,
s1.timestamp as prev_timestamp,
s2.timestamp as next_timestamp,
s1.gemtext IS DISTINCT FROM s2.gemtext as gemtext_changed,
s1.data IS DISTINCT FROM s2.data as data_changed
FROM snapshots s1
JOIN snapshots s2 ON s1.url = s2.url AND s1.timestamp < s2.timestamp
WHERE NOT EXISTS (
SELECT 1 FROM snapshots s3
WHERE s3.url = s1.url AND s1.timestamp < s3.timestamp AND s3.timestamp < s2.timestamp
)
)
SELECT
url,
COUNT(*) + 1 as snapshot_count,
SUM(CASE WHEN gemtext_changed OR data_changed THEN 1 ELSE 0 END) as content_changes
FROM snapshot_changes
GROUP BY url
HAVING COUNT(*) + 1 > 1
ORDER BY content_changes DESC, snapshot_count DESC;

View File

@@ -0,0 +1,30 @@
BEGIN;
WITH matching_urls AS (
SELECT url, host
FROM snapshots
WHERE url ~ '^gemini://[^/]+/$'
AND timestamp < (NOW() - INTERVAL '1 week')
ORDER BY random()
LIMIT 500
)
INSERT INTO urls (url, host)
SELECT url, host
FROM matching_urls
ON CONFLICT DO NOTHING;
-- WITH matching_urls AS (
-- SELECT url, host
-- FROM snapshots
-- WHERE url ~ '^gemini://[^/]+/$'
-- AND timestamp < (NOW() - INTERVAL '1 week')
-- ORDER BY random()
-- LIMIT 500
-- )
-- DELETE FROM snapshots
-- WHERE url IN (
-- SELECT url
-- FROM matching_urls
-- );
COMMIT;

View File

@@ -0,0 +1,20 @@
-- File: host_snapshot_stats.sql
-- Groups snapshots by hosts and shows URLs with multiple snapshots
-- Usage: \i misc/sql/host_snapshot_stats.sql
SELECT
host,
COUNT(DISTINCT url) as unique_urls,
SUM(CASE WHEN url_count > 1 THEN 1 ELSE 0 END) as urls_with_multiple_snapshots,
SUM(snapshot_count) as total_snapshots
FROM (
SELECT
host,
url,
COUNT(*) as snapshot_count,
COUNT(*) OVER (PARTITION BY url) as url_count
FROM snapshots
GROUP BY host, url
) subquery
GROUP BY host
ORDER BY total_snapshots DESC;

View File

@@ -0,0 +1 @@
update urls set being_processed=false where being_processed is true;

View File

@@ -0,0 +1,13 @@
-- File: recent_snapshot_activity.sql
-- Shows URLs with most snapshots in the last 7 days
-- Usage: \i misc/sql/recent_snapshot_activity.sql
SELECT
url,
COUNT(*) as snapshot_count
FROM snapshots
WHERE timestamp > NOW() - INTERVAL '7 days'
GROUP BY url
HAVING COUNT(*) > 1
ORDER BY snapshot_count DESC
LIMIT 20;

View File

@@ -0,0 +1,16 @@
-- File: snapshot_distribution.sql
-- Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots)
-- Usage: \i misc/sql/snapshot_distribution.sql
WITH counts AS (
SELECT url, COUNT(*) as snapshot_count
FROM snapshots
GROUP BY url
)
SELECT
snapshot_count,
COUNT(*) as url_count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM counts
GROUP BY snapshot_count
ORDER BY snapshot_count;

View File

@@ -0,0 +1,37 @@
-- File: snapshots_by_timeframe.sql
-- Shows snapshot count by timeframe (day, week, month)
-- Usage: \i misc/sql/snapshots_by_timeframe.sql
WITH daily_snapshots AS (
SELECT
date_trunc('day', timestamp) as day,
COUNT(*) as snapshot_count,
COUNT(DISTINCT url) as unique_urls
FROM snapshots
GROUP BY day
ORDER BY day
),
weekly_snapshots AS (
SELECT
date_trunc('week', timestamp) as week,
COUNT(*) as snapshot_count,
COUNT(DISTINCT url) as unique_urls
FROM snapshots
GROUP BY week
ORDER BY week
),
monthly_snapshots AS (
SELECT
date_trunc('month', timestamp) as month,
COUNT(*) as snapshot_count,
COUNT(DISTINCT url) as unique_urls
FROM snapshots
GROUP BY month
ORDER BY month
)
SELECT 'Daily' as timeframe, * FROM daily_snapshots
UNION ALL
SELECT 'Weekly' as timeframe, * FROM weekly_snapshots
UNION ALL
SELECT 'Monthly' as timeframe, * FROM monthly_snapshots
ORDER BY timeframe, day;

View File

@@ -0,0 +1,14 @@
-- File: snapshots_date_range.sql
-- Shows snapshot count with date range information for each URL
-- Usage: \i misc/sql/snapshots_date_range.sql
SELECT
url,
COUNT(*) as snapshot_count,
MIN(timestamp) as first_snapshot,
MAX(timestamp) as last_snapshot,
MAX(timestamp) - MIN(timestamp) as time_span
FROM snapshots
GROUP BY url
HAVING COUNT(*) > 1
ORDER BY snapshot_count DESC;

View File

@@ -0,0 +1,8 @@
-- File: snapshots_per_url.sql
-- Basic count of snapshots per URL
-- Usage: \i misc/sql/snapshots_per_url.sql
SELECT url, COUNT(*) as snapshot_count
FROM snapshots
GROUP BY url
ORDER BY snapshot_count DESC;

View File

@@ -0,0 +1,20 @@
-- File: storage_efficiency.sql
-- Shows potential storage savings from deduplication
-- Usage: \i misc/sql/storage_efficiency.sql
WITH duplicate_stats AS (
SELECT
url,
COUNT(*) as snapshot_count,
COUNT(DISTINCT gemtext) as unique_gemtexts,
COUNT(DISTINCT data) as unique_datas
FROM snapshots
GROUP BY url
HAVING COUNT(*) > 1
)
SELECT
SUM(snapshot_count) as total_snapshots,
SUM(unique_gemtexts + unique_datas) as unique_contents,
SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas) as duplicate_content_count,
ROUND((SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas)) * 100.0 / SUM(snapshot_count), 2) as duplicate_percentage
FROM duplicate_stats;

View File

@@ -7,7 +7,7 @@ import (
"gemini-grc/common/contextlog" "gemini-grc/common/contextlog"
"gemini-grc/contextutil" "gemini-grc/contextutil"
"gemini-grc/logging" "git.antanst.com/antanst/logging"
) )
// ParseRobotsTxt takes robots.txt content and a host, and // ParseRobotsTxt takes robots.txt content and a host, and

View File

@@ -10,9 +10,10 @@ import (
"gemini-grc/common/contextlog" "gemini-grc/common/contextlog"
"gemini-grc/common/snapshot" "gemini-grc/common/snapshot"
geminiUrl "gemini-grc/common/url" geminiUrl "gemini-grc/common/url"
"gemini-grc/config"
"gemini-grc/contextutil" "gemini-grc/contextutil"
"gemini-grc/gemini" "gemini-grc/gemini"
"gemini-grc/logging" "git.antanst.com/antanst/logging"
) )
// RobotsCache is a map of blocked URLs // RobotsCache is a map of blocked URLs
@@ -38,7 +39,7 @@ func populateRobotsCache(ctx context.Context, key string) (entries []string, _er
contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Fetching robots.txt from %s", url) contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Fetching robots.txt from %s", url)
// Use the context-aware version to honor timeout and cancellation // Use the context-aware version to honor timeout and cancellation
robotsContent, err := gemini.ConnectAndGetDataWithContext(cacheCtx, url) robotsContent, err := gemini.ConnectAndGetData(cacheCtx, url)
if err != nil { if err != nil {
// Check for context timeout or cancellation specifically // Check for context timeout or cancellation specifically
if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) { if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
@@ -59,12 +60,7 @@ func populateRobotsCache(ctx context.Context, key string) (entries []string, _er
return []string{}, nil return []string{}, nil
} }
// TODO: Update gemini.ProcessData to accept context s = gemini.UpdateSnapshotWithData(*s, robotsContent)
s, err = gemini.ProcessData(*s, robotsContent)
if err != nil {
contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "robots.txt error: %s", err)
return []string{}, nil
}
if s.ResponseCode.ValueOrZero() != 20 { if s.ResponseCode.ValueOrZero() != 20 {
contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero()) contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
@@ -91,14 +87,18 @@ func populateRobotsCache(ctx context.Context, key string) (entries []string, _er
// RobotMatch checks if the snapshot URL matches // RobotMatch checks if the snapshot URL matches
// a robots.txt allow rule. // a robots.txt allow rule.
func RobotMatch(ctx context.Context, u string) (bool, error) { func RobotMatch(ctx context.Context, u string) bool {
// Create a context for robots operations // Create a context for robots operations
robotsCtx := contextutil.ContextWithComponent(ctx, "robotsMatch") robotsCtx := contextutil.ContextWithComponent(ctx, "robotsMatch")
// TODO Missing Gopher functionality
if config.CONFIG.GopherEnable {
return false
}
url, err := geminiUrl.ParseURL(u, "", true) url, err := geminiUrl.ParseURL(u, "", true)
if err != nil { if err != nil {
contextlog.LogErrorWithContext(robotsCtx, logging.GetSlogger(), "Failed to parse URL: %v", err) return false
return false, err
} }
key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port)) key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
@@ -112,16 +112,7 @@ func RobotMatch(ctx context.Context, u string) (bool, error) {
var fetchErr error var fetchErr error
disallowedURLs, fetchErr = populateRobotsCache(ctx, key) disallowedURLs, fetchErr = populateRobotsCache(ctx, key)
if fetchErr != nil { if fetchErr != nil {
contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Error populating robots.txt cache for %s: %v", key, fetchErr) return false
// Handle context timeouts by propagating the error
if errors.Is(fetchErr, context.DeadlineExceeded) || errors.Is(fetchErr, context.Canceled) {
contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Timeout or cancellation while checking robots.txt")
return false, fetchErr
}
// For other errors, assume we can proceed without robots.txt
return false, nil
} }
if len(disallowedURLs) > 0 { if len(disallowedURLs) > 0 {
contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Added to robots.txt cache: %v => %v", key, disallowedURLs) contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Added to robots.txt cache: %v => %v", key, disallowedURLs)
@@ -137,7 +128,7 @@ func RobotMatch(ctx context.Context, u string) (bool, error) {
} }
contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Found %d disallowed paths in robots.txt cache for %s", len(disallowedURLs), key) contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Found %d disallowed paths in robots.txt cache for %s", len(disallowedURLs), key)
} }
return isURLblocked(ctx, disallowedURLs, url.Full), nil return isURLblocked(ctx, disallowedURLs, url.Full)
} }
// Initialize initializes the robots.txt match package // Initialize initializes the robots.txt match package
@@ -157,12 +148,9 @@ func isURLblocked(ctx context.Context, disallowedURLs []string, input string) bo
blockCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.isURLblocked") blockCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.isURLblocked")
inputLower := strings.ToLower(input) inputLower := strings.ToLower(input)
contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "Checking URL against robots.txt rules: %s", input)
for _, url := range disallowedURLs { for _, url := range disallowedURLs {
urlLower := strings.ToLower(url) urlLower := strings.ToLower(url)
contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "Comparing against rule: %s (lower: %s vs %s)", url, inputLower, urlLower)
if strings.HasPrefix(inputLower, urlLower) { if strings.HasPrefix(inputLower, urlLower) {
contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "MATCH! robots.txt rule: %s blocks URL: %s", url, input) contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "MATCH! robots.txt rule: %s blocks URL: %s", url, input)
return true return true

View File

@@ -2,7 +2,6 @@ package robotsMatch
import ( import (
"context" "context"
"errors"
"sync" "sync"
"testing" "testing"
@@ -32,15 +31,7 @@ func TestRobotMatch_EmptyCache(t *testing.T) {
// For empty cache or DNS errors, RobotMatch should return false (allow the URL) without an error // For empty cache or DNS errors, RobotMatch should return false (allow the URL) without an error
ctx := context.Background() ctx := context.Background()
blocked, err := RobotMatch(ctx, "gemini://nonexistent.example.com/") blocked := RobotMatch(ctx, "gemini://nonexistent.example.com/")
// We expect no error for non-existent host because we changed our error handling
// to be more tolerant of DNS/connectivity issues
if err != nil {
// The only errors we should get are context-related (timeout, cancellation)
if !errors.Is(err, context.DeadlineExceeded) && !errors.Is(err, context.Canceled) {
t.Errorf("Expected nil error for non-existent host, got: %v", err)
}
}
// The URL should be allowed (not blocked) when robots.txt can't be fetched // The URL should be allowed (not blocked) when robots.txt can't be fetched
if blocked { if blocked {

10
seed_urls.txt Normal file
View File

@@ -0,0 +1,10 @@
gemini://geminiprotocol.net/
gemini://warmedal.se/~antenna/
gemini://skyjake.fi/~Cosmos/
gemini://gemini.circumlunar.space/capcom/
gemini://auragem.letz.dev/
gemini://gemplex.space/
gemini://kennedy.gemi.dev/
gemini://tlgs.one/
gemini://yesterday.gemlog.org/
gemini://gemini.cyberbot.space/feed.gmi

22
test.txt Normal file
View File

@@ -0,0 +1,22 @@
# Test redirect full url:
gemini://gemini.circumlunar.space
# Test blacklist:
gemi.dev
# Test robots disallow:
gemini://tlgs.one/search?aa
# Test TLS cert required:
gemini://astrobotany.mozz.us/app/plant
// 31 redirect
gemini://gemini.circumlunar.space
// body with null byte
gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False
// has invalid url
gemini://tlgs.one/known-hosts
// Needs SNI TLS info (our bug)
gemini://hanzbrix.pollux.casa/gemlog/20241002.gmi