From 37d5e7cd78467aaf31c8417daab30d66725cb721 Mon Sep 17 00:00:00 2001
From: antanst <>
Date: Mon, 16 Jun 2025 12:29:33 +0300
Subject: [PATCH] Enhance crawler with seed list and SQL utilities

Add a seedList module for URL initialization and comprehensive SQL
utilities for database analysis, and update the project configuration.
---
 .gitignore                             |   9 +
 ARCHITECTURE.md                        |   7 +
 common/blackList/blacklist.go          |   2 +-
 common/seedList/seedlist.go            |  67 ++++++
 common/seedList/seedlist_test.go       |  67 ++++++
 common/snapshot/snapshot.go            |   2 +-
 common/url/url_test.go                 |   2 +-
 common/whiteList/whitelist.go          |   2 +-
 common/worker.go                       | 171 +++++++--------
 db/db.go                               |  46 +++--
 gemget.sh                              |  16 --
 gemini/geminiLinks.go                  |   2 +-
 gemini/network.go                      | 210 ++++++++++++++++++-
 gemini/network_context.go              | 276 -------------------------
 gemini/network_test.go                 |  12 +-
 gemini/processing.go                   |  18 +-
 go.mod                                 |   5 +-
 gopher/network.go                      |   2 +-
 gopher/network_context.go              |   4 +-
 hostPool/hostPool.go                   |  20 +-
 logging/logging.go                     | 188 -----------------
 misc/sql/README.md                     |  28 +++
 misc/sql/content_changes.sql           |  26 +++
 misc/sql/crawl_top_level.sql           |  30 +++
 misc/sql/host_snapshot_stats.sql       |  20 ++
 misc/sql/mark_urls_processed_false.sql |   1 +
 misc/sql/recent_snapshot_activity.sql  |  13 ++
 misc/sql/snapshot_distribution.sql     |  16 ++
 misc/sql/snapshots_by_timeframe.sql    |  37 ++++
 misc/sql/snapshots_date_range.sql      |  14 ++
 misc/sql/snapshots_per_url.sql         |   8 +
 misc/sql/storage_efficiency.sql        |  20 ++
 robotsMatch/robots.go                  |   2 +-
 robotsMatch/robotsMatch.go             |  38 ++--
 robotsMatch/robotsMatch_test.go        |  11 +-
 seed_urls.txt                          |  10 +
 test.txt                               |  22 ++
 37 files changed, 742 insertions(+), 682 deletions(-)
 create mode 100644 common/seedList/seedlist.go
 create mode 100644 common/seedList/seedlist_test.go
 delete mode 100755 gemget.sh
 delete mode 100644 gemini/network_context.go
 delete mode 100644 logging/logging.go
 create mode 100644 misc/sql/README.md
 create mode 100644 misc/sql/content_changes.sql
 create mode 100644 misc/sql/crawl_top_level.sql
 create mode 100644 misc/sql/host_snapshot_stats.sql
 create mode 100644 misc/sql/mark_urls_processed_false.sql
 create mode 100644 misc/sql/recent_snapshot_activity.sql
 create mode 100644 misc/sql/snapshot_distribution.sql
 create mode 100644 misc/sql/snapshots_by_timeframe.sql
 create mode 100644 misc/sql/snapshots_date_range.sql
 create mode 100644 misc/sql/snapshots_per_url.sql
 create mode 100644 misc/sql/storage_efficiency.sql
 create mode 100644 seed_urls.txt
 create mode 100644 test.txt

diff --git a/.gitignore b/.gitignore
index 91610c8..30829ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 **/.#*
 **/*~
+**/.DS_Store
 /.idea
 /.goroot
 /dist/**
@@ -18,3 +19,11 @@
 run*.sh
 /db/sql/**
 **/.claude/settings.local.json
+
+/crawl.sh
+/crawler.sh
+/get.sh
+/snapshot_history.sh
+/whitelist.txt
+
+/CLAUDE.md
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 6043c47..a95daee 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -19,6 +19,13 @@ CREATE UNIQUE INDEX idx_url_timestamp ON snapshots (url, timestamp);
 CREATE INDEX idx_url_latest ON snapshots (url, timestamp DESC);
 ```

+## Error handling
+- The `xerrors` library is used for error creation and wrapping.
+- The "Fatal" field is not used; we _always_ panic on fatal errors.
+- _All_ internal functions _must_ return `xerrors` errors.
+- _All_ external errors are wrapped within `xerrors` errors.
+
+
 ### Code Changes
 1. 
**Updated SQL Queries**: diff --git a/common/blackList/blacklist.go b/common/blackList/blacklist.go index febb081..6202634 100644 --- a/common/blackList/blacklist.go +++ b/common/blackList/blacklist.go @@ -7,7 +7,7 @@ import ( "strings" "gemini-grc/config" - "gemini-grc/logging" + "git.antanst.com/antanst/logging" "git.antanst.com/antanst/xerrors" ) diff --git a/common/seedList/seedlist.go b/common/seedList/seedlist.go new file mode 100644 index 0000000..e8308b2 --- /dev/null +++ b/common/seedList/seedlist.go @@ -0,0 +1,67 @@ +package seedList + +import ( + "fmt" + "os" + "strings" + + "git.antanst.com/antanst/logging" + "git.antanst.com/antanst/xerrors" +) + +var seedlist []string //nolint:gochecknoglobals + +func Initialize() error { + var err error + + // Initialize seedlist from fixed path + if err = loadSeedlist("seed_urls.txt"); err != nil { + return err + } + + return nil +} + +func loadSeedlist(filePath string) error { + if seedlist != nil { + return nil + } + + data, err := os.ReadFile(filePath) + if err != nil { + seedlist = []string{} + return xerrors.NewError(fmt.Errorf("could not load seedlist file: %w", err), 0, "", true) + } + + lines := strings.Split(string(data), "\n") + seedlist = []string{} + + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + seedlist = append(seedlist, line) + } + + if len(seedlist) > 0 { + logging.LogInfo("Loaded %d seed URLs", len(seedlist)) + } + + return nil +} + +func Shutdown() error { + return nil +} + +// GetSeedURLs returns the list of seed URLs +func GetSeedURLs() []string { + if seedlist == nil { + return []string{} + } + // Return a copy to prevent external modification + result := make([]string, len(seedlist)) + copy(result, seedlist) + return result +} diff --git a/common/seedList/seedlist_test.go b/common/seedList/seedlist_test.go new file mode 100644 index 0000000..23ac791 --- /dev/null +++ b/common/seedList/seedlist_test.go @@ -0,0 +1,67 @@ +package seedList + +import ( + "os" + "testing" +) + +func TestLoadSeedlist(t *testing.T) { + // Create a temporary test file + content := `# Test seed URLs +gemini://example.com/ +gemini://test.com/ + +# Another comment +gemini://demo.org/` + + tmpFile, err := os.CreateTemp("", "seed_urls_test_*.txt") + if err != nil { + t.Fatalf("Failed to create temp file: %v", err) + } + defer os.Remove(tmpFile.Name()) + + if _, err := tmpFile.WriteString(content); err != nil { + t.Fatalf("Failed to write to temp file: %v", err) + } + tmpFile.Close() + + // Reset global variable for test + seedlist = nil + + // Test loading + err = loadSeedlist(tmpFile.Name()) + if err != nil { + t.Fatalf("Failed to load seedlist: %v", err) + } + + // Verify content + expected := []string{ + "gemini://example.com/", + "gemini://test.com/", + "gemini://demo.org/", + } + + urls := GetSeedURLs() + if len(urls) != len(expected) { + t.Errorf("Expected %d URLs, got %d", len(expected), len(urls)) + } + + for i, url := range urls { + if url != expected[i] { + t.Errorf("Expected URL %d to be %s, got %s", i, expected[i], url) + } + } +} + +func TestGetSeedURLsEmptyList(t *testing.T) { + // Reset global variable + originalSeedlist := seedlist + defer func() { seedlist = originalSeedlist }() + + seedlist = nil + + urls := GetSeedURLs() + if len(urls) != 0 { + t.Errorf("Expected empty list, got %d URLs", len(urls)) + } +} diff --git a/common/snapshot/snapshot.go b/common/snapshot/snapshot.go index 09d1db9..ec3580d 100644 --- a/common/snapshot/snapshot.go +++ 
b/common/snapshot/snapshot.go
@@ -27,7 +27,7 @@ type Snapshot struct {
 func SnapshotFromURL(u string, normalize bool) (*Snapshot, error) {
    url, err := commonUrl.ParseURL(u, "", normalize)
    if err != nil {
-       return nil, xerrors.NewError(err, 0, "", false)
+       return nil, xerrors.NewSimpleError(err)
    }
    newSnapshot := Snapshot{
        URL: *url,
diff --git a/common/url/url_test.go b/common/url/url_test.go
index 5f4373d..badd343 100644
--- a/common/url/url_test.go
+++ b/common/url/url_test.go
@@ -117,7 +117,7 @@ func TestURLOperations(t *testing.T) {
        }
    })

-   t.Run("NormalizeURL", func(t *testing.T) {
+   t.Run("CheckAndUpdateNormalizedURL", func(t *testing.T) {
        t.Parallel()

        tests := []struct {
diff --git a/common/whiteList/whitelist.go b/common/whiteList/whitelist.go
index 119d807..a642591 100644
--- a/common/whiteList/whitelist.go
+++ b/common/whiteList/whitelist.go
@@ -7,7 +7,7 @@ import (
    "strings"

    "gemini-grc/config"
-   "gemini-grc/logging"
+   "git.antanst.com/antanst/logging"
    "git.antanst.com/antanst/xerrors"
 )
diff --git a/common/worker.go b/common/worker.go
index 89463f2..5d8c652 100644
--- a/common/worker.go
+++ b/common/worker.go
@@ -19,8 +19,8 @@ import (
    "gemini-grc/gemini"
    "gemini-grc/gopher"
    "gemini-grc/hostPool"
-   "gemini-grc/logging"
    "gemini-grc/robotsMatch"
+   "git.antanst.com/antanst/logging"
    "git.antanst.com/antanst/xerrors"
    "github.com/guregu/null/v5"
    "github.com/jmoiron/sqlx"
@@ -30,7 +30,7 @@ func RunWorkerWithTx(workerID int, job string) {
    // Extract host from URL for the context.
    parsedURL, err := url2.ParseURL(job, "", true)
    if err != nil {
-       logging.LogInfo("Failed to parse job URL: %s Error: %s", job, err)
+       logging.LogInfo("Failed to parse URL: %s Error: %s", job, err)
        return
    }
    host := parsedURL.Hostname
@@ -38,8 +38,9 @@ func RunWorkerWithTx(workerID int, job string) {
    // Create a new worker context
    baseCtx := context.Background()
    ctx, cancel := contextutil.NewRequestContext(baseCtx, job, host, workerID)
-   defer cancel() // Ensure the context is cancelled when we're done
    ctx = contextutil.ContextWithComponent(ctx, "worker")
+   defer cancel() // Ensure the context is cancelled when we're done
+
+   // contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "======================================\n\n")
    contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting worker for URL %s", job)

    // Create a new db transaction
@@ -51,66 +52,53 @@
    err = runWorker(ctx, tx, []string{job})
    if err != nil {
-       // Handle context cancellation and timeout errors gracefully, instead of treating them as fatal
+       // Two cases to handle:
+       // - context cancellation/timeout errors (log and ignore)
+       // - fatal errors (log and send to chan)
+       // Non-fatal errors should have been handled inside
+       // runWorker() and should not bubble up here.
        if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
            contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker timed out or canceled: %v", err)
-           rollbackErr := SafeRollback(ctx, tx)
+           rollbackErr := gemdb.SafeRollback(ctx, tx)
            if rollbackErr != nil {
                FatalErrorsChan <- rollbackErr
                return
            }
            return
+       } else if xerrors.IsFatal(err) {
+           contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err)
+           rollbackErr := gemdb.SafeRollback(ctx, tx)
+           if rollbackErr != nil {
+               FatalErrorsChan <- rollbackErr
+               return
+           }
+           FatalErrorsChan <- err
+           return
        }
-       // For other errors, we treat them as fatal.
- contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err) - rollbackErr := SafeRollback(ctx, tx) - if rollbackErr != nil { - FatalErrorsChan <- rollbackErr - } - FatalErrorsChan <- err - return + panic(err) // We shouldn't reach this point! } - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Committing transaction") err = tx.Commit() if err != nil && !errors.Is(err, sql.ErrTxDone) { contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to commit transaction: %v", err) - if rollbackErr := SafeRollback(ctx, tx); rollbackErr != nil { + if rollbackErr := gemdb.SafeRollback(ctx, tx); rollbackErr != nil { FatalErrorsChan <- err return } } - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker done!") -} - -// SafeRollback attempts to roll back a transaction, -// handling the case if the tx was already finalized. -func SafeRollback(ctx context.Context, tx *sqlx.Tx) error { - rollbackErr := tx.Rollback() - if rollbackErr != nil { - // Check if it's the standard "transaction already finalized" error - if errors.Is(rollbackErr, sql.ErrTxDone) { - contextlog.LogWarnWithContext(ctx, logging.GetSlogger(), "Rollback failed because transaction is already finalized") - return nil - } - // Only panic for other types of rollback failures - contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to rollback transaction: %v", rollbackErr) - return xerrors.NewError(fmt.Errorf("failed to rollback transaction: %w", rollbackErr), 0, "", true) - } - return nil + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker done.") } func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error { - total := len(urls) - for i, u := range urls { - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting %d/%d %s", i+1, total, u) - urlCtx, cancelFunc := context.WithCancel(ctx) - err := WorkOnUrl(urlCtx, tx, u) - cancelFunc() + for _, u := range urls { + err := WorkOnUrl(ctx, tx, u) if err != nil { - return err + if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) || xerrors.IsFatal(err) { + return err + } + contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Worker failed: %v", err) } - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Done %d/%d.", i+1, total) } return nil } @@ -119,47 +107,44 @@ func runWorker(ctx context.Context, tx *sqlx.Tx, urls []string) error { // unexpected errors are returned. // expected errors are stored within the snapshot. 
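
The worker's error-handling convention (fatal errors always end in a panic; everything else is wrapped and recorded in the snapshot) follows the section added to ARCHITECTURE.md in this patch. As a minimal sketch, not part of the patch, assuming `xerrors.NewSimpleError` wraps a non-fatal error and `xerrors.IsFatal` reports the fatal flag, matching the calls used throughout this diff:

```go
package main

import (
	"fmt"

	"git.antanst.com/antanst/xerrors"
)

// visitOnce mimics the worker convention: wrap an external failure as a
// non-fatal xerrors error so the caller can record it in the snapshot.
func visitOnce(url string) error {
	err := fmt.Errorf("connection refused") // stand-in for a network error
	return xerrors.NewSimpleError(fmt.Errorf("visit %s: %w", url, err))
}

func main() {
	if err := visitOnce("gemini://example.com/"); err != nil {
		if xerrors.IsFatal(err) {
			panic(err) // project convention: always panic on fatal errors
		}
		fmt.Println("recorded in snapshot:", err) // non-fatal: crawl continues
	}
}
```
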
func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) { - // Create a context specifically for this URL with "url" component - urlCtx := contextutil.ContextWithComponent(ctx, "url") - - contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Processing URL: %s", url) + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Worker visiting URL %s", url) s, err := snapshot.SnapshotFromURL(url, true) if err != nil { - contextlog.LogErrorWithContext(urlCtx, logging.GetSlogger(), "Failed to parse URL: %v", err) + contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to parse URL: %v", err) return err } + // We always use the normalized URL + if url != s.URL.Full { + //err = gemdb.Database.CheckAndUpdateNormalizedURL(ctx, tx, url, s.URL.Full) + //if err != nil { + // return err + //} + //contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Normalized URL: %s → %s", url, s.URL.Full) + url = s.URL.Full + } + isGemini := url2.IsGeminiUrl(s.URL.String()) isGopher := url2.IsGopherURL(s.URL.String()) if !isGemini && !isGopher { - return xerrors.NewError(fmt.Errorf("not a Gopher or Gemini URL: %s", s.URL.String()), 0, "", false) + return xerrors.NewSimpleError(fmt.Errorf("not a Gopher or Gemini URL: %s", s.URL.String())) } if isGopher && !config.CONFIG.GopherEnable { - contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Skipping gopher URL (disabled in config)") - return nil - } - - if url != s.URL.Full { - err = gemdb.Database.NormalizeURL(ctx, tx, url, s.URL.Full) - if err != nil { - return err - } - contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Normalized URL: %s → %s", url, s.URL.Full) - url = s.URL.Full + return xerrors.NewSimpleError(fmt.Errorf("gopher disabled, not processing Gopher URL: %s", s.URL.String())) } // Check if URL is whitelisted isUrlWhitelisted := whiteList.IsWhitelisted(s.URL.String()) if isUrlWhitelisted { - contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "URL matches whitelist, forcing crawl %s", url) + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches whitelist, forcing crawl %s", url) } // Only check blacklist if URL is not whitelisted if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) { - contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "URL matches blacklist, ignoring %s", url) + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches blacklist, ignoring %s", url) s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error()) return saveSnapshotAndRemoveURL(ctx, tx, s) } @@ -169,58 +154,44 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) { if !isUrlWhitelisted && isGemini { // If URL matches a robots.txt disallow line, // add it as an error and remove url - robotMatch, err = robotsMatch.RobotMatch(urlCtx, s.URL.String()) - if err != nil { - if commonErrors.IsHostError(err) { - return removeURL(ctx, tx, s.URL.String()) - } - return err - } + robotMatch = robotsMatch.RobotMatch(ctx, s.URL.String()) if robotMatch { - contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "URL matches robots.txt, skipping") + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches robots.txt, skipping") s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error()) return saveSnapshotAndRemoveURL(ctx, tx, s) } } - contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Adding to host pool") - err = hostPool.AddHostToHostPool(urlCtx, s.Host) + err = hostPool.AddHostToHostPool(ctx, s.Host) if err != 
nil { - contextlog.LogErrorWithContext(urlCtx, logging.GetSlogger(), "Failed to add host to pool: %v", err) return err } defer func(ctx context.Context, host string) { hostPool.RemoveHostFromPool(ctx, host) - }(urlCtx, s.Host) + }(ctx, s.Host) - contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Visiting %s", s.URL.String()) + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Visiting %s", s.URL.String()) // Use context-aware visits for both protocols if isGopher { - // Use the context-aware version for Gopher visits - s, err = gopher.VisitWithContext(urlCtx, s.URL.String()) + s, err = gopher.VisitWithContext(ctx, s.URL.String()) } else { - // Use the context-aware version for Gemini visits - s, err = gemini.Visit(urlCtx, s.URL.String()) + s, err = gemini.Visit(ctx, s.URL.String()) } if err != nil { - contextlog.LogErrorWithContext(urlCtx, logging.GetSlogger(), "Error visiting URL: %v", err) + contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Error visiting URL: %v", err) return err } - if s == nil { - contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "No snapshot returned") - return nil - } // Handle Gemini redirection. if isGemini && s.ResponseCode.ValueOrZero() >= 30 && s.ResponseCode.ValueOrZero() < 40 { - err = handleRedirection(urlCtx, tx, s) + err = saveRedirectURL(ctx, tx, s) if err != nil { - return fmt.Errorf("error while handling redirection: %s", err) + return xerrors.NewSimpleError(fmt.Errorf("error while handling redirection: %s", err)) } } @@ -231,14 +202,14 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) { return err } if identical { - contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Content identical to existing snapshot, skipping") + contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, skipping") return removeURL(ctx, tx, s.URL.String()) } } // Process and store links since content has changed if len(s.Links.ValueOrZero()) > 0 { - contextlog.LogDebugWithContext(urlCtx, logging.GetSlogger(), "Found %d links", len(s.Links.ValueOrZero())) + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Found %d links", len(s.Links.ValueOrZero())) err = storeLinks(ctx, tx, s) if err != nil { return err @@ -246,7 +217,11 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) { } // Save the snapshot and remove the URL from the queue - contextlog.LogInfoWithContext(urlCtx, logging.GetSlogger(), "%2d %s", s.ResponseCode.ValueOrZero(), s.URL.String()) + if s.Error.ValueOrZero() != "" { + contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero()) + } else { + contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero()) + } return saveSnapshotAndRemoveURL(ctx, tx, s) } @@ -273,12 +248,10 @@ func storeLinks(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error { return nil } -// Context-aware version of removeURL func removeURL(ctx context.Context, tx *sqlx.Tx, url string) error { return gemdb.Database.DeleteURL(ctx, tx, url) } -// Context-aware version of saveSnapshotAndRemoveURL func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error { err := gemdb.Database.SaveSnapshot(ctx, tx, s) if err != nil { @@ -304,7 +277,7 @@ func haveWeVisitedURL(ctx context.Context, tx *sqlx.Tx, u string) (bool, error) // Check if the context is cancelled if err := ctx.Err(); err != nil { - return false, err + return false, 
xerrors.NewSimpleError(err)
    }

    // Check the urls table which holds the crawl queue.
@@ -337,7 +310,6 @@ func haveWeVisitedURL(ctx context.Context, tx *sqlx.Tx, u string) (bool, error)
        }
        if len(recentSnapshots) > 0 {
-           contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Skipping URL %s (updated within last %d days)", u, config.CONFIG.SkipIfUpdatedDays)
            return true, nil
        }
    }
@@ -345,30 +317,25 @@ func haveWeVisitedURL(ctx context.Context, tx *sqlx.Tx, u string) (bool, error)
    return false, nil
 }

-func handleRedirection(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
-   // Create a context specifically for redirection handling
-   redirectCtx := contextutil.ContextWithComponent(ctx, "redirect")
-
-   // Use the redirectCtx for all operations
+func saveRedirectURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
    newURL, err := url2.ExtractRedirectTargetFromHeader(s.URL, s.Header.ValueOrZero())
    if err != nil {
-       contextlog.LogErrorWithContext(redirectCtx, logging.GetSlogger(), "Failed to extract redirect target: %v", err)
+       contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to extract redirect target: %v", err)
        return err
    }
-   contextlog.LogDebugWithContext(redirectCtx, logging.GetSlogger(), "Page redirects to %s", newURL)
+   contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Page redirects to %s", newURL)

-   haveWeVisited, err := haveWeVisitedURL(redirectCtx, tx, newURL.String())
+   haveWeVisited, err := haveWeVisitedURL(ctx, tx, newURL.String())
    if err != nil {
        return err
    }

    if shouldPersistURL(newURL) && !haveWeVisited {
-       err = gemdb.Database.InsertURL(redirectCtx, tx, newURL.Full)
+       err = gemdb.Database.InsertURL(ctx, tx, newURL.Full)
        if err != nil {
-           contextlog.LogErrorWithContext(redirectCtx, logging.GetSlogger(), "Failed to insert redirect URL: %v", err)
            return err
        }
-       contextlog.LogDebugWithContext(redirectCtx, logging.GetSlogger(), "Saved redirection URL %s", newURL.String())
+       contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Saved redirection URL %s", newURL.String())
    }
    return nil
 }
diff --git a/db/db.go b/db/db.go
index 5243fa2..bcd428b 100644
--- a/db/db.go
+++ b/db/db.go
@@ -16,7 +16,7 @@ import (
    commonUrl "gemini-grc/common/url"
    "gemini-grc/config"
    "gemini-grc/contextutil"
-   "gemini-grc/logging"
+   "git.antanst.com/antanst/logging"
    "git.antanst.com/antanst/xerrors"
    "github.com/guregu/null/v5"
    _ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
@@ -32,7 +32,7 @@ type DbService interface {
    // URL methods
    InsertURL(ctx context.Context, tx *sqlx.Tx, url string) error
-   NormalizeURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error
+   CheckAndUpdateNormalizedURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error
    DeleteURL(ctx context.Context, tx *sqlx.Tx, url string) error
    MarkURLsAsBeingProcessed(ctx context.Context, tx *sqlx.Tx, urls []string) error
    GetUrlHosts(ctx context.Context, tx *sqlx.Tx) ([]string, error)
@@ -117,10 +117,13 @@ func (d *DbServiceImpl) Initialize(ctx context.Context) error {
    return nil
 }

-// Shutdown the database with context
 func (d *DbServiceImpl) Shutdown(ctx context.Context) error {
    dbCtx := contextutil.ContextWithComponent(ctx, "database")
-   contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Shutting down database connection")
+   contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Shutting down database connections")
+   _, err := d.db.Exec("UPDATE urls SET being_processed=false")
+   if err != nil {
+       contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Unable to update urls table: %v", err)
+   }

    d.mu.Lock()
    defer d.mu.Unlock()
@@ -129,12 +132,11 @@ func (d *DbServiceImpl) Shutdown(ctx context.Context) error {
        return nil
    }

-   // Check if the context is cancelled before proceeding
    if err := ctx.Err(); err != nil {
        return err
    }

-   err := d.db.Close()
+   err = d.db.Close()
    if err != nil {
        contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Error closing database connection: %v", err)
    } else {
@@ -154,7 +156,6 @@ func (d *DbServiceImpl) NewTx(ctx context.Context) (*sqlx.Tx, error) {
        return nil, err
    }

-   contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Creating new database transaction")
    tx, err := d.db.BeginTxx(ctx, nil)
    if err != nil {
        contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Failed to create transaction: %v", err)
@@ -199,7 +200,7 @@ func (d *DbServiceImpl) InsertURL(ctx context.Context, tx *sqlx.Tx, url string)
 }

-// NormalizeURL normalizes a URL with context
-func (d *DbServiceImpl) NormalizeURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error {
+// CheckAndUpdateNormalizedURL updates a stored URL to its normalized form.
+func (d *DbServiceImpl) CheckAndUpdateNormalizedURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error {
    dbCtx := contextutil.ContextWithComponent(ctx, "database")

    // Check if URLs are already the same
@@ -214,7 +215,6 @@ func (d *DbServiceImpl) NormalizeURL(ctx context.Context, tx *sqlx.Tx, url strin
    contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Updating normalized URL %s -> %s", url, normalizedURL)

-   // Context-aware implementation
    query := SQL_UPDATE_URL
    a := struct {
        Url string `db:"Url"`
@@ -514,12 +514,13 @@ func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *
        return false, err
    }

-   // Context-aware implementation
+   // Update: Skipped this because empty pages can be valid,
+   // e.g. pages with redirect headers.
    // Only check for identical content if we have gemtext or data
-   if (!s.GemText.Valid || s.GemText.String == "") &&
-       (!s.Data.Valid || len(s.Data.V) == 0) {
-       return false, nil
-   }
+   //if (!s.GemText.Valid || s.GemText.String == "") &&
+   //  (!s.Data.Valid || len(s.Data.V) == 0) {
+   //  return false, nil
+   //}

    // Try to get the latest snapshot for this URL
    latestSnapshot := &snapshot.Snapshot{}
@@ -546,3 +547,20 @@ func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *
    return false, nil
 }
+
+// SafeRollback attempts to roll back a transaction,
+// handling the case where the tx was already finalized.
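
A hypothetical caller wired the way RunWorkerWithTx uses this helper, beginning a transaction, rolling back through SafeRollback on failure and committing otherwise, might look like the sketch below; it is not part of the patch, and `gemdb` is assumed to alias `gemini-grc/db`, as in worker.go:

```go
package example

import (
	"context"

	gemdb "gemini-grc/db"
	"github.com/jmoiron/sqlx"
)

// withTx shows the intended SafeRollback contract: sql.ErrTxDone from an
// already-finalized tx is treated as benign inside SafeRollback; any other
// rollback failure comes back as a fatal xerrors error.
func withTx(ctx context.Context, db *sqlx.DB, work func(*sqlx.Tx) error) error {
	tx, err := db.BeginTxx(ctx, nil)
	if err != nil {
		return err
	}
	if err := work(tx); err != nil {
		if rbErr := gemdb.SafeRollback(ctx, tx); rbErr != nil {
			return rbErr // rollback itself failed; propagate as fatal
		}
		return err
	}
	return tx.Commit()
}
```
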
+func SafeRollback(ctx context.Context, tx *sqlx.Tx) error {
+   rollbackErr := tx.Rollback()
+   if rollbackErr != nil {
+       // Check if it's the standard "transaction already finalized" error
+       if errors.Is(rollbackErr, sql.ErrTxDone) {
+           contextlog.LogWarnWithContext(ctx, logging.GetSlogger(), "Rollback failed because transaction is already finalized")
+           return nil
+       }
+       // Only return error for other types of rollback failures
+       contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to rollback transaction: %v", rollbackErr)
+       return xerrors.NewError(fmt.Errorf("failed to rollback transaction: %w", rollbackErr), 0, "", true)
+   }
+   return nil
+}
diff --git a/gemget.sh b/gemget.sh
deleted file mode 100755
index 233b677..0000000
--- a/gemget.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/env bash
-set -eu
-set -o pipefail
-
-# Max response size 10MiB
-LOG_LEVEL=debug \
-PRINT_WORKER_STATUS=false \
-DRY_RUN=false \
-NUM_OF_WORKERS=1 \
-WORKER_BATCH_SIZE=1 \
-BLACKLIST_PATH="$(pwd)/blacklist.txt" \
-MAX_RESPONSE_SIZE=10485760 \
-RESPONSE_TIMEOUT=10 \
-PANIC_ON_UNEXPECTED_ERROR=true \
-go run ./bin/gemget/main.go "$@"
-
diff --git a/gemini/geminiLinks.go b/gemini/geminiLinks.go
index 862eff4..9d14678 100644
--- a/gemini/geminiLinks.go
+++ b/gemini/geminiLinks.go
@@ -7,8 +7,8 @@ import (

    "gemini-grc/common/linkList"
    url2 "gemini-grc/common/url"
-   "gemini-grc/logging"
    "gemini-grc/util"
+   "git.antanst.com/antanst/logging"
    "git.antanst.com/antanst/xerrors"
 )
diff --git a/gemini/network.go b/gemini/network.go
index be76ae2..f6fb2a9 100644
--- a/gemini/network.go
+++ b/gemini/network.go
@@ -1,23 +1,214 @@
 package gemini

 import (
+   "context"
+   "crypto/tls"
+   "errors"
    "fmt"
+   "io"
+   "net"
+   stdurl "net/url"
    "regexp"
    "slices"
    "strconv"
    "strings"
+   "time"

-   commonErrors "gemini-grc/common/errors"
+   "gemini-grc/common/contextlog"
    "gemini-grc/common/snapshot"
+   _url "gemini-grc/common/url"
+   "gemini-grc/config"
+   "gemini-grc/contextutil"
+   "git.antanst.com/antanst/logging"
+   "git.antanst.com/antanst/xerrors"
    "github.com/guregu/null/v5"
 )

-// ProcessData processes the raw data from a Gemini response and populates the Snapshot.
+// Visit visits a given URL using the Gemini protocol,
+// and returns a populated snapshot. Any relevant errors
+// when visiting the URL are stored in the snapshot;
+// an error is returned only when construction of a
+// snapshot was not possible (context cancellation,
+// invalid URL, etc.).
+func Visit(ctx context.Context, url string) (s *snapshot.Snapshot, err error) {
+   geminiCtx := contextutil.ContextWithComponent(ctx, "network")
+
+   s, err = snapshot.SnapshotFromURL(url, true)
+   if err != nil {
+       return nil, err
+   }
+
+   // Check if the context has been canceled
+   if err := ctx.Err(); err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+
+   data, err := ConnectAndGetData(geminiCtx, s.URL.String())
+   if err != nil {
+       s.Error = null.StringFrom(err.Error())
+       return s, nil
+   }
+
+   // Check if the context has been canceled
+   if err := ctx.Err(); err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+
+   s = UpdateSnapshotWithData(*s, data)
+
+   if !s.Error.Valid &&
+       s.MimeType.Valid &&
+       s.MimeType.String == "text/gemini" &&
+       len(s.GemText.ValueOrZero()) > 0 {
+       links := GetPageLinks(s.URL, s.GemText.String)
+       if len(links) > 0 {
+           s.Links = null.ValueFrom(links)
+       }
+   }
+
+   return s, nil
+}
+
+// ConnectAndGetData connects to a Gemini server and returns
+// the data from a GET request to a Gemini URL. It uses the context
+// for cancellation, timeout, and logging.
+func ConnectAndGetData(ctx context.Context, url string) ([]byte, error) {
+   parsedURL, err := stdurl.Parse(url)
+   if err != nil {
+       return nil, xerrors.NewSimpleError(fmt.Errorf("error parsing URL: %w", err))
+   }
+
+   hostname := parsedURL.Hostname()
+   port := parsedURL.Port()
+   if port == "" {
+       port = "1965"
+   }
+   host := fmt.Sprintf("%s:%s", hostname, port)
+
+   // Check if the context has been canceled before proceeding
+   if err := ctx.Err(); err != nil {
+       return nil, err
+   }
+
+   timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
+
+   // Establish the underlying TCP connection with context-based cancellation
+   dialer := &net.Dialer{
+       Timeout: timeoutDuration,
+   }
+
+   conn, err := dialer.DialContext(ctx, "tcp", host)
+   if err != nil {
+       contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to establish TCP connection: %v", err)
+       return nil, xerrors.NewSimpleError(err)
+   }
+
+   // Make sure we always close the connection
+   defer func() {
+       _ = conn.Close()
+   }()
+
+   err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
+   if err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+   err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
+   if err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+
+   // Check if the context has been canceled before proceeding with TLS handshake
+   if err := ctx.Err(); err != nil {
+       return nil, err
+   }
+
+   // Perform the TLS handshake
+   tlsConfig := &tls.Config{
+       InsecureSkipVerify: true,                 //nolint:gosec // Accept all TLS certs, even if insecure.
+       ServerName:         parsedURL.Hostname(), // SNI says we should not include port in hostname
+   }
+
+   tlsConn := tls.Client(conn, tlsConfig)
+   err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration))
+   if err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+   err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration))
+   if err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+
+   // Check if the context is done before attempting handshake
+   if err := ctx.Err(); err != nil {
+       return nil, err
+   }
+
+   // Perform the TLS handshake; the read/write deadlines set
+   // above bound how long it can take.
+   err = tlsConn.Handshake()
+   if err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+
+   // Check again if the context is done after handshake
+   if err := ctx.Err(); err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+
+   // We read `buf`-sized chunks and add data to `data`
+   buf := make([]byte, 4096)
+   var data []byte
+
+   // Check if the context has been canceled before sending request
+   if err := ctx.Err(); err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+
+   // Send the Gemini request to trigger the server response.
+   // Workaround for a server bug:
+   // Some servers return 'Header: 53 No proxying to other hosts or ports!'
+   // when the port is 1965 and is still specified explicitly in the URL.
+   url2, _ := _url.ParseURL(url, "", true)
+   _, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort())))
+   if err != nil {
+       return nil, xerrors.NewSimpleError(err)
+   }
+
+   // Read response bytes in len(buf) byte chunks
+   for {
+       // Check if the context has been canceled before each read
+       if err := ctx.Err(); err != nil {
+           return nil, xerrors.NewSimpleError(err)
+       }
+
+       n, err := tlsConn.Read(buf)
+       if n > 0 {
+           data = append(data, buf[:n]...)
+ } + if len(data) > config.CONFIG.MaxResponseSize { + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Response too large (max: %d bytes)", config.CONFIG.MaxResponseSize) + return nil, xerrors.NewSimpleError(fmt.Errorf("response too large")) + } + if err != nil { + if errors.Is(err, io.EOF) { + break + } + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Error reading data: %v", err) + return nil, xerrors.NewSimpleError(err) + } + } + + contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Received %d bytes of data", len(data)) + return data, nil +} + +// UpdateSnapshotWithData processes the raw data from a Gemini response and populates the Snapshot. // This function is exported for use by the robotsMatch package. -func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) { +func UpdateSnapshotWithData(s snapshot.Snapshot, data []byte) *snapshot.Snapshot { header, body, err := getHeadersAndData(data) if err != nil { - return &s, err + s.Error = null.StringFrom(err.Error()) + return &s } code, mimeType, lang := getMimeTypeAndLang(header) @@ -39,13 +230,14 @@ func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) { if mimeType == "text/gemini" { validBody, err := BytesToValidUTF8(body) if err != nil { - return nil, err + s.Error = null.StringFrom(err.Error()) + return &s } s.GemText = null.StringFrom(validBody) } else { s.Data = null.ValueFrom(body) } - return &s, nil + return &s } // Checks for a Gemini header, which is @@ -55,7 +247,7 @@ func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) { func getHeadersAndData(data []byte) (string, []byte, error) { firstLineEnds := slices.Index(data, '\n') if firstLineEnds == -1 { - return "", nil, commonErrors.NewHostError(fmt.Errorf("error parsing header")) + return "", nil, xerrors.NewSimpleError(fmt.Errorf("error parsing header")) } firstLine := string(data[:firstLineEnds]) rest := data[firstLineEnds+1:] @@ -106,7 +298,3 @@ func getMimeTypeAndLang(headers string) (int, string, string) { lang := matches[3] // Will be empty string if no lang parameter was found return code, mimeType, lang } - -func isGeminiCapsule(s *snapshot.Snapshot) bool { - return !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" -} diff --git a/gemini/network_context.go b/gemini/network_context.go deleted file mode 100644 index b62177e..0000000 --- a/gemini/network_context.go +++ /dev/null @@ -1,276 +0,0 @@ -package gemini - -import ( - "context" - "crypto/tls" - "errors" - "fmt" - "io" - "net" - stdurl "net/url" - "time" - - "gemini-grc/common/contextlog" - commonErrors "gemini-grc/common/errors" - "gemini-grc/common/snapshot" - _url "gemini-grc/common/url" - "gemini-grc/config" - "gemini-grc/contextutil" - "gemini-grc/logging" - "git.antanst.com/antanst/xerrors" - "github.com/guregu/null/v5" -) - -// Visit visits a given URL using the Gemini protocol. -func Visit(ctx context.Context, url string) (s *snapshot.Snapshot, err error) { - geminiCtx := contextutil.ContextWithComponent(ctx, "gemini") - - contextlog.LogDebugWithContext(geminiCtx, logging.GetSlogger(), "Visiting Gemini URL: %s", url) - - s, err = snapshot.SnapshotFromURL(url, true) - if err != nil { - contextlog.LogErrorWithContext(geminiCtx, logging.GetSlogger(), "Failed to create snapshot from URL: %v", err) - return nil, err - } - - defer func() { - if err == nil { - return - } - // GeminiError and HostError should - // be stored in the snapshot. 
- if commonErrors.IsHostError(err) { - contextlog.LogInfoWithContext(geminiCtx, logging.GetSlogger(), "Host error: %v", err) - s.Error = null.StringFrom(err.Error()) - err = nil - return - } else if IsGeminiError(err) { - contextlog.LogInfoWithContext(geminiCtx, logging.GetSlogger(), "Gemini error: %v", err) - s.Error = null.StringFrom(err.Error()) - s.Header = null.StringFrom(errors.Unwrap(err).(*GeminiError).Header) - s.ResponseCode = null.IntFrom(int64(errors.Unwrap(err).(*GeminiError).Code)) - err = nil - return - } - }() - - // Check if the context has been canceled - if err := ctx.Err(); err != nil { - return s, err - } - - data, err := ConnectAndGetDataWithContext(geminiCtx, s.URL.String()) - if err != nil { - return s, err - } - - // Check if the context has been canceled - if err := ctx.Err(); err != nil { - return s, err - } - - s, err = ProcessData(*s, data) - if err != nil { - return s, err - } - - if isGeminiCapsule(s) { - links := GetPageLinks(s.URL, s.GemText.String) - if len(links) > 0 { - s.Links = null.ValueFrom(links) - } - } - - contextlog.LogDebugWithContext(geminiCtx, logging.GetSlogger(), "Successfully visited URL: %s (Code: %d)", url, s.ResponseCode.ValueOrZero()) - return s, nil -} - -// ConnectAndGetDataWithContext is a context-aware version of ConnectAndGetData -// that returns the data from a GET request to a Gemini URL. It uses the context -// for cancellation, timeout, and logging. -func ConnectAndGetDataWithContext(ctx context.Context, url string) ([]byte, error) { - // Parse the URL - parsedURL, err := stdurl.Parse(url) - if err != nil { - return nil, xerrors.NewError(fmt.Errorf("error parsing URL: %w", err), 0, "", false) - } - - hostname := parsedURL.Hostname() - port := parsedURL.Port() - if port == "" { - port = "1965" - } - host := fmt.Sprintf("%s:%s", hostname, port) - - // Check if the context has been canceled before proceeding - if err := ctx.Err(); err != nil { - return nil, err - } - - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Connecting to %s", host) - - timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second - - // Establish the underlying TCP connection with context-based cancellation - dialer := &net.Dialer{ - Timeout: timeoutDuration, - } - - // Use DialContext to allow cancellation via context - conn, err := dialer.DialContext(ctx, "tcp", host) - if err != nil { - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to establish TCP connection: %v", err) - return nil, commonErrors.NewHostError(err) - } - - // Make sure we always close the connection - defer func() { - _ = conn.Close() - }() - - // Set read and write timeouts on the TCP connection - err = conn.SetReadDeadline(time.Now().Add(timeoutDuration)) - if err != nil { - return nil, commonErrors.NewHostError(err) - } - err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration)) - if err != nil { - return nil, commonErrors.NewHostError(err) - } - - // Check if the context has been canceled before proceeding with TLS handshake - if err := ctx.Err(); err != nil { - return nil, err - } - - // Perform the TLS handshake - tlsConfig := &tls.Config{ - InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure. 
- ServerName: parsedURL.Hostname(), // SNI says we should not include port in hostname - } - - tlsConn := tls.Client(conn, tlsConfig) - err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration)) - if err != nil { - return nil, commonErrors.NewHostError(err) - } - err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration)) - if err != nil { - return nil, commonErrors.NewHostError(err) - } - - // Check if the context is done before attempting handshake - if err := ctx.Err(); err != nil { - return nil, err - } - - // Perform TLS handshake with regular method - // (HandshakeContext is only available in Go 1.17+) - err = tlsConn.Handshake() - if err != nil { - contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "TLS handshake failed: %v", err) - return nil, commonErrors.NewHostError(err) - } - - // Check again if the context is done after handshake - if err := ctx.Err(); err != nil { - return nil, err - } - - // We read `buf`-sized chunks and add data to `data` - buf := make([]byte, 4096) - var data []byte - - // Check if the context has been canceled before sending request - if err := ctx.Err(); err != nil { - return nil, err - } - - // Send Gemini request to trigger server response - // Fix for stupid server bug: - // Some servers return 'Header: 53 No proxying to other hosts or ports!' - // when the port is 1965 and is still specified explicitly in the URL. - url2, _ := _url.ParseURL(url, "", true) - _, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort()))) - if err != nil { - contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to send request: %v", err) - return nil, commonErrors.NewHostError(err) - } - - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Request sent, reading response") - - // Read response bytes in len(buf) byte chunks - for { - // Check if the context has been canceled before each read - if err := ctx.Err(); err != nil { - return nil, err - } - - n, err := tlsConn.Read(buf) - if n > 0 { - data = append(data, buf[:n]...) - } - if len(data) > config.CONFIG.MaxResponseSize { - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Response too large (max: %d bytes)", config.CONFIG.MaxResponseSize) - return nil, commonErrors.NewHostError(fmt.Errorf("response too large")) - } - if err != nil { - if errors.Is(err, io.EOF) { - break - } - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Error reading data: %v", err) - return nil, commonErrors.NewHostError(err) - } - } - - contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Received %d bytes of data", len(data)) - return data, nil -} - -// ProcessDataWithContext is a context-aware version of ProcessData that processes -// the raw data from a Gemini response and populates the Snapshot. 
-func ProcessDataWithContext(ctx context.Context, s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) { - // Create a processing-specific context with the "process" component - processCtx := contextutil.ContextWithComponent(ctx, "process") - - contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Processing Gemini response data (%d bytes)", len(data)) - - header, body, err := getHeadersAndData(data) - if err != nil { - contextlog.LogErrorWithContext(processCtx, logging.GetSlogger(), "Failed to extract headers: %v", err) - return &s, err - } - - code, mimeType, lang := getMimeTypeAndLang(header) - contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Response code: %d, MimeType: %s, Lang: %s", code, mimeType, lang) - - if code != 0 { - s.ResponseCode = null.IntFrom(int64(code)) - } - if header != "" { - s.Header = null.StringFrom(header) - } - if mimeType != "" { - s.MimeType = null.StringFrom(mimeType) - } - if lang != "" { - s.Lang = null.StringFrom(lang) - } - - // If we've got a Gemini document, populate - // `GemText` field, otherwise raw data goes to `Data`. - if mimeType == "text/gemini" { - validBody, err := BytesToValidUTF8(body) - if err != nil { - contextlog.LogErrorWithContext(processCtx, logging.GetSlogger(), "UTF-8 validation failed: %v", err) - return nil, err - } - s.GemText = null.StringFrom(validBody) - contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Processed gemtext content (%d characters)", len(validBody)) - } else { - s.Data = null.ValueFrom(body) - contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Stored binary data (%d bytes)", len(body)) - } - - return &s, nil -} diff --git a/gemini/network_test.go b/gemini/network_test.go index 94d0a9b..7ea8c1e 100644 --- a/gemini/network_test.go +++ b/gemini/network_test.go @@ -135,17 +135,7 @@ func TestProcessData(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := snapshot.Snapshot{} - result, err := ProcessData(s, test.inputData) - - if test.expectedError && err == nil { - t.Errorf("Expected error, got nil") - return - } - - if !test.expectedError && err != nil { - t.Errorf("Unexpected error: %v", err) - return - } + result := UpdateSnapshotWithData(s, test.inputData) if test.expectedError { return diff --git a/gemini/processing.go b/gemini/processing.go index 771c841..d79dfbd 100644 --- a/gemini/processing.go +++ b/gemini/processing.go @@ -7,6 +7,7 @@ import ( "io" "unicode/utf8" + "gemini-grc/config" "git.antanst.com/antanst/xerrors" "golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/japanese" @@ -23,11 +24,16 @@ func BytesToValidUTF8(input []byte) (string, error) { if len(input) == 0 { return "", nil } - const maxSize = 10 * 1024 * 1024 // 10MB - if len(input) > maxSize { - return "", xerrors.NewError(fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize), 0, "", false) + + maxSize := config.CONFIG.MaxResponseSize + if maxSize == 0 { + maxSize = 1024 * 1024 // Default 1MB for tests } - // remove NULL byte 0x00 (ReplaceAll accepts slices) + if len(input) > maxSize { + return "", xerrors.NewError(fmt.Errorf("BytesToValidUTF8: %w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize), 0, "", false) + } + + // Always remove NULL bytes first (before UTF-8 validity check) inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{}) if utf8.Valid(inputNoNull) { return string(inputNoNull), nil @@ -42,6 +48,8 @@ func BytesToValidUTF8(input []byte) (string, error) { 
japanese.EUCJP.NewDecoder(), // Japanese korean.EUCKR.NewDecoder(), // Korean } + + // Still invalid Unicode. Try some encodings to convert to. // First successful conversion wins. var lastErr error for _, encoding := range encodings { @@ -56,5 +64,5 @@ func BytesToValidUTF8(input []byte) (string, error) { } } - return "", xerrors.NewError(fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr), 0, "", false) + return "", xerrors.NewError(fmt.Errorf("BytesToValidUTF8: %w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr), 0, "", false) } diff --git a/go.mod b/go.mod index b93a867..a818120 100644 --- a/go.mod +++ b/go.mod @@ -3,8 +3,9 @@ module gemini-grc go 1.24.3 require ( + git.antanst.com/antanst/logging v0.0.1 git.antanst.com/antanst/uid v0.0.1 - git.antanst.com/antanst/xerrors v0.0.1 + git.antanst.com/antanst/xerrors v0.0.2 github.com/guregu/null/v5 v5.0.0 github.com/jackc/pgx/v5 v5.7.2 github.com/jmoiron/sqlx v1.4.0 @@ -29,3 +30,5 @@ require ( replace git.antanst.com/antanst/xerrors => ../xerrors replace git.antanst.com/antanst/uid => ../uid + +replace git.antanst.com/antanst/logging => ../logging diff --git a/gopher/network.go b/gopher/network.go index 3268656..d050cbd 100644 --- a/gopher/network.go +++ b/gopher/network.go @@ -12,7 +12,7 @@ import ( commonErrors "gemini-grc/common/errors" "gemini-grc/config" - "gemini-grc/logging" + "git.antanst.com/antanst/logging" "git.antanst.com/antanst/xerrors" ) diff --git a/gopher/network_context.go b/gopher/network_context.go index 84c313c..110dc55 100644 --- a/gopher/network_context.go +++ b/gopher/network_context.go @@ -18,7 +18,7 @@ import ( _url "gemini-grc/common/url" "gemini-grc/config" "gemini-grc/contextutil" - "gemini-grc/logging" + "git.antanst.com/antanst/logging" "git.antanst.com/antanst/xerrors" "github.com/guregu/null/v5" ) @@ -30,8 +30,6 @@ func VisitWithContext(ctx context.Context, url string) (*snapshot.Snapshot, erro // Create a gopher-specific context with the "gopher" component gopherCtx := contextutil.ContextWithComponent(ctx, "gopher") - contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Visiting Gopher URL: %s", url) - if !config.CONFIG.GopherEnable { contextlog.LogDebugWithContext(gopherCtx, logging.GetSlogger(), "Gopher protocol is disabled") return nil, nil diff --git a/hostPool/hostPool.go b/hostPool/hostPool.go index cc76a78..b41c8ed 100644 --- a/hostPool/hostPool.go +++ b/hostPool/hostPool.go @@ -8,7 +8,8 @@ import ( "gemini-grc/common/contextlog" "gemini-grc/contextutil" - "gemini-grc/logging" + "git.antanst.com/antanst/logging" + "git.antanst.com/antanst/xerrors" ) var hostPool = HostPool{hostnames: make(map[string]struct{})} @@ -20,18 +21,13 @@ type HostPool struct { // RemoveHostFromPool removes a host from the pool with context awareness func RemoveHostFromPool(ctx context.Context, key string) { - // Create a hostPool-specific context hostCtx := contextutil.ContextWithComponent(ctx, "hostPool") - - contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Removing host %s from pool", key) - hostPool.lock.Lock() delete(hostPool.hostnames, key) hostPool.lock.Unlock() // Add some jitter time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond) - contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Host %s removed from pool", key) } @@ -41,18 +37,18 @@ func AddHostToHostPool(ctx context.Context, key string) error { // Create a hostPool-specific context hostCtx := contextutil.ContextWithComponent(ctx, "hostPool") - 
contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Adding host %s to pool", key) - // Use a ticker to periodically check if we can add the host ticker := time.NewTicker(500 * time.Millisecond) defer ticker.Stop() + // We continuously poll the pool, + // and if the host isn't already + // there, we add it. for { // Check if context is done before attempting to acquire lock select { case <-ctx.Done(): - contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Context canceled while waiting to add host %s", key) - return ctx.Err() + return xerrors.NewSimpleError(ctx.Err()) default: // Continue with attempt to add host } @@ -67,15 +63,13 @@ func AddHostToHostPool(ctx context.Context, key string) error { } hostPool.lock.Unlock() - contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Host %s busy, waiting...", key) - // Wait for next tick or context cancellation select { case <-ticker.C: // Try again on next tick case <-ctx.Done(): contextlog.LogDebugWithContext(hostCtx, logging.GetSlogger(), "Context canceled while waiting for host %s", key) - return ctx.Err() + return xerrors.NewSimpleError(ctx.Err()) } } } diff --git a/logging/logging.go b/logging/logging.go deleted file mode 100644 index d932862..0000000 --- a/logging/logging.go +++ /dev/null @@ -1,188 +0,0 @@ -// Package logging provides a simple, structured logging interface using slog. -// It offers colored output for better readability in terminal environments. -package logging - -import ( - "context" - "fmt" - "io" - "log/slog" - "os" - "strings" - "sync" -) - -// Global logger instance. -var slogLogger *slog.Logger - -// Current log level - used to filter logs. -var currentLogLevel = slog.LevelInfo - -// ANSI color codes for terminal output. -const ( - colorReset = "\033[0m" - levelDebug = "\033[37m" // Gray - levelInfo = "\033[32m" // Green - levelWarn = "\033[33m" // Yellow - levelError = "\033[31m" // Red -) - -// Standard helper functions for logging -func LogDebug(format string, args ...interface{}) { - if slogLogger != nil { - slogLogger.Debug(fmt.Sprintf(format, args...)) - } -} - -func LogInfo(format string, args ...interface{}) { - if slogLogger != nil { - slogLogger.Info(fmt.Sprintf(format, args...)) - } -} - -func LogWarn(format string, args ...interface{}) { - if slogLogger != nil { - slogLogger.Warn(fmt.Sprintf(format, args...)) - } -} - -func LogError(format string, args ...interface{}) { - if slogLogger != nil { - msg := fmt.Sprintf(format, args...) - slogLogger.Error(msg, slog.String("error", msg)) - } -} - -// InitSlogger initializes the slog logger with custom handler. -func InitSlogger(level slog.Level) { - // Set the global log level - currentLogLevel = level - - // Create the handler with color support - baseHandler := NewColorHandler(os.Stderr) - - // Create and set the new logger - slogLogger = slog.New(baseHandler) - - // Set as default logger - slog.SetDefault(slogLogger) - - // Print a startup message to verify logging is working - slogLogger.Info("Slog initialized", "level", level.String()) -} - -// GetSlogger returns the current global slog logger instance. 
-// Can be used by other packages -func GetSlogger() *slog.Logger { - if slogLogger == nil { - return slog.Default() - } - return slogLogger -} - -// ColorHandler formats logs with colors for better terminal readability -type ColorHandler struct { - out io.Writer - mu *sync.Mutex - attrs []slog.Attr // Store attributes for this handler -} - -// NewColorHandler creates a new handler that writes colored logs to the provided writer -func NewColorHandler(w io.Writer) *ColorHandler { - if w == nil { - w = os.Stderr - } - return &ColorHandler{ - out: w, - mu: &sync.Mutex{}, - attrs: make([]slog.Attr, 0), - } -} - -// Enabled checks if the given log level is enabled -func (h *ColorHandler) Enabled(_ context.Context, level slog.Level) bool { - return level >= currentLogLevel -} - -// Handle processes a log record, formatting it with colors -func (h *ColorHandler) Handle(ctx context.Context, r slog.Record) error { - h.mu.Lock() - defer h.mu.Unlock() - - // Format time - timeStr := fmt.Sprintf("[%s]", r.Time.Format("2006-01-02 15:04:05")) - - // Format level - var levelStr string - switch r.Level { - case slog.LevelDebug: - levelStr = fmt.Sprintf("%sDEBUG%s", levelDebug, colorReset) - case slog.LevelInfo: - levelStr = fmt.Sprintf("%sINFO%s", levelInfo, colorReset) - case slog.LevelWarn: - levelStr = fmt.Sprintf("%sWARN%s", levelWarn, colorReset) - case slog.LevelError: - levelStr = fmt.Sprintf("%sERROR%s", levelError, colorReset) - default: - levelStr = r.Level.String() - } - - // Build prefix - prefix := fmt.Sprintf("%s %s ", timeStr, levelStr) - - // Format message - we'll collect any special fields separately - attrMap := make(map[string]string) - - // First collect attributes from the handler itself - for _, attr := range h.attrs { - attrMap[attr.Key] = attr.Value.String() - } - - // Then extract from record attributes, which might override handler attributes - r.Attrs(func(a slog.Attr) bool { - attrMap[a.Key] = a.Value.String() - return true - }) - - // Format message with attributes on the same line - msg := fmt.Sprintf("%s%s", prefix, r.Message) - - // Add attributes to the same line if present - if len(attrMap) > 0 { - // Add a space after the message - msg += " " - - // Build attribute string - attrs := make([]string, 0, len(attrMap)) - for k, v := range attrMap { - attrs = append(attrs, fmt.Sprintf("%s=%s", k, v)) - } - - // Join all attributes with spaces - msg += strings.Join(attrs, " ") - } - - // Add newline at the end - msg += "\n" - - // Write to output - _, err := io.WriteString(h.out, msg) - return err -} - -// WithGroup returns a new Handler that inherits from this Handler -func (h *ColorHandler) WithGroup(name string) slog.Handler { - return h // For simplicity, we don't support groups -} - -// WithAttrs returns a new Handler whose attributes include both -// the receiver's attributes and the arguments -func (h *ColorHandler) WithAttrs(attrs []slog.Attr) slog.Handler { - // Create a new handler with the same output but additional attributes - newHandler := &ColorHandler{ - out: h.out, - mu: h.mu, - attrs: append(append([]slog.Attr{}, h.attrs...), attrs...), - } - return newHandler -} diff --git a/misc/sql/README.md b/misc/sql/README.md new file mode 100644 index 0000000..db2705b --- /dev/null +++ b/misc/sql/README.md @@ -0,0 +1,28 @@ +# SQL Queries for Snapshot Analysis + +This directory contains SQL queries to analyze snapshot data in the gemini-grc database. 
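
Besides psql, the single-statement reports can be run from Go with the crawler's own sqlx/pgx stack. A minimal, hypothetical runner is sketched below; the `DATABASE_URL` environment variable and the query path are placeholders, and multi-statement scripts such as crawl_top_level.sql are better left to psql:

```go
package main

import (
	"fmt"
	"os"

	_ "github.com/jackc/pgx/v5/stdlib" // pgx driver, as used by the crawler
	"github.com/jmoiron/sqlx"
)

func main() {
	db, err := sqlx.Connect("pgx", os.Getenv("DATABASE_URL"))
	if err != nil {
		panic(err)
	}
	defer db.Close()

	query, err := os.ReadFile("misc/sql/snapshots_per_url.sql")
	if err != nil {
		panic(err)
	}

	rows, err := db.Queryx(string(query))
	if err != nil {
		panic(err)
	}
	defer rows.Close()

	for rows.Next() {
		cols, err := rows.SliceScan() // generic scan: each report has different columns
		if err != nil {
			panic(err)
		}
		fmt.Println(cols...)
	}
}
```
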
+ +## Usage + +You can run these queries directly from psql using the `\i` directive: + +``` +\i misc/sql/snapshots_per_url.sql +``` + +## Available Queries + +- **snapshots_per_url.sql** - Basic count of snapshots per URL +- **snapshots_date_range.sql** - Shows snapshot count with date range information for each URL +- **host_snapshot_stats.sql** - Groups snapshots by hosts and shows URLs with multiple snapshots +- **content_changes.sql** - Finds URLs with the most content changes between consecutive snapshots +- **snapshot_distribution.sql** - Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots) +- **recent_snapshot_activity.sql** - Shows URLs with most snapshots in the last 7 days +- **storage_efficiency.sql** - Shows potential storage savings from deduplication +- **snapshots_by_timeframe.sql** - Shows snapshot count by timeframe (day, week, month) + +## Notes + +- These queries are designed to work with PostgreSQL and the gemini-grc database schema +- Some queries may be resource-intensive on large databases +- The results can help optimize storage and understand the effectiveness of the versioned snapshot feature \ No newline at end of file diff --git a/misc/sql/content_changes.sql b/misc/sql/content_changes.sql new file mode 100644 index 0000000..b2d5104 --- /dev/null +++ b/misc/sql/content_changes.sql @@ -0,0 +1,26 @@ +-- File: content_changes.sql +-- Finds URLs with the most content changes between consecutive snapshots +-- Usage: \i misc/sql/content_changes.sql + +WITH snapshot_changes AS ( + SELECT + s1.url, + s1.timestamp as prev_timestamp, + s2.timestamp as next_timestamp, + s1.gemtext IS DISTINCT FROM s2.gemtext as gemtext_changed, + s1.data IS DISTINCT FROM s2.data as data_changed + FROM snapshots s1 + JOIN snapshots s2 ON s1.url = s2.url AND s1.timestamp < s2.timestamp + WHERE NOT EXISTS ( + SELECT 1 FROM snapshots s3 + WHERE s3.url = s1.url AND s1.timestamp < s3.timestamp AND s3.timestamp < s2.timestamp + ) +) +SELECT + url, + COUNT(*) + 1 as snapshot_count, + SUM(CASE WHEN gemtext_changed OR data_changed THEN 1 ELSE 0 END) as content_changes +FROM snapshot_changes +GROUP BY url +HAVING COUNT(*) + 1 > 1 +ORDER BY content_changes DESC, snapshot_count DESC; \ No newline at end of file diff --git a/misc/sql/crawl_top_level.sql b/misc/sql/crawl_top_level.sql new file mode 100644 index 0000000..29c13d4 --- /dev/null +++ b/misc/sql/crawl_top_level.sql @@ -0,0 +1,30 @@ +BEGIN; + +WITH matching_urls AS ( + SELECT url, host + FROM snapshots + WHERE url ~ '^gemini://[^/]+/$' + AND timestamp < (NOW() - INTERVAL '1 week') + ORDER BY random() + LIMIT 500 +) +INSERT INTO urls (url, host) +SELECT url, host +FROM matching_urls +ON CONFLICT DO NOTHING; + +-- WITH matching_urls AS ( +-- SELECT url, host +-- FROM snapshots +-- WHERE url ~ '^gemini://[^/]+/$' +-- AND timestamp < (NOW() - INTERVAL '1 week') +-- ORDER BY random() +-- LIMIT 500 +-- ) +-- DELETE FROM snapshots +-- WHERE url IN ( +-- SELECT url +-- FROM matching_urls +-- ); + +COMMIT; diff --git a/misc/sql/host_snapshot_stats.sql b/misc/sql/host_snapshot_stats.sql new file mode 100644 index 0000000..e459122 --- /dev/null +++ b/misc/sql/host_snapshot_stats.sql @@ -0,0 +1,20 @@ +-- File: host_snapshot_stats.sql +-- Groups snapshots by hosts and shows URLs with multiple snapshots +-- Usage: \i misc/sql/host_snapshot_stats.sql + +SELECT + host, + COUNT(DISTINCT url) as unique_urls, + SUM(CASE WHEN url_count > 1 THEN 1 ELSE 0 END) as urls_with_multiple_snapshots, + SUM(snapshot_count) as 
diff --git a/misc/sql/host_snapshot_stats.sql b/misc/sql/host_snapshot_stats.sql
new file mode 100644
index 0000000..e459122
--- /dev/null
+++ b/misc/sql/host_snapshot_stats.sql
@@ -0,0 +1,19 @@
+-- File: host_snapshot_stats.sql
+-- Groups snapshots by host and shows URLs with multiple snapshots
+-- Usage: \i misc/sql/host_snapshot_stats.sql
+
+SELECT
+    host,
+    COUNT(DISTINCT url) as unique_urls,
+    SUM(CASE WHEN snapshot_count > 1 THEN 1 ELSE 0 END) as urls_with_multiple_snapshots,
+    SUM(snapshot_count) as total_snapshots
+FROM (
+    SELECT
+        host,
+        url,
+        COUNT(*) as snapshot_count
+    FROM snapshots
+    GROUP BY host, url
+) subquery
+GROUP BY host
+ORDER BY total_snapshots DESC;
\ No newline at end of file
diff --git a/misc/sql/mark_urls_processed_false.sql b/misc/sql/mark_urls_processed_false.sql
new file mode 100644
index 0000000..aa06619
--- /dev/null
+++ b/misc/sql/mark_urls_processed_false.sql
@@ -0,0 +1,4 @@
+-- File: mark_urls_processed_false.sql
+-- Clears the being_processed flag on all URLs
+-- Usage: \i misc/sql/mark_urls_processed_false.sql
+UPDATE urls SET being_processed = false WHERE being_processed IS true;
diff --git a/misc/sql/recent_snapshot_activity.sql b/misc/sql/recent_snapshot_activity.sql
new file mode 100644
index 0000000..a3468e4
--- /dev/null
+++ b/misc/sql/recent_snapshot_activity.sql
@@ -0,0 +1,13 @@
+-- File: recent_snapshot_activity.sql
+-- Shows URLs with the most snapshots in the last 7 days
+-- Usage: \i misc/sql/recent_snapshot_activity.sql
+
+SELECT
+    url,
+    COUNT(*) as snapshot_count
+FROM snapshots
+WHERE timestamp > NOW() - INTERVAL '7 days'
+GROUP BY url
+HAVING COUNT(*) > 1
+ORDER BY snapshot_count DESC
+LIMIT 20;
\ No newline at end of file
diff --git a/misc/sql/snapshot_distribution.sql b/misc/sql/snapshot_distribution.sql
new file mode 100644
index 0000000..74f11d8
--- /dev/null
+++ b/misc/sql/snapshot_distribution.sql
@@ -0,0 +1,16 @@
+-- File: snapshot_distribution.sql
+-- Shows the distribution of snapshots per URL (how many URLs have 1, 2, 3, etc. snapshots)
+-- Usage: \i misc/sql/snapshot_distribution.sql
+
+WITH counts AS (
+    SELECT url, COUNT(*) as snapshot_count
+    FROM snapshots
+    GROUP BY url
+)
+SELECT
+    snapshot_count,
+    COUNT(*) as url_count,
+    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
+FROM counts
+GROUP BY snapshot_count
+ORDER BY snapshot_count;
\ No newline at end of file
diff --git a/misc/sql/snapshots_by_timeframe.sql b/misc/sql/snapshots_by_timeframe.sql
new file mode 100644
index 0000000..4dd0148
--- /dev/null
+++ b/misc/sql/snapshots_by_timeframe.sql
@@ -0,0 +1,37 @@
+-- File: snapshots_by_timeframe.sql
+-- Shows snapshot count by timeframe (day, week, month)
+-- Usage: \i misc/sql/snapshots_by_timeframe.sql
+
+WITH daily_snapshots AS (
+    SELECT
+        date_trunc('day', timestamp) as day,
+        COUNT(*) as snapshot_count,
+        COUNT(DISTINCT url) as unique_urls
+    FROM snapshots
+    GROUP BY day
+    ORDER BY day
+),
+weekly_snapshots AS (
+    SELECT
+        date_trunc('week', timestamp) as week,
+        COUNT(*) as snapshot_count,
+        COUNT(DISTINCT url) as unique_urls
+    FROM snapshots
+    GROUP BY week
+    ORDER BY week
+),
+monthly_snapshots AS (
+    SELECT
+        date_trunc('month', timestamp) as month,
+        COUNT(*) as snapshot_count,
+        COUNT(DISTINCT url) as unique_urls
+    FROM snapshots
+    GROUP BY month
+    ORDER BY month
+)
+SELECT 'Daily' as timeframe, * FROM daily_snapshots
+UNION ALL
+SELECT 'Weekly' as timeframe, * FROM weekly_snapshots
+UNION ALL
+SELECT 'Monthly' as timeframe, * FROM monthly_snapshots
+ORDER BY timeframe, day;
\ No newline at end of file
diff --git a/misc/sql/snapshots_date_range.sql b/misc/sql/snapshots_date_range.sql
new file mode 100644
index 0000000..53395c3
--- /dev/null
+++ b/misc/sql/snapshots_date_range.sql
@@ -0,0 +1,14 @@
+-- File: snapshots_date_range.sql
+-- Shows snapshot count with date range information for each URL
+-- Usage: \i misc/sql/snapshots_date_range.sql
+
+SELECT
+    url,
+    COUNT(*) as snapshot_count,
+    MIN(timestamp) as first_snapshot,
+    MAX(timestamp) as last_snapshot,
+    MAX(timestamp) - MIN(timestamp) as time_span
+FROM snapshots
+GROUP BY url
+HAVING COUNT(*) > 1
+ORDER BY snapshot_count DESC;
\ No newline at end of file
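snapshots_date_range.sql reports the raw time span per URL; the average gap between consecutive snapshots follows from the same aggregates. A hypothetical variant, not included in the patch:

```
SELECT
    url,
    COUNT(*) as snapshot_count,
    (MAX(timestamp) - MIN(timestamp)) / (COUNT(*) - 1) as avg_interval
FROM snapshots
GROUP BY url
HAVING COUNT(*) > 1
ORDER BY avg_interval;
```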
diff --git a/misc/sql/snapshots_per_url.sql b/misc/sql/snapshots_per_url.sql
new file mode 100644
index 0000000..a47534d
--- /dev/null
+++ b/misc/sql/snapshots_per_url.sql
@@ -0,0 +1,8 @@
+-- File: snapshots_per_url.sql
+-- Basic count of snapshots per URL
+-- Usage: \i misc/sql/snapshots_per_url.sql
+
+SELECT url, COUNT(*) as snapshot_count
+FROM snapshots
+GROUP BY url
+ORDER BY snapshot_count DESC;
\ No newline at end of file
diff --git a/misc/sql/storage_efficiency.sql b/misc/sql/storage_efficiency.sql
new file mode 100644
index 0000000..8da7ab9
--- /dev/null
+++ b/misc/sql/storage_efficiency.sql
@@ -0,0 +1,20 @@
+-- File: storage_efficiency.sql
+-- Shows potential storage savings from deduplication
+-- Usage: \i misc/sql/storage_efficiency.sql
+
+WITH duplicate_stats AS (
+    SELECT
+        url,
+        COUNT(*) as snapshot_count,
+        COUNT(DISTINCT gemtext) as unique_gemtexts,
+        COUNT(DISTINCT data) as unique_datas
+    FROM snapshots
+    GROUP BY url
+    HAVING COUNT(*) > 1
+)
+SELECT
+    SUM(snapshot_count) as total_snapshots,
+    SUM(unique_gemtexts + unique_datas) as unique_contents,
+    SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas) as duplicate_content_count,
+    ROUND((SUM(snapshot_count) - SUM(unique_gemtexts + unique_datas)) * 100.0 / SUM(snapshot_count), 2) as duplicate_percentage
+FROM duplicate_stats;
\ No newline at end of file
diff --git a/robotsMatch/robots.go b/robotsMatch/robots.go
index 5bf6663..8dea331 100644
--- a/robotsMatch/robots.go
+++ b/robotsMatch/robots.go
@@ -7,7 +7,7 @@ import (
 
 	"gemini-grc/common/contextlog"
 	"gemini-grc/contextutil"
-	"gemini-grc/logging"
+	"git.antanst.com/antanst/logging"
 )
 
 // ParseRobotsTxt takes robots.txt content and a host, and
diff --git a/robotsMatch/robotsMatch.go b/robotsMatch/robotsMatch.go
index ed3ea07..ff97217 100644
--- a/robotsMatch/robotsMatch.go
+++ b/robotsMatch/robotsMatch.go
@@ -10,9 +10,10 @@ import (
 	"gemini-grc/common/contextlog"
 	"gemini-grc/common/snapshot"
 	geminiUrl "gemini-grc/common/url"
+	"gemini-grc/config"
 	"gemini-grc/contextutil"
 	"gemini-grc/gemini"
-	"gemini-grc/logging"
+	"git.antanst.com/antanst/logging"
 )
 
 // RobotsCache is a map of blocked URLs
@@ -38,7 +39,7 @@ func populateRobotsCache(ctx context.Context, key string) (entries []string, _er
 	contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "Fetching robots.txt from %s", url)
 
 	// Use the context-aware version to honor timeout and cancellation
-	robotsContent, err := gemini.ConnectAndGetDataWithContext(cacheCtx, url)
+	robotsContent, err := gemini.ConnectAndGetData(cacheCtx, url)
 	if err != nil {
 		// Check for context timeout or cancellation specifically
 		if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
@@ -59,12 +60,7 @@ func populateRobotsCache(ctx context.Context, key string) (entries []string, _er
 		return []string{}, nil
 	}
 
-	// TODO: Update gemini.ProcessData to accept context
-	s, err = gemini.ProcessData(*s, robotsContent)
-	if err != nil {
-		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "robots.txt error: %s", err)
-		return []string{}, nil
-	}
+	s = gemini.UpdateSnapshotWithData(*s, robotsContent)
 
 	if s.ResponseCode.ValueOrZero() != 20 {
 		contextlog.LogDebugWithContext(cacheCtx, logging.GetSlogger(), "robots.txt error code %d, ignoring", s.ResponseCode.ValueOrZero())
@@ -91,14 +87,18 @@ func populateRobotsCache(ctx context.Context, key string) (entries []string, _er
 	return []string{}, nil
 }
 
 // RobotMatch checks if the snapshot URL matches
-// a robots.txt allow rule.
+// a robots.txt disallow rule.
-func RobotMatch(ctx context.Context, u string) (bool, error) {
+func RobotMatch(ctx context.Context, u string) bool {
 	// Create a context for robots operations
 	robotsCtx := contextutil.ContextWithComponent(ctx, "robotsMatch")
 
+	// TODO: robots.txt handling is not implemented for Gopher yet; skip matching.
+	if config.CONFIG.GopherEnable {
+		return false
+	}
+
 	url, err := geminiUrl.ParseURL(u, "", true)
 	if err != nil {
-		contextlog.LogErrorWithContext(robotsCtx, logging.GetSlogger(), "Failed to parse URL: %v", err)
-		return false, err
+		return false
 	}
 
 	key := strings.ToLower(fmt.Sprintf("%s:%d", url.Hostname, url.Port))
@@ -112,16 +112,7 @@ func RobotMatch(ctx context.Context, u string) (bool, error) {
 		var fetchErr error
 		disallowedURLs, fetchErr = populateRobotsCache(ctx, key)
 		if fetchErr != nil {
-			contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Error populating robots.txt cache for %s: %v", key, fetchErr)
-
-			// Handle context timeouts by propagating the error
-			if errors.Is(fetchErr, context.DeadlineExceeded) || errors.Is(fetchErr, context.Canceled) {
-				contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Timeout or cancellation while checking robots.txt")
-				return false, fetchErr
-			}
-
-			// For other errors, assume we can proceed without robots.txt
-			return false, nil
+			return false
 		}
 		if len(disallowedURLs) > 0 {
 			contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Added to robots.txt cache: %v => %v", key, disallowedURLs)
@@ -137,7 +128,7 @@ func RobotMatch(ctx context.Context, u string) (bool, error) {
 		}
 		contextlog.LogDebugWithContext(robotsCtx, logging.GetSlogger(), "Found %d disallowed paths in robots.txt cache for %s", len(disallowedURLs), key)
 	}
-	return isURLblocked(ctx, disallowedURLs, url.Full), nil
+	return isURLblocked(ctx, disallowedURLs, url.Full)
 }
 
 // Initialize initializes the robots.txt match package
@@ -157,12 +148,9 @@ func isURLblocked(ctx context.Context, disallowedURLs []string, input string) bo
 	blockCtx := contextutil.ContextWithComponent(ctx, "robotsMatch.isURLblocked")
 	inputLower := strings.ToLower(input)
 
-	contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "Checking URL against robots.txt rules: %s", input)
 	for _, url := range disallowedURLs {
 		urlLower := strings.ToLower(url)
-		contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "Comparing against rule: %s (lower: %s vs %s)", url, inputLower, urlLower)
-
 		if strings.HasPrefix(inputLower, urlLower) {
 			contextlog.LogDebugWithContext(blockCtx, logging.GetSlogger(), "MATCH! robots.txt rule: %s blocks URL: %s", url, input)
 			return true
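With the error return dropped, RobotMatch now reports only whether a URL is blocked; any failure to fetch or parse robots.txt is treated as "not blocked". A sketch of an updated call site (illustrative caller, not part of this patch):

```
// RobotMatch returns true only when a disallow rule matches;
// robots.txt fetch/parse failures simply yield false.
if robotsMatch.RobotMatch(ctx, snapshotURL) {
	return nil // skip URLs disallowed by robots.txt
}
```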
diff --git a/robotsMatch/robotsMatch_test.go b/robotsMatch/robotsMatch_test.go
index 3a965e5..a3ea90b 100644
--- a/robotsMatch/robotsMatch_test.go
+++ b/robotsMatch/robotsMatch_test.go
@@ -2,7 +2,6 @@ package robotsMatch
 
 import (
 	"context"
-	"errors"
 	"sync"
 	"testing"
 
@@ -32,15 +31,7 @@ func TestRobotMatch_EmptyCache(t *testing.T) {
 
-	// For empty cache or DNS errors, RobotMatch should return false (allow the URL) without an error
+	// For empty cache or DNS errors, RobotMatch should return false (allow the URL)
 	ctx := context.Background()
-	blocked, err := RobotMatch(ctx, "gemini://nonexistent.example.com/")
-	// We expect no error for non-existent host because we changed our error handling
-	// to be more tolerant of DNS/connectivity issues
-	if err != nil {
-		// The only errors we should get are context-related (timeout, cancellation)
-		if !errors.Is(err, context.DeadlineExceeded) && !errors.Is(err, context.Canceled) {
-			t.Errorf("Expected nil error for non-existent host, got: %v", err)
-		}
-	}
+	blocked := RobotMatch(ctx, "gemini://nonexistent.example.com/")
 
 	// The URL should be allowed (not blocked) when robots.txt can't be fetched
 	if blocked {
diff --git a/seed_urls.txt b/seed_urls.txt
new file mode 100644
index 0000000..7f649f1
--- /dev/null
+++ b/seed_urls.txt
@@ -0,0 +1,10 @@
+gemini://geminiprotocol.net/
+gemini://warmedal.se/~antenna/
+gemini://skyjake.fi/~Cosmos/
+gemini://gemini.circumlunar.space/capcom/
+gemini://auragem.letz.dev/
+gemini://gemplex.space/
+gemini://kennedy.gemi.dev/
+gemini://tlgs.one/
+gemini://yesterday.gemlog.org/
+gemini://gemini.cyberbot.space/feed.gmi
diff --git a/test.txt b/test.txt
new file mode 100644
index 0000000..5071807
--- /dev/null
+++ b/test.txt
@@ -0,0 +1,20 @@
+# Test redirect full url (31 redirect):
+gemini://gemini.circumlunar.space
+
+# Test blacklist:
+gemi.dev
+
+# Test robots disallow:
+gemini://tlgs.one/search?aa
+
+# Test TLS cert required:
+gemini://astrobotany.mozz.us/app/plant
+
+# body with null byte
+gemini://kennedy.gemi.dev/archive/cached?url=gemini://spam.works/mirrors/textfiles/fun/consult.how&t=638427244900000000&raw=False
+
+# has invalid url
+gemini://tlgs.one/known-hosts
+
+# Needs SNI TLS info (our bug)
+gemini://hanzbrix.pollux.casa/gemlog/20241002.gmi
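With seed_urls.txt in place, a quick end-to-end check is to confirm the seed URLs show up in the crawl queue after startup. A hypothetical spot check, reusing the urls table targeted by crawl_top_level.sql:

```
SELECT url, being_processed
FROM urls
WHERE url = 'gemini://geminiprotocol.net/';
```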