Fix snapshot overwrite logic to preserve successful responses

- Prevent overwriting snapshots that have valid response codes
- Ensure URL is removed from queue when snapshot update is skipped
- Add last_crawled timestamp tracking for better crawl scheduling
- Remove SkipIdenticalContent flag, simplify content deduplication logic
- Update database schema with last_crawled column and indexes
commit 2357135d5a
parent 98d3ed6707
Author: antanst
Date: 2025-06-18 11:23:56 +03:00
8 changed files with 284 additions and 242 deletions
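The crawler-side decision this commit describes (keep a snapshot with a valid response code, skip the overwrite, still dequeue the URL) is in changed files that are not expanded below. A minimal sketch of that flow, assuming hypothetical helpers handleCrawlResult, isValidResponseCode, and removeFromQueue; only GetLatestSnapshot, OverwriteSnapshot, and UpdateLastCrawled are real DbService methods from the diff:

// Sketch only; imports and the project's internal package paths are omitted.
// Assumes GetLatestSnapshot returns (nil, nil) when no snapshot exists yet.
func handleCrawlResult(ctx context.Context, tx *sqlx.Tx, db DbService, s *snapshot.Snapshot) error {
	latest, err := db.GetLatestSnapshot(ctx, tx, s.URL.String())
	if err != nil {
		return err
	}
	// Keep a stored snapshot that has a valid (2x) response code instead of
	// overwriting it with a failed fetch; just record the crawl time.
	if latest != nil && isValidResponseCode(latest.ResponseCode.ValueOrZero()) &&
		!isValidResponseCode(s.ResponseCode.ValueOrZero()) {
		if err := db.UpdateLastCrawled(ctx, tx, s.URL.String()); err != nil {
			return err
		}
	} else if err := db.OverwriteSnapshot(ctx, tx, s); err != nil {
		return err
	}
	// The URL leaves the queue whether or not the snapshot was overwritten.
	return removeFromQueue(ctx, tx, s.URL.String())
}

// isValidResponseCode reports whether a Gemini status code is in the 2x success range.
func isValidResponseCode(code int64) bool { return code >= 20 && code <= 29 }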

View File

@@ -41,7 +41,7 @@ type DbService interface {
// Snapshot methods
SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
- RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error
+ UpdateLastCrawled(ctx context.Context, tx *sqlx.Tx, url string) error
GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url string) (*snapshot.Snapshot, error)
GetSnapshotAtTimestamp(ctx context.Context, tx *sqlx.Tx, url string, timestamp time.Time) (*snapshot.Snapshot, error)
GetAllSnapshotsForURL(ctx context.Context, tx *sqlx.Tx, url string) ([]*snapshot.Snapshot, error)
@@ -374,21 +374,10 @@ func (d *DbServiceImpl) SaveSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapsh
return err
}
- // Check if we should skip storing identical content
- if config.CONFIG.SkipIdenticalContent {
- // Use the context-aware version to check for identical content
- identical, err := d.IsContentIdentical(ctx, tx, s)
- if err != nil {
- return err
- } else if identical {
- contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Skipping URL with identical content to existing snapshot: %s", s.URL.String())
- return nil
- }
- }
- // Always ensure we have a current timestamp
- s.Timestamp = null.TimeFrom(time.Now())
- // last_crawled will be set automatically by database DEFAULT
+ // Always ensure we have current timestamps
+ currentTime := time.Now()
+ s.Timestamp = null.TimeFrom(currentTime)
+ s.LastCrawled = null.TimeFrom(currentTime)
// For PostgreSQL, use the global sqlx.NamedQueryContext function
// The SQL_INSERT_SNAPSHOT already has a RETURNING id clause
@@ -423,26 +412,20 @@ func (d *DbServiceImpl) OverwriteSnapshot(ctx context.Context, tx *sqlx.Tx, s *s
return d.SaveSnapshot(ctx, tx, s)
}
- // RecordCrawlAttempt records a crawl attempt without saving full content (when content is identical)
- func (d *DbServiceImpl) RecordCrawlAttempt(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
+ // UpdateLastCrawled updates the last_crawled timestamp for the most recent snapshot of a URL
+ func (d *DbServiceImpl) UpdateLastCrawled(ctx context.Context, tx *sqlx.Tx, url string) error {
dbCtx := contextutil.ContextWithComponent(ctx, "database")
- contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Recording crawl attempt for URL %s", s.URL.String())
+ contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Updating last_crawled timestamp for URL %s", url)
// Check if the context is cancelled before proceeding
if err := ctx.Err(); err != nil {
return err
}
- // Record the crawl attempt with minimal data
- // timestamp and last_crawled will be set automatically by database DEFAULT
- _, err := tx.ExecContext(ctx, SQL_RECORD_CRAWL_ATTEMPT,
- s.URL.String(),
- s.Host,
- s.MimeType.String,
- s.ResponseCode.ValueOrZero(),
- s.Error.String)
+ // Update the last_crawled timestamp for the most recent snapshot
+ _, err := tx.ExecContext(ctx, SQL_UPDATE_LAST_CRAWLED, url)
if err != nil {
- return xerrors.NewError(fmt.Errorf("cannot record crawl attempt for URL %s: %w", s.URL.String(), err), 0, "", true)
+ return xerrors.NewError(fmt.Errorf("cannot update last_crawled for URL %s: %w", url, err), 0, "", true)
}
return nil
@@ -541,14 +524,6 @@ func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *
return false, err
}
- // Update: Skipped this because empty pages can be valid
- // ex. pages with redirect headers
- // Only check for identical content if we have gemtext or data
- //if (!s.GemText.Valid || s.GemText.String == "") &&
- // (!s.Data.Valid || len(s.Data.V) == 0) {
- // return false, nil
- //}
// Try to get the latest snapshot for this URL
latestSnapshot := &snapshot.Snapshot{}
err := tx.GetContext(ctx, latestSnapshot, SQL_GET_LATEST_SNAPSHOT, s.URL.String())
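
The new UpdateLastCrawled is designed to run inside the caller's existing transaction, matching the signature in the interface above. A minimal usage sketch, where pool and db are stand-ins for the application's *sqlx.DB and DbService:

// Usage sketch: mark a URL as freshly crawled without writing a new snapshot.
tx, err := pool.BeginTxx(ctx, nil)
if err != nil {
	return err
}
defer tx.Rollback() // harmless after a successful Commit

if err := db.UpdateLastCrawled(ctx, tx, "gemini://example.org/"); err != nil {
	return err
}
return tx.Commit()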

View File

@@ -67,38 +67,10 @@ LIMIT $1
`
// New query - always insert a new snapshot without conflict handling
SQL_INSERT_SNAPSHOT = `
- INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header)
- VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header)
+ INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header, last_crawled)
+ VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header, :last_crawled)
RETURNING id
`
- // Keep for backward compatibility, but should be phased out
- SQL_INSERT_SNAPSHOT_IF_NEW = `
- INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header)
- VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header)
- ON CONFLICT DO NOTHING
- `
- // Update to match the SQL_INSERT_SNAPSHOT - we no longer want to upsert, just insert new versions
- SQL_UPSERT_SNAPSHOT = `
- INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header)
- VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header)
- RETURNING id
- `
- SQL_UPDATE_SNAPSHOT = `UPDATE snapshots
- SET url = :url,
- host = :host,
- timestamp = :timestamp,
- mimetype = :mimetype,
- data = :data,
- gemtext = :gemtext,
- links = :links,
- lang = :lang,
- response_code = :response_code,
- error = :error,
- header = :header,
- last_crawled = CURRENT_TIMESTAMP
- WHERE id = :id
- RETURNING id
- `
SQL_INSERT_URL = `
INSERT INTO urls (url, host, timestamp)
VALUES (:url, :host, :timestamp)
@@ -115,7 +87,6 @@ RETURNING id
SQL_DELETE_URL = `
DELETE FROM urls WHERE url=$1
`
- // New queries for retrieving snapshots
SQL_GET_LATEST_SNAPSHOT = `
SELECT * FROM snapshots
WHERE url = $1
@@ -140,9 +111,65 @@ RETURNING id
AND timestamp BETWEEN $2 AND $3
ORDER BY timestamp DESC
`
- // New query to record crawl attempt when content is identical (no new snapshot needed)
- SQL_RECORD_CRAWL_ATTEMPT = `
- INSERT INTO snapshots (url, host, mimetype, response_code, error)
- VALUES ($1, $2, $3, $4, $5)
+ // Update last_crawled timestamp for the most recent snapshot of a URL
+ SQL_UPDATE_LAST_CRAWLED = `
+ UPDATE snapshots
+ SET last_crawled = CURRENT_TIMESTAMP
+ WHERE id = (
+ SELECT id FROM snapshots
+ WHERE url = $1
+ ORDER BY timestamp DESC
+ LIMIT 1
+ )
`
+ // SQL_FETCH_SNAPSHOTS_FROM_HISTORY Fetches URLs from snapshots for re-crawling based on last_crawled timestamp
+ // This query finds root domain URLs that haven't been crawled recently and selects
+ // one URL per host for diversity. Uses CTEs to:
+ // 1. Find latest crawl attempt per URL (via MAX(last_crawled))
+ // 2. Filter to URLs with actual content and successful responses (20-29)
+ // 3. Select URLs where latest crawl is older than cutoff date
+ // 4. Rank randomly within each host and pick one URL per host
+ // Parameters: $1 = cutoff_date, $2 = limit
+ SQL_FETCH_SNAPSHOTS_FROM_HISTORY = `
+ WITH latest_attempts AS (
+ SELECT
+ url,
+ host,
+ COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt
+ FROM snapshots
+ WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini'
+ GROUP BY url, host
+ ),
+ root_urls_with_content AS (
+ SELECT DISTINCT
+ la.url,
+ la.host,
+ la.latest_attempt
+ FROM latest_attempts la
+ JOIN snapshots s ON s.url = la.url
+ WHERE (s.gemtext IS NOT NULL OR s.data IS NOT NULL)
+ AND s.response_code BETWEEN 20 AND 29
+ ),
+ eligible_urls AS (
+ SELECT
+ url,
+ host,
+ latest_attempt
+ FROM root_urls_with_content
+ WHERE latest_attempt < $1
+ ),
+ ranked_urls AS (
+ SELECT
+ url,
+ host,
+ latest_attempt,
+ ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rank
+ FROM eligible_urls
+ )
+ SELECT url, host
+ FROM ranked_urls
+ WHERE rank = 1
+ ORDER BY RANDOM()
+ LIMIT $2
+ `
)
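
SQL_FETCH_SNAPSHOTS_FROM_HISTORY takes the cutoff date as $1 and the row limit as $2. A hedged sketch of a caller, where the function name and row struct are illustrative assumptions (the commit's actual caller is in files not expanded above):

// Sketch: fetch up to `limit` root URLs, one per host, whose newest
// last_crawled is older than `cutoff`.
type recrawlCandidate struct {
	URL  string `db:"url"`
	Host string `db:"host"`
}

func fetchRecrawlCandidates(ctx context.Context, tx *sqlx.Tx, cutoff time.Time, limit int) ([]recrawlCandidate, error) {
	var rows []recrawlCandidate
	err := tx.SelectContext(ctx, &rows, SQL_FETCH_SNAPSHOTS_FROM_HISTORY, cutoff, limit)
	return rows, err
}

The one-URL-per-host ranking (ROW_NUMBER over PARTITION BY host with a random order) spreads re-crawls across hosts rather than hammering whichever host has the most stale URLs, matching the "diversity" goal stated in the query comment.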