gemini-grc/db/db_queries.go
antanst ada6cda4ac Fix snapshot overwrite logic to preserve successful responses
- Prevent overwriting snapshots that have valid response codes
- Ensure URL is removed from queue when snapshot update is skipped
- Add last_crawled timestamp tracking for better crawl scheduling
- Remove SkipIdenticalContent flag, simplify content deduplication logic
- Update database schema with last_crawled column and indexes

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-18 11:23:56 +03:00

package db

const (
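	// SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS selects up to $1 random URLs,
	// considering at most three URLs per host, and locks the chosen rows
	// with FOR UPDATE SKIP LOCKED so concurrent workers skip them.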
	SQL_SELECT_RANDOM_URLS_UNIQUE_HOSTS = `
		SELECT url
		FROM urls u
		WHERE u.id IN (
			SELECT id FROM (
				SELECT id, ROW_NUMBER() OVER (PARTITION BY host ORDER BY id) as rn
				FROM urls
			) t
			WHERE rn <= 3
		)
		ORDER BY RANDOM()
		FOR UPDATE SKIP LOCKED
		LIMIT $1
	`
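	// SQL_SELECT_RANDOM_URLS selects up to $1 random URLs that are not
	// currently being processed, locking them with FOR UPDATE SKIP LOCKED.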
	SQL_SELECT_RANDOM_URLS = `
		SELECT url
		FROM urls u
		WHERE u.being_processed IS NOT TRUE
		ORDER BY RANDOM()
		FOR UPDATE SKIP LOCKED
		LIMIT $1
	`
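	// SQL_MARK_URLS_BEING_PROCESSED marks the given URLs as being processed.
	// The %s placeholder must be expanded into a comma-separated list of bind
	// parameters (one per URL) before the query is executed.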
	SQL_MARK_URLS_BEING_PROCESSED = `UPDATE urls SET being_processed = true WHERE url IN (%s)`
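	// SQL_SELECT_RANDOM_URLS_GEMINI_ONLY selects up to $1 random gemini://
	// URLs that are not currently being processed.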
	SQL_SELECT_RANDOM_URLS_GEMINI_ONLY = `
		SELECT url
		FROM urls u
		WHERE u.url like 'gemini://%'
		AND u.being_processed IS NOT TRUE
		ORDER BY RANDOM()
		FOR UPDATE SKIP LOCKED
		LIMIT $1
	`
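	// SQL_SELECT_RANDOM_URLS_GEMINI_ONLY_2 selects up to $1 random gemini://
	// URLs, keeping at most one URL per host so load is spread across hosts.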
	SQL_SELECT_RANDOM_URLS_GEMINI_ONLY_2 = `
		WITH RankedUrls AS (
			-- Step 1: Assign a random rank to each URL within its host group
			SELECT
				url,
				host,
				ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
			FROM
				urls
			WHERE url like 'gemini://%'
			AND being_processed IS NOT TRUE
		),
		OneUrlPerHost AS (
			-- Step 2: Filter to keep only the first-ranked (random) URL per host
			SELECT
				url,
				host
			FROM
				RankedUrls
			WHERE
				rn = 1
		)
		-- Step 3: From the set of one URL per host, randomly select $1
		SELECT
			url
		FROM
			OneUrlPerHost
		ORDER BY
			RANDOM()
		FOR UPDATE SKIP LOCKED
		LIMIT $1
	`
	// SQL_INSERT_SNAPSHOT always inserts a new snapshot row; there is no
	// conflict handling, so every crawl adds to the snapshot history and the
	// new row's id is returned.
	SQL_INSERT_SNAPSHOT = `
		INSERT INTO snapshots (url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error, header, last_crawled)
		VALUES (:url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error, :header, :last_crawled)
		RETURNING id
	`
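	// SQL_INSERT_URL adds a URL to the queue, doing nothing if it already exists.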
	SQL_INSERT_URL = `
		INSERT INTO urls (url, host, timestamp)
		VALUES (:url, :host, :timestamp)
		ON CONFLICT (url) DO NOTHING
	`
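	// SQL_UPDATE_URL renames a queued URL to its normalized form, unless the
	// normalized URL is already present in the table.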
	SQL_UPDATE_URL = `
		UPDATE urls
		SET url = :NormalizedURL
		WHERE url = :Url
		AND NOT EXISTS (
			SELECT 1 FROM urls WHERE url = :NormalizedURL
		)
	`
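	// SQL_DELETE_URL removes a URL from the queue.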
	SQL_DELETE_URL = `
		DELETE FROM urls WHERE url=$1
	`
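	// SQL_GET_LATEST_SNAPSHOT returns the most recent snapshot of a URL.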
	SQL_GET_LATEST_SNAPSHOT = `
		SELECT * FROM snapshots
		WHERE url = $1
		ORDER BY timestamp DESC
		LIMIT 1
	`
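	// SQL_GET_SNAPSHOT_AT_TIMESTAMP returns the most recent snapshot of a URL
	// taken at or before the given timestamp.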
	SQL_GET_SNAPSHOT_AT_TIMESTAMP = `
		SELECT * FROM snapshots
		WHERE url = $1
		AND timestamp <= $2
		ORDER BY timestamp DESC
		LIMIT 1
	`
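	// SQL_GET_ALL_SNAPSHOTS_FOR_URL returns every snapshot of a URL, newest first.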
	SQL_GET_ALL_SNAPSHOTS_FOR_URL = `
		SELECT * FROM snapshots
		WHERE url = $1
		ORDER BY timestamp DESC
	`
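	// SQL_GET_SNAPSHOTS_BY_DATE_RANGE returns the snapshots of a URL taken
	// between two timestamps, newest first.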
	SQL_GET_SNAPSHOTS_BY_DATE_RANGE = `
		SELECT * FROM snapshots
		WHERE url = $1
		AND timestamp BETWEEN $2 AND $3
		ORDER BY timestamp DESC
	`
	// SQL_UPDATE_LAST_CRAWLED sets last_crawled to the current time on the most
	// recent snapshot of a URL.
	SQL_UPDATE_LAST_CRAWLED = `
		UPDATE snapshots
		SET last_crawled = CURRENT_TIMESTAMP
		WHERE id = (
			SELECT id FROM snapshots
			WHERE url = $1
			ORDER BY timestamp DESC
			LIMIT 1
		)
	`
	// SQL_FETCH_SNAPSHOTS_FROM_HISTORY fetches URLs from existing snapshots for
	// re-crawling, based on the last_crawled timestamp. It finds root URLs
	// (gemini://host/ with no path) that have not been crawled recently and
	// selects one URL per host for diversity. The CTEs:
	//  1. Find the latest crawl attempt per URL (via MAX(last_crawled)).
	//  2. Filter to URLs with actual content and successful responses (20-29).
	//  3. Keep URLs whose latest crawl is older than the cutoff date.
	//  4. Rank randomly within each host and pick one URL per host.
	// Parameters: $1 = cutoff_date, $2 = limit
	SQL_FETCH_SNAPSHOTS_FROM_HISTORY = `
		WITH latest_attempts AS (
			SELECT
				url,
				host,
				COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt
			FROM snapshots
			WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini'
			GROUP BY url, host
		),
		root_urls_with_content AS (
			SELECT DISTINCT
				la.url,
				la.host,
				la.latest_attempt
			FROM latest_attempts la
			JOIN snapshots s ON s.url = la.url
			WHERE (s.gemtext IS NOT NULL OR s.data IS NOT NULL)
			AND s.response_code BETWEEN 20 AND 29
		),
		eligible_urls AS (
			SELECT
				url,
				host,
				latest_attempt
			FROM root_urls_with_content
			WHERE latest_attempt < $1
		),
		ranked_urls AS (
			SELECT
				url,
				host,
				latest_attempt,
				ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rank
			FROM eligible_urls
		)
		SELECT url, host
		FROM ranked_urls
		WHERE rank = 1
		ORDER BY RANDOM()
		LIMIT $2
	`
)
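
Below is a minimal usage sketch for the queue-selection queries, assuming the surrounding package talks to PostgreSQL through sqlx (github.com/jmoiron/sqlx), which would match the mix of $N and :name placeholders above. The grabRandomURLs helper, its error handling, and the transaction layout are illustrative, not taken from the repository; the sketch is written as if it lived in a separate file of the same package, and only shows how SQL_SELECT_RANDOM_URLS and SQL_MARK_URLS_BEING_PROCESSED could fit together, with the SELECT and the UPDATE sharing one transaction so the FOR UPDATE SKIP LOCKED locks are held until the rows are marked.

package db

import (
	"fmt"
	"strings"

	"github.com/jmoiron/sqlx"
)

// grabRandomURLs is an illustrative sketch: it selects up to limit unprocessed
// URLs, marks them as being_processed in the same transaction, and returns them.
func grabRandomURLs(db *sqlx.DB, limit int) ([]string, error) {
	tx, err := db.Beginx()
	if err != nil {
		return nil, err
	}
	defer tx.Rollback() // no-op after a successful Commit

	var urls []string
	if err := tx.Select(&urls, SQL_SELECT_RANDOM_URLS, limit); err != nil {
		return nil, err
	}
	if len(urls) > 0 {
		// Expand the %s placeholder into one bind parameter per URL.
		placeholders := make([]string, len(urls))
		args := make([]interface{}, len(urls))
		for i, u := range urls {
			placeholders[i] = fmt.Sprintf("$%d", i+1)
			args[i] = u
		}
		query := fmt.Sprintf(SQL_MARK_URLS_BEING_PROCESSED, strings.Join(placeholders, ", "))
		if _, err := tx.Exec(query, args...); err != nil {
			return nil, err
		}
	}
	return urls, tx.Commit()
}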