Fix snapshot overwrite logic to preserve successful responses

- Prevent overwriting snapshots that already have valid response codes (a minimal sketch follows the commit metadata below)
- Ensure URL is removed from queue when snapshot update is skipped
- Add last_crawled timestamp tracking for better crawl scheduling
- Remove SkipIdenticalContent flag, simplifying content deduplication logic
- Update database schema with last_crawled column and indexes
Author: antanst
Date: 2025-06-18 11:23:56 +03:00
Commit: 2357135d5a (parent: 98d3ed6707)
8 changed files with 284 additions and 242 deletions
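
Below, before the diff itself, is a minimal sketch of the behaviour the bullet points describe. Every name in it (Snapshot, Store, saveIfBetter, isValidResponse) is an assumption made for illustration; the commit's actual types and functions may differ.

package crawlersketch

import "time"

// Snapshot is a minimal stand-in for a stored snapshot row.
type Snapshot struct {
	URL          string
	ResponseCode int
	LastCrawled  time.Time
}

// Store is a minimal stand-in for the persistence layer.
type Store interface {
	LatestSnapshot(url string) (*Snapshot, bool) // false when no snapshot exists yet
	SaveSnapshot(s Snapshot) error
	RemoveFromQueue(url string) error
}

// isValidResponse is an assumption: what counts as "valid" depends on the
// protocol being crawled (e.g. HTTP 2xx or Gemini 2x status codes).
func isValidResponse(code int) bool {
	return (code >= 200 && code < 300) || (code >= 20 && code < 30)
}

// saveIfBetter stores snap unless that would overwrite an existing snapshot
// holding a valid response code with an invalid one. The URL is removed from
// the queue on every path, so a skipped update no longer leaves it queued,
// and LastCrawled is refreshed for crawl scheduling.
func saveIfBetter(st Store, snap Snapshot) error {
	existing, ok := st.LatestSnapshot(snap.URL)
	if ok && isValidResponse(existing.ResponseCode) && !isValidResponse(snap.ResponseCode) {
		// Keep the good snapshot; still dequeue the URL.
		return st.RemoveFromQueue(snap.URL)
	}
	snap.LastCrawled = time.Now().UTC()
	if err := st.SaveSnapshot(snap); err != nil {
		return err
	}
	return st.RemoveFromQueue(snap.URL)
}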


@@ -9,19 +9,18 @@ import (
// Config holds the application configuration loaded from environment variables.
type Config struct {
- PgURL string
- LogLevel slog.Level // Logging level (debug, info, warn, error)
- MaxResponseSize int // Maximum size of response in bytes
- MaxDbConnections int // Maximum number of database connections.
- NumOfWorkers int // Number of concurrent workers
- ResponseTimeout int // Timeout for responses in seconds
- BlacklistPath string // File that has blacklisted strings of "host:port"
- WhitelistPath string // File with URLs that should always be crawled regardless of blacklist
- DryRun bool // If false, don't write to disk
- GopherEnable bool // Enable Gopher crawling
- SeedUrlPath string // Add URLs from file to queue
- SkipIdenticalContent bool // When true, skip storing snapshots with identical content
- SkipIfUpdatedDays int // Skip re-crawling URLs updated within this many days (0 to disable, default 0)
+ PgURL string
+ LogLevel slog.Level // Logging level (debug, info, warn, error)
+ MaxResponseSize int // Maximum size of response in bytes
+ MaxDbConnections int // Maximum number of database connections.
+ NumOfWorkers int // Number of concurrent workers
+ ResponseTimeout int // Timeout for responses in seconds
+ BlacklistPath string // File that has blacklisted strings of "host:port"
+ WhitelistPath string // File with URLs that should always be crawled regardless of blacklist
+ DryRun bool // If false, don't write to disk
+ GopherEnable bool // Enable Gopher crawling
+ SeedUrlPath string // Add URLs from file to queue
+ SkipIfUpdatedDays int // Skip re-crawling URLs updated within this many days (0 to disable)
}
var CONFIG Config //nolint:gochecknoglobals
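
(Not part of the diff.) A short sketch of how the SkipIfUpdatedDays setting can gate re-crawling using the last_crawled timestamp this commit adds; the shouldSkip helper and its signature are assumptions, not code from the repository.

package crawlersketch

import "time"

// shouldSkip is an assumed helper: it reports whether a URL last crawled at
// lastCrawled can be skipped because it was updated within the last
// skipIfUpdatedDays days. A value of 0 disables the check.
func shouldSkip(lastCrawled time.Time, skipIfUpdatedDays int) bool {
	if skipIfUpdatedDays <= 0 {
		return false
	}
	cutoff := time.Now().UTC().AddDate(0, 0, -skipIfUpdatedDays)
	return lastCrawled.After(cutoff)
}
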
@@ -39,7 +38,6 @@ func Initialize() *Config {
maxResponseSize := flag.Int("max-response-size", 1024*1024, "Maximum size of response in bytes")
responseTimeout := flag.Int("response-timeout", 10, "Timeout for network responses in seconds")
blacklistPath := flag.String("blacklist-path", "", "File that has blacklist regexes")
- skipIdenticalContent := flag.Bool("skip-identical-content", true, "Skip storing snapshots with identical content")
skipIfUpdatedDays := flag.Int("skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable)")
whitelistPath := flag.String("whitelist-path", "", "File with URLs that should always be crawled regardless of blacklist")
seedUrlPath := flag.String("seed-url-path", "", "File with seed URLs that should be added to the queue immediately")
@@ -56,7 +54,6 @@ func Initialize() *Config {
config.WhitelistPath = *whitelistPath
config.SeedUrlPath = *seedUrlPath
config.MaxDbConnections = *maxDbConnections
- config.SkipIdenticalContent = *skipIdenticalContent
config.SkipIfUpdatedDays = *skipIfUpdatedDays
level, err := ParseSlogLevel(*loglevel)