Refine content deduplication and improve configuration

antanst
2025-06-16 17:09:26 +03:00
parent 330b596497
commit f9024d15aa
3 changed files with 87 additions and 28 deletions

@@ -32,15 +32,15 @@ func Initialize() *Config {
 loglevel := flag.String("log-level", "info", "Logging level (debug, info, warn, error)")
 pgURL := flag.String("pgurl", "", "Postgres URL")
-dryRun := flag.Bool("dry-run", false, "Dry run mode (default false)")
-gopherEnable := flag.Bool("gopher", false, "Enable crawling of Gopher holes (default false)")
-maxDbConnections := flag.Int("max-db-connections", 100, "Maximum number of database connections (default 100)")
-numOfWorkers := flag.Int("workers", 1, "Number of concurrent workers (default 1)")
-maxResponseSize := flag.Int("max-response-size", 1024*1024, "Maximum size of response in bytes (default 1MB)")
-responseTimeout := flag.Int("response-timeout", 10, "Timeout for network responses in seconds (default 10)")
+dryRun := flag.Bool("dry-run", false, "Dry run mode")
+gopherEnable := flag.Bool("gopher", false, "Enable crawling of Gopher holes")
+maxDbConnections := flag.Int("max-db-connections", 100, "Maximum number of database connections")
+numOfWorkers := flag.Int("workers", 1, "Number of concurrent workers")
+maxResponseSize := flag.Int("max-response-size", 1024*1024, "Maximum size of response in bytes")
+responseTimeout := flag.Int("response-timeout", 10, "Timeout for network responses in seconds")
 blacklistPath := flag.String("blacklist-path", "", "File that has blacklist regexes")
-skipIdenticalContent := flag.Bool("skip-identical-content", true, "Skip storing snapshots with identical content (default true)")
-skipIfUpdatedDays := flag.Int("skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable, default 60)")
+skipIdenticalContent := flag.Bool("skip-identical-content", true, "Skip storing snapshots with identical content")
+skipIfUpdatedDays := flag.Int("skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable)")
 whitelistPath := flag.String("whitelist-path", "", "File with URLs that should always be crawled regardless of blacklist")
 seedUrlPath := flag.String("seed-url-path", "", "File with seed URLs that should be added to the queue immediatelly")
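
The configuration change is consistent across every flag: the hand-written "(default ...)" suffixes are dropped from the usage strings. Go's flag package already appends the default value to help output for any flag whose default is not the zero value, so the old strings produced duplicated text such as "Number of concurrent workers (default 1) (default 1)" under -h, while annotations like "(default false)" claimed output the package deliberately omits for zero-value defaults. A minimal standalone sketch (not code from this repository) demonstrating that behavior:

package main

import (
	"flag"
	"os"
)

func main() {
	// Same style of declarations as in Initialize, minus the redundant suffixes.
	flag.Int("workers", 1, "Number of concurrent workers")
	flag.Bool("dry-run", false, "Dry run mode")

	flag.CommandLine.SetOutput(os.Stdout)
	flag.PrintDefaults()
	// Prints:
	//   -dry-run
	//     	Dry run mode
	//   -workers int
	//     	Number of concurrent workers (default 1)
}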
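
The commit title also mentions refining content deduplication, which is what the skip-identical-content flag controls: when enabled, the crawler skips storing a snapshot whose content matches the previously stored one. That logic lives in the other changed files not shown in this hunk; as a hypothetical illustration of the technique (the names contentHash and shouldStore are invented for this sketch, not taken from the repository), comparing a hash of the new response body against the last stored hash could look like:

package dedup

import (
	"crypto/sha256"
	"encoding/hex"
)

// contentHash returns a stable fingerprint of a response body.
func contentHash(body []byte) string {
	sum := sha256.Sum256(body)
	return hex.EncodeToString(sum[:])
}

// shouldStore reports whether a snapshot should be written, given the
// skip-identical-content setting and the hash of the last stored snapshot.
func shouldStore(skipIdentical bool, lastHash string, body []byte) bool {
	if !skipIdentical {
		return true // deduplication disabled: always store
	}
	return contentHash(body) != lastHash
}

Comparing fixed-size hashes keeps the check cheap regardless of snapshot size, at the cost of a vanishingly small collision risk.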