Refine content deduplication and improve configuration
@@ -32,15 +32,15 @@ func Initialize() *Config {
 	loglevel := flag.String("log-level", "info", "Logging level (debug, info, warn, error)")
 	pgURL := flag.String("pgurl", "", "Postgres URL")
-	dryRun := flag.Bool("dry-run", false, "Dry run mode (default false)")
-	gopherEnable := flag.Bool("gopher", false, "Enable crawling of Gopher holes (default false)")
-	maxDbConnections := flag.Int("max-db-connections", 100, "Maximum number of database connections (default 100)")
-	numOfWorkers := flag.Int("workers", 1, "Number of concurrent workers (default 1)")
-	maxResponseSize := flag.Int("max-response-size", 1024*1024, "Maximum size of response in bytes (default 1MB)")
-	responseTimeout := flag.Int("response-timeout", 10, "Timeout for network responses in seconds (default 10)")
+	dryRun := flag.Bool("dry-run", false, "Dry run mode")
+	gopherEnable := flag.Bool("gopher", false, "Enable crawling of Gopher holes")
+	maxDbConnections := flag.Int("max-db-connections", 100, "Maximum number of database connections")
+	numOfWorkers := flag.Int("workers", 1, "Number of concurrent workers")
+	maxResponseSize := flag.Int("max-response-size", 1024*1024, "Maximum size of response in bytes")
+	responseTimeout := flag.Int("response-timeout", 10, "Timeout for network responses in seconds")
 	blacklistPath := flag.String("blacklist-path", "", "File that has blacklist regexes")
-	skipIdenticalContent := flag.Bool("skip-identical-content", true, "Skip storing snapshots with identical content (default true)")
-	skipIfUpdatedDays := flag.Int("skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable, default 60)")
+	skipIdenticalContent := flag.Bool("skip-identical-content", true, "Skip storing snapshots with identical content")
+	skipIfUpdatedDays := flag.Int("skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable)")
 	whitelistPath := flag.String("whitelist-path", "", "File with URLs that should always be crawled regardless of blacklist")
 	seedUrlPath := flag.String("seed-url-path", "", "File with seed URLs that should be added to the queue immediatelly")
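
The dropped "(default ...)" suffixes are presumably redundant: Go's flag package appends the default to each usage line on its own when it prints help. A standalone sketch (not code from this commit) illustrating that behaviour:

// Standalone illustration: flag.PrintDefaults appends "(default X)" to every
// flag whose default is not the zero value of its type, so keeping the default
// inside the usage string would print it twice.
package main

import "flag"

func main() {
	flag.Bool("dry-run", false, "Dry run mode")
	flag.Int("workers", 1, "Number of concurrent workers")
	flag.PrintDefaults()
	// Printed usage looks roughly like:
	//   -dry-run
	//     	Dry run mode
	//   -workers int
	//     	Number of concurrent workers (default 1)
}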
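
The two deduplication settings only appear here as configuration; the diff does not show how they are applied. As a rough, hypothetical sketch of the kind of checks they could drive (the Config fields, Snapshot type, and helper functions below are illustrative names, not taken from this repository):

// Hypothetical sketch of using skip-identical-content and skip-if-updated-days.
package main

import (
	"crypto/sha256"
	"fmt"
	"time"
)

type Config struct {
	SkipIdenticalContent bool // -skip-identical-content
	SkipIfUpdatedDays    int  // -skip-if-updated-days (0 disables the check)
}

type Snapshot struct {
	ContentHash [32]byte  // hash of the previously stored body
	UpdatedAt   time.Time // when the URL was last crawled
}

// shouldCrawl reports whether a URL is due for re-crawling based on the age of
// its latest snapshot (nil means the URL has never been crawled).
func shouldCrawl(cfg Config, prev *Snapshot) bool {
	if prev == nil || cfg.SkipIfUpdatedDays <= 0 {
		return true
	}
	return time.Since(prev.UpdatedAt) >= time.Duration(cfg.SkipIfUpdatedDays)*24*time.Hour
}

// shouldStore reports whether a freshly fetched body is worth storing, skipping
// it when the content is byte-identical to the previous snapshot.
func shouldStore(cfg Config, prev *Snapshot, body []byte) bool {
	if prev == nil || !cfg.SkipIdenticalContent {
		return true
	}
	return sha256.Sum256(body) != prev.ContentHash
}

func main() {
	cfg := Config{SkipIdenticalContent: true, SkipIfUpdatedDays: 60}
	prev := &Snapshot{
		ContentHash: sha256.Sum256([]byte("<html>old</html>")),
		UpdatedAt:   time.Now().AddDate(0, 0, -90),
	}
	fmt.Println(shouldCrawl(cfg, prev))                             // true: snapshot is older than 60 days
	fmt.Println(shouldStore(cfg, prev, []byte("<html>old</html>"))) // false: content unchanged
}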