- Update common package utilities - Refactor network code for better error handling - Remove deprecated files and functionality - Enhance blacklist and filtering capabilities - Improve snapshot handling and processing
94 lines
3.8 KiB
Go
94 lines
3.8 KiB
Go
package config
|
|
|
|
import (
	"flag"
	"fmt"
	"log/slog"
	"os"
	"strings"
)
|
|
|
|
// Config holds the application configuration. Initialize populates it from
// command-line flags (not from environment variables).
type Config struct {
	PgURL                string     // Postgres connection URL
	LogLevel             slog.Level // Logging level (debug, info, warn, error)
	MaxResponseSize      int        // Maximum size of a response in bytes
	MaxDbConnections     int        // Maximum number of database connections
	NumOfWorkers         int        // Number of concurrent workers
	ResponseTimeout      int        // Timeout for network responses in seconds
	BlacklistPath        string     // File holding the blacklist entries (the -blacklist-path help text describes them as regexes)
	WhitelistPath        string     // File with URLs that should always be crawled regardless of blacklist
	DryRun               bool       // Dry-run mode: when true, don't write to disk
	GopherEnable         bool       // Enable Gopher crawling
	SeedUrlPath          string     // File with seed URLs to add to the queue
	SkipIdenticalContent bool       // When true, skip storing snapshots with identical content
	SkipIfUpdatedDays    int        // Skip re-crawling URLs updated within this many days (0 to disable; the flag defaults to 60)
}
|
|
|
|
// CONFIG is a package-level Config value.
// NOTE(review): Initialize returns a freshly allocated *Config and never
// assigns this variable; presumably a caller stores the result here — confirm.
var CONFIG Config //nolint:gochecknoglobals
|
|
|
|
// Initialize loads and validates configuration from environment variables
|
|
func Initialize() *Config {
|
|
config := &Config{}
|
|
|
|
loglevel := flag.String("log-level", "info", "Logging level (debug, info, warn, error)")
|
|
pgURL := flag.String("pgurl", "", "Postgres URL")
|
|
dryRun := flag.Bool("dry-run", false, "Dry run mode (default false)")
|
|
gopherEnable := flag.Bool("gopher", false, "Enable crawling of Gopher holes (default false)")
|
|
maxDbConnections := flag.Int("max-db-connections", 100, "Maximum number of database connections (default 100)")
|
|
numOfWorkers := flag.Int("workers", 1, "Number of concurrent workers (default 1)")
|
|
maxResponseSize := flag.Int("max-response-size", 1024*1024, "Maximum size of response in bytes (default 1MB)")
|
|
responseTimeout := flag.Int("response-timeout", 10, "Timeout for network responses in seconds (default 10)")
|
|
blacklistPath := flag.String("blacklist-path", "", "File that has blacklist regexes")
|
|
skipIdenticalContent := flag.Bool("skip-identical-content", true, "Skip storing snapshots with identical content (default true)")
|
|
skipIfUpdatedDays := flag.Int("skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable, default 60)")
|
|
whitelistPath := flag.String("whitelist-path", "", "File with URLs that should always be crawled regardless of blacklist")
|
|
seedUrlPath := flag.String("seed-url-path", "", "File with seed URLs that should be added to the queue immediatelly")
|
|
|
|
flag.Parse()
|
|
|
|
config.PgURL = *pgURL
|
|
config.DryRun = *dryRun
|
|
config.GopherEnable = *gopherEnable
|
|
config.NumOfWorkers = *numOfWorkers
|
|
config.MaxResponseSize = *maxResponseSize
|
|
config.ResponseTimeout = *responseTimeout
|
|
config.BlacklistPath = *blacklistPath
|
|
config.WhitelistPath = *whitelistPath
|
|
config.SeedUrlPath = *seedUrlPath
|
|
config.MaxDbConnections = *maxDbConnections
|
|
config.SkipIdenticalContent = *skipIdenticalContent
|
|
config.SkipIfUpdatedDays = *skipIfUpdatedDays
|
|
|
|
level, err := ParseSlogLevel(*loglevel)
|
|
if err != nil {
|
|
_, _ = fmt.Fprint(os.Stderr, err.Error())
|
|
os.Exit(-1)
|
|
}
|
|
config.LogLevel = level
|
|
|
|
return config
|
|
}
|
|
|
|
// ParseSlogLevel converts a textual log level into the corresponding
// slog.Level.
//
// Recognised names are "debug", "info", "warn" and "error". Matching ignores
// letter case and surrounding whitespace, so "INFO" and " warn " are accepted.
// Any other input yields slog.LevelInfo together with a non-nil error; callers
// must check the error rather than rely on the returned level.
func ParseSlogLevel(levelStr string) (slog.Level, error) {
	switch strings.ToLower(strings.TrimSpace(levelStr)) {
	case "debug":
		return slog.LevelDebug, nil
	case "info":
		return slog.LevelInfo, nil
	case "warn":
		return slog.LevelWarn, nil
	case "error":
		return slog.LevelError, nil
	default:
		// %q makes empty or whitespace-only input visible in the message.
		return slog.LevelInfo, fmt.Errorf("invalid log level: %q (expected one of debug, info, warn, error)", levelStr)
	}
}
|
|
|
|
// Convert method for backward compatibility with existing codebase
|
|
// This can be removed once all references to Convert() are updated
|
|
func (c *Config) Convert() *Config {
|
|
// Just return the config itself as it now directly contains slog.Level
|
|
return c
|
|
}
|