gemini-grc/config/config.go
antanst 2357135d5a Fix snapshot overwrite logic to preserve successful responses
- Prevent overwriting snapshots that have valid response codes
- Ensure URL is removed from queue when snapshot update is skipped
- Add last_crawled timestamp tracking for better crawl scheduling
- Remove SkipIdenticalContent flag, simplify content deduplication logic
- Update database schema with last_crawled column and indexes
2025-06-29 22:38:38 +03:00
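
The guard described in the commit message might look roughly like the sketch below. This is a minimal illustration only: Snapshot, its fields, and the save/dequeue hooks are hypothetical names, not this repository's actual API; the 2x success range follows the Gemini protocol's status codes.

package crawler // hypothetical package, for illustration only

import "time"

// Snapshot is an assumed record shape, not the repository's schema.
type Snapshot struct {
	URL          string
	ResponseCode int
	LastCrawled  time.Time
}

// isSuccess reports whether a Gemini status code is in the 2x (success) range.
func isSuccess(code int) bool { return code >= 20 && code < 30 }

// maybeUpdateSnapshot skips the write when it would overwrite an existing
// successful snapshot with a failed fetch, but still dequeues the URL so it
// is not retried immediately; otherwise it stamps last_crawled and saves.
func maybeUpdateSnapshot(existing, incoming *Snapshot,
	save func(*Snapshot) error, dequeue func(string) error) error {
	if existing != nil && isSuccess(existing.ResponseCode) && !isSuccess(incoming.ResponseCode) {
		return dequeue(existing.URL)
	}
	incoming.LastCrawled = time.Now() // feeds the last_crawled column for scheduling
	if err := save(incoming); err != nil {
		return err
	}
	return dequeue(incoming.URL)
}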

package config

import (
	"flag"
	"fmt"
	"log/slog"
	"os"
)
// Config holds the application configuration parsed from command-line flags.
type Config struct {
	PgURL             string     // Postgres connection URL
	LogLevel          slog.Level // Logging level (debug, info, warn, error)
	MaxResponseSize   int        // Maximum size of a response in bytes
	MaxDbConnections  int        // Maximum number of database connections
	NumOfWorkers      int        // Number of concurrent workers
	ResponseTimeout   int        // Timeout for network responses in seconds
	BlacklistPath     string     // File with blacklisted "host:port" strings
	WhitelistPath     string     // File with URLs that are always crawled, regardless of the blacklist
	DryRun            bool       // If true, don't write to disk
	GopherEnable      bool       // Enable Gopher crawling
	SeedUrlPath       string     // File with URLs to add to the queue on startup
	SkipIfUpdatedDays int        // Skip re-crawling URLs updated within this many days (0 to disable)
}

var CONFIG Config //nolint:gochecknoglobals

// Initialize parses and validates configuration from command-line flags.
func Initialize() *Config {
	config := &Config{}
	loglevel := flag.String("log-level", "info", "Logging level (debug, info, warn, error)")
	pgURL := flag.String("pgurl", "", "Postgres URL")
	dryRun := flag.Bool("dry-run", false, "Dry run mode")
	gopherEnable := flag.Bool("gopher", false, "Enable crawling of Gopher holes")
	maxDbConnections := flag.Int("max-db-connections", 100, "Maximum number of database connections")
	numOfWorkers := flag.Int("workers", 1, "Number of concurrent workers")
	maxResponseSize := flag.Int("max-response-size", 1024*1024, "Maximum size of response in bytes")
	responseTimeout := flag.Int("response-timeout", 10, "Timeout for network responses in seconds")
	blacklistPath := flag.String("blacklist-path", "", "File that has blacklist regexes")
	skipIfUpdatedDays := flag.Int("skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable)")
	whitelistPath := flag.String("whitelist-path", "", "File with URLs that should always be crawled regardless of blacklist")
	seedUrlPath := flag.String("seed-url-path", "", "File with seed URLs that should be added to the queue immediately")
	flag.Parse()

	config.PgURL = *pgURL
	config.DryRun = *dryRun
	config.GopherEnable = *gopherEnable
	config.NumOfWorkers = *numOfWorkers
	config.MaxResponseSize = *maxResponseSize
	config.ResponseTimeout = *responseTimeout
	config.BlacklistPath = *blacklistPath
	config.WhitelistPath = *whitelistPath
	config.SeedUrlPath = *seedUrlPath
	config.MaxDbConnections = *maxDbConnections
	config.SkipIfUpdatedDays = *skipIfUpdatedDays

	level, err := ParseSlogLevel(*loglevel)
	if err != nil {
		_, _ = fmt.Fprintln(os.Stderr, err.Error())
		os.Exit(1)
	}
	config.LogLevel = level
	return config
}

// ParseSlogLevel converts a string level name to a slog.Level.
func ParseSlogLevel(levelStr string) (slog.Level, error) {
	switch levelStr {
	case "debug":
		return slog.LevelDebug, nil
	case "info":
		return slog.LevelInfo, nil
	case "warn":
		return slog.LevelWarn, nil
	case "error":
		return slog.LevelError, nil
	default:
		return slog.LevelInfo, fmt.Errorf("invalid log level: %s", levelStr)
	}
}

// Convert is kept for backward compatibility with the existing codebase
// and can be removed once all references to Convert() are updated.
func (c *Config) Convert() *Config {
	// Just return the config itself, as it now directly holds a slog.Level.
	return c
}
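
For reference, a caller would typically wire the parsed configuration into slog as shown below. A minimal sketch: the import path and logger setup are assumptions, not taken from this repository's main package.

package main

import (
	"log/slog"
	"os"

	"gemini-grc/config" // module path assumed from the file's location
)

func main() {
	cfg := config.Initialize()
	// Install a default logger honoring the parsed -log-level flag.
	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level: cfg.LogLevel,
	})))
	slog.Info("starting", "workers", cfg.NumOfWorkers, "dryRun", cfg.DryRun)
}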