Compare commits

...

7 Commits

Author SHA1 Message Date
antanst
0db2557cfc Update README to reflect command-line flag configuration
- Replace environment variables with command-line flags
- Update run example with proper flag syntax
- Fix database schema path to misc/sql/initdb.sql
- Add missing configuration options (gopher, seed-url-path, max-db-connections)
- Remove outdated configuration options
2025-06-29 22:28:28 +03:00
antanst
db3448f448 Improve crawler performance and logging
- Optimize job scheduler to use NumOfWorkers for URL limits
- Clean up verbose logging in worker processing
- Update log messages for better clarity
2025-06-29 22:28:05 +03:00
antanst
9a09dd7735 Update log message for clarity
- Change "old content" to "old snapshot" for more accurate terminology

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-19 10:19:29 +03:00
antanst
0f62b0c622 Clean up logging in worker processing
- Move response code logging to happen after successful snapshot save
- Remove error message from log output for cleaner display
- Consolidate logging logic in saveSnapshotAndRemoveURL function

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-19 10:04:46 +03:00
antanst
3bdff0e22e Improve crawler performance and worker coordination
- Add WaitGroup synchronization for workers to prevent overlapping scheduler runs
- Increase history fetch multiplier and sleep intervals for better resource usage
- Simplify error handling and logging in worker processing
- Update SQL query to exclude error snapshots from history selection
- Fix worker ID variable reference in spawning loop
- Streamline snapshot update logic and error reporting

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-19 09:59:50 +03:00
antanst
a74f29d7b0 Update log message to reflect crawl date update behavior
🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-18 12:03:37 +03:00
antanst
ffeef334e7 Update last_crawled timestamp when skipping duplicate content and improve error handling
🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-18 12:02:55 +03:00
7 changed files with 117 additions and 94 deletions

View File

@@ -4,13 +4,13 @@ A crawler for the [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) netw
Easily extendable as a "wayback machine" of Gemini.
## Features
- [x] Save image/* and text/* files
- [x] Concurrent downloading with configurable number of workers
- [x] Save image/* and text/* files
- [x] Connection limit per host
- [x] URL Blacklist
- [x] URL Whitelist (overrides blacklist and robots.txt)
- [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
- [x] Configuration via environment variables
- [x] Configuration via command-line flags
- [x] Storing capsule snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
- [x] Proper URL normalization
@@ -22,46 +22,55 @@ This crawler uses `InsecureSkipVerify: true` in TLS configuration to accept all
## How to run
Spin up a PostgreSQL, check `db/sql/initdb.sql` to create the tables and start the crawler.
All configuration is done via environment variables.
Spin up a PostgreSQL, check `misc/sql/initdb.sql` to create the tables and start the crawler.
All configuration is done via command-line flags.
## Configuration
Bool can be `true`,`false` or `0`,`1`.
Available command-line flags:
```text
LogLevel string // Logging level (debug, info, warn, error)
MaxResponseSize int // Maximum size of response in bytes
NumOfWorkers int // Number of concurrent workers
ResponseTimeout int // Timeout for responses in seconds
PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
BlacklistPath string // File that has blacklisted strings of "host:port"
WhitelistPath string // File with URLs that should always be crawled regardless of blacklist or robots.txt
DryRun bool // If false, don't write to disk
SkipIdenticalContent bool // When true, skip storing snapshots with identical content
SkipIfUpdatedDays int // Skip re-crawling URLs updated within this many days (0 to disable)
-blacklist-path string
File that has blacklist regexes
-dry-run
Dry run mode
-gopher
Enable crawling of Gopher holes
-log-level string
Logging level (debug, info, warn, error) (default "info")
-max-db-connections int
Maximum number of database connections (default 100)
-max-response-size int
Maximum size of response in bytes (default 1048576)
-pgurl string
Postgres URL
-response-timeout int
Timeout for network responses in seconds (default 10)
-seed-url-path string
File with seed URLs that should be added to the queue immediately
-skip-if-updated-days int
Skip re-crawling URLs updated within this many days (0 to disable) (default 60)
-whitelist-path string
File with URLs that should always be crawled regardless of blacklist
-workers int
Number of concurrent workers (default 1)
```
Example:
```shell
LOG_LEVEL=info \
NUM_OF_WORKERS=10 \
BLACKLIST_PATH="./blacklist.txt" \ # one url per line, can be empty
WHITELIST_PATH="./whitelist.txt" \ # URLs that override blacklist and robots.txt
MAX_RESPONSE_SIZE=10485760 \
RESPONSE_TIMEOUT=10 \
PANIC_ON_UNEXPECTED_ERROR=true \
PG_DATABASE=test \
PG_HOST=127.0.0.1 \
PG_MAX_OPEN_CONNECTIONS=100 \
PG_PORT=5434 \
PG_USER=test \
PG_PASSWORD=test \
DRY_RUN=false \
SKIP_IDENTICAL_CONTENT=false \
SKIP_IF_UPDATED_DAYS=7 \
./gemini-grc
./dist/crawler \
-pgurl="postgres://test:test@127.0.0.1:5434/test?sslmode=disable" \
-log-level=info \
-workers=10 \
-blacklist-path="./blacklist.txt" \
-whitelist-path="./whitelist.txt" \
-max-response-size=10485760 \
-response-timeout=10 \
-max-db-connections=100 \
-skip-if-updated-days=7 \
-gopher \
-seed-url-path="./seed_urls.txt"
```
## Development
@@ -107,6 +116,9 @@ You can access the snapshot history using the included `snapshot_history.sh` scr
Good starting points:
gemini://warmedal.se/~antenna/
gemini://tlgs.one/
gopher://i-logout.cz:70/1/bongusta/
gopher://gopher.quux.org:70/
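The crawler's flag-parsing code is not part of this diff, so as a reference point for the flag list documented above, here is a minimal sketch assuming Go's standard `flag` package. The `config` struct and `parseFlags` helper are hypothetical names used only for illustration; flag names and defaults are taken from the README text above.

```go
package main

import (
	"flag"
	"fmt"
)

// config mirrors the documented command-line flags. The field names are
// illustrative; the crawler's real config struct is not shown in this diff.
type config struct {
	PgURL             string
	LogLevel          string
	Workers           int
	MaxDBConnections  int
	MaxResponseSize   int
	ResponseTimeout   int
	SkipIfUpdatedDays int
	BlacklistPath     string
	WhitelistPath     string
	SeedURLPath       string
	DryRun            bool
	Gopher            bool
}

func parseFlags() config {
	var c config
	flag.StringVar(&c.PgURL, "pgurl", "", "Postgres URL")
	flag.StringVar(&c.LogLevel, "log-level", "info", "Logging level (debug, info, warn, error)")
	flag.IntVar(&c.Workers, "workers", 1, "Number of concurrent workers")
	flag.IntVar(&c.MaxDBConnections, "max-db-connections", 100, "Maximum number of database connections")
	flag.IntVar(&c.MaxResponseSize, "max-response-size", 1048576, "Maximum size of response in bytes")
	flag.IntVar(&c.ResponseTimeout, "response-timeout", 10, "Timeout for network responses in seconds")
	flag.IntVar(&c.SkipIfUpdatedDays, "skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable)")
	flag.StringVar(&c.BlacklistPath, "blacklist-path", "", "File that has blacklist regexes")
	flag.StringVar(&c.WhitelistPath, "whitelist-path", "", "File with URLs that should always be crawled regardless of blacklist")
	flag.StringVar(&c.SeedURLPath, "seed-url-path", "", "File with seed URLs that should be added to the queue immediately")
	flag.BoolVar(&c.DryRun, "dry-run", false, "Dry run mode")
	flag.BoolVar(&c.Gopher, "gopher", false, "Enable crawling of Gopher holes")
	flag.Parse()
	return c
}

func main() {
	c := parseFlags()
	fmt.Printf("starting with %d workers, log level %s\n", c.Workers, c.LogLevel)
}
```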

View File

@@ -148,7 +148,7 @@ func spawnWorkers(total int) {
go func(a int) {
for {
job := <-jobs
common.RunWorkerWithTx(id, job)
common.RunWorkerWithTx(a, job)
}
}(id)
}
@@ -215,7 +215,7 @@ func runJobScheduler() {
common.FatalErrorsChan <- err
return
}
// Commit this tx here so the loop sees the changes.
// Commit this tx here so the loop below sees the changes.
err := tx.Commit()
if err != nil {
common.FatalErrorsChan <- err
@@ -251,14 +251,14 @@ func runJobScheduler() {
// When out of pending URLs, add some random ones.
if len(distinctHosts) == 0 {
// Queue random old URLs from history.
count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers*3, config.CONFIG.SkipIfUpdatedDays)
count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers, config.CONFIG.SkipIfUpdatedDays)
if err != nil {
common.FatalErrorsChan <- err
return
}
if count == 0 {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
time.Sleep(30 * time.Second)
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
time.Sleep(120 * time.Second)
continue
}
distinctHosts, err = gemdb.Database.GetUrlHosts(dbCtx, tx)
@@ -269,7 +269,7 @@ func runJobScheduler() {
}
// Get some URLs from each host, up to a limit
urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, 10, tx)
urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, config.CONFIG.NumOfWorkers, tx)
if err != nil {
common.FatalErrorsChan <- err
return
@@ -282,28 +282,39 @@ func runJobScheduler() {
}
if len(urls) == 0 {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
time.Sleep(30 * time.Second)
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
time.Sleep(120 * time.Second)
continue
}
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Queueing %d distinct hosts -> %d urls to crawl", len(distinctHosts), len(urls))
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%d urls to crawl", len(urls))
// Add jobs to WaitGroup before queuing
common.WorkerWG.Add(len(urls))
for _, url := range urls {
jobs <- url
}
// Wait for all workers to complete their jobs
common.WorkerWG.Wait()
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "All workers done. New scheduler run starts")
logging.LogInfo("")
logging.LogInfo("")
}
}
func enqueueSeedURLs(ctx context.Context, tx *sqlx.Tx) error {
// Get seed URLs from seedList module
urls := seedList.GetSeedURLs()
for _, url := range urls {
err := gemdb.Database.InsertURL(ctx, tx, url)
if err != nil {
return err
}
}
//urls := seedList.GetSeedURLs()
//
//for _, url := range urls {
// err := gemdb.Database.InsertURL(ctx, tx, url)
// if err != nil {
// return err
// }
//}
return nil
}
@@ -332,7 +343,6 @@ func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age in
}
if len(snapshotURLs) == 0 {
contextlog.LogInfoWithContext(historyCtx, logging.GetSlogger(), "No URLs with old latest crawl attempts found to recrawl")
return 0, nil
}
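The worker coordination added in commit 3bdff0e22e shows up in the hunks above as `common.WorkerWG.Add(len(urls))` before queuing, `WorkerWG.Done()` in the worker, and `WorkerWG.Wait()` before the next scheduler iteration. A minimal, self-contained sketch of that pattern, with hypothetical names and simulated work instead of the real crawl-in-a-transaction:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// workerWG plays the role of common.WorkerWG: the scheduler Add()s one count
// per queued job and Wait()s for the whole batch, so scheduler runs never overlap.
var workerWG sync.WaitGroup

func worker(id int, jobs <-chan string) {
	for url := range jobs {
		// Simulate crawling; the real worker runs inside a DB transaction.
		time.Sleep(100 * time.Millisecond)
		fmt.Printf("worker %d done with %s\n", id, url)
		workerWG.Done() // signal the scheduler that this job finished
	}
}

func main() {
	jobs := make(chan string)
	for id := 0; id < 3; id++ {
		go worker(id, jobs)
	}

	// One scheduler iteration: queue a batch, then wait for the whole batch.
	batch := []string{"gemini://example.org/", "gemini://example.net/"}
	workerWG.Add(len(batch)) // add before queuing, as in the diff
	for _, url := range batch {
		jobs <- url
	}
	workerWG.Wait() // all jobs done; the next scheduler run can start
	fmt.Println("batch complete; a new scheduler run would start here")
}
```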

View File

@@ -1,6 +1,9 @@
package common
import "os"
import (
"os"
"sync"
)
// FatalErrorsChan accepts errors from workers.
// In case of fatal error, gracefully
@@ -8,6 +11,7 @@ import "os"
var (
FatalErrorsChan chan error
SignalsChan chan os.Signal
WorkerWG sync.WaitGroup
)
const VERSION string = "0.0.1"

View File

@@ -27,7 +27,6 @@ import (
)
func RunWorkerWithTx(workerID int, job string) {
// Extract host from URL for the context.
parsedURL, err := url2.ParseURL(job, "", true)
if err != nil {
logging.LogInfo("Failed to parse URL: %s Error: %s", job, err)
@@ -40,7 +39,6 @@ func RunWorkerWithTx(workerID int, job string) {
ctx, cancel := contextutil.NewRequestContext(baseCtx, job, host, workerID)
ctx = contextutil.ContextWithComponent(ctx, "worker")
defer cancel() // Ensure the context is cancelled when we're done
// contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "======================================\n\n")
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting worker for URL %s", job)
// Create a new db transaction
@@ -51,6 +49,7 @@ func RunWorkerWithTx(workerID int, job string) {
}
err = runWorker(ctx, tx, []string{job})
WorkerWG.Done()
if err != nil {
// Two cases to handle:
// - context cancellation/timeout errors (log and ignore)
@@ -114,17 +113,11 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
s, err := snapshot.SnapshotFromURL(url, true)
if err != nil {
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to parse URL: %v", err)
return err
}
// We always use the normalized URL
if url != s.URL.Full {
//err = gemdb.Database.CheckAndUpdateNormalizedURL(ctx, tx, url, s.URL.Full)
//if err != nil {
// return err
//}
//contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Normalized URL: %s → %s", url, s.URL.Full)
url = s.URL.Full
}
@@ -147,7 +140,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
// Only check blacklist if URL is not whitelisted
if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches blacklist, ignoring %s", url)
s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error())
return saveSnapshotAndRemoveURL(ctx, tx, s)
}
@@ -159,7 +151,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
// add it as an error and remove url
robotMatch = robotsMatch.RobotMatch(ctx, s.URL.String())
if robotMatch {
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches robots.txt, skipping")
s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error())
return saveSnapshotAndRemoveURL(ctx, tx, s)
}
@@ -184,7 +175,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
}
if err != nil {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Error visiting URL: %v", err)
return err
}
@@ -223,40 +213,32 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
}
}
// Save the snapshot and remove the URL from the queue
if s.Error.ValueOrZero() != "" {
// Only save error if we didn't have any valid
// snapshot data from a previous crawl!
shouldUpdateSnapshot, err := shouldUpdateSnapshotData(ctx, tx, s)
if err != nil {
return err
}
if shouldUpdateSnapshot {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
return saveSnapshotAndRemoveURL(ctx, tx, s)
} else {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s (but old content exists, not updating)", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
return removeURL(ctx, tx, s.URL.String())
}
} else {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero())
return saveSnapshotAndRemoveURL(ctx, tx, s)
}
}
func shouldUpdateSnapshotData(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) {
// If we don't have an error, save the new snapshot.
if !s.Error.Valid {
return true, nil
}
prevSnapshot, err := gemdb.Database.GetLatestSnapshot(ctx, tx, s.URL.String())
if err != nil {
return false, err
}
// If we don't have a previous snapshot, save it anyway.
if prevSnapshot == nil {
return true, nil
}
if prevSnapshot.ResponseCode.Valid {
return false, nil
}
// If we have a previous snapshot,
// and it didn't have an error, save.
// This means that we can have a max
// of one consecutive snapshot with
// an error.
if prevSnapshot.Error.ValueOrZero() == "" {
return true, nil
}
return false, nil
}
func isContentIdentical(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) {
// Always check if content is identical to previous snapshot
@@ -295,11 +277,25 @@ func removeURL(ctx context.Context, tx *sqlx.Tx, url string) error {
}
func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
shouldUpdateSnapshot, err := shouldUpdateSnapshotData(ctx, tx, s)
if err != nil {
return err
}
if shouldUpdateSnapshot {
err := gemdb.Database.SaveSnapshot(ctx, tx, s)
if err != nil {
return err
}
return gemdb.Database.DeleteURL(ctx, tx, s.URL.String())
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero())
return removeURL(ctx, tx, s.URL.String())
} else {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s (updating crawl date)", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
err = gemdb.Database.UpdateLastCrawled(ctx, tx, s.URL.String())
if err != nil {
return err
}
return removeURL(ctx, tx, s.URL.String())
}
}
// shouldPersistURL returns true given URL is a
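Distilled from the hunks above, `saveSnapshotAndRemoveURL` now branches on `shouldUpdateSnapshotData`: either the new snapshot is persisted, or only the `last_crawled` timestamp is refreshed, and in both cases the URL is removed from the queue. A simplified restatement of that control flow, using a hypothetical `store` interface in place of the gemdb methods:

```go
package worker

import "context"

// store abstracts the handful of DB calls used here; the method names mirror
// the gemdb calls in the diff, but this interface itself is hypothetical.
type store interface {
	SaveSnapshot(ctx context.Context, url string, body []byte) error
	UpdateLastCrawled(ctx context.Context, url string) error
	DeleteURL(ctx context.Context, url string) error
}

// finishCrawl captures the branch in saveSnapshotAndRemoveURL: save the new
// snapshot when the error/dedup checks allow it, otherwise only bump
// last_crawled; either way the URL leaves the queue.
func finishCrawl(ctx context.Context, db store, url string, body []byte, shouldUpdate bool) error {
	if shouldUpdate {
		if err := db.SaveSnapshot(ctx, url, body); err != nil {
			return err
		}
	} else {
		if err := db.UpdateLastCrawled(ctx, url); err != nil {
			return err
		}
	}
	return db.DeleteURL(ctx, url)
}
```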

View File

@@ -448,7 +448,7 @@ func (d *DbServiceImpl) GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url
if errors.Is(err, sql.ErrNoRows) {
return nil, nil
}
return nil, xerrors.NewError(fmt.Errorf("cannot get latest snapshot for URL %s: %w", url, err), 0, "", false)
return nil, xerrors.NewError(fmt.Errorf("cannot get latest snapshot for URL %s: %w", url, err), 0, "", true)
}
return s, nil
}

View File

@@ -115,12 +115,7 @@ LIMIT $1
SQL_UPDATE_LAST_CRAWLED = `
UPDATE snapshots
SET last_crawled = CURRENT_TIMESTAMP
WHERE id = (
SELECT id FROM snapshots
WHERE url = $1
ORDER BY timestamp DESC
LIMIT 1
)
`
// SQL_FETCH_SNAPSHOTS_FROM_HISTORY Fetches URLs from snapshots for re-crawling based on last_crawled timestamp
// This query finds root domain URLs that haven't been crawled recently and selects
@@ -137,7 +132,7 @@ LIMIT $1
host,
COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt
FROM snapshots
WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini'
WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini' AND error IS NULL
GROUP BY url, host
),
root_urls_with_content AS (

View File

@@ -0,0 +1,6 @@
select count(*) from snapshots
where last_crawled < now() - interval '30 days'
and error IS NULL
and gemtext IS NOT NULL
and mimetype='text/gemini'
and url ~ '^gemini://[^/]+/?$';
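The new SQL file above looks like a maintenance query counting root capsule pages whose last crawl is older than 30 days. If you prefer running it from Go rather than psql, a small sketch using `database/sql` with the lib/pq driver could look like this; the connection string is just the example DSN from the README and should be adjusted to your setup.

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // Postgres driver; pgx would work as well
)

// staleRootPagesQuery is the query from the new SQL file, verbatim.
const staleRootPagesQuery = `
SELECT count(*) FROM snapshots
WHERE last_crawled < now() - interval '30 days'
  AND error IS NULL
  AND gemtext IS NOT NULL
  AND mimetype = 'text/gemini'
  AND url ~ '^gemini://[^/]+/?$';`

func main() {
	db, err := sql.Open("postgres", "postgres://test:test@127.0.0.1:5434/test?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	var stale int
	if err := db.QueryRow(staleRootPagesQuery).Scan(&stale); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%d root pages have not been crawled in the last 30 days\n", stale)
}
```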