Compare commits
7 Commits
8bbe6efabc ... 0db2557cfc

0db2557cfc
db3448f448
9a09dd7735
0f62b0c622
3bdff0e22e
a74f29d7b0
ffeef334e7
README.md (76 changed lines)
@@ -4,13 +4,13 @@ A crawler for the [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) netw
 Easily extendable as a "wayback machine" of Gemini.
 
 ## Features
 
-- [x] Save image/* and text/* files
 - [x] Concurrent downloading with configurable number of workers
+- [x] Save image/* and text/* files
 - [x] Connection limit per host
 - [x] URL Blacklist
 - [x] URL Whitelist (overrides blacklist and robots.txt)
 - [x] Follow robots.txt, see gemini://geminiprotocol.net/docs/companion/robots.gmi
-- [x] Configuration via environment variables
+- [x] Configuration via command-line flags
 - [x] Storing capsule snapshots in PostgreSQL
 - [x] Proper response header & body UTF-8 and format validation
 - [x] Proper URL normalization
@@ -22,46 +22,55 @@ This crawler uses `InsecureSkipVerify: true` in TLS configuration to accept all
 
 ## How to run
 
-Spin up a PostgreSQL, check `db/sql/initdb.sql` to create the tables and start the crawler.
-All configuration is done via environment variables.
+Spin up a PostgreSQL, check `misc/sql/initdb.sql` to create the tables and start the crawler.
+All configuration is done via command-line flags.
 
 ## Configuration
 
-Bool can be `true`,`false` or `0`,`1`.
+Available command-line flags:
 
 ```text
-LogLevel string // Logging level (debug, info, warn, error)
-MaxResponseSize int // Maximum size of response in bytes
-NumOfWorkers int // Number of concurrent workers
-ResponseTimeout int // Timeout for responses in seconds
-PanicOnUnexpectedError bool // Panic on unexpected errors when visiting a URL
-BlacklistPath string // File that has blacklisted strings of "host:port"
-WhitelistPath string // File with URLs that should always be crawled regardless of blacklist or robots.txt
-DryRun bool // If false, don't write to disk
-SkipIdenticalContent bool // When true, skip storing snapshots with identical content
-SkipIfUpdatedDays int // Skip re-crawling URLs updated within this many days (0 to disable)
+-blacklist-path string
+      File that has blacklist regexes
+-dry-run
+      Dry run mode
+-gopher
+      Enable crawling of Gopher holes
+-log-level string
+      Logging level (debug, info, warn, error) (default "info")
+-max-db-connections int
+      Maximum number of database connections (default 100)
+-max-response-size int
+      Maximum size of response in bytes (default 1048576)
+-pgurl string
+      Postgres URL
+-response-timeout int
+      Timeout for network responses in seconds (default 10)
+-seed-url-path string
+      File with seed URLs that should be added to the queue immediately
+-skip-if-updated-days int
+      Skip re-crawling URLs updated within this many days (0 to disable) (default 60)
+-whitelist-path string
+      File with URLs that should always be crawled regardless of blacklist
+-workers int
+      Number of concurrent workers (default 1)
 ```
 
 Example:
 
 ```shell
-LOG_LEVEL=info \
-NUM_OF_WORKERS=10 \
-BLACKLIST_PATH="./blacklist.txt" \ # one url per line, can be empty
-WHITELIST_PATH="./whitelist.txt" \ # URLs that override blacklist and robots.txt
-MAX_RESPONSE_SIZE=10485760 \
-RESPONSE_TIMEOUT=10 \
-PANIC_ON_UNEXPECTED_ERROR=true \
-PG_DATABASE=test \
-PG_HOST=127.0.0.1 \
-PG_MAX_OPEN_CONNECTIONS=100 \
-PG_PORT=5434 \
-PG_USER=test \
-PG_PASSWORD=test \
-DRY_RUN=false \
-SKIP_IDENTICAL_CONTENT=false \
-SKIP_IF_UPDATED_DAYS=7 \
-./gemini-grc
+./dist/crawler \
+-pgurl="postgres://test:test@127.0.0.1:5434/test?sslmode=disable" \
+-log-level=info \
+-workers=10 \
+-blacklist-path="./blacklist.txt" \
+-whitelist-path="./whitelist.txt" \
+-max-response-size=10485760 \
+-response-timeout=10 \
+-max-db-connections=100 \
+-skip-if-updated-days=7 \
+-gopher \
+-seed-url-path="./seed_urls.txt"
 ```
 
 ## Development
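The README now documents `flag`-style help text, but the flag-definition code itself is not part of this comparison. A minimal sketch of how these flags could be declared with Go's standard `flag` package, using the names and defaults from the help text above (the `Config` struct and `parseFlags` function are illustrative, not the crawler's actual types):

```go
package main

import (
    "flag"
    "fmt"
)

// Config mirrors the command-line flags listed in the README.
// Struct and field names here are hypothetical.
type Config struct {
    BlacklistPath     string
    DryRun            bool
    Gopher            bool
    LogLevel          string
    MaxDBConnections  int
    MaxResponseSize   int
    PgURL             string
    ResponseTimeout   int
    SeedURLPath       string
    SkipIfUpdatedDays int
    WhitelistPath     string
    Workers           int
}

func parseFlags() *Config {
    c := &Config{}
    flag.StringVar(&c.BlacklistPath, "blacklist-path", "", "File that has blacklist regexes")
    flag.BoolVar(&c.DryRun, "dry-run", false, "Dry run mode")
    flag.BoolVar(&c.Gopher, "gopher", false, "Enable crawling of Gopher holes")
    flag.StringVar(&c.LogLevel, "log-level", "info", "Logging level (debug, info, warn, error)")
    flag.IntVar(&c.MaxDBConnections, "max-db-connections", 100, "Maximum number of database connections")
    flag.IntVar(&c.MaxResponseSize, "max-response-size", 1048576, "Maximum size of response in bytes")
    flag.StringVar(&c.PgURL, "pgurl", "", "Postgres URL")
    flag.IntVar(&c.ResponseTimeout, "response-timeout", 10, "Timeout for network responses in seconds")
    flag.StringVar(&c.SeedURLPath, "seed-url-path", "", "File with seed URLs that should be added to the queue immediately")
    flag.IntVar(&c.SkipIfUpdatedDays, "skip-if-updated-days", 60, "Skip re-crawling URLs updated within this many days (0 to disable)")
    flag.StringVar(&c.WhitelistPath, "whitelist-path", "", "File with URLs that should always be crawled regardless of blacklist")
    flag.IntVar(&c.Workers, "workers", 1, "Number of concurrent workers")
    flag.Parse()
    return c
}

func main() {
    cfg := parseFlags()
    fmt.Printf("starting with %d workers, log level %s\n", cfg.Workers, cfg.LogLevel)
}
```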
@@ -107,6 +116,9 @@ You can access the snapshot history using the included `snapshot_history.sh` scr
 Good starting points:
 
 gemini://warmedal.se/~antenna/
 
 gemini://tlgs.one/
 
+gopher://i-logout.cz:70/1/bongusta/
+
+gopher://gopher.quux.org:70/
@@ -148,7 +148,7 @@ func spawnWorkers(total int) {
         go func(a int) {
             for {
                 job := <-jobs
-                common.RunWorkerWithTx(id, job)
+                common.RunWorkerWithTx(a, job)
             }
         }(id)
     }
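This one-character fix (`id` to `a`) addresses the classic Go closure-capture pitfall: before Go 1.22, all iterations of a loop shared a single loop variable, so a goroutine had to receive the current value as an argument, exactly as `}(id)` already does here. A standalone illustration with generic names, not code from this repo:

```go
package main

import (
    "fmt"
    "sync"
)

func main() {
    var wg sync.WaitGroup

    // Buggy pattern (pre-Go 1.22): every goroutine reads the shared
    // loop variable, which has usually advanced by the time it runs.
    // With Go < 1.22 this often prints 3, 3, 3; Go 1.22 made loop
    // variables per-iteration, hiding the bug.
    for id := 0; id < 3; id++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            fmt.Println("buggy worker sees id =", id)
        }()
    }
    wg.Wait()

    // Fixed pattern: pass the current value as a parameter, matching
    // the crawler's `go func(a int) { ... }(id)`.
    for id := 0; id < 3; id++ {
        wg.Add(1)
        go func(a int) {
            defer wg.Done()
            fmt.Println("fixed worker sees id =", a) // 0, 1, 2 in some order
        }(id)
    }
    wg.Wait()
}
```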
@@ -215,7 +215,7 @@ func runJobScheduler() {
             common.FatalErrorsChan <- err
             return
         }
-        // Commit this tx here so the loop sees the changes.
+        // Commit this tx here so the loop below sees the changes.
         err := tx.Commit()
         if err != nil {
             common.FatalErrorsChan <- err
@@ -251,14 +251,14 @@ func runJobScheduler() {
         // When out of pending URLs, add some random ones.
         if len(distinctHosts) == 0 {
             // Queue random old URLs from history.
-            count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers*3, config.CONFIG.SkipIfUpdatedDays)
+            count, err := fetchSnapshotsFromHistory(dbCtx, tx, config.CONFIG.NumOfWorkers, config.CONFIG.SkipIfUpdatedDays)
             if err != nil {
                 common.FatalErrorsChan <- err
                 return
             }
             if count == 0 {
-                contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
-                time.Sleep(30 * time.Second)
+                contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
+                time.Sleep(120 * time.Second)
                 continue
             }
             distinctHosts, err = gemdb.Database.GetUrlHosts(dbCtx, tx)
@@ -269,7 +269,7 @@ func runJobScheduler() {
         }
 
         // Get some URLs from each host, up to a limit
-        urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, 10, tx)
+        urls, err := gemdb.Database.GetRandomUrlsFromHosts(dbCtx, distinctHosts, config.CONFIG.NumOfWorkers, tx)
         if err != nil {
             common.FatalErrorsChan <- err
             return
@@ -282,28 +282,39 @@ func runJobScheduler() {
         }
 
         if len(urls) == 0 {
-            contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
-            time.Sleep(30 * time.Second)
+            contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "No work, waiting to poll DB...")
+            time.Sleep(120 * time.Second)
             continue
         }
 
-        contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Queueing %d distinct hosts -> %d urls to crawl", len(distinctHosts), len(urls))
+        contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%d urls to crawl", len(urls))
 
+        // Add jobs to WaitGroup before queuing
+        common.WorkerWG.Add(len(urls))
+
         for _, url := range urls {
             jobs <- url
         }
 
+        // Wait for all workers to complete their jobs
+        common.WorkerWG.Wait()
+
+        contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "All workers done. New scheduler run starts")
+        logging.LogInfo("")
+        logging.LogInfo("")
     }
 }
 
 func enqueueSeedURLs(ctx context.Context, tx *sqlx.Tx) error {
     // Get seed URLs from seedList module
-    urls := seedList.GetSeedURLs()
-    for _, url := range urls {
-        err := gemdb.Database.InsertURL(ctx, tx, url)
-        if err != nil {
-            return err
-        }
-    }
+    //urls := seedList.GetSeedURLs()
+    //
+    //for _, url := range urls {
+    //    err := gemdb.Database.InsertURL(ctx, tx, url)
+    //    if err != nil {
+    //        return err
+    //    }
+    //}
     return nil
 }
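The scheduler now works in discrete rounds: it registers the whole batch with a `WaitGroup`, queues it, and blocks until every worker reports back before selecting the next batch. A minimal self-contained sketch of this batch-and-wait pattern, with a package-level `workerWG` standing in for `common.WorkerWG`:

```go
package main

import (
    "fmt"
    "sync"
    "time"
)

var workerWG sync.WaitGroup

func worker(id int, jobs <-chan string) {
    for job := range jobs {
        time.Sleep(10 * time.Millisecond) // stand-in for crawling the URL
        fmt.Printf("worker %d finished %s\n", id, job)
        workerWG.Done() // one Done per queued job, as in RunWorkerWithTx
    }
}

func main() {
    jobs := make(chan string)
    for i := 0; i < 3; i++ {
        go worker(i, jobs)
    }

    for run := 0; run < 2; run++ {
        urls := []string{"gemini://a/", "gemini://b/", "gemini://c/"}

        // Add jobs to the WaitGroup before queuing, so Wait()
        // cannot return before the last job is even submitted.
        workerWG.Add(len(urls))
        for _, u := range urls {
            jobs <- u
        }

        // Block until every worker has called Done.
        workerWG.Wait()
        fmt.Println("All workers done. New scheduler run starts")
    }
}
```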
@@ -332,7 +343,6 @@ func fetchSnapshotsFromHistory(ctx context.Context, tx *sqlx.Tx, num int, age in
     }
 
     if len(snapshotURLs) == 0 {
-        contextlog.LogInfoWithContext(historyCtx, logging.GetSlogger(), "No URLs with old latest crawl attempts found to recrawl")
         return 0, nil
     }
 
@@ -1,6 +1,9 @@
 package common
 
-import "os"
+import (
+    "os"
+    "sync"
+)
 
 // FatalErrorsChan accepts errors from workers.
 // In case of fatal error, gracefully
@@ -8,6 +11,7 @@ import "os"
 var (
     FatalErrorsChan chan error
     SignalsChan     chan os.Signal
+    WorkerWG        sync.WaitGroup
 )
 
 const VERSION string = "0.0.1"
@@ -27,7 +27,6 @@ import (
 )
 
 func RunWorkerWithTx(workerID int, job string) {
-    // Extract host from URL for the context.
     parsedURL, err := url2.ParseURL(job, "", true)
     if err != nil {
         logging.LogInfo("Failed to parse URL: %s Error: %s", job, err)
@@ -40,7 +39,6 @@ func RunWorkerWithTx(workerID int, job string) {
     ctx, cancel := contextutil.NewRequestContext(baseCtx, job, host, workerID)
     ctx = contextutil.ContextWithComponent(ctx, "worker")
     defer cancel() // Ensure the context is cancelled when we're done
-    // contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "======================================\n\n")
     contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Starting worker for URL %s", job)
 
     // Create a new db transaction
@@ -51,6 +49,7 @@ func RunWorkerWithTx(workerID int, job string) {
     }
 
     err = runWorker(ctx, tx, []string{job})
+    WorkerWG.Done()
     if err != nil {
         // Two cases to handle:
         // - context cancellation/timeout errors (log and ignore)
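`WorkerWG.Done()` is called on the line after `runWorker` returns rather than deferred, so a panic inside `runWorker` would leave the scheduler's `Wait()` blocked. A deferred variant (shown below as an assumption about an alternative, not what this diff does) would release the WaitGroup on any exit path:

```go
package main

import "sync"

var workerWG sync.WaitGroup

// processJob sketches the defer-based variant; the diff instead
// calls Done() explicitly right after runWorker returns.
func processJob(job string, run func(string) error) {
    defer workerWG.Done() // runs even if run() panics
    _ = run(job)          // error handling elided in this sketch
}

func main() {
    workerWG.Add(1)
    go processJob("gemini://example.org/", func(string) error { return nil })
    workerWG.Wait()
}
```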
@@ -114,17 +113,11 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
 
     s, err := snapshot.SnapshotFromURL(url, true)
     if err != nil {
-        contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to parse URL: %v", err)
         return err
     }
 
     // We always use the normalized URL
     if url != s.URL.Full {
-        //err = gemdb.Database.CheckAndUpdateNormalizedURL(ctx, tx, url, s.URL.Full)
-        //if err != nil {
-        //    return err
-        //}
-        //contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Normalized URL: %s → %s", url, s.URL.Full)
         url = s.URL.Full
     }
 
@@ -147,7 +140,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
 
     // Only check blacklist if URL is not whitelisted
     if !isUrlWhitelisted && blackList.IsBlacklisted(s.URL.String()) {
-        contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches blacklist, ignoring %s", url)
         s.Error = null.StringFrom(commonErrors.ErrBlacklistMatch.Error())
         return saveSnapshotAndRemoveURL(ctx, tx, s)
     }
@@ -159,7 +151,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
     // add it as an error and remove url
     robotMatch = robotsMatch.RobotMatch(ctx, s.URL.String())
     if robotMatch {
-        contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "URL matches robots.txt, skipping")
         s.Error = null.StringFrom(commonErrors.ErrRobotsMatch.Error())
         return saveSnapshotAndRemoveURL(ctx, tx, s)
     }
@@ -184,7 +175,6 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
     }
 
     if err != nil {
-        contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Error visiting URL: %v", err)
         return err
     }
 
@@ -223,39 +213,31 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
         }
     }
 
-    // Save the snapshot and remove the URL from the queue
-    if s.Error.ValueOrZero() != "" {
-        // Only save error if we didn't have any valid
-        // snapshot data from a previous crawl!
-        shouldUpdateSnapshot, err := shouldUpdateSnapshotData(ctx, tx, s)
-        if err != nil {
-            return err
-        }
-        if shouldUpdateSnapshot {
-            contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
-            return saveSnapshotAndRemoveURL(ctx, tx, s)
-        } else {
-            contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s (but old content exists, not updating)", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
-            return removeURL(ctx, tx, s.URL.String())
-        }
-    } else {
-        contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero())
-        return saveSnapshotAndRemoveURL(ctx, tx, s)
-    }
+    return saveSnapshotAndRemoveURL(ctx, tx, s)
 }
 
 func shouldUpdateSnapshotData(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) {
+    // If we don't have an error, save the new snapshot.
+    if !s.Error.Valid {
+        return true, nil
+    }
     prevSnapshot, err := gemdb.Database.GetLatestSnapshot(ctx, tx, s.URL.String())
     if err != nil {
         return false, err
     }
+    // If we don't have a previous snapshot, save it anyway.
     if prevSnapshot == nil {
         return true, nil
     }
-    if prevSnapshot.ResponseCode.Valid {
-        return false, nil
+    // If we have a previous snapshot,
+    // and it didn't have an error, save.
+    // This means that we can have a max
+    // of one consecutive snapshot with
+    // an error.
+    if prevSnapshot.Error.ValueOrZero() == "" {
+        return true, nil
     }
-    return true, nil
+    return false, nil
 }
 
 func isContentIdentical(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) {
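The rewritten `shouldUpdateSnapshotData` enforces the invariant spelled out in its comments: at most one consecutive error snapshot per URL, because a new error is persisted only when the latest stored snapshot was clean. A table-driven sketch of that decision logic, with the `null.String` fields and database lookup reduced to plain values:

```go
package main

import "fmt"

// decide mirrors the new shouldUpdateSnapshotData logic:
// newErr is the current snapshot's error ("" = none);
// hasPrev and prevErr describe the latest stored snapshot.
func decide(newErr string, hasPrev bool, prevErr string) bool {
    if newErr == "" {
        return true // no error: always save the new snapshot
    }
    if !hasPrev {
        return true // no previous snapshot: save the error anyway
    }
    // A previous snapshot exists: only persist the error if the
    // previous one was clean, capping consecutive error snapshots at one.
    return prevErr == ""
}

func main() {
    cases := []struct {
        newErr  string
        hasPrev bool
        prevErr string
    }{
        {"", false, ""},              // clean crawl, no history -> save
        {"timeout", false, ""},       // error, no history -> save
        {"timeout", true, ""},        // error after clean snapshot -> save
        {"timeout", true, "refused"}, // error after error -> skip
    }
    for _, c := range cases {
        fmt.Printf("%+v -> save=%v\n", c, decide(c.newErr, c.hasPrev, c.prevErr))
    }
}
```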
@@ -295,11 +277,25 @@ func removeURL(ctx context.Context, tx *sqlx.Tx, url string) error {
 }
 
 func saveSnapshotAndRemoveURL(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
-    err := gemdb.Database.SaveSnapshot(ctx, tx, s)
+    shouldUpdateSnapshot, err := shouldUpdateSnapshotData(ctx, tx, s)
     if err != nil {
         return err
     }
-    return gemdb.Database.DeleteURL(ctx, tx, s.URL.String())
+    if shouldUpdateSnapshot {
+        err := gemdb.Database.SaveSnapshot(ctx, tx, s)
+        if err != nil {
+            return err
+        }
+        contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d", s.ResponseCode.ValueOrZero())
+        return removeURL(ctx, tx, s.URL.String())
+    } else {
+        contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "%2d %s (updating crawl date)", s.ResponseCode.ValueOrZero(), s.Error.ValueOrZero())
+        err = gemdb.Database.UpdateLastCrawled(ctx, tx, s.URL.String())
+        if err != nil {
+            return err
+        }
+        return removeURL(ctx, tx, s.URL.String())
+    }
 }
 
 // shouldPersistURL returns true given URL is a
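With the branch moved into `saveSnapshotAndRemoveURL`, every caller now gets the same policy: persist the snapshot when `shouldUpdateSnapshotData` says so, otherwise only refresh the crawl date, and in both cases dequeue the URL. A stubbed-out sketch of that control flow (function bodies here are placeholders, not the real implementations):

```go
package main

import "fmt"

type snapshotStub struct{ url string }

func shouldUpdate(s snapshotStub) (bool, error) { return true, nil } // stand-in for shouldUpdateSnapshotData
func saveSnapshot(s snapshotStub) error         { fmt.Println("save snapshot for", s.url); return nil }
func updateLastCrawled(url string) error        { fmt.Println("refresh last_crawled for", url); return nil }
func removeURL(url string) error                { fmt.Println("dequeue", url); return nil }

func saveSnapshotAndRemoveURL(s snapshotStub) error {
    shouldUpdateSnapshot, err := shouldUpdate(s)
    if err != nil {
        return err
    }
    if shouldUpdateSnapshot {
        if err := saveSnapshot(s); err != nil {
            return err
        }
    } else {
        // Keep the old content; just record that we tried again now.
        if err := updateLastCrawled(s.url); err != nil {
            return err
        }
    }
    return removeURL(s.url)
}

func main() {
    _ = saveSnapshotAndRemoveURL(snapshotStub{url: "gemini://example.org/"})
}
```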
db/db.go (2 changed lines)
@@ -448,7 +448,7 @@ func (d *DbServiceImpl) GetLatestSnapshot(ctx context.Context, tx *sqlx.Tx, url
         if errors.Is(err, sql.ErrNoRows) {
             return nil, nil
         }
-        return nil, xerrors.NewError(fmt.Errorf("cannot get latest snapshot for URL %s: %w", url, err), 0, "", false)
+        return nil, xerrors.NewError(fmt.Errorf("cannot get latest snapshot for URL %s: %w", url, err), 0, "", true)
     }
     return s, nil
 }
@@ -115,12 +115,7 @@ LIMIT $1
     SQL_UPDATE_LAST_CRAWLED = `
 UPDATE snapshots
 SET last_crawled = CURRENT_TIMESTAMP
-WHERE id = (
-    SELECT id FROM snapshots
-    WHERE url = $1
-    ORDER BY timestamp DESC
-    LIMIT 1
-)
+WHERE url = $1
 `
     // SQL_FETCH_SNAPSHOTS_FROM_HISTORY Fetches URLs from snapshots for re-crawling based on last_crawled timestamp
     // This query finds root domain URLs that haven't been crawled recently and selects
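Worth noting: the old statement updated only the newest snapshot row for a URL (via the `ORDER BY timestamp DESC LIMIT 1` subquery), while the new `WHERE url = $1` touches every snapshot row for that URL. A hypothetical call site showing how the simplified statement might be executed with sqlx (the function name and package are assumptions, not code from this diff):

```go
package db

import (
    "context"

    "github.com/jmoiron/sqlx"
)

// SQL_UPDATE_LAST_CRAWLED as it reads after this diff.
const SQL_UPDATE_LAST_CRAWLED = `
UPDATE snapshots
SET last_crawled = CURRENT_TIMESTAMP
WHERE url = $1
`

// updateLastCrawled executes the statement inside the crawler's
// transaction. Note that WHERE url = $1 now updates every snapshot
// row for the URL, not just the newest one as the old subquery did.
func updateLastCrawled(ctx context.Context, tx *sqlx.Tx, url string) error {
    _, err := tx.ExecContext(ctx, SQL_UPDATE_LAST_CRAWLED, url)
    return err
}
```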
@@ -137,7 +132,7 @@ LIMIT $1
         host,
         COALESCE(MAX(last_crawled), '1970-01-01'::timestamp) as latest_attempt
     FROM snapshots
-    WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini'
+    WHERE url ~ '^gemini://[^/]+/?$' AND mimetype = 'text/gemini' AND error IS NULL
     GROUP BY url, host
 ),
 root_urls_with_content AS (
misc/sql/fetch-snapshot-history.sql (new file, 6 lines)
@@ -0,0 +1,6 @@
+select count(*) from snapshots
+where last_crawled < now() - interval '30 days'
+and error IS NULL
+and gemtext IS NOT NULL
+and mimetype='text/gemini'
+and url ~ '^gemini://[^/]+/?$';
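This new file mirrors the recrawl criteria used by `SQL_FETCH_SNAPSHOTS_FROM_HISTORY` and reads like an ad-hoc way to count root capsule pages due for a recrawl. One way to run it from Go, reusing the Postgres URL from the README example (the `pgx` driver choice is an assumption; the repo's actual driver is not shown in this diff):

```go
package main

import (
    "fmt"
    "os"

    _ "github.com/jackc/pgx/v5/stdlib"
    "github.com/jmoiron/sqlx"
)

func main() {
    // Connection string taken from the README example; adjust as needed.
    db, err := sqlx.Connect("pgx", "postgres://test:test@127.0.0.1:5434/test?sslmode=disable")
    if err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
    defer db.Close()

    // The query from misc/sql/fetch-snapshot-history.sql, verbatim.
    query := `
select count(*) from snapshots
where last_crawled < now() - interval '30 days'
and error IS NULL
and gemtext IS NOT NULL
and mimetype='text/gemini'
and url ~ '^gemini://[^/]+/?$';`

    var due int
    if err := db.Get(&due, query); err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
    fmt.Printf("%d root pages due for recrawl\n", due)
}
```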