Optimize worker random snapshot selection
This commit is contained in:
@@ -47,8 +47,8 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the host's ip is in the pool, stop
|
// If the host's IP is in the connections pool,
|
||||||
// and add the url in the queue later.
|
// stop and add the URL to the queue later.
|
||||||
IpPool.Lock.RLock()
|
IpPool.Lock.RLock()
|
||||||
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL)
|
logging.LogDebug("[%d] [%s] Checking pool for IP", id, s.URL)
|
||||||
for _, ip := range IPs {
|
for _, ip := range IPs {
|
||||||
@@ -100,6 +100,7 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Should we save the given URL for crawling?
|
||||||
func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool {
|
func shouldPersistURL(tx *sqlx.Tx, u GeminiUrl) bool {
|
||||||
if !strings.HasPrefix(u.String(), "gemini://") {
|
if !strings.HasPrefix(u.String(), "gemini://") {
|
||||||
return false
|
return false
|
||||||
@@ -154,12 +155,29 @@ func GetRandomSnapshots(tx *sqlx.Tx) ([]Snapshot, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
func GetRandomSnapshotsDistinctHosts(tx *sqlx.Tx) ([]Snapshot, error) {
|
||||||
|
// Old, unoptimized query
|
||||||
|
//
|
||||||
|
// query := `
|
||||||
|
// SELECT DISTINCT ON (host) *
|
||||||
|
// FROM snapshots
|
||||||
|
// WHERE response_code IS NULL
|
||||||
|
// AND error IS NULL
|
||||||
|
// ORDER BY host, RANDOM()
|
||||||
|
// LIMIT $1
|
||||||
|
// `
|
||||||
query := `
|
query := `
|
||||||
SELECT DISTINCT ON (host) *
|
WITH RankedSnapshots AS (
|
||||||
|
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
|
||||||
|
links, lang, response_code, error,
|
||||||
|
ROW_NUMBER() OVER (PARTITION BY host ORDER BY RANDOM()) as rn
|
||||||
FROM snapshots
|
FROM snapshots
|
||||||
WHERE response_code IS NULL
|
WHERE response_code IS NULL
|
||||||
AND error IS NULL
|
AND error IS NULL
|
||||||
ORDER BY host, RANDOM()
|
)
|
||||||
|
SELECT id, uid, url, host, timestamp, mimetype, data, gemtext,
|
||||||
|
links, lang, response_code, error
|
||||||
|
FROM RankedSnapshots
|
||||||
|
WHERE rn = 1
|
||||||
LIMIT $1
|
LIMIT $1
|
||||||
`
|
`
|
||||||
var snapshots []Snapshot
|
var snapshots []Snapshot
|
||||||
@@ -199,7 +217,7 @@ func runWorker(id int, db *sqlx.DB) {
|
|||||||
total := len(snapshots)
|
total := len(snapshots)
|
||||||
for i, s := range snapshots {
|
for i, s := range snapshots {
|
||||||
if InBlacklist(&s) {
|
if InBlacklist(&s) {
|
||||||
logging.LogWarn("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
|
logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
|
||||||
}
|
}
|
||||||
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
|
logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
|
||||||
err = workOnSnapshot(id, tx, &s)
|
err = workOnSnapshot(id, tx, &s)
|
||||||
|
|||||||
Reference in New Issue
Block a user