Add robots.txt checking
Still needs periodic cache refresh
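The hunks below call a RobotMatch helper whose implementation is outside this diff. Purely as an illustration (an assumption, not this repository's actual code), a per-host cached robots.txt disallow matcher might look like the sketch below; the names robotsCache, disallowedPrefixes and robotMatch are invented here, only the "User-agent: *" group is honoured, and entries are never refreshed once cached, which is the periodic cache refresh the commit message notes is still missing.

package main

import (
	"bufio"
	"fmt"
	"net/http"
	"net/url"
	"strings"
	"sync"
)

// robotsCache maps a host to the Disallow prefixes that apply to all
// user agents. Entries are filled on first use and never expire, so a
// periodic refresh (e.g. clearing the map from a time.Ticker goroutine)
// would still need to be added.
var (
	robotsMu    sync.RWMutex
	robotsCache = map[string][]string{}
)

// disallowedPrefixes returns the cached Disallow rules for a host,
// fetching and parsing robots.txt on the first call. A failed fetch is
// cached as "no rules", i.e. the host is treated as fully allowed.
func disallowedPrefixes(host string) []string {
	robotsMu.RLock()
	rules, ok := robotsCache[host]
	robotsMu.RUnlock()
	if ok {
		return rules
	}

	resp, err := http.Get("https://" + host + "/robots.txt")
	if err == nil {
		defer resp.Body.Close()
		if resp.StatusCode == http.StatusOK {
			applies := false
			scanner := bufio.NewScanner(resp.Body)
			for scanner.Scan() {
				line := scanner.Text()
				if i := strings.Index(line, "#"); i >= 0 {
					line = line[:i] // strip comments
				}
				parts := strings.SplitN(line, ":", 2)
				if len(parts) != 2 {
					continue
				}
				key := strings.ToLower(strings.TrimSpace(parts[0]))
				val := strings.TrimSpace(parts[1])
				switch key {
				case "user-agent":
					// Only honour the wildcard group in this sketch.
					applies = val == "*"
				case "disallow":
					if applies && val != "" {
						rules = append(rules, val)
					}
				}
			}
		}
	}

	robotsMu.Lock()
	robotsCache[host] = rules
	robotsMu.Unlock()
	return rules
}

// robotMatch reports whether the URL's path falls under a Disallow prefix.
func robotMatch(rawURL string) bool {
	u, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	for _, prefix := range disallowedPrefixes(u.Hostname()) {
		if strings.HasPrefix(u.Path, prefix) {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(robotMatch("https://example.com/private/page"))
}

The actual RobotMatch in the diff takes a *Snapshot rather than a raw URL and presumably consults the crawler's own cache; the sketch only shows the general shape of the check.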
@@ -30,12 +30,17 @@ func printPoolIPs() {
}

func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	// Wrap errors with more info.
	defer func() {
		if err != nil {
			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
		}
	}()

	// If URL matches a robots.txt disallow line,
	// add it as an error so next time it won't be
	// crawled.
	if RobotMatch(s) {
		s.Error = null.StringFrom("robots.txt disallow match")
		err = SaveSnapshotToDB(tx, s)
		if err != nil {
			return fmt.Errorf("[%d] DB Error: %w", id, err)
		}
		return nil
	}

	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
@@ -88,19 +93,19 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
	if s.Links != nil {
		var batchSnapshots []*Snapshot
		timestamp := null.TimeFrom(time.Now())

		for _, link := range *s.Links {
			if shouldPersistURL(tx, link) {
				newSnapshot := &Snapshot{
					UID:       uid.UID(),
					URL:       link,
					Host:      link.Hostname,
					Timestamp: timestamp,
				}
				batchSnapshots = append(batchSnapshots, newSnapshot)
			}
		}

		if len(batchSnapshots) > 0 {
			logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
			err = SaveLinksToDB(tx, batchSnapshots)
@@ -228,9 +233,6 @@ func runWorker(id int, db *sqlx.DB) {
	}
	total := len(snapshots)
	for i, s := range snapshots {
		if InBlacklist(&s) {
			logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
		}
		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
		err = workOnSnapshot(id, tx, &s)
		if err != nil {