Add robots.txt checking

Still needs periodic cache refresh
2024-10-23 14:24:10 +03:00
parent c49a69728a
commit 561f83a007
7 changed files with 114 additions and 42 deletions
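
RobotMatch does the actual disallow lookup; its implementation isn't shown in the hunks below. A minimal sketch of what a cached per-host check might look like (the function and cache names here are illustrative, not taken from the repository), with the missing periodic refresh noted as a TODO:

package robots

import (
	"bufio"
	"net/http"
	"strings"
	"sync"
)

var (
	mu    sync.Mutex
	cache = map[string][]string{} // host -> Disallow path prefixes for "User-agent: *"
)

// disallowedPrefixes fetches and parses robots.txt for a host once and
// serves later lookups from the in-memory cache.
// TODO: entries are never refreshed; add a periodic cache refresh.
func disallowedPrefixes(host string) []string {
	mu.Lock()
	defer mu.Unlock()
	if rules, ok := cache[host]; ok {
		return rules
	}
	var rules []string
	resp, err := http.Get("https://" + host + "/robots.txt")
	if err == nil {
		if resp.StatusCode == http.StatusOK {
			applies := false
			sc := bufio.NewScanner(resp.Body)
			for sc.Scan() {
				line := strings.TrimSpace(sc.Text())
				lower := strings.ToLower(line)
				switch {
				case strings.HasPrefix(lower, "user-agent:"):
					applies = strings.TrimSpace(line[len("user-agent:"):]) == "*"
				case applies && strings.HasPrefix(lower, "disallow:"):
					if p := strings.TrimSpace(line[len("disallow:"):]); p != "" {
						rules = append(rules, p)
					}
				}
			}
		}
		resp.Body.Close()
	}
	cache[host] = rules // cache misses too, so a host is fetched at most once
	return rules
}

// robotMatch reports whether a URL path hits a Disallow rule for its host.
func robotMatch(host, path string) bool {
	for _, prefix := range disallowedPrefixes(host) {
		if strings.HasPrefix(path, prefix) {
			return true
		}
	}
	return false
}

Holding the lock across the fetch keeps the sketch short; a production version would more likely fetch outside the lock and honor per-agent groups.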


@@ -30,12 +30,17 @@ func printPoolIPs() {
 }
 
 func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
-	// Wrap errors with more info.
-	defer func() {
+	// If URL matches a robots.txt disallow line,
+	// add it as an error so next time it won't be
+	// crawled.
+	if RobotMatch(s) {
+		s.Error = null.StringFrom("robots.txt disallow match")
+		err = SaveSnapshotToDB(tx, s)
 		if err != nil {
-			err = fmt.Errorf("[%d] Worker Error: %w", id, err)
+			return fmt.Errorf("[%d] DB Error: %w", id, err)
 		}
-	}()
+		return nil
+	}
 
 	IPs, err := getHostIPAddresses(s.Host)
 	if err != nil {
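
How a snapshot saved with that error stays off the crawl queue isn't visible here; presumably the worker's query only picks rows whose error column is still empty. A hypothetical helper under that assumption (the table name, column name, and PostgreSQL-style placeholder are guesses, not taken from the repository):

// getPendingSnapshots returns snapshots not yet marked with an error,
// so rows saved with "robots.txt disallow match" are skipped on later runs.
// Hypothetical sketch; assumes the project's Snapshot type and sqlx.
func getPendingSnapshots(db *sqlx.DB, limit int) ([]Snapshot, error) {
	var snapshots []Snapshot
	err := db.Select(&snapshots,
		`SELECT * FROM snapshots WHERE error IS NULL LIMIT $1`, limit)
	return snapshots, err
}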
@@ -88,19 +93,19 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
 	if s.Links != nil {
 		var batchSnapshots []*Snapshot
+		timestamp := null.TimeFrom(time.Now())
 		for _, link := range *s.Links {
 			if shouldPersistURL(tx, link) {
 				newSnapshot := &Snapshot{
-					UID:  uid.UID(),
-					URL:  link,
-					Host: link.Hostname,
+					UID:       uid.UID(),
+					URL:       link,
+					Host:      link.Hostname,
+					Timestamp: timestamp,
 				}
 				batchSnapshots = append(batchSnapshots, newSnapshot)
 			}
 		}
 		if len(batchSnapshots) > 0 {
 			logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
 			err = SaveLinksToDB(tx, batchSnapshots)
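
SaveLinksToDB is also outside this diff. With sqlx, a batch save like the one above is commonly a named bulk insert, where passing the slice expands into a multi-row INSERT; a sketch assuming Snapshot carries matching db tags (the table and column names are illustrative):

// saveLinksBatch writes all snapshots in a single INSERT instead of one
// statement per link. Hypothetical sketch, not the repository's SaveLinksToDB.
func saveLinksBatch(tx *sqlx.Tx, snapshots []*Snapshot) error {
	if len(snapshots) == 0 {
		return nil
	}
	_, err := tx.NamedExec(
		`INSERT INTO snapshots (uid, url, host, timestamp)
		 VALUES (:uid, :url, :host, :timestamp)`, snapshots)
	return err
}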
@@ -228,9 +233,6 @@ func runWorker(id int, db *sqlx.DB) {
 	}
 	total := len(snapshots)
 	for i, s := range snapshots {
-		if InBlacklist(&s) {
-			logging.LogDebug("[%d] Ignoring %d/%d blacklisted URL %s", id, i+1, total, s.URL)
-		}
 		logging.LogInfo("[%d] Starting %d/%d %s", id, i+1, total, s.URL)
 		err = workOnSnapshot(id, tx, &s)
 		if err != nil {