package gemini

import (
	"errors"
	"fmt"
	"strings"
	"time"

	"gemini-grc/logging"
	"gemini-grc/util"

	"github.com/guregu/null/v5"
	"github.com/jmoiron/sqlx"
)

type WorkerStatus struct {
	id     int
	status string
}

// PrintWorkerStatus redraws a per-worker status screen every time
// an update arrives on statusChan.
func PrintWorkerStatus(totalWorkers int, statusChan chan WorkerStatus) {
	// Create a slice to store the current status of each worker.
	statuses := make([]string, totalWorkers)
	// Initialize empty statuses.
	for i := range statuses {
		statuses[i] = ""
	}

	// Initial print.
	var output strings.Builder
	// \033[H moves the cursor to the top-left corner of the screen
	// (i.e. the first column of the first row).
	// \033[J clears the screen from the cursor to the end.
	output.WriteString("\033[H\033[J") // Clear screen and move cursor to top
	for i := range statuses {
		output.WriteString(fmt.Sprintf("[%2d] \n", i))
	}
	fmt.Print(output.String())

	// Continuously receive status updates.
	for update := range statusChan {
		if update.id >= totalWorkers {
			continue
		}
		// Update the status.
		statuses[update.id] = update.status

		// Build the complete output string.
		output.Reset()
		output.WriteString("\033[H\033[J") // Clear screen and move cursor to top
		for i, status := range statuses {
			output.WriteString(fmt.Sprintf("[%2d] %.100s\n", i, status))
		}
		// Print the entire status.
		fmt.Print(output.String())
	}
}

var statusChan chan WorkerStatus

// SpawnWorkers starts numOfWorkers goroutines that crawl snapshots forever.
func SpawnWorkers(numOfWorkers int, db *sqlx.DB) {
	logging.LogInfo("Spawning %d workers", numOfWorkers)
	statusChan = make(chan WorkerStatus, numOfWorkers)
	go PrintWorkerStatus(numOfWorkers, statusChan)
	for i := range numOfWorkers {
		go func(i int) {
			// Jitter to avoid starting everything at the same time.
			time.Sleep(time.Duration(util.SecureRandomInt(10)) * time.Second)
			for {
				RunWorkerWithTx(i, db, nil)
			}
		}(i)
	}
}

// RunWorkerWithTx wraps a single worker run in a database transaction.
func RunWorkerWithTx(workerID int, db *sqlx.DB, url *string) {
	statusChan <- WorkerStatus{
		id:     workerID,
		status: "Starting up",
	}
	defer func() {
		statusChan <- WorkerStatus{
			id:     workerID,
			status: "Done",
		}
	}()
	tx, err := db.Beginx()
	if err != nil {
		panic(fmt.Sprintf("Failed to begin transaction: %v", err))
	}
	runWorker(workerID, tx, url)
	logging.LogDebug("[%d] Committing transaction", workerID)
	err = tx.Commit()
	// On deadlock errors, roll back and return; otherwise panic.
	if err != nil {
		logging.LogError("[%d] Failed to commit transaction: %v", workerID, err)
		if isDeadlockError(err) {
			logging.LogError("[%d] Deadlock detected. Rolling back", workerID)
			time.Sleep(10 * time.Second)
			err := tx.Rollback()
			if err != nil {
				panic(fmt.Sprintf("[%d] Failed to roll back transaction: %v", workerID, err))
			}
			return
		}
		panic(fmt.Sprintf("[%d] Failed to commit transaction: %v", workerID, err))
	}
	logging.LogDebug("[%d] Worker done!", workerID)
}

func runWorker(workerID int, tx *sqlx.Tx, url *string) {
	var snapshots []Snapshot
	var err error
	// If not given a specific URL,
	// get some random ones to visit from the DB.
	if url == nil {
		statusChan <- WorkerStatus{
			id:     workerID,
			status: "Getting snapshots",
		}
		snapshots, err = GetSnapshotsToVisit(tx)
		if err != nil {
			logging.LogError("[%d] GeminiError retrieving snapshot: %v", workerID, err)
			panic("This should never happen")
		} else if len(snapshots) == 0 {
			logging.LogInfo("[%d] No snapshots to visit.", workerID)
			time.Sleep(1 * time.Minute)
			return
		}
	} else {
		snapshotURL, err := ParseURL(*url, "")
		if err != nil {
			logging.LogError("Invalid URL given: %s", *url)
			return
		}
		snapshots = []Snapshot{{
			// UID: uid.UID(),
			URL:       *snapshotURL,
			Host:      snapshotURL.Hostname,
			Timestamp: null.TimeFrom(time.Now()),
		}}
	}

	total := len(snapshots)
	for i, s := range snapshots {
		logging.LogDebug("[%d] Snapshot %d/%d: %s", workerID, i+1, total, s.URL.String())
	}

	// Start visiting URLs.
	for i, s := range snapshots {
		logging.LogDebug("[%d] Starting %d/%d %s", workerID, i+1, total, s.URL.String())
		// We differentiate between errors:
		// Unexpected errors are the ones returned from the following function.
		// If an error is unexpected (which should never happen), we panic.
		// Expected errors are stored as strings within the snapshot,
		// so that they can also be stored in the DB.
		err := workOnSnapshot(workerID, tx, &s)
		if err != nil {
			logging.LogError("[%d] [%s] Unexpected GeminiError %v", workerID, s.URL.String(), err)
			util.PrintStackAndPanic(err)
		}
		if s.Error.Valid {
			logging.LogDebug("[%d] Error: %v", workerID, s.Error.String)
		}
		logging.LogDebug("[%d] Done %d/%d.", workerID, i+1, total)
	}
}

// workOnSnapshot visits a URL and stores the result.
// Unexpected errors are returned.
// Expected errors are stored within the snapshot.
func workOnSnapshot(workerID int, tx *sqlx.Tx, s *Snapshot) (err error) {
	if IsBlacklisted(s.URL) {
		logging.LogDebug("[%d] URL matches blacklist, ignoring %s", workerID, s.URL.String())
		return nil
	}
	// If the URL matches a robots.txt disallow line,
	// store that as an error so it won't be
	// crawled next time.
	if RobotMatch(s.URL) {
		s.Error = null.StringFrom(ErrGeminiRobotsDisallowed.Error())
		err = UpsertSnapshot(workerID, tx, s)
		if err != nil {
			return fmt.Errorf("[%d] %w", workerID, err)
		}
		return nil
	}
	// Resolve the IP address via DNS.
	IPs, err := getHostIPAddresses(s.Host)
	if err != nil {
		s.Error = null.StringFrom(err.Error())
		err = UpsertSnapshot(workerID, tx, s)
		if err != nil {
			return fmt.Errorf("[%d] %w", workerID, err)
		}
		return nil
	}
	// Wait until no other worker is visiting the same host,
	// giving up after a few attempts.
	count := 1
	for {
		if isAnotherWorkerVisitingHost(workerID, IPs) {
			logging.LogDebug("[%d] Another worker is visiting this host, waiting", workerID)
			statusChan <- WorkerStatus{
				id:     workerID,
				status: fmt.Sprintf("Waiting to grab lock for host %s", s.Host),
			}
			time.Sleep(1 * time.Second) // Avoid flood-retrying
			count++
			if count == 3 {
				return nil
			}
		} else {
			break
		}
	}
	AddIPsToPool(IPs)
	// After finishing, remove the host IPs from
	// the connections pool, with a small delay
	// to avoid potentially hitting the same IP again too quickly.
	defer func() {
		go func() {
			time.Sleep(1 * time.Second)
			RemoveIPsFromPool(IPs)
		}()
	}()

	statusChan <- WorkerStatus{
		id:     workerID,
		status: fmt.Sprintf("Visiting %s", s.URL.String()),
	}
	err = Visit(s)
	if err != nil {
		if !IsKnownError(err) {
			logging.LogError("[%d] Unknown error visiting %s: %v", workerID, s.URL.String(), err)
			return err
		}
		s.Error = null.StringFrom(err.Error())
		// Check whether the error is a redirection, and handle it.
		var geminiErr *GeminiError
		if errors.As(err, &geminiErr) && geminiErr.Msg == "redirect" {
			err = handleRedirection(workerID, tx, s)
			if err != nil {
				if IsKnownError(err) {
					s.Error = null.StringFrom(err.Error())
				} else {
					return err
				}
			}
		}
	}
	// If this is a Gemini page, parse possible links inside.
	if !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini" {
		links := GetPageLinks(s.URL, s.GemText.String)
		if len(links) > 0 {
			logging.LogDebug("[%d] Found %d links", workerID, len(links))
			s.Links = null.ValueFrom(links)
			err = storeLinks(tx, s)
			if err != nil {
				return err
			}
		}
	} else {
		logging.LogDebug("[%d] Not text/gemini, so not looking for page links", workerID)
	}
	err = UpsertSnapshot(workerID, tx, s)
	logging.LogInfo("[%3d] %2d %s", workerID, s.ResponseCode.ValueOrZero(), s.URL.String())
	if err != nil {
		return err
	}
	return nil
}

func isAnotherWorkerVisitingHost(workerID int, IPs []string) bool {
	IPPool.Lock.RLock()
	defer IPPool.Lock.RUnlock()
	logging.LogDebug("[%d] Checking pool for IPs", workerID)
	for _, ip := range IPs {
		if _, ok := IPPool.IPs[ip]; ok {
			return true
		}
	}
	return false
}

func storeLinks(tx *sqlx.Tx, s *Snapshot) error {
	if s.Links.Valid {
		var batchSnapshots []*Snapshot
		for _, link := range s.Links.ValueOrZero() {
			if shouldPersistURL(&link) {
				newSnapshot := &Snapshot{
					URL:       link,
					Host:      link.Hostname,
					Timestamp: null.TimeFrom(time.Now()),
				}
				batchSnapshots = append(batchSnapshots, newSnapshot)
			}
		}
		if len(batchSnapshots) > 0 {
			err := SaveLinksToDBinBatches(tx, batchSnapshots)
			if err != nil {
				return err
			}
		}
	}
	return nil
}

// shouldPersistURL returns true if we
// should save the URL in the DB.
// Only gemini:// URLs are saved.
func shouldPersistURL(u *URL) bool {
	return strings.HasPrefix(u.String(), "gemini://")
}

// handleRedirection saves the redirect target URL as a new snapshot.
func handleRedirection(workerID int, tx *sqlx.Tx, s *Snapshot) error {
	newURL, err := extractRedirectTarget(s.URL, s.Error.ValueOrZero())
	if err != nil {
		if errors.Is(err, ErrGeminiRedirect) {
			logging.LogDebug("[%d] %s", workerID, err)
		}
		return err
	}
	logging.LogDebug("[%d] Page redirects to %s", workerID, newURL)
	// Insert a fresh snapshot with the new URL.
	if shouldPersistURL(newURL) {
		snapshot := &Snapshot{
			// UID: uid.UID(),
			URL:       *newURL,
			Host:      newURL.Hostname,
			Timestamp: null.TimeFrom(time.Now()),
		}
		logging.LogDebug("[%d] Saving redirection URL %s", workerID, snapshot.URL.String())
		err = SaveSnapshotIfNew(tx, snapshot)
		if err != nil {
			return err
		}
	}
	return nil
}

func GetSnapshotFromURL(tx *sqlx.Tx, url string) ([]Snapshot, error) {
	query := `
		SELECT * FROM snapshots
		WHERE url=$1
		LIMIT 1
	`
	var snapshots []Snapshot
	err := tx.Select(&snapshots, query, url)
	if err != nil {
		return nil, err
	}
	return snapshots, nil
}
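
// The sketch below is illustrative only and not part of this package: it
// shows one way a caller (e.g. a main package) might wire up the worker
// pool. The DSN source (a hypothetical POSTGRES_URL environment variable)
// and the "postgres" driver registration (e.g. lib/pq) are assumptions,
// not something this file defines. SpawnWorkers returns immediately after
// launching its goroutines, so the caller has to block on its own.
//
//	db, err := sqlx.Connect("postgres", os.Getenv("POSTGRES_URL"))
//	if err != nil {
//		log.Fatalf("cannot connect to DB: %v", err)
//	}
//	gemini.SpawnWorkers(8, db) // workers loop forever in their own goroutines
//	select {}                  // block main so the workers keep running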