feat: Implement robots.txt parser

2024-10-23 09:26:39 +03:00
parent 17ef03d621
commit e51d84cad8
2 changed files with 71 additions and 0 deletions

gemini/robots.go (new file)

@@ -0,0 +1,46 @@
package gemini

import (
	"bufio"
	"fmt"
	"strings"
)

// ParseRobotsTxt takes the contents of a robots.txt file and a host name, and
// returns the list of full Gemini URLs that should not be visited.
func ParseRobotsTxt(content string, host string) []string {
	scanner := bufio.NewScanner(strings.NewReader(content))
	var disallowedPaths []string

	// Skip everything until we find the "User-agent: *" line.
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if strings.ToLower(line) == "user-agent: *" {
			break
		}
	}

	// Now collect all Disallow paths in that section.
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		// Stop if we hit another User-agent section.
		if strings.HasPrefix(strings.ToLower(line), "user-agent:") {
			break
		}
		// Parse Disallow lines.
		if strings.HasPrefix(strings.ToLower(line), "disallow:") {
			parts := strings.SplitN(line, ":", 2)
			if len(parts) == 2 {
				path := strings.TrimSpace(parts[1])
				if path != "" {
					// Construct the full Gemini URL for the disallowed path.
					disallowedPaths = append(disallowedPaths,
						fmt.Sprintf("gemini://%s%s", host, path))
				}
			}
		}
	}

	return disallowedPaths
}
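
A minimal usage sketch of ParseRobotsTxt, not part of this commit: the robots.txt content, the host, and the import path "example.com/crawler/gemini" are assumptions for illustration, and the strings.HasPrefix check is just one way a crawler might consult the returned list.

package main

import (
	"fmt"
	"strings"

	// Assumed module path; substitute the repository's actual gemini package path.
	"example.com/crawler/gemini"
)

func main() {
	// Example robots.txt content as a Gemini server might serve it.
	robots := "User-agent: *\nDisallow: /cgi-bin/\nDisallow: /private\n"

	blocked := gemini.ParseRobotsTxt(robots, "example.org")
	// blocked now holds:
	//   gemini://example.org/cgi-bin/
	//   gemini://example.org/private

	// Skip any candidate URL that falls under a disallowed prefix.
	candidate := "gemini://example.org/private/notes.gmi"
	for _, prefix := range blocked {
		if strings.HasPrefix(candidate, prefix) {
			fmt.Println("skipping", candidate)
		}
	}
}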