Simplify robots.txt parsing logic
This commit is contained in:
@@ -1,35 +1,21 @@
|
|||||||
package gemini
|
package gemini
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ParseRobotsTxt takes robots.txt content and a host, returns list of full URLs that shouldn't be visited
|
// ParseRobotsTxt takes robots.txt content and a host, and
|
||||||
|
// returns a list of full URLs that shouldn't
|
||||||
|
// be visited.
|
||||||
|
// TODO: Also take the user agent into account?
|
||||||
|
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
|
||||||
func ParseRobotsTxt(content string, host string) []string {
|
func ParseRobotsTxt(content string, host string) []string {
|
||||||
scanner := bufio.NewScanner(strings.NewReader(content))
|
|
||||||
var disallowedPaths []string
|
var disallowedPaths []string
|
||||||
|
for _, line := range strings.Split(content, "\n") {
|
||||||
// Skip everything until we find "User-agent: *" line
|
line = strings.TrimSpace(line)
|
||||||
for scanner.Scan() {
|
line = strings.ToLower(line)
|
||||||
line := strings.TrimSpace(scanner.Text())
|
if strings.HasPrefix(line, "disallow:") {
|
||||||
if strings.ToLower(line) == "user-agent: *" {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now collect all Disallow paths
|
|
||||||
for scanner.Scan() {
|
|
||||||
line := strings.TrimSpace(scanner.Text())
|
|
||||||
|
|
||||||
// Stop if we hit another User-agent section
|
|
||||||
if strings.HasPrefix(strings.ToLower(line), "user-agent:") {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse Disallow lines
|
|
||||||
if strings.HasPrefix(strings.ToLower(line), "disallow:") {
|
|
||||||
parts := strings.SplitN(line, ":", 2)
|
parts := strings.SplitN(line, ":", 2)
|
||||||
if len(parts) == 2 {
|
if len(parts) == 2 {
|
||||||
path := strings.TrimSpace(parts[1])
|
path := strings.TrimSpace(parts[1])
|
||||||
@@ -41,6 +27,5 @@ func ParseRobotsTxt(content string, host string) []string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return disallowedPaths
|
return disallowedPaths
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user