Add Gopherspace crawling!

This commit is contained in:
2025-02-26 10:35:28 +02:00
parent 29877cb2da
commit d89dd72fe9
3 changed files with 613 additions and 0 deletions

32
gopher/errors.go Normal file
View File

@@ -0,0 +1,32 @@
package gopher
import (
"gemini-grc/errors"
)
// GopherError is an error encountered while visiting a Gopher host, and is
// only for errors reported by the Gopher protocol itself (item type
// indicator 3). It wraps the underlying error so callers can still inspect it.
type GopherError struct {
	// Err is the wrapped error. It is nil only for a zero-value GopherError.
	Err error
}

// Error returns the message of the wrapped error. A nil receiver or a nil
// wrapped error yields a generic message instead of panicking.
func (e *GopherError) Error() string {
	if e == nil || e.Err == nil {
		return "gopher error"
	}
	return e.Err.Error()
}

// Unwrap returns the wrapped error, or nil for a nil receiver.
func (e *GopherError) Unwrap() error {
	if e == nil {
		return nil
	}
	return e.Err
}

// NewGopherError wraps err in a *GopherError and returns it as an error.
func NewGopherError(err error) error {
	return &GopherError{Err: err}
}

// IsGopherError reports whether err, or any error it wraps, is a *GopherError.
// A nil err is never a GopherError.
func IsGopherError(err error) bool {
	if err == nil {
		return false
	}
	var target *GopherError
	return errors.As(err, &target)
}

283
gopher/network.go Normal file
View File

@@ -0,0 +1,283 @@
package gopher
import (
"fmt"
"io"
"net"
stdurl "net/url"
"regexp"
"strings"
"time"
"unicode/utf8"
errors2 "gemini-grc/common/errors"
"gemini-grc/common/linkList"
"gemini-grc/common/snapshot"
_url "gemini-grc/common/url"
"gemini-grc/config"
"gemini-grc/errors"
"gemini-grc/logging"
"github.com/guregu/null/v5"
)
// References:
// RFC 1436 https://www.rfc-editor.org/rfc/rfc1436.html
// The default port for Gopher is 70.
// Gopher originally used ASCII or ISO-8859-1; many servers now serve UTF-8.
// We do not transcode: a response that is valid UTF-8 is treated as text,
// and anything else is stored as raw binary data without further parsing.
// Here's the complete list of Gopher item type indicators (prefixes):
//
// `0` - Plain Text File
// `1` - Directory/Menu
// `2` - CSO Phone Book Server
// `3` - Error Message
// `4` - BinHexed Macintosh File
// `5` - DOS Binary Archive
// `6` - UNIX uuencoded File
// `7` - Index/Search Server
// `8` - Telnet Session
// `9` - Binary File
// `+` - Mirror/Redundant Server
// `g` - GIF Image
// `I` - Image File (non-GIF)
// `T` - TN3270 Session
// `i` - Informational Message (menu line)
// `h` - HTML File
// `s` - Sound/Music File
// `d` - Document File
// `w` - WHOIS Service
// `;` - Document File with Alternative View
// `<` - Video File
// `M` - MIME File (mail message or similar)
// `:` - Bitmap Image
// `c` - Calendar File
// `p` - PostScript File
// The most commonly used ones are `0` (text), `1` (directory), `i` (info), and `3` (error).
// The original Gopher protocol only specified types 0-9, `+`, `g`, `I`, and `T`.
// The others were added by various implementations and extensions over time.
// Error methodology:
// HostError for DNS/network errors
// GopherError for errors reported by the Gopher server (item type 3)
// NewError for other errors
// NewFatalError for other fatal errors

// Visit fetches the Gopher resource at url and returns a snapshot of it.
//
// Host errors and Gopher errors are not returned to the caller: they are
// recorded in the snapshot's Error field and the snapshot is returned with a
// nil error. Any other failure (including failing to build the snapshot from
// url) is returned as an error with a nil snapshot.
//
// A response that is valid UTF-8 is stored in GemText (with NUL characters
// removed), checked for a leading Gopher error line and scanned for links.
// Any other response is stored unmodified in Data and not parsed further.
func Visit(url string) (*snapshot.Snapshot, error) {
	s, err := snapshot.SnapshotFromURL(url, false)
	if err != nil {
		return nil, err
	}

	data, err := connectAndGetData(url)
	if err != nil {
		logging.LogDebug("Error: %s", err.Error())
		if IsGopherError(err) || errors2.IsHostError(err) {
			s.Error = null.StringFrom(err.Error())
			return s, nil
		}
		return nil, err
	}

	// Binary payload: keep the raw bytes, there is nothing to parse.
	if !utf8.Valid(data) {
		s.Data = null.ValueFrom(data)
		return s, nil
	}

	// Convert once and reuse; every later step works on the same text.
	text := string(data)
	s.GemText = null.StringFrom(removeNullChars(text))

	if responseError := checkForError(text); responseError != nil {
		s.Error = null.StringFrom(responseError.Error())
		return s, nil
	}

	links := getGopherPageLinks(text)
	// Collect only the links that parse, so the stored list never contains
	// zero-value URL entries for links that were rejected.
	linkURLs := linkList.LinkList(make([]_url.URL, 0, len(links)))
	for _, link := range links {
		linkURL, err := _url.ParseURL(link, "", true)
		if err != nil {
			continue
		}
		linkURLs = append(linkURLs, *linkURL)
	}
	if len(linkURLs) != 0 {
		s.Links = null.ValueFrom(linkURLs)
	}
	return s, nil
}
// connectAndGetData opens a TCP connection to the host named in url (port 70
// when the URL has none), sends the selector derived from the URL path
// followed by CRLF, and returns everything the server sends until it closes
// the connection.
//
// A URL that cannot be parsed yields an error built with errors.NewError.
// Every later failure (dial, deadline, write, read, or a response larger than
// config.CONFIG.MaxResponseSize bytes) is returned as a HostError. The whole
// exchange must complete within config.CONFIG.ResponseTimeout seconds.
//
// NOTE(review): only parsedURL.Path is sent. net/url parses anything after a
// '?' as the query, so such text is not part of the request. Confirm whether
// selectors containing '?' need to be supported.
func connectAndGetData(url string) ([]byte, error) {
	parsedURL, err := stdurl.Parse(url)
	if err != nil {
		return nil, errors.NewError(err)
	}

	port := parsedURL.Port()
	if port == "" {
		port = "70"
	}
	// Hostname() strips the brackets from IPv6 literals; JoinHostPort puts
	// them back so the dial address stays valid.
	host := net.JoinHostPort(parsedURL.Hostname(), port)

	timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second

	// Establish the underlying TCP connection.
	dialer := &net.Dialer{Timeout: timeoutDuration}
	logging.LogDebug("Dialing %s", host)
	conn, err := dialer.Dial("tcp", host)
	if err != nil {
		return nil, errors2.NewHostError(err)
	}
	defer func() {
		// Best-effort close: the response has already been read or abandoned.
		_ = conn.Close()
	}()

	// One absolute deadline covers both the request write and all reads.
	if err := conn.SetDeadline(time.Now().Add(timeoutDuration)); err != nil {
		return nil, errors2.NewHostError(err)
	}

	// Send the Gopher request to trigger the server response.
	payload := constructPayloadFromPath(parsedURL.Path)
	if _, err := io.WriteString(conn, payload+"\r\n"); err != nil {
		return nil, errors2.NewHostError(err)
	}

	// Read at most one byte beyond the limit: enough to detect an oversized
	// response without buffering an unbounded amount of data.
	maxSize := int64(config.CONFIG.MaxResponseSize)
	data, err := io.ReadAll(io.LimitReader(conn, maxSize+1))
	if err != nil {
		return nil, errors2.NewHostError(err)
	}
	if int64(len(data)) > maxSize {
		return nil, errors2.NewHostError(fmt.Errorf("response exceeded max"))
	}

	logging.LogDebug("Got %d bytes", len(data))
	return data, nil
}
// itemTypePrefixRE matches a URL path that starts with "/<t>/", where <t> is
// a single ASCII word character ([0-9A-Za-z_]) taken to be a Gopher item type.
// It is compiled once at package scope instead of on every call.
var itemTypePrefixRE = regexp.MustCompile(`^/\w/`)

// constructPayloadFromPath turns the path of a Gopher URL into the selector
// sent to the server. A leading "/<t>/" item-type segment is removed, and the
// result always starts with "/" (so an empty path becomes "/").
//
// A path such as "/1" with nothing after the type character is returned
// unchanged, because it does not match the "/<t>/" form.
func constructPayloadFromPath(urlpath string) string {
	selector := urlpath
	if itemTypePrefixRE.MatchString(urlpath) {
		// \w only matches single-byte ASCII characters, so the matched
		// prefix is always exactly three bytes long.
		selector = urlpath[len("/x/"):]
	}
	if !strings.HasPrefix(selector, "/") {
		selector = "/" + selector
	}
	return selector
}
// checkForError reports whether a Gopher response is an error response.
//
// Only the first line of utfData (after trimming surrounding whitespace) is
// inspected. If it starts with the item type indicator "3", a GopherError is
// returned whose message is "gopher error: " followed by the text before the
// first tab of that line, type character included. Otherwise nil is returned.
func checkForError(utfData string) error {
	// Cut avoids splitting the whole response just to look at one line.
	firstLine, _, _ := strings.Cut(strings.TrimSpace(utfData), "\n")
	if !strings.HasPrefix(firstLine, "3") {
		return nil
	}
	// The display string is the first tab-separated field of the menu line.
	display, _, _ := strings.Cut(firstLine, "\t")
	return NewGopherError(fmt.Errorf("gopher error: %s", strings.TrimSpace(display)))
}
// isCrawlableItemType reports whether a menu line of the given item type
// points at something that can be fetched with a Gopher request. Informational
// lines (i) and error lines (3) carry no resource, and CSO (2), telnet (8) and
// TN3270 (T) entries name services that speak other protocols.
func isCrawlableItemType(itemType byte) bool {
	switch itemType {
	case 'i', '3', '2', '8', 'T':
		return false
	default:
		return true
	}
}

// getGopherPageLinks extracts the links from a Gopher menu.
//
// Each menu line has the form
//
//	<type><display>TAB<selector>TAB<host>TAB<port>[TAB<extra>...]
//
// Lines with fewer than three tab-separated fields, an empty host, or a
// non-crawlable item type are skipped, as are blank lines and the "."
// terminator. An 'h' line whose selector starts with "URL:" yields the URL
// that follows. Every other line yields
// "gopher://<host>:<port>/<type><selector>", with port defaulting to 70 and a
// "/" inserted before selectors that do not already start with one.
//
// Fields after the port (for example the "+" flag added by Gopher+ servers)
// are ignored rather than being folded into the port.
func getGopherPageLinks(content string) []string {
	var links []string
	for _, line := range strings.Split(strings.TrimSpace(content), "\n") {
		// Gopher lines end in CRLF; drop the CR left behind by the split.
		line = strings.TrimSuffix(line, "\r")
		if line == "" || line == "." {
			continue
		}

		itemType := line[0]
		if !isCrawlableItemType(itemType) {
			continue
		}

		fields := strings.Split(line[1:], "\t")
		if len(fields) < 3 {
			continue
		}
		selector := strings.TrimSpace(fields[1])
		host := strings.TrimSpace(fields[2])
		if host == "" {
			continue
		}

		// HTML items may carry an external URL in the selector.
		if itemType == 'h' && strings.HasPrefix(selector, "URL:") {
			if target := strings.TrimSpace(selector[len("URL:"):]); target != "" {
				links = append(links, target)
			}
			continue
		}

		port := "70"
		if len(fields) > 3 {
			if p := strings.TrimSpace(fields[3]); p != "" {
				port = p
			}
		}

		// Path is always /<type> followed by a slash-prefixed selector.
		if !strings.HasPrefix(selector, "/") {
			selector = "/" + selector
		}
		links = append(links, "gopher://"+host+":"+port+"/"+string(itemType)+selector)
	}
	return links
}
// removeNullChars returns input with every NUL character (U+0000) removed.
func removeNullChars(input string) string {
	const nul = "\u0000"
	if !strings.Contains(input, nul) {
		return input
	}
	// Splitting on NUL and re-joining with nothing in between drops them all.
	return strings.Join(strings.Split(input, nul), "")
}

298
gopher/network_test.go Normal file
View File

@@ -0,0 +1,298 @@
package gopher
import (
"net"
"testing"
"gemini-grc/common/errors"
"gemini-grc/config"
"github.com/stretchr/testify/assert"
)
// TestConstructPayloadFromPath checks the selector that constructPayloadFromPath
// derives from a URL path: a leading "/<type>/" segment is removed and the
// result always starts with a slash.
func TestConstructPayloadFromPath(t *testing.T) {
	type payloadCase struct {
		name     string
		input    string
		expected string
	}
	cases := []payloadCase{
		{"Path with Gopher item type", "/1/path/to/resource", "/path/to/resource"},
		{"Path with different item type", "/0/another/path", "/another/path"},
		{"Path without item type but with leading slash", "/simple/path", "/simple/path"},
		{"Path without item type and without leading slash", "no/leading/slash", "/no/leading/slash"},
		{"Empty path", "", "/"},
		{"Single character item type", "/h/homepage", "/homepage"},
		{"Single slash", "/", "/"},
		{"Item type-looking path", "/1", "/1"},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := constructPayloadFromPath(tc.input); got != tc.expected {
				t.Errorf("constructPayloadFromPath(%q) = %q, want %q",
					tc.input, got, tc.expected)
			}
		})
	}
}
// TestParseLinks checks the links that getGopherPageLinks extracts from a
// Gopher menu. It asserts the exact URLs, not just how many were found, so a
// change in URL construction is caught.
func TestParseLinks(t *testing.T) {
	tests := []struct {
		name  string
		input string
		want  []string // expected links, in menu order
	}{
		{
			name:  "Empty input",
			input: "",
			want:  nil,
		},
		{
			name:  "Single directory link",
			input: "1About Us\t/about\texample.com",
			want:  []string{"gopher://example.com:70/1/about"},
		},
		{
			name:  "Single text file link",
			input: "0README\t/readme.txt\texample.com",
			want:  []string{"gopher://example.com:70/0/readme.txt"},
		},
		{
			name:  "Multiple links of different types",
			input: "1About Us\t/about\texample.com\n0README\t/readme.txt\texample.com\n1Contact\t/contact\texample.com",
			want: []string{
				"gopher://example.com:70/1/about",
				"gopher://example.com:70/0/readme.txt",
				"gopher://example.com:70/1/contact",
			},
		},
		{
			name:  "Ignore non-linkable types",
			input: "iInfo line\t/info\texample.com\n1Directory\t/dir\texample.com\n0Text\t/text.txt\texample.com",
			want: []string{
				"gopher://example.com:70/1/dir",
				"gopher://example.com:70/0/text.txt",
			},
		},
		{
			name:  "Malformed lines",
			input: "1Incomplete line\n0No tabs\n1Missing parts\t",
			want:  nil,
		},
		{
			name:  "Mixed valid and invalid lines",
			input: "1Valid link\t/valid\texample.com\n1Incomplete\t\n0Text file\t/text.txt\texample.com\n1Another valid\t/another\texample.com",
			want: []string{
				"gopher://example.com:70/1/valid",
				"gopher://example.com:70/0/text.txt",
				"gopher://example.com:70/1/another",
			},
		},
		{
			// The selector is embedded verbatim, even when it looks like a URL.
			name:  "Absolute URLs",
			input: "1External link\tgopher://external.com/path\texternal.com\n0Document\tgopher://other.com/doc.txt\tother.com",
			want: []string{
				"gopher://external.com:70/1/gopher://external.com/path",
				"gopher://other.com:70/0/gopher://other.com/doc.txt",
			},
		},
		{
			name:  "With whitespace",
			input: " 1Padded line \t/padded\texample.com\n0Text file \t/doc.txt\texample.com",
			want: []string{
				"gopher://example.com:70/1/padded",
				"gopher://example.com:70/0/doc.txt",
			},
		},
		{
			name:  "Special characters in paths",
			input: "1Special chars\t/path with spaces\texample.com\n0Doc\t/über/päth.txt\texample.com",
			want: []string{
				"gopher://example.com:70/1/path with spaces",
				"gopher://example.com:70/0/über/päth.txt",
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := getGopherPageLinks(tt.input)
			if len(tt.want) == 0 {
				// Accept both a nil and an empty slice for "no links".
				assert.Empty(t, got)
				return
			}
			assert.Equal(t, tt.want, got)
		})
	}
}
// TestCheckForError checks that checkForError reports a GopherError only when
// the first line of the response is an item type 3 (error) line, and that the
// message is built from that line's display string.
//
// All inputs are interpreted string literals so that \t and \n are real tab
// and newline characters, as they are in a Gopher response.
func TestCheckForError(t *testing.T) {
	tests := []struct {
		name    string
		input   string
		wantMsg string // expected error message; empty means no error
	}{
		{
			name:    "No error",
			input:   "1Directory\t/dir\texample.com\n0Text\t/text.txt\texample.com",
			wantMsg: "",
		},
		{
			name:    "Simple error message",
			input:   "3Error: File not found\t\texample.com",
			wantMsg: "gopher error: 3Error: File not found",
		},
		{
			name:    "Error with multiple tabs",
			input:   "3File not found\t/error\texample.com\t70",
			wantMsg: "gopher error: 3File not found",
		},
		{
			// Only the first line decides whether the response is an error.
			name:    "Error among valid entries",
			input:   "1Welcome\t/welcome\texample.com\n3Access denied\t\texample.com\n0README\t/readme.txt\texample.com",
			wantMsg: "",
		},
		{
			name:    "Error with no tabs",
			input:   "3Server is down for maintenance",
			wantMsg: "gopher error: 3Server is down for maintenance",
		},
		{
			name:    "Multiple errors (should return first)",
			input:   "3First error\t\texample.com\n3Second error\t\texample.com",
			wantMsg: "gopher error: 3First error",
		},
		{
			name:    "Error with whitespace",
			input:   " 3 Error with spaces \t\texample.com",
			wantMsg: "gopher error: 3 Error with spaces",
		},
		{
			name:    "Empty input",
			input:   "",
			wantMsg: "",
		},
		{
			name:    "Just newlines",
			input:   "\n\n\n",
			wantMsg: "",
		},
		{
			name:    "Error after empty lines",
			input:   "\n3Error after empty lines\t\texample.com",
			wantMsg: "gopher error: 3Error after empty lines",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := checkForError(tt.input)
			if tt.wantMsg == "" {
				assert.NoError(t, err)
				return
			}
			// EqualError fails cleanly on a nil error instead of panicking.
			if assert.EqualError(t, err, tt.wantMsg) {
				assert.True(t, IsGopherError(err), "expected a GopherError, got %T", err)
			}
		})
	}
}
// TestConnectAndGetDataTimeout verifies that connectAndGetData gives up with a
// HostError when the server accepts the connection but never sends a response.
func TestConnectAndGetDataTimeout(t *testing.T) {
	// Start a test server that doesn't respond.
	listener, err := net.Listen("tcp", "localhost:0")
	if err != nil {
		t.Fatalf("Failed to start listener: %v", err)
	}

	// The server goroutine holds the accepted connection open until the test
	// is over, then exits; the cleanup waits for it so nothing is leaked.
	stop := make(chan struct{})
	serverDone := make(chan struct{})
	go func() {
		defer close(serverDone)
		conn, err := listener.Accept()
		if err != nil {
			// The listener was closed before any client connected.
			return
		}
		defer conn.Close()
		// Keep the connection open without sending data to force a timeout.
		<-stop
	}()
	t.Cleanup(func() {
		close(stop)
		_ = listener.Close() // also unblocks a still-pending Accept
		<-serverDone
	})

	// Construct the URL of our test server.
	testURL := "gopher://" + listener.Addr().String() + "/testpath"

	// Override the config for this test and restore it however the test ends.
	originalTimeout := config.CONFIG.ResponseTimeout
	originalMaxSize := config.CONFIG.MaxResponseSize
	t.Cleanup(func() {
		config.CONFIG.ResponseTimeout = originalTimeout
		config.CONFIG.MaxResponseSize = originalMaxSize
	})
	config.CONFIG.ResponseTimeout = 1    // Set a very short timeout for this test
	config.CONFIG.MaxResponseSize = 1024 // Just for consistency, we won't reach this

	_, err = connectAndGetData(testURL)

	// The exact timeout message depends on the platform, so only the error
	// category is checked.
	switch {
	case err == nil:
		t.Error("Expected an error due to timeout, but got no error")
	case !errors.IsHostError(err):
		t.Errorf("Expected a HostError, but got: %v", err)
	default:
		t.Logf("Successfully timed out: %v", err)
	}
}