Compare commits
8 Commits
a2a6bd200a
...
1ac250ca6e
| Author | SHA1 | Date | |
|---|---|---|---|
| 1ac250ca6e | |||
| 6a96fb26cc | |||
| 3e01cb1819 | |||
| 8d9ea6cdec | |||
| f9b5fd5e7f | |||
| 62369d90ae | |||
| e51d84cad8 | |||
| 17ef03d621 |
@@ -6,6 +6,7 @@ import (
|
||||
"gemini-grc/config"
|
||||
"io"
|
||||
"net"
|
||||
go_url "net/url"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
@@ -14,6 +15,14 @@ import (
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
// GeminiPageData carries the parsed parts of a Gemini response:
// the header values plus the body, stored in exactly one of
// GemText or Data depending on the mime type.
type GeminiPageData struct {
	// ResponseCode is the status code taken from the response header.
	ResponseCode int
	// MimeType is the mime type taken from the response header.
	MimeType string
	// Lang is the (optional) language taken from the response header.
	Lang string
	// GemText holds the body when the mime type is "text/gemini".
	GemText string
	// Data holds the raw body for every other mime type.
	Data []byte
}
|
||||
|
||||
// Resolve the URL hostname and
|
||||
// check if we already have an open
|
||||
// connection to this host.
|
||||
@@ -31,51 +40,53 @@ func getHostIPAddresses(hostname string) ([]string, error) {
|
||||
return addrs, nil
|
||||
}
|
||||
|
||||
// Connect to given URL, using the Gemini protocol.
|
||||
// Return a Snapshot with the data or the error.
|
||||
// Any errors are stored within the snapshot.
|
||||
func Visit(s *Snapshot) {
|
||||
func ConnectAndGetData(url string) ([]byte, error) {
|
||||
parsedUrl, err := go_url.Parse(url)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Could not parse URL, error %w", err)
|
||||
}
|
||||
hostname := parsedUrl.Hostname()
|
||||
port := parsedUrl.Port()
|
||||
if port == "" {
|
||||
port = "1965"
|
||||
}
|
||||
host := fmt.Sprintf("%s:%s", hostname, port)
|
||||
// Establish the underlying TCP connection.
|
||||
host := fmt.Sprintf("%s:%d", s.Host, s.URL.Port)
|
||||
dialer := &net.Dialer{
|
||||
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second, // Set the overall connection timeout
|
||||
KeepAlive: 30 * time.Second,
|
||||
Timeout: time.Duration(config.CONFIG.ResponseTimeout) * time.Second,
|
||||
KeepAlive: 10 * time.Second,
|
||||
}
|
||||
conn, err := dialer.Dial("tcp", host)
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(fmt.Sprintf("TCP connection failed: %v", err))
|
||||
return
|
||||
return nil, fmt.Errorf("TCP connection failed: %w", err)
|
||||
}
|
||||
// Make sure we always close the connection.
|
||||
defer func() {
|
||||
err := conn.Close()
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(fmt.Sprintf("Error closing connection: %s", err))
|
||||
// Do nothing! Connection will timeout eventually if still open somehow.
|
||||
}
|
||||
}()
|
||||
|
||||
// Set read and write timeouts on the TCP connection.
|
||||
err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(fmt.Sprintf("Error setting connection deadline: %s", err))
|
||||
return
|
||||
return nil, fmt.Errorf("Error setting connection deadline: %w", err)
|
||||
}
|
||||
err = conn.SetWriteDeadline(time.Now().Add(time.Duration(config.CONFIG.ResponseTimeout) * time.Second))
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(fmt.Sprintf("Error setting connection deadline: %s", err))
|
||||
return
|
||||
return nil, fmt.Errorf("Error setting connection deadline: %w", err)
|
||||
}
|
||||
|
||||
// Perform the TLS handshake
|
||||
tlsConfig := &tls.Config{
|
||||
InsecureSkipVerify: true, // Accept all TLS certs, even if insecure.
|
||||
ServerName: s.URL.Hostname, // SNI
|
||||
InsecureSkipVerify: true, // Accept all TLS certs, even if insecure.
|
||||
ServerName: parsedUrl.Hostname(), // SNI should not include port
|
||||
// MinVersion: tls.VersionTLS12, // Use a minimum TLS version. Warning breaks a lot of sites.
|
||||
}
|
||||
tlsConn := tls.Client(conn, tlsConfig)
|
||||
if err := tlsConn.Handshake(); err != nil {
|
||||
s.Error = null.StringFrom(fmt.Sprintf("TLS handshake error: %v", err))
|
||||
return
|
||||
return nil, fmt.Errorf("TLS handshake error: %w", err)
|
||||
}
|
||||
|
||||
// We read `buf`-sized chunks and add data to `data`.
|
||||
@@ -83,10 +94,9 @@ func Visit(s *Snapshot) {
|
||||
var data []byte
|
||||
|
||||
// Send Gemini request to trigger server response.
|
||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", s.URL.String())))
|
||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url)))
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(fmt.Sprintf("Error sending network request: %s", err))
|
||||
return
|
||||
return nil, fmt.Errorf("Error sending network request: %w", err)
|
||||
}
|
||||
// Read response bytes in len(buf) byte chunks
|
||||
for {
|
||||
@@ -96,21 +106,40 @@ func Visit(s *Snapshot) {
|
||||
}
|
||||
if len(data) > config.CONFIG.MaxResponseSize {
|
||||
data = []byte{}
|
||||
s.Error = null.StringFrom(fmt.Sprintf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize))
|
||||
return nil, fmt.Errorf("Response size exceeded maximum of %d bytes", config.CONFIG.MaxResponseSize)
|
||||
}
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
} else {
|
||||
s.Error = null.StringFrom(fmt.Sprintf("Network error: %s", err))
|
||||
return
|
||||
return nil, fmt.Errorf("Network error: %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
// Great, response data received.
|
||||
err = processResponse(s, data)
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Connect to given URL, using the Gemini protocol.
|
||||
// Mutate given Snapshot with the data or the error.
|
||||
func Visit(s *Snapshot) {
|
||||
data, err := ConnectAndGetData(s.URL.String())
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
return
|
||||
}
|
||||
pageData, err := processData(data)
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
return
|
||||
}
|
||||
s.ResponseCode = null.IntFrom(int64(pageData.ResponseCode))
|
||||
s.MimeType = null.StringFrom(pageData.MimeType)
|
||||
s.Lang = null.StringFrom(pageData.Lang)
|
||||
if pageData.GemText != "" {
|
||||
s.GemText = null.StringFrom(string(pageData.GemText))
|
||||
}
|
||||
if pageData.Data != nil {
|
||||
s.Data = null.ValueFrom(pageData.Data)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -118,31 +147,33 @@ func Visit(s *Snapshot) {
|
||||
// Update given snapshot with the
|
||||
// Gemini header data: response code,
|
||||
// mime type and lang (optional)
|
||||
func processResponse(snapshot *Snapshot, data []byte) error {
|
||||
func processData(data []byte) (*GeminiPageData, error) {
|
||||
headers, body, err := getHeadersAndData(data)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
code, mimeType, lang := getMimeTypeAndLang(headers)
|
||||
geminiError := checkGeminiStatusCode(code)
|
||||
if geminiError != nil {
|
||||
return geminiError
|
||||
return nil, geminiError
|
||||
}
|
||||
pageData := GeminiPageData{
|
||||
ResponseCode: code,
|
||||
MimeType: mimeType,
|
||||
Lang: lang,
|
||||
}
|
||||
snapshot.ResponseCode = null.IntFrom(int64(code))
|
||||
snapshot.MimeType = null.StringFrom(mimeType)
|
||||
snapshot.Lang = null.StringFrom(lang)
|
||||
// If we've got a Gemini document, populate
|
||||
// `GemText` field, otherwise raw data goes to `Data`.
|
||||
if mimeType == "text/gemini" {
|
||||
validBody, err := EnsureValidUTF8(body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("UTF-8 error: %w", err)
|
||||
return nil, fmt.Errorf("UTF-8 error: %w", err)
|
||||
}
|
||||
snapshot.GemText = null.StringFrom(string(validBody))
|
||||
pageData.GemText = validBody
|
||||
} else {
|
||||
snapshot.Data = null.ValueFrom(body)
|
||||
pageData.Data = body
|
||||
}
|
||||
return nil
|
||||
return &pageData, nil
|
||||
}
|
||||
|
||||
// Checks for a Gemini header, which is
|
||||
|
||||
@@ -57,16 +57,16 @@ func SaveSnapshotToDB(tx *sqlx.Tx, s *Snapshot) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func SaveLinkToDB(tx *sqlx.Tx, s *Snapshot) error {
|
||||
func SaveLinksToDB(tx *sqlx.Tx, snapshots []*Snapshot) error {
|
||||
query := `
|
||||
INSERT INTO snapshots (uid, url, host, timestamp, mimetype, data, gemtext, links, lang, response_code, error)
|
||||
VALUES (:uid, :url, :host, :timestamp, :mimetype, :data, :gemtext, :links, :lang, :response_code, :error)
|
||||
ON CONFLICT (uid) DO NOTHING
|
||||
`
|
||||
_, err := tx.NamedExec(query, s)
|
||||
_, err := tx.NamedExec(query, snapshots)
|
||||
if err != nil {
|
||||
logging.LogError("[%s] [%s] Error upserting snapshot: %w", s.URL, s.MimeType.String, err)
|
||||
return fmt.Errorf("DB error: %w", err) // Return the error instead of panicking
|
||||
logging.LogError("Error batch inserting snapshots: %w", err)
|
||||
return fmt.Errorf("DB error: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
31
gemini/robots.go
Normal file
31
gemini/robots.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ParseRobotsTxt takes robots.txt content and a host, and returns the
// list of full URLs ("gemini://" + host + path) that shouldn't be visited.
//
// Only rules that apply to every crawler are returned: those inside a
// "User-agent: *" group, plus any Disallow lines that appear before the
// first User-agent line. Rules in groups addressed solely to other agents
// are skipped. Consecutive User-agent lines are treated as one group.
//
// Directive names are matched case-insensitively, but the path itself is
// kept exactly as written, since paths are case-sensitive. Blank lines,
// comment lines (starting with '#') and empty Disallow values are ignored.
//
// TODO Also take into account the user agent?
// Check gemini://geminiprotocol.net/docs/companion/robots.gmi
func ParseRobotsTxt(content string, host string) []string {
	var disallowedPaths []string

	// applies reports whether the group being read targets all crawlers.
	// It starts out true so that rules preceding any User-agent line
	// are still honored.
	applies := true
	// inAgentRun is true while reading a run of consecutive User-agent
	// lines, which together form the header of a single group.
	inAgentRun := false

	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		key, value, found := strings.Cut(line, ":")
		if !found {
			continue
		}
		// Lowercase only the directive name, never the value.
		key = strings.ToLower(strings.TrimSpace(key))
		value = strings.TrimSpace(value)

		switch key {
		case "user-agent":
			if !inAgentRun {
				// First User-agent line after rules: a new group begins.
				applies = false
				inAgentRun = true
			}
			if value == "*" {
				applies = true
			}
		case "disallow":
			inAgentRun = false
			if applies && value != "" {
				// Construct full Gemini URL
				disallowedPaths = append(disallowedPaths,
					fmt.Sprintf("gemini://%s%s", host, value))
			}
		default:
			// Any other directive also ends the group header.
			inAgentRun = false
		}
	}
	return disallowedPaths
}
|
||||
25
gemini/robots_test.go
Normal file
25
gemini/robots_test.go
Normal file
@@ -0,0 +1,25 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"reflect"
|
||||
)
|
||||
|
||||
func TestParseRobotsTxt(t *testing.T) {
|
||||
input := `User-agent: *
|
||||
Disallow: /cgi-bin/wp.cgi/view
|
||||
Disallow: /cgi-bin/wp.cgi/media
|
||||
User-agent: googlebot
|
||||
Disallow: /admin/`
|
||||
|
||||
expected := []string{
|
||||
"gemini://example.com/cgi-bin/wp.cgi/view",
|
||||
"gemini://example.com/cgi-bin/wp.cgi/media",
|
||||
}
|
||||
|
||||
result := ParseRobotsTxt(input, "example.com")
|
||||
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
t.Errorf("ParseRobotsTxt() = %v, want %v", result, expected)
|
||||
}
|
||||
}
|
||||
@@ -84,16 +84,28 @@ func workOnSnapshot(id int, tx *sqlx.Tx, s *Snapshot) (err error) {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
}
|
||||
|
||||
// Store links
|
||||
// Store links in batch
|
||||
if s.Links != nil {
|
||||
var batchSnapshots []*Snapshot
|
||||
timestamp := null.TimeFrom(time.Now())
|
||||
|
||||
for _, link := range *s.Links {
|
||||
newSnapshot := Snapshot{UID: uid.UID(), URL: link, Host: link.Hostname, Timestamp: null.TimeFrom(time.Now())}
|
||||
if shouldPersistURL(tx, link) {
|
||||
logging.LogDebug("[%d] Saving link %s", id, link)
|
||||
err = SaveLinkToDB(tx, &newSnapshot)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
newSnapshot := &Snapshot{
|
||||
UID: uid.UID(),
|
||||
URL: link,
|
||||
Host: link.Hostname,
|
||||
Timestamp: timestamp,
|
||||
}
|
||||
batchSnapshots = append(batchSnapshots, newSnapshot)
|
||||
}
|
||||
}
|
||||
|
||||
if len(batchSnapshots) > 0 {
|
||||
logging.LogDebug("[%d] Batch saving %d links", id, len(batchSnapshots))
|
||||
err = SaveLinksToDB(tx, batchSnapshots)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[%d] DB Error: %w", id, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user