Basic functionality
This commit is contained in:
134
gemini.go
Normal file
134
gemini.go
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net/url"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
)
|
||||||
|
|
||||||
|
func Process(result *Result) *Result {
|
||||||
|
LogInfo("[%s] Processing data", result.url.String())
|
||||||
|
code, err := ParseFirstTwoDigits(result.data)
|
||||||
|
if err != nil {
|
||||||
|
result.error = fmt.Errorf("[%s] Invalid gemini response code", result.url.String())
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
if code != 20 {
|
||||||
|
result.error = fmt.Errorf("[%s] Gemini response code != 20, skipping", result.url.String())
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
// Grab link lines
|
||||||
|
linkLines := ExtractLinkLines(result.data)
|
||||||
|
LogDebug("[%s] Found %d links", result.url.String(), len(linkLines))
|
||||||
|
// Normalize URLs in links, and store them in result
|
||||||
|
for _, line := range linkLines {
|
||||||
|
normalizedLink, descr, error := NormalizeLink(line, result.url.String())
|
||||||
|
if error != nil {
|
||||||
|
LogError("[%s] Invalid link URL %w", result.url.String(), error)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
geminiUrl, error := ParseUrl(normalizedLink, descr)
|
||||||
|
if error != nil {
|
||||||
|
LogError("[%s] Unparseable gemini link %w", result.url.String(), error)
|
||||||
|
}
|
||||||
|
result.links = append(result.links, *geminiUrl)
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
|
||||||
|
u, err := url.Parse(input)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
|
||||||
|
}
|
||||||
|
protocol := u.Scheme
|
||||||
|
hostname := u.Hostname()
|
||||||
|
str_port := u.Port()
|
||||||
|
if str_port == "" {
|
||||||
|
str_port = "1965"
|
||||||
|
}
|
||||||
|
port, err := strconv.Atoi(str_port)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
|
||||||
|
}
|
||||||
|
return &GeminiUrl{protocol: protocol, hostname: hostname, port: port, path: u.Path, descr: descr}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// linkLineRe matches one whole Gemtext link line: "=>" at the start of a line,
// optional spaces or tabs, then at least one non-blank character. The match
// stops before any CR or LF, so CRLF-terminated documents do not leave a
// trailing "\r" on the returned lines. It is compiled once at package
// initialisation instead of on every call.
var linkLineRe = regexp.MustCompile(`(?m)^=>[ \t]*\S[^\r\n]*`)

// ExtractLinkLines takes a Gemtext document as a string and returns every line
// that is a link line, in document order and without its line terminator.
//
// Whitespace between "=>" and the link target is optional, so both
// "=> gemini://host/" and "=>gemini://host/" are returned. A "=>" that is
// followed only by blanks is not a link line. The result is nil when the
// document contains no link lines.
func ExtractLinkLines(gemtext string) []string {
	return linkLineRe.FindAllString(gemtext, -1)
}
|
||||||
|
|
||||||
|
// linkPartsRe splits a single link line into its parts: capture 1 is the link
// target, capture 2 (optional) is the description text after the separating
// blanks, excluding any trailing CR or LF. Whitespace between "=>" and the
// target is optional. It is compiled once at package initialisation.
var linkPartsRe = regexp.MustCompile(`^=>[ \t]*(\S+)(?:[ \t]+([^\r\n]*))?`)

// NormalizeLink takes a single link line and the URL of the document it came
// from, and returns the link target as an absolute URL together with the
// link's description.
//
// Relative targets are resolved against currentURL; absolute targets are
// returned as parsed. descr is the text after the target without the
// separating whitespace, or "" when the line has no description.
//
// An error is returned when currentURL is not a valid URL, when linkLine is
// not a link line, or when the link target cannot be parsed as a URL.
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
	baseURL, err := url.Parse(currentURL)
	if err != nil {
		return "", "", fmt.Errorf("invalid current URL: %w", err)
	}

	parts := linkPartsRe.FindStringSubmatch(linkLine)
	if parts == nil {
		return "", "", fmt.Errorf("Not a link line: %v", linkLine)
	}
	// parts[2] is "" when the optional description group did not take part
	// in the match.
	target, label := parts[1], parts[2]

	parsedURL, err := url.Parse(target)
	if err != nil {
		return "", "", fmt.Errorf("Invalid URL in link line '%s': %w", target, err)
	}

	// Only relative references are resolved; absolute URLs are left as-is.
	if !parsedURL.IsAbs() {
		parsedURL = baseURL.ResolveReference(parsedURL)
	}
	return parsedURL.String(), label, nil
}
|
||||||
|
|
||||||
|
// ParseFirstTwoDigits reads the one or two decimal digits that open input and
// returns them as an int. Anything after the second digit is ignored.
// An error is returned when input does not begin with a digit.
func ParseFirstTwoDigits(input string) (int, error) {
	leadingDigits := regexp.MustCompile(`^(\d{1,2})`)

	// A successful match always holds at least one digit, so an empty string
	// can only mean "no match".
	prefix := leadingDigits.FindString(input)
	if prefix == "" {
		return 0, errors.New("no digits found at the beginning of the string")
	}

	value, convErr := strconv.Atoi(prefix)
	if convErr != nil {
		return 0, fmt.Errorf("failed to convert matched digits to int: %v", convErr)
	}
	return value, nil
}
|
||||||
10
go.mod
Normal file
10
go.mod
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
module gemini-grc
|
||||||
|
|
||||||
|
go 1.23.1
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||||
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
|
github.com/rs/zerolog v1.33.0 // indirect
|
||||||
|
golang.org/x/sys v0.25.0 // indirect
|
||||||
|
)
|
||||||
17
go.sum
Normal file
17
go.sum
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
||||||
|
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
|
||||||
|
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||||
|
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||||
|
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||||
|
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
|
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||||
|
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
|
||||||
|
github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
|
||||||
|
github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
|
||||||
|
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
|
||||||
|
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
18
logging.go
Normal file
18
logging.go
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
zlog "github.com/rs/zerolog/log"
|
||||||
|
)
|
||||||
|
|
||||||
|
func LogDebug(format string, args ...interface{}) {
|
||||||
|
zlog.Debug().Msg(fmt.Sprintf(format, args...))
|
||||||
|
}
|
||||||
|
|
||||||
|
func LogInfo(format string, args ...interface{}) {
|
||||||
|
zlog.Info().Msg(fmt.Sprintf(format, args...))
|
||||||
|
}
|
||||||
|
func LogError(format string, args ...interface{}) {
|
||||||
|
zlog.Error().Err(fmt.Errorf(format, args...)).Msg("")
|
||||||
|
}
|
||||||
103
main.go
Normal file
103
main.go
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"github.com/rs/zerolog"
|
||||||
|
zlog "github.com/rs/zerolog/log"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
|
||||||
|
// zerolog.SetGlobalLevel(zerolog.DebugLevel)
|
||||||
|
zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
||||||
|
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr})
|
||||||
|
if err := runApp(); err != nil {
|
||||||
|
LogError("Application error: %v", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func runApp() error {
|
||||||
|
urls := []string{"gemini://smol.gr"} //, "gemini://gmi.noulin.neta/", "gemini://in.gr:443"}
|
||||||
|
|
||||||
|
queue := make(chan string)
|
||||||
|
done := make(chan struct{})
|
||||||
|
|
||||||
|
// Start the crawler.
|
||||||
|
go crawler(queue, done)
|
||||||
|
|
||||||
|
// Send URLs to the queue
|
||||||
|
for _, url := range urls {
|
||||||
|
// Send URL to queue; blocks until crawler receives it
|
||||||
|
queue <- url
|
||||||
|
}
|
||||||
|
|
||||||
|
// All URLs have been sent and received
|
||||||
|
// because queue is unbuffered; safe to close the queue
|
||||||
|
close(queue)
|
||||||
|
|
||||||
|
// Wait until crawler signals finish
|
||||||
|
<-done
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func crawler(queue <-chan string, done chan struct{}) {
|
||||||
|
// Start processing results.
|
||||||
|
results := make(chan Result)
|
||||||
|
resultsDone := make(chan struct{})
|
||||||
|
go resultsHandler(results, resultsDone)
|
||||||
|
|
||||||
|
// Create workers that consume the queue channel,
|
||||||
|
// and send their result to results channel.
|
||||||
|
workers := 3
|
||||||
|
LogInfo("Spawning %d workers", workers)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
// Start worker goroutines
|
||||||
|
for range workers {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
worker(queue, results)
|
||||||
|
wg.Done()
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait until all workers have finished.
|
||||||
|
wg.Wait()
|
||||||
|
LogInfo("All workers have finished")
|
||||||
|
|
||||||
|
// Nobody left to send to results, so we
|
||||||
|
// close it, and the ResultsProcessor can
|
||||||
|
// finish
|
||||||
|
close(results)
|
||||||
|
<-resultsDone
|
||||||
|
|
||||||
|
close(done)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resultsHandler(results <-chan Result, done chan struct{}) {
|
||||||
|
for result := range results {
|
||||||
|
if result.error != nil {
|
||||||
|
LogError("%w", result.error)
|
||||||
|
} else {
|
||||||
|
LogInfo("[%s] Done. Result: %#v", result.url, result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LogInfo("All results have been processed")
|
||||||
|
close(done)
|
||||||
|
}
|
||||||
|
|
||||||
|
func worker(queue <-chan string, results chan Result) {
|
||||||
|
for url := range queue {
|
||||||
|
result := Visit(url)
|
||||||
|
// If we encountered an error when
|
||||||
|
// visiting, skip processing
|
||||||
|
if result.error != nil {
|
||||||
|
results <- *result
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result = Process(result)
|
||||||
|
results <- *result
|
||||||
|
}
|
||||||
|
}
|
||||||
66
network.go
Normal file
66
network.go
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"crypto/tls"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func Visit(url string) (result *Result) {
|
||||||
|
result = &Result{}
|
||||||
|
|
||||||
|
// Wrap error with additional information
|
||||||
|
defer func() {
|
||||||
|
if result.error != nil {
|
||||||
|
result.error = fmt.Errorf("[%s] Error: %w", result.url, result.error)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
geminiUrl, err := ParseUrl(url, "")
|
||||||
|
if err != nil {
|
||||||
|
result.error = err
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
result.url = *geminiUrl
|
||||||
|
|
||||||
|
LogInfo("[%s] Dialing", geminiUrl.String())
|
||||||
|
|
||||||
|
// Establish a TLS connection
|
||||||
|
tlsConfig := &tls.Config{
|
||||||
|
InsecureSkipVerify: true,
|
||||||
|
}
|
||||||
|
conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.hostname, geminiUrl.port), tlsConfig)
|
||||||
|
if err != nil {
|
||||||
|
result.error = err
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
defer conn.Close()
|
||||||
|
|
||||||
|
// Read data from the connection
|
||||||
|
conn.SetReadDeadline(time.Now().Add(5 * time.Second))
|
||||||
|
buf := make([]byte, 1024)
|
||||||
|
var data []byte
|
||||||
|
// Write Gemini request to get response.
|
||||||
|
conn.Write([]byte(fmt.Sprintf("%s\r\n", geminiUrl.String())))
|
||||||
|
// Read response bytes in len(buf) byte chunks
|
||||||
|
for {
|
||||||
|
n, err := conn.Read(buf)
|
||||||
|
if n > 0 {
|
||||||
|
data = append(data, buf[:n]...)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
if err == io.EOF {
|
||||||
|
break
|
||||||
|
} else {
|
||||||
|
result.error = err
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data))
|
||||||
|
// time.Sleep(time.Duration(time.Second * 2))
|
||||||
|
// LogDebug("[%s] Visitor finished", geminiUrl.String())
|
||||||
|
result.data = string(data)
|
||||||
|
return result
|
||||||
|
}
|
||||||
25
types.go
Normal file
25
types.go
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GeminiUrl holds the components of a parsed URL together with an optional
// human-readable description.
type GeminiUrl struct {
	// protocol is the URL scheme.
	protocol string
	// hostname is the host without the port.
	hostname string
	// port is the numeric port.
	port int
	// path is the path component of the URL.
	path string
	// descr is the description attached to the URL; it is not part of the
	// String output.
	descr string
}

// String renders the URL as "protocol://hostname:port" followed by the path.
// The port is always included.
func (u GeminiUrl) String() string {
	authority := fmt.Sprintf("%s:%d", u.hostname, u.port)
	return fmt.Sprintf("%s://%s%s", u.protocol, authority, u.path)
}
|
||||||
|
|
||||||
|
type Result struct {
|
||||||
|
url GeminiUrl
|
||||||
|
data string
|
||||||
|
links []GeminiUrl
|
||||||
|
code int
|
||||||
|
error error
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user