Basic functionality
This commit is contained in:
134
gemini.go
Normal file
134
gemini.go
Normal file
@@ -0,0 +1,134 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
func Process(result *Result) *Result {
|
||||
LogInfo("[%s] Processing data", result.url.String())
|
||||
code, err := ParseFirstTwoDigits(result.data)
|
||||
if err != nil {
|
||||
result.error = fmt.Errorf("[%s] Invalid gemini response code", result.url.String())
|
||||
return result
|
||||
}
|
||||
if code != 20 {
|
||||
result.error = fmt.Errorf("[%s] Gemini response code != 20, skipping", result.url.String())
|
||||
return result
|
||||
}
|
||||
// Grab link lines
|
||||
linkLines := ExtractLinkLines(result.data)
|
||||
LogDebug("[%s] Found %d links", result.url.String(), len(linkLines))
|
||||
// Normalize URLs in links, and store them in result
|
||||
for _, line := range linkLines {
|
||||
normalizedLink, descr, error := NormalizeLink(line, result.url.String())
|
||||
if error != nil {
|
||||
LogError("[%s] Invalid link URL %w", result.url.String(), error)
|
||||
continue
|
||||
}
|
||||
geminiUrl, error := ParseUrl(normalizedLink, descr)
|
||||
if error != nil {
|
||||
LogError("[%s] Unparseable gemini link %w", result.url.String(), error)
|
||||
}
|
||||
result.links = append(result.links, *geminiUrl)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
|
||||
u, err := url.Parse(input)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
|
||||
}
|
||||
protocol := u.Scheme
|
||||
hostname := u.Hostname()
|
||||
str_port := u.Port()
|
||||
if str_port == "" {
|
||||
str_port = "1965"
|
||||
}
|
||||
port, err := strconv.Atoi(str_port)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
|
||||
}
|
||||
return &GeminiUrl{protocol: protocol, hostname: hostname, port: port, path: u.Path, descr: descr}, nil
|
||||
}
|
||||
|
||||
// linkLineRe matches one Gemtext link line: "=>" at the start of a line,
// at least one space or tab, then the rest of the line. The match stops
// before any '\r' or '\n', so CRLF-terminated documents do not leave a
// trailing carriage return in the result. It is compiled once at package
// scope rather than on every call.
var linkLineRe = regexp.MustCompile(`(?m)^=>[ \t]+[^\r\n]*`)

// ExtractLinkLines takes a Gemtext document as a string and returns all
// lines that are link lines, in document order and without their line
// terminators. It returns nil when the document contains no link lines.
func ExtractLinkLines(gemtext string) []string {
	return linkLineRe.FindAllString(gemtext, -1)
}
|
||||
|
||||
// linkLinePartsRe splits a Gemtext link line into its URL (group 1) and its
// optional description (group 2). The whitespace separating the two is not
// part of the description, and the description stops before any '\r' or
// '\n'. It is compiled once at package scope rather than on every call.
var linkLinePartsRe = regexp.MustCompile(`^=>[ \t]+(\S+)(?:[ \t]+([^\r\n]*))?`)

// NormalizeLink takes a single link line and the URL of the page it was
// found on, and returns the link's URL converted to an absolute URL together
// with the link's description.
//
// Relative URLs are resolved against currentURL; absolute URLs are returned
// as parsed. The description is empty when the line has none.
//
// An error is returned when currentURL cannot be parsed, when linkLine is
// not a link line, or when the URL inside the link line cannot be parsed.
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
	// Parse the current URL; it is the base for relative links.
	baseURL, err := url.Parse(currentURL)
	if err != nil {
		return "", "", fmt.Errorf("invalid current URL: %w", err)
	}

	// A successful match always has three entries: the whole match, the
	// URL, and the description (empty when absent).
	matches := linkLinePartsRe.FindStringSubmatch(linkLine)
	if matches == nil {
		return "", "", fmt.Errorf("Not a link line: %v", linkLine)
	}
	originalURLStr := matches[1]
	description := matches[2]

	// Parse the URL from the link line.
	parsedURL, err := url.Parse(originalURLStr)
	if err != nil {
		return "", "", fmt.Errorf("Invalid URL in link line '%s': %w", originalURLStr, err)
	}

	// Resolve relative URLs against the base URL.
	if !parsedURL.IsAbs() {
		parsedURL = baseURL.ResolveReference(parsedURL)
	}

	return parsedURL.String(), description, nil
}
|
||||
|
||||
// ParseFirstTwoDigits takes a string and returns its first one or two
// leading ASCII digits as an int. Anything after the second digit is
// ignored, so "123" yields 12.
//
// If the string does not start with a digit, it returns an error.
func ParseFirstTwoDigits(input string) (int, error) {
	// Scan at most two leading digits by hand; this avoids compiling a
	// regular expression on every call.
	value, digits := 0, 0
	for digits < 2 && digits < len(input) {
		c := input[digits]
		if c < '0' || c > '9' {
			break
		}
		value = value*10 + int(c-'0')
		digits++
	}
	if digits == 0 {
		return 0, errors.New("no digits found at the beginning of the string")
	}
	return value, nil
}
|
||||
10
go.mod
Normal file
10
go.mod
Normal file
@@ -0,0 +1,10 @@
|
||||
module gemini-grc
|
||||
|
||||
go 1.23.1
|
||||
|
||||
require (
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/rs/zerolog v1.33.0
|
||||
golang.org/x/sys v0.25.0 // indirect
|
||||
)
|
||||
17
go.sum
Normal file
17
go.sum
Normal file
@@ -0,0 +1,17 @@
|
||||
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
||||
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
|
||||
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
|
||||
github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
|
||||
github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
|
||||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
|
||||
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
18
logging.go
Normal file
18
logging.go
Normal file
@@ -0,0 +1,18 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
zlog "github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func LogDebug(format string, args ...interface{}) {
|
||||
zlog.Debug().Msg(fmt.Sprintf(format, args...))
|
||||
}
|
||||
|
||||
func LogInfo(format string, args ...interface{}) {
|
||||
zlog.Info().Msg(fmt.Sprintf(format, args...))
|
||||
}
|
||||
func LogError(format string, args ...interface{}) {
|
||||
zlog.Error().Err(fmt.Errorf(format, args...)).Msg("")
|
||||
}
|
||||
103
main.go
Normal file
103
main.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
"github.com/rs/zerolog"
|
||||
zlog "github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func main() {
|
||||
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
|
||||
// zerolog.SetGlobalLevel(zerolog.DebugLevel)
|
||||
zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
||||
zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr})
|
||||
if err := runApp(); err != nil {
|
||||
LogError("Application error: %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
func runApp() error {
|
||||
urls := []string{"gemini://smol.gr"} //, "gemini://gmi.noulin.neta/", "gemini://in.gr:443"}
|
||||
|
||||
queue := make(chan string)
|
||||
done := make(chan struct{})
|
||||
|
||||
// Start the crawler.
|
||||
go crawler(queue, done)
|
||||
|
||||
// Send URLs to the queue
|
||||
for _, url := range urls {
|
||||
// Send URL to queue; blocks until crawler receives it
|
||||
queue <- url
|
||||
}
|
||||
|
||||
// All URLs have been sent and received
|
||||
// because queue is unbuffered; safe to close the queue
|
||||
close(queue)
|
||||
|
||||
// Wait until crawler signals finish
|
||||
<-done
|
||||
return nil
|
||||
}
|
||||
|
||||
func crawler(queue <-chan string, done chan struct{}) {
|
||||
// Start processing results.
|
||||
results := make(chan Result)
|
||||
resultsDone := make(chan struct{})
|
||||
go resultsHandler(results, resultsDone)
|
||||
|
||||
// Create workers that consume the queue channel,
|
||||
// and send their result to results channel.
|
||||
workers := 3
|
||||
LogInfo("Spawning %d workers", workers)
|
||||
var wg sync.WaitGroup
|
||||
// Start worker goroutines
|
||||
for range workers {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
worker(queue, results)
|
||||
wg.Done()
|
||||
}()
|
||||
}
|
||||
|
||||
// Wait until all workers have finished.
|
||||
wg.Wait()
|
||||
LogInfo("All workers have finished")
|
||||
|
||||
// Nobody left to send to results, so we
|
||||
// close it, and the ResultsProcessor can
|
||||
// finish
|
||||
close(results)
|
||||
<-resultsDone
|
||||
|
||||
close(done)
|
||||
}
|
||||
|
||||
func resultsHandler(results <-chan Result, done chan struct{}) {
|
||||
for result := range results {
|
||||
if result.error != nil {
|
||||
LogError("%w", result.error)
|
||||
} else {
|
||||
LogInfo("[%s] Done. Result: %#v", result.url, result)
|
||||
}
|
||||
}
|
||||
LogInfo("All results have been processed")
|
||||
close(done)
|
||||
}
|
||||
|
||||
func worker(queue <-chan string, results chan Result) {
|
||||
for url := range queue {
|
||||
result := Visit(url)
|
||||
// If we encountered an error when
|
||||
// visiting, skip processing
|
||||
if result.error != nil {
|
||||
results <- *result
|
||||
continue
|
||||
}
|
||||
result = Process(result)
|
||||
results <- *result
|
||||
}
|
||||
}
|
||||
66
network.go
Normal file
66
network.go
Normal file
@@ -0,0 +1,66 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"fmt"
|
||||
"io"
|
||||
"time"
|
||||
)
|
||||
|
||||
func Visit(url string) (result *Result) {
|
||||
result = &Result{}
|
||||
|
||||
// Wrap error with additional information
|
||||
defer func() {
|
||||
if result.error != nil {
|
||||
result.error = fmt.Errorf("[%s] Error: %w", result.url, result.error)
|
||||
}
|
||||
}()
|
||||
|
||||
geminiUrl, err := ParseUrl(url, "")
|
||||
if err != nil {
|
||||
result.error = err
|
||||
return result
|
||||
}
|
||||
result.url = *geminiUrl
|
||||
|
||||
LogInfo("[%s] Dialing", geminiUrl.String())
|
||||
|
||||
// Establish a TLS connection
|
||||
tlsConfig := &tls.Config{
|
||||
InsecureSkipVerify: true,
|
||||
}
|
||||
conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.hostname, geminiUrl.port), tlsConfig)
|
||||
if err != nil {
|
||||
result.error = err
|
||||
return result
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
// Read data from the connection
|
||||
conn.SetReadDeadline(time.Now().Add(5 * time.Second))
|
||||
buf := make([]byte, 1024)
|
||||
var data []byte
|
||||
// Write Gemini request to get response.
|
||||
conn.Write([]byte(fmt.Sprintf("%s\r\n", geminiUrl.String())))
|
||||
// Read response bytes in len(buf) byte chunks
|
||||
for {
|
||||
n, err := conn.Read(buf)
|
||||
if n > 0 {
|
||||
data = append(data, buf[:n]...)
|
||||
}
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
} else {
|
||||
result.error = err
|
||||
return result
|
||||
}
|
||||
}
|
||||
}
|
||||
LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data))
|
||||
// time.Sleep(time.Duration(time.Second * 2))
|
||||
// LogDebug("[%s] Visitor finished", geminiUrl.String())
|
||||
result.data = string(data)
|
||||
return result
|
||||
}
|
||||
25
types.go
Normal file
25
types.go
Normal file
@@ -0,0 +1,25 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// GeminiUrl is a URL split into the components the crawler works with,
// together with the description of the link it was found in (if any).
type GeminiUrl struct {
	protocol string // URL scheme
	hostname string // host name, without the port
	port     int    // port number
	path     string // URL path
	descr    string // link description; not part of the String form
}

// String formats the URL as "protocol://hostname:port" followed by the
// path. The port is always included and the description is omitted.
func (u GeminiUrl) String() string {
	return fmt.Sprintf("%s://%s:%d%s", u.protocol, u.hostname, u.port, u.path)
}
|
||||
|
||||
type Result struct {
|
||||
url GeminiUrl
|
||||
data string
|
||||
links []GeminiUrl
|
||||
code int
|
||||
error error
|
||||
}
|
||||
Reference in New Issue
Block a user