Basic functionality

This commit is contained in:
2024-10-04 13:15:07 +03:00
parent eb963542b7
commit 74be6b4d0d
7 changed files with 373 additions and 0 deletions

134
gemini.go Normal file
View File

@@ -0,0 +1,134 @@
package main
import (
"errors"
"fmt"
"net/url"
"regexp"
"strconv"
)
// Process inspects the raw response stored in result.data and, for a
// response whose status code is 20, collects the links found in it.
//
// The status code is read from the leading digits of the response and saved
// in result.code. If no code can be parsed, or the code is not 20,
// result.error is set and no links are extracted. Otherwise every link line
// is resolved against result.url and appended to result.links; links that
// cannot be normalized or parsed are logged and skipped.
//
// The pointer that was passed in is returned.
func Process(result *Result) *Result {
	pageURL := result.url.String()
	LogInfo("[%s] Processing data", pageURL)

	code, err := ParseFirstTwoDigits(result.data)
	if err != nil {
		result.error = fmt.Errorf("[%s] Invalid gemini response code", pageURL)
		return result
	}
	result.code = code
	if code != 20 {
		result.error = fmt.Errorf("[%s] Gemini response code != 20, skipping", pageURL)
		return result
	}

	// Grab link lines
	linkLines := ExtractLinkLines(result.data)
	LogDebug("[%s] Found %d links", pageURL, len(linkLines))

	// Normalize URLs in links, and store them in result
	for _, line := range linkLines {
		normalizedLink, descr, err := NormalizeLink(line, pageURL)
		if err != nil {
			LogError("[%s] Invalid link URL %w", pageURL, err)
			continue
		}
		geminiUrl, err := ParseUrl(normalizedLink, descr)
		if err != nil {
			LogError("[%s] Unparseable gemini link %w", pageURL, err)
			// geminiUrl is nil on error; dereferencing it below would panic.
			continue
		}
		result.links = append(result.links, *geminiUrl)
	}
	return result
}
// ParseUrl parses input with net/url and returns its scheme, host name,
// port and path as a GeminiUrl carrying the given description.
//
// When input names no port, 1965 is used. An error is returned if input
// cannot be parsed as a URL or its port is not a valid integer.
func ParseUrl(input string, descr string) (*GeminiUrl, error) {
	parsed, parseErr := url.Parse(input)
	if parseErr != nil {
		return nil, fmt.Errorf("Error parsing URL %s: %w", input, parseErr)
	}

	// Fall back to port 1965 when the URL does not specify one.
	portText := parsed.Port()
	if portText == "" {
		portText = "1965"
	}
	portNumber, convErr := strconv.Atoi(portText)
	if convErr != nil {
		return nil, fmt.Errorf("Error parsing URL %s: %w", input, convErr)
	}

	geminiURL := GeminiUrl{
		protocol: parsed.Scheme,
		hostname: parsed.Hostname(),
		port:     portNumber,
		path:     parsed.Path,
		descr:    descr,
	}
	return &geminiURL, nil
}
// linkLinePattern matches a whole line that starts with "=>" followed by at
// least one space or tab. It is compiled once instead of on every call.
var linkLinePattern = regexp.MustCompile(`(?m)^=>[ \t]+.*`)

// ExtractLinkLines takes a Gemtext document as a string and returns all
// lines that are link lines, in document order.
//
// Lines are returned without their line terminator: the "\n" is never part
// of a match, and a trailing "\r" left over from CRLF line endings is
// stripped. The result is nil when the document contains no link lines.
func ExtractLinkLines(gemtext string) []string {
	lines := linkLinePattern.FindAllString(gemtext, -1)
	for i, line := range lines {
		// "." matches "\r", so CRLF documents leave a carriage return at
		// the end of every match.
		if n := len(line); n > 0 && line[n-1] == '\r' {
			lines[i] = line[:n-1]
		}
	}
	return lines
}
// linkPartsPattern splits a link line into its URL (group 1) and its
// optional description (group 2). The whitespace separating the two and any
// line terminator are excluded from the description.
var linkPartsPattern = regexp.MustCompile(`^=>[ \t]+(\S+)(?:[ \t]+([^\r\n]*))?`)

// NormalizeLink takes a single link line and the URL of the page it was
// found on, and returns the link converted to an absolute URL together with
// its description.
//
// A relative link is resolved against currentURL; an absolute link is
// returned as parsed. descr is the text following the URL without the
// separating whitespace, or "" when the line has no description.
//
// An error is returned when currentURL is not a valid URL, when linkLine is
// not of the form "=> URL [description]", or when the URL part cannot be
// parsed. URL parse errors are wrapped and can be inspected with errors.As.
func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
	// Parse the current URL
	baseURL, err := url.Parse(currentURL)
	if err != nil {
		return "", "", fmt.Errorf("invalid current URL: %w", err)
	}

	matches := linkPartsPattern.FindStringSubmatch(linkLine)
	if matches == nil {
		// The line doesn't have the expected format.
		return "", "", fmt.Errorf("Not a link line: %v", linkLine)
	}
	// Group 2 is "" when the line carries no description.
	rawURL := matches[1]
	descr = matches[2]

	// Parse the URL from the link line
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return "", "", fmt.Errorf("Invalid URL in link line '%s': %w", rawURL, err)
	}

	// Resolve relative URLs against the base URL
	if !parsedURL.IsAbs() {
		parsedURL = baseURL.ResolveReference(parsedURL)
	}
	return parsedURL.String(), descr, nil
}
// ParseFirstTwoDigits reads the one or two ASCII digits that input starts
// with and returns them as an int. Anything after the second digit is
// ignored. It returns an error when input does not begin with a digit.
func ParseFirstTwoDigits(input string) (int, error) {
	// Count the leading digits, stopping after the second one.
	digits := 0
	for digits < 2 && digits < len(input) && input[digits] >= '0' && input[digits] <= '9' {
		digits++
	}
	if digits == 0 {
		return 0, errors.New("no digits found at the beginning of the string")
	}

	value, convErr := strconv.Atoi(input[:digits])
	if convErr != nil {
		return 0, fmt.Errorf("failed to convert matched digits to int: %v", convErr)
	}
	return value, nil
}

10
go.mod Normal file
View File

@@ -0,0 +1,10 @@
module gemini-grc
go 1.23.1
require (
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/rs/zerolog v1.33.0
golang.org/x/sys v0.25.0 // indirect
)

17
go.sum Normal file
View File

@@ -0,0 +1,17 @@
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=

18
logging.go Normal file
View File

@@ -0,0 +1,18 @@
package main
import (
"fmt"
zlog "github.com/rs/zerolog/log"
)
// LogDebug formats its arguments in the manner of fmt.Sprintf and emits the
// result as a debug-level message on the global zerolog logger.
//
// Msgf is used so that the message is not formatted at all when the debug
// level is disabled.
func LogDebug(format string, args ...interface{}) {
	zlog.Debug().Msgf(format, args...)
}
// LogInfo formats its arguments in the manner of fmt.Sprintf and emits the
// result as an info-level message on the global zerolog logger.
//
// Msgf is used so that the message is not formatted at all when the info
// level is disabled.
func LogInfo(format string, args ...interface{}) {
	zlog.Info().Msgf(format, args...)
}
// LogError builds an error from format and args with fmt.Errorf (so the %w
// verb may be used to wrap an existing error) and logs it at error level on
// the global zerolog logger. The error is attached to the event and the
// message text itself is left empty.
func LogError(format string, args ...interface{}) {
	wrapped := fmt.Errorf(format, args...)
	zlog.Error().Err(wrapped).Msg("")
}

103
main.go Normal file
View File

@@ -0,0 +1,103 @@
package main
import (
"os"
"sync"
"github.com/rs/zerolog"
zlog "github.com/rs/zerolog/log"
)
// main configures the global zerolog logger (Unix timestamps, info level,
// console output on stderr) and runs the application. If runApp reports an
// error it is logged and the process exits with status 1.
func main() {
	zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
	// Use zerolog.DebugLevel here instead to see debug output.
	zerolog.SetGlobalLevel(zerolog.InfoLevel)
	zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr})

	err := runApp()
	if err == nil {
		return
	}
	LogError("Application error: %v", err)
	os.Exit(1)
}
// runApp starts the crawler goroutine, feeds it the seed URLs through an
// unbuffered channel, closes that channel and then blocks until the crawler
// closes done. It always returns nil.
func runApp() error {
	queue := make(chan string)
	done := make(chan struct{})

	// Start the crawler.
	go crawler(queue, done)

	// Seeds that are currently commented out in the list:
	// "gemini://gmi.noulin.neta/", "gemini://in.gr:443"
	seeds := []string{"gemini://smol.gr"}
	for _, seed := range seeds {
		// Each send blocks until a worker has received the URL.
		queue <- seed
	}

	// The queue is unbuffered, so by now every URL has been received and
	// it is safe to close it.
	close(queue)

	// Wait until the crawler signals that it has finished.
	<-done
	return nil
}
// crawler runs three worker goroutines that consume URLs from queue and
// send their results to a results channel drained by resultsHandler.
//
// When queue is closed and all workers have returned, the results channel
// is closed; once resultsHandler has finished, done is closed to signal
// completion to the caller.
func crawler(queue <-chan string, done chan struct{}) {
	// Start processing results.
	results := make(chan Result)
	resultsDone := make(chan struct{})
	go resultsHandler(results, resultsDone)

	workers := 3
	LogInfo("Spawning %d workers", workers)

	// Each worker reads from queue and writes to results.
	var wg sync.WaitGroup
	wg.Add(workers)
	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()
			worker(queue, results)
		}()
	}

	// Wait until all workers have finished.
	wg.Wait()
	LogInfo("All workers have finished")

	// No senders are left, so results can be closed, which lets
	// resultsHandler finish.
	close(results)
	<-resultsDone
	close(done)
}
// resultsHandler receives results until the results channel is closed,
// logging each one: failed results at error level, successful ones at info
// level together with a dump of the whole Result. It closes done when there
// is nothing left to receive.
func resultsHandler(results <-chan Result, done chan struct{}) {
	for res := range results {
		if res.error == nil {
			LogInfo("[%s] Done. Result: %#v", res.url, res)
			continue
		}
		LogError("%w", res.error)
	}
	LogInfo("All results have been processed")
	close(done)
}
// worker visits every URL received from queue and sends one Result per URL
// to results. A visit that succeeded is passed through Process first; a
// visit that failed is forwarded as is. It returns when queue is closed.
func worker(queue <-chan string, results chan Result) {
	for target := range queue {
		res := Visit(target)
		// Only process the response when the visit itself succeeded.
		if res.error == nil {
			res = Process(res)
		}
		results <- *res
	}
}

66
network.go Normal file
View File

@@ -0,0 +1,66 @@
package main
import (
"crypto/tls"
"fmt"
"io"
"time"
)
// Visit fetches url over TLS and returns the outcome as a Result.
//
// The URL is parsed with ParseUrl, a TLS connection is opened to its
// hostname and port, the URL followed by CRLF is sent as the request, and
// the response is read until EOF into result.data. The whole exchange after
// the connection is established must complete within five seconds.
//
// Visit never returns nil. Any failure is stored in result.error, prefixed
// with the URL by the deferred wrapper; in that case result.data is empty.
func Visit(url string) (result *Result) {
	result = &Result{}

	// Wrap error with additional information
	defer func() {
		if result.error != nil {
			result.error = fmt.Errorf("[%s] Error: %w", result.url, result.error)
		}
	}()

	geminiUrl, err := ParseUrl(url, "")
	if err != nil {
		result.error = err
		return result
	}
	result.url = *geminiUrl

	LogInfo("[%s] Dialing", geminiUrl.String())

	// Establish a TLS connection
	tlsConfig := &tls.Config{
		// NOTE(review): certificate verification is disabled; presumably
		// because Gemini servers commonly use self-signed certificates —
		// confirm this is intended.
		InsecureSkipVerify: true,
	}
	conn, err := tls.Dial("tcp", fmt.Sprintf("%s:%d", geminiUrl.hostname, geminiUrl.port), tlsConfig)
	if err != nil {
		result.error = err
		return result
	}
	defer conn.Close()

	// Bound both the request write and the response read, so a stalled
	// peer cannot block the caller indefinitely.
	if err = conn.SetDeadline(time.Now().Add(5 * time.Second)); err != nil {
		result.error = err
		return result
	}

	// Write Gemini request to get response. A failed write is reported
	// here rather than surfacing later as a read timeout.
	if _, err = fmt.Fprintf(conn, "%s\r\n", geminiUrl.String()); err != nil {
		result.error = err
		return result
	}

	// Read the response until the server closes the connection.
	data, err := io.ReadAll(conn)
	if err != nil {
		result.error = err
		return result
	}

	LogInfo("[%s] Received %d bytes", geminiUrl.String(), len(data))
	result.data = string(data)
	return result
}

25
types.go Normal file
View File

@@ -0,0 +1,25 @@
package main
import (
"fmt"
)
// GeminiUrl holds the components of a parsed URL together with the
// description of the link it came from.
type GeminiUrl struct {
	// protocol is the URL scheme; hostname is the host name without the
	// port.
	protocol, hostname string
	// port is the numeric port.
	port int
	// path is the URL path; descr is the link description, empty when
	// there is none.
	path, descr string
}
// String renders the URL as "protocol://hostname:port" followed by the
// path. The port is always written out and the description is not included.
func (u GeminiUrl) String() string {
	return fmt.Sprintf("%s://%s:%d%s", u.protocol, u.hostname, u.port, u.path)
}
// Result carries the outcome of fetching and processing a single URL.
type Result struct {
	// url is the URL the result belongs to.
	url GeminiUrl
	// data is the raw response text as received from the server.
	data string
	// links holds the links extracted from data.
	links []GeminiUrl
	// code is presumably the response status code.
	// NOTE(review): confirm where this field is populated.
	code int
	// error is non-nil when fetching or processing failed.
	error error
}