diff --git a/gemini.go b/gemini.go
new file mode 100644
index 0000000..313c8c5
--- /dev/null
+++ b/gemini.go
@@ -0,0 +1,129 @@
+package main
+
+import (
+	"errors"
+	"fmt"
+	"net/url"
+	"regexp"
+	"strconv"
+)
+
+// Patterns are compiled once at package scope rather than on every call.
+var (
+	// linkLineRe matches each Gemtext link line ("=>" at the start of a line).
+	linkLineRe = regexp.MustCompile(`(?m)^=>[ \t]+.*`)
+	// linkPartsRe captures the URL and the optional remainder of a link line.
+	linkPartsRe = regexp.MustCompile(`^=>[ \t]+(\S+)([ \t]+.*)?`)
+	// leadingDigitsRe captures one or two digits at the start of a string.
+	leadingDigitsRe = regexp.MustCompile(`^(\d{1,2})`)
+)
+
+// Process parses the response held in result.data. It sets result.error when
+// the response does not start with a status code or the code is not 20;
+// otherwise it appends every parseable link found in the body to result.links.
+// The same result pointer is always returned.
+func Process(result *Result) *Result {
+	pageURL := result.url.String()
+	LogInfo("[%s] Processing data", pageURL)
+	code, err := ParseFirstTwoDigits(result.data)
+	if err != nil {
+		result.error = fmt.Errorf("[%s] Invalid gemini response code", pageURL)
+		return result
+	}
+	if code != 20 {
+		result.error = fmt.Errorf("[%s] Gemini response code != 20, skipping", pageURL)
+		return result
+	}
+	// Grab link lines
+	linkLines := ExtractLinkLines(result.data)
+	LogDebug("[%s] Found %d links", pageURL, len(linkLines))
+	// Normalize URLs in links, and store them in result
+	for _, line := range linkLines {
+		normalizedLink, descr, err := NormalizeLink(line, pageURL)
+		if err != nil {
+			LogError("[%s] Invalid link URL %w", pageURL, err)
+			continue
+		}
+		geminiUrl, err := ParseUrl(normalizedLink, descr)
+		if err != nil {
+			LogError("[%s] Unparseable gemini link %w", pageURL, err)
+			// ParseUrl returns a nil pointer on failure, so the link must
+			// be skipped; dereferencing it below would panic.
+			continue
+		}
+		result.links = append(result.links, *geminiUrl)
+	}
+	return result
+}
+
+// ParseUrl parses input as a URL and returns its scheme, hostname, port and
+// path as a GeminiUrl carrying the given description. The port defaults to
+// 1965 when the URL does not specify one.
+func ParseUrl(input string, descr string) (*GeminiUrl, error) {
+	u, err := url.Parse(input)
+	if err != nil {
+		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
+	}
+	portStr := u.Port()
+	if portStr == "" {
+		portStr = "1965"
+	}
+	port, err := strconv.Atoi(portStr)
+	if err != nil {
+		return nil, fmt.Errorf("Error parsing URL %s: %w", input, err)
+	}
+	return &GeminiUrl{protocol: u.Scheme, hostname: u.Hostname(), port: port, path: u.Path, descr: descr}, nil
+}
+
+// ExtractLinkLines takes a Gemtext document as a string and returns all lines that are link lines
+func ExtractLinkLines(gemtext string) []string {
+	return linkLineRe.FindAllString(gemtext, -1)
+}
+
+// NormalizeLink takes a single link line and the current URL, and returns
+// the link's URL converted to an absolute URL together with its description.
+// The description is the remainder of the line after the URL, including its
+// leading whitespace; it is empty when the line has none.
+func NormalizeLink(linkLine string, currentURL string) (link string, descr string, err error) {
+	// Parse the current URL
+	baseURL, err := url.Parse(currentURL)
+	if err != nil {
+		return "", "", fmt.Errorf("invalid current URL: %w", err)
+	}
+
+	// Extract the URL and the rest of the line
+	matches := linkPartsRe.FindStringSubmatch(linkLine)
+	if len(matches) == 0 {
+		return "", "", fmt.Errorf("Not a link line: %v", linkLine)
+	}
+	originalURLStr, restOfLine := matches[1], matches[2]
+
+	// Parse the URL from the link line
+	parsedURL, err := url.Parse(originalURLStr)
+	if err != nil {
+		return "", "", fmt.Errorf("Invalid URL in link line '%s': %w", originalURLStr, err)
+	}
+
+	// Resolve relative URLs against the base URL
+	if !parsedURL.IsAbs() {
+		parsedURL = baseURL.ResolveReference(parsedURL)
+	}
+	return parsedURL.String(), restOfLine, nil
+}
+
+// ParseFirstTwoDigits takes a string and returns the first one or two digits as an int.
+// If no valid digits are found, it returns an error.
+func ParseFirstTwoDigits(input string) (int, error) {
+	matches := leadingDigitsRe.FindStringSubmatch(input)
+	if len(matches) == 0 {
+		return 0, errors.New("no digits found at the beginning of the string")
+	}
+
+	// Parse the captured match as an integer
+	result, err := strconv.Atoi(matches[1])
+	if err != nil {
+		return 0, fmt.Errorf("failed to convert matched digits to int: %w", err)
+	}
+
+	return result, nil
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..b4c0810
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,10 @@
+module gemini-grc
+
+go 1.23.1
+
+require (
+	github.com/mattn/go-colorable v0.1.13 // indirect
+	github.com/mattn/go-isatty v0.0.20 // indirect
+	github.com/rs/zerolog v1.33.0 // indirect
+	golang.org/x/sys v0.25.0 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..b52b48f
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,17 @@
+github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
+github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
+github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
+github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
+github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
+github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
+github.com/rs/zerolog v1.33.0
h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
+github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
+golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
+golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
diff --git a/logging.go b/logging.go
new file mode 100644
index 0000000..0e2d3f7
--- /dev/null
+++ b/logging.go
@@ -0,0 +1,23 @@
+package main
+
+import (
+	"fmt"
+
+	zlog "github.com/rs/zerolog/log"
+)
+
+// LogDebug formats the message like fmt.Sprintf and logs it at debug level.
+func LogDebug(format string, args ...any) {
+	zlog.Debug().Msg(fmt.Sprintf(format, args...))
+}
+
+// LogInfo formats the message like fmt.Sprintf and logs it at info level.
+func LogInfo(format string, args ...any) {
+	zlog.Info().Msg(fmt.Sprintf(format, args...))
+}
+
+// LogError builds an error with fmt.Errorf (so %w may be used to wrap an
+// error) and logs it at error level with an empty message.
+func LogError(format string, args ...any) {
+	zlog.Error().Err(fmt.Errorf(format, args...)).Msg("")
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..09a4566
--- /dev/null
+++ b/main.go
@@ -0,0 +1,114 @@
+package main
+
+import (
+	"os"
+	"sync"
+
+	"github.com/rs/zerolog"
+	zlog "github.com/rs/zerolog/log"
+)
+
+// main configures console logging on stderr at info level, runs the
+// application and exits with status 1 if it reports an error.
+func main() {
+	zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
+	// zerolog.SetGlobalLevel(zerolog.DebugLevel)
+	zerolog.SetGlobalLevel(zerolog.InfoLevel)
+	zlog.Logger = zlog.Output(zerolog.ConsoleWriter{Out: os.Stderr})
+	if err := runApp(); err != nil {
+		LogError("Application error: %v", err)
+		os.Exit(1)
+	}
+}
+
+// runApp feeds the seed URLs to the crawler and blocks until the crawler
+// signals that it has finished. It currently always returns nil.
+func runApp() error {
+	urls := []string{"gemini://smol.gr"} //, "gemini://gmi.noulin.neta/", "gemini://in.gr:443"}
+
+	queue := make(chan string)
+	done := make(chan struct{})
+
+	// Start the crawler.
+	go crawler(queue, done)
+
+	// Send URLs to the queue
+	for _, url := range urls {
+		// Send URL to queue; blocks until crawler receives it
+		queue <- url
+	}
+
+	// All URLs have been sent and received
+	// because queue is unbuffered; safe to close the queue
+	close(queue)
+
+	// Wait until crawler signals finish
+	<-done
+	return nil
+}
+
+// crawler starts the results handler and a fixed pool of workers that
+// consume queue. Once queue is closed and every worker has returned it
+// closes the results channel, waits for the handler to drain it and then
+// closes done.
+func crawler(queue <-chan string, done chan<- struct{}) {
+	// Start processing results.
+	results := make(chan Result)
+	resultsDone := make(chan struct{})
+	go resultsHandler(results, resultsDone)
+
+	// Create workers that consume the queue channel,
+	// and send their result to results channel.
+	workers := 3
+	LogInfo("Spawning %d workers", workers)
+	var wg sync.WaitGroup
+	// Start worker goroutines
+	for range workers {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			worker(queue, results)
+		}()
+	}
+
+	// Wait until all workers have finished.
+	wg.Wait()
+	LogInfo("All workers have finished")
+
+	// Nobody left to send to results, so we
+	// close it, and the results handler can
+	// finish
+	close(results)
+	<-resultsDone
+
+	close(done)
+}
+
+// resultsHandler logs every result received on results: the error when one
+// is set, otherwise the result itself. It closes done once results has been
+// closed and drained.
+func resultsHandler(results <-chan Result, done chan<- struct{}) {
+	for result := range results {
+		if result.error != nil {
+			LogError("%w", result.error)
+			continue
+		}
+		LogInfo("[%s] Done. Result: %#v", result.url, result)
+	}
+	LogInfo("All results have been processed")
+	close(done)
+}
+
+// worker visits every URL received on queue and sends one Result per URL to
+// results. A response is only processed when the visit succeeded.
+func worker(queue <-chan string, results chan<- Result) {
+	for url := range queue {
+		result := Visit(url)
+		// If we encountered an error when
+		// visiting, skip processing
+		if result.error == nil {
+			result = Process(result)
+		}
+		results <- *result
+	}
+}
diff --git a/network.go b/network.go
new file mode 100644
index 0000000..7702abc
--- /dev/null
+++ b/network.go
@@ -0,0 +1,77 @@
+package main
+
+import (
+	"crypto/tls"
+	"fmt"
+	"io"
+	"net"
+	"strconv"
+	"time"
+)
+
+// visitTimeout bounds both establishing the connection (including the TLS
+// handshake) and the request/response exchange that follows.
+const visitTimeout = 5 * time.Second
+
+// Visit requests url from its Gemini server over TLS and returns a Result
+// holding either the raw response in data or, on failure, an error prefixed
+// with the URL. The returned pointer is never nil.
+func Visit(url string) (result *Result) {
+	result = &Result{}
+	// label identifies this visit in logs and errors. It starts as the raw
+	// input so that a URL which fails to parse is still reported by name.
+	label := url
+
+	// Wrap error with additional information
+	defer func() {
+		if result.error != nil {
+			result.error = fmt.Errorf("[%s] Error: %w", label, result.error)
+		}
+	}()
+
+	geminiUrl, err := ParseUrl(url, "")
+	if err != nil {
+		result.error = err
+		return result
+	}
+	result.url = *geminiUrl
+	label = geminiUrl.String()
+
+	LogInfo("[%s] Dialing", label)
+
+	// Establish a TLS connection.
+	// NOTE(review): certificate verification is disabled; confirm this is
+	// intended before pointing the crawler at anything sensitive.
+	tlsConfig := &tls.Config{
+		InsecureSkipVerify: true,
+	}
+	// JoinHostPort brackets IPv6 literals, which a plain "%s:%d" would not.
+	addr := net.JoinHostPort(geminiUrl.hostname, strconv.Itoa(geminiUrl.port))
+	dialer := &net.Dialer{Timeout: visitTimeout}
+	conn, err := tls.DialWithDialer(dialer, "tcp", addr, tlsConfig)
+	if err != nil {
+		result.error = err
+		return result
+	}
+	defer conn.Close()
+
+	// One deadline covers the request write and every response read.
+	if err := conn.SetDeadline(time.Now().Add(visitTimeout)); err != nil {
+		result.error = err
+		return result
+	}
+	// Write Gemini request to get response.
+	if _, err := conn.Write([]byte(label + "\r\n")); err != nil {
+		result.error = err
+		return result
+	}
+	// Read the response until the server closes the connection.
+	data, err := io.ReadAll(conn)
+	if err != nil {
+		result.error = err
+		return result
+	}
+	LogInfo("[%s] Received %d bytes", label, len(data))
+	result.data = string(data)
+	return result
+}
diff --git a/types.go b/types.go
new file mode 100644
index 0000000..5bcc108
--- /dev/null
+++ b/types.go
@@ -0,0 +1,33 @@
+package main
+
+import (
+	"fmt"
+	"net"
+	"strconv"
+)
+
+// GeminiUrl holds the components of a parsed URL together with the
+// description text of the link it was found in, if any.
+type GeminiUrl struct {
+	protocol string
+	hostname string
+	port     int
+	path     string
+	descr    string
+}
+
+// String renders the URL as protocol://hostname:port followed by the path;
+// the description is not included. IPv6 literal hostnames are bracketed.
+func (u GeminiUrl) String() string {
+	hostPort := net.JoinHostPort(u.hostname, strconv.Itoa(u.port))
+	return fmt.Sprintf("%s://%s%s", u.protocol, hostPort, u.path)
+}
+
+// Result carries the outcome of visiting and processing one URL.
+type Result struct {
+	url   GeminiUrl
+	data  string
+	links []GeminiUrl
+	code  int
+	error error
+}