Lots of features, first version that reliably crawls Geminispace.

- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host
- [x] URL Blacklist
- [x] Save image/* and text/* files
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
.

.

.
This commit is contained in:
2024-10-21 20:04:09 +03:00
parent 212345764b
commit cd60c1363b
37 changed files with 1231 additions and 323 deletions

113
gemini/files.go Normal file
View File

@@ -0,0 +1,113 @@
package gemini
import (
"fmt"
"gemini-grc/logging"
"net/url"
"os"
"path"
"path/filepath"
"strings"
)
// sanitizePath encodes invalid filesystem characters using URL encoding.
// Example:
// /example/path/to/page?query=param&another=value
// would become
// example/path/to/page%3Fquery%3Dparam%26another%3Dvalue
func sanitizePath(p string) string {
// Split the path into its components
components := strings.Split(p, "/")
// Encode each component separately
for i, component := range components {
// Decode any existing percent-encoded characters
decodedComponent, err := url.PathUnescape(component)
if err != nil {
decodedComponent = component // Fallback to original if unescape fails
}
// Encode the component to escape invalid filesystem characters
encodedComponent := url.QueryEscape(decodedComponent)
// Replace '+' (from QueryEscape) with '%20' to handle spaces correctly
encodedComponent = strings.ReplaceAll(encodedComponent, "+", "%20")
components[i] = encodedComponent
}
// Rejoin the components into a sanitized path
safe := filepath.Join(components...)
return safe
}
// getFilePath constructs a safe file path from the root path and URL path.
// It URL-encodes invalid filesystem characters to ensure the path is valid.
func calcFilePath(rootPath, urlPath string) (string, error) {
// Normalize the URL path
cleanPath := filepath.Clean(urlPath)
// Safe check to prevent directory traversal
if strings.Contains(cleanPath, "..") {
return "", fmt.Errorf("Invalid URL path: contains directory traversal")
}
// Sanitize the path by encoding invalid characters
safePath := sanitizePath(cleanPath)
// Join the root path and the sanitized URL path
finalPath := filepath.Join(rootPath, safePath)
return finalPath, nil
}
func SaveToFile(rootPath string, s *Snapshot, done chan struct{}) {
parentPath := path.Join(rootPath, s.URL.Hostname)
urlPath := s.URL.Path
// If path is empty, add `index.gmi` as the file to save
if urlPath == "" || urlPath == "." {
urlPath = fmt.Sprintf("index.gmi")
}
// If path ends with '/' then add index.gmi for the
// directory to be created.
if strings.HasSuffix(urlPath, "/") {
urlPath = strings.Join([]string{urlPath, "index.gmi"}, "")
}
finalPath, err := calcFilePath(parentPath, urlPath)
if err != nil {
logging.LogError("Error saving %s: %w", s.URL, err)
return
}
// Ensure the directory exists
dir := filepath.Dir(finalPath)
if err := os.MkdirAll(dir, os.ModePerm); err != nil {
logging.LogError("Failed to create directory: %w", err)
return
}
if s.MimeType.Valid && s.MimeType.String == "text/gemini" {
err = os.WriteFile(finalPath, (*s).Data.V, 0666)
} else {
err = os.WriteFile(finalPath, []byte((*s).GemText.String), 0666)
}
if err != nil {
logging.LogError("Error saving %s: %w", s.URL.Full, err)
}
close(done)
}
func ReadLines(path string) []string {
data, err := os.ReadFile(path)
if err != nil {
panic(fmt.Sprintf("Failed to read blacklist file: %s", err))
}
lines := strings.Split(string(data), "\n")
// Remove last line if empty
// (happens when file ends with '\n')
if lines[len(lines)-1] == "" {
lines = lines[:len(lines)-1]
}
logging.LogInfo("Loaded %d blacklist URLs", len(lines))
return lines
}