Lots of features, first version that reliably crawls Geminispace.

- [x] Concurrent downloading with workers
- [x] Concurrent connection limit per host
- [x] URL Blacklist
- [x] Save image/* and text/* files
- [x] Configuration via environment variables
- [x] Storing snapshots in PostgreSQL
- [x] Proper response header & body UTF-8 and format validation
.

.

.
This commit is contained in:
2024-10-21 20:04:09 +03:00
parent 212345764b
commit cd60c1363b
37 changed files with 1231 additions and 323 deletions

33
gemini/processing.go Normal file
View File

@@ -0,0 +1,33 @@
package gemini
import (
"bytes"
"fmt"
"io"
"unicode/utf8"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/transform"
)
// EnsureValidUTF8 converts input to a valid UTF-8 string.
//
// NUL bytes (0x00) are always removed first. If the remaining bytes are
// already valid UTF-8 they are returned unchanged. Otherwise the bytes are
// decoded with each fallback encoding in order, and the first decoding that
// succeeds is returned. An error is returned only when every fallback
// encoding fails; it wraps the error of the last encoding tried.
func EnsureValidUTF8(input []byte) (string, error) {
	// Remove NULL byte 0x00
	inputNoNull := bytes.ReplaceAll(input, []byte{0}, nil)

	// Fast path: nothing to transcode.
	if utf8.Valid(inputNoNull) {
		return string(inputNoNull), nil
	}

	// Fallback decoders, tried in order. Decoders are stateful, so fresh
	// ones are created on every call.
	// NOTE(review): ISO 8859-1 appears to assign a code point to every byte
	// value, so later entries may rarely (or never) be reached — confirm the
	// intended priority order.
	encodings := []transform.Transformer{
		charmap.ISO8859_1.NewDecoder(),   // First try ISO8859-1
		charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc
		// TODO: Try more encodings?
	}

	var lastErr error
	for _, encoding := range encodings {
		reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
		result, err := io.ReadAll(reader)
		if err != nil {
			// This encoding could not decode the input; remember why and
			// fall through to the next candidate instead of giving up.
			lastErr = err
			continue
		}
		return string(result), nil
	}

	// Every candidate encoding failed.
	return "", fmt.Errorf("UTF-8 error: %w", lastErr)
}