Files
gemini-grc/gemini/processing.go

61 lines
1.6 KiB
Go

package gemini
import (
"bytes"
"errors"
"fmt"
"io"
"unicode/utf8"
"github.com/antanst/go_errors"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/transform"
)
// Sentinel errors used by BytesToValidUTF8. They are embedded with %w in the
// errors that function builds.
var (
// ErrInputTooLarge is used when the input exceeds the maximum accepted size.
ErrInputTooLarge = errors.New("input too large")
// ErrUTF8Conversion is used when no conversion produced valid UTF-8.
ErrUTF8Conversion = errors.New("UTF-8 conversion error")
)
// BytesToValidUTF8 returns input as a valid UTF-8 string.
//
// NUL bytes (0x00) are removed first. If the remaining bytes are already valid
// UTF-8 they are returned unchanged; otherwise a fixed list of legacy decoders
// is tried in order and the first one whose output is valid UTF-8 wins.
//
// Empty input yields ("", nil). Input larger than 10MB yields an error built
// around ErrInputTooLarge. If no decoder produces valid UTF-8 the error is
// built around ErrUTF8Conversion and, when one is available, the last decoder
// error. Both errors are passed through go_errors.NewError before returning.
func BytesToValidUTF8(input []byte) (string, error) {
	if len(input) == 0 {
		return "", nil
	}

	const maxSize = 10 * 1024 * 1024 // 10MB
	if len(input) > maxSize {
		return "", go_errors.NewError(fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize))
	}

	// Strip NUL bytes. Only pay for the copy when there is actually a NUL to
	// remove; the common case is input without any.
	clean := input
	if bytes.IndexByte(input, 0) >= 0 {
		clean = bytes.ReplaceAll(input, []byte{0}, nil)
	}
	if utf8.Valid(clean) {
		return string(clean), nil
	}

	// NOTE(review): ISO-8859-1 assigns a rune to every byte value, so the first
	// decoder looks like it always succeeds and the remaining entries would
	// never be reached. Confirm the ordering is intended before relying on the
	// later decoders.
	decoders := []transform.Transformer{
		charmap.ISO8859_1.NewDecoder(),
		charmap.ISO8859_7.NewDecoder(),
		charmap.Windows1250.NewDecoder(), // Central European
		charmap.Windows1251.NewDecoder(), // Cyrillic
		charmap.Windows1252.NewDecoder(),
		charmap.Windows1256.NewDecoder(), // Arabic
		japanese.EUCJP.NewDecoder(),      // Japanese
		korean.EUCKR.NewDecoder(),        // Korean
	}

	// First successful conversion wins.
	var lastErr error
	for _, dec := range decoders {
		decoded, err := io.ReadAll(transform.NewReader(bytes.NewReader(clean), dec))
		if err != nil {
			lastErr = err
			continue
		}
		if utf8.Valid(decoded) {
			return string(decoded), nil
		}
	}

	// Every decoder may have run without error and still not produced valid
	// UTF-8; passing a nil error to %w would render as "%!w(<nil>)", so omit
	// the cause in that case.
	if lastErr == nil {
		return "", go_errors.NewError(fmt.Errorf("%w (tried %d encodings)", ErrUTF8Conversion, len(decoders)))
	}
	return "", go_errors.NewError(fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(decoders), lastErr))
}