Better Unicode conversion
This commit is contained in:
@@ -10,25 +10,25 @@ import (
|
|||||||
"golang.org/x/text/transform"
|
"golang.org/x/text/transform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func EnsureValidUTF8(input []byte) (string, error) {
|
func BytesToValidUTF8(input []byte) (string, error) {
|
||||||
// Remove NULL byte 0x00
|
// Remove NULL byte 0x00 (ReplaceAll accepts slices)
|
||||||
inputNoNull := bytes.ReplaceAll(input, []byte{0}, nil)
|
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
|
||||||
isValidUTF8 := utf8.Valid(inputNoNull)
|
isValidUTF8 := utf8.Valid(inputNoNull)
|
||||||
if !isValidUTF8 {
|
if isValidUTF8 {
|
||||||
|
return string(inputNoNull), nil
|
||||||
|
}
|
||||||
encodings := []transform.Transformer{
|
encodings := []transform.Transformer{
|
||||||
charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1
|
charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1
|
||||||
charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc
|
charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc
|
||||||
// TODO: Try more encodings?
|
// TODO: Try more encodings?
|
||||||
}
|
}
|
||||||
// First successful conversion wins
|
// First successful conversion wins.
|
||||||
for _, encoding := range encodings {
|
for _, encoding := range encodings {
|
||||||
reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
|
reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
|
||||||
result, err := io.ReadAll(reader)
|
result, err := io.ReadAll(reader)
|
||||||
if err != nil {
|
if err == nil {
|
||||||
return "", fmt.Errorf("UTF-8 error: %w", err)
|
|
||||||
}
|
|
||||||
return string(result), nil
|
return string(result), nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return string(inputNoNull), nil
|
return "", fmt.Errorf("UTF-8 error: %w", err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import "testing"
|
|||||||
func TestEnsureValidUTF8(t *testing.T) {
|
func TestEnsureValidUTF8(t *testing.T) {
|
||||||
// Create a string with a null byte
|
// Create a string with a null byte
|
||||||
strWithNull := "Hello" + string('\x00') + "world"
|
strWithNull := "Hello" + string('\x00') + "world"
|
||||||
result, _ := EnsureValidUTF8([]byte(strWithNull))
|
result, _ := BytesToValidUTF8([]byte(strWithNull))
|
||||||
if result != "Helloworld" {
|
if result != "Helloworld" {
|
||||||
t.Errorf("Expected string without NULL byte, got %s", result)
|
t.Errorf("Expected string without NULL byte, got %s", result)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user