Better Unicode conversion

This commit is contained in:
2024-11-05 12:39:14 +02:00
parent a0563074ed
commit d5da9ac62d
2 changed files with 18 additions and 18 deletions

View File

@@ -10,25 +10,25 @@ import (
"golang.org/x/text/transform" "golang.org/x/text/transform"
) )
func EnsureValidUTF8(input []byte) (string, error) { func BytesToValidUTF8(input []byte) (string, error) {
// Remove NULL byte 0x00 // Remove NULL byte 0x00 (ReplaceAll accepts slices)
inputNoNull := bytes.ReplaceAll(input, []byte{0}, nil) inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
isValidUTF8 := utf8.Valid(inputNoNull) isValidUTF8 := utf8.Valid(inputNoNull)
if !isValidUTF8 { if isValidUTF8 {
return string(inputNoNull), nil
}
encodings := []transform.Transformer{ encodings := []transform.Transformer{
charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1 charmap.ISO8859_1.NewDecoder(), // First try ISO8859-1
charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc charmap.Windows1252.NewDecoder(), // Then try Windows-1252, etc
// TODO: Try more encodings? // TODO: Try more encodings?
} }
// First successful conversion wins // First successful conversion wins.
for _, encoding := range encodings { for _, encoding := range encodings {
reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding) reader := transform.NewReader(bytes.NewReader(inputNoNull), encoding)
result, err := io.ReadAll(reader) result, err := io.ReadAll(reader)
if err != nil { if err == nil {
return "", fmt.Errorf("UTF-8 error: %w", err)
}
return string(result), nil return string(result), nil
} }
} }
return string(inputNoNull), nil return "", fmt.Errorf("UTF-8 error: %w", err)
} }

View File

@@ -6,7 +6,7 @@ import "testing"
func TestEnsureValidUTF8(t *testing.T) { func TestEnsureValidUTF8(t *testing.T) {
// Create a string with a null byte // Create a string with a null byte
strWithNull := "Hello" + string('\x00') + "world" strWithNull := "Hello" + string('\x00') + "world"
result, _ := EnsureValidUTF8([]byte(strWithNull)) result, _ := BytesToValidUTF8([]byte(strWithNull))
if result != "Helloworld" { if result != "Helloworld" {
t.Errorf("Expected string without NULL byte, got %s", result) t.Errorf("Expected string without NULL byte, got %s", result)
} }