Enhance crawler with seed list and SQL utilities
Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
This commit is contained in:
@@ -7,8 +7,8 @@ import (
|
||||
|
||||
"gemini-grc/common/linkList"
|
||||
url2 "gemini-grc/common/url"
|
||||
"gemini-grc/logging"
|
||||
"gemini-grc/util"
|
||||
"git.antanst.com/antanst/logging"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
)
|
||||
|
||||
|
||||
@@ -1,23 +1,214 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
stdurl "net/url"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
commonErrors "gemini-grc/common/errors"
|
||||
"gemini-grc/common/contextlog"
|
||||
"gemini-grc/common/snapshot"
|
||||
_url "gemini-grc/common/url"
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/contextutil"
|
||||
"git.antanst.com/antanst/logging"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
// ProcessData processes the raw data from a Gemini response and populates the Snapshot.
|
||||
// Visit visits a given URL using the Gemini protocol,
|
||||
// and returns a populated snapshot. Any relevant errors
|
||||
// when visiting the URL are stored in the snapshot;
|
||||
// an error is returned only when construction of a
|
||||
// snapshot was not possible (context cancellation errors,
|
||||
// not a valid URL etc.)
|
||||
func Visit(ctx context.Context, url string) (s *snapshot.Snapshot, err error) {
|
||||
geminiCtx := contextutil.ContextWithComponent(ctx, "network")
|
||||
|
||||
s, err = snapshot.SnapshotFromURL(url, true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Check if the context has been canceled
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
data, err := ConnectAndGetData(geminiCtx, s.URL.String())
|
||||
if err != nil {
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Check if the context has been canceled
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
s = UpdateSnapshotWithData(*s, data)
|
||||
|
||||
if !s.Error.Valid &&
|
||||
s.MimeType.Valid &&
|
||||
s.MimeType.String == "text/gemini" &&
|
||||
len(s.GemText.ValueOrZero()) > 0 {
|
||||
links := GetPageLinks(s.URL, s.GemText.String)
|
||||
if len(links) > 0 {
|
||||
s.Links = null.ValueFrom(links)
|
||||
}
|
||||
}
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// ConnectAndGetData is a context-aware version of ConnectAndGetData
|
||||
// that returns the data from a GET request to a Gemini URL. It uses the context
|
||||
// for cancellation, timeout, and logging.
|
||||
func ConnectAndGetData(ctx context.Context, url string) ([]byte, error) {
|
||||
parsedURL, err := stdurl.Parse(url)
|
||||
if err != nil {
|
||||
return nil, xerrors.NewSimpleError(fmt.Errorf("error parsing URL: %w", err))
|
||||
}
|
||||
|
||||
hostname := parsedURL.Hostname()
|
||||
port := parsedURL.Port()
|
||||
if port == "" {
|
||||
port = "1965"
|
||||
}
|
||||
host := fmt.Sprintf("%s:%s", hostname, port)
|
||||
|
||||
// Check if the context has been canceled before proceeding
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
|
||||
|
||||
// Establish the underlying TCP connection with context-based cancellation
|
||||
dialer := &net.Dialer{
|
||||
Timeout: timeoutDuration,
|
||||
}
|
||||
|
||||
conn, err := dialer.DialContext(ctx, "tcp", host)
|
||||
if err != nil {
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to establish TCP connection: %v", err)
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
// Make sure we always close the connection
|
||||
defer func() {
|
||||
_ = conn.Close()
|
||||
}()
|
||||
|
||||
err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
// Check if the context has been canceled before proceeding with TLS handshake
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Perform the TLS handshake
|
||||
tlsConfig := &tls.Config{
|
||||
InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
|
||||
ServerName: parsedURL.Hostname(), // SNI says we should not include port in hostname
|
||||
}
|
||||
|
||||
tlsConn := tls.Client(conn, tlsConfig)
|
||||
err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
// Check if the context is done before attempting handshake
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Perform TLS handshake with regular method
|
||||
// (HandshakeContext is only available in Go 1.17+)
|
||||
err = tlsConn.Handshake()
|
||||
if err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
// Check again if the context is done after handshake
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
// We read `buf`-sized chunks and add data to `data`
|
||||
buf := make([]byte, 4096)
|
||||
var data []byte
|
||||
|
||||
// Check if the context has been canceled before sending request
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
// Send Gemini request to trigger server response
|
||||
// Fix for stupid server bug:
|
||||
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
|
||||
// when the port is 1965 and is still specified explicitly in the URL.
|
||||
url2, _ := _url.ParseURL(url, "", true)
|
||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort())))
|
||||
if err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
// Read response bytes in len(buf) byte chunks
|
||||
for {
|
||||
// Check if the context has been canceled before each read
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
|
||||
n, err := tlsConn.Read(buf)
|
||||
if n > 0 {
|
||||
data = append(data, buf[:n]...)
|
||||
}
|
||||
if len(data) > config.CONFIG.MaxResponseSize {
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Response too large (max: %d bytes)", config.CONFIG.MaxResponseSize)
|
||||
return nil, xerrors.NewSimpleError(fmt.Errorf("response too large"))
|
||||
}
|
||||
if err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Error reading data: %v", err)
|
||||
return nil, xerrors.NewSimpleError(err)
|
||||
}
|
||||
}
|
||||
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Received %d bytes of data", len(data))
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// UpdateSnapshotWithData processes the raw data from a Gemini response and populates the Snapshot.
|
||||
// This function is exported for use by the robotsMatch package.
|
||||
func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
|
||||
func UpdateSnapshotWithData(s snapshot.Snapshot, data []byte) *snapshot.Snapshot {
|
||||
header, body, err := getHeadersAndData(data)
|
||||
if err != nil {
|
||||
return &s, err
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
return &s
|
||||
}
|
||||
code, mimeType, lang := getMimeTypeAndLang(header)
|
||||
|
||||
@@ -39,13 +230,14 @@ func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
|
||||
if mimeType == "text/gemini" {
|
||||
validBody, err := BytesToValidUTF8(body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
return &s
|
||||
}
|
||||
s.GemText = null.StringFrom(validBody)
|
||||
} else {
|
||||
s.Data = null.ValueFrom(body)
|
||||
}
|
||||
return &s, nil
|
||||
return &s
|
||||
}
|
||||
|
||||
// Checks for a Gemini header, which is
|
||||
@@ -55,7 +247,7 @@ func ProcessData(s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
|
||||
func getHeadersAndData(data []byte) (string, []byte, error) {
|
||||
firstLineEnds := slices.Index(data, '\n')
|
||||
if firstLineEnds == -1 {
|
||||
return "", nil, commonErrors.NewHostError(fmt.Errorf("error parsing header"))
|
||||
return "", nil, xerrors.NewSimpleError(fmt.Errorf("error parsing header"))
|
||||
}
|
||||
firstLine := string(data[:firstLineEnds])
|
||||
rest := data[firstLineEnds+1:]
|
||||
@@ -106,7 +298,3 @@ func getMimeTypeAndLang(headers string) (int, string, string) {
|
||||
lang := matches[3] // Will be empty string if no lang parameter was found
|
||||
return code, mimeType, lang
|
||||
}
|
||||
|
||||
func isGeminiCapsule(s *snapshot.Snapshot) bool {
|
||||
return !s.Error.Valid && s.MimeType.Valid && s.MimeType.String == "text/gemini"
|
||||
}
|
||||
|
||||
@@ -1,276 +0,0 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
stdurl "net/url"
|
||||
"time"
|
||||
|
||||
"gemini-grc/common/contextlog"
|
||||
commonErrors "gemini-grc/common/errors"
|
||||
"gemini-grc/common/snapshot"
|
||||
_url "gemini-grc/common/url"
|
||||
"gemini-grc/config"
|
||||
"gemini-grc/contextutil"
|
||||
"gemini-grc/logging"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
"github.com/guregu/null/v5"
|
||||
)
|
||||
|
||||
// Visit visits a given URL using the Gemini protocol.
|
||||
func Visit(ctx context.Context, url string) (s *snapshot.Snapshot, err error) {
|
||||
geminiCtx := contextutil.ContextWithComponent(ctx, "gemini")
|
||||
|
||||
contextlog.LogDebugWithContext(geminiCtx, logging.GetSlogger(), "Visiting Gemini URL: %s", url)
|
||||
|
||||
s, err = snapshot.SnapshotFromURL(url, true)
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(geminiCtx, logging.GetSlogger(), "Failed to create snapshot from URL: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
// GeminiError and HostError should
|
||||
// be stored in the snapshot.
|
||||
if commonErrors.IsHostError(err) {
|
||||
contextlog.LogInfoWithContext(geminiCtx, logging.GetSlogger(), "Host error: %v", err)
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
err = nil
|
||||
return
|
||||
} else if IsGeminiError(err) {
|
||||
contextlog.LogInfoWithContext(geminiCtx, logging.GetSlogger(), "Gemini error: %v", err)
|
||||
s.Error = null.StringFrom(err.Error())
|
||||
s.Header = null.StringFrom(errors.Unwrap(err).(*GeminiError).Header)
|
||||
s.ResponseCode = null.IntFrom(int64(errors.Unwrap(err).(*GeminiError).Code))
|
||||
err = nil
|
||||
return
|
||||
}
|
||||
}()
|
||||
|
||||
// Check if the context has been canceled
|
||||
if err := ctx.Err(); err != nil {
|
||||
return s, err
|
||||
}
|
||||
|
||||
data, err := ConnectAndGetDataWithContext(geminiCtx, s.URL.String())
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
|
||||
// Check if the context has been canceled
|
||||
if err := ctx.Err(); err != nil {
|
||||
return s, err
|
||||
}
|
||||
|
||||
s, err = ProcessData(*s, data)
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
|
||||
if isGeminiCapsule(s) {
|
||||
links := GetPageLinks(s.URL, s.GemText.String)
|
||||
if len(links) > 0 {
|
||||
s.Links = null.ValueFrom(links)
|
||||
}
|
||||
}
|
||||
|
||||
contextlog.LogDebugWithContext(geminiCtx, logging.GetSlogger(), "Successfully visited URL: %s (Code: %d)", url, s.ResponseCode.ValueOrZero())
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// ConnectAndGetDataWithContext is a context-aware version of ConnectAndGetData
|
||||
// that returns the data from a GET request to a Gemini URL. It uses the context
|
||||
// for cancellation, timeout, and logging.
|
||||
func ConnectAndGetDataWithContext(ctx context.Context, url string) ([]byte, error) {
|
||||
// Parse the URL
|
||||
parsedURL, err := stdurl.Parse(url)
|
||||
if err != nil {
|
||||
return nil, xerrors.NewError(fmt.Errorf("error parsing URL: %w", err), 0, "", false)
|
||||
}
|
||||
|
||||
hostname := parsedURL.Hostname()
|
||||
port := parsedURL.Port()
|
||||
if port == "" {
|
||||
port = "1965"
|
||||
}
|
||||
host := fmt.Sprintf("%s:%s", hostname, port)
|
||||
|
||||
// Check if the context has been canceled before proceeding
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Connecting to %s", host)
|
||||
|
||||
timeoutDuration := time.Duration(config.CONFIG.ResponseTimeout) * time.Second
|
||||
|
||||
// Establish the underlying TCP connection with context-based cancellation
|
||||
dialer := &net.Dialer{
|
||||
Timeout: timeoutDuration,
|
||||
}
|
||||
|
||||
// Use DialContext to allow cancellation via context
|
||||
conn, err := dialer.DialContext(ctx, "tcp", host)
|
||||
if err != nil {
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Failed to establish TCP connection: %v", err)
|
||||
return nil, commonErrors.NewHostError(err)
|
||||
}
|
||||
|
||||
// Make sure we always close the connection
|
||||
defer func() {
|
||||
_ = conn.Close()
|
||||
}()
|
||||
|
||||
// Set read and write timeouts on the TCP connection
|
||||
err = conn.SetReadDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, commonErrors.NewHostError(err)
|
||||
}
|
||||
err = conn.SetWriteDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, commonErrors.NewHostError(err)
|
||||
}
|
||||
|
||||
// Check if the context has been canceled before proceeding with TLS handshake
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Perform the TLS handshake
|
||||
tlsConfig := &tls.Config{
|
||||
InsecureSkipVerify: true, //nolint:gosec // Accept all TLS certs, even if insecure.
|
||||
ServerName: parsedURL.Hostname(), // SNI says we should not include port in hostname
|
||||
}
|
||||
|
||||
tlsConn := tls.Client(conn, tlsConfig)
|
||||
err = tlsConn.SetReadDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, commonErrors.NewHostError(err)
|
||||
}
|
||||
err = tlsConn.SetWriteDeadline(time.Now().Add(timeoutDuration))
|
||||
if err != nil {
|
||||
return nil, commonErrors.NewHostError(err)
|
||||
}
|
||||
|
||||
// Check if the context is done before attempting handshake
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Perform TLS handshake with regular method
|
||||
// (HandshakeContext is only available in Go 1.17+)
|
||||
err = tlsConn.Handshake()
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "TLS handshake failed: %v", err)
|
||||
return nil, commonErrors.NewHostError(err)
|
||||
}
|
||||
|
||||
// Check again if the context is done after handshake
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// We read `buf`-sized chunks and add data to `data`
|
||||
buf := make([]byte, 4096)
|
||||
var data []byte
|
||||
|
||||
// Check if the context has been canceled before sending request
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Send Gemini request to trigger server response
|
||||
// Fix for stupid server bug:
|
||||
// Some servers return 'Header: 53 No proxying to other hosts or ports!'
|
||||
// when the port is 1965 and is still specified explicitly in the URL.
|
||||
url2, _ := _url.ParseURL(url, "", true)
|
||||
_, err = tlsConn.Write([]byte(fmt.Sprintf("%s\r\n", url2.StringNoDefaultPort())))
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to send request: %v", err)
|
||||
return nil, commonErrors.NewHostError(err)
|
||||
}
|
||||
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Request sent, reading response")
|
||||
|
||||
// Read response bytes in len(buf) byte chunks
|
||||
for {
|
||||
// Check if the context has been canceled before each read
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
n, err := tlsConn.Read(buf)
|
||||
if n > 0 {
|
||||
data = append(data, buf[:n]...)
|
||||
}
|
||||
if len(data) > config.CONFIG.MaxResponseSize {
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Response too large (max: %d bytes)", config.CONFIG.MaxResponseSize)
|
||||
return nil, commonErrors.NewHostError(fmt.Errorf("response too large"))
|
||||
}
|
||||
if err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Error reading data: %v", err)
|
||||
return nil, commonErrors.NewHostError(err)
|
||||
}
|
||||
}
|
||||
|
||||
contextlog.LogDebugWithContext(ctx, logging.GetSlogger(), "Received %d bytes of data", len(data))
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// ProcessDataWithContext is a context-aware version of ProcessData that processes
|
||||
// the raw data from a Gemini response and populates the Snapshot.
|
||||
func ProcessDataWithContext(ctx context.Context, s snapshot.Snapshot, data []byte) (*snapshot.Snapshot, error) {
|
||||
// Create a processing-specific context with the "process" component
|
||||
processCtx := contextutil.ContextWithComponent(ctx, "process")
|
||||
|
||||
contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Processing Gemini response data (%d bytes)", len(data))
|
||||
|
||||
header, body, err := getHeadersAndData(data)
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(processCtx, logging.GetSlogger(), "Failed to extract headers: %v", err)
|
||||
return &s, err
|
||||
}
|
||||
|
||||
code, mimeType, lang := getMimeTypeAndLang(header)
|
||||
contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Response code: %d, MimeType: %s, Lang: %s", code, mimeType, lang)
|
||||
|
||||
if code != 0 {
|
||||
s.ResponseCode = null.IntFrom(int64(code))
|
||||
}
|
||||
if header != "" {
|
||||
s.Header = null.StringFrom(header)
|
||||
}
|
||||
if mimeType != "" {
|
||||
s.MimeType = null.StringFrom(mimeType)
|
||||
}
|
||||
if lang != "" {
|
||||
s.Lang = null.StringFrom(lang)
|
||||
}
|
||||
|
||||
// If we've got a Gemini document, populate
|
||||
// `GemText` field, otherwise raw data goes to `Data`.
|
||||
if mimeType == "text/gemini" {
|
||||
validBody, err := BytesToValidUTF8(body)
|
||||
if err != nil {
|
||||
contextlog.LogErrorWithContext(processCtx, logging.GetSlogger(), "UTF-8 validation failed: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
s.GemText = null.StringFrom(validBody)
|
||||
contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Processed gemtext content (%d characters)", len(validBody))
|
||||
} else {
|
||||
s.Data = null.ValueFrom(body)
|
||||
contextlog.LogDebugWithContext(processCtx, logging.GetSlogger(), "Stored binary data (%d bytes)", len(body))
|
||||
}
|
||||
|
||||
return &s, nil
|
||||
}
|
||||
@@ -135,17 +135,7 @@ func TestProcessData(t *testing.T) {
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
s := snapshot.Snapshot{}
|
||||
result, err := ProcessData(s, test.inputData)
|
||||
|
||||
if test.expectedError && err == nil {
|
||||
t.Errorf("Expected error, got nil")
|
||||
return
|
||||
}
|
||||
|
||||
if !test.expectedError && err != nil {
|
||||
t.Errorf("Unexpected error: %v", err)
|
||||
return
|
||||
}
|
||||
result := UpdateSnapshotWithData(s, test.inputData)
|
||||
|
||||
if test.expectedError {
|
||||
return
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
|
||||
"gemini-grc/config"
|
||||
"git.antanst.com/antanst/xerrors"
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
"golang.org/x/text/encoding/japanese"
|
||||
@@ -23,11 +24,16 @@ func BytesToValidUTF8(input []byte) (string, error) {
|
||||
if len(input) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
const maxSize = 10 * 1024 * 1024 // 10MB
|
||||
if len(input) > maxSize {
|
||||
return "", xerrors.NewError(fmt.Errorf("%w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize), 0, "", false)
|
||||
|
||||
maxSize := config.CONFIG.MaxResponseSize
|
||||
if maxSize == 0 {
|
||||
maxSize = 1024 * 1024 // Default 1MB for tests
|
||||
}
|
||||
// remove NULL byte 0x00 (ReplaceAll accepts slices)
|
||||
if len(input) > maxSize {
|
||||
return "", xerrors.NewError(fmt.Errorf("BytesToValidUTF8: %w: %d bytes (max %d)", ErrInputTooLarge, len(input), maxSize), 0, "", false)
|
||||
}
|
||||
|
||||
// Always remove NULL bytes first (before UTF-8 validity check)
|
||||
inputNoNull := bytes.ReplaceAll(input, []byte{byte(0)}, []byte{})
|
||||
if utf8.Valid(inputNoNull) {
|
||||
return string(inputNoNull), nil
|
||||
@@ -42,6 +48,8 @@ func BytesToValidUTF8(input []byte) (string, error) {
|
||||
japanese.EUCJP.NewDecoder(), // Japanese
|
||||
korean.EUCKR.NewDecoder(), // Korean
|
||||
}
|
||||
|
||||
// Still invalid Unicode. Try some encodings to convert to.
|
||||
// First successful conversion wins.
|
||||
var lastErr error
|
||||
for _, encoding := range encodings {
|
||||
@@ -56,5 +64,5 @@ func BytesToValidUTF8(input []byte) (string, error) {
|
||||
}
|
||||
}
|
||||
|
||||
return "", xerrors.NewError(fmt.Errorf("%w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr), 0, "", false)
|
||||
return "", xerrors.NewError(fmt.Errorf("BytesToValidUTF8: %w (tried %d encodings): %w", ErrUTF8Conversion, len(encodings), lastErr), 0, "", false)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user