Enhance crawler with seed list and SQL utilities

Add seedList module for URL initialization, comprehensive SQL utilities for database analysis, and update project configuration.
This commit is contained in:
antanst
2025-06-16 12:29:33 +03:00
parent 51f94c90b2
commit 330b596497
37 changed files with 742 additions and 682 deletions

View File

@@ -16,7 +16,7 @@ import (
commonUrl "gemini-grc/common/url"
"gemini-grc/config"
"gemini-grc/contextutil"
"gemini-grc/logging"
"git.antanst.com/antanst/logging"
"git.antanst.com/antanst/xerrors"
"github.com/guregu/null/v5"
_ "github.com/jackc/pgx/v5/stdlib" // PGX driver for PostgreSQL
@@ -32,7 +32,7 @@ type DbService interface {
// URL methods
InsertURL(ctx context.Context, tx *sqlx.Tx, url string) error
NormalizeURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error
CheckAndUpdateNormalizedURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error
DeleteURL(ctx context.Context, tx *sqlx.Tx, url string) error
MarkURLsAsBeingProcessed(ctx context.Context, tx *sqlx.Tx, urls []string) error
GetUrlHosts(ctx context.Context, tx *sqlx.Tx) ([]string, error)
@@ -117,10 +117,13 @@ func (d *DbServiceImpl) Initialize(ctx context.Context) error {
return nil
}
// Shutdown the database with context
func (d *DbServiceImpl) Shutdown(ctx context.Context) error {
dbCtx := contextutil.ContextWithComponent(ctx, "database")
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Shutting down database connection")
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Shutting down database connections")
_, err := d.db.Query("UPDATE urls SET being_processed=false")
if err != nil {
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Unable to update urls table: %v", err)
}
d.mu.Lock()
defer d.mu.Unlock()
@@ -129,12 +132,11 @@ func (d *DbServiceImpl) Shutdown(ctx context.Context) error {
return nil
}
// Check if the context is cancelled before proceeding
if err := ctx.Err(); err != nil {
return err
}
err := d.db.Close()
err = d.db.Close()
if err != nil {
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Error closing database connection: %v", err)
} else {
@@ -154,7 +156,6 @@ func (d *DbServiceImpl) NewTx(ctx context.Context) (*sqlx.Tx, error) {
return nil, err
}
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Creating new database transaction")
tx, err := d.db.BeginTxx(ctx, nil)
if err != nil {
contextlog.LogErrorWithContext(dbCtx, logging.GetSlogger(), "Failed to create transaction: %v", err)
@@ -199,7 +200,7 @@ func (d *DbServiceImpl) InsertURL(ctx context.Context, tx *sqlx.Tx, url string)
}
// NormalizeURL normalizes a URL with context
func (d *DbServiceImpl) NormalizeURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error {
func (d *DbServiceImpl) CheckAndUpdateNormalizedURL(ctx context.Context, tx *sqlx.Tx, url string, normalizedURL string) error {
dbCtx := contextutil.ContextWithComponent(ctx, "database")
// Check if URLs are already the same
@@ -214,7 +215,6 @@ func (d *DbServiceImpl) NormalizeURL(ctx context.Context, tx *sqlx.Tx, url strin
contextlog.LogDebugWithContext(dbCtx, logging.GetSlogger(), "Updating normalized URL %s -> %s", url, normalizedURL)
// Context-aware implementation
query := SQL_UPDATE_URL
a := struct {
Url string `db:"Url"`
@@ -514,12 +514,13 @@ func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *
return false, err
}
// Context-aware implementation
// Update: Skipped this because empty pages can be valid
// ex. pages with redirect headers
// Only check for identical content if we have gemtext or data
if (!s.GemText.Valid || s.GemText.String == "") &&
(!s.Data.Valid || len(s.Data.V) == 0) {
return false, nil
}
//if (!s.GemText.Valid || s.GemText.String == "") &&
// (!s.Data.Valid || len(s.Data.V) == 0) {
// return false, nil
//}
// Try to get the latest snapshot for this URL
latestSnapshot := &snapshot.Snapshot{}
@@ -546,3 +547,20 @@ func (d *DbServiceImpl) IsContentIdentical(ctx context.Context, tx *sqlx.Tx, s *
return false, nil
}
// SafeRollback attempts to roll back a transaction,
// handling the case if the tx was already finalized.
func SafeRollback(ctx context.Context, tx *sqlx.Tx) error {
rollbackErr := tx.Rollback()
if rollbackErr != nil {
// Check if it's the standard "transaction already finalized" error
if errors.Is(rollbackErr, sql.ErrTxDone) {
contextlog.LogWarnWithContext(ctx, logging.GetSlogger(), "Rollback failed because transaction is already finalized")
return nil
}
// Only return error for other types of rollback failures
contextlog.LogErrorWithContext(ctx, logging.GetSlogger(), "Failed to rollback transaction: %v", rollbackErr)
return xerrors.NewError(fmt.Errorf("failed to rollback transaction: %w", rollbackErr), 0, "", true)
}
return nil
}