Refine content deduplication and improve configuration

This commit is contained in:
antanst
2025-06-16 17:09:26 +03:00
parent 330b596497
commit f9024d15aa
3 changed files with 87 additions and 28 deletions

View File

@@ -195,16 +195,15 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
}
}
// Check if content is identical to previous snapshot and we should skip further processing
if config.CONFIG.SkipIdenticalContent {
identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s)
if err != nil {
return err
}
if identical {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, skipping")
return removeURL(ctx, tx, s.URL.String())
}
// Check if we should skip a potentially
// identical snapshot
skipIdentical, err := shouldSkipIdenticalSnapshot(ctx, tx, s)
if err != nil {
return err
}
if skipIdentical {
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, skipping")
return removeURL(ctx, tx, s.URL.String())
}
// Process and store links since content has changed
@@ -225,6 +224,32 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
return saveSnapshotAndRemoveURL(ctx, tx, s)
}
// shouldSkipIdenticalSnapshot reports whether s should be skipped because
// gemdb.Database.IsContentIdentical considers its content identical to an
// already stored snapshot.
//
// The identity check runs when config.CONFIG.SkipIdenticalContent is set, or
// when the snapshot's MIME type is anything other than "text/gemini". A
// "text/gemini" snapshot is therefore never skipped unless the config flag is
// enabled.
//
// It returns (false, err) when the identity check itself fails.
func shouldSkipIdenticalSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) {
	// text/gemini content is always written unless deduplication has been
	// explicitly enabled; every other MIME type is always deduplicated.
	if !config.CONFIG.SkipIdenticalContent && s.MimeType.String == "text/gemini" {
		return false, nil
	}

	// Both conditions lead to the same check, so query the database once
	// instead of once per condition.
	identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s)
	if err != nil {
		return false, err
	}
	return identical, nil
}
// storeLinks checks and stores the snapshot links in the database.
func storeLinks(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
if s.Links.Valid { //nolint:nestif