Refine content deduplication and improve configuration
This commit is contained in:
@@ -195,16 +195,15 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
|
||||
}
|
||||
}
|
||||
|
||||
// Check if content is identical to previous snapshot and we should skip further processing
|
||||
if config.CONFIG.SkipIdenticalContent {
|
||||
identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if identical {
|
||||
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, skipping")
|
||||
return removeURL(ctx, tx, s.URL.String())
|
||||
}
|
||||
// Check if we should skip a potentially
|
||||
// identical snapshot
|
||||
skipIdentical, err := shouldSkipIdenticalSnapshot(ctx, tx, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if skipIdentical {
|
||||
contextlog.LogInfoWithContext(ctx, logging.GetSlogger(), "Content identical to existing snapshot, skipping")
|
||||
return removeURL(ctx, tx, s.URL.String())
|
||||
}
|
||||
|
||||
// Process and store links since content has changed
|
||||
@@ -225,6 +224,32 @@ func WorkOnUrl(ctx context.Context, tx *sqlx.Tx, url string) (err error) {
|
||||
return saveSnapshotAndRemoveURL(ctx, tx, s)
|
||||
}
|
||||
|
||||
func shouldSkipIdenticalSnapshot(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) (bool, error) {
|
||||
// Check if content is identical to previous snapshot and we should skip further processing
|
||||
if config.CONFIG.SkipIdenticalContent {
|
||||
identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if identical {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
// We write every Gemini capsule, but still
|
||||
// skip identical pages that aren't capsules.
|
||||
if s.MimeType.String != "text/gemini" {
|
||||
identical, err := gemdb.Database.IsContentIdentical(ctx, tx, s)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if identical {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// storeLinks checks and stores the snapshot links in the database.
|
||||
func storeLinks(ctx context.Context, tx *sqlx.Tx, s *snapshot.Snapshot) error {
|
||||
if s.Links.Valid { //nolint:nestif
|
||||
|
||||
Reference in New Issue
Block a user