From 62f0bbb59820c565aabb88c026552bde64c07db0 Mon Sep 17 00:00:00 2001 From: Nick Craig-Wood Date: Tue, 16 Jun 2020 12:39:26 +0100 Subject: [PATCH] dedupe: Make it obey the --size-only flag for duplicate detection #4321 --- cmd/dedupe/dedupe.go | 49 +++++++++++++++++++++++------------- fs/operations/dedupe.go | 38 ++++++++++++++++++---------- fs/operations/dedupe_test.go | 21 ++++++++++++++++ 3 files changed, 77 insertions(+), 31 deletions(-) diff --git a/cmd/dedupe/dedupe.go b/cmd/dedupe/dedupe.go index 68b52fdeb..1a42e4905 100644 --- a/cmd/dedupe/dedupe.go +++ b/cmd/dedupe/dedupe.go @@ -22,19 +22,32 @@ func init() { var commandDefinition = &cobra.Command{ Use: "dedupe [mode] remote:path", - Short: `Interactively find duplicate files and delete/rename them.`, + Short: `Interactively find duplicate filenames and delete/rename them.`, Long: ` -By default ` + "`" + `dedupe` + "`" + ` interactively finds duplicate files and offers to -delete all but one or rename them to be different. Only useful with -Google Drive which can have duplicate file names. + +By default ` + "`dedupe`" + ` interactively finds files with duplicate +names and offers to delete all but one or rename them to be +different. + +This is only useful with backends like Google Drive which can have +duplicate file names. It can be run on wrapping backends (eg crypt) if +they wrap a backend which supports duplicate file names. In the first pass it will merge directories with the same name. It -will do this iteratively until all the identical directories have been -merged. +will do this iteratively until all the identically named directories +have been merged. -The ` + "`" + `dedupe` + "`" + ` command will delete all but one of any identical (same -md5sum) files it finds without confirmation. This means that for most -duplicated files the ` + "`" + `dedupe` + "`" + ` command will not be interactive. 
+In the second pass, for every group of duplicate file names, it will +delete all but one of any identical files it finds without confirmation. +This means that for most duplicated files the ` + "`dedupe`" + ` +command will not be interactive. + +` + "`dedupe`" + ` considers files to be identical if they have the +same hash. If the backend does not support hashes (eg crypt wrapping +Google Drive) then they will never be found to be identical. If you +use the ` + "`--size-only`" + ` flag then files will be considered +identical if they have the same size (any hash will be ignored). This +can be useful on crypt backends which do not support hashes. **Important**: Since this can cause data loss, test first with the ` + "`--dry-run` or the `--interactive`/`-i`" + ` flag. @@ -52,26 +65,26 @@ Before - with duplicates 1744073 2016-03-05 16:22:38.104000000 two.txt 564374 2016-03-05 16:22:52.118000000 two.txt -Now the ` + "`" + `dedupe` + "`" + ` session +Now the ` + "`dedupe`" + ` session $ rclone dedupe drive:dupes 2016/03/05 16:24:37 Google drive root 'dupes': Looking for duplicates using interactive mode. 
- one.txt: Found 4 duplicates - deleting identical copies - one.txt: Deleting 2/3 identical duplicates (md5sum "1eedaa9fe86fd4b8632e2ac549403b36") + one.txt: Found 4 files with duplicate names + one.txt: Deleting 2/3 identical duplicates (MD5 "1eedaa9fe86fd4b8632e2ac549403b36") one.txt: 2 duplicates remain - 1: 6048320 bytes, 2016-03-05 16:23:16.798000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36 - 2: 564374 bytes, 2016-03-05 16:23:06.731000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81 + 1: 6048320 bytes, 2016-03-05 16:23:16.798000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36 + 2: 564374 bytes, 2016-03-05 16:23:06.731000000, MD5 7594e7dc9fc28f727c42ee3e0749de81 s) Skip and do nothing k) Keep just one (choose which in next step) r) Rename all to be different (by changing file.jpg to file-1.jpg) s/k/r> k Enter the number of the file to keep> 1 one.txt: Deleted 1 extra copies - two.txt: Found 3 duplicates - deleting identical copies + two.txt: Found 3 files with duplicate names two.txt: 3 duplicates remain - 1: 564374 bytes, 2016-03-05 16:22:52.118000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81 - 2: 6048320 bytes, 2016-03-05 16:22:46.185000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36 - 3: 1744073 bytes, 2016-03-05 16:22:38.104000000, md5sum 851957f7fb6f0bc4ce76be966d336802 + 1: 564374 bytes, 2016-03-05 16:22:52.118000000, MD5 7594e7dc9fc28f727c42ee3e0749de81 + 2: 6048320 bytes, 2016-03-05 16:22:46.185000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36 + 3: 1744073 bytes, 2016-03-05 16:22:38.104000000, MD5 851957f7fb6f0bc4ce76be966d336802 s) Skip and do nothing k) Keep just one (choose which in next step) r) Rename all to be different (by changing file.jpg to file-1.jpg) diff --git a/fs/operations/dedupe.go b/fs/operations/dedupe.go index 799a4fac9..0c7c1e2af 100644 --- a/fs/operations/dedupe.go +++ b/fs/operations/dedupe.go @@ -101,22 +101,30 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj objs = newObjs // See how many of these duplicates are 
identical - byHash := make(map[string][]fs.Object, len(objs)) + dupesByID := make(map[string][]fs.Object, len(objs)) for _, o := range objs { - md5sum, err := o.Hash(ctx, ht) - if err != nil || md5sum == "" { + ID := "" + if fs.Config.SizeOnly && o.Size() >= 0 { + ID = fmt.Sprintf("size %d", o.Size()) + } else if ht != hash.None { + hashValue, err := o.Hash(ctx, ht) + if err == nil && hashValue != "" { + ID = fmt.Sprintf("%v %s", ht, hashValue) + } + } + if ID == "" { remainingObjs = append(remainingObjs, o) } else { - byHash[md5sum] = append(byHash[md5sum], o) + dupesByID[ID] = append(dupesByID[ID], o) } } // Delete identical duplicates, filling remainingObjs with the ones remaining - for md5sum, hashObjs := range byHash { - remainingObjs = append(remainingObjs, hashObjs[0]) - if len(hashObjs) > 1 { - fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum) - for _, o := range hashObjs[1:] { + for ID, dupes := range dupesByID { + remainingObjs = append(remainingObjs, dupes[0]) + if len(dupes) > 1 { + fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID) + for _, o := range dupes[1:] { err := DeleteFile(ctx, o) if err != nil { remainingObjs = append(remainingObjs, o) @@ -132,11 +140,15 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) { fmt.Printf("%s: %d duplicates remain\n", remote, len(objs)) for i, o := range objs { - md5sum, err := o.Hash(ctx, ht) - if err != nil { - md5sum = err.Error() + hashValue := "" + if ht != hash.None { + var err error + hashValue, err = o.Hash(ctx, ht) + if err != nil { + hashValue = err.Error() + } } - fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum) + fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), 
o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue) } switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { case 's': diff --git a/fs/operations/dedupe_test.go b/fs/operations/dedupe_test.go index 7a441fd58..9bee4fcf0 100644 --- a/fs/operations/dedupe_test.go +++ b/fs/operations/dedupe_test.go @@ -75,6 +75,27 @@ func TestDeduplicateSkip(t *testing.T) { r.CheckWithDuplicates(t, file1, file3) } +func TestDeduplicateSizeOnly(t *testing.T) { + r := fstest.NewRun(t) + defer r.Finalise() + skipIfCantDedupe(t, r.Fremote) + + file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1) + file2 := r.WriteUncheckedObject(context.Background(), "one", "THIS IS ONE", t1) + file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t1) + r.CheckWithDuplicates(t, file1, file2, file3) + + fs.Config.SizeOnly = true + defer func() { + fs.Config.SizeOnly = false + }() + + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip) + require.NoError(t, err) + + r.CheckWithDuplicates(t, file1, file3) +} + func TestDeduplicateFirst(t *testing.T) { r := fstest.NewRun(t) defer r.Finalise()