dedupe: Make it obey the --size-only flag for duplicate detection #4321

Nick Craig-Wood 2020-06-16 12:39:26 +01:00
parent d5f4c74697
commit 62f0bbb598
3 changed files with 77 additions and 31 deletions

cmd/dedupe/dedupe.go

@@ -22,19 +22,32 @@ func init() {
 var commandDefinition = &cobra.Command{
     Use:   "dedupe [mode] remote:path",
-    Short: `Interactively find duplicate files and delete/rename them.`,
+    Short: `Interactively find duplicate filenames and delete/rename them.`,
     Long: `
-By default ` + "`" + `dedupe` + "`" + ` interactively finds duplicate files and offers to
-delete all but one or rename them to be different. Only useful with
-Google Drive which can have duplicate file names.
+By default ` + "`dedupe`" + ` interactively finds files with duplicate
+names and offers to delete all but one or rename them to be
+different.
+
+This is only useful with backends like Google Drive which can have
+duplicate file names. It can be run on wrapping backends (eg crypt) if
+they wrap a backend which supports duplicate file names.
 
 In the first pass it will merge directories with the same name. It
-will do this iteratively until all the identical directories have been
-merged.
+will do this iteratively until all the identically named directories
+have been merged.
 
-The ` + "`" + `dedupe` + "`" + ` command will delete all but one of any identical (same
-md5sum) files it finds without confirmation. This means that for most
-duplicated files the ` + "`" + `dedupe` + "`" + ` command will not be interactive.
+In the second pass, for every group of duplicate file names, it will
+delete all but one of any identical files it finds without
+confirmation. This means that for most duplicated files the
+` + "`dedupe`" + ` command will not be interactive.
+
+` + "`dedupe`" + ` considers files to be identical if they have the
+same hash. If the backend does not support hashes (eg crypt wrapping
+Google Drive) then they will never be found to be identical. If you
+use the ` + "`--size-only`" + ` flag then files will be considered
+identical if they have the same size (any hash will be ignored). This
+can be useful on crypt backends which do not support hashes.
 
 **Important**: Since this can cause data loss, test first with the
 ` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
@@ -52,26 +65,26 @@ Before - with duplicates
     1744073 2016-03-05 16:22:38.104000000 two.txt
      564374 2016-03-05 16:22:52.118000000 two.txt
 
-Now the ` + "`" + `dedupe` + "`" + ` session
+Now the ` + "`dedupe`" + ` session
 
     $ rclone dedupe drive:dupes
     2016/03/05 16:24:37 Google drive root 'dupes': Looking for duplicates using interactive mode.
-    one.txt: Found 4 duplicates - deleting identical copies
-    one.txt: Deleting 2/3 identical duplicates (md5sum "1eedaa9fe86fd4b8632e2ac549403b36")
+    one.txt: Found 4 files with duplicate names
+    one.txt: Deleting 2/3 identical duplicates (MD5 "1eedaa9fe86fd4b8632e2ac549403b36")
     one.txt: 2 duplicates remain
-      1: 6048320 bytes, 2016-03-05 16:23:16.798000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36
-      2: 564374 bytes, 2016-03-05 16:23:06.731000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81
+      1: 6048320 bytes, 2016-03-05 16:23:16.798000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
+      2: 564374 bytes, 2016-03-05 16:23:06.731000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
     s) Skip and do nothing
     k) Keep just one (choose which in next step)
     r) Rename all to be different (by changing file.jpg to file-1.jpg)
     s/k/r> k
     Enter the number of the file to keep> 1
     one.txt: Deleted 1 extra copies
-    two.txt: Found 3 duplicates - deleting identical copies
+    two.txt: Found 3 files with duplicate names
     two.txt: 3 duplicates remain
-      1: 564374 bytes, 2016-03-05 16:22:52.118000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81
-      2: 6048320 bytes, 2016-03-05 16:22:46.185000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36
-      3: 1744073 bytes, 2016-03-05 16:22:38.104000000, md5sum 851957f7fb6f0bc4ce76be966d336802
+      1: 564374 bytes, 2016-03-05 16:22:52.118000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
+      2: 6048320 bytes, 2016-03-05 16:22:46.185000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
+      3: 1744073 bytes, 2016-03-05 16:22:38.104000000, MD5 851957f7fb6f0bc4ce76be966d336802
     s) Skip and do nothing
     k) Keep just one (choose which in next step)
     r) Rename all to be different (by changing file.jpg to file-1.jpg)
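With this change, a size-only dedupe run can be previewed before letting it delete anything. A minimal invocation sketch (the remote name is illustrative, not from the commit):

    $ rclone dedupe --size-only --dry-run drive:dupes

Because `--size-only` treats any same-size files as identical, even where hashes are unavailable, a `--dry-run` pass that merely reports the would-be deletions is the sensible first step.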

fs/operations/dedupe.go

@@ -101,22 +101,30 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
     objs = newObjs
 
     // See how many of these duplicates are identical
-    byHash := make(map[string][]fs.Object, len(objs))
+    dupesByID := make(map[string][]fs.Object, len(objs))
     for _, o := range objs {
-        md5sum, err := o.Hash(ctx, ht)
-        if err != nil || md5sum == "" {
+        ID := ""
+        if fs.Config.SizeOnly && o.Size() >= 0 {
+            ID = fmt.Sprintf("size %d", o.Size())
+        } else if ht != hash.None {
+            hashValue, err := o.Hash(ctx, ht)
+            if err == nil && hashValue != "" {
+                ID = fmt.Sprintf("%v %s", ht, hashValue)
+            }
+        }
+        if ID == "" {
             remainingObjs = append(remainingObjs, o)
         } else {
-            byHash[md5sum] = append(byHash[md5sum], o)
+            dupesByID[ID] = append(dupesByID[ID], o)
         }
     }
 
     // Delete identical duplicates, filling remainingObjs with the ones remaining
-    for md5sum, hashObjs := range byHash {
-        remainingObjs = append(remainingObjs, hashObjs[0])
-        if len(hashObjs) > 1 {
-            fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum)
-            for _, o := range hashObjs[1:] {
+    for ID, dupes := range dupesByID {
+        remainingObjs = append(remainingObjs, dupes[0])
+        if len(dupes) > 1 {
+            fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID)
+            for _, o := range dupes[1:] {
                 err := DeleteFile(ctx, o)
                 if err != nil {
                     remainingObjs = append(remainingObjs, o)
@@ -132,11 +140,15 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
 func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
     fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
     for i, o := range objs {
-        md5sum, err := o.Hash(ctx, ht)
-        if err != nil {
-            md5sum = err.Error()
+        hashValue := ""
+        if ht != hash.None {
+            var err error
+            hashValue, err = o.Hash(ctx, ht)
+            if err != nil {
+                hashValue = err.Error()
+            }
         }
-        fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum)
+        fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
     }
     switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
     case 's':
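The heart of the change above is replacing "group by md5sum" with "group by a generic identity key" that is either a size or a hash. Below is a self-contained sketch of that grouping technique under illustrative names (object, memObj, identityKey, and groupByID are not rclone's API; the real code works on fs.Object and reads fs.Config.SizeOnly):

    package main

    import "fmt"

    // object is a hypothetical stand-in for fs.Object, reduced to the
    // two methods the grouping logic needs.
    type object interface {
        Size() int64
        Hash() (string, error) // configured hash, "" if unsupported
    }

    // identityKey mirrors the ID construction in dedupeDeleteIdentical:
    // prefer the size when size-only mode is on, fall back to the hash,
    // and return "" when the object cannot be identified either way.
    func identityKey(o object, sizeOnly bool) string {
        if sizeOnly && o.Size() >= 0 {
            return fmt.Sprintf("size %d", o.Size())
        }
        if h, err := o.Hash(); err == nil && h != "" {
            return fmt.Sprintf("hash %s", h)
        }
        return ""
    }

    // groupByID buckets objects by identity key; objects with no usable
    // key are returned separately so they are never treated as identical.
    func groupByID(objs []object, sizeOnly bool) (map[string][]object, []object) {
        groups := make(map[string][]object, len(objs))
        var unidentified []object
        for _, o := range objs {
            if id := identityKey(o, sizeOnly); id != "" {
                groups[id] = append(groups[id], o)
            } else {
                unidentified = append(unidentified, o)
            }
        }
        return groups, unidentified
    }

    // memObj is a throwaway in-memory object for the demo below.
    type memObj struct {
        size int64
        hash string
    }

    func (m memObj) Size() int64           { return m.size }
    func (m memObj) Hash() (string, error) { return m.hash, nil }

    func main() {
        objs := []object{
            memObj{11, "aaa"}, // same size as the next object...
            memObj{11, "bbb"}, // ...but a different hash
            memObj{19, "ccc"},
        }
        // In size-only mode the first two collide despite differing
        // hashes, which is why --size-only must be opted into explicitly.
        groups, _ := groupByID(objs, true)
        for id, g := range groups {
            fmt.Printf("%s -> %d object(s)\n", id, len(g))
        }
    }

Keeping objects with no usable key out of the map mirrors the diff's behaviour: files that cannot be positively identified are never deleted as "identical".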

fs/operations/dedupe_test.go

@@ -75,6 +75,27 @@ func TestDeduplicateSkip(t *testing.T) {
     r.CheckWithDuplicates(t, file1, file3)
 }
 
+func TestDeduplicateSizeOnly(t *testing.T) {
+    r := fstest.NewRun(t)
+    defer r.Finalise()
+    skipIfCantDedupe(t, r.Fremote)
+
+    file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
+    file2 := r.WriteUncheckedObject(context.Background(), "one", "THIS IS ONE", t1)
+    file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t1)
+    r.CheckWithDuplicates(t, file1, file2, file3)
+
+    fs.Config.SizeOnly = true
+    defer func() {
+        fs.Config.SizeOnly = false
+    }()
+
+    err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
+    require.NoError(t, err)
+
+    r.CheckWithDuplicates(t, file1, file3)
+}
+
 func TestDeduplicateFirst(t *testing.T) {
     r := fstest.NewRun(t)
     defer r.Finalise()
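A note on TestDeduplicateSizeOnly: "This is one" and "THIS IS ONE" are both 11 bytes while "This is another one" is 19, so with SizeOnly set the first two files collide on size alone even though their contents (and hence hashes) differ, and the identical-duplicates pass deletes one of that pair, leaving two files behind. The deferred reset of fs.Config.SizeOnly stops the global flag change from leaking into the other tests.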