mirror of
https://github.com/rclone/rclone.git
synced 2025-01-19 15:12:45 +08:00
dedupe: Make it obey the --size-only flag for duplicate detection #4321
This commit is contained in:
parent
d5f4c74697
commit
62f0bbb598
|
@ -22,19 +22,32 @@ func init() {
|
||||||
|
|
||||||
var commandDefinition = &cobra.Command{
|
var commandDefinition = &cobra.Command{
|
||||||
Use: "dedupe [mode] remote:path",
|
Use: "dedupe [mode] remote:path",
|
||||||
Short: `Interactively find duplicate files and delete/rename them.`,
|
Short: `Interactively find duplicate filenames and delete/rename them.`,
|
||||||
Long: `
|
Long: `
|
||||||
By default ` + "`" + `dedupe` + "`" + ` interactively finds duplicate files and offers to
|
|
||||||
delete all but one or rename them to be different. Only useful with
|
By default ` + "`dedupe`" + ` interactively finds files with duplicate
|
||||||
Google Drive which can have duplicate file names.
|
names and offers to delete all but one or rename them to be
|
||||||
|
different.
|
||||||
|
|
||||||
|
This is only useful with backends like Google Drive which can have
|
||||||
|
duplicate file names. It can be run on wrapping backends (eg crypt) if
|
||||||
|
they wrap a backend which supports duplicate file names.
|
||||||
|
|
||||||
In the first pass it will merge directories with the same name. It
|
In the first pass it will merge directories with the same name. It
|
||||||
will do this iteratively until all the identical directories have been
|
will do this iteratively until all the identically named directories
|
||||||
merged.
|
have been merged.
|
||||||
|
|
||||||
The ` + "`" + `dedupe` + "`" + ` command will delete all but one of any identical (same
|
In the second pass, for every group of duplicate file names, it will
|
||||||
md5sum) files it finds without confirmation. This means that for most
|
delete all but one identical file it finds without confirmation.
|
||||||
duplicated files the ` + "`" + `dedupe` + "`" + ` command will not be interactive.
|
This means that for most duplicated files the ` + "`dedupe`" + `
|
||||||
|
command will not be interactive.
|
||||||
|
|
||||||
|
` + "`dedupe`" + ` considers files to be identical if they have the
|
||||||
|
same hash. If the backend does not support hashes (eg crypt wrapping
|
||||||
|
Google Drive) then they will never be found to be identical. If you
|
||||||
|
use the ` + "`--size-only`" + ` flag then files will be considered
|
||||||
|
identical if they have the same size (any hash will be ignored). This
|
||||||
|
can be useful on crypt backends which do not support hashes.
|
||||||
|
|
||||||
**Important**: Since this can cause data loss, test first with the
|
**Important**: Since this can cause data loss, test first with the
|
||||||
` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
|
` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
|
||||||
|
@ -52,26 +65,26 @@ Before - with duplicates
|
||||||
1744073 2016-03-05 16:22:38.104000000 two.txt
|
1744073 2016-03-05 16:22:38.104000000 two.txt
|
||||||
564374 2016-03-05 16:22:52.118000000 two.txt
|
564374 2016-03-05 16:22:52.118000000 two.txt
|
||||||
|
|
||||||
Now the ` + "`" + `dedupe` + "`" + ` session
|
Now the ` + "`dedupe`" + ` session
|
||||||
|
|
||||||
$ rclone dedupe drive:dupes
|
$ rclone dedupe drive:dupes
|
||||||
2016/03/05 16:24:37 Google drive root 'dupes': Looking for duplicates using interactive mode.
|
2016/03/05 16:24:37 Google drive root 'dupes': Looking for duplicates using interactive mode.
|
||||||
one.txt: Found 4 duplicates - deleting identical copies
|
one.txt: Found 4 files with duplicate names
|
||||||
one.txt: Deleting 2/3 identical duplicates (md5sum "1eedaa9fe86fd4b8632e2ac549403b36")
|
one.txt: Deleting 2/3 identical duplicates (MD5 "1eedaa9fe86fd4b8632e2ac549403b36")
|
||||||
one.txt: 2 duplicates remain
|
one.txt: 2 duplicates remain
|
||||||
1: 6048320 bytes, 2016-03-05 16:23:16.798000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36
|
1: 6048320 bytes, 2016-03-05 16:23:16.798000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
|
||||||
2: 564374 bytes, 2016-03-05 16:23:06.731000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81
|
2: 564374 bytes, 2016-03-05 16:23:06.731000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
|
||||||
s) Skip and do nothing
|
s) Skip and do nothing
|
||||||
k) Keep just one (choose which in next step)
|
k) Keep just one (choose which in next step)
|
||||||
r) Rename all to be different (by changing file.jpg to file-1.jpg)
|
r) Rename all to be different (by changing file.jpg to file-1.jpg)
|
||||||
s/k/r> k
|
s/k/r> k
|
||||||
Enter the number of the file to keep> 1
|
Enter the number of the file to keep> 1
|
||||||
one.txt: Deleted 1 extra copies
|
one.txt: Deleted 1 extra copies
|
||||||
two.txt: Found 3 duplicates - deleting identical copies
|
two.txt: Found 3 files with duplicate names
|
||||||
two.txt: 3 duplicates remain
|
two.txt: 3 duplicates remain
|
||||||
1: 564374 bytes, 2016-03-05 16:22:52.118000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81
|
1: 564374 bytes, 2016-03-05 16:22:52.118000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
|
||||||
2: 6048320 bytes, 2016-03-05 16:22:46.185000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36
|
2: 6048320 bytes, 2016-03-05 16:22:46.185000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
|
||||||
3: 1744073 bytes, 2016-03-05 16:22:38.104000000, md5sum 851957f7fb6f0bc4ce76be966d336802
|
3: 1744073 bytes, 2016-03-05 16:22:38.104000000, MD5 851957f7fb6f0bc4ce76be966d336802
|
||||||
s) Skip and do nothing
|
s) Skip and do nothing
|
||||||
k) Keep just one (choose which in next step)
|
k) Keep just one (choose which in next step)
|
||||||
r) Rename all to be different (by changing file.jpg to file-1.jpg)
|
r) Rename all to be different (by changing file.jpg to file-1.jpg)
|
||||||
|
|
|
@ -101,22 +101,30 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
|
||||||
objs = newObjs
|
objs = newObjs
|
||||||
|
|
||||||
// See how many of these duplicates are identical
|
// See how many of these duplicates are identical
|
||||||
byHash := make(map[string][]fs.Object, len(objs))
|
dupesByID := make(map[string][]fs.Object, len(objs))
|
||||||
for _, o := range objs {
|
for _, o := range objs {
|
||||||
md5sum, err := o.Hash(ctx, ht)
|
ID := ""
|
||||||
if err != nil || md5sum == "" {
|
if fs.Config.SizeOnly && o.Size() >= 0 {
|
||||||
|
ID = fmt.Sprintf("size %d", o.Size())
|
||||||
|
} else if ht != hash.None {
|
||||||
|
hashValue, err := o.Hash(ctx, ht)
|
||||||
|
if err == nil && hashValue != "" {
|
||||||
|
ID = fmt.Sprintf("%v %s", ht, hashValue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ID == "" {
|
||||||
remainingObjs = append(remainingObjs, o)
|
remainingObjs = append(remainingObjs, o)
|
||||||
} else {
|
} else {
|
||||||
byHash[md5sum] = append(byHash[md5sum], o)
|
dupesByID[ID] = append(dupesByID[ID], o)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Delete identical duplicates, filling remainingObjs with the ones remaining
|
// Delete identical duplicates, filling remainingObjs with the ones remaining
|
||||||
for md5sum, hashObjs := range byHash {
|
for ID, dupes := range dupesByID {
|
||||||
remainingObjs = append(remainingObjs, hashObjs[0])
|
remainingObjs = append(remainingObjs, dupes[0])
|
||||||
if len(hashObjs) > 1 {
|
if len(dupes) > 1 {
|
||||||
fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum)
|
fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID)
|
||||||
for _, o := range hashObjs[1:] {
|
for _, o := range dupes[1:] {
|
||||||
err := DeleteFile(ctx, o)
|
err := DeleteFile(ctx, o)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
remainingObjs = append(remainingObjs, o)
|
remainingObjs = append(remainingObjs, o)
|
||||||
|
@ -132,11 +140,15 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
|
||||||
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
|
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
|
||||||
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
|
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
|
||||||
for i, o := range objs {
|
for i, o := range objs {
|
||||||
md5sum, err := o.Hash(ctx, ht)
|
hashValue := ""
|
||||||
if err != nil {
|
if ht != hash.None {
|
||||||
md5sum = err.Error()
|
var err error
|
||||||
|
hashValue, err = o.Hash(ctx, ht)
|
||||||
|
if err != nil {
|
||||||
|
hashValue = err.Error()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum)
|
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
|
||||||
}
|
}
|
||||||
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
|
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
|
||||||
case 's':
|
case 's':
|
||||||
|
|
|
@ -75,6 +75,27 @@ func TestDeduplicateSkip(t *testing.T) {
|
||||||
r.CheckWithDuplicates(t, file1, file3)
|
r.CheckWithDuplicates(t, file1, file3)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDeduplicateSizeOnly(t *testing.T) {
|
||||||
|
r := fstest.NewRun(t)
|
||||||
|
defer r.Finalise()
|
||||||
|
skipIfCantDedupe(t, r.Fremote)
|
||||||
|
|
||||||
|
file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
|
||||||
|
file2 := r.WriteUncheckedObject(context.Background(), "one", "THIS IS ONE", t1)
|
||||||
|
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t1)
|
||||||
|
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||||
|
|
||||||
|
fs.Config.SizeOnly = true
|
||||||
|
defer func() {
|
||||||
|
fs.Config.SizeOnly = false
|
||||||
|
}()
|
||||||
|
|
||||||
|
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
r.CheckWithDuplicates(t, file1, file3)
|
||||||
|
}
|
||||||
|
|
||||||
func TestDeduplicateFirst(t *testing.T) {
|
func TestDeduplicateFirst(t *testing.T) {
|
||||||
r := fstest.NewRun(t)
|
r := fstest.NewRun(t)
|
||||||
defer r.Finalise()
|
defer r.Finalise()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user