2018-04-13 00:17:11 +08:00
// dedupe - gets rid of identical files on remotes which can have duplicate file names (drive, mega)
package operations
import (
2019-06-17 16:34:30 +08:00
"context"
2018-04-13 00:17:11 +08:00
"fmt"
"log"
"path"
"sort"
"strings"
"github.com/pkg/errors"
2019-07-29 01:47:38 +08:00
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/config"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/fs/walk"
2018-04-13 00:17:11 +08:00
)
// dedupeRename renames the objs slice to different names
2019-06-17 16:34:30 +08:00
func dedupeRename ( ctx context . Context , f fs . Fs , remote string , objs [ ] fs . Object ) {
2018-04-13 00:17:11 +08:00
doMove := f . Features ( ) . Move
if doMove == nil {
log . Fatalf ( "Fs %v doesn't support Move" , f )
}
ext := path . Ext ( remote )
base := remote [ : len ( remote ) - len ( ext ) ]
2018-10-29 12:05:45 +08:00
outer :
2018-04-13 00:17:11 +08:00
for i , o := range objs {
2018-10-29 12:05:45 +08:00
suffix := 1
newName := fmt . Sprintf ( "%s-%d%s" , base , i + suffix , ext )
2019-06-17 16:34:30 +08:00
_ , err := f . NewObject ( ctx , newName )
2018-10-29 12:05:45 +08:00
for ; err != fs . ErrorObjectNotFound ; suffix ++ {
if err != nil {
2019-11-18 22:13:02 +08:00
err = fs . CountError ( err )
2018-10-29 12:05:45 +08:00
fs . Errorf ( o , "Failed to check for existing object: %v" , err )
continue outer
}
if suffix > 100 {
fs . Errorf ( o , "Could not find an available new name" )
continue outer
}
newName = fmt . Sprintf ( "%s-%d%s" , base , i + suffix , ext )
2019-06-17 16:34:30 +08:00
_ , err = f . NewObject ( ctx , newName )
2018-10-29 12:05:45 +08:00
}
2018-04-13 00:17:11 +08:00
if ! fs . Config . DryRun {
2019-06-17 16:34:30 +08:00
newObj , err := doMove ( ctx , o , newName )
2018-04-13 00:17:11 +08:00
if err != nil {
2019-11-18 22:13:02 +08:00
err = fs . CountError ( err )
2018-04-13 00:17:11 +08:00
fs . Errorf ( o , "Failed to rename: %v" , err )
continue
}
fs . Infof ( newObj , "renamed from: %v" , o )
} else {
fs . Logf ( remote , "Not renaming to %q as --dry-run" , newName )
}
}
}
// dedupeDeleteAllButOne deletes all but the one in keep
2019-06-17 16:34:30 +08:00
func dedupeDeleteAllButOne ( ctx context . Context , keep int , remote string , objs [ ] fs . Object ) {
2019-10-08 23:35:53 +08:00
count := 0
2018-04-13 00:17:11 +08:00
for i , o := range objs {
if i == keep {
continue
}
2019-10-08 23:35:53 +08:00
err := DeleteFile ( ctx , o )
if err == nil {
count ++
}
}
if count > 0 {
fs . Logf ( remote , "Deleted %d extra copies" , count )
2018-04-13 00:17:11 +08:00
}
}
// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
2019-06-17 16:34:30 +08:00
func dedupeDeleteIdentical ( ctx context . Context , ht hash . Type , remote string , objs [ ] fs . Object ) ( remainingObjs [ ] fs . Object ) {
2018-04-13 00:17:11 +08:00
// See how many of these duplicates are identical
byHash := make ( map [ string ] [ ] fs . Object , len ( objs ) )
for _ , o := range objs {
2019-06-17 16:34:30 +08:00
md5sum , err := o . Hash ( ctx , ht )
2018-04-13 00:17:11 +08:00
if err != nil || md5sum == "" {
remainingObjs = append ( remainingObjs , o )
} else {
byHash [ md5sum ] = append ( byHash [ md5sum ] , o )
}
}
// Delete identical duplicates, filling remainingObjs with the ones remaining
for md5sum , hashObjs := range byHash {
2019-09-20 02:42:17 +08:00
remainingObjs = append ( remainingObjs , hashObjs [ 0 ] )
2018-04-13 00:17:11 +08:00
if len ( hashObjs ) > 1 {
fs . Logf ( remote , "Deleting %d/%d identical duplicates (%v %q)" , len ( hashObjs ) - 1 , len ( hashObjs ) , ht , md5sum )
for _ , o := range hashObjs [ 1 : ] {
2019-09-20 02:42:17 +08:00
err := DeleteFile ( ctx , o )
if err != nil {
remainingObjs = append ( remainingObjs , o )
}
2018-04-13 00:17:11 +08:00
}
}
}
return remainingObjs
}
// dedupeInteractive interactively dedupes the slice of objects
2019-06-17 16:34:30 +08:00
func dedupeInteractive ( ctx context . Context , f fs . Fs , ht hash . Type , remote string , objs [ ] fs . Object ) {
2018-04-13 00:17:11 +08:00
fmt . Printf ( "%s: %d duplicates remain\n" , remote , len ( objs ) )
for i , o := range objs {
2019-06-17 16:34:30 +08:00
md5sum , err := o . Hash ( ctx , ht )
2018-04-13 00:17:11 +08:00
if err != nil {
md5sum = err . Error ( )
}
2019-06-17 16:34:30 +08:00
fmt . Printf ( " %d: %12d bytes, %s, %v %32s\n" , i + 1 , o . Size ( ) , o . ModTime ( ctx ) . Local ( ) . Format ( "2006-01-02 15:04:05.000000000" ) , ht , md5sum )
2018-04-13 00:17:11 +08:00
}
switch config . Command ( [ ] string { "sSkip and do nothing" , "kKeep just one (choose which in next step)" , "rRename all to be different (by changing file.jpg to file-1.jpg)" } ) {
case 's' :
case 'k' :
keep := config . ChooseNumber ( "Enter the number of the file to keep" , 1 , len ( objs ) )
2019-06-17 16:34:30 +08:00
dedupeDeleteAllButOne ( ctx , keep - 1 , remote , objs )
2018-04-13 00:17:11 +08:00
case 'r' :
2019-06-17 16:34:30 +08:00
dedupeRename ( ctx , f , remote , objs )
2018-04-13 00:17:11 +08:00
}
}
// DeduplicateMode is how the dedupe command chooses what to do with a
// set of files which share the same name.
type DeduplicateMode int

// Deduplicate modes
const (
	DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
	DeduplicateSkip                               // skip all conflicts
	DeduplicateFirst                              // choose the first object
	DeduplicateNewest                             // choose the newest object
	DeduplicateOldest                             // choose the oldest object
	DeduplicateRename                             // rename the objects
	DeduplicateLargest                            // choose the largest object
	DeduplicateSmallest                           // choose the smallest object
)

// String returns the lower case name of the mode, or "unknown" if x is
// not one of the declared Deduplicate modes.
func (x DeduplicateMode) String() string {
	// Indexed by mode value; the modes are consecutive from zero.
	names := [...]string{
		DeduplicateInteractive: "interactive",
		DeduplicateSkip:        "skip",
		DeduplicateFirst:       "first",
		DeduplicateNewest:      "newest",
		DeduplicateOldest:      "oldest",
		DeduplicateRename:      "rename",
		DeduplicateLargest:     "largest",
		DeduplicateSmallest:    "smallest",
	}
	if x < 0 || int(x) >= len(names) {
		return "unknown"
	}
	return names[x]
}
// Set a DeduplicateMode from a string
func ( x * DeduplicateMode ) Set ( s string ) error {
switch strings . ToLower ( s ) {
case "interactive" :
* x = DeduplicateInteractive
case "skip" :
* x = DeduplicateSkip
case "first" :
* x = DeduplicateFirst
case "newest" :
* x = DeduplicateNewest
case "oldest" :
* x = DeduplicateOldest
case "rename" :
* x = DeduplicateRename
2018-04-22 05:57:08 +08:00
case "largest" :
* x = DeduplicateLargest
2020-01-16 21:47:15 +08:00
case "smallest" :
* x = DeduplicateSmallest
2018-04-13 00:17:11 +08:00
default :
return errors . Errorf ( "Unknown mode for dedupe %q." , s )
}
return nil
}
// Type returns the name of the type of the value, which is always
// "string".
func (x *DeduplicateMode) Type() string {
	const typeName = "string"
	return typeName
}
// dedupeFindDuplicateDirs scans f for duplicate directories
2019-06-17 16:34:30 +08:00
func dedupeFindDuplicateDirs ( ctx context . Context , f fs . Fs ) ( [ ] [ ] fs . Directory , error ) {
2019-03-17 18:44:32 +08:00
dirs := map [ string ] [ ] fs . Directory { }
2019-06-17 16:34:30 +08:00
err := walk . ListR ( ctx , f , "" , true , fs . Config . MaxDepth , walk . ListDirs , func ( entries fs . DirEntries ) error {
2018-04-13 00:17:11 +08:00
entries . ForDir ( func ( d fs . Directory ) {
dirs [ d . Remote ( ) ] = append ( dirs [ d . Remote ( ) ] , d )
} )
return nil
} )
if err != nil {
return nil , errors . Wrap ( err , "find duplicate dirs" )
}
2019-11-26 18:58:52 +08:00
// make sure parents are before children
duplicateNames := [ ] string { }
for name , ds := range dirs {
2019-03-17 18:44:32 +08:00
if len ( ds ) > 1 {
2019-11-26 18:58:52 +08:00
duplicateNames = append ( duplicateNames , name )
2019-03-17 18:44:32 +08:00
}
}
2019-11-26 18:58:52 +08:00
sort . Strings ( duplicateNames )
duplicateDirs := [ ] [ ] fs . Directory { }
for _ , name := range duplicateNames {
duplicateDirs = append ( duplicateDirs , dirs [ name ] )
}
2018-04-13 00:17:11 +08:00
return duplicateDirs , nil
}
// dedupeMergeDuplicateDirs merges all the duplicate directories found
2019-06-17 16:34:30 +08:00
func dedupeMergeDuplicateDirs ( ctx context . Context , f fs . Fs , duplicateDirs [ ] [ ] fs . Directory ) error {
2018-04-13 00:17:11 +08:00
mergeDirs := f . Features ( ) . MergeDirs
if mergeDirs == nil {
return errors . Errorf ( "%v: can't merge directories" , f )
}
dirCacheFlush := f . Features ( ) . DirCacheFlush
if dirCacheFlush == nil {
return errors . Errorf ( "%v: can't flush dir cache" , f )
}
for _ , dirs := range duplicateDirs {
if ! fs . Config . DryRun {
fs . Infof ( dirs [ 0 ] , "Merging contents of duplicate directories" )
2019-06-17 16:34:30 +08:00
err := mergeDirs ( ctx , dirs )
2018-04-13 00:17:11 +08:00
if err != nil {
2019-11-26 18:58:52 +08:00
err = fs . CountError ( err )
fs . Errorf ( nil , "merge duplicate dirs: %v" , err )
2018-04-13 00:17:11 +08:00
}
} else {
fs . Infof ( dirs [ 0 ] , "NOT Merging contents of duplicate directories as --dry-run" )
}
}
dirCacheFlush ( )
return nil
}
2020-01-16 21:47:15 +08:00
// sortOldestFirst sorts objs in place by modification time so that the
// oldest object comes first.
func sortOldestFirst(objs []fs.Object) {
	older := func(a, b int) bool {
		timeA := objs[a].ModTime(context.TODO())
		timeB := objs[b].ModTime(context.TODO())
		return timeA.Before(timeB)
	}
	sort.Slice(objs, older)
}
// sortSmallestFirst sorts objs in place by size so that the smallest
// object comes first.
func sortSmallestFirst(objs []fs.Object) {
	smaller := func(a, b int) bool {
		sizeA, sizeB := objs[a].Size(), objs[b].Size()
		return sizeA < sizeB
	}
	sort.Slice(objs, smaller)
}
2018-04-13 00:17:11 +08:00
// Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful with
// Google Drive which can have duplicate file names.
2019-06-17 16:34:30 +08:00
func Deduplicate ( ctx context . Context , f fs . Fs , mode DeduplicateMode ) error {
2018-04-13 00:17:11 +08:00
fs . Infof ( f , "Looking for duplicates using %v mode." , mode )
2019-11-26 18:58:52 +08:00
// Find duplicate directories first and fix them
duplicateDirs , err := dedupeFindDuplicateDirs ( ctx , f )
if err != nil {
return err
}
if len ( duplicateDirs ) != 0 {
2019-06-17 16:34:30 +08:00
err = dedupeMergeDuplicateDirs ( ctx , f , duplicateDirs )
2018-04-13 00:17:11 +08:00
if err != nil {
return err
}
}
// find a hash to use
ht := f . Hashes ( ) . GetOne ( )
// Now find duplicate files
files := map [ string ] [ ] fs . Object { }
2019-11-26 18:58:52 +08:00
err = walk . ListR ( ctx , f , "" , true , fs . Config . MaxDepth , walk . ListObjects , func ( entries fs . DirEntries ) error {
2018-04-13 00:17:11 +08:00
entries . ForObject ( func ( o fs . Object ) {
remote := o . Remote ( )
files [ remote ] = append ( files [ remote ] , o )
} )
return nil
} )
if err != nil {
return err
}
2018-04-22 05:57:08 +08:00
2018-04-13 00:17:11 +08:00
for remote , objs := range files {
if len ( objs ) > 1 {
fs . Logf ( remote , "Found %d duplicates - deleting identical copies" , len ( objs ) )
2019-06-17 16:34:30 +08:00
objs = dedupeDeleteIdentical ( ctx , ht , remote , objs )
2018-04-13 00:17:11 +08:00
if len ( objs ) <= 1 {
fs . Logf ( remote , "All duplicates removed" )
continue
}
switch mode {
case DeduplicateInteractive :
2019-06-17 16:34:30 +08:00
dedupeInteractive ( ctx , f , ht , remote , objs )
2018-04-13 00:17:11 +08:00
case DeduplicateFirst :
2019-06-17 16:34:30 +08:00
dedupeDeleteAllButOne ( ctx , 0 , remote , objs )
2018-04-13 00:17:11 +08:00
case DeduplicateNewest :
2020-01-16 21:47:15 +08:00
sortOldestFirst ( objs )
2019-06-17 16:34:30 +08:00
dedupeDeleteAllButOne ( ctx , len ( objs ) - 1 , remote , objs )
2018-04-13 00:17:11 +08:00
case DeduplicateOldest :
2020-01-16 21:47:15 +08:00
sortOldestFirst ( objs )
2019-06-17 16:34:30 +08:00
dedupeDeleteAllButOne ( ctx , 0 , remote , objs )
2018-04-13 00:17:11 +08:00
case DeduplicateRename :
2019-06-17 16:34:30 +08:00
dedupeRename ( ctx , f , remote , objs )
2018-04-22 05:57:08 +08:00
case DeduplicateLargest :
2020-01-16 21:47:15 +08:00
sortSmallestFirst ( objs )
dedupeDeleteAllButOne ( ctx , len ( objs ) - 1 , remote , objs )
case DeduplicateSmallest :
sortSmallestFirst ( objs )
dedupeDeleteAllButOne ( ctx , 0 , remote , objs )
2018-04-13 00:17:11 +08:00
case DeduplicateSkip :
// skip
default :
//skip
}
}
}
return nil
}