march: added flag to allow Unicode filenames to remain unique

If two filenames differ only in how a Unicode character is encoded
(for example the composed and decomposed forms of the same character),
rclone normalizes them and treats them as identical. This flag
disables that normalization so such filenames remain unique. This might
create unintended side effects, such as duplicated files when
downloading files whose names contain certain Unicode characters from
certain cloud providers to a macOS filesystem.

Fixes #4228
This commit is contained in:
Ben Zenker 2020-05-14 19:27:59 -04:00 committed by Nick Craig-Wood
parent 4006345cfb
commit 899c8e0697
6 changed files with 119 additions and 72 deletions

View File

@ -908,6 +908,20 @@ changed and won't need copying then you shouldn't use `--no-traverse`.
See [rclone copy](/commands/rclone_copy/) for an example of how to use it.
### --no-unicode-normalization ###
Don't normalize unicode characters in filenames during the sync routine.
Some operating systems (macOS in particular) store filenames that
contain unicode characters in their decomposed form. Some cloud storage
systems will then recompose the unicode, resulting in duplicate files if
the data is ever copied back to a local filesystem.
Using this flag will disable unicode normalization, treating each distinct
encoding of a character as unique. For example, by default the composed
é (U+00E9) and the decomposed é (e followed by U+0301) will be normalized
into the same character. With `--no-unicode-normalization` they will be
treated as unique characters.
### --no-update-modtime ###
When using this flag, rclone won't update modification times of remote

View File

@ -70,6 +70,7 @@ type ConfigInfo struct {
IgnoreCaseSync bool
NoTraverse bool
NoCheckDest bool
NoUnicodeNormalization bool
NoUpdateModTime bool
DataRateUnit string
CompareDest string

View File

@ -75,6 +75,7 @@ func AddFlags(flagSet *pflag.FlagSet) {
flags.BoolVarP(flagSet, &fs.Config.IgnoreCaseSync, "ignore-case-sync", "", fs.Config.IgnoreCaseSync, "Ignore case when synchronizing")
flags.BoolVarP(flagSet, &fs.Config.NoTraverse, "no-traverse", "", fs.Config.NoTraverse, "Don't traverse destination file system on copy.")
flags.BoolVarP(flagSet, &fs.Config.NoCheckDest, "no-check-dest", "", fs.Config.NoCheckDest, "Don't check the destination, copy regardless.")
flags.BoolVarP(flagSet, &fs.Config.NoUnicodeNormalization, "no-unicode-normalization", "", fs.Config.NoUnicodeNormalization, "Don't normalize unicode characters in filenames.")
flags.BoolVarP(flagSet, &fs.Config.NoUpdateModTime, "no-update-modtime", "", fs.Config.NoUpdateModTime, "Don't update destination mod-time if files identical.")
flags.StringVarP(flagSet, &fs.Config.CompareDest, "compare-dest", "", fs.Config.CompareDest, "Include additional server-side path during comparison.")
flags.StringVarP(flagSet, &fs.Config.CopyDest, "copy-dest", "", fs.Config.CopyDest, "Implies --compare-dest but also copies files from path into destination.")

View File

@ -22,15 +22,16 @@ import (
// calling Callback for each match
type March struct {
// parameters
Ctx context.Context // context for background goroutines
Fdst fs.Fs // source Fs
Fsrc fs.Fs // dest Fs
Dir string // directory
NoTraverse bool // don't traverse the destination
SrcIncludeAll bool // don't include all files in the src
DstIncludeAll bool // don't include all files in the destination
Callback Marcher // object to call with results
NoCheckDest bool // transfer all objects regardless without checking dst
Ctx context.Context // context for background goroutines
Fdst fs.Fs // source Fs
Fsrc fs.Fs // dest Fs
Dir string // directory
NoTraverse bool // don't traverse the destination
SrcIncludeAll bool // don't include all files in the src
DstIncludeAll bool // don't include all files in the destination
Callback Marcher // object to call with results
NoCheckDest bool // transfer all objects regardless without checking dst
NoUnicodeNormalization bool // don't normalize unicode characters in filenames
// internal state
srcListDir listDirFn // function to call to list a directory in the src
dstListDir listDirFn // function to call to list a directory in the dst
@ -55,7 +56,9 @@ func (m *March) init() {
}
// Now create the matching transform
// ..normalise the UTF8 first
m.transforms = append(m.transforms, norm.NFC.String)
if !m.NoUnicodeNormalization {
m.transforms = append(m.transforms, norm.NFC.String)
}
// ..if destination is caseInsensitive then make it lower case
// case Insensitive | src | dst | lower case compare |
// | No | No | No |

View File

@ -19,6 +19,7 @@ import (
"github.com/rclone/rclone/fstest/mockobject"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/text/unicode/norm"
)
// Some times used in the tests
@ -313,6 +314,8 @@ func TestMatchListings(t *testing.T) {
b = mockobject.Object("b")
c = mockobject.Object("c")
d = mockobject.Object("d")
uE1 = mockobject.Object("é") // one of the unicode E characters
uE2 = mockobject.Object("é") // a different unicode E character
dirA = mockdir.New("A")
dirb = mockdir.New("b")
)
@ -419,6 +422,28 @@ func TestMatchListings(t *testing.T) {
},
transforms: []matchTransformFn{strings.ToLower},
},
{
what: "Unicode near-duplicate that becomes duplicate with normalization",
input: fs.DirEntries{
uE1, uE1,
uE2, uE2,
},
matches: []matchPair{
{uE1, uE1},
},
transforms: []matchTransformFn{norm.NFC.String},
},
{
what: "Unicode near-duplicate with no normalization",
input: fs.DirEntries{
uE1, uE1,
uE2, uE2,
},
matches: []matchPair{
{uE1, uE1},
{uE2, uE2},
},
},
{
what: "File and directory are not duplicates - srcOnly",
input: fs.DirEntries{

View File

@ -30,42 +30,43 @@ type syncCopyMove struct {
deleteEmptySrcDirs bool
dir string
// internal state
ctx context.Context // internal context for controlling go-routines
cancel func() // cancel the context
noTraverse bool // if set don't traverse the dst
noCheckDest bool // if set transfer all objects regardless without checking dst
deletersWg sync.WaitGroup // for delete before go routine
deleteFilesCh chan fs.Object // channel to receive deletes if delete before
trackRenames bool // set if we should do server side renames
trackRenamesStrategy trackRenamesStrategy // stratgies used for tracking renames
dstFilesMu sync.Mutex // protect dstFiles
dstFiles map[string]fs.Object // dst files, always filled
srcFiles map[string]fs.Object // src files, only used if deleteBefore
srcFilesChan chan fs.Object // passes src objects
srcFilesResult chan error // error result of src listing
dstFilesResult chan error // error result of dst listing
dstEmptyDirsMu sync.Mutex // protect dstEmptyDirs
dstEmptyDirs map[string]fs.DirEntry // potentially empty directories
srcEmptyDirsMu sync.Mutex // protect srcEmptyDirs
srcEmptyDirs map[string]fs.DirEntry // potentially empty directories
checkerWg sync.WaitGroup // wait for checkers
toBeChecked *pipe // checkers channel
transfersWg sync.WaitGroup // wait for transfers
toBeUploaded *pipe // copiers channel
errorMu sync.Mutex // Mutex covering the errors variables
err error // normal error from copy process
noRetryErr error // error with NoRetry set
fatalErr error // fatal error
commonHash hash.Type // common hash type between src and dst
renameMapMu sync.Mutex // mutex to protect the below
renameMap map[string][]fs.Object // dst files by hash - only used by trackRenames
renamerWg sync.WaitGroup // wait for renamers
toBeRenamed *pipe // renamers channel
trackRenamesWg sync.WaitGroup // wg for background track renames
trackRenamesCh chan fs.Object // objects are pumped in here
renameCheck []fs.Object // accumulate files to check for rename here
compareCopyDest fs.Fs // place to check for files to server side copy
backupDir fs.Fs // place to store overwrites/deletes
ctx context.Context // internal context for controlling go-routines
cancel func() // cancel the context
noTraverse bool // if set don't traverse the dst
noCheckDest bool // if set transfer all objects regardless without checking dst
noUnicodeNormalization bool // don't normalize unicode characters in filenames
deletersWg sync.WaitGroup // for delete before go routine
deleteFilesCh chan fs.Object // channel to receive deletes if delete before
trackRenames bool // set if we should do server side renames
trackRenamesStrategy trackRenamesStrategy // stratgies used for tracking renames
dstFilesMu sync.Mutex // protect dstFiles
dstFiles map[string]fs.Object // dst files, always filled
srcFiles map[string]fs.Object // src files, only used if deleteBefore
srcFilesChan chan fs.Object // passes src objects
srcFilesResult chan error // error result of src listing
dstFilesResult chan error // error result of dst listing
dstEmptyDirsMu sync.Mutex // protect dstEmptyDirs
dstEmptyDirs map[string]fs.DirEntry // potentially empty directories
srcEmptyDirsMu sync.Mutex // protect srcEmptyDirs
srcEmptyDirs map[string]fs.DirEntry // potentially empty directories
checkerWg sync.WaitGroup // wait for checkers
toBeChecked *pipe // checkers channel
transfersWg sync.WaitGroup // wait for transfers
toBeUploaded *pipe // copiers channel
errorMu sync.Mutex // Mutex covering the errors variables
err error // normal error from copy process
noRetryErr error // error with NoRetry set
fatalErr error // fatal error
commonHash hash.Type // common hash type between src and dst
renameMapMu sync.Mutex // mutex to protect the below
renameMap map[string][]fs.Object // dst files by hash - only used by trackRenames
renamerWg sync.WaitGroup // wait for renamers
toBeRenamed *pipe // renamers channel
trackRenamesWg sync.WaitGroup // wg for background track renames
trackRenamesCh chan fs.Object // objects are pumped in here
renameCheck []fs.Object // accumulate files to check for rename here
compareCopyDest fs.Fs // place to check for files to server side copy
backupDir fs.Fs // place to store overwrites/deletes
}
type trackRenamesStrategy byte
@ -88,24 +89,25 @@ func newSyncCopyMove(ctx context.Context, fdst, fsrc fs.Fs, deleteMode fs.Delete
return nil, fserrors.FatalError(fs.ErrorOverlapping)
}
s := &syncCopyMove{
fdst: fdst,
fsrc: fsrc,
deleteMode: deleteMode,
DoMove: DoMove,
copyEmptySrcDirs: copyEmptySrcDirs,
deleteEmptySrcDirs: deleteEmptySrcDirs,
dir: "",
srcFilesChan: make(chan fs.Object, fs.Config.Checkers+fs.Config.Transfers),
srcFilesResult: make(chan error, 1),
dstFilesResult: make(chan error, 1),
dstEmptyDirs: make(map[string]fs.DirEntry),
srcEmptyDirs: make(map[string]fs.DirEntry),
noTraverse: fs.Config.NoTraverse,
noCheckDest: fs.Config.NoCheckDest,
deleteFilesCh: make(chan fs.Object, fs.Config.Checkers),
trackRenames: fs.Config.TrackRenames,
commonHash: fsrc.Hashes().Overlap(fdst.Hashes()).GetOne(),
trackRenamesCh: make(chan fs.Object, fs.Config.Checkers),
fdst: fdst,
fsrc: fsrc,
deleteMode: deleteMode,
DoMove: DoMove,
copyEmptySrcDirs: copyEmptySrcDirs,
deleteEmptySrcDirs: deleteEmptySrcDirs,
dir: "",
srcFilesChan: make(chan fs.Object, fs.Config.Checkers+fs.Config.Transfers),
srcFilesResult: make(chan error, 1),
dstFilesResult: make(chan error, 1),
dstEmptyDirs: make(map[string]fs.DirEntry),
srcEmptyDirs: make(map[string]fs.DirEntry),
noTraverse: fs.Config.NoTraverse,
noCheckDest: fs.Config.NoCheckDest,
noUnicodeNormalization: fs.Config.NoUnicodeNormalization,
deleteFilesCh: make(chan fs.Object, fs.Config.Checkers),
trackRenames: fs.Config.TrackRenames,
commonHash: fsrc.Hashes().Overlap(fdst.Hashes()).GetOne(),
trackRenamesCh: make(chan fs.Object, fs.Config.Checkers),
}
var err error
s.toBeChecked, err = newPipe(fs.Config.OrderBy, accounting.Stats(ctx).SetCheckQueue, fs.Config.MaxBacklog)
@ -782,14 +784,15 @@ func (s *syncCopyMove) run() error {
// set up a march over fdst and fsrc
m := &march.March{
Ctx: s.ctx,
Fdst: s.fdst,
Fsrc: s.fsrc,
Dir: s.dir,
NoTraverse: s.noTraverse,
Callback: s,
DstIncludeAll: filter.Active.Opt.DeleteExcluded,
NoCheckDest: s.noCheckDest,
Ctx: s.ctx,
Fdst: s.fdst,
Fsrc: s.fsrc,
Dir: s.dir,
NoTraverse: s.noTraverse,
Callback: s,
DstIncludeAll: filter.Active.Opt.DeleteExcluded,
NoCheckDest: s.noCheckDest,
NoUnicodeNormalization: s.noUnicodeNormalization,
}
s.processError(m.Run())