From ccecfa9cb1cbc8da7e88974a8f29c86b67c202a7 Mon Sep 17 00:00:00 2001 From: Ivan Andreev Date: Wed, 25 Sep 2019 02:18:30 +0300 Subject: [PATCH] chunker: finish meta-format before release changes: - chunker: remove GetTier and SetTier - remove wdmrcompat metaformat - remove fastopen strategy - make hash_type option non-advanced - advertise hash support when possible - add metadata field "ver", run strict checks - describe internal behavior in comments - improve documentation note: wdmrcompat used to write the file name in the metadata, so the maximum metadata size was 1K; removing it allows capping the size at 200 bytes now. --- backend/chunker/chunker.go | 299 ++++++++++++++------------------ backend/chunker/chunker_test.go | 12 +- docs/content/chunker.md | 175 ++++++++++--------- fs/sync/sync_test.go | 86 +++++---- fstest/test_all/config.yaml | 43 ++--- 5 files changed, 303 insertions(+), 312 deletions(-) diff --git a/backend/chunker/chunker.go b/backend/chunker/chunker.go index 3561bac59..6d5e25bed 100644 --- a/backend/chunker/chunker.go +++ b/backend/chunker/chunker.go @@ -36,13 +36,11 @@ const ( // WARNING: this optimization is not transaction safe! optimizeFirstChunk = false - // Normally metadata is a small (less than 1KB) piece of JSON. + // Normally metadata is a small (100-200 bytes) piece of JSON. // Valid metadata size should not exceed this limit. - maxMetaDataSize = 1023 + maxMetaDataSize = 199 - // fastopen strategy opens all chunks immediately, but reads sequentially. - // linear strategy opens and reads chunks sequentially, without read-ahead. - downloadStrategy = "linear" + metaDataVersion = 1 ) // Formatting of temporary chunk names. Temporary suffix *follows* chunk var ( tempChunkRegexp = regexp.MustCompile(`^(.+)\.\.tmp_([0-9]{10,19})$`) ) +// Note: metadata logic is tightly coupled with chunker code in many +// places, eg. in checks whether a file can have a meta object or is +// eligible for chunking. +// If more metadata formats (or versions of a format) are added in future, +// it may be advisable to factor them out into a "metadata strategy" interface +// similar to chunkingReader or linearReader below. // Register with Fs func init() { fs.Register(&fs.RegInfo{ @@ -98,16 +103,10 @@ Metadata is a small JSON file named after the composite file.`, Value: "simplejson", Help: `Simple JSON supports hash sums and chunk validation. It has the following fields: size, nchunks, md5, sha1.`, - }, { - Value: "wdmrcompat", - Help: `This format brings compatibility with WebDavMailRuCloud. -It does not support hash sums or validation, most fields are ignored. -It has the following fields: Name, Size, PublicKey, CreationDate. -Requires hash type "none".`, }}, }, { Name: "hash_type", - Advanced: true, + Advanced: false, Default: "md5", Help: `Choose how chunker handles hash sums.`, Examples: []fs.OptionExample{{ @@ -122,8 +121,8 @@ for a single-chunk file but returns nothing otherwise.`, Help: `SHA1 for multi-chunk files. Requires "simplejson".`, }, { Value: "md5quick", - Help: `When a file is copied on to chunker, MD5 is taken from its source -falling back to SHA1 if the source doesn't support it. Requires "simplejson".`, + Help: `Copying a file to chunker will request MD5 from the source, +falling back to SHA1 if unsupported. Requires "simplejson".`, }, { Value: "sha1quick", Help: `Similar to "md5quick" but prefers SHA1 over MD5.
Requires "simplejson".`, @@ -188,7 +187,7 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) { switch opt.MetaFormat { case "none": f.useMeta = false - case "simplejson", "wdmrcompat": + case "simplejson": f.useMeta = true default: return nil, fmt.Errorf("unsupported meta format '%s'", opt.MetaFormat) @@ -243,8 +242,6 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) { WriteMimeType: true, BucketBased: true, CanHaveEmptyDirectories: true, - SetTier: true, - GetTier: true, ServerSideAcrossConfigs: true, }).Fill(f).Mask(baseFs).WrapsFs(f, baseFs) @@ -393,6 +390,19 @@ func (f *Fs) parseChunkName(name string) (mainName string, chunkNo int, tempNo i // // This should return ErrDirNotFound if the directory isn't // found. +// +// Commands normally cleanup all temporary chunks in case of a failure. +// However, if rclone dies unexpectedly, it can leave behind a bunch of +// hidden temporary chunks. List and its underlying chunkEntries() +// silently skip all temporary chunks in the directory. It's okay if +// they belong to an unfinished command running in parallel. +// +// However, there is no way to discover dead temporary chunks a.t.m. +// As a workaround users can use `purge` to forcibly remove the whole +// directory together with dead chunks. +// In future a flag named like `--chunker-list-hidden` may be added to +// rclone that will tell List to reveal hidden chunks. +// func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) { entries, err = f.base.List(ctx, dir) if err != nil { @@ -428,7 +438,8 @@ func (f *Fs) ListR(ctx context.Context, dir string, callback fs.ListRCallback) ( }) } -// Add some directory entries. This alters entries returning it as newEntries. +// chunkEntries is called by List(R). It merges chunk entries from +// wrapped remote into composite directory entries. func (f *Fs) chunkEntries(ctx context.Context, origEntries fs.DirEntries, hardErrors bool) (chunkedEntries fs.DirEntries, err error) { // sort entries, so that meta objects (if any) appear before their chunks sortedEntries := make(fs.DirEntries, len(origEntries)) @@ -514,6 +525,11 @@ func (f *Fs) chunkEntries(ctx context.Context, origEntries fs.DirEntries, hardEr } // NewObject finds the Object at remote. +// +// Please note that every NewObject invocation will scan the whole directory. +// Using here something like fs.DirCache might improve performance (and make +// logic more complex though). +// func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) { if mainRemote, _, _ := f.parseChunkName(remote); mainRemote != "" { return nil, fmt.Errorf("%q should be meta object, not a chunk", remote) @@ -622,23 +638,14 @@ func (o *Object) readMetaData(ctx context.Context) error { case "simplejson": metaInfo, err := unmarshalSimpleJSON(ctx, metaObject, metaData) if err != nil { - // TODO: maybe it's a small single chunk? - return err + // TODO: in a rare case we might mistake a small file for metadata + return errors.Wrap(err, "invalid metadata") } if o.size != metaInfo.Size() || len(o.chunks) != metaInfo.nChunks { - return errors.New("invalid simplejson metadata") + return errors.New("metadata doesn't match file size") } o.md5 = metaInfo.md5 o.sha1 = metaInfo.sha1 - case "wdmrcompat": - metaInfo, err := unmarshalWDMRCompat(ctx, metaObject, metaData) - if err != nil { - // TODO: maybe it's a small single chunk? 
- return err - } - if o.size != metaInfo.Size() { - return errors.New("invalid wdmrcompat metadata") - } } o.isFull = true @@ -784,9 +791,6 @@ func (f *Fs) put(ctx context.Context, in io.Reader, src fs.ObjectInfo, remote st case "simplejson": c.updateHashes() metaData, err = marshalSimpleJSON(ctx, sizeTotal, len(c.chunks), c.md5, c.sha1) - case "wdmrcompat": - fileInfo := f.wrapInfo(src, baseRemote, sizeTotal) - metaData, err = marshalWDMRCompat(ctx, fileInfo) } if err == nil { metaInfo := f.wrapInfo(src, baseRemote, int64(len(metaData))) @@ -951,6 +955,9 @@ func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, opt // Update in to the object with the modTime given of the given size func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error { + if err := o.readMetaData(ctx); err != nil { + return err + } basePut := o.f.base.Put if src.Size() < 0 { basePut = o.f.base.Features().PutStream @@ -989,8 +996,17 @@ func (f *Fs) Precision() time.Duration { } // Hashes returns the supported hash sets. +// Chunker advertises a hash type if and only if it can be calculated +// for files of any size, multi-chunked or small. func (f *Fs) Hashes() hash.Set { - return hash.Set(hash.None) + // advertise a hash type only if it is non-quick and the wrapped remote supports it for small files too + if f.useMD5 && !f.quickHash && f.base.Hashes().Contains(hash.MD5) { + return hash.NewHashSet(hash.MD5) + } + if f.useSHA1 && !f.quickHash && f.base.Hashes().Contains(hash.SHA1) { + return hash.NewHashSet(hash.SHA1) + } + return hash.NewHashSet() // can't provide strong guarantees } // Mkdir makes the directory (container, bucket) @@ -1012,7 +1028,12 @@ func (f *Fs) Rmdir(ctx context.Context, dir string) error { // Implement this if you have a way of deleting all the files // quicker than just running Remove() on the result of List() // -// Return an error if it doesn't exist +// Return an error if it doesn't exist. +// +// This command will chain to `purge` of the wrapped remote. +// As a result it removes not only chunker files with their +// active chunks but also all hidden chunks in the directory. +// func (f *Fs) Purge(ctx context.Context) error { do := f.base.Features().Purge if do == nil { @@ -1021,7 +1042,25 @@ func (f *Fs) Purge(ctx context.Context) error { return do(ctx) } -// Remove an object +// Remove an object (chunks and metadata, if any) +// +// Remove deletes only active chunks of the object. +// It does not try to look for temporary chunks because they could belong +// to another command modifying this composite file in parallel. +// +// Commands normally clean up all temporary chunks in case of a failure. +// However, if rclone dies unexpectedly, it can leave hidden temporary +// chunks, which cannot be discovered using the `list` command. +// Remove does not try to search for such chunks or delete them. +// Sometimes this can lead to strange results, eg. when `list` shows that +// a directory is empty but `rmdir` refuses to remove it because at the +// level of the wrapped remote it's actually *not* empty. +// As a workaround users can use `purge` to forcibly remove it. +// +// In future, a flag `--chunker-delete-hidden` may be added which tells +// Remove to search the directory for hidden chunks and remove them too +// (at the risk of breaking parallel commands).
+// func (o *Object) Remove(ctx context.Context) (err error) { if o.main != nil { err = o.main.Remove(ctx) @@ -1095,13 +1134,6 @@ func (f *Fs) copyOrMove(ctx context.Context, o *Object, remote string, do copyMo metaInfo := f.wrapInfo(metaObject, "", int64(len(metaData))) err = newObj.main.Update(ctx, bytes.NewReader(metaData), metaInfo) } - case "wdmrcompat": - newInfo := f.wrapInfo(metaObject, "", newObj.size) - metaData, err = marshalWDMRCompat(ctx, newInfo) - if err == nil { - metaInfo := f.wrapInfo(metaObject, "", int64(len(metaData))) - err = newObj.main.Update(ctx, bytes.NewReader(metaData), metaInfo) - } case "none": if newObj.main != nil { err = newObj.main.Remove(ctx) @@ -1436,7 +1468,22 @@ func (o *Object) SetModTime(ctx context.Context, mtime time.Time) error { // Hash returns the selected checksum of the file. // If no checksum is available it returns "". -// It prefers the wrapped hashsum for a non-chunked file, then tries saved one. +// +// Hash prefers the wrapped hashsum for a non-chunked file, then tries to +// read it from metadata. In theory this handles an unusual case where +// a small file is modified on the lower level by the wrapped remote +// but chunker is not yet aware of the change. +// +// Currently metadata (if not configured as 'none') is kept only for +// multi-chunk files, but for small files chunker obtains hashsums from +// the wrapped remote. If a particular hashsum type is not supported, +// chunker won't fail with an `unsupported` error but will return an empty hash. +// +// In future the metadata logic can be extended: if a normal (non-quick) +// hash type is configured, chunker will check whether the wrapped remote +// supports it (see Fs.Hashes as an example). If not, it will add metadata +// to small files as well, thus providing hashsums for all files. +// func (o *Object) Hash(ctx context.Context, hashType hash.Type) (string, error) { if !o.isChunked() { // First, chain to the single wrapped chunk, if possible. @@ -1500,78 +1547,10 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (rc io.Read limit = o.size - offset } - switch downloadStrategy { - case "linear": - return o.newLinearReader(ctx, offset, limit, openOptions) - case "fastopen": - return o.newFastopenReader(ctx, offset, limit, openOptions) - default: - return nil, errors.New("invalid download strategy") - } + return o.newLinearReader(ctx, offset, limit, openOptions) } -// fastopenReader opens all chunks immediately, but reads sequentlially -type fastopenReader struct { - readClosers []io.ReadCloser - multiReader io.Reader -} - -func (o *Object) newFastopenReader(ctx context.Context, offset, limit int64, options []fs.OpenOption) (io.ReadCloser, error) { - var ( - readers []io.Reader - readClosers []io.ReadCloser - ) - for _, chunk := range o.chunks { - if limit <= 0 { - break - } - count := chunk.Size() - if offset >= count { - offset -= count - continue - } - count -= offset - if limit < count { - count = limit - } - - end := offset + count - 1 - chunkOptions := append(options, &fs.RangeOption{Start: offset, End: end}) - rc, err := chunk.Open(ctx, chunkOptions...)
- if err != nil { - r := fastopenReader{readClosers: readClosers} - _ = r.Close() // ignore error - return nil, err - } - readClosers = append(readClosers, rc) - readers = append(readers, rc) - - offset = 0 - limit -= count - } - - r := &fastopenReader{ - readClosers: readClosers, - multiReader: io.MultiReader(readers...), - } - return r, nil -} - -func (r *fastopenReader) Read(p []byte) (n int, err error) { - return r.multiReader.Read(p) -} - -func (r *fastopenReader) Close() (err error) { - for _, rc := range r.readClosers { - chunkErr := rc.Close() - if err == nil { - err = chunkErr - } - } - return -} - -// linearReader opens and reads chunks sequentially, without read-ahead +// linearReader opens and reads file chunks sequentially, without read-ahead type linearReader struct { ctx context.Context chunks []fs.Object @@ -1771,25 +1750,9 @@ func (o *Object) ID() string { return "" } -// SetTier performs changing storage tier of the Object if -// multiple storage classes supported -func (o *Object) SetTier(tier string) error { - if doer, ok := o.mainChunk().(fs.SetTierer); ok { - return doer.SetTier(tier) - } - return errors.New("chunker: wrapped remote does not support SetTier") -} - -// GetTier returns storage tier or class of the Object -func (o *Object) GetTier() string { - if doer, ok := o.mainChunk().(fs.GetTierer); ok { - return doer.GetTier() - } - return "" -} - // Meta format `simplejson` type metaSimpleJSON struct { + Version int `json:"ver"` Size int64 `json:"size"` NChunks int `json:"nchunks"` MD5 string `json:"md5"` @@ -1798,6 +1761,7 @@ type metaSimpleJSON struct { func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 string) (data []byte, err error) { metaData := &metaSimpleJSON{ + Version: metaDataVersion, Size: size, NChunks: nChunks, MD5: md5, @@ -1806,47 +1770,56 @@ func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 s return json.Marshal(&metaData) } +// Note: only metadata format version 1 is supported a.t.m. +// +// The current implementation creates metadata only for files larger than +// the configured chunk size. This approach has a drawback: availability of +// the configured hashsum type for small files depends on the wrapped remote. +// Future versions of chunker may change this approach as described in the +// comment on the Hash method. They can transparently migrate older metadata. +// A new format will have a higher version number and cannot be correctly +// handled by the current implementation. +// The version check below will then explicitly ask the user to upgrade rclone. +// func unmarshalSimpleJSON(ctx context.Context, metaObject fs.Object, data []byte) (info *ObjectInfo, err error) { var metaData *metaSimpleJSON err = json.Unmarshal(data, &metaData) if err != nil { - return + return nil, err } + + // Perform strict checks to avoid accepting corrupt or future metadata formats.
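+ // Otherwise a small data file that happens to parse as JSON could be + // mistaken for a meta object (see the TODO in readMetaData above).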
+ if metaData.Size < 0 { + return nil, errors.New("negative file size") + } + if metaData.NChunks <= 0 { + return nil, errors.New("wrong number of chunks") + } + if metaData.MD5 != "" { + _, err = hex.DecodeString(metaData.MD5) + if len(metaData.MD5) != 32 || err != nil { + return nil, errors.New("wrong md5 hash") + } + } + if metaData.SHA1 != "" { + _, err = hex.DecodeString(metaData.SHA1) + if len(metaData.SHA1) != 40 || err != nil { + return nil, errors.New("wrong sha1 hash") + } + } + if metaData.Version <= 0 { + return nil, errors.New("wrong version number") + } + if metaData.Version != metaDataVersion { + return nil, errors.Errorf("version %d is not supported, please upgrade rclone", metaData.Version) + } + var nilFs *Fs // nil object triggers appropriate type method info = nilFs.wrapInfo(metaObject, "", metaData.Size) info.md5 = metaData.MD5 info.sha1 = metaData.SHA1 info.nChunks = metaData.NChunks - return -} - -// Meta format `wdmrcompat` -type metaWDMRCompat struct { - Name string `json:"Name"` - Size int64 `json:"Size"` - PublicKey interface{} `json:"PublicKey"` // ignored, can be nil - CreationDate time.Time `json:"CreationDate"` // modification time, ignored -} - -func marshalWDMRCompat(ctx context.Context, srcInfo fs.ObjectInfo) (data []byte, err error) { - metaData := &metaWDMRCompat{ - Name: path.Base(srcInfo.Remote()), - Size: srcInfo.Size(), - PublicKey: nil, - CreationDate: srcInfo.ModTime(ctx).UTC(), - } - return json.Marshal(&metaData) -} - -func unmarshalWDMRCompat(ctx context.Context, metaObject fs.Object, data []byte) (info *ObjectInfo, err error) { - var metaData *metaWDMRCompat - err = json.Unmarshal(data, &metaData) - if err != nil { - return - } - var nilFs *Fs // nil object triggers appropriate type method - info = nilFs.wrapInfo(metaObject, "", metaData.Size) - return + return info, nil } // Check the interfaces are satisfied @@ -1868,6 +1841,4 @@ var ( _ fs.Object = (*Object)(nil) _ fs.ObjectUnWrapper = (*Object)(nil) _ fs.IDer = (*Object)(nil) - _ fs.SetTierer = (*Object)(nil) - _ fs.GetTierer = (*Object)(nil) ) diff --git a/backend/chunker/chunker_test.go b/backend/chunker/chunker_test.go index efbe41a4d..e5f1bb181 100644 --- a/backend/chunker/chunker_test.go +++ b/backend/chunker/chunker_test.go @@ -28,10 +28,14 @@ var ( // dynamic chunker overlay wrapping a local temporary directory. func TestIntegration(t *testing.T) { opt := fstests.Opt{ - RemoteName: *fstest.RemoteName, - NilObject: (*chunker.Object)(nil), - SkipBadWindowsCharacters: !*UseBadChars, - UnimplementableObjectMethods: []string{"MimeType"}, + RemoteName: *fstest.RemoteName, + NilObject: (*chunker.Object)(nil), + SkipBadWindowsCharacters: !*UseBadChars, + UnimplementableObjectMethods: []string{ + "MimeType", + "GetTier", + "SetTier", + }, UnimplementableFsMethods: []string{ "PublicLink", "OpenWriterAt", diff --git a/docs/content/chunker.md b/docs/content/chunker.md index daf9fbf06..f35fc8cf9 100644 --- a/docs/content/chunker.md +++ b/docs/content/chunker.md @@ -4,11 +4,11 @@ description: "Split-chunking overlay remote" date: "2019-08-30" --- -Chunker +Chunker (BETA) ---------------------------------------- The `chunker` overlay transparently splits large files into smaller chunks -during the upload to wrapped remote and transparently assembles them back +during upload to wrapped remote and transparently assembles them back when the file is downloaded. This allows to effectively overcome size limits imposed by storage providers. @@ -41,10 +41,27 @@ Storage> chunker Remote to chunk/unchunk. 
Normally should contain a ':' and a path, eg "myremote:path/to/dir", "myremote:bucket" or maybe "myremote:" (not recommended). +Enter a string value. Press Enter for the default (""). remote> remote:path -Files larger than chunk_size will be split in chunks. By default 2 Gb. +Files larger than chunk size will be split in chunks. Enter a size with suffix k,M,G,T. Press Enter for the default ("2G"). -chunk_size> 1G +chunk_size> 100M +Choose how chunker handles hash sums. +Enter a string value. Press Enter for the default ("md5"). +Choose a number from below, or type in your own value + / Chunker can pass any hash supported by wrapped remote + 1 | for a single-chunk file but returns nothing otherwise. + \ "none" + 2 / MD5 for multi-chunk files. Requires "simplejson". + \ "md5" + 3 / SHA1 for multi-chunk files. Requires "simplejson". + \ "sha1" + / Copying a file to chunker will request MD5 from the source + 4 | falling back to SHA1 if unsupported. Requires "simplejson". + \ "md5quick" + 5 / Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson". + \ "sha1quick" +hash_type> md5 Edit advanced config? (y/n) y) Yes n) No y/n> n Remote config -------------------- [overlay] type = chunker -remote = TestLocal: -chunk_size = 2G +remote = remote:bucket +chunk_size = 100M +hash_type = md5 -------------------- y) Yes this is OK e) Edit this remote d) Delete this remote y/e/d> y @@ -73,8 +91,8 @@ will put files in a directory called `name` in the current directory. ### Chunking -When rclone starts a file upload, chunker checks the file size. -If it doesn't exceed the configured chunk size, chunker will just pass it +When rclone starts a file upload, chunker checks the file size. If it +doesn't exceed the configured chunk size, chunker will just pass the file to the wrapped remote. If a file is large, chunker will transparently cut data in pieces with temporary names and stream them one by one, on the fly. Each chunk will contain the specified number of data byts, except for the @@ -84,7 +102,7 @@ a temporary copy, record its size and repeat the above process. When upload completes, temporary chunk files are finally renamed. This scheme guarantees that operations look from outside as atomic. A similar method with hidden temporary chunks is used for other operations -(copy/move/rename etc). If operation fails, hidden chunks are normally +(copy/move/rename etc). If an operation fails, hidden chunks are normally destroyed, and the destination composite file stays intact. #### Chunk names By default chunk names are `BIG_FILE_NAME.rclone-chunk.001`, format is `*.rclone-chunk.###`. You can configure another name format using the `--chunker-name-format` option. The format uses asterisk `*` as a placeholder for the base file name and one or more consecutive -hash characters `#` as a placeholder for the chunk number. There must be -one and only one asterisk. The number of consecutive hashes defines the -minimum length of a string representing a chunk number. If a chunk number -has less digits than the number of hashes, it is left-padded by zeros. -If there are more digits in the number, they are left as is. +hash characters `#` as a placeholder for the sequential chunk number. +There must be one and only one asterisk. The number of consecutive hash +characters defines the minimum length of a string representing a chunk number. +If the decimal chunk number has fewer digits than the number of hashes, it is +left-padded by zeros. If the number string is longer, it is left intact.
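To make the padding rule concrete, here is a minimal, self-contained Go sketch of it (the helper `makeChunkName` is hypothetical and only illustrates the documented rule; the real logic lives in `backend/chunker/chunker.go`):

```go
package main

import (
	"fmt"
	"strings"
)

// makeChunkName expands a chunk name format like "*.rclone-chunk.###".
// The run of '#' characters sets the minimum width of the decimal
// chunk number; shorter numbers are zero-padded, longer ones kept as is.
func makeChunkName(format, baseName string, chunkNo int) string {
	minWidth := strings.Count(format, "#")
	number := fmt.Sprintf("%0*d", minWidth, chunkNo)
	name := strings.Replace(format, "*", baseName, 1)
	return strings.Replace(name, strings.Repeat("#", minWidth), number, 1)
}

func main() {
	fmt.Println(makeChunkName("big_*-##.part", "data.txt", 0))   // big_data.txt-00.part
	fmt.Println(makeChunkName("big_*-##.part", "data.txt", 301)) // big_data.txt-301.part
}
```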
By default numbering starts from 1 but there is another option that allows user to start from 0, eg. for compatibility with legacy software. -For example, if name format is `big_*-##.part`, and original file was -named `data.txt` and numbering starts from 0, then the first chunk will be -named `big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part` -and the 302nd chunk will be `big_data.txt-301.part`. +For example, if name format is `big_*-##.part` and the original file name is +`data.txt` and numbering starts from 0, then the first chunk will be named +`big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part` +and the 302nd chunk will become `big_data.txt-301.part`. -Would-be chunk files are ignored if their name does not match given format. -The list command might encounter composite files with missinng or invalid -chunks. By default, if chunker detects a missing chunk it will silently -ignore the whole group. Use the `--chunker-fail-on-bad-chunks` flag -to make it fail with an error message. +When the `list` rclone command scans a directory on the wrapped remote, the +potential chunk files are accounted for and merged into composite directory +entries only if their names match the configured format. All other files +are ignored, including temporary chunks. +The list command might encounter composite files with missing or invalid +chunks. If chunker detects a missing chunk it will by default silently +ignore the whole group. You can use the `--chunker-fail-on-bad-chunks` +command line flag to make `list` fail with an error message. ### Metadata By default when a file is large enough, chunker will create a metadata object besides data chunks. The object is named after the original file. -Chunker allows to choose between few metadata formats. Please note that -currently metadata is not created for files smaller than configured -chunk size. This may change in future as new formats are developed. +Chunker allows users to disable metadata completely (the `none` format). +Please note that currently metadata is not created for files smaller +than configured chunk size. This may change in future as new formats +are developed. #### Simple JSON metadata format This is the default format. It supports hash sums and chunk validation for composite files. Meta objects carry the following fields: -- `size` - total size of chunks -- `nchunks` - number of chunks -- `md5` - MD5 hashsum (if present) +- `ver` - version of the format, currently `1` +- `size` - total size of the composite file +- `nchunks` - number of chunks in the file +- `md5` - MD5 hashsum of the composite file (if present) - `sha1` - SHA1 hashsum (if present) There is no field for composite file name as it's simply equal to the name of meta object on the wrapped remote. Please refer to respective sections -for detils on hashsums and modified time handling. - -#### WedDavMailRu compatible metadata format - -The `wdmrcompat` metadata format is only useful to support historical files -created by [WebDriveMailru](https://github.com/yar229/WebDavMailRuCloud). -It keeps the following fields (most are ignored, though): - -- `Name` - name of the composite file (always equal to the meta file name) -- `Size` - total size of chunks -- `PublicKey` - ignored, always "null" -- `CreationDate` - last modification (sic!) time, ignored. +for details on hashsums and handling of modified time. #### No metadata @@ -161,8 +173,8 @@ errors (especially missing last chunk) than metadata-enabled formats.
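For a concrete picture of the simplejson format described above, the sketch below marshals a sample meta object using a copy of the `metaSimpleJSON` struct introduced by this patch; the field values are invented for illustration:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Copy of the metaSimpleJSON struct introduced by this patch.
type metaSimpleJSON struct {
	Version int    `json:"ver"`
	Size    int64  `json:"size"`
	NChunks int    `json:"nchunks"`
	MD5     string `json:"md5"`
	SHA1    string `json:"sha1"`
}

func main() {
	// Invented example: a 2.5 GB file stored as 3 chunks with 1G chunk size.
	meta := metaSimpleJSON{
		Version: 1,
		Size:    2684354560,
		NChunks: 3,
		MD5:     "9e107d9d372bb6826bd81d3542a419d6",
	}
	data, _ := json.Marshal(&meta) // marshaling a plain struct cannot fail
	fmt.Println(string(data))
	// {"ver":1,"size":2684354560,"nchunks":3,"md5":"9e107d9d372bb6826bd81d3542a419d6","sha1":""}
}
```

Note that such a record is well under the 199-byte `maxMetaDataSize` cap set by this patch.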
### Hashsums Chunker supports hashsums only when a compatible metadata is present. -Thus, if you choose metadata format of `none` or `wdmrcompat`, chunker -will return `UNSUPPORTED` as hashsum. +Thus, if you choose a metadata format of `none`, chunker will return +`UNSUPPORTED` as hashsum. Please note that metadata is stored only for composite files. If a file is small (smaller than configured chunk size), chunker will transparently @@ -175,16 +187,16 @@ Currently you can choose one or another but not both. MD5 is set by default as the most supported type. Since chunker keeps hashes for composite files and falls back to the wrapped remote hash for small ones, we advise you to choose the same -hash type as wrapped remote, so your file listings look coherent. +hash type as wrapped remote so that your file listings look coherent. -Normally, when a file is copied to chunker controlled remote, chunker -will ask its source for compatible file hash and revert to on-the-fly +Normally, when a file is copied to a chunker controlled remote, chunker +will ask the file source for a compatible file hash and revert to on-the-fly calculation if none is found. This involves some CPU overhead but provides a guarantee that given hashsum is available. Also, chunker will reject a server-side copy or move operation if source and destination hashsum types are different, resulting in the extra network bandwidth, too. In some rare cases this may be undesired, so chunker provides two optional -choices: `sha1quick` and `md5quick`. If source does not have the primary +choices: `sha1quick` and `md5quick`. If the source does not support the primary hash type and the quick mode is enabled, chunker will try to fall back to the secondary type. This will save CPU and bandwidth but can result in empty hashsums at destination. Beware of consequences: the `sync` command will @@ -215,13 +227,14 @@ chunk naming scheme is to: hash type, chunk naming etc. - Now run `rclone sync oldchunks: newchunks:` and all your data will be transparently converted at transfer. - This may take some time. + This may take some time, but chunker will try server-side + copy if possible. - After checking data integrity you may remove configuration section of the old remote. If rclone gets killed during a long operation on a big composite file, hidden temporary chunks may stay in the directory. They will not be -shown by the list command but will eat up your account quota. +shown by the `list` command but will eat up your account quota. Please note that the `deletefile` rclone command deletes only active chunks of a file. As a workaround, you can use remote of the wrapped file system to see them. @@ -234,17 +247,18 @@ remove everything including garbage. ### Caveats and Limitations Chunker requires wrapped remote to support server side `move` (or `copy` + -delete) operations, otherwise it will explicitly refuse to start. +`delete`) operations, otherwise it will explicitly refuse to start. This is because it internally renames temporary chunk files to their final names when an operation completes successfully. -Note that moves done using the copy-and-delete method may incur double -charging with some cloud storage providers. +Note that a move implemented using the copy-and-delete method may incur +double charging with some cloud storage providers. -Chunker will not automatically rename existing chunks when you change the -chunk name format.
Beware that in result of this some files which have been -treated as chunks before the change can pop up in directory listings as -normal files and vice versa. The same warning holds for the chunk size. +Chunker will not automatically rename existing chunks when you run +`rclone config` on a live remote and change the chunk name format. +Beware that as a result some files which have been treated as chunks +before the change can pop up in directory listings as normal files +and vice versa. The same warning holds for the chunk size. If you desperately need to change critical chunking setings, you should run data migration as described in a dedicated section. @@ -278,6 +292,28 @@ Files larger than chunk size will be split in chunks. - Type: SizeSuffix - Default: 2G +#### --chunker-hash-type + +Choose how chunker handles hash sums. + +- Config: hash_type +- Env Var: RCLONE_CHUNKER_HASH_TYPE +- Type: string +- Default: "md5" +- Examples: + - "none" + - Chunker can pass any hash supported by wrapped remote + - for a single-chunk file but returns nothing otherwise. + - "md5" + - MD5 for multi-chunk files. Requires "simplejson". + - "sha1" + - SHA1 for multi-chunk files. Requires "simplejson". + - "md5quick" + - Copying a file to chunker will request MD5 from the source + - falling back to SHA1 if unsupported. Requires "simplejson". + - "sha1quick" + - Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson". ### Advanced Options Here are the advanced options specific to chunker (Transparently chunk/split large files). @@ -321,33 +357,6 @@ Metadata is a small JSON file named after the composite file. - "simplejson" - Simple JSON supports hash sums and chunk validation. - It has the following fields: size, nchunks, md5, sha1. - - "wdmrcompat" - - This format brings compatibility with WebDavMailRuCloud. - - It does not support hash sums or validation, most fields are ignored. - - It has the following fields: Name, Size, PublicKey, CreationDate. - - Requires hash type "none". - -#### --chunker-hash-type - -Choose how chunker handles hash sums. - -- Config: hash_type -- Env Var: RCLONE_CHUNKER_HASH_TYPE -- Type: string -- Default: "md5" -- Examples: - - "none" - - Chunker can pass any hash supported by wrapped remote - - for a single-chunk file but returns nothing otherwise. - - "md5" - - MD5 for multi-chunk files. Requires "simplejson". - - "sha1" - - SHA1 for multi-chunk files. Requires "simplejson". - - "md5quick" - - When a file is copied on to chunker, MD5 is taken from its source - - falling back to SHA1 if the source doesn't support it. Requires "simplejson". - - "sha1quick" - - Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
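To summarize how these hash_type choices interact with the new `Fs.Hashes` logic earlier in this patch: a hash type is advertised only when it is configured in non-quick mode and the wrapped remote supports it, so that small files are covered too. The following condensed, self-contained sketch illustrates the rule (the helper `advertisedHash` is illustrative only, not rclone API):

```go
package main

import "fmt"

// advertisedHash condenses the rule the patch adds to Fs.Hashes:
// a hash type is advertised only if it is configured non-quick and
// the wrapped remote also supports it (so small files are covered).
func advertisedHash(useMD5, useSHA1, quickHash, baseHasMD5, baseHasSHA1 bool) string {
	switch {
	case useMD5 && !quickHash && baseHasMD5:
		return "md5"
	case useSHA1 && !quickHash && baseHasSHA1:
		return "sha1"
	default:
		return "" // no strong guarantee possible
	}
}

func main() {
	fmt.Println(advertisedHash(true, false, false, true, false))  // "md5": hash_type=md5 over an MD5-capable remote
	fmt.Println(advertisedHash(true, false, true, true, false))   // "": md5quick gives no guarantee
	fmt.Println(advertisedHash(false, true, false, false, false)) // "": wrapped remote lacks SHA1
}
```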
#### --chunker-fail-on-bad-chunks diff --git a/fs/sync/sync_test.go b/fs/sync/sync_test.go index d3a5730ad..da69a48ba 100644 --- a/fs/sync/sync_test.go +++ b/fs/sync/sync_test.go @@ -986,7 +986,6 @@ func TestSyncWithTrackRenames(t *testing.T) { fs.Config.TrackRenames = true defer func() { fs.Config.TrackRenames = false - }() haveHash := r.Fremote.Hashes().Overlap(r.Flocal.Hashes()).GetOne() != hash.None @@ -1010,45 +1009,64 @@ func TestSyncWithTrackRenames(t *testing.T) { fstest.CheckItems(t, r.Fremote, f1, f2) - if canTrackRenames { - if r.Fremote.Features().Move == nil || r.Fremote.Name() == "TestUnion" { // union remote can Move but returns CantMove error - // If no server side Move, we are falling back to Copy + Delete - assert.Equal(t, int64(1), accounting.GlobalStats().GetTransfers()) // 1 copy - assert.Equal(t, int64(4), accounting.GlobalStats().GetChecks()) // 2 file checks + 1 move + 1 delete - } else { - assert.Equal(t, int64(0), accounting.GlobalStats().GetTransfers()) // 0 copy - assert.Equal(t, int64(3), accounting.GlobalStats().GetChecks()) // 2 file checks + 1 move - } - } else { - if toyFileChecks(r) != -1 { - assert.Equal(t, toyFileChecks(r), accounting.GlobalStats().GetChecks()) - } - assert.Equal(t, toyFileTransfers(r), accounting.GlobalStats().GetTransfers()) - } -} -func toyFileChecks(r *fstest.Run) int64 { + // As there is currently no Fs interface providing the number of chunks + // in a file, this test depends on the well-known names of test remotes. remote := r.Fremote.Name() - // Numbers below are calculated for a 14 byte file. - if !strings.HasPrefix(remote, "TestChunker") { - return 2 - } - // Chunker makes more internal checks. + + // Union remote can Move but returns CantMove error. + moveAsCopyDelete := r.Fremote.Features().Move == nil || remote == "TestUnion" + + chunker := strings.HasPrefix(remote, "TestChunker") + wrappedMoveAsCopyDelete := chunker && strings.HasSuffix(remote, "S3") + + chunk3b := chunker && strings.Contains(remote, "Chunk3b") // chunker with 3 byte chunks + chunk50b := chunker && strings.Contains(remote, "Chunk50b") // chunker with 50 byte chunks + chunkDefault := chunker && !strings.Contains(remote, "ChunkerChunk") // default big chunk size + chunkBig := chunk50b || chunkDefault // file is smaller than chunk size + + // Verify number of checks for a toy 14 byte file. + // The order of cases matters!
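+ // For scale: the toy 14 byte file with 3 byte chunks is stored as 5 data + // chunks plus a meta object on the wrapped remote, hence the extra checks.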
var checks int switch { - case strings.Contains(remote, "Chunk3b"): // chunk 3 bytes - checks = 6 - case strings.Contains(remote, "Chunk50b"): // chunk 50 bytes - checks = 3 - case strings.Contains(remote, "ChunkerChunk"): // unknown chunk size - return -1 + case canTrackRenames && chunk3b: + checks = 8 // chunker makes extra checks for each small chunk + case canTrackRenames && chunkBig: + checks = 4 // chunker makes 1 extra check for a single big chunk + case canTrackRenames && moveAsCopyDelete: + checks = 4 // 2 file checks + 1 move + 1 delete + case canTrackRenames: + checks = 3 // 2 file checks + 1 move + case !chunker: + checks = 2 // 2 file checks on a generic non-chunking remote + case chunk3b: + checks = 6 // chunker makes extra checks for each small chunk + case chunkBig && wrappedMoveAsCopyDelete: + checks = 4 // one more extra check because S3 emulates Move as Copy+Delete + case chunkBig: + checks = 3 // chunker makes 1 extra check for a single big chunk default: - checks = 3 // large chunks (eventually no chunking) + checks = -1 // skip verification for chunker with unknown chunk size } - if strings.HasSuffix(remote, "S3") { - checks++ // Extra check because S3 emulates Move as Copy+Delete. + if checks != -1 { // "-1" allows remotes to bypass this check + assert.Equal(t, int64(checks), accounting.GlobalStats().GetChecks()) + } + + // Verify number of copy operations for a toy 14 byte file. + // The order of cases matters! + var copies int64 + switch { + case canTrackRenames && moveAsCopyDelete: + copies = 1 // 1 copy + case canTrackRenames: + copies = 0 // 0 copy + case chunkBig && wrappedMoveAsCopyDelete: + copies = 2 // extra Copy because S3 emulates Move as Copy+Delete. + default: + copies = 1 + } + if copies != -1 { // "-1" allows remotes to bypass this check + assert.Equal(t, copies, accounting.GlobalStats().GetTransfers()) } - return int64(checks) } func toyFileTransfers(r *fstest.Run) int64 { diff --git a/fstest/test_all/config.yaml b/fstest/test_all/config.yaml index b0a4d9739..31201bf36 100644 --- a/fstest/test_all/config.yaml +++ b/fstest/test_all/config.yaml @@ -33,9 +33,6 @@ backends: - backend: "chunker" remote: "TestChunkerNometaLocal:" fastlist: true - - backend: "chunker" - remote: "TestChunkerCompatLocal:" - fastlist: true - backend: "chunker" remote: "TestChunkerChunk3bLocal:" fastlist: true @@ -44,10 +41,6 @@ backends: remote: "TestChunkerChunk3bNometaLocal:" fastlist: true maxfile: 6k - - backend: "chunker" - remote: "TestChunkerChunk3bCompatLocal:" - fastlist: true - maxfile: 6k - backend: "chunker" remote: "TestChunkerMailru:" fastlist: true @@ -66,30 +59,26 @@ backends: - backend: "chunker" remote: "TestChunkerS3:" fastlist: true - ignore: - - TestIntegration/FsMkdir/FsPutFiles/SetTier - backend: "chunker" remote: "TestChunkerChunk50bS3:" fastlist: true maxfile: 1k - ignore: - - TestIntegration/FsMkdir/FsPutFiles/SetTier - #- backend: "chunker" - # remote: "TestChunkerChunk50bMD5HashS3:" - # fastlist: true - # maxfile: 1k - #- backend: "chunker" - # remote: "TestChunkerChunk50bMD5QuickS3:" - # fastlist: true - # maxfile: 1k - #- backend: "chunker" - # remote: "TestChunkerChunk50bSHA1HashS3:" - # fastlist: true - # maxfile: 1k - #- backend: "chunker" - # remote: "TestChunkerChunk50bSHA1QuickS3:" - # fastlist: true - # maxfile: 1k + - backend: "chunker" + remote: "TestChunkerChunk50bMD5HashS3:" + fastlist: true + maxfile: 1k + - backend: "chunker" + remote: "TestChunkerChunk50bSHA1HashS3:" + fastlist: true + maxfile: 1k + - backend: "chunker" + remote: 
"TestChunkerChunk50bMD5QuickS3:" + fastlist: true + maxfile: 1k + - backend: "chunker" + remote: "TestChunkerChunk50bSHA1QuickS3:" + fastlist: true + maxfile: 1k ## end chunker - backend: "drive" remote: "TestDrive:"