chunker: finish meta-format before release

changes:
- chunker: remove GetTier and SetTier
- remove wdmrcompat metaformat
- remove fastopen strategy
- make hash_type option non-advanced
- advertise hash support when possible
- add metadata field "ver", run strict checks
- describe internal behavior in comments
- improve documentation

note:
wdmrcompat used to write the file name into the metadata, so the maximum
metadata size was 1K; removing it allows the size to be capped at 200 bytes now.
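For illustration, here is a minimal Go sketch that mirrors the version-1 `simplejson` layout from this commit and marshals a worst-case metadata object; even with both hashes present the result stays well below the new 200-byte cap (field names are taken from the patch below, the values are made up):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors the simplejson metadata layout introduced by this commit.
type metaSimpleJSON struct {
	Version int    `json:"ver"`
	Size    int64  `json:"size"`
	NChunks int    `json:"nchunks"`
	MD5     string `json:"md5"`
	SHA1    string `json:"sha1"`
}

func main() {
	// Worst case: maximal int64 size, large chunk count, both hashes present.
	worst := metaSimpleJSON{
		Version: 1,
		Size:    9223372036854775807, // 19 digits
		NChunks: 1000000,
		MD5:     "ffffffffffffffffffffffffffffffff",         // 32 hex chars
		SHA1:    "ffffffffffffffffffffffffffffffffffffffff", // 40 hex chars
	}
	data, _ := json.Marshal(&worst)
	// Prints roughly 145 bytes, comfortably under the 200-byte cap.
	fmt.Println(len(data), string(data))
}
```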
Ivan Andreev 2019-09-25 02:18:30 +03:00 committed by Nick Craig-Wood
parent c41812fc88
commit ccecfa9cb1
5 changed files with 303 additions and 312 deletions

View File

@ -36,13 +36,11 @@ const (
// WARNING: this optimization is not transaction safe! // WARNING: this optimization is not transaction safe!
optimizeFirstChunk = false optimizeFirstChunk = false
// Normally metadata is a small (less than 1KB) piece of JSON. // Normally metadata is a small (100-200 bytes) piece of JSON.
// Valid metadata size should not exceed this limit. // Valid metadata size should not exceed this limit.
maxMetaDataSize = 1023 maxMetaDataSize = 199
// fastopen strategy opens all chunks immediately, but reads sequentially. metaDataVersion = 1
// linear strategy opens and reads chunks sequentially, without read-ahead.
downloadStrategy = "linear"
) )
// Formatting of temporary chunk names. Temporary suffix *follows* chunk // Formatting of temporary chunk names. Temporary suffix *follows* chunk
@ -52,6 +50,13 @@ var (
tempChunkRegexp = regexp.MustCompile(`^(.+)\.\.tmp_([0-9]{10,19})$`) tempChunkRegexp = regexp.MustCompile(`^(.+)\.\.tmp_([0-9]{10,19})$`)
) )
// Note: metadata logic is tightly coupled with the chunker code in many
// places, e.g. when checking whether a file can have a meta object
// or is eligible for chunking.
// If more metadata formats (or versions of a format) are added in future,
// it may be advisable to factor it into a "metadata strategy" interface
// similar to chunkingReader or linearReader below.
// Register with Fs // Register with Fs
func init() { func init() {
fs.Register(&fs.RegInfo{ fs.Register(&fs.RegInfo{
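As a purely hypothetical illustration of the "metadata strategy" idea mentioned in the comment above, such an interface might look roughly like the sketch below; the name `metaStrategy` and its method set are invented here and are not part of the actual chunker code, and the import path reflects the current rclone module path:

```go
// Hypothetical sketch only: a pluggable metadata strategy, should more
// formats or format versions appear. Not part of the actual implementation.
package chunker

import (
	"context"

	"github.com/rclone/rclone/fs"
)

type metaStrategy interface {
	// Marshal produces the metadata blob stored beside the chunks.
	Marshal(ctx context.Context, size int64, nChunks int, md5, sha1 string) ([]byte, error)
	// Unmarshal parses a metadata blob read from the wrapped remote.
	Unmarshal(ctx context.Context, metaObject fs.Object, data []byte) (*ObjectInfo, error)
}
```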
@ -98,16 +103,10 @@ Metadata is a small JSON file named after the composite file.`,
Value: "simplejson", Value: "simplejson",
Help: `Simple JSON supports hash sums and chunk validation. Help: `Simple JSON supports hash sums and chunk validation.
It has the following fields: size, nchunks, md5, sha1.`, It has the following fields: size, nchunks, md5, sha1.`,
}, {
Value: "wdmrcompat",
Help: `This format brings compatibility with WebDavMailRuCloud.
It does not support hash sums or validation, most fields are ignored.
It has the following fields: Name, Size, PublicKey, CreationDate.
Requires hash type "none".`,
}}, }},
}, { }, {
Name: "hash_type", Name: "hash_type",
Advanced: true, Advanced: false,
Default: "md5", Default: "md5",
Help: `Choose how chunker handles hash sums.`, Help: `Choose how chunker handles hash sums.`,
Examples: []fs.OptionExample{{ Examples: []fs.OptionExample{{
@ -122,8 +121,8 @@ for a single-chunk file but returns nothing otherwise.`,
Help: `SHA1 for multi-chunk files. Requires "simplejson".`, Help: `SHA1 for multi-chunk files. Requires "simplejson".`,
}, { }, {
Value: "md5quick", Value: "md5quick",
Help: `When a file is copied on to chunker, MD5 is taken from its source Help: `Copying a file to chunker will request MD5 from the source
falling back to SHA1 if the source doesn't support it. Requires "simplejson".`, falling back to SHA1 if unsupported. Requires "simplejson".`,
}, { }, {
Value: "sha1quick", Value: "sha1quick",
Help: `Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".`, Help: `Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".`,
@ -188,7 +187,7 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) {
switch opt.MetaFormat { switch opt.MetaFormat {
case "none": case "none":
f.useMeta = false f.useMeta = false
case "simplejson", "wdmrcompat": case "simplejson":
f.useMeta = true f.useMeta = true
default: default:
return nil, fmt.Errorf("unsupported meta format '%s'", opt.MetaFormat) return nil, fmt.Errorf("unsupported meta format '%s'", opt.MetaFormat)
@ -243,8 +242,6 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) {
WriteMimeType: true, WriteMimeType: true,
BucketBased: true, BucketBased: true,
CanHaveEmptyDirectories: true, CanHaveEmptyDirectories: true,
SetTier: true,
GetTier: true,
ServerSideAcrossConfigs: true, ServerSideAcrossConfigs: true,
}).Fill(f).Mask(baseFs).WrapsFs(f, baseFs) }).Fill(f).Mask(baseFs).WrapsFs(f, baseFs)
@ -393,6 +390,19 @@ func (f *Fs) parseChunkName(name string) (mainName string, chunkNo int, tempNo i
// //
// This should return ErrDirNotFound if the directory isn't // This should return ErrDirNotFound if the directory isn't
// found. // found.
//
// Commands normally cleanup all temporary chunks in case of a failure.
// However, if rclone dies unexpectedly, it can leave behind a bunch of
// hidden temporary chunks. List and its underlying chunkEntries()
// silently skip all temporary chunks in the directory. It's okay if
// they belong to an unfinished command running in parallel.
//
// However, there is no way to discover dead temporary chunks at the moment.
// As a workaround users can use `purge` to forcibly remove the whole
// directory together with dead chunks.
// In the future a flag named like `--chunker-list-hidden` may be added to
// rclone to tell List to reveal hidden chunks.
//
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) { func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
entries, err = f.base.List(ctx, dir) entries, err = f.base.List(ctx, dir)
if err != nil { if err != nil {
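For reference, a small standalone sketch of how temporary chunks can be recognized, using the same `tempChunkRegexp` pattern shown earlier in this file; the sample names are made up and follow the default `*.rclone-chunk.###` naming:

```go
package main

import (
	"fmt"
	"regexp"
)

// Same pattern as tempChunkRegexp in the chunker backend: base name,
// literal "..tmp_" and a 10..19 digit suffix.
var tempChunkRegexp = regexp.MustCompile(`^(.+)\.\.tmp_([0-9]{10,19})$`)

func main() {
	names := []string{
		"video.avi.rclone-chunk.001",                 // active chunk, listed
		"video.avi.rclone-chunk.001..tmp_1569370000", // temporary chunk, hidden by List
	}
	for _, name := range names {
		if m := tempChunkRegexp.FindStringSubmatch(name); m != nil {
			fmt.Printf("%q -> temporary chunk of %q (skipped)\n", name, m[1])
		} else {
			fmt.Printf("%q -> regular entry\n", name)
		}
	}
}
```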
@ -428,7 +438,8 @@ func (f *Fs) ListR(ctx context.Context, dir string, callback fs.ListRCallback) (
}) })
} }
// Add some directory entries. This alters entries returning it as newEntries. // chunkEntries is called by List(R). It merges chunk entries from
// wrapped remote into composite directory entries.
func (f *Fs) chunkEntries(ctx context.Context, origEntries fs.DirEntries, hardErrors bool) (chunkedEntries fs.DirEntries, err error) { func (f *Fs) chunkEntries(ctx context.Context, origEntries fs.DirEntries, hardErrors bool) (chunkedEntries fs.DirEntries, err error) {
// sort entries, so that meta objects (if any) appear before their chunks // sort entries, so that meta objects (if any) appear before their chunks
sortedEntries := make(fs.DirEntries, len(origEntries)) sortedEntries := make(fs.DirEntries, len(origEntries))
@ -514,6 +525,11 @@ func (f *Fs) chunkEntries(ctx context.Context, origEntries fs.DirEntries, hardEr
} }
// NewObject finds the Object at remote. // NewObject finds the Object at remote.
//
// Please note that every NewObject invocation will scan the whole directory.
// Using something like fs.DirCache here might improve performance (though it
// would make the logic more complex).
//
func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) { func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
if mainRemote, _, _ := f.parseChunkName(remote); mainRemote != "" { if mainRemote, _, _ := f.parseChunkName(remote); mainRemote != "" {
return nil, fmt.Errorf("%q should be meta object, not a chunk", remote) return nil, fmt.Errorf("%q should be meta object, not a chunk", remote)
@ -622,23 +638,14 @@ func (o *Object) readMetaData(ctx context.Context) error {
case "simplejson": case "simplejson":
metaInfo, err := unmarshalSimpleJSON(ctx, metaObject, metaData) metaInfo, err := unmarshalSimpleJSON(ctx, metaObject, metaData)
if err != nil { if err != nil {
// TODO: maybe it's a small single chunk? // TODO: in a rare case we might mistake a small file for metadata
return err return errors.Wrap(err, "invalid metadata")
} }
if o.size != metaInfo.Size() || len(o.chunks) != metaInfo.nChunks { if o.size != metaInfo.Size() || len(o.chunks) != metaInfo.nChunks {
return errors.New("invalid simplejson metadata") return errors.New("metadata doesn't match file size")
} }
o.md5 = metaInfo.md5 o.md5 = metaInfo.md5
o.sha1 = metaInfo.sha1 o.sha1 = metaInfo.sha1
case "wdmrcompat":
metaInfo, err := unmarshalWDMRCompat(ctx, metaObject, metaData)
if err != nil {
// TODO: maybe it's a small single chunk?
return err
}
if o.size != metaInfo.Size() {
return errors.New("invalid wdmrcompat metadata")
}
} }
o.isFull = true o.isFull = true
@ -784,9 +791,6 @@ func (f *Fs) put(ctx context.Context, in io.Reader, src fs.ObjectInfo, remote st
case "simplejson": case "simplejson":
c.updateHashes() c.updateHashes()
metaData, err = marshalSimpleJSON(ctx, sizeTotal, len(c.chunks), c.md5, c.sha1) metaData, err = marshalSimpleJSON(ctx, sizeTotal, len(c.chunks), c.md5, c.sha1)
case "wdmrcompat":
fileInfo := f.wrapInfo(src, baseRemote, sizeTotal)
metaData, err = marshalWDMRCompat(ctx, fileInfo)
} }
if err == nil { if err == nil {
metaInfo := f.wrapInfo(src, baseRemote, int64(len(metaData))) metaInfo := f.wrapInfo(src, baseRemote, int64(len(metaData)))
@ -951,6 +955,9 @@ func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, opt
// Update in to the object with the modTime given of the given size // Update in to the object with the modTime given of the given size
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error { func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
if err := o.readMetaData(ctx); err != nil {
return err
}
basePut := o.f.base.Put basePut := o.f.base.Put
if src.Size() < 0 { if src.Size() < 0 {
basePut = o.f.base.Features().PutStream basePut = o.f.base.Features().PutStream
@ -989,8 +996,17 @@ func (f *Fs) Precision() time.Duration {
} }
// Hashes returns the supported hash sets. // Hashes returns the supported hash sets.
// Chunker advertises a hash type if and only if it can be calculated
// for files of any size, multi-chunked or small.
func (f *Fs) Hashes() hash.Set { func (f *Fs) Hashes() hash.Set {
return hash.Set(hash.None) // composites && all of them && small files supported by wrapped remote
if f.useMD5 && !f.quickHash && f.base.Hashes().Contains(hash.MD5) {
return hash.NewHashSet(hash.MD5)
}
if f.useSHA1 && !f.quickHash && f.base.Hashes().Contains(hash.SHA1) {
return hash.NewHashSet(hash.SHA1)
}
return hash.NewHashSet() // can't provide strong guarantees
} }
// Mkdir makes the directory (container, bucket) // Mkdir makes the directory (container, bucket)
@ -1012,7 +1028,12 @@ func (f *Fs) Rmdir(ctx context.Context, dir string) error {
// Implement this if you have a way of deleting all the files // Implement this if you have a way of deleting all the files
// quicker than just running Remove() on the result of List() // quicker than just running Remove() on the result of List()
// //
// Return an error if it doesn't exist // Return an error if it doesn't exist.
//
// This command will chain to `purge` from wrapped remote.
// As a result it removes not only chunker files with their
// active chunks but also all hidden chunks in the directory.
//
func (f *Fs) Purge(ctx context.Context) error { func (f *Fs) Purge(ctx context.Context) error {
do := f.base.Features().Purge do := f.base.Features().Purge
if do == nil { if do == nil {
@ -1021,7 +1042,25 @@ func (f *Fs) Purge(ctx context.Context) error {
return do(ctx) return do(ctx)
} }
// Remove an object // Remove an object (chunks and metadata, if any)
//
// Remove deletes only active chunks of the object.
// It does not try to look for temporary chunks because they could belong
// to another command modifying this composite file in parallel.
//
// Commands normally cleanup all temporary chunks in case of a failure.
// However, if rclone dies unexpectedly, it can leave hidden temporary
// chunks, which cannot be discovered using the `list` command.
// Remove does not try to search for such chunks or delete them.
// Sometimes this can lead to strange results, e.g. when `list` shows that a
// directory is empty but `rmdir` refuses to remove it because at the
// level of the wrapped remote it is actually *not* empty.
// As a workaround users can use `purge` to forcibly remove it.
//
// In future, a flag `--chunker-delete-hidden` may be added which tells
// Remove to search directory for hidden chunks and remove them too
// (at the risk of breaking parallel commands).
//
func (o *Object) Remove(ctx context.Context) (err error) { func (o *Object) Remove(ctx context.Context) (err error) {
if o.main != nil { if o.main != nil {
err = o.main.Remove(ctx) err = o.main.Remove(ctx)
@ -1095,13 +1134,6 @@ func (f *Fs) copyOrMove(ctx context.Context, o *Object, remote string, do copyMo
metaInfo := f.wrapInfo(metaObject, "", int64(len(metaData))) metaInfo := f.wrapInfo(metaObject, "", int64(len(metaData)))
err = newObj.main.Update(ctx, bytes.NewReader(metaData), metaInfo) err = newObj.main.Update(ctx, bytes.NewReader(metaData), metaInfo)
} }
case "wdmrcompat":
newInfo := f.wrapInfo(metaObject, "", newObj.size)
metaData, err = marshalWDMRCompat(ctx, newInfo)
if err == nil {
metaInfo := f.wrapInfo(metaObject, "", int64(len(metaData)))
err = newObj.main.Update(ctx, bytes.NewReader(metaData), metaInfo)
}
case "none": case "none":
if newObj.main != nil { if newObj.main != nil {
err = newObj.main.Remove(ctx) err = newObj.main.Remove(ctx)
@ -1436,7 +1468,22 @@ func (o *Object) SetModTime(ctx context.Context, mtime time.Time) error {
// Hash returns the selected checksum of the file. // Hash returns the selected checksum of the file.
// If no checksum is available it returns "". // If no checksum is available it returns "".
// It prefers the wrapped hashsum for a non-chunked file, then tries saved one. //
// Hash prefers wrapped hashsum for a non-chunked file, then tries to
// read it from metadata. This in theory handles an unusual case when
// a small file is modified on the lower level by wrapped remote
// but chunker is not yet aware of changes.
//
// Currently metadata (if not configured as 'none') is kept only for
// multi-chunk files, but for small files chunker obtains hashsums from
// wrapped remote. If a particular hashsum type is not supported,
// chunker won't fail with `unsupported` error but return empty hash.
//
// In future metadata logic can be extended: if a normal (non-quick)
// hash type is configured, chunker will check whether wrapped remote
// supports it (see Fs.Hashes as an example). If not, it will add metadata
// to small files as well, thus providing hashsums for all files.
//
func (o *Object) Hash(ctx context.Context, hashType hash.Type) (string, error) { func (o *Object) Hash(ctx context.Context, hashType hash.Type) (string, error) {
if !o.isChunked() { if !o.isChunked() {
// First, chain to the single wrapped chunk, if possible. // First, chain to the single wrapped chunk, if possible.
@ -1500,78 +1547,10 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (rc io.Read
limit = o.size - offset limit = o.size - offset
} }
switch downloadStrategy { return o.newLinearReader(ctx, offset, limit, openOptions)
case "linear":
return o.newLinearReader(ctx, offset, limit, openOptions)
case "fastopen":
return o.newFastopenReader(ctx, offset, limit, openOptions)
default:
return nil, errors.New("invalid download strategy")
}
} }
// fastopenReader opens all chunks immediately, but reads sequentlially // linearReader opens and reads file chunks sequentially, without read-ahead
type fastopenReader struct {
readClosers []io.ReadCloser
multiReader io.Reader
}
func (o *Object) newFastopenReader(ctx context.Context, offset, limit int64, options []fs.OpenOption) (io.ReadCloser, error) {
var (
readers []io.Reader
readClosers []io.ReadCloser
)
for _, chunk := range o.chunks {
if limit <= 0 {
break
}
count := chunk.Size()
if offset >= count {
offset -= count
continue
}
count -= offset
if limit < count {
count = limit
}
end := offset + count - 1
chunkOptions := append(options, &fs.RangeOption{Start: offset, End: end})
rc, err := chunk.Open(ctx, chunkOptions...)
if err != nil {
r := fastopenReader{readClosers: readClosers}
_ = r.Close() // ignore error
return nil, err
}
readClosers = append(readClosers, rc)
readers = append(readers, rc)
offset = 0
limit -= count
}
r := &fastopenReader{
readClosers: readClosers,
multiReader: io.MultiReader(readers...),
}
return r, nil
}
func (r *fastopenReader) Read(p []byte) (n int, err error) {
return r.multiReader.Read(p)
}
func (r *fastopenReader) Close() (err error) {
for _, rc := range r.readClosers {
chunkErr := rc.Close()
if err == nil {
err = chunkErr
}
}
return
}
// linearReader opens and reads chunks sequentially, without read-ahead
type linearReader struct { type linearReader struct {
ctx context.Context ctx context.Context
chunks []fs.Object chunks []fs.Object
@ -1771,25 +1750,9 @@ func (o *Object) ID() string {
return "" return ""
} }
// SetTier performs changing storage tier of the Object if
// multiple storage classes supported
func (o *Object) SetTier(tier string) error {
if doer, ok := o.mainChunk().(fs.SetTierer); ok {
return doer.SetTier(tier)
}
return errors.New("chunker: wrapped remote does not support SetTier")
}
// GetTier returns storage tier or class of the Object
func (o *Object) GetTier() string {
if doer, ok := o.mainChunk().(fs.GetTierer); ok {
return doer.GetTier()
}
return ""
}
// Meta format `simplejson` // Meta format `simplejson`
type metaSimpleJSON struct { type metaSimpleJSON struct {
Version int `json:"ver"`
Size int64 `json:"size"` Size int64 `json:"size"`
NChunks int `json:"nchunks"` NChunks int `json:"nchunks"`
MD5 string `json:"md5"` MD5 string `json:"md5"`
@ -1798,6 +1761,7 @@ type metaSimpleJSON struct {
func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 string) (data []byte, err error) { func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 string) (data []byte, err error) {
metaData := &metaSimpleJSON{ metaData := &metaSimpleJSON{
Version: metaDataVersion,
Size: size, Size: size,
NChunks: nChunks, NChunks: nChunks,
MD5: md5, MD5: md5,
@ -1806,47 +1770,56 @@ func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 s
return json.Marshal(&metaData) return json.Marshal(&metaData)
} }
// Note: only metadata format version 1 is supported at the moment.
//
// The current implementation creates metadata only for files larger than
// the configured chunk size. This approach has a drawback: availability of
// the configured hashsum type for small files depends on the wrapped remote.
// Future versions of chunker may change the approach as described in the
// comment to the Hash method. They can transparently migrate older metadata.
// A new format will have a higher version number and cannot be correctly
// handled by the current implementation.
// The version check below will then explicitly ask user to upgrade rclone.
//
func unmarshalSimpleJSON(ctx context.Context, metaObject fs.Object, data []byte) (info *ObjectInfo, err error) { func unmarshalSimpleJSON(ctx context.Context, metaObject fs.Object, data []byte) (info *ObjectInfo, err error) {
var metaData *metaSimpleJSON var metaData *metaSimpleJSON
err = json.Unmarshal(data, &metaData) err = json.Unmarshal(data, &metaData)
if err != nil { if err != nil {
return return nil, err
} }
// Perform strict checks, avoid corruption of future metadata formats.
if metaData.Size < 0 {
return nil, errors.New("negative file size")
}
if metaData.NChunks <= 0 {
return nil, errors.New("wrong number of chunks")
}
if metaData.MD5 != "" {
_, err = hex.DecodeString(metaData.MD5)
if len(metaData.MD5) != 32 || err != nil {
return nil, errors.New("wrong md5 hash")
}
}
if metaData.SHA1 != "" {
_, err = hex.DecodeString(metaData.SHA1)
if len(metaData.SHA1) != 40 || err != nil {
return nil, errors.New("wrong sha1 hash")
}
}
if metaData.Version <= 0 {
return nil, errors.New("wrong version number")
}
if metaData.Version != metaDataVersion {
return nil, errors.Errorf("version %d is not supported, please upgrade rclone", metaData.Version)
}
var nilFs *Fs // nil object triggers appropriate type method var nilFs *Fs // nil object triggers appropriate type method
info = nilFs.wrapInfo(metaObject, "", metaData.Size) info = nilFs.wrapInfo(metaObject, "", metaData.Size)
info.md5 = metaData.MD5 info.md5 = metaData.MD5
info.sha1 = metaData.SHA1 info.sha1 = metaData.SHA1
info.nChunks = metaData.NChunks info.nChunks = metaData.NChunks
return return info, nil
}
// Meta format `wdmrcompat`
type metaWDMRCompat struct {
Name string `json:"Name"`
Size int64 `json:"Size"`
PublicKey interface{} `json:"PublicKey"` // ignored, can be nil
CreationDate time.Time `json:"CreationDate"` // modification time, ignored
}
func marshalWDMRCompat(ctx context.Context, srcInfo fs.ObjectInfo) (data []byte, err error) {
metaData := &metaWDMRCompat{
Name: path.Base(srcInfo.Remote()),
Size: srcInfo.Size(),
PublicKey: nil,
CreationDate: srcInfo.ModTime(ctx).UTC(),
}
return json.Marshal(&metaData)
}
func unmarshalWDMRCompat(ctx context.Context, metaObject fs.Object, data []byte) (info *ObjectInfo, err error) {
var metaData *metaWDMRCompat
err = json.Unmarshal(data, &metaData)
if err != nil {
return
}
var nilFs *Fs // nil object triggers appropriate type method
info = nilFs.wrapInfo(metaObject, "", metaData.Size)
return
} }
// Check the interfaces are satisfied // Check the interfaces are satisfied
@ -1868,6 +1841,4 @@ var (
_ fs.Object = (*Object)(nil) _ fs.Object = (*Object)(nil)
_ fs.ObjectUnWrapper = (*Object)(nil) _ fs.ObjectUnWrapper = (*Object)(nil)
_ fs.IDer = (*Object)(nil) _ fs.IDer = (*Object)(nil)
_ fs.SetTierer = (*Object)(nil)
_ fs.GetTierer = (*Object)(nil)
) )

View File

@ -28,10 +28,14 @@ var (
// dynamic chunker overlay wrapping a local temporary directory. // dynamic chunker overlay wrapping a local temporary directory.
func TestIntegration(t *testing.T) { func TestIntegration(t *testing.T) {
opt := fstests.Opt{ opt := fstests.Opt{
RemoteName: *fstest.RemoteName, RemoteName: *fstest.RemoteName,
NilObject: (*chunker.Object)(nil), NilObject: (*chunker.Object)(nil),
SkipBadWindowsCharacters: !*UseBadChars, SkipBadWindowsCharacters: !*UseBadChars,
UnimplementableObjectMethods: []string{"MimeType"}, UnimplementableObjectMethods: []string{
"MimeType",
"GetTier",
"SetTier",
},
UnimplementableFsMethods: []string{ UnimplementableFsMethods: []string{
"PublicLink", "PublicLink",
"OpenWriterAt", "OpenWriterAt",

View File

@ -4,11 +4,11 @@ description: "Split-chunking overlay remote"
date: "2019-08-30" date: "2019-08-30"
--- ---
<i class="fa fa-cut"></i>Chunker <i class="fa fa-cut"></i>Chunker (BETA)
---------------------------------------- ----------------------------------------
The `chunker` overlay transparently splits large files into smaller chunks The `chunker` overlay transparently splits large files into smaller chunks
during the upload to wrapped remote and transparently assembles them back during upload to wrapped remote and transparently assembles them back
when the file is downloaded. This allows to effectively overcome size limits when the file is downloaded. This allows to effectively overcome size limits
imposed by storage providers. imposed by storage providers.
@ -41,10 +41,27 @@ Storage> chunker
Remote to chunk/unchunk. Remote to chunk/unchunk.
Normally should contain a ':' and a path, eg "myremote:path/to/dir", Normally should contain a ':' and a path, eg "myremote:path/to/dir",
"myremote:bucket" or maybe "myremote:" (not recommended). "myremote:bucket" or maybe "myremote:" (not recommended).
Enter a string value. Press Enter for the default ("").
remote> remote:path remote> remote:path
Files larger than chunk_size will be split in chunks. By default 2 Gb. Files larger than chunk size will be split in chunks.
Enter a size with suffix k,M,G,T. Press Enter for the default ("2G"). Enter a size with suffix k,M,G,T. Press Enter for the default ("2G").
chunk_size> 1G chunk_size> 100M
Choose how chunker handles hash sums.
Enter a string value. Press Enter for the default ("md5").
Choose a number from below, or type in your own value
/ Chunker can pass any hash supported by wrapped remote
1 | for a single-chunk file but returns nothing otherwise.
\ "none"
2 / MD5 for multi-chunk files. Requires "simplejson".
\ "md5"
3 / SHA1 for multi-chunk files. Requires "simplejson".
\ "sha1"
/ Copying a file to chunker will request MD5 from the source
4 | falling back to SHA1 if unsupported. Requires "simplejson".
\ "md5quick"
5 / Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
\ "sha1quick"
hash_type> md5
Edit advanced config? (y/n) Edit advanced config? (y/n)
y) Yes y) Yes
n) No n) No
@ -53,8 +70,9 @@ Remote config
-------------------- --------------------
[overlay] [overlay]
type = chunker type = chunker
remote = TestLocal: remote = remote:bucket
chunk_size = 2G chunk_size = 100M
hash_type = md5
-------------------- --------------------
y) Yes this is OK y) Yes this is OK
e) Edit this remote e) Edit this remote
@ -73,8 +91,8 @@ will put files in a directory called `name` in the current directory.
### Chunking ### Chunking
When rclone starts a file upload, chunker checks the file size. When rclone starts a file upload, chunker checks the file size. If it
If it doesn't exceed the configured chunk size, chunker will just pass it doesn't exceed the configured chunk size, chunker will just pass the file
to the wrapped remote. If a file is large, chunker will transparently cut to the wrapped remote. If a file is large, chunker will transparently cut
data in pieces with temporary names and stream them one by one, on the fly. data in pieces with temporary names and stream them one by one, on the fly.
Each chunk will contain the specified number of data bytes, except for the Each chunk will contain the specified number of data bytes, except for the
@ -84,7 +102,7 @@ a temporary copy, record its size and repeat the above process.
When upload completes, temporary chunk files are finally renamed. When upload completes, temporary chunk files are finally renamed.
This scheme guarantees that operations look from outside as atomic. This scheme guarantees that operations look from outside as atomic.
A similar method with hidden temporary chunks is used for other operations A similar method with hidden temporary chunks is used for other operations
(copy/move/rename etc). If operation fails, hidden chunks are normally (copy/move/rename etc). If an operation fails, hidden chunks are normally
destroyed, and the destination composite file stays intact. destroyed, and the destination composite file stays intact.
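To make the temporary-name scheme concrete, here is a generic sketch of the "write under a hidden temporary name, rename on success" pattern; it uses the local filesystem as a stand-in for the wrapped remote and a fixed temporary suffix, so it illustrates the idea rather than chunker's actual code:

```go
package main

import (
	"fmt"
	"os"
)

// writeAtomically writes data under a hidden temporary name first, then
// renames it to the final name only after the whole write has succeeded,
// so a partially written chunk is never visible under the real name.
func writeAtomically(finalName string, data []byte) error {
	tmpName := finalName + "..tmp_0000000001" // hidden temporary name
	if err := os.WriteFile(tmpName, data, 0o644); err != nil {
		return err
	}
	if err := os.Rename(tmpName, finalName); err != nil {
		_ = os.Remove(tmpName) // cleanup on failure
		return err
	}
	return nil
}

func main() {
	if err := writeAtomically("big_file.rclone-chunk.001", []byte("chunk payload")); err != nil {
		fmt.Println("upload failed:", err)
	}
}
```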
#### Chunk names #### Chunk names
@ -94,58 +112,52 @@ By default chunk names are `BIG_FILE_NAME.rclone-chunk.001`,
format is `*.rclone-chunk.###`. You can configure another name format format is `*.rclone-chunk.###`. You can configure another name format
using the `--chunker-name-format` option. The format uses asterisk using the `--chunker-name-format` option. The format uses asterisk
`*` as a placeholder for the base file name and one or more consecutive `*` as a placeholder for the base file name and one or more consecutive
hash characters `#` as a placeholder for the chunk number. There must be hash characters `#` as a placeholder for sequential chunk number.
one and only one asterisk. The number of consecutive hashes defines the There must be one and only one asterisk. The number of consecutive hash
minimum length of a string representing a chunk number. If a chunk number characters defines the minimum length of a string representing a chunk number.
has less digits than the number of hashes, it is left-padded by zeros. If the decimal chunk number has fewer digits than the number of hashes, it is
If there are more digits in the number, they are left as is. left-padded by zeros. If the number string is longer, it is left intact.
By default numbering starts from 1 but there is another option that allows By default numbering starts from 1 but there is another option that allows
user to start from 0, eg. for compatibility with legacy software. user to start from 0, eg. for compatibility with legacy software.
For example, if name format is `big_*-##.part`, and original file was For example, if name format is `big_*-##.part` and original file name is
named `data.txt` and numbering starts from 0, then the first chunk will be `data.txt` and numbering starts from 0, then the first chunk will be named
named `big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part` `big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part`
and the 302nd chunk will be `big_data.txt-301.part`. and the 302nd chunk will become `big_data.txt-301.part`.
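A small sketch of how such a name format could be expanded; it mirrors the rules described above but is written purely for illustration and is not taken from the backend:

```go
package main

import (
	"fmt"
	"strings"
)

// makeChunkName expands a format like "big_*-##.part": "*" is replaced by the
// base file name and the run of "#" defines the minimum zero-padded width of
// the chunk number. Illustrative sketch, not the backend's exact code.
func makeChunkName(format, baseName string, chunkNo int) string {
	hashes := strings.Count(format, "#")
	numbered := strings.Replace(format, strings.Repeat("#", hashes),
		fmt.Sprintf("%0*d", hashes, chunkNo), 1)
	return strings.Replace(numbered, "*", baseName, 1)
}

func main() {
	// Numbering starts from 0 here, matching the example in the text.
	fmt.Println(makeChunkName("big_*-##.part", "data.txt", 0))   // big_data.txt-00.part
	fmt.Println(makeChunkName("big_*-##.part", "data.txt", 98))  // big_data.txt-98.part
	fmt.Println(makeChunkName("big_*-##.part", "data.txt", 301)) // big_data.txt-301.part
}
```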
Would-be chunk files are ignored if their name does not match given format. When the `list` rclone command scans a directory on wrapped remote, the
The list command might encounter composite files with missinng or invalid potential chunk files are accounted for and merged into composite directory
chunks. By default, if chunker detects a missing chunk it will silently entries only if their names match the configured format. All other files
ignore the whole group. Use the `--chunker-fail-on-bad-chunks` flag are ignored, including temporary chunks.
to make it fail with an error message. The list command might encounter composite files with missing or invalid
chunks. If chunker detects a missing chunk it will by default silently
ignore the whole group. You can use the `--chunker-fail-on-bad-chunks`
command line flag to make `list` fail with an error message.
### Metadata ### Metadata
By default when a file is large enough, chunker will create a metadata By default when a file is large enough, chunker will create a metadata
object besides data chunks. The object is named after the original file. object besides data chunks. The object is named after the original file.
Chunker allows to choose between few metadata formats. Please note that Chunker allows the user to disable metadata completely (the `none` format).
currently metadata is not created for files smaller than configured Please note that currently metadata is not created for files smaller
chunk size. This may change in future as new formats are developed. than configured chunk size. This may change in future as new formats
are developed.
#### Simple JSON metadata format #### Simple JSON metadata format
This is the default format. It supports hash sums and chunk validation This is the default format. It supports hash sums and chunk validation
for composite files. Meta objects carry the following fields: for composite files. Meta objects carry the following fields:
- `size` - total size of chunks - `ver` - version of format, currently `1`
- `nchunks` - number of chunks - `size` - total size of composite file
- `md5` - MD5 hashsum (if present) - `nchunks` - number of chunks in the file
- `md5` - MD5 hashsum of composite file (if present)
- `sha1` - SHA1 hashsum (if present) - `sha1` - SHA1 hashsum (if present)
There is no field for composite file name as it's simply equal to the name There is no field for composite file name as it's simply equal to the name
of meta object on the wrapped remote. Please refer to respective sections of meta object on the wrapped remote. Please refer to respective sections
for details on hashsums and modified time handling. for details on hashsums and handling of modified time.
#### WedDavMailRu compatible metadata format
The `wdmrcompat` metadata format is only useful to support historical files
created by [WebDriveMailru](https://github.com/yar229/WebDavMailRuCloud).
It keeps the following fields (most are ignored, though):
- `Name` - name of the composite file (always equal to the meta file name)
- `Size` - total size of chunks
- `PublicKey` - ignored, always "null"
- `CreationDate` - last modification (sic!) time, ignored.
#### No metadata #### No metadata
@ -161,8 +173,8 @@ errors (especially missing last chunk) than metadata-enabled formats.
### Hashsums ### Hashsums
Chunker supports hashsums only when a compatible metadata is present. Chunker supports hashsums only when a compatible metadata is present.
Thus, if you choose metadata format of `none` or `wdmrcompat`, chunker Thus, if you choose metadata format of `none`, chunker will return
will return `UNSUPPORTED` as hashsum. `UNSUPPORTED` as hashsum.
Please note that metadata is stored only for composite files. If a file Please note that metadata is stored only for composite files. If a file
is small (smaller than configured chunk size), chunker will transparently is small (smaller than configured chunk size), chunker will transparently
@ -175,16 +187,16 @@ Currently you can choose one or another but not both.
MD5 is set by default as the most supported type. MD5 is set by default as the most supported type.
Since chunker keeps hashes for composite files and falls back to the Since chunker keeps hashes for composite files and falls back to the
wrapped remote hash for small ones, we advise you to choose the same wrapped remote hash for small ones, we advise you to choose the same
hash type as wrapped remote, so your file listings look coherent. hash type as wrapped remote so that your file listings look coherent.
Normally, when a file is copied to chunker controlled remote, chunker Normally, when a file is copied to a chunker controlled remote, chunker
will ask its source for compatible file hash and revert to on-the-fly will ask the file source for compatible file hash and revert to on-the-fly
calculation if none is found. This involves some CPU overhead but provides calculation if none is found. This involves some CPU overhead but provides
a guarantee that given hashsum is available. Also, chunker will reject a guarantee that given hashsum is available. Also, chunker will reject
a server-side copy or move operation if source and destination hashsum a server-side copy or move operation if source and destination hashsum
types are different, resulting in the extra network bandwidth, too. types are different, resulting in the extra network bandwidth, too.
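The on-the-fly calculation mentioned above can be illustrated with plain standard-library Go; the snippet below tees the source stream through an MD5 hasher while "uploading" it, which conveys the general idea rather than chunker's exact implementation:

```go
package main

import (
	"crypto/md5"
	"encoding/hex"
	"fmt"
	"io"
	"strings"
)

func main() {
	// The source stream is teed through an MD5 hasher while being uploaded,
	// so a hashsum is available even when the source cannot provide one.
	src := strings.NewReader("file contents going to the wrapped remote")
	hasher := md5.New()
	upload := io.TeeReader(src, hasher)

	n, err := io.Copy(io.Discard, upload) // stands in for the actual upload
	if err != nil {
		panic(err)
	}
	fmt.Printf("uploaded %d bytes, md5 = %s\n", n, hex.EncodeToString(hasher.Sum(nil)))
}
```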
In some rare cases this may be undesired, so chunker provides two optional In some rare cases this may be undesired, so chunker provides two optional
choices: `sha1quick` and `md5quick`. If source does not have the primary choices: `sha1quick` and `md5quick`. If the source does not support primary
hash type and the quick mode is enabled, chunker will try to fall back to hash type and the quick mode is enabled, chunker will try to fall back to
the secondary type. This will save CPU and bandwidth but can result in empty the secondary type. This will save CPU and bandwidth but can result in empty
hashsums at destination. Beware of consequences: the `sync` command will hashsums at destination. Beware of consequences: the `sync` command will
@ -215,13 +227,14 @@ chunk naming scheme is to:
hash type, chunk naming etc. hash type, chunk naming etc.
- Now run `rclone sync oldchunks: newchunks:` and all your data - Now run `rclone sync oldchunks: newchunks:` and all your data
will be transparently converted at transfer. will be transparently converted at transfer.
This may take some time. This may take some time, yet chunker will try server-side
copy if possible.
- After checking data integrity you may remove configuration section - After checking data integrity you may remove configuration section
of the old remote. of the old remote.
If rclone gets killed during a long operation on a big composite file, If rclone gets killed during a long operation on a big composite file,
hidden temporary chunks may stay in the directory. They will not be hidden temporary chunks may stay in the directory. They will not be
shown by the list command but will eat up your account quota. shown by the `list` command but will eat up your account quota.
Please note that the `deletefile` rclone command deletes only active Please note that the `deletefile` rclone command deletes only active
chunks of a file. As a workaround, you can use remote of the wrapped chunks of a file. As a workaround, you can use remote of the wrapped
file system to see them. file system to see them.
@ -234,17 +247,18 @@ remove everything including garbage.
### Caveats and Limitations ### Caveats and Limitations
Chunker requires wrapped remote to support server side `move` (or `copy` + Chunker requires wrapped remote to support server side `move` (or `copy` +
delete) operations, otherwise it will explicitly refuse to start. `delete`) operations, otherwise it will explicitly refuse to start.
This is because it internally renames temporary chunk files to their final This is because it internally renames temporary chunk files to their final
names when an operation completes successfully. names when an operation completes successfully.
Note that moves done using the copy-and-delete method may incur double Note that a move implemented using the copy-and-delete method may incur
charging with some cloud storage providers. double charging with some cloud storage providers.
Chunker will not automatically rename existing chunks when you change the Chunker will not automatically rename existing chunks when you run
chunk name format. Beware that in result of this some files which have been `rclone config` on a live remote and change the chunk name format.
treated as chunks before the change can pop up in directory listings as Beware that in result of this some files which have been treated as chunks
normal files and vice versa. The same warning holds for the chunk size. before the change can pop up in directory listings as normal files
and vice versa. The same warning holds for the chunk size.
If you desperately need to change critical chunking settings, you should If you desperately need to change critical chunking settings, you should
run data migration as described in a dedicated section. run data migration as described in a dedicated section.
@ -278,6 +292,28 @@ Files larger than chunk size will be split in chunks.
- Type: SizeSuffix - Type: SizeSuffix
- Default: 2G - Default: 2G
#### --chunker-hash-type
Choose how chunker handles hash sums.
- Config: hash_type
- Env Var: RCLONE_CHUNKER_HASH_TYPE
- Type: string
- Default: "md5"
- Examples:
- "none"
- Chunker can pass any hash supported by wrapped remote
- for a single-chunk file but returns nothing otherwise.
- "md5"
- MD5 for multi-chunk files. Requires "simplejson".
- "sha1"
- SHA1 for multi-chunk files. Requires "simplejson".
- "md5quick"
- Copying a file to chunker will request MD5 from the source
- falling back to SHA1 if unsupported. Requires "simplejson".
- "sha1quick"
- Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
### Advanced Options ### Advanced Options
Here are the advanced options specific to chunker (Transparently chunk/split large files). Here are the advanced options specific to chunker (Transparently chunk/split large files).
@ -321,33 +357,6 @@ Metadata is a small JSON file named after the composite file.
- "simplejson" - "simplejson"
- Simple JSON supports hash sums and chunk validation. - Simple JSON supports hash sums and chunk validation.
- It has the following fields: size, nchunks, md5, sha1. - It has the following fields: size, nchunks, md5, sha1.
- "wdmrcompat"
- This format brings compatibility with WebDavMailRuCloud.
- It does not support hash sums or validation, most fields are ignored.
- It has the following fields: Name, Size, PublicKey, CreationDate.
- Requires hash type "none".
#### --chunker-hash-type
Choose how chunker handles hash sums.
- Config: hash_type
- Env Var: RCLONE_CHUNKER_HASH_TYPE
- Type: string
- Default: "md5"
- Examples:
- "none"
- Chunker can pass any hash supported by wrapped remote
- for a single-chunk file but returns nothing otherwise.
- "md5"
- MD5 for multi-chunk files. Requires "simplejson".
- "sha1"
- SHA1 for multi-chunk files. Requires "simplejson".
- "md5quick"
- When a file is copied on to chunker, MD5 is taken from its source
- falling back to SHA1 if the source doesn't support it. Requires "simplejson".
- "sha1quick"
- Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
#### --chunker-fail-on-bad-chunks #### --chunker-fail-on-bad-chunks

View File

@ -986,7 +986,6 @@ func TestSyncWithTrackRenames(t *testing.T) {
fs.Config.TrackRenames = true fs.Config.TrackRenames = true
defer func() { defer func() {
fs.Config.TrackRenames = false fs.Config.TrackRenames = false
}() }()
haveHash := r.Fremote.Hashes().Overlap(r.Flocal.Hashes()).GetOne() != hash.None haveHash := r.Fremote.Hashes().Overlap(r.Flocal.Hashes()).GetOne() != hash.None
@ -1010,45 +1009,64 @@ func TestSyncWithTrackRenames(t *testing.T) {
fstest.CheckItems(t, r.Fremote, f1, f2) fstest.CheckItems(t, r.Fremote, f1, f2)
if canTrackRenames { // As currently there is no Fs interface providing number of chunks
if r.Fremote.Features().Move == nil || r.Fremote.Name() == "TestUnion" { // union remote can Move but returns CantMove error // in a file, this test depends on the well-known names of test remotes.
// If no server side Move, we are falling back to Copy + Delete
assert.Equal(t, int64(1), accounting.GlobalStats().GetTransfers()) // 1 copy
assert.Equal(t, int64(4), accounting.GlobalStats().GetChecks()) // 2 file checks + 1 move + 1 delete
} else {
assert.Equal(t, int64(0), accounting.GlobalStats().GetTransfers()) // 0 copy
assert.Equal(t, int64(3), accounting.GlobalStats().GetChecks()) // 2 file checks + 1 move
}
} else {
if toyFileChecks(r) != -1 {
assert.Equal(t, toyFileChecks(r), accounting.GlobalStats().GetChecks())
}
assert.Equal(t, toyFileTransfers(r), accounting.GlobalStats().GetTransfers())
}
}
func toyFileChecks(r *fstest.Run) int64 {
remote := r.Fremote.Name() remote := r.Fremote.Name()
// Numbers below are calculated for a 14 byte file.
if !strings.HasPrefix(remote, "TestChunker") { // Union remote can Move but returns CantMove error.
return 2 moveAsCopyDelete := r.Fremote.Features().Move == nil || remote == "TestUnion"
}
// Chunker makes more internal checks. chunker := strings.HasPrefix(remote, "TestChunker")
wrappedMoveAsCopyDelete := chunker && strings.HasSuffix(remote, "S3")
chunk3b := chunker && strings.Contains(remote, "Chunk3b") // chunker with 3 byte chunks
chunk50b := chunker && strings.Contains(remote, "Chunk50b") // chunker with 50 byte chunks
chunkDefault := chunker && !strings.Contains(remote, "ChunkerChunk") // default big chunk size
chunkBig := chunk50b || chunkDefault // file is smaller than chunk size
// Verify number of checks for a toy 14 byte file.
// The order of cases matters!
var checks int var checks int
switch { switch {
case strings.Contains(remote, "Chunk3b"): // chunk 3 bytes case canTrackRenames && chunk3b:
checks = 6 checks = 8 // chunker makes extra checks for each small chunk
case strings.Contains(remote, "Chunk50b"): // chunk 50 bytes case canTrackRenames && chunkBig:
checks = 3 checks = 4 // chunker makes 1 extra check for a single big chunk
case strings.Contains(remote, "ChunkerChunk"): // unknown chunk size case canTrackRenames && moveAsCopyDelete:
return -1 checks = 4 // 2 file checks + 1 move + 1 delete
case canTrackRenames:
checks = 3 // 2 file checks + 1 move
case !chunker:
checks = 2 // 2 file checks on a generic non-chunking remote
case chunk3b:
checks = 6 // chunker makes extra checks for each small chunk
case chunkBig && wrappedMoveAsCopyDelete:
checks = 4 // one more extra check because S3 emulates Move as Copy+Delete
case chunkBig:
checks = 3 // chunker makes 1 extra check for a single big chunk
default: default:
checks = 3 // large chunks (eventually no chunking) checks = -1 // skip verification for chunker with unknown chunk size
} }
if strings.HasSuffix(remote, "S3") { if checks != -1 { // "-1" allows remotes to bypass this check
checks++ // Extra check because S3 emulates Move as Copy+Delete. assert.Equal(t, int64(checks), accounting.GlobalStats().GetChecks())
}
// Verify number of copy operations for a toy 14 byte file.
// The order of cases matters!
var copies int64
switch {
case canTrackRenames && moveAsCopyDelete:
copies = 1 // 1 copy
case canTrackRenames:
copies = 0 // 0 copy
case chunkBig && wrappedMoveAsCopyDelete:
copies = 2 // extra Copy because S3 emulates Move as Copy+Delete.
default:
copies = 1
}
if copies != -1 { // "-1" allows remotes to bypass this check
assert.Equal(t, copies, accounting.GlobalStats().GetTransfers())
} }
return int64(checks)
} }
func toyFileTransfers(r *fstest.Run) int64 { func toyFileTransfers(r *fstest.Run) int64 {

View File

@ -33,9 +33,6 @@ backends:
- backend: "chunker" - backend: "chunker"
remote: "TestChunkerNometaLocal:" remote: "TestChunkerNometaLocal:"
fastlist: true fastlist: true
- backend: "chunker"
remote: "TestChunkerCompatLocal:"
fastlist: true
- backend: "chunker" - backend: "chunker"
remote: "TestChunkerChunk3bLocal:" remote: "TestChunkerChunk3bLocal:"
fastlist: true fastlist: true
@ -44,10 +41,6 @@ backends:
remote: "TestChunkerChunk3bNometaLocal:" remote: "TestChunkerChunk3bNometaLocal:"
fastlist: true fastlist: true
maxfile: 6k maxfile: 6k
- backend: "chunker"
remote: "TestChunkerChunk3bCompatLocal:"
fastlist: true
maxfile: 6k
- backend: "chunker" - backend: "chunker"
remote: "TestChunkerMailru:" remote: "TestChunkerMailru:"
fastlist: true fastlist: true
@ -66,30 +59,26 @@ backends:
- backend: "chunker" - backend: "chunker"
remote: "TestChunkerS3:" remote: "TestChunkerS3:"
fastlist: true fastlist: true
ignore:
- TestIntegration/FsMkdir/FsPutFiles/SetTier
- backend: "chunker" - backend: "chunker"
remote: "TestChunkerChunk50bS3:" remote: "TestChunkerChunk50bS3:"
fastlist: true fastlist: true
maxfile: 1k maxfile: 1k
ignore: - backend: "chunker"
- TestIntegration/FsMkdir/FsPutFiles/SetTier remote: "TestChunkerChunk50bMD5HashS3:"
#- backend: "chunker" fastlist: true
# remote: "TestChunkerChunk50bMD5HashS3:" maxfile: 1k
# fastlist: true - backend: "chunker"
# maxfile: 1k remote: "TestChunkerChunk50bSHA1HashS3:"
#- backend: "chunker" fastlist: true
# remote: "TestChunkerChunk50bMD5QuickS3:" maxfile: 1k
# fastlist: true - backend: "chunker"
# maxfile: 1k remote: "TestChunkerChunk50bMD5QuickS3:"
#- backend: "chunker" fastlist: true
# remote: "TestChunkerChunk50bSHA1HashS3:" maxfile: 1k
# fastlist: true - backend: "chunker"
# maxfile: 1k remote: "TestChunkerChunk50bSHA1QuickS3:"
#- backend: "chunker" fastlist: true
# remote: "TestChunkerChunk50bSHA1QuickS3:" maxfile: 1k
# fastlist: true
# maxfile: 1k
## end chunker ## end chunker
- backend: "drive" - backend: "drive"
remote: "TestDrive:" remote: "TestDrive:"