rclone/b2/upload.go

// Upload large files for b2
//
// Docs - https://www.backblaze.com/b2/docs/large_files.html

package b2

import (
	"bytes"
	"crypto/sha1"
	"encoding/hex"
	"fmt"
	"hash"
	"io"
	"strings"
	"sync"

	"github.com/ncw/rclone/b2/api"
	"github.com/ncw/rclone/fs"
	"github.com/ncw/rclone/rest"
	"github.com/pkg/errors"
)

type hashAppendingReader struct {
	h         hash.Hash
	in        io.Reader
	hexSum    string
	hexReader io.Reader
}

// Read returns bytes all bytes from the original reader, then the hex sum
// of what was read so far, then EOF.
func (har *hashAppendingReader) Read(b []byte) (int, error) {
	if har.hexReader == nil {
		n, err := har.in.Read(b)
		if err == io.EOF {
			har.in = nil // allow GC
			err = nil    // allow reading hexSum before EOF

			har.hexSum = hex.EncodeToString(har.h.Sum(nil))
			har.hexReader = strings.NewReader(har.hexSum)
		}
		return n, err
	}
	return har.hexReader.Read(b)
}

// AdditionalLength returns how many bytes the appended hex sum will take up.
func (har *hashAppendingReader) AdditionalLength() int {
	return hex.EncodedLen(har.h.Size())
}

// HexSum returns the hash sum as hex. It's only available after the original
// reader has EOF'd. It's an empty string before that.
func (har *hashAppendingReader) HexSum() string {
	return har.hexSum
}

// newHashAppendingReader takes a Reader and a Hash and will append the hex sum
// after the original reader reaches EOF. The increased size depends on the
// given hash, which may be queried through AdditionalLength()
func newHashAppendingReader(in io.Reader, h hash.Hash) *hashAppendingReader {
	withHash := io.TeeReader(in, h)
	return &hashAppendingReader{h: h, in: withHash}
}

// largeUpload is used to control the upload of large files which need chunking
type largeUpload struct {
	f        *Fs                             // parent Fs
	o        *Object                         // object being uploaded
	in       io.Reader                       // read the data from here
	id       string                          // ID of the file being uploaded
	size     int64                           // total size
	parts    int64                           // calculated number of parts, if known
	sha1s    []string                        // slice of SHA1s for each part
	uploadMu sync.Mutex                      // lock for upload variable
	uploads  []*api.GetUploadPartURLResponse // result of get upload URL calls
}

// newLargeUpload starts an upload of object o from in with metadata in src
func (f *Fs) newLargeUpload(o *Object, in io.Reader, src fs.ObjectInfo) (up *largeUpload, err error) {
	remote := o.remote
	size := src.Size()
	parts := int64(0)
	sha1SliceSize := int64(maxParts)
	if size == -1 {
		fs.Debugf(o, "Streaming upload with --b2-chunk-size %s allows uploads of up to %s and will fail only when that limit is reached.", fs.SizeSuffix(chunkSize), fs.SizeSuffix(maxParts*chunkSize))
	} else {
		parts = size / int64(chunkSize)
		if size%int64(chunkSize) != 0 {
			parts++
		}
		if parts > maxParts {
			return nil, errors.Errorf("%q too big (%d bytes) makes too many parts %d > %d - increase --b2-chunk-size", remote, size, parts, maxParts)
		}
		sha1SliceSize = parts
	}

	modTime := src.ModTime()
	opts := rest.Opts{
		Method: "POST",
		Path:   "/b2_start_large_file",
	}
	bucketID, err := f.getBucketID()
	if err != nil {
		return nil, err
	}
	var request = api.StartLargeFileRequest{
		BucketID:    bucketID,
		Name:        o.fs.root + remote,
		ContentType: fs.MimeType(src),
		Info: map[string]string{
			timeKey: timeString(modTime),
		},
	}
	// Set the SHA1 if known
	if calculatedSha1, err := src.Hash(fs.HashSHA1); err == nil && calculatedSha1 != "" {
		request.Info[sha1Key] = calculatedSha1
	}
	var response api.StartLargeFileResponse
	err = f.pacer.Call(func() (bool, error) {
		resp, err := f.srv.CallJSON(&opts, &request, &response)
		return f.shouldRetry(resp, err)
	})
	if err != nil {
		return nil, err
	}
	up = &largeUpload{
		f:     f,
		o:     o,
		in:    in,
		id:    response.ID,
		size:  size,
		parts: parts,
		sha1s: make([]string, sha1SliceSize),
	}
	return up, nil
}

// getUploadURL returns the upload info with the UploadURL and the AuthorizationToken
//
// This should be returned with returnUploadURL when finished
func (up *largeUpload) getUploadURL() (upload *api.GetUploadPartURLResponse, err error) {
	up.uploadMu.Lock()
	defer up.uploadMu.Unlock()
	if len(up.uploads) == 0 {
		opts := rest.Opts{
			Method: "POST",
			Path:   "/b2_get_upload_part_url",
		}
		var request = api.GetUploadPartURLRequest{
			ID: up.id,
		}
		err := up.f.pacer.Call(func() (bool, error) {
			resp, err := up.f.srv.CallJSON(&opts, &request, &upload)
			return up.f.shouldRetry(resp, err)
		})
		if err != nil {
			return nil, errors.Wrap(err, "failed to get upload URL")
		}
	} else {
		upload, up.uploads = up.uploads[0], up.uploads[1:]
	}
	return upload, nil
}

// returnUploadURL returns the UploadURL to the cache
func (up *largeUpload) returnUploadURL(upload *api.GetUploadPartURLResponse) {
	if upload == nil {
		return
	}
	up.uploadMu.Lock()
	up.uploads = append(up.uploads, upload)
	up.uploadMu.Unlock()
}

// clearUploadURL clears the current UploadURL and the AuthorizationToken
func (up *largeUpload) clearUploadURL() {
	up.uploadMu.Lock()
	up.uploads = nil
	up.uploadMu.Unlock()
}

// Transfer a chunk
func (up *largeUpload) transferChunk(part int64, body []byte) error {
	err := up.f.pacer.Call(func() (bool, error) {
		fs.Debugf(up.o, "Sending chunk %d length %d", part, len(body))

		// Get upload URL
		upload, err := up.getUploadURL()
		if err != nil {
			return false, err
		}

		in := newHashAppendingReader(bytes.NewReader(body), sha1.New())
		size := int64(len(body)) + int64(in.AdditionalLength())

		// Authorization
		//
		// An upload authorization token, from b2_get_upload_part_url.
		//
		// X-Bz-Part-Number
		//
		// A number from 1 to 10000. The parts uploaded for one file
		// must have contiguous numbers, starting with 1.
		//
		// Content-Length
		//
		// The number of bytes in the file being uploaded. Note that
		// this header is required; you cannot leave it out and just
		// use chunked encoding.  The minimum size of every part but
		// the last one is 100MB.
		//
		// X-Bz-Content-Sha1
		//
		// The SHA1 checksum of the this part of the file. B2 will
		// check this when the part is uploaded, to make sure that the
		// data arrived correctly.  The same SHA1 checksum must be
		// passed to b2_finish_large_file.
		opts := rest.Opts{
			Method:  "POST",
			RootURL: upload.UploadURL,
			Body:    fs.AccountPart(up.o, in),
			ExtraHeaders: map[string]string{
				"Authorization":    upload.AuthorizationToken,
				"X-Bz-Part-Number": fmt.Sprintf("%d", part),
				sha1Header:         "hex_digits_at_end",
			},
			ContentLength: &size,
		}

		var response api.UploadPartResponse

		resp, err := up.f.srv.CallJSON(&opts, nil, &response)
		retry, err := up.f.shouldRetry(resp, err)
		// On retryable error clear PartUploadURL
		if retry {
			fs.Debugf(up.o, "Clearing part upload URL because of error: %v", err)
			upload = nil
		}
		up.returnUploadURL(upload)
		up.sha1s[part-1] = in.HexSum()
		return retry, err
	})
	if err != nil {
		fs.Debugf(up.o, "Error sending chunk %d: %v", part, err)
	} else {
		fs.Debugf(up.o, "Done sending chunk %d", part)
	}
	return err
}

// finish closes off the large upload
func (up *largeUpload) finish() error {
	fs.Debugf(up.o, "Finishing large file upload with %d parts", up.parts)
	opts := rest.Opts{
		Method: "POST",
		Path:   "/b2_finish_large_file",
	}
	var request = api.FinishLargeFileRequest{
		ID:    up.id,
		SHA1s: up.sha1s,
	}
	var response api.FileInfo
	err := up.f.pacer.Call(func() (bool, error) {
		resp, err := up.f.srv.CallJSON(&opts, &request, &response)
		return up.f.shouldRetry(resp, err)
	})
	if err != nil {
		return err
	}
	return up.o.decodeMetaDataFileInfo(&response)
}

// cancel aborts the large upload
func (up *largeUpload) cancel() error {
	opts := rest.Opts{
		Method: "POST",
		Path:   "/b2_cancel_large_file",
	}
	var request = api.CancelLargeFileRequest{
		ID: up.id,
	}
	var response api.CancelLargeFileResponse
	err := up.f.pacer.Call(func() (bool, error) {
		resp, err := up.f.srv.CallJSON(&opts, &request, &response)
		return up.f.shouldRetry(resp, err)
	})
	return err
}

func (up *largeUpload) managedTransferChunk(wg *sync.WaitGroup, errs chan error, part int64, buf []byte) {
	wg.Add(1)
	go func(part int64, buf []byte) {
		defer wg.Done()
		defer up.f.putUploadBlock(buf)
		err := up.transferChunk(part, buf)
		if err != nil {
			select {
			case errs <- err:
			default:
			}
		}
	}(part, buf)
}

func (up *largeUpload) finishOrCancelOnError(err error, errs chan error) error {
	if err == nil {
		select {
		case err = <-errs:
		default:
		}
	}
	if err != nil {
		fs.Debugf(up.o, "Cancelling large file upload due to error: %v", err)
		cancelErr := up.cancel()
		if cancelErr != nil {
			fs.Errorf(up.o, "Failed to cancel large file upload: %v", cancelErr)
		}
		return err
	}
	return up.finish()
}

// Stream uploads the chunks from the input, starting with a required initial
// chunk. Assumes the file size is unknown and will upload until the input
// reaches EOF.
func (up *largeUpload) Stream(initialUploadBlock []byte) (err error) {
	fs.Debugf(up.o, "Starting streaming of large file (id %q)", up.id)
	errs := make(chan error, 1)
	hasMoreParts := true
	var wg sync.WaitGroup
	fs.AccountByPart(up.o) // Cancel whole file accounting before reading

	// Transfer initial chunk
	up.size = int64(len(initialUploadBlock))
	up.managedTransferChunk(&wg, errs, 1, initialUploadBlock)

outer:
	for part := int64(2); hasMoreParts; part++ {
		// Check any errors
		select {
		case err = <-errs:
			break outer
		default:
		}

		// Get a block of memory
		buf := up.f.getUploadBlock()

		// Read the chunk
		n, err := io.ReadFull(up.in, buf)
		if err == io.ErrUnexpectedEOF {
			fs.Debugf(up.o, "Read less than a full chunk, making this the last one.")
			buf = buf[:n]
			hasMoreParts = false
			err = nil
		} else if err == io.EOF {
			fs.Debugf(up.o, "Could not read any more bytes, previous chunk was the last.")
			up.f.putUploadBlock(buf)
			hasMoreParts = false
			err = nil
			break outer
		} else if err != nil {
			// other kinds of errors indicate failure
			up.f.putUploadBlock(buf)
			break outer
		}

		// Keep stats up to date
		up.parts = part
		up.size += int64(n)
		if part > maxParts {
			err = errors.Errorf("%q too big (%d bytes so far) makes too many parts %d > %d - increase --b2-chunk-size", up.o, up.size, up.parts, maxParts)
			break outer
		}

		// Transfer the chunk
		up.managedTransferChunk(&wg, errs, part, buf)
	}
	wg.Wait()
	up.sha1s = up.sha1s[:up.parts]

	return up.finishOrCancelOnError(err, errs)
}

// Upload uploads the chunks from the input
func (up *largeUpload) Upload() error {
	fs.Debugf(up.o, "Starting upload of large file in %d chunks (id %q)", up.parts, up.id)
	remaining := up.size
	errs := make(chan error, 1)
	var wg sync.WaitGroup
	var err error
	fs.AccountByPart(up.o) // Cancel whole file accounting before reading
outer:
	for part := int64(1); part <= up.parts; part++ {
		// Check any errors
		select {
		case err = <-errs:
			break outer
		default:
		}

		reqSize := remaining
		if reqSize >= int64(chunkSize) {
			reqSize = int64(chunkSize)
		}

		// Get a block of memory
		buf := up.f.getUploadBlock()[:reqSize]

		// Read the chunk
		_, err = io.ReadFull(up.in, buf)
		if err != nil {
			up.f.putUploadBlock(buf)
			break outer
		}

		// Transfer the chunk
		up.managedTransferChunk(&wg, errs, part, buf)
		remaining -= reqSize
	}
	wg.Wait()

	return up.finishOrCancelOnError(err, errs)
}