rclone/backend/http/http.go

657 lines
17 KiB
Go

// Package http provides a filesystem interface using golang.org/net/http
//
// It treats HTML pages served from the endpoint as directory
// listings, and includes any links found as files.
package http
import (
"context"
"io"
"mime"
"net/http"
"net/url"
"path"
"strconv"
"strings"
"sync"
"time"
"github.com/pkg/errors"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/config/configmap"
"github.com/rclone/rclone/fs/config/configstruct"
"github.com/rclone/rclone/fs/fshttp"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/lib/rest"
"golang.org/x/net/html"
)
var (
errorReadOnly = errors.New("http remotes are read only")
timeUnset = time.Unix(0, 0)
)
func init() {
fsi := &fs.RegInfo{
Name: "http",
Description: "http Connection",
NewFs: NewFs,
Options: []fs.Option{{
Name: "url",
Help: "URL of http host to connect to",
Required: true,
Examples: []fs.OptionExample{{
Value: "https://example.com",
Help: "Connect to example.com",
}, {
Value: "https://user:pass@example.com",
Help: "Connect to example.com using a username and password",
}},
}, {
Name: "headers",
Help: `Set HTTP headers for all transactions
Use this to set additional HTTP headers for all transactions
The input format is comma separated list of key,value pairs. Standard
[CSV encoding](https://godoc.org/encoding/csv) may be used.
For example to set a Cookie use 'Cookie,name=value', or '"Cookie","name=value"'.
You can set multiple headers, e.g. '"Cookie","name=value","Authorization","xxx"'.
`,
Default: fs.CommaSepList{},
Advanced: true,
}, {
Name: "no_slash",
Help: `Set this if the site doesn't end directories with /
Use this if your target website does not use / on the end of
directories.
A / on the end of a path is how rclone normally tells the difference
between files and directories. If this flag is set, then rclone will
treat all files with Content-Type: text/html as directories and read
URLs from them rather than downloading them.
Note that this may cause rclone to confuse genuine HTML files with
directories.`,
Default: false,
Advanced: true,
}, {
Name: "no_head",
Help: `Don't use HEAD requests to find file sizes in dir listing
If your site is being very slow to load then you can try this option.
Normally rclone does a HEAD request for each potential file in a
directory listing to:
- find its size
- check it really exists
- check to see if it is a directory
If you set this option, rclone will not do the HEAD request. This will mean
- directory listings are much quicker
- rclone won't have the times or sizes of any files
- some files that don't exist may be in the listing
`,
Default: false,
Advanced: true,
}},
}
fs.Register(fsi)
}
// Options defines the configuration for this backend
type Options struct {
Endpoint string `config:"url"`
NoSlash bool `config:"no_slash"`
NoHead bool `config:"no_head"`
Headers fs.CommaSepList `config:"headers"`
}
// Fs stores the interface to the remote HTTP files
type Fs struct {
name string
root string
features *fs.Features // optional features
opt Options // options for this backend
ci *fs.ConfigInfo // global config
endpoint *url.URL
endpointURL string // endpoint as a string
httpClient *http.Client
}
// Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading)
type Object struct {
fs *Fs
remote string
size int64
modTime time.Time
contentType string
}
// statusError returns an error if the res contained an error
func statusError(res *http.Response, err error) error {
if err != nil {
return err
}
if res.StatusCode < 200 || res.StatusCode > 299 {
_ = res.Body.Close()
return errors.Errorf("HTTP Error %d: %s", res.StatusCode, res.Status)
}
return nil
}
// NewFs creates a new Fs object from the name and root. It connects to
// the host specified in the config file.
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) {
// Parse config into Options struct
opt := new(Options)
err := configstruct.Set(m, opt)
if err != nil {
return nil, err
}
if len(opt.Headers)%2 != 0 {
return nil, errors.New("odd number of headers supplied")
}
if !strings.HasSuffix(opt.Endpoint, "/") {
opt.Endpoint += "/"
}
// Parse the endpoint and stick the root onto it
base, err := url.Parse(opt.Endpoint)
if err != nil {
return nil, err
}
u, err := rest.URLJoin(base, rest.URLPathEscape(root))
if err != nil {
return nil, err
}
client := fshttp.NewClient(ctx)
var isFile = false
if !strings.HasSuffix(u.String(), "/") {
// Make a client which doesn't follow redirects so the server
// doesn't redirect http://host/dir to http://host/dir/
noRedir := *client
noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
}
// check to see if points to a file
req, err := http.NewRequest("HEAD", u.String(), nil)
if err == nil {
req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext
addHeaders(req, opt)
res, err := noRedir.Do(req)
err = statusError(res, err)
if err == nil {
isFile = true
}
}
}
newRoot := u.String()
if isFile {
// Point to the parent if this is a file
newRoot, _ = path.Split(u.String())
} else {
if !strings.HasSuffix(newRoot, "/") {
newRoot += "/"
}
}
u, err = url.Parse(newRoot)
if err != nil {
return nil, err
}
ci := fs.GetConfig(ctx)
f := &Fs{
name: name,
root: root,
opt: *opt,
ci: ci,
httpClient: client,
endpoint: u,
endpointURL: u.String(),
}
f.features = (&fs.Features{
CanHaveEmptyDirectories: true,
}).Fill(ctx, f)
if isFile {
return f, fs.ErrorIsFile
}
if !strings.HasSuffix(f.endpointURL, "/") {
return nil, errors.New("internal error: url doesn't end with /")
}
return f, nil
}
// Name returns the configured name of the file system
func (f *Fs) Name() string {
return f.name
}
// Root returns the root for the filesystem
func (f *Fs) Root() string {
return f.root
}
// String returns the URL for the filesystem
func (f *Fs) String() string {
return f.endpointURL
}
// Features returns the optional features of this Fs
func (f *Fs) Features() *fs.Features {
return f.features
}
// Precision is the remote http file system's modtime precision, which we have no way of knowing. We estimate at 1s
func (f *Fs) Precision() time.Duration {
return time.Second
}
// NewObject creates a new remote http file object
func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
o := &Object{
fs: f,
remote: remote,
}
err := o.stat(ctx)
if err != nil {
return nil, err
}
return o, nil
}
// Join's the remote onto the base URL
func (f *Fs) url(remote string) string {
return f.endpointURL + rest.URLPathEscape(remote)
}
// parse s into an int64, on failure return def
func parseInt64(s string, def int64) int64 {
n, e := strconv.ParseInt(s, 10, 64)
if e != nil {
return def
}
return n
}
// Errors returned by parseName
var (
errURLJoinFailed = errors.New("URLJoin failed")
errFoundQuestionMark = errors.New("found ? in URL")
errHostMismatch = errors.New("host mismatch")
errSchemeMismatch = errors.New("scheme mismatch")
errNotUnderRoot = errors.New("not under root")
errNameIsEmpty = errors.New("name is empty")
errNameContainsSlash = errors.New("name contains /")
)
// parseName turns a name as found in the page into a remote path or returns an error
func parseName(base *url.URL, name string) (string, error) {
// make URL absolute
u, err := rest.URLJoin(base, name)
if err != nil {
return "", errURLJoinFailed
}
// check it doesn't have URL parameters
uStr := u.String()
if strings.Index(uStr, "?") >= 0 {
return "", errFoundQuestionMark
}
// check that this is going back to the same host and scheme
if base.Host != u.Host {
return "", errHostMismatch
}
if base.Scheme != u.Scheme {
return "", errSchemeMismatch
}
// check has path prefix
if !strings.HasPrefix(u.Path, base.Path) {
return "", errNotUnderRoot
}
// calculate the name relative to the base
name = u.Path[len(base.Path):]
// mustn't be empty
if name == "" {
return "", errNameIsEmpty
}
// mustn't contain a / - we are looking for a single level directory
slash := strings.Index(name, "/")
if slash >= 0 && slash != len(name)-1 {
return "", errNameContainsSlash
}
return name, nil
}
// Parse turns HTML for a directory into names
// base should be the base URL to resolve any relative names from
func parse(base *url.URL, in io.Reader) (names []string, err error) {
doc, err := html.Parse(in)
if err != nil {
return nil, err
}
var (
walk func(*html.Node)
seen = make(map[string]struct{})
)
walk = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "a" {
for _, a := range n.Attr {
if a.Key == "href" {
name, err := parseName(base, a.Val)
if err == nil {
if _, found := seen[name]; !found {
names = append(names, name)
seen[name] = struct{}{}
}
}
break
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
walk(c)
}
}
walk(doc)
return names, nil
}
// Adds the configured headers to the request if any
func addHeaders(req *http.Request, opt *Options) {
for i := 0; i < len(opt.Headers); i += 2 {
key := opt.Headers[i]
value := opt.Headers[i+1]
req.Header.Add(key, value)
}
}
// Adds the configured headers to the request if any
func (f *Fs) addHeaders(req *http.Request) {
addHeaders(req, &f.opt)
}
// Read the directory passed in
func (f *Fs) readDir(ctx context.Context, dir string) (names []string, err error) {
URL := f.url(dir)
u, err := url.Parse(URL)
if err != nil {
return nil, errors.Wrap(err, "failed to readDir")
}
if !strings.HasSuffix(URL, "/") {
return nil, errors.Errorf("internal error: readDir URL %q didn't end in /", URL)
}
// Do the request
req, err := http.NewRequest("GET", URL, nil)
if err != nil {
return nil, errors.Wrap(err, "readDir failed")
}
req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext
f.addHeaders(req)
res, err := f.httpClient.Do(req)
if err == nil {
defer fs.CheckClose(res.Body, &err)
if res.StatusCode == http.StatusNotFound {
return nil, fs.ErrorDirNotFound
}
}
err = statusError(res, err)
if err != nil {
return nil, errors.Wrap(err, "failed to readDir")
}
contentType := strings.SplitN(res.Header.Get("Content-Type"), ";", 2)[0]
switch contentType {
case "text/html":
names, err = parse(u, res.Body)
if err != nil {
return nil, errors.Wrap(err, "readDir")
}
default:
return nil, errors.Errorf("Can't parse content type %q", contentType)
}
return names, nil
}
// List the objects and directories in dir into entries. The
// entries can be returned in any order but should be for a
// complete directory.
//
// dir should be "" to list the root, and should not have
// trailing slashes.
//
// This should return ErrDirNotFound if the directory isn't
// found.
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
if !strings.HasSuffix(dir, "/") && dir != "" {
dir += "/"
}
names, err := f.readDir(ctx, dir)
if err != nil {
return nil, errors.Wrapf(err, "error listing %q", dir)
}
var (
entriesMu sync.Mutex // to protect entries
wg sync.WaitGroup
checkers = f.ci.Checkers
in = make(chan string, checkers)
)
add := func(entry fs.DirEntry) {
entriesMu.Lock()
entries = append(entries, entry)
entriesMu.Unlock()
}
for i := 0; i < checkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for remote := range in {
file := &Object{
fs: f,
remote: remote,
}
switch err := file.stat(ctx); err {
case nil:
add(file)
case fs.ErrorNotAFile:
// ...found a directory not a file
add(fs.NewDir(remote, timeUnset))
default:
fs.Debugf(remote, "skipping because of error: %v", err)
}
}
}()
}
for _, name := range names {
isDir := name[len(name)-1] == '/'
name = strings.TrimRight(name, "/")
remote := path.Join(dir, name)
if isDir {
add(fs.NewDir(remote, timeUnset))
} else {
in <- remote
}
}
close(in)
wg.Wait()
return entries, nil
}
// Put in to the remote path with the modTime given of the given size
//
// May create the object even if it returns an error - if so
// will return the object and the error, otherwise will return
// nil and the error
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
return nil, errorReadOnly
}
// PutStream uploads to the remote path with the modTime given of indeterminate size
func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
return nil, errorReadOnly
}
// Fs is the filesystem this remote http file object is located within
func (o *Object) Fs() fs.Info {
return o.fs
}
// String returns the URL to the remote HTTP file
func (o *Object) String() string {
if o == nil {
return "<nil>"
}
return o.remote
}
// Remote the name of the remote HTTP file, relative to the fs root
func (o *Object) Remote() string {
return o.remote
}
// Hash returns "" since HTTP (in Go or OpenSSH) doesn't support remote calculation of hashes
func (o *Object) Hash(ctx context.Context, r hash.Type) (string, error) {
return "", hash.ErrUnsupported
}
// Size returns the size in bytes of the remote http file
func (o *Object) Size() int64 {
return o.size
}
// ModTime returns the modification time of the remote http file
func (o *Object) ModTime(ctx context.Context) time.Time {
return o.modTime
}
// url returns the native url of the object
func (o *Object) url() string {
return o.fs.url(o.remote)
}
// stat updates the info field in the Object
func (o *Object) stat(ctx context.Context) error {
if o.fs.opt.NoHead {
o.size = -1
o.modTime = timeUnset
o.contentType = fs.MimeType(ctx, o)
return nil
}
url := o.url()
req, err := http.NewRequest("HEAD", url, nil)
if err != nil {
return errors.Wrap(err, "stat failed")
}
req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext
o.fs.addHeaders(req)
res, err := o.fs.httpClient.Do(req)
if err == nil && res.StatusCode == http.StatusNotFound {
return fs.ErrorObjectNotFound
}
err = statusError(res, err)
if err != nil {
return errors.Wrap(err, "failed to stat")
}
t, err := http.ParseTime(res.Header.Get("Last-Modified"))
if err != nil {
t = timeUnset
}
o.size = parseInt64(res.Header.Get("Content-Length"), -1)
o.modTime = t
o.contentType = res.Header.Get("Content-Type")
// If NoSlash is set then check ContentType to see if it is a directory
if o.fs.opt.NoSlash {
mediaType, _, err := mime.ParseMediaType(o.contentType)
if err != nil {
return errors.Wrapf(err, "failed to parse Content-Type: %q", o.contentType)
}
if mediaType == "text/html" {
return fs.ErrorNotAFile
}
}
return nil
}
// SetModTime sets the modification and access time to the specified time
//
// it also updates the info field
func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error {
return errorReadOnly
}
// Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc.)
func (o *Object) Storable() bool {
return true
}
// Open a remote http file object for reading. Seek is supported
func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
url := o.url()
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, errors.Wrap(err, "Open failed")
}
req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext
// Add optional headers
for k, v := range fs.OpenOptionHeaders(options) {
req.Header.Add(k, v)
}
o.fs.addHeaders(req)
// Do the request
res, err := o.fs.httpClient.Do(req)
err = statusError(res, err)
if err != nil {
return nil, errors.Wrap(err, "Open failed")
}
return res.Body, nil
}
// Hashes returns hash.HashNone to indicate remote hashing is unavailable
func (f *Fs) Hashes() hash.Set {
return hash.Set(hash.None)
}
// Mkdir makes the root directory of the Fs object
func (f *Fs) Mkdir(ctx context.Context, dir string) error {
return errorReadOnly
}
// Remove a remote http file object
func (o *Object) Remove(ctx context.Context) error {
return errorReadOnly
}
// Rmdir removes the root directory of the Fs object
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
return errorReadOnly
}
// Update in to the object with the modTime given of the given size
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
return errorReadOnly
}
// MimeType of an Object if known, "" otherwise
func (o *Object) MimeType(ctx context.Context) string {
return o.contentType
}
// Check the interfaces are satisfied
var (
_ fs.Fs = &Fs{}
_ fs.PutStreamer = &Fs{}
_ fs.Object = &Object{}
_ fs.MimeTyper = &Object{}
)