http: improved recognition of url pointing to a single file - fixes #5929

This commit is contained in:
albertony 2022-01-23 23:39:05 +01:00
parent 1045344943
commit a667e03fc9
3 changed files with 218 additions and 37 deletions

View File

@ -73,8 +73,9 @@ directories.`,
Advanced: true,
}, {
Name: "no_head",
Help: `Don't use HEAD requests to find file sizes in dir listing.
Help: `Don't use HEAD requests.
HEAD requests are mainly used to find file sizes in dir listing.
If your site is being very slow to load then you can try this option.
Normally rclone does a HEAD request for each potential file in a
directory listing to:
@ -134,6 +135,82 @@ func statusError(res *http.Response, err error) error {
return nil
}
// getFsEndpoint decides if url is to be considered a file or directory,
// and returns a proper endpoint url to use for the fs.
//
// The returned string is the endpoint url: url itself when it already ends
// with '/', url with a '/' appended when it is judged to be a directory, or
// the parent of url (as given by path.Split) when it is judged to be a file.
// The returned bool is true when url is judged to be a file.
//
// Unless url already ends with '/' or opt.NoHead is set, a single HEAD
// request is sent with a copy of client that does not follow redirects.
// Any failure to build or send that request results in url being treated
// as a file.
func getFsEndpoint(ctx context.Context, client *http.Client, url string, opt *Options) (string, bool) {
	// If url ends with '/' it is already a proper url always assumed to be a directory.
	// HasSuffix is used instead of indexing the last byte so an empty url cannot panic.
	if strings.HasSuffix(url, "/") {
		return url, false
	}

	// If url does not end with '/' we send a HEAD request to decide
	// if it is directory or file, and if directory appends the missing
	// '/', or if file returns the directory url to parent instead.
	createFileResult := func() (string, bool) {
		fs.Debugf(nil, "If path is a directory you must add a trailing '/'")
		parent, _ := path.Split(url)
		return parent, true
	}
	createDirResult := func() (string, bool) {
		fs.Debugf(nil, "To avoid the initial HEAD request add a trailing '/' to the path")
		return url + "/", false
	}

	// If HEAD requests are not allowed we just have to assume it is a file.
	if opt.NoHead {
		fs.Debugf(nil, "Assuming path is a file as --http-no-head is set")
		return createFileResult()
	}

	// Use a client which doesn't follow redirects so the server
	// doesn't redirect http://host/dir to http://host/dir/
	noRedir := *client
	noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error {
		return http.ErrUseLastResponse
	}
	req, err := http.NewRequestWithContext(ctx, "HEAD", url, nil)
	if err != nil {
		fs.Debugf(nil, "Assuming path is a file as HEAD request could not be created: %v", err)
		return createFileResult()
	}
	addHeaders(req, opt)
	res, err := noRedir.Do(req)
	if err != nil {
		fs.Debugf(nil, "Assuming path is a file as HEAD request could not be sent: %v", err)
		return createFileResult()
	}
	// The response body must always be closed, even for HEAD requests where
	// it carries no data. Only the status and headers are used below, so a
	// close error is deliberately ignored.
	defer func() { _ = res.Body.Close() }()

	switch res.StatusCode {
	case http.StatusNotFound:
		fs.Debugf(nil, "Assuming path is a directory as HEAD response is it does not exist as a file (%s)", res.Status)
		return createDirResult()
	case http.StatusMovedPermanently,
		http.StatusFound,
		http.StatusSeeOther,
		http.StatusTemporaryRedirect,
		http.StatusPermanentRedirect:
		// A redirect whose target ends with '/' is taken as the server
		// telling us that the path is a directory.
		redir := res.Header.Get("Location")
		switch {
		case redir == "":
			fs.Debugf(nil, "Assuming path is a file as HEAD response is redirect (%s) but no location header", res.Status)
			return createFileResult()
		case strings.HasSuffix(redir, "/"):
			fs.Debugf(nil, "Assuming path is a directory as HEAD response is redirect (%s) to a path that ends with '/': %s", res.Status, redir)
			return createDirResult()
		default:
			fs.Debugf(nil, "Assuming path is a file as HEAD response is redirect (%s) to a path that does not end with '/': %s", res.Status, redir)
			return createFileResult()
		}
	}
	if res.StatusCode < 200 || res.StatusCode > 299 {
		// Example is 403 (http.StatusForbidden) for servers not allowing HEAD requests.
		fs.Debugf(nil, "Assuming path is a file as HEAD response is an error (%s)", res.Status)
		return createFileResult()
	}
	fs.Debugf(nil, "Assuming path is a file as HEAD response is success (%s)", res.Status)
	return createFileResult()
}
// NewFs creates a new Fs object from the name and root. It connects to
// the host specified in the config file.
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) {
@ -164,37 +241,9 @@ func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, e
client := fshttp.NewClient(ctx)
var isFile = false
if !strings.HasSuffix(u.String(), "/") {
// Make a client which doesn't follow redirects so the server
// doesn't redirect http://host/dir to http://host/dir/
noRedir := *client
noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
}
// check to see if points to a file
req, err := http.NewRequestWithContext(ctx, "HEAD", u.String(), nil)
if err == nil {
addHeaders(req, opt)
res, err := noRedir.Do(req)
err = statusError(res, err)
if err == nil {
isFile = true
}
}
}
newRoot := u.String()
if isFile {
// Point to the parent if this is a file
newRoot, _ = path.Split(u.String())
} else {
if !strings.HasSuffix(newRoot, "/") {
newRoot += "/"
}
}
u, err = url.Parse(newRoot)
endpoint, isFile := getFsEndpoint(ctx, client, u.String(), opt)
fs.Debugf(nil, "Root: %s", endpoint)
u, err = url.Parse(endpoint)
if err != nil {
return nil, err
}
@ -212,12 +261,16 @@ func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, e
f.features = (&fs.Features{
CanHaveEmptyDirectories: true,
}).Fill(ctx, f)
if isFile {
// return an error with an fs which points to the parent
return f, fs.ErrorIsFile
}
if !strings.HasSuffix(f.endpointURL, "/") {
return nil, errors.New("internal error: url doesn't end with /")
}
return f, nil
}

View File

@ -8,8 +8,10 @@ import (
"net/http/httptest"
"net/url"
"os"
"path"
"path/filepath"
"sort"
"strconv"
"strings"
"testing"
"time"
@ -374,3 +376,106 @@ func TestParseCaddy(t *testing.T) {
"v1.36-22-g06ea13a-ssh-agentβ/",
})
}
func TestFsNoSlashRoots(t *testing.T) {
// Test Fs with roots that does not end with '/', the logic that
// decides if url is to be considered a file or directory, based
// on result from a HEAD request.
// Handler for faking HEAD responses with different status codes
headCount := 0
handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.Method == "HEAD" {
headCount++
responseCode, err := strconv.Atoi(path.Base(r.URL.String()))
require.NoError(t, err)
if strings.HasPrefix(r.URL.String(), "/redirect/") {
var redir string
if strings.HasPrefix(r.URL.String(), "/redirect/file/") {
redir = "/redirected"
} else if strings.HasPrefix(r.URL.String(), "/redirect/dir/") {
redir = "/redirected/"
} else {
require.Fail(t, "Redirect test requests must start with '/redirect/file/' or '/redirect/dir/'")
}
http.Redirect(w, r, redir, responseCode)
} else {
http.Error(w, http.StatusText(responseCode), responseCode)
}
}
})
// Make the test server
ts := httptest.NewServer(handler)
defer ts.Close()
// Configure the remote
configfile.Install()
m := configmap.Simple{
"type": "http",
"url": ts.URL,
}
// Test
for i, test := range []struct {
root string
isFile bool
}{
// 2xx success
{"parent/200", true},
{"parent/204", true},
// 3xx redirection Redirect status 301, 302, 303, 307, 308
{"redirect/file/301", true}, // Request is redirected to "/redirected"
{"redirect/dir/301", false}, // Request is redirected to "/redirected/"
{"redirect/file/302", true}, // Request is redirected to "/redirected"
{"redirect/dir/302", false}, // Request is redirected to "/redirected/"
{"redirect/file/303", true}, // Request is redirected to "/redirected"
{"redirect/dir/303", false}, // Request is redirected to "/redirected/"
{"redirect/file/304", true}, // Not really a redirect, handled like 4xx errors (below)
{"redirect/file/305", true}, // Not really a redirect, handled like 4xx errors (below)
{"redirect/file/306", true}, // Not really a redirect, handled like 4xx errors (below)
{"redirect/file/307", true}, // Request is redirected to "/redirected"
{"redirect/dir/307", false}, // Request is redirected to "/redirected/"
{"redirect/file/308", true}, // Request is redirected to "/redirected"
{"redirect/dir/308", false}, // Request is redirected to "/redirected/"
// 4xx client errors
{"parent/403", true}, // Forbidden status (head request blocked)
{"parent/404", false}, // Not found status
} {
for _, noHead := range []bool{false, true} {
var isFile bool
if noHead {
m.Set("no_head", "true")
isFile = true
} else {
m.Set("no_head", "false")
isFile = test.isFile
}
headCount = 0
f, err := NewFs(context.Background(), remoteName, test.root, m)
if noHead {
assert.Equal(t, 0, headCount)
} else {
assert.Equal(t, 1, headCount)
}
if isFile {
assert.ErrorIs(t, err, fs.ErrorIsFile)
} else {
assert.NoError(t, err)
}
var endpoint string
if isFile {
parent, _ := path.Split(test.root)
endpoint = "/" + parent
} else {
endpoint = "/" + test.root + "/"
}
what := fmt.Sprintf("i=%d, root=%q, isFile=%v, noHead=%v", i, test.root, isFile, noHead)
assert.Equal(t, ts.URL+endpoint, f.String(), what)
}
}
}

View File

@ -12,7 +12,26 @@ webservers such as Apache/Nginx/Caddy and will likely work with file
listings from most web servers. (If it doesn't then please file an
issue, or send a pull request!)
Paths are specified as `remote:` or `remote:path/to/dir`.
Paths are specified as `remote:` or `remote:path`.
The `remote:` represents the configured [url](#http-url), and any path following
it will be resolved relative to this url, according to the URL standard. This
means with remote url `https://beta.rclone.org/branch` and path `fix`, the
resolved URL will be `https://beta.rclone.org/branch/fix`, while with path
`/fix` the resolved URL will be `https://beta.rclone.org/fix` as the absolute
path is resolved from the root of the domain.
If the path following the `remote:` ends with `/` it will be assumed to point
to a directory. If the path does not end with `/`, then a HEAD request is sent
and the response is used to decide if it is treated as a file or a directory
(run with `-vv` to see details). When [--http-no-head](#http-no-head) is
specified, a path without ending `/` is always assumed to be a file. If rclone
incorrectly assumes the path is a file, the solution is to specify the path with
ending `/`. When you know the path is a directory, ending it with `/` is always
better as it avoids the initial HEAD request.
To just download a single file it is easier to use
[copyurl](/commands/rclone_copyurl/).
## Configuration
@ -81,25 +100,29 @@ Sync the remote `directory` to `/home/local/directory`, deleting any excess file
rclone sync -i remote:directory /home/local/directory
### Read only ###
### Read only
This remote is read only - you can't upload files to an HTTP server.
### Modified time ###
### Modified time
Most HTTP servers store time accurate to 1 second.
### Checksum ###
### Checksum
No checksums are stored.
### Usage without a config file ###
### Usage without a config file
Since the http remote only has one config parameter it is easy to use
without a config file:
rclone lsd --http-url https://beta.rclone.org :http:
or:
rclone lsd :http,url='https://beta.rclone.org':
{{< rem autogenerated options start" - DO NOT EDIT - instead edit fs.RegInfo in backend/http/http.go then run make backenddocs" >}}
### Standard options