filter: Add --hash-filter to deterministically select a subset of files
Fixes #8400
This commit is contained in:
parent b7f26937f1
commit 32147b4bb5
@@ -2808,6 +2808,7 @@ For the filtering options
* `--max-size`
* `--min-age`
* `--max-age`
* `--hash-filter`
* `--dump filters`
* `--metadata-include`
* `--metadata-include-from`
@@ -718,6 +718,98 @@ old or more.

See [the time option docs](/docs/#time-option) for valid formats.

### `--hash-filter` - Deterministically select a subset of files {#hash-filter}

The `--hash-filter` flag enables selecting a deterministic subset of files, useful for:

1. Running large sync operations across multiple machines.
2. Checking a subset of files for bitrot.
3. Any other operations where a sample of files is required.

#### Syntax

The flag takes two parameters expressed as a fraction:

```
--hash-filter K/N
```

- `N`: The total number of partitions (must be a positive integer).
- `K`: The specific partition to select (an integer from `0` to `N`).

For example:

- `--hash-filter 1/3`: Selects the first third of the files.
- `--hash-filter 2/3` and `--hash-filter 3/3`: Select the second and third partitions, respectively.

Each partition is non-overlapping, ensuring all files are covered without duplication.

#### Random Partition Selection

Use `@` as `K` to randomly select a partition:

```
--hash-filter @/N
```

For example, `--hash-filter @/3` will randomly select a number between 0 and 2. This will stay constant across retries.
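
Under the hood, `@` is resolved to a concrete partition number once, when the filter is created, and that value is reused for the rest of the run, which is why retries keep selecting the same subset. A minimal Go sketch of that behaviour, mirroring the `parseHashFilter` function added later in this commit:

```go
package main

import (
	"fmt"
	"math/rand/v2"
)

func main() {
	const n = 3 // as in --hash-filter @/3
	// The partition is picked once per invocation, just as parseHashFilter
	// does when K is "@"; every retry in this run reuses the same value.
	k := rand.Uint64N(n)
	fmt.Printf("this run behaves like --hash-filter %d/%d\n", k, n)
}
```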

#### How It Works

- Rclone takes each file's full path, converts it to lowercase, and applies Unicode normalization.
- It then hashes the normalized path into a 64-bit number.
- The hash result is reduced modulo `N` to assign the file to a partition.
- If the calculated partition does not match `K`, the file is excluded.
- Other filters may still apply if the file is not excluded.

**Important:** Rclone will traverse all directories to apply the filter.
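
The partition assignment can be reproduced outside rclone. Below is a minimal Go sketch of the steps above, mirroring the hashing this commit adds to `IncludeRemote`: NFC normalisation, lower-casing, MD5, and the first 8 bytes of the digest read as a little-endian integer reduced modulo `N`.

```go
package main

import (
	"crypto/md5"
	"encoding/binary"
	"fmt"
	"strings"

	"golang.org/x/text/unicode/norm"
)

// partition reports which of the n partitions a path falls into.
func partition(remote string, n uint64) uint64 {
	normalized := strings.ToLower(norm.NFC.String(remote)) // unicode- and case-insensitive
	sum := md5.Sum([]byte(normalized))                      // 16-byte digest
	hash := binary.LittleEndian.Uint64(sum[:8])             // first 8 bytes as a uint64
	return hash % n
}

func main() {
	for _, name := range []string{"file1.jpg", "file2.jpg", "file3.jpg"} {
		fmt.Printf("%s -> partition %d of 4\n", name, partition(name, 4))
	}
}
```

Because only the normalised path is hashed, a given file maps to the same partition on the source and the destination.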

#### Usage Notes

- Safe to use with `rclone sync`; source and destination selections will match.
- **Do not** use with `--delete-excluded`, as this could delete unselected files.
- Ignored if `--files-from` is used.

#### Examples

##### Dividing files into 4 partitions

Assuming the current directory contains `file1.jpg` through `file9.jpg`:

```
$ rclone lsf --hash-filter 0/4 .
file1.jpg
file5.jpg

$ rclone lsf --hash-filter 1/4 .
file3.jpg
file6.jpg
file9.jpg

$ rclone lsf --hash-filter 2/4 .
file2.jpg
file4.jpg

$ rclone lsf --hash-filter 3/4 .
file7.jpg
file8.jpg

$ rclone lsf --hash-filter 4/4 . # the same as --hash-filter 0/4
file1.jpg
file5.jpg
```

##### Syncing the first quarter of files

```
rclone sync --hash-filter 1/4 source:path destination:path
```
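
The same pattern splits one large sync across several machines: give each machine a different partition of the same `N`, and between them the partitions cover every file exactly once. A sketch with placeholder paths:

```
# machine 1 of 4
rclone sync --hash-filter 1/4 source:path destination:path
# machine 2 of 4
rclone sync --hash-filter 2/4 source:path destination:path
# machine 3 of 4
rclone sync --hash-filter 3/4 source:path destination:path
# machine 4 of 4
rclone sync --hash-filter 4/4 source:path destination:path
```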

##### Checking a random 1% of files for integrity

```
rclone check --download --hash-filter @/100 source:path destination:path
```

## Other flags

### `--delete-excluded` - Delete files on dest excluded from sync
@@ -3,14 +3,19 @@ package filter

import (
	"context"
	"crypto/md5"
	"encoding/binary"
	"errors"
	"fmt"
	"math/rand/v2"
	"path"
	"strconv"
	"strings"
	"time"

	"github.com/rclone/rclone/fs"
	"golang.org/x/sync/errgroup"
	"golang.org/x/text/unicode/norm"
)

// This is the globally active filter

@@ -64,6 +69,11 @@ var OptionsInfo = fs.Options{{
	Default: false,
	Help:    "Ignore case in filters (case insensitive)",
	Groups:  "Filter",
}, {
	Name:    "hash_filter",
	Default: "",
	Help:    "Partition filenames by hash k/n or randomly @/n",
	Groups:  "Filter",
}, {
	Name:    "filter",
	Default: []string{},

@@ -140,6 +150,7 @@ type Options struct {
	MinSize    fs.SizeSuffix `config:"min_size"`
	MaxSize    fs.SizeSuffix `config:"max_size"`
	IgnoreCase bool          `config:"ignore_case"`
	HashFilter string        `config:"hash_filter"`
}

func init() {

@@ -167,6 +178,8 @@ type Filter struct {
	metaRules   rules
	files       FilesMap // files if filesFrom
	dirs        FilesMap // dirs from filesFrom
	hashFilterN uint64   // if non 0 do hash filtering
	hashFilterK uint64   // select partition K/N
}

// NewFilter parses the command line options and creates a Filter

@@ -189,10 +202,17 @@ func NewFilter(opt *Options) (f *Filter, err error) {
	if f.Opt.MaxAge.IsSet() {
		f.ModTimeFrom = time.Now().Add(-time.Duration(f.Opt.MaxAge))
		if !f.ModTimeTo.IsZero() && f.ModTimeTo.Before(f.ModTimeFrom) {
			fs.Fatalf(nil, "filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge)
			return nil, fmt.Errorf("filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge)
		}
		fs.Debugf(nil, "--max-age %v to %v", f.Opt.MaxAge, f.ModTimeFrom)
	}
	if f.Opt.HashFilter != "" {
		f.hashFilterK, f.hashFilterN, err = parseHashFilter(f.Opt.HashFilter)
		if err != nil {
			return nil, err
		}
		fs.Debugf(nil, "Using --hash-filter %d/%d", f.hashFilterK, f.hashFilterN)
	}

	err = parseRules(&f.Opt.RulesOpt, f.Add, f.Clear)
	if err != nil {

@@ -242,6 +262,32 @@ func NewFilter(opt *Options) (f *Filter, err error) {
	return f, nil
}

// Parse the --hash-filter arguments into k/n
func parseHashFilter(hashFilter string) (k, n uint64, err error) {
	slash := strings.IndexRune(hashFilter, '/')
	if slash < 0 {
		return 0, 0, fmt.Errorf("filter: --hash-filter: no / found")
	}
	kStr, nStr := hashFilter[:slash], hashFilter[slash+1:]
	n, err = strconv.ParseUint(nStr, 10, 64)
	if err != nil {
		return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse N=%q: %v", nStr, err)
	}
	if n == 0 {
		return 0, 0, fmt.Errorf("filter: --hash-filter: N must be greater than 0")
	}
	if kStr == "@" {
		k = rand.Uint64N(n)
	} else {
		k, err = strconv.ParseUint(kStr, 10, 64)
		if err != nil {
			return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse K=%q: %v", kStr, err)
		}
		k %= n
	}
	return k, n, nil
}

func mustNewFilter(opt *Options) *Filter {
	f, err := NewFilter(opt)
	if err != nil {

@@ -366,7 +412,8 @@ func (f *Filter) InActive() bool {
		f.fileRules.len() == 0 &&
		f.dirRules.len() == 0 &&
		f.metaRules.len() == 0 &&
		len(f.Opt.ExcludeFile) == 0)
		len(f.Opt.ExcludeFile) == 0 &&
		f.hashFilterN == 0)
}

// IncludeRemote returns whether this remote passes the filter rules.

@@ -376,6 +423,21 @@ func (f *Filter) IncludeRemote(remote string) bool {
		_, include := f.files[remote]
		return include
	}
	if f.hashFilterN != 0 {
		// Normalise the remote first in case we are using a
		// case insensitive remote or a remote which needs
		// unicode normalisation. This means all the remotes
		// which could be normalised together will be in the
		// same partition.
		normalized := norm.NFC.String(remote)
		normalized = strings.ToLower(normalized)
		hashBytes := md5.Sum([]byte(normalized))
		hash := binary.LittleEndian.Uint64(hashBytes[:])
		partition := hash % f.hashFilterN
		if partition != f.hashFilterK {
			return false
		}
	}
	return f.fileRules.include(remote)
}
@@ -28,6 +28,38 @@ func TestNewFilterDefault(t *testing.T) {
	assert.True(t, f.InActive())
}

func TestParseHashFilter(t *testing.T) {
	for _, test := range []struct {
		hashFilter string
		n          uint64
		k          uint64
		err        string
	}{
		{hashFilter: "", err: "no / found"},
		{hashFilter: "17", err: "no / found"},
		{hashFilter: "-1/2", err: "can't parse K="},
		{hashFilter: "1/-2", err: "can't parse N="},
		{hashFilter: "0/0", err: "N must be greater than 0"},
		{hashFilter: "0/18446744073709551615", k: 0, n: 18446744073709551615},
		{hashFilter: "0/18446744073709551616", err: "can't parse N="},
		{hashFilter: "18446744073709551615/1", k: 0, n: 1},
		{hashFilter: "18446744073709551616/1", err: "can't parse K="},
		{hashFilter: "1/2", k: 1, n: 2},
		{hashFilter: "17/3", k: 2, n: 3},
		{hashFilter: "@/1", k: 0, n: 1},
	} {
		gotK, gotN, gotErr := parseHashFilter(test.hashFilter)
		if test.err != "" {
			assert.Error(t, gotErr)
			assert.ErrorContains(t, gotErr, test.err, test.hashFilter)
		} else {
			assert.Equal(t, test.k, gotK, test.hashFilter)
			assert.Equal(t, test.n, gotN, test.hashFilter)
			assert.NoError(t, gotErr, test.hashFilter)
		}
	}
}

// testFile creates a temp file with the contents
func testFile(t *testing.T, contents string) string {
	out, err := os.CreateTemp("", "filter_test")

@@ -207,6 +239,7 @@ type includeTest struct {
}

func testInclude(t *testing.T, f *Filter, tests []includeTest) {
	t.Helper()
	for _, test := range tests {
		got := f.Include(test.in, test.size, time.Unix(test.modTime, 0), nil)
		assert.Equal(t, test.want, got, fmt.Sprintf("in=%q, size=%v, modTime=%v", test.in, test.size, time.Unix(test.modTime, 0)))

@@ -537,6 +570,42 @@ func TestNewFilterMatchesRegexp(t *testing.T) {
	assert.False(t, f.InActive())
}

func TestNewFilterHashFilter(t *testing.T) {
	const e1 = "filé1.jpg" // one of the unicode E characters
	const e2 = "filé1.jpg" // a different unicode E character
	assert.NotEqual(t, e1, e2)
	for i := 0; i <= 4; i++ {
		opt := Opt
		opt.HashFilter = fmt.Sprintf("%d/4", i)
		opt.ExcludeRule = []string{"*.bin"}
		f, err := NewFilter(&opt)
		require.NoError(t, err)
		t.Run(opt.HashFilter, func(t *testing.T) {
			testInclude(t, f, []includeTest{
				{"file1.jpg", 0, 0, i == 0 || i == 4},
				{"FILE1.jpg", 0, 0, i == 0 || i == 4},
				{"file2.jpg", 1, 0, i == 2},
				{"File2.jpg", 1, 0, i == 2},
				{"file3.jpg", 2, 0, i == 1},
				{"file4.jpg", 3, 0, i == 2},
				{"file5.jpg", 4, 0, i == 0 || i == 4},
				{"file6.jpg", 5, 0, i == 1},
				{"file7.jpg", 6, 0, i == 3},
				{"file8.jpg", 7, 0, i == 3},
				{"file9.jpg", 7, 0, i == 1},
				{e1, 0, 0, i == 3},
				{e2, 0, 0, i == 3},
				{"hello" + e1, 0, 0, i == 2},
				{"HELLO" + e2, 0, 0, i == 2},
				{"hello1" + e1, 0, 0, i == 1},
				{"Hello1" + e2, 0, 0, i == 1},
				{"exclude.bin", 8, 0, false},
			})
		})
		assert.False(t, f.InActive())
	}
}

type includeTestMetadata struct {
	in       string
	metadata fs.Metadata