filter: Add --hash-filter to deterministically select a subset of files

Fixes #8400
This commit is contained in:
Nick Craig-Wood 2025-02-25 17:48:15 +00:00
parent b7f26937f1
commit 32147b4bb5
4 changed files with 226 additions and 2 deletions

View File

@ -2808,6 +2808,7 @@ For the filtering options
* `--max-size`
* `--min-age`
* `--max-age`
* `--hash-filter`
* `--dump filters`
* `--metadata-include`
* `--metadata-include-from`

View File

@ -718,6 +718,98 @@ old or more.
See [the time option docs](/docs/#time-option) for valid formats.
### `--hash-filter` - Deterministically select a subset of files {#hash-filter}
The `--hash-filter` flag enables selecting a deterministic subset of files, useful for:
1. Running large sync operations across multiple machines.
2. Checking a subset of files for bitrot.
3. Any other operations where a sample of files is required.
#### Syntax
The flag takes two parameters expressed as a fraction:
```
--hash-filter K/N
```
- `N`: The total number of partitions (must be a positive integer).
- `K`: The specific partition to select (an integer from `0` to `N`; `N` selects the same partition as `0`).
For example:
- `--hash-filter 1/3`: Selects the first third of the files.
- `--hash-filter 2/3` and `--hash-filter 3/3`: Select the second and third partitions, respectively.
Each partition is non-overlapping, ensuring all files are covered without duplication.
#### Random Partition Selection
Use `@` as `K` to randomly select a partition:
```
--hash-filter @/N
```
For example, `--hash-filter @/3` will randomly select a number between 0 and 2. This will stay constant across retries.
#### How It Works
- Rclone takes each file's full path, applies Unicode (NFC) normalization, and converts it to lowercase.
- It then hashes the normalized path (using MD5) into a 64-bit number.
- The hash result is reduced modulo `N` to assign the file to a partition.
- If the calculated partition does not match `K` the file is excluded.
- Other filters may apply if the file is not excluded.
**Important:** Rclone will traverse all directories to apply the filter.
#### Usage Notes
- Safe to use with `rclone sync`; source and destination selections will match.
- **Do not** use with `--delete-excluded`, as this could delete unselected files.
- Ignored if `--files-from` is used.
#### Examples
##### Dividing files into 4 partitions
Assuming the current directory contains `file1.jpg` through `file9.jpg`:
```
$ rclone lsf --hash-filter 0/4 .
file1.jpg
file5.jpg
$ rclone lsf --hash-filter 1/4 .
file3.jpg
file6.jpg
file9.jpg
$ rclone lsf --hash-filter 2/4 .
file2.jpg
file4.jpg
$ rclone lsf --hash-filter 3/4 .
file7.jpg
file8.jpg
$ rclone lsf --hash-filter 4/4 . # the same as --hash-filter 0/4
file1.jpg
file5.jpg
```
##### Syncing the first quarter of files
```
rclone sync --hash-filter 1/4 source:path destination:path
```
##### Checking a random 1% of files for integrity
```
rclone check --download --hash-filter @/100 source:path destination:path
```
## Other flags
### `--delete-excluded` - Delete files on dest excluded from sync

View File

@ -3,14 +3,19 @@ package filter
import (
"context"
"crypto/md5"
"encoding/binary"
"errors"
"fmt"
"math/rand/v2"
"path"
"strconv"
"strings"
"time"
"github.com/rclone/rclone/fs"
"golang.org/x/sync/errgroup"
"golang.org/x/text/unicode/norm"
)
// This is the globally active filter
@ -64,6 +69,11 @@ var OptionsInfo = fs.Options{{
Default: false,
Help: "Ignore case in filters (case insensitive)",
Groups: "Filter",
}, {
Name: "hash_filter",
Default: "",
Help: "Partition filenames by hash k/n or randomly @/n",
Groups: "Filter",
}, {
Name: "filter",
Default: []string{},
@ -140,6 +150,7 @@ type Options struct {
MinSize fs.SizeSuffix `config:"min_size"`
MaxSize fs.SizeSuffix `config:"max_size"`
IgnoreCase bool `config:"ignore_case"`
HashFilter string `config:"hash_filter"`
}
func init() {
@ -167,6 +178,8 @@ type Filter struct {
metaRules rules
files FilesMap // files if filesFrom
dirs FilesMap // dirs from filesFrom
hashFilterN uint64 // if non 0 do hash filtering
hashFilterK uint64 // select partition K/N
}
// NewFilter parses the command line options and creates a Filter
@ -189,10 +202,17 @@ func NewFilter(opt *Options) (f *Filter, err error) {
if f.Opt.MaxAge.IsSet() {
f.ModTimeFrom = time.Now().Add(-time.Duration(f.Opt.MaxAge))
if !f.ModTimeTo.IsZero() && f.ModTimeTo.Before(f.ModTimeFrom) {
fs.Fatalf(nil, "filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge)
return nil, fmt.Errorf("filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge)
}
fs.Debugf(nil, "--max-age %v to %v", f.Opt.MaxAge, f.ModTimeFrom)
}
if f.Opt.HashFilter != "" {
f.hashFilterK, f.hashFilterN, err = parseHashFilter(f.Opt.HashFilter)
if err != nil {
return nil, err
}
fs.Debugf(nil, "Using --hash-filter %d/%d", f.hashFilterK, f.hashFilterN)
}
err = parseRules(&f.Opt.RulesOpt, f.Add, f.Clear)
if err != nil {
@ -242,6 +262,32 @@ func NewFilter(opt *Options) (f *Filter, err error) {
return f, nil
}
// Parse the --hash-filter arguments into k/n
func parseHashFilter(hashFilter string) (k, n uint64, err error) {
slash := strings.IndexRune(hashFilter, '/')
if slash < 0 {
return 0, 0, fmt.Errorf("filter: --hash-filter: no / found")
}
kStr, nStr := hashFilter[:slash], hashFilter[slash+1:]
n, err = strconv.ParseUint(nStr, 10, 64)
if err != nil {
return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse N=%q: %v", nStr, err)
}
if n == 0 {
return 0, 0, fmt.Errorf("filter: --hash-filter: N must be greater than 0")
}
if kStr == "@" {
k = rand.Uint64N(n)
} else {
k, err = strconv.ParseUint(kStr, 10, 64)
if err != nil {
return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse K=%q: %v", kStr, err)
}
k %= n
}
return k, n, nil
}
func mustNewFilter(opt *Options) *Filter {
f, err := NewFilter(opt)
if err != nil {
@ -366,7 +412,8 @@ func (f *Filter) InActive() bool {
f.fileRules.len() == 0 &&
f.dirRules.len() == 0 &&
f.metaRules.len() == 0 &&
len(f.Opt.ExcludeFile) == 0)
len(f.Opt.ExcludeFile) == 0 &&
f.hashFilterN == 0)
}
// IncludeRemote returns whether this remote passes the filter rules.
@ -376,6 +423,21 @@ func (f *Filter) IncludeRemote(remote string) bool {
_, include := f.files[remote]
return include
}
if f.hashFilterN != 0 {
// Normalise the remote first in case we are using a
// case insensitive remote or a remote which needs
// unicode normalisation. This means all the remotes
// which could be normalised together will be in the
// same partition.
normalized := norm.NFC.String(remote)
normalized = strings.ToLower(normalized)
hashBytes := md5.Sum([]byte(normalized))
hash := binary.LittleEndian.Uint64(hashBytes[:])
partition := hash % f.hashFilterN
if partition != f.hashFilterK {
return false
}
}
return f.fileRules.include(remote)
}

View File

@ -28,6 +28,38 @@ func TestNewFilterDefault(t *testing.T) {
assert.True(t, f.InActive())
}
// TestParseHashFilter feeds valid and invalid --hash-filter arguments
// to parseHashFilter and checks either the parsed k and n or that the
// error contains the expected text.
func TestParseHashFilter(t *testing.T) {
	type testCase struct {
		hashFilter string
		n          uint64
		k          uint64
		err        string
	}
	cases := []testCase{
		// No separator
		{hashFilter: "", err: "no / found"},
		{hashFilter: "17", err: "no / found"},
		// Negative or zero values
		{hashFilter: "-1/2", err: "can't parse K="},
		{hashFilter: "1/-2", err: "can't parse N="},
		{hashFilter: "0/0", err: "N must be greater than 0"},
		// Limits of uint64
		{hashFilter: "0/18446744073709551615", k: 0, n: 18446744073709551615},
		{hashFilter: "0/18446744073709551616", err: "can't parse N="},
		{hashFilter: "18446744073709551615/1", k: 0, n: 1},
		{hashFilter: "18446744073709551616/1", err: "can't parse K="},
		// Plain values, including K being reduced modulo N
		{hashFilter: "1/2", k: 1, n: 2},
		{hashFilter: "17/3", k: 2, n: 3},
		// With only one partition the random choice has to be 0
		{hashFilter: "@/1", k: 0, n: 1},
	}
	for _, tc := range cases {
		k, n, err := parseHashFilter(tc.hashFilter)
		if tc.err == "" {
			assert.Equal(t, tc.k, k, tc.hashFilter)
			assert.Equal(t, tc.n, n, tc.hashFilter)
			assert.NoError(t, err, tc.hashFilter)
			continue
		}
		assert.Error(t, err)
		assert.ErrorContains(t, err, tc.err, tc.hashFilter)
	}
}
// testFile creates a temp file with the contents
func testFile(t *testing.T, contents string) string {
out, err := os.CreateTemp("", "filter_test")
@ -207,6 +239,7 @@ type includeTest struct {
}
func testInclude(t *testing.T, f *Filter, tests []includeTest) {
t.Helper()
for _, test := range tests {
got := f.Include(test.in, test.size, time.Unix(test.modTime, 0), nil)
assert.Equal(t, test.want, got, fmt.Sprintf("in=%q, size=%v, modTime=%v", test.in, test.size, time.Unix(test.modTime, 0)))
@ -537,6 +570,42 @@ func TestNewFilterMatchesRegexp(t *testing.T) {
assert.False(t, f.InActive())
}
// TestNewFilterHashFilter builds a filter for each of --hash-filter
// 0/4 to 4/4 combined with an exclude rule for "*.bin" and checks
// which names are included.
//
// The expectations cover names differing only in case or in unicode
// normalisation landing in the same partition, 4/4 selecting the same
// names as 0/4, and the exclude rule still applying.
func TestNewFilterHashFilter(t *testing.T) {
	// e1 and e2 render identically but are different byte sequences.
	// They are written with escapes so that editors or tools which
	// normalise the source file can't silently make them equal.
	const e1 = "fil\u00e91.jpg"  // precomposed: U+00E9
	const e2 = "file\u03011.jpg" // decomposed: "e" followed by U+0301
	assert.NotEqual(t, e1, e2)
	for i := 0; i <= 4; i++ {
		opt := Opt
		opt.HashFilter = fmt.Sprintf("%d/4", i)
		opt.ExcludeRule = []string{"*.bin"}
		f, err := NewFilter(&opt)
		require.NoError(t, err)
		t.Run(opt.HashFilter, func(t *testing.T) {
			testInclude(t, f, []includeTest{
				{"file1.jpg", 0, 0, i == 0 || i == 4},
				{"FILE1.jpg", 0, 0, i == 0 || i == 4},
				{"file2.jpg", 1, 0, i == 2},
				{"File2.jpg", 1, 0, i == 2},
				{"file3.jpg", 2, 0, i == 1},
				{"file4.jpg", 3, 0, i == 2},
				{"file5.jpg", 4, 0, i == 0 || i == 4},
				{"file6.jpg", 5, 0, i == 1},
				{"file7.jpg", 6, 0, i == 3},
				{"file8.jpg", 7, 0, i == 3},
				{"file9.jpg", 7, 0, i == 1},
				{e1, 0, 0, i == 3},
				{e2, 0, 0, i == 3},
				{"hello" + e1, 0, 0, i == 2},
				{"HELLO" + e2, 0, 0, i == 2},
				{"hello1" + e1, 0, 0, i == 1},
				{"Hello1" + e2, 0, 0, i == 1},
				// Excluded by the "*.bin" rule for every value of i
				{"exclude.bin", 8, 0, false},
			})
		})
		// Setting --hash-filter must make the filter active
		assert.False(t, f.InActive())
	}
}
type includeTestMetadata struct {
in string
metadata fs.Metadata