filter: Add --hash-filter to deterministically select a subset of files

Fixes #8400
2025-03-12 23:29:54 +08:00 · 2025-02-25 17:48:15 +00:00 · 2025-02-25 17:48:15 +00:00 · 32147b4bb5
commit 32147b4bb5
parent b7f26937f1
4 changed files with 226 additions and 2 deletions
--- a/docs/content/docs.md
+++ b/docs/content/docs.md
@ -2808,6 +2808,7 @@ For the filtering options
  * `--max-size`
  * `--min-age`
  * `--max-age`
+  * `--hash-filter`
  * `--dump filters`
  * `--metadata-include`
  * `--metadata-include-from`
--- a/docs/content/filtering.md
+++ b/docs/content/filtering.md
@ -718,6 +718,98 @@ old or more.

 See [the time option docs](/docs/#time-option) for valid formats.

+### `--hash-filter` - Deterministically select a subset of files {#hash-filter}
+
+The `--hash-filter` flag enables selecting a deterministic subset of files, useful for:
+
+1. Running large sync operations across multiple machines.
+2. Checking a subset of files for bitrot.
+3. Any other operations where a sample of files is required.
+
+#### Syntax
+
+The flag takes two parameters expressed as a fraction:
+
+```
+--hash-filter K/N
+```
+
+- `N`: The total number of partitions (must be a positive integer).
+- `K`: The specific partition to select (an integer from `0` to `N`, where `N` selects the same partition as `0`).
+
+For example:
+- `--hash-filter 1/3`: Selects the first third of the files.
+- `--hash-filter 2/3` and `--hash-filter 3/3`: Select the second and third partitions, respectively.
+
+Each partition is non-overlapping, ensuring all files are covered without duplication.
+
+#### Random Partition Selection
+
+Use `@` as `K` to randomly select a partition:
+
+```
+--hash-filter @/N
+```
+
+For example, `--hash-filter @/3` will randomly select a partition number between 0 and 2 (inclusive). The partition is chosen once when the filter is created, so it will stay constant across retries.
+
+#### How It Works
+
+- Rclone takes each file's full path, applies Unicode normalization (NFC), and converts it to lowercase.
+- It then hashes the normalized path with MD5 and takes a 64-bit number from the result.
+- The hash result is reduced modulo `N` to assign the file to a partition.
+- If the calculated partition does not match `K`, the file is excluded.
+- Other filters may apply if the file is not excluded.
+
+**Important:** Rclone will traverse all directories to apply the filter.
+
+#### Usage Notes
+
+- Safe to use with `rclone sync`; source and destination selections will match.
+- **Do not** use with `--delete-excluded`, as this could delete unselected files.
+- Ignored if `--files-from` is used.
+
+#### Examples
+
+##### Dividing files into 4 partitions
+
+Assuming the current directory contains `file1.jpg` through `file9.jpg`:
+
+```
+$ rclone lsf --hash-filter 0/4 .
+file1.jpg
+file5.jpg
+
+$ rclone lsf --hash-filter 1/4 .
+file3.jpg
+file6.jpg
+file9.jpg
+
+$ rclone lsf --hash-filter 2/4 .
+file2.jpg
+file4.jpg
+
+$ rclone lsf --hash-filter 3/4 .
+file7.jpg
+file8.jpg
+
+$ rclone lsf --hash-filter 4/4 . # the same as --hash-filter 0/4
+file1.jpg
+file5.jpg
+```
+
+##### Syncing the first quarter of files
+
+```
+rclone sync --hash-filter 1/4 source:path destination:path
+```
+
+##### Checking a random 1% of files for integrity
+
+```
+rclone check --download --hash-filter @/100 source:path destination:path
+```
+
 ## Other flags

 ### `--delete-excluded` - Delete files on dest excluded from sync
--- a/fs/filter/filter.go
+++ b/fs/filter/filter.go
@ -3,14 +3,19 @@ package filter

 import (
 	"context"
+	"crypto/md5"
+	"encoding/binary"
 	"errors"
 	"fmt"
+	"math/rand/v2"
 	"path"
+	"strconv"
 	"strings"
 	"time"

 	"github.com/rclone/rclone/fs"
 	"golang.org/x/sync/errgroup"
+	"golang.org/x/text/unicode/norm"
 )

 // This is the globally active filter
@ -64,6 +69,11 @@ var OptionsInfo = fs.Options{{
 	Default: false,
 	Help:    "Ignore case in filters (case insensitive)",
 	Groups:  "Filter",
+}, {
+	Name:    "hash_filter",
+	Default: "",
+	Help:    "Partition filenames by hash k/n or randomly @/n",
+	Groups:  "Filter",
 }, {
 	Name:     "filter",
 	Default:  []string{},
@ -140,6 +150,7 @@ type Options struct {
 	MinSize        fs.SizeSuffix `config:"min_size"`
 	MaxSize        fs.SizeSuffix `config:"max_size"`
 	IgnoreCase     bool          `config:"ignore_case"`
+	HashFilter     string        `config:"hash_filter"`
 }

 func init() {
@ -167,6 +178,8 @@ type Filter struct {
 	metaRules   rules
 	files       FilesMap // files if filesFrom
 	dirs        FilesMap // dirs from filesFrom
+	hashFilterN uint64   // if non 0 do hash filtering
+	hashFilterK uint64   // select partition K/N
 }

 // NewFilter parses the command line options and creates a Filter
@ -189,10 +202,17 @@ func NewFilter(opt *Options) (f *Filter, err error) {
 	if f.Opt.MaxAge.IsSet() {
 		f.ModTimeFrom = time.Now().Add(-time.Duration(f.Opt.MaxAge))
 		if !f.ModTimeTo.IsZero() && f.ModTimeTo.Before(f.ModTimeFrom) {
-			fs.Fatalf(nil, "filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge)
+			return nil, fmt.Errorf("filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge)
 		}
 		fs.Debugf(nil, "--max-age %v to %v", f.Opt.MaxAge, f.ModTimeFrom)
 	}
+	if f.Opt.HashFilter != "" {
+		f.hashFilterK, f.hashFilterN, err = parseHashFilter(f.Opt.HashFilter)
+		if err != nil {
+			return nil, err
+		}
+		fs.Debugf(nil, "Using --hash-filter %d/%d", f.hashFilterK, f.hashFilterN)
+	}

 	err = parseRules(&f.Opt.RulesOpt, f.Add, f.Clear)
 	if err != nil {
@ -242,6 +262,32 @@ func NewFilter(opt *Options) (f *Filter, err error) {
 	return f, nil
 }

+// parseHashFilter parses a --hash-filter argument of the form "K/N" (or "@/N" for a random K) and returns k reduced modulo n together with n; a missing "/", an unparsable K or N, or N == 0 is an error.
+func parseHashFilter(hashFilter string) (k, n uint64, err error) {
+	slash := strings.IndexRune(hashFilter, '/') // index of the first '/', which separates K from N
+	if slash < 0 {
+		return 0, 0, fmt.Errorf("filter: --hash-filter: no / found")
+	}
+	kStr, nStr := hashFilter[:slash], hashFilter[slash+1:]
+	n, err = strconv.ParseUint(nStr, 10, 64) // N is parsed first because a random K needs it as its upper bound
+	if err != nil {
+		return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse N=%q: %v", nStr, err)
+	}
+	if n == 0 { // reject N == 0 here: both rand.Uint64N(n) and k %= n below require n > 0
+		return 0, 0, fmt.Errorf("filter: --hash-filter: N must be greater than 0")
+	}
+	if kStr == "@" {
+		k = rand.Uint64N(n) // "@" means pick a random partition in [0, n)
+	} else {
+		k, err = strconv.ParseUint(kStr, 10, 64)
+		if err != nil {
+			return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse K=%q: %v", kStr, err)
+		}
+		k %= n // wrap K into [0, n) so that K == N selects partition 0
+	}
+	return k, n, nil
+}
+
 func mustNewFilter(opt *Options) *Filter {
 	f, err := NewFilter(opt)
 	if err != nil {
@ -366,7 +412,8 @@ func (f *Filter) InActive() bool {
 		f.fileRules.len() == 0 &&
 		f.dirRules.len() == 0 &&
 		f.metaRules.len() == 0 &&
-		len(f.Opt.ExcludeFile) == 0)
+		len(f.Opt.ExcludeFile) == 0 &&
+		f.hashFilterN == 0)
 }

 // IncludeRemote returns whether this remote passes the filter rules.
@ -376,6 +423,21 @@ func (f *Filter) IncludeRemote(remote string) bool {
 		_, include := f.files[remote]
 		return include
 	}
+	if f.hashFilterN != 0 {
+		// Normalise the remote first in case we are using a
+		// case insensitive remote or a remote which needs
+		// unicode normalisation. This means all the remotes
+		// which could be normalised together will be in the
+		// same partition.
+		normalized := norm.NFC.String(remote)
+		normalized = strings.ToLower(normalized)
+		hashBytes := md5.Sum([]byte(normalized))
+		hash := binary.LittleEndian.Uint64(hashBytes[:])
+		partition := hash % f.hashFilterN
+		if partition != f.hashFilterK {
+			return false
+		}
+	}
 	return f.fileRules.include(remote)
 }

--- a/fs/filter/filter_test.go
+++ b/fs/filter/filter_test.go
@ -28,6 +28,38 @@ func TestNewFilterDefault(t *testing.T) {
 	assert.True(t, f.InActive())
 }

+func TestParseHashFilter(t *testing.T) {
+	for _, test := range []struct {
+		hashFilter string // raw --hash-filter argument passed to parseHashFilter
+		n          uint64 // expected partition count on success
+		k          uint64 // expected partition index on success (after reduction modulo n)
+		err        string // expected substring of the error; empty means success is expected
+	}{
+		{hashFilter: "", err: "no / found"},
+		{hashFilter: "17", err: "no / found"},
+		{hashFilter: "-1/2", err: "can't parse K="},
+		{hashFilter: "1/-2", err: "can't parse N="},
+		{hashFilter: "0/0", err: "N must be greater than 0"},
+		{hashFilter: "0/18446744073709551615", k: 0, n: 18446744073709551615}, // largest N that fits in a uint64
+		{hashFilter: "0/18446744073709551616", err: "can't parse N="},
+		{hashFilter: "18446744073709551615/1", k: 0, n: 1}, // largest K that fits in a uint64; reduced to 0 modulo 1
+		{hashFilter: "18446744073709551616/1", err: "can't parse K="},
+		{hashFilter: "1/2", k: 1, n: 2},
+		{hashFilter: "17/3", k: 2, n: 3}, // K larger than N wraps: 17 % 3 == 2
+		{hashFilter: "@/1", k: 0, n: 1},  // random K, but with N == 1 the only possible partition is 0
+	} {
+		gotK, gotN, gotErr := parseHashFilter(test.hashFilter)
+		if test.err != "" { // error cases only check that the message contains the expected substring
+			assert.Error(t, gotErr)
+			assert.ErrorContains(t, gotErr, test.err, test.hashFilter)
+		} else {
+			assert.Equal(t, test.k, gotK, test.hashFilter)
+			assert.Equal(t, test.n, gotN, test.hashFilter)
+			assert.NoError(t, gotErr, test.hashFilter)
+		}
+	}
+}
+
 // testFile creates a temp file with the contents
 func testFile(t *testing.T, contents string) string {
 	out, err := os.CreateTemp("", "filter_test")
@ -207,6 +239,7 @@ type includeTest struct {
 }

 func testInclude(t *testing.T, f *Filter, tests []includeTest) {
+	t.Helper()
 	for _, test := range tests {
 		got := f.Include(test.in, test.size, time.Unix(test.modTime, 0), nil)
 		assert.Equal(t, test.want, got, fmt.Sprintf("in=%q, size=%v, modTime=%v", test.in, test.size, time.Unix(test.modTime, 0)))
@ -537,6 +570,42 @@ func TestNewFilterMatchesRegexp(t *testing.T) {
 	assert.False(t, f.InActive())
 }

+func TestNewFilterHashFilter(t *testing.T) {
+	const e1 = "filé1.jpg" // one of the unicode E characters
+	const e2 = "filé1.jpg"  // a different unicode E character
+	assert.NotEqual(t, e1, e2)
+	for i := 0; i <= 4; i++ {
+		opt := Opt
+		opt.HashFilter = fmt.Sprintf("%d/4", i)
+		opt.ExcludeRule = []string{"*.bin"}
+		f, err := NewFilter(&opt)
+		require.NoError(t, err)
+		t.Run(opt.HashFilter, func(t *testing.T) {
+			testInclude(t, f, []includeTest{
+				{"file1.jpg", 0, 0, i == 0 || i == 4},
+				{"FILE1.jpg", 0, 0, i == 0 || i == 4},
+				{"file2.jpg", 1, 0, i == 2},
+				{"File2.jpg", 1, 0, i == 2},
+				{"file3.jpg", 2, 0, i == 1},
+				{"file4.jpg", 3, 0, i == 2},
+				{"file5.jpg", 4, 0, i == 0 || i == 4},
+				{"file6.jpg", 5, 0, i == 1},
+				{"file7.jpg", 6, 0, i == 3},
+				{"file8.jpg", 7, 0, i == 3},
+				{"file9.jpg", 7, 0, i == 1},
+				{e1, 0, 0, i == 3},
+				{e2, 0, 0, i == 3},
+				{"hello" + e1, 0, 0, i == 2},
+				{"HELLO" + e2, 0, 0, i == 2},
+				{"hello1" + e1, 0, 0, i == 1},
+				{"Hello1" + e2, 0, 0, i == 1},
+				{"exclude.bin", 8, 0, false},
+			})
+		})
+		assert.False(t, f.InActive())
+	}
+}
+
 type includeTestMetadata struct {
 	in       string
 	metadata fs.Metadata