From cb7534dcdfa8781f80a40bf6c1e15f39e105c172 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 13 Aug 2020 08:14:11 -0700 Subject: [PATCH] lib: Add file name compression Allows to compress short arbitrary strings and returns a string using base64 url encoding. Generator for tables included and a few samples has been added. Add more to init.go Tested with fuzzing for crash resistance and symmetry, see fuzz.go --- .gitignore | 1 + go.mod | 1 + go.sum | 6 +-- lib/encoder/filename/decode.go | 84 +++++++++++++++++++++++++++++ lib/encoder/filename/encode.go | 60 +++++++++++++++++++++ lib/encoder/filename/fuzz.go | 33 ++++++++++++ lib/encoder/filename/gentable.go | 93 ++++++++++++++++++++++++++++++++ lib/encoder/filename/init.go | 89 ++++++++++++++++++++++++++++++ 8 files changed, 363 insertions(+), 4 deletions(-) create mode 100644 lib/encoder/filename/decode.go create mode 100644 lib/encoder/filename/encode.go create mode 100644 lib/encoder/filename/fuzz.go create mode 100644 lib/encoder/filename/gentable.go create mode 100644 lib/encoder/filename/init.go diff --git a/.gitignore b/.gitignore index d6da3db99..9a877432c 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ rclone.iml *.test *.log *.iml +fuzz-build.zip diff --git a/go.mod b/go.mod index d9854ccd3..92264763f 100644 --- a/go.mod +++ b/go.mod @@ -25,6 +25,7 @@ require ( github.com/jlaffaye/ftp v0.0.0-20200720194710-13949d38913e github.com/jzelinskie/whirlpool v0.0.0-20170603002051-c19460b8caa6 github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0 // indirect + github.com/klauspost/compress v1.10.11 github.com/koofr/go-httpclient v0.0.0-20200420163713-93aa7c75b348 github.com/koofr/go-koofrclient v0.0.0-20190724113126-8e5366da203a github.com/mattn/go-colorable v0.1.7 diff --git a/go.sum b/go.sum index 2b4c0f7d9..1eb4f6da8 100644 --- a/go.sum +++ b/go.sum @@ -87,10 +87,6 @@ github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24 github.com/beorn7/perks v1.0.0/go.mod 
h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/billziss-gh/cgofuse v1.3.0 h1:mFj8XQg/vvxMFywNy1F7IqFYcMeBqceYTh1+iUhpsk8= -github.com/billziss-gh/cgofuse v1.3.0/go.mod h1:LJjoaUojlVjgo5GQoEJTcJNqZJeRU0nCR84CyxKt2YM= -github.com/billziss-gh/cgofuse v1.3.1-0.20200703171401-45df47debffe h1:AXqxouOOD7FQuoVfZubWmMyHzOrrSGZbdh9o6PCtfKM= -github.com/billziss-gh/cgofuse v1.3.1-0.20200703171401-45df47debffe/go.mod h1:LJjoaUojlVjgo5GQoEJTcJNqZJeRU0nCR84CyxKt2YM= github.com/billziss-gh/cgofuse v1.4.0 h1:kju2jDmdNuDDCrxPob2ggmZr5Mj/odCjU1Y8kx0Th9E= github.com/billziss-gh/cgofuse v1.4.0/go.mod h1:LJjoaUojlVjgo5GQoEJTcJNqZJeRU0nCR84CyxKt2YM= github.com/bradfitz/iter v0.0.0-20140124041915-454541ec3da2/go.mod h1:PyRFw1Lt2wKX4ZVSQ2mk+PeDa1rxyObEDlApuIsUKuo= @@ -265,6 +261,8 @@ github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvW github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kkdai/bstream v0.0.0-20161212061736-f391b8402d23/go.mod h1:J+Gs4SYgM6CZQHDETBtE9HaSEkGmuNXF86RwHhHUvq4= +github.com/klauspost/compress v1.10.11 h1:K9z59aO18Aywg2b/WSgBaUX99mHy2BES18Cr5lBKZHk= +github.com/klauspost/compress v1.10.11/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= diff --git a/lib/encoder/filename/decode.go b/lib/encoder/filename/decode.go new file mode 100644 index 000000000..75f87ccd5 --- /dev/null +++ 
b/lib/encoder/filename/decode.go @@ -0,0 +1,84 @@ +package filename + +import ( + "bytes" + "encoding/base64" + "encoding/binary" + "errors" + "sync" + + "github.com/klauspost/compress/huff0" +) + +// ErrCorrupted is returned if a provided encoded filename cannot be decoded. +var ErrCorrupted = errors.New("file name corrupt") + +// ErrUnsupported is returned if a provided encoding may come from a future version or the file name is corrupt. +var ErrUnsupported = errors.New("file name possibly generated by future version of rclone") + +// Custom decoder for tableCustom types. Stateful, so must have lock. +var customDec huff0.Scratch +var customDecMu sync.Mutex + +// Decode an encoded string. +func Decode(s string) (string, error) { + if len(s) < 1 { + return "", ErrCorrupted + } + table := decodeMap[s[0]] + if table == 0 { + return "", ErrCorrupted + } + table-- + s = s[1:] + + data := make([]byte, base64.URLEncoding.DecodedLen(len(s))) + n, err := base64.URLEncoding.Decode(data, ([]byte)(s)) + if err != nil || n < 0 { + return "", ErrCorrupted + } + data = data[:n] + + switch table { + case tableUncompressed: + return string(data), nil + case tableReserved: + return "", ErrUnsupported + case tableRLE: + if len(data) < 2 { + return "", ErrCorrupted + } + n, used := binary.Uvarint(data[:len(data)-1]) + if used <= 0 || n > maxLength { + return "", ErrCorrupted + } + return string(bytes.Repeat(data[len(data)-1:], int(n))), nil + case tableCustom: + customDecMu.Lock() + defer customDecMu.Unlock() + _, data, err := huff0.ReadTable(data, &customDec) + if err != nil { + return "", ErrCorrupted + } + customDec.MaxDecodedSize = maxLength + decoded, err := customDec.Decompress1X(data) + if err != nil { + return "", ErrCorrupted + } + return string(decoded), nil + default: + if table >= byte(len(decTables)) { + return "", ErrCorrupted + } + dec := decTables[table] + if dec == nil { + return "", ErrUnsupported + } + var dst [maxLength]byte + name, err := 
dec.Decompress1X(dst[:0], data) + if err != nil { + return "", ErrCorrupted + } + return string(name), nil + } +} diff --git a/lib/encoder/filename/encode.go b/lib/encoder/filename/encode.go new file mode 100644 index 000000000..555bf6b91 --- /dev/null +++ b/lib/encoder/filename/encode.go @@ -0,0 +1,60 @@ +package filename + +import ( + "encoding/base64" + "encoding/binary" + + "github.com/klauspost/compress/huff0" +) + +// Encode will encode the string and return a base64 (url) compatible version of it. +// Calling Decode with the returned string should always succeed. +// It is not a requirement that the input string is valid utf-8. +func Encode(s string) string { + initCoders() + bestSize := len(s) + bestTable := tableUncompressed + org := []byte(s) + bestOut := []byte(s) + + // Try all tables and choose the best + for i, enc := range encTables[:] { + if len(org) <= 1 || len(org) > maxLength { + // Use the uncompressed + break + } + if enc == nil { + continue + } + // Try to encode using table. + err := func() error { + encTableLocks[i].Lock() + defer encTableLocks[i].Unlock() + out, _, err := huff0.Compress1X(org, enc) + if err != nil { + return err + } + if len(out) < bestSize { + bestOut = bestOut[:len(out)] + bestTable = i + bestSize = len(out) + copy(bestOut, out) + } + return nil + }() + // If input is a single byte repeated store as RLE or save uncompressed. + if err == huff0.ErrUseRLE { + if len(org) > 2 { + // Encode as one byte repeated since it will be smaller than uncompressed. 
+ n := binary.PutUvarint(bestOut, uint64(len(org))) + bestOut = bestOut[:n+1] + bestOut[n] = org[0] + bestSize = n + 1 + bestTable = tableRLE + } + break + } + } + + return string(encodeURL[bestTable]) + base64.URLEncoding.EncodeToString(bestOut) +} diff --git a/lib/encoder/filename/fuzz.go b/lib/encoder/filename/fuzz.go new file mode 100644 index 000000000..73468ee71 --- /dev/null +++ b/lib/encoder/filename/fuzz.go @@ -0,0 +1,33 @@ +//+build gofuzz + +package filename + +import ( + "bytes" + "fmt" +) + +// Run like: +// go-fuzz-build -o=fuzz-build.zip -func=Fuzz . && go-fuzz -minimize=5s -bin=fuzz-build.zip -workdir=testdata/corpus -procs=24 + +// Fuzz test the provided input. +func Fuzz(data []byte) int { + // First try to decode as is. + // We don't care about the result, it just shouldn't crash. + Decode(string(data)) + + // Now encode + enc := Encode(string(data)) + + // And decoded must match + decoded, err := Decode(enc) + if err != nil { + panic(fmt.Sprintf("error decoding %q, input %q: %v", enc, string(data), err)) + } + if !bytes.Equal(data, []byte(decoded)) { + panic(fmt.Sprintf("decode mismatch, encoded: %q, org: %q, got: %q", enc, string(data), decoded)) + } + + // Everything is good. + return 1 +} diff --git a/lib/encoder/filename/gentable.go b/lib/encoder/filename/gentable.go new file mode 100644 index 000000000..4acb61707 --- /dev/null +++ b/lib/encoder/filename/gentable.go @@ -0,0 +1,93 @@ +//+build ignore + +package main + +import ( + "encoding/base64" + "fmt" + "math" + + "github.com/klauspost/compress" + "github.com/klauspost/compress/huff0" +) + +// Replace/add histogram data and execute go run gentable.go + +func main() { + // Allow non-represented characters. 
+ const omitUnused = false + + histogram := [256]uint64{ + // ncw home directory + //0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19442, 760, 0, 349, 570, 1520, 199, 76, 685, 654, 0, 40377, 1605, 395132, 935270, 0, 1156377, 887730, 811737, 712241, 693240, 689139, 675964, 656417, 666577, 657413, 532, 24, 0, 145, 0, 3, 946, 44932, 37362, 46126, 36752, 76346, 19338, 47457, 14288, 38163, 4350, 7867, 36541, 65011, 30255, 26792, 22097, 1803, 39191, 61965, 76585, 11887, 12896, 5931, 1935, 1731, 1385, 1279, 9, 1278, 1, 420185, 0, 1146359, 746359, 968896, 868703, 1393640, 745019, 354147, 159462, 483979, 169092, 75937, 385858, 322166, 466635, 571268, 447132, 13792, 446484, 736844, 732675, 170232, 112983, 63184, 142357, 173945, 21521, 250, 0, 250, 4140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 39, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 15, 0, 0, 0, 10, 0, 5, 0, 0, 0, 0, 0, 0, 283, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + //Images: + //0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 765, 0, 0, 0, 0, 0, 8, 7, 3, 3, 0, 29, 53, 247265, 83587, 0, 265952, 233552, 229781, 71156, 78374, 65141, 46152, 43767, 55603, 39411, 0, 0, 0, 0, 0, 88, 84, 141, 70, 222, 191, 51, 52, 101, 60, 53, 23, 17, 49, 93, 53, 17, 92, 0, 158, 109, 41, 19, 43, 28, 10, 5, 1, 0, 0, 0, 0, 879, 0, 3415, 6770, 39823, 3566, 2491, 964, 42115, 825, 5178, 40755, 483, 1290, 3294, 1720, 6309, 42983, 10, 37739, 3454, 7028, 5077, 854, 227, 1259, 767, 218, 0, 0, 0, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Google Drive: + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459, 0, 0, 7, 0, 0, 0, 7, 1, 1, 0, 2, 1, 506, 706, 0, 3903, 3552, 3694, 3338, 3262, 3257, 3222, 3249, 3325, 3261, 5, 0, 0, 1, 0, 0, 0, 48, 31, 61, 53, 46, 17, 17, 34, 32, 9, 22, 17, 31, 27, 19, 52, 5, 46, 84, 38, 14, 5, 19, 2, 2, 0, 8, 0, 8, 0, 180, 0, 5847, 3282, 3729, 3695, 3842, 3356, 316, 139, 487, 117, 95, 476, 289, 428, 609, 467, 5, 446, 592, 955, 130, 112, 57, 390, 168, 14, 0, 2, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } + + // Override with equally distributed characters + if false { + histogram = [256]uint64{} + var chars string + // base c64 + chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + // hex + //chars = "0123456789abcdef" + for _, v := range []byte(chars) { + histogram[v] = 1 + } + } + + // Sum up distributions + var total uint64 + for _, v := range histogram[:] { + total += v + } + + // Scale the distribution to approx this size. + const scale = 100 << 10 + var tmp []byte + for i, v := range histogram[:] { + if v == 0 && omitUnused { + continue + } + nf := float64(v) / float64(total) * scale + if nf < 1 { + nf = 1 + } + t2 := make([]byte, int(math.Ceil(nf))) + for j := range t2 { + t2[j] = byte(i) + } + tmp = append(tmp, t2...) 
+ } + + var s huff0.Scratch + s.Reuse = huff0.ReusePolicyNone + _, _, err := huff0.Compress1X(tmp, &s) + if err != nil { + panic(err) + } + fmt.Println("table:", base64.URLEncoding.EncodeToString(s.OutTable)) + + // Encode without ones: + s.Reuse = huff0.ReusePolicyPrefer + tmp = tmp[:0] + for i, v := range histogram[:] { + nf := float64(v) / float64(total) * scale + t2 := make([]byte, int(math.Ceil(nf))) + for j := range t2 { + t2[j] = byte(i) + } + tmp = append(tmp, t2...) + } + _, _, err = huff0.Compress1X(tmp, &s) + fmt.Println("sample", len(tmp), "byte, compressed size:", len(s.OutData)) + fmt.Println("Shannon limit:", compress.ShannonEntropyBits(tmp)/8, "bytes") + if err != nil { + panic(err) + } + + fmt.Printf("avg size: 1 -> %.02f", float64(len(s.OutData))/float64(len(tmp))) +} diff --git a/lib/encoder/filename/init.go b/lib/encoder/filename/init.go new file mode 100644 index 000000000..1a7ae41ac --- /dev/null +++ b/lib/encoder/filename/init.go @@ -0,0 +1,89 @@ +package filename + +import ( + "encoding/base64" + "sync" + + "github.com/klauspost/compress/huff0" +) + +// encodeURL is base64 url encoding values. +const encodeURL = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + +// decodeMap will return x = decodeMap[encodeURL[byte(x)]] - 1 if x >= 0 and x < 64, otherwise -1 is returned. +var decodeMap [256]byte + +// maxLength is the maximum length that will be attempted to be compressed. +const maxLength = 256 + +var ( + initOnce sync.Once // Used to control init of tables. + + encTables [64]*huff0.Scratch // Encoders. + encTableLocks [64]sync.Mutex // Temporary locks for encoders since they are stateful. + decTables [64]*huff0.Decoder // Stateless decoders. +) + +const ( + tableUncompressed = 0 + tableRLE = 61 + tableCustom = 62 + tableReserved = 63 +) + +// predefined tables as base64 URL encoded string. 
+var tablesData = [64]string{
+	// No table: the payload is stored as is.
+	tableUncompressed: "",
+	// Sampled from ncw's home directory.
+	1: "MRDIEtAAMAzDMAzDSjX_ybu0w97bb-L3b2mR-rUl5LXW3lZII43kIDMzM1NXu3okgQs=",
+	// Sampled from ncw's images.
+	2: "IhDIAEAA______-Pou_4Sf5z-uS-39MVWjullFLKM7EBECs=",
+	// Sampled from ncw's Google Drive.
+	3: "JxDQAIIBMDMzMwOzbv7nJJCyd_m_9D2llCarnQX33nvvlFKEhUxAAQ==",
+	// Hexadecimal characters.
+	4: "ExDoSTD___-tfXfhJ0hKSkryTxU=",
+	// Base64 characters.
+	5: "JRDIcQf_______8PgIiIiIgINkggARHlkQwSSCCBxHFYINHdfXI=",
+
+	// Entries without a predefined table:
+	// The compressed data carries its own table.
+	tableCustom: "",
+	// Kept free for future extensions.
+	tableReserved: "",
+}
+
+// initCoders fills decodeMap and builds the coders for every entry of
+// tablesData. The work is done once; later calls return immediately.
+func initCoders() {
+	initOnce.Do(func() {
+		// Reverse lookup for the leading table character, stored as index+1.
+		for idx := 0; idx < len(encodeURL); idx++ {
+			decodeMap[encodeURL[idx]] = byte(idx) + 1
+		}
+
+		// Build an encoder/decoder pair for each predefined table.
+		for id, encoded := range tablesData {
+			if encoded == "" {
+				continue
+			}
+			raw, err := base64.URLEncoding.DecodeString(encoded)
+			if err != nil {
+				panic(err)
+			}
+			scratch, _, err := huff0.ReadTable(raw, nil)
+			if err != nil {
+				panic(err)
+			}
+
+			// The output must save at least len(in) >> 5.
+			scratch.WantLogLess = 5
+			scratch.Reuse = huff0.ReusePolicyMust
+			encTables[id] = scratch
+			decTables[id] = scratch.Decoder()
+		}
+		// The custom type gets a fresh encoder and no predefined decoder.
+		encTables[tableCustom] = &huff0.Scratch{Reuse: huff0.ReusePolicyNone}
+		decTables[tableCustom] = nil
+	})
+}