Using XXHash64 for all wcstring unordered_map/set hashing

Since we are including XXHash32/64 anyway for the wchar_t* hashing, we might as well use it. Use an arch-specific hash size and xxhash for all wcstring hashing: instead of using XXHash64 on all platforms, use the 32-bit version when running on 32-bit platforms, where XXHash64 is significantly slower than XXHash32 (and the additional precision will not be used). Additionally, manually specify wcstring_hash as the hashing method for non-const wcstring unordered_set/map instances (the const varieties don't have an in-library hash and so already use our xxhash-based specialization when calling std::hash<const wcstring>).
2025-01-20 19:12:59 +08:00 · 2017-08-19 13:47:16 -05:00 · 2017-08-19 13:47:16 -05:00 · d54fbddb11
commit d54fbddb11
parent d9f901f36d
14 changed files with 35 additions and 34 deletions
--- a/src/autoload.h
+++ b/src/autoload.h
@ -55,7 +55,7 @@ class autoload_t : public lru_cache_t<autoload_t, autoload_function_t> {
    wcstring_list_t last_path_tokenized;
    /// A table containing all the files that are currently being loaded.
    /// This is here to help prevent recursion.
-    std::unordered_set<wcstring> is_loading_set;
+    std::unordered_set<wcstring, wcstring_hash> is_loading_set;
    // Function invoked when a command is removed
    typedef void (*command_removed_function_t)(const wcstring &);
    const command_removed_function_t command_removed;
--- a/src/builtin_argparse.cpp
+++ b/src/builtin_argparse.cpp
@ -66,7 +66,7 @@ class argparse_cmd_opts_t {
    wcstring_list_t raw_exclusive_flags;
    wcstring_list_t argv;
    std::unordered_map<wchar_t, option_spec_t *> options;
-    std::unordered_map<wcstring, wchar_t> long_to_short_flag;
+    std::unordered_map<wcstring, wchar_t, wcstring_hash> long_to_short_flag;
    std::vector<std::vector<wchar_t>> exclusive_flag_sets;

    ~argparse_cmd_opts_t() {
--- a/src/common.h
+++ b/src/common.h
@ -836,15 +836,22 @@ enum {
 // Custom hash function used by unordered_map/unordered_set when key is const
 #ifndef CONST_WCSTRING_HASH
 #define CONST_WCSTRING_HASH 1
+#include "xxhash32.h"
+#include "xxhash64.h"
+inline size_t xxhash(const void *t, size_t size) {  // size is a byte count
+#if __SIZEOF_POINTER__ == __SIZEOF_INT__
+    return XXHash32::hash(t, size, 0);
+#else
+    return XXHash64::hash(t, size, 0);
+#endif
+}
+struct wcstring_hash {  // hashes the string's whole wchar_t buffer, not just its first size() bytes
+    size_t operator()(const wcstring &w) const { return xxhash(w.c_str(), w.size() * sizeof(wchar_t)); }
+};
 namespace std {
-  template <>
-  struct hash<const wcstring>
-  {
-    std::size_t operator()(const wcstring& w) const
-    {
-        std::hash<wcstring> hasher;
-        return hasher((wcstring) w);
-    }
-  };
+template <>
+struct hash<const wcstring> {
+    std::size_t operator()(const wcstring &w) const { return wcstring_hash()(w); }
+};
 }
 #endif
--- a/src/complete.cpp
+++ b/src/complete.cpp
@ -162,7 +162,7 @@ namespace std {
    template<>
    struct hash<completion_entry_t> {
        size_t operator()(const completion_entry_t &c) const {
-            std::hash<wcstring> hasher;
+            wcstring_hash hasher;  // xxhash-based functor from common.h
            return hasher((wcstring) c.cmd);
        }
    };
@ -297,7 +297,7 @@ class completer_t {

    /// Table of completions conditions that have already been tested and the corresponding test
    /// results.
-    typedef std::unordered_map<wcstring, bool> condition_cache_t;
+    typedef std::unordered_map<wcstring, bool, wcstring_hash> condition_cache_t;
    condition_cache_t condition_cache;

    enum complete_type_t { COMPLETE_DEFAULT, COMPLETE_AUTOSUGGEST };
@ -600,7 +600,7 @@ void completer_t::complete_cmd_desc(const wcstring &str) {
    wcstring lookup_cmd(L"__fish_describe_command ");
    lookup_cmd.append(escape_string(cmd_start, 1));

-    std::unordered_map<wcstring, wcstring> lookup;
+    std::unordered_map<wcstring, wcstring, wcstring_hash> lookup;

    // First locate a list of possible descriptions using a single call to apropos or a direct
    // search if we know the location of the whatis database. This can take some time on slower
@ -1557,7 +1557,7 @@ wcstring complete_print() {

 /// Completion "wrapper" support. The map goes from wrapping-command to wrapped-command-list.
 static std::mutex wrapper_lock;
-typedef std::unordered_map<wcstring, wcstring_list_t> wrapper_map_t;
+typedef std::unordered_map<wcstring, wcstring_list_t, wcstring_hash> wrapper_map_t;
 static wrapper_map_t &wrap_map() {
    ASSERT_IS_LOCKED(wrapper_lock);
    // A pointer is a little more efficient than an object as a static because we can elide the
@ -1614,7 +1614,7 @@ wcstring_list_t complete_get_wrap_chain(const wcstring &command) {
    const wrapper_map_t &wraps = wrap_map();

    wcstring_list_t result;
-    std::unordered_set<wcstring> visited;            // set of visited commands
+    std::unordered_set<wcstring, wcstring_hash> visited;  // set of visited commands
    wcstring_list_t to_visit(1, command);  // stack of remaining-to-visit commands

    wcstring target;
--- a/src/env.cpp
+++ b/src/env.cpp
@ -55,7 +55,6 @@
 #include "sanity.h"
 #include "screen.h"
 #include "wutil.h"  // IWYU pragma: keep
-#include "xxhash64.h"

 #define DEFAULT_TERM1 "ansi"
 #define DEFAULT_TERM2 "dumb"
@ -330,9 +329,7 @@ struct const_string_set_comparer {
 namespace std {
    template<>
    struct hash<const wchar_t *> {
-        size_t operator()(const wchar_t *p) const {
-            return XXHash64::hash(p, wcslen(p), 0);
-        }
+        size_t operator()(const wchar_t *p) const { return xxhash(p, wcslen(p) * sizeof(wchar_t)); }
    };
    template <>
    struct equal_to<const wchar_t *> {
--- a/src/env_universal_common.h
+++ b/src/env_universal_common.h
@ -34,7 +34,7 @@ class env_universal_t {

    // Keys that have been modified, and need to be written. A value here that is not present in
    // vars indicates a deleted value.
-    std::unordered_set<wcstring> modified;
+    std::unordered_set<wcstring, wcstring_hash> modified;

    // Path that we save to. If empty, use the default.
    const wcstring explicit_vars_path;
--- a/src/highlight.cpp
+++ b/src/highlight.cpp
@ -67,7 +67,7 @@ static const wchar_t *const highlight_var[] = {L"fish_color_normal",
 /// Returns:
 ///     false: the filesystem is not case insensitive
 ///     true: the file system is case insensitive
-typedef std::unordered_map<wcstring, bool> case_sensitivity_cache_t;
+typedef std::unordered_map<wcstring, bool, wcstring_hash> case_sensitivity_cache_t;
 bool fs_is_case_insensitive(const wcstring &path, int fd,
                            case_sensitivity_cache_t &case_sensitivity_cache) {
    bool result = false;
@ -146,7 +146,7 @@ bool is_potential_path(const wcstring &potential_path_fragment, const wcstring_l

    // Don't test the same path multiple times, which can happen if the path is absolute and the
    // CDPATH contains multiple entries.
-    std::unordered_set<wcstring> checked_paths;
+    std::unordered_set<wcstring, wcstring_hash> checked_paths;

    // Keep a cache of which paths / filesystems are case sensitive.
    case_sensitivity_cache_t case_sensitivity_cache;
--- a/src/history.h
+++ b/src/history.h
@ -139,7 +139,7 @@ class history_t {
    uint32_t disable_automatic_save_counter;

    // Deleted item contents.
-    std::unordered_set<wcstring> deleted_items;
+    std::unordered_set<wcstring, wcstring_hash> deleted_items;

    // The mmaped region for the history file.
    const char *mmap_start;
--- a/src/lru.h
+++ b/src/lru.h
@ -45,7 +45,7 @@ class lru_cache_t {
        explicit lru_node_t(const CONTENTS &v) : value(std::move(v)) {}
    };

-    typedef typename std::unordered_map<wcstring, lru_node_t>::iterator node_iter_t;
+    typedef typename std::unordered_map<wcstring, lru_node_t, wcstring_hash>::iterator node_iter_t;

    // Max node count. This may be (transiently) exceeded by add_node_without_eviction, which is
    // used from background threads.
@ -54,7 +54,7 @@ class lru_cache_t {
    // All of our nodes
    // Note that our linked list contains pointers to these nodes in the map
    // We are dependent on the iterator-noninvalidation guarantees of std::map
-    std::unordered_map<wcstring, lru_node_t> node_map;
+    std::unordered_map<wcstring, lru_node_t, wcstring_hash> node_map;

    // Head of the linked list
    // The list is circular!
--- a/src/pager.cpp
+++ b/src/pager.cpp
@ -267,7 +267,7 @@ static void mangle_1_completion_description(wcstring *str) {
 static void join_completions(comp_info_list_t *comps) {
    // A map from description to index in the completion list of the element with that description.
    // The indexes are stored +1.
-    std::unordered_map<wcstring, size_t> desc_table;
+    std::unordered_map<wcstring, size_t, wcstring_hash> desc_table;

    // Note that we mutate the completion list as we go, so the size changes.
    for (size_t i = 0; i < comps->size(); i++) {
--- a/src/screen.h
+++ b/src/screen.h
@ -203,7 +203,7 @@ size_t escape_code_length(const wchar_t *code);
 class cached_esc_sequences_t {
   private:
    // Cached escape sequences we've already detected in the prompt and similar strings.
-    std::unordered_set<wcstring> cache;
+    std::unordered_set<wcstring, wcstring_hash> cache;
    // The escape sequence lengths we've cached. My original implementation used min and max
    // length variables. The cache was then iterated over using a loop like this:
    // `for (size_t l = min; l <= max; l++)`.
--- a/src/wildcard.cpp
+++ b/src/wildcard.cpp
@ -439,7 +439,7 @@ class wildcard_expander_t {
    // The working directory to resolve paths against
    const wcstring working_directory;
    // The set of items we have resolved, used to efficiently avoid duplication.
-    std::unordered_set<wcstring> completion_set;
+    std::unordered_set<wcstring, wcstring_hash> completion_set;
    // The set of file IDs we have visited, used to avoid symlink loops.
    std::unordered_set<file_id_t> visited_files;
    // Flags controlling expansion.
--- a/src/wutil.cpp
+++ b/src/wutil.cpp
@ -38,7 +38,7 @@ const file_id_t kInvalidFileID = {(dev_t)-1LL, (ino_t)-1LL, (uint64_t)-1LL, -1,
 #endif

 /// Map used as cache by wgettext.
-static owning_lock<std::unordered_map<wcstring, wcstring>> wgettext_map;
+static owning_lock<std::unordered_map<wcstring, wcstring, wcstring_hash>> wgettext_map;

 bool wreaddir_resolving(DIR *dir, const wcstring &dir_path, wcstring &out_name, bool *out_is_dir) {
    struct dirent d;
--- a/src/wutil.h
+++ b/src/wutil.h
@ -145,13 +145,10 @@ struct file_id_t {

 #ifndef HASH_FILE_ID
 #define HASH_FILE_ID 1
-#include "xxhash64.h"
 namespace std {
    template<>
    struct hash<file_id_t> {
-        size_t operator()(const file_id_t &f) const {
-            return XXHash64::hash(&f, sizeof(f), 0);
-        }
+        size_t operator()(const file_id_t &f) const { return xxhash(&f, sizeof(f)); }  // hashes f's raw bytes; NOTE(review): confirm file_id_t has no padding
    };
 }
 #endif