From 9144141ded16ce2f93de9373eac4550195f5acc2 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Fri, 27 Nov 2020 15:43:07 -0800 Subject: [PATCH] Migrate string_fuzzy_match from common.h to wcstringutil.h This is a more appropriate location for this functionality. Also take this opportunity to clean up subsequence_in_string. --- src/common.cpp | 95 -------------------------------------------- src/common.h | 92 ------------------------------------------ src/complete.h | 1 + src/wcstringutil.cpp | 91 ++++++++++++++++++++++++++++++++++++++++++ src/wcstringutil.h | 93 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 185 insertions(+), 187 deletions(-) diff --git a/src/common.cpp b/src/common.cpp index 8e0e8f03c..d7b981d2b 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1634,101 +1634,6 @@ bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t e return success; } -/// Returns true if seq, represented as a subsequence, is contained within string. -static bool subsequence_in_string(const wcstring &seq, const wcstring &str) { - // Impossible if seq is larger than string. - if (seq.size() > str.size()) { - return false; - } - - // Empty strings are considered to be subsequences of everything. - if (seq.empty()) { - return true; - } - - size_t str_idx, seq_idx; - for (seq_idx = str_idx = 0; seq_idx < seq.size() && str_idx < str.size(); seq_idx++) { - wchar_t c = seq.at(seq_idx); - size_t char_loc = str.find(c, str_idx); - if (char_loc == wcstring::npos) { - break; // didn't find this character - } else { - str_idx = char_loc + 1; // we found it, continue the search just after it - } - } - - // We succeeded if we exhausted our sequence. 
- assert(seq_idx <= seq.size()); - return seq_idx == seq.size(); -} - -string_fuzzy_match_t::string_fuzzy_match_t(enum fuzzy_match_type_t t, size_t distance_first, - size_t distance_second) - : type(t), match_distance_first(distance_first), match_distance_second(distance_second) {} - -string_fuzzy_match_t string_fuzzy_match_string(const wcstring &string, - const wcstring &match_against, - fuzzy_match_type_t limit_type) { - // Distances are generally the amount of text not matched. - string_fuzzy_match_t result(fuzzy_match_none, 0, 0); - size_t location; - if (limit_type >= fuzzy_match_exact && string == match_against) { - result.type = fuzzy_match_exact; - } else if (limit_type >= fuzzy_match_prefix && string_prefixes_string(string, match_against)) { - result.type = fuzzy_match_prefix; - assert(match_against.size() >= string.size()); - result.match_distance_first = match_against.size() - string.size(); - } else if (limit_type >= fuzzy_match_case_insensitive && - wcscasecmp(string.c_str(), match_against.c_str()) == 0) { - result.type = fuzzy_match_case_insensitive; - } else if (limit_type >= fuzzy_match_prefix_case_insensitive && - string_prefixes_string_case_insensitive(string, match_against)) { - result.type = fuzzy_match_prefix_case_insensitive; - assert(match_against.size() >= string.size()); - result.match_distance_first = match_against.size() - string.size(); - } else if (limit_type >= fuzzy_match_substring && - (location = match_against.find(string)) != wcstring::npos) { - // String is contained within match against. - result.type = fuzzy_match_substring; - assert(match_against.size() >= string.size()); - result.match_distance_first = match_against.size() - string.size(); - result.match_distance_second = location; // prefer earlier matches - } else if (limit_type >= fuzzy_match_substring_case_insensitive && - (location = ifind(match_against, string, true)) != wcstring::npos) { - // A case-insensitive version of the string is in the match against. 
- result.type = fuzzy_match_substring_case_insensitive; - assert(match_against.size() >= string.size()); - result.match_distance_first = match_against.size() - string.size(); - result.match_distance_second = location; // prefer earlier matches - } else if (limit_type >= fuzzy_match_subsequence_insertions_only && - subsequence_in_string(string, match_against)) { - result.type = fuzzy_match_subsequence_insertions_only; - assert(match_against.size() >= string.size()); - result.match_distance_first = match_against.size() - string.size(); - // It would be nice to prefer matches with greater matching runs here. - } - return result; -} - -template <typename T> -static inline int compare_ints(T a, T b) { - if (a < b) return -1; - if (a == b) return 0; - return 1; -} - -/// Compare types; if the types match, compare distances. -int string_fuzzy_match_t::compare(const string_fuzzy_match_t &rhs) const { - if (this->type != rhs.type) { - return compare_ints(this->type, rhs.type); - } else if (this->match_distance_first != rhs.match_distance_first) { - return compare_ints(this->match_distance_first, rhs.match_distance_first); - } else if (this->match_distance_second != rhs.match_distance_second) { - return compare_ints(this->match_distance_second, rhs.match_distance_second); - } - return 0; // equal -} - [[gnu::noinline]] void bugreport() { FLOG(error, _(L"This is a bug. Break on 'bugreport' to debug.")); FLOG(error, _(L"If you can reproduce it, please report: "), PACKAGE_BUGREPORT, L'.'); diff --git a/src/common.h b/src/common.h index ea4be2e14..2fb08d53c 100644 --- a/src/common.h +++ b/src/common.h @@ -288,98 +288,6 @@ wcstring str2wcstring(const std::string &in, size_t len); /// area. std::string wcs2string(const wcstring &input); -enum fuzzy_match_type_t { - // We match the string exactly: FOOBAR matches FOOBAR. - fuzzy_match_exact = 0, - - // We match a prefix of the string: FO matches FOOBAR.
- fuzzy_match_prefix, - - // We match the string exactly, but in a case insensitive way: foobar matches FOOBAR. - fuzzy_match_case_insensitive, - - // We match a prefix of the string, in a case insensitive way: foo matches FOOBAR. - fuzzy_match_prefix_case_insensitive, - - // We match a substring of the string: OOBA matches FOOBAR. - fuzzy_match_substring, - - // We match a substring of the string: ooBA matches FOOBAR. - fuzzy_match_substring_case_insensitive, - - // A subsequence match with insertions only: FBR matches FOOBAR. - fuzzy_match_subsequence_insertions_only, - - // We don't match the string. - fuzzy_match_none -}; - -/// Indicates where a match type requires replacing the entire token. -static inline bool match_type_requires_full_replacement(fuzzy_match_type_t t) { - switch (t) { - case fuzzy_match_exact: - case fuzzy_match_prefix: { - return false; - } - case fuzzy_match_case_insensitive: - case fuzzy_match_prefix_case_insensitive: - case fuzzy_match_substring: - case fuzzy_match_substring_case_insensitive: - case fuzzy_match_subsequence_insertions_only: - case fuzzy_match_none: { - return true; - } - default: { - DIE("Unreachable"); - return false; - } - } -} - -/// Indicates where a match shares a prefix with the string it matches. -static inline bool match_type_shares_prefix(fuzzy_match_type_t t) { - switch (t) { - case fuzzy_match_exact: - case fuzzy_match_prefix: - case fuzzy_match_case_insensitive: - case fuzzy_match_prefix_case_insensitive: { - return true; - } - case fuzzy_match_substring: - case fuzzy_match_substring_case_insensitive: - case fuzzy_match_subsequence_insertions_only: - case fuzzy_match_none: { - return false; - } - default: { - DIE("Unreachabe"); - return false; - } - } -} - -/// Test if string is a fuzzy match to another. -struct string_fuzzy_match_t { - enum fuzzy_match_type_t type; - - // Strength of the match. The value depends on the type. Lower is stronger. 
- size_t match_distance_first; - size_t match_distance_second; - - // Constructor. - explicit string_fuzzy_match_t(enum fuzzy_match_type_t t, size_t distance_first = 0, - size_t distance_second = 0); - - // Return -1, 0, 1 if this match is (respectively) better than, equal to, or worse than rhs. - int compare(const string_fuzzy_match_t &rhs) const; -}; - -/// Compute a fuzzy match for a string. If maximum_match is not fuzzy_match_none, limit the type to -/// matches at or below that type. -string_fuzzy_match_t string_fuzzy_match_string(const wcstring &string, - const wcstring &match_against, - fuzzy_match_type_t limit_type = fuzzy_match_none); - // Check if we are running in the test mode, where we should suppress error output #define TESTS_PROGRAM_NAME L"(ignore)" bool should_suppress_stderr_for_tests(); diff --git a/src/complete.h b/src/complete.h index c060177d0..164702a9d 100644 --- a/src/complete.h +++ b/src/complete.h @@ -12,6 +12,7 @@ #include "common.h" #include "enum_set.h" +#include "wcstringutil.h" struct completion_mode_t { /// If set, skip file completions. diff --git a/src/wcstringutil.cpp b/src/wcstringutil.cpp index a433f6fba..1dde6d359 100644 --- a/src/wcstringutil.cpp +++ b/src/wcstringutil.cpp @@ -130,6 +130,97 @@ bool string_suffixes_string_case_insensitive(const wcstring &proposed_suffix, proposed_suffix.c_str(), suffix_size) == 0; } +/// Returns true if needle, represented as a subsequence, is contained within haystack. +/// Note subsequence is not substring: "foo" is a subsequence of "follow" for example. +static bool subsequence_in_string(const wcstring &needle, const wcstring &haystack) { + // Impossible if needle is larger than haystack. + if (needle.size() > haystack.size()) { + return false; + } + + // Empty strings are considered to be subsequences of everything.
+ if (needle.empty()) { + return true; + } + + auto ni = needle.begin(); + for (auto hi = haystack.begin(); ni != needle.end() && hi != haystack.end(); ++hi) { + if (*ni == *hi) { + ++ni; + } + } + // We succeeded if we exhausted our sequence. + assert(ni <= needle.end()); + return ni == needle.end(); +} + +string_fuzzy_match_t::string_fuzzy_match_t(enum fuzzy_match_type_t t, size_t distance_first, + size_t distance_second) + : type(t), match_distance_first(distance_first), match_distance_second(distance_second) {} + +string_fuzzy_match_t string_fuzzy_match_string(const wcstring &string, + const wcstring &match_against, + fuzzy_match_type_t limit_type) { + // Distances are generally the amount of text not matched. + string_fuzzy_match_t result(fuzzy_match_none, 0, 0); + size_t location; + if (limit_type >= fuzzy_match_exact && string == match_against) { + result.type = fuzzy_match_exact; + } else if (limit_type >= fuzzy_match_prefix && string_prefixes_string(string, match_against)) { + result.type = fuzzy_match_prefix; + assert(match_against.size() >= string.size()); + result.match_distance_first = match_against.size() - string.size(); + } else if (limit_type >= fuzzy_match_case_insensitive && + wcscasecmp(string.c_str(), match_against.c_str()) == 0) { + result.type = fuzzy_match_case_insensitive; + } else if (limit_type >= fuzzy_match_prefix_case_insensitive && + string_prefixes_string_case_insensitive(string, match_against)) { + result.type = fuzzy_match_prefix_case_insensitive; + assert(match_against.size() >= string.size()); + result.match_distance_first = match_against.size() - string.size(); + } else if (limit_type >= fuzzy_match_substring && + (location = match_against.find(string)) != wcstring::npos) { + // String is contained within match against. 
+ result.type = fuzzy_match_substring; + assert(match_against.size() >= string.size()); + result.match_distance_first = match_against.size() - string.size(); + result.match_distance_second = location; // prefer earlier matches + } else if (limit_type >= fuzzy_match_substring_case_insensitive && + (location = ifind(match_against, string, true)) != wcstring::npos) { + // A case-insensitive version of the string is in the match against. + result.type = fuzzy_match_substring_case_insensitive; + assert(match_against.size() >= string.size()); + result.match_distance_first = match_against.size() - string.size(); + result.match_distance_second = location; // prefer earlier matches + } else if (limit_type >= fuzzy_match_subsequence_insertions_only && + subsequence_in_string(string, match_against)) { + result.type = fuzzy_match_subsequence_insertions_only; + assert(match_against.size() >= string.size()); + result.match_distance_first = match_against.size() - string.size(); + // It would be nice to prefer matches with greater matching runs here. + } + return result; +} + +template <typename T> +static inline int compare_ints(T a, T b) { + if (a < b) return -1; + if (a == b) return 0; + return 1; +} + +/// Compare types; if the types match, compare distances.
+int string_fuzzy_match_t::compare(const string_fuzzy_match_t &rhs) const { + if (this->type != rhs.type) { + return compare_ints(this->type, rhs.type); + } else if (this->match_distance_first != rhs.match_distance_first) { + return compare_ints(this->match_distance_first, rhs.match_distance_first); + } else if (this->match_distance_second != rhs.match_distance_second) { + return compare_ints(this->match_distance_second, rhs.match_distance_second); + } + return 0; // equal +} + template <typename T> size_t ifind_impl(const T &haystack, const T &needle) { using char_t = typename T::value_type; diff --git a/src/wcstringutil.h b/src/wcstringutil.h index 63dd91292..e572f242c 100644 --- a/src/wcstringutil.h +++ b/src/wcstringutil.h @@ -35,6 +35,99 @@ bool string_prefixes_string_case_insensitive(const wcstring &proposed_prefix, size_t ifind(const wcstring &haystack, const wcstring &needle, bool fuzzy = false); size_t ifind(const std::string &haystack, const std::string &needle, bool fuzzy = false); +// Ways that a string may fuzzily match another. +enum fuzzy_match_type_t { + // We match the string exactly: FOOBAR matches FOOBAR. + fuzzy_match_exact = 0, + + // We match a prefix of the string: FO matches FOOBAR. + fuzzy_match_prefix, + + // We match the string exactly, but in a case insensitive way: foobar matches FOOBAR. + fuzzy_match_case_insensitive, + + // We match a prefix of the string, in a case insensitive way: foo matches FOOBAR. + fuzzy_match_prefix_case_insensitive, + + // We match a substring of the string: OOBA matches FOOBAR. + fuzzy_match_substring, + + // We match a substring of the string: ooBA matches FOOBAR. + fuzzy_match_substring_case_insensitive, + + // A subsequence match with insertions only: FBR matches FOOBAR. + fuzzy_match_subsequence_insertions_only, + + // We don't match the string. + fuzzy_match_none +}; + +/// Indicates where a match type requires replacing the entire token.
+static inline bool match_type_requires_full_replacement(fuzzy_match_type_t t) { + switch (t) { + case fuzzy_match_exact: + case fuzzy_match_prefix: { + return false; + } + case fuzzy_match_case_insensitive: + case fuzzy_match_prefix_case_insensitive: + case fuzzy_match_substring: + case fuzzy_match_substring_case_insensitive: + case fuzzy_match_subsequence_insertions_only: + case fuzzy_match_none: { + return true; + } + default: { + DIE("Unreachable"); + return false; + } + } +} + +/// Indicates where a match shares a prefix with the string it matches. +static inline bool match_type_shares_prefix(fuzzy_match_type_t t) { + switch (t) { + case fuzzy_match_exact: + case fuzzy_match_prefix: + case fuzzy_match_case_insensitive: + case fuzzy_match_prefix_case_insensitive: { + return true; + } + case fuzzy_match_substring: + case fuzzy_match_substring_case_insensitive: + case fuzzy_match_subsequence_insertions_only: + case fuzzy_match_none: { + return false; + } + default: { + DIE("Unreachable"); + return false; + } + } +} + +/// Test if string is a fuzzy match to another. +struct string_fuzzy_match_t { + enum fuzzy_match_type_t type; + + // Strength of the match. The value depends on the type. Lower is stronger. + size_t match_distance_first; + size_t match_distance_second; + + // Constructor. + explicit string_fuzzy_match_t(enum fuzzy_match_type_t t, size_t distance_first = 0, + size_t distance_second = 0); + + // Return -1, 0, 1 if this match is (respectively) better than, equal to, or worse than rhs. + int compare(const string_fuzzy_match_t &rhs) const; +}; + +/// Compute a fuzzy match for a string. If maximum_match is not fuzzy_match_none, limit the type to +/// matches at or below that type. +string_fuzzy_match_t string_fuzzy_match_string(const wcstring &string, + const wcstring &match_against, + fuzzy_match_type_t limit_type = fuzzy_match_none); + /// Split a string by a separator character. wcstring_list_t split_string(const wcstring &val, wchar_t sep);