Migrate string_fuzzy_match from common.h to wcstringutil.h

This is a more appropriate location for this functionality.
Also take this opportunity to clean up subsequence_in_string.
This commit is contained in:
ridiculousfish 2020-11-27 15:43:07 -08:00
parent 639cd66ba1
commit 9144141ded
5 changed files with 185 additions and 187 deletions

View File

@ -1634,101 +1634,6 @@ bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t e
return success;
}
/// Returns true if seq, represented as a subsequence, is contained within string.
static bool subsequence_in_string(const wcstring &seq, const wcstring &str) {
// Impossible if seq is larger than string.
if (seq.size() > str.size()) {
return false;
}
// Empty strings are considered to be subsequences of everything.
if (seq.empty()) {
return true;
}
size_t str_idx, seq_idx;
for (seq_idx = str_idx = 0; seq_idx < seq.size() && str_idx < str.size(); seq_idx++) {
wchar_t c = seq.at(seq_idx);
size_t char_loc = str.find(c, str_idx);
if (char_loc == wcstring::npos) {
break; // didn't find this character
} else {
str_idx = char_loc + 1; // we found it, continue the search just after it
}
}
// We succeeded if we exhausted our sequence.
assert(seq_idx <= seq.size());
return seq_idx == seq.size();
}
string_fuzzy_match_t::string_fuzzy_match_t(enum fuzzy_match_type_t t, size_t distance_first,
size_t distance_second)
: type(t), match_distance_first(distance_first), match_distance_second(distance_second) {}
string_fuzzy_match_t string_fuzzy_match_string(const wcstring &string,
const wcstring &match_against,
fuzzy_match_type_t limit_type) {
// Distances are generally the amount of text not matched.
string_fuzzy_match_t result(fuzzy_match_none, 0, 0);
size_t location;
if (limit_type >= fuzzy_match_exact && string == match_against) {
result.type = fuzzy_match_exact;
} else if (limit_type >= fuzzy_match_prefix && string_prefixes_string(string, match_against)) {
result.type = fuzzy_match_prefix;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
} else if (limit_type >= fuzzy_match_case_insensitive &&
wcscasecmp(string.c_str(), match_against.c_str()) == 0) {
result.type = fuzzy_match_case_insensitive;
} else if (limit_type >= fuzzy_match_prefix_case_insensitive &&
string_prefixes_string_case_insensitive(string, match_against)) {
result.type = fuzzy_match_prefix_case_insensitive;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
} else if (limit_type >= fuzzy_match_substring &&
(location = match_against.find(string)) != wcstring::npos) {
// String is contained within match against.
result.type = fuzzy_match_substring;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
result.match_distance_second = location; // prefer earlier matches
} else if (limit_type >= fuzzy_match_substring_case_insensitive &&
(location = ifind(match_against, string, true)) != wcstring::npos) {
// A case-insensitive version of the string is in the match against.
result.type = fuzzy_match_substring_case_insensitive;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
result.match_distance_second = location; // prefer earlier matches
} else if (limit_type >= fuzzy_match_subsequence_insertions_only &&
subsequence_in_string(string, match_against)) {
result.type = fuzzy_match_subsequence_insertions_only;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
// It would be nice to prefer matches with greater matching runs here.
}
return result;
}
template <typename T>
static inline int compare_ints(T a, T b) {
if (a < b) return -1;
if (a == b) return 0;
return 1;
}
/// Compare types; if the types match, compare distances.
int string_fuzzy_match_t::compare(const string_fuzzy_match_t &rhs) const {
if (this->type != rhs.type) {
return compare_ints(this->type, rhs.type);
} else if (this->match_distance_first != rhs.match_distance_first) {
return compare_ints(this->match_distance_first, rhs.match_distance_first);
} else if (this->match_distance_second != rhs.match_distance_second) {
return compare_ints(this->match_distance_second, rhs.match_distance_second);
}
return 0; // equal
}
[[gnu::noinline]] void bugreport() {
FLOG(error, _(L"This is a bug. Break on 'bugreport' to debug."));
FLOG(error, _(L"If you can reproduce it, please report: "), PACKAGE_BUGREPORT, L'.');

View File

@ -288,98 +288,6 @@ wcstring str2wcstring(const std::string &in, size_t len);
/// area.
std::string wcs2string(const wcstring &input);
enum fuzzy_match_type_t {
// We match the string exactly: FOOBAR matches FOOBAR.
fuzzy_match_exact = 0,
// We match a prefix of the string: FO matches FOOBAR.
fuzzy_match_prefix,
// We match the string exactly, but in a case insensitive way: foobar matches FOOBAR.
fuzzy_match_case_insensitive,
// We match a prefix of the string, in a case insensitive way: foo matches FOOBAR.
fuzzy_match_prefix_case_insensitive,
// We match a substring of the string: OOBA matches FOOBAR.
fuzzy_match_substring,
// We match a substring of the string: ooBA matches FOOBAR.
fuzzy_match_substring_case_insensitive,
// A subsequence match with insertions only: FBR matches FOOBAR.
fuzzy_match_subsequence_insertions_only,
// We don't match the string.
fuzzy_match_none
};
/// Indicates where a match type requires replacing the entire token.
static inline bool match_type_requires_full_replacement(fuzzy_match_type_t t) {
switch (t) {
case fuzzy_match_exact:
case fuzzy_match_prefix: {
return false;
}
case fuzzy_match_case_insensitive:
case fuzzy_match_prefix_case_insensitive:
case fuzzy_match_substring:
case fuzzy_match_substring_case_insensitive:
case fuzzy_match_subsequence_insertions_only:
case fuzzy_match_none: {
return true;
}
default: {
DIE("Unreachable");
return false;
}
}
}
/// Indicates where a match shares a prefix with the string it matches.
static inline bool match_type_shares_prefix(fuzzy_match_type_t t) {
switch (t) {
case fuzzy_match_exact:
case fuzzy_match_prefix:
case fuzzy_match_case_insensitive:
case fuzzy_match_prefix_case_insensitive: {
return true;
}
case fuzzy_match_substring:
case fuzzy_match_substring_case_insensitive:
case fuzzy_match_subsequence_insertions_only:
case fuzzy_match_none: {
return false;
}
default: {
DIE("Unreachabe");
return false;
}
}
}
/// Test if string is a fuzzy match to another.
struct string_fuzzy_match_t {
enum fuzzy_match_type_t type;
// Strength of the match. The value depends on the type. Lower is stronger.
size_t match_distance_first;
size_t match_distance_second;
// Constructor.
explicit string_fuzzy_match_t(enum fuzzy_match_type_t t, size_t distance_first = 0,
size_t distance_second = 0);
// Return -1, 0, 1 if this match is (respectively) better than, equal to, or worse than rhs.
int compare(const string_fuzzy_match_t &rhs) const;
};
/// Compute a fuzzy match for a string. If maximum_match is not fuzzy_match_none, limit the type to
/// matches at or below that type.
string_fuzzy_match_t string_fuzzy_match_string(const wcstring &string,
const wcstring &match_against,
fuzzy_match_type_t limit_type = fuzzy_match_none);
// Check if we are running in the test mode, where we should suppress error output
#define TESTS_PROGRAM_NAME L"(ignore)"
bool should_suppress_stderr_for_tests();

View File

@ -12,6 +12,7 @@
#include "common.h"
#include "enum_set.h"
#include "wcstringutil.h"
struct completion_mode_t {
/// If set, skip file completions.

View File

@ -130,6 +130,97 @@ bool string_suffixes_string_case_insensitive(const wcstring &proposed_suffix,
proposed_suffix.c_str(), suffix_size) == 0;
}
/// Returns true if needle, represented as a subsequence, is contained within haystack.
/// Note subsequence is not substring: "foo" is a subsequence of "follow" for example.
static bool subsequence_in_string(const wcstring &needle, const wcstring &haystack) {
// Impossible if haystack is larger than string.
if (haystack.size() > haystack.size()) {
return false;
}
// Empty strings are considered to be subsequences of everything.
if (needle.empty()) {
return true;
}
auto ni = needle.begin();
for (auto hi = haystack.begin(); ni != needle.end() && hi != haystack.end(); ++hi) {
if (*ni == *hi) {
++ni;
}
}
// We succeeded if we exhausted our sequence.
assert(ni <= needle.end());
return ni == needle.end();
}
string_fuzzy_match_t::string_fuzzy_match_t(enum fuzzy_match_type_t t, size_t distance_first,
size_t distance_second)
: type(t), match_distance_first(distance_first), match_distance_second(distance_second) {}
string_fuzzy_match_t string_fuzzy_match_string(const wcstring &string,
const wcstring &match_against,
fuzzy_match_type_t limit_type) {
// Distances are generally the amount of text not matched.
string_fuzzy_match_t result(fuzzy_match_none, 0, 0);
size_t location;
if (limit_type >= fuzzy_match_exact && string == match_against) {
result.type = fuzzy_match_exact;
} else if (limit_type >= fuzzy_match_prefix && string_prefixes_string(string, match_against)) {
result.type = fuzzy_match_prefix;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
} else if (limit_type >= fuzzy_match_case_insensitive &&
wcscasecmp(string.c_str(), match_against.c_str()) == 0) {
result.type = fuzzy_match_case_insensitive;
} else if (limit_type >= fuzzy_match_prefix_case_insensitive &&
string_prefixes_string_case_insensitive(string, match_against)) {
result.type = fuzzy_match_prefix_case_insensitive;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
} else if (limit_type >= fuzzy_match_substring &&
(location = match_against.find(string)) != wcstring::npos) {
// String is contained within match against.
result.type = fuzzy_match_substring;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
result.match_distance_second = location; // prefer earlier matches
} else if (limit_type >= fuzzy_match_substring_case_insensitive &&
(location = ifind(match_against, string, true)) != wcstring::npos) {
// A case-insensitive version of the string is in the match against.
result.type = fuzzy_match_substring_case_insensitive;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
result.match_distance_second = location; // prefer earlier matches
} else if (limit_type >= fuzzy_match_subsequence_insertions_only &&
subsequence_in_string(string, match_against)) {
result.type = fuzzy_match_subsequence_insertions_only;
assert(match_against.size() >= string.size());
result.match_distance_first = match_against.size() - string.size();
// It would be nice to prefer matches with greater matching runs here.
}
return result;
}
template <typename T>
static inline int compare_ints(T a, T b) {
if (a < b) return -1;
if (a == b) return 0;
return 1;
}
/// Compare types; if the types match, compare distances.
int string_fuzzy_match_t::compare(const string_fuzzy_match_t &rhs) const {
if (this->type != rhs.type) {
return compare_ints(this->type, rhs.type);
} else if (this->match_distance_first != rhs.match_distance_first) {
return compare_ints(this->match_distance_first, rhs.match_distance_first);
} else if (this->match_distance_second != rhs.match_distance_second) {
return compare_ints(this->match_distance_second, rhs.match_distance_second);
}
return 0; // equal
}
template <bool Fuzzy, typename T>
size_t ifind_impl(const T &haystack, const T &needle) {
using char_t = typename T::value_type;

View File

@ -35,6 +35,99 @@ bool string_prefixes_string_case_insensitive(const wcstring &proposed_prefix,
size_t ifind(const wcstring &haystack, const wcstring &needle, bool fuzzy = false);
size_t ifind(const std::string &haystack, const std::string &needle, bool fuzzy = false);
// Ways that a string may fuzzily match another.
enum fuzzy_match_type_t {
// We match the string exactly: FOOBAR matches FOOBAR.
fuzzy_match_exact = 0,
// We match a prefix of the string: FO matches FOOBAR.
fuzzy_match_prefix,
// We match the string exactly, but in a case insensitive way: foobar matches FOOBAR.
fuzzy_match_case_insensitive,
// We match a prefix of the string, in a case insensitive way: foo matches FOOBAR.
fuzzy_match_prefix_case_insensitive,
// We match a substring of the string: OOBA matches FOOBAR.
fuzzy_match_substring,
// We match a substring of the string: ooBA matches FOOBAR.
fuzzy_match_substring_case_insensitive,
// A subsequence match with insertions only: FBR matches FOOBAR.
fuzzy_match_subsequence_insertions_only,
// We don't match the string.
fuzzy_match_none
};
/// Indicates where a match type requires replacing the entire token.
static inline bool match_type_requires_full_replacement(fuzzy_match_type_t t) {
switch (t) {
case fuzzy_match_exact:
case fuzzy_match_prefix: {
return false;
}
case fuzzy_match_case_insensitive:
case fuzzy_match_prefix_case_insensitive:
case fuzzy_match_substring:
case fuzzy_match_substring_case_insensitive:
case fuzzy_match_subsequence_insertions_only:
case fuzzy_match_none: {
return true;
}
default: {
DIE("Unreachable");
return false;
}
}
}
/// Indicates where a match shares a prefix with the string it matches.
static inline bool match_type_shares_prefix(fuzzy_match_type_t t) {
switch (t) {
case fuzzy_match_exact:
case fuzzy_match_prefix:
case fuzzy_match_case_insensitive:
case fuzzy_match_prefix_case_insensitive: {
return true;
}
case fuzzy_match_substring:
case fuzzy_match_substring_case_insensitive:
case fuzzy_match_subsequence_insertions_only:
case fuzzy_match_none: {
return false;
}
default: {
DIE("Unreachable");
return false;
}
}
}
/// Test if string is a fuzzy match to another.
struct string_fuzzy_match_t {
enum fuzzy_match_type_t type;
// Strength of the match. The value depends on the type. Lower is stronger.
size_t match_distance_first;
size_t match_distance_second;
// Constructor.
explicit string_fuzzy_match_t(enum fuzzy_match_type_t t, size_t distance_first = 0,
size_t distance_second = 0);
// Return -1, 0, 1 if this match is (respectively) better than, equal to, or worse than rhs.
int compare(const string_fuzzy_match_t &rhs) const;
};
/// Compute a fuzzy match for a string. If maximum_match is not fuzzy_match_none, limit the type to
/// matches at or below that type.
string_fuzzy_match_t string_fuzzy_match_string(const wcstring &string,
const wcstring &match_against,
fuzzy_match_type_t limit_type = fuzzy_match_none);
/// Split a string by a separator character.
wcstring_list_t split_string(const wcstring &val, wchar_t sep);