2016-05-04 06:18:24 +08:00
|
|
|
// Helper functions for working with wcstring.
|
2014-09-22 10:18:56 +08:00
|
|
|
#ifndef FISH_WCSTRINGUTIL_H
|
|
|
|
#define FISH_WCSTRINGUTIL_H
|
|
|
|
|
2017-07-27 21:05:35 +08:00
|
|
|
#include <algorithm>
|
2020-07-30 08:16:51 +08:00
|
|
|
#include <cstring>
|
2015-07-25 23:14:25 +08:00
|
|
|
#include <string>
|
2014-09-22 10:18:56 +08:00
|
|
|
#include <utility>
|
2016-04-21 14:00:54 +08:00
|
|
|
|
2014-09-22 10:18:56 +08:00
|
|
|
#include "common.h"
|
2020-07-30 08:16:51 +08:00
|
|
|
#include "expand.h"
|
2014-09-22 10:18:56 +08:00
|
|
|
|
2020-01-16 05:16:43 +08:00
|
|
|
/// Test if a string prefixes another. Returns true if proposed_prefix is a prefix of value.
|
|
|
|
bool string_prefixes_string(const wcstring &proposed_prefix, const wcstring &value);
|
|
|
|
bool string_prefixes_string(const wchar_t *proposed_prefix, const wcstring &value);
|
|
|
|
bool string_prefixes_string(const wchar_t *proposed_prefix, const wchar_t *value);
|
|
|
|
bool string_prefixes_string(const char *proposed_prefix, const std::string &value);
|
|
|
|
bool string_prefixes_string(const char *proposed_prefix, const char *value);
|
|
|
|
|
|
|
|
/// Test if a string is a suffix of another. Returns true if proposed_suffix is a suffix of value.
|
|
|
|
bool string_suffixes_string(const wcstring &proposed_suffix, const wcstring &value);
|
|
|
|
bool string_suffixes_string(const wchar_t *proposed_suffix, const wcstring &value);
|
|
|
|
bool string_suffixes_string_case_insensitive(const wcstring &proposed_suffix,
|
|
|
|
const wcstring &value);
|
|
|
|
|
|
|
|
/// Test if a string prefixes another without regard to case. Returns true if proposed_prefix is
/// a prefix of value.
|
|
|
|
bool string_prefixes_string_case_insensitive(const wcstring &proposed_prefix,
|
|
|
|
const wcstring &value);
|
|
|
|
|
|
|
|
/// Case-insensitive string search, modeled after std::string::find().
|
|
|
|
/// \param fuzzy indicates this is being used for fuzzy matching and case insensitivity is
|
|
|
|
/// expanded to include symbolic characters (#3584).
|
|
|
|
/// \return the offset of the first case-insensitive matching instance of `needle` within
|
|
|
|
/// `haystack`, or `npos` (`wcstring::npos` / `std::string::npos`) if no results were found.
|
|
|
|
size_t ifind(const wcstring &haystack, const wcstring &needle, bool fuzzy = false);
|
|
|
|
size_t ifind(const std::string &haystack, const std::string &needle, bool fuzzy = false);
|
|
|
|
|
2020-11-28 07:43:07 +08:00
|
|
|
// The kinds of fuzzy match between a candidate string and a target string.
// Each enumerator's comment gives an example of a candidate that matches the target FOOBAR.
enum fuzzy_match_type_t {
    // Exact match: FOOBAR matches FOOBAR.
    fuzzy_match_exact = 0,

    // Prefix match: FO matches FOOBAR.
    fuzzy_match_prefix = 1,

    // Exact match, ignoring case: foobar matches FOOBAR.
    fuzzy_match_case_insensitive = 2,

    // Prefix match, ignoring case: foo matches FOOBAR.
    fuzzy_match_prefix_case_insensitive = 3,

    // Substring match: OOBA matches FOOBAR.
    fuzzy_match_substring = 4,

    // Substring match, ignoring case: ooBA matches FOOBAR.
    fuzzy_match_substring_case_insensitive = 5,

    // Subsequence match with insertions only: FBR matches FOOBAR.
    fuzzy_match_subsequence_insertions_only = 6,

    // No match at all.
    fuzzy_match_none = 7
};
|
|
|
|
|
|
|
|
/// Indicates where a match type requires replacing the entire token.
|
|
|
|
static inline bool match_type_requires_full_replacement(fuzzy_match_type_t t) {
|
|
|
|
switch (t) {
|
|
|
|
case fuzzy_match_exact:
|
|
|
|
case fuzzy_match_prefix: {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
case fuzzy_match_case_insensitive:
|
|
|
|
case fuzzy_match_prefix_case_insensitive:
|
|
|
|
case fuzzy_match_substring:
|
|
|
|
case fuzzy_match_substring_case_insensitive:
|
|
|
|
case fuzzy_match_subsequence_insertions_only:
|
|
|
|
case fuzzy_match_none: {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
default: {
|
|
|
|
DIE("Unreachable");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Indicates where a match shares a prefix with the string it matches.
|
|
|
|
static inline bool match_type_shares_prefix(fuzzy_match_type_t t) {
|
|
|
|
switch (t) {
|
|
|
|
case fuzzy_match_exact:
|
|
|
|
case fuzzy_match_prefix:
|
|
|
|
case fuzzy_match_case_insensitive:
|
|
|
|
case fuzzy_match_prefix_case_insensitive: {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
case fuzzy_match_substring:
|
|
|
|
case fuzzy_match_substring_case_insensitive:
|
|
|
|
case fuzzy_match_subsequence_insertions_only:
|
|
|
|
case fuzzy_match_none: {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
default: {
|
|
|
|
DIE("Unreachable");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Test if string is a fuzzy match to another.
|
|
|
|
struct string_fuzzy_match_t {
|
|
|
|
enum fuzzy_match_type_t type;
|
|
|
|
|
|
|
|
// Strength of the match. The value depends on the type. Lower is stronger.
|
|
|
|
size_t match_distance_first;
|
|
|
|
size_t match_distance_second;
|
|
|
|
|
|
|
|
// Constructor.
|
|
|
|
explicit string_fuzzy_match_t(enum fuzzy_match_type_t t, size_t distance_first = 0,
|
|
|
|
size_t distance_second = 0);
|
|
|
|
|
|
|
|
// Return -1, 0, 1 if this match is (respectively) better than, equal to, or worse than rhs.
|
|
|
|
int compare(const string_fuzzy_match_t &rhs) const;
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Compute a fuzzy match for a string. If limit_type is not fuzzy_match_none, limit the type to
|
|
|
|
/// matches at or below that type.
|
|
|
|
string_fuzzy_match_t string_fuzzy_match_string(const wcstring &string,
|
|
|
|
const wcstring &match_against,
|
|
|
|
fuzzy_match_type_t limit_type = fuzzy_match_none);
|
|
|
|
|
2020-01-16 05:16:43 +08:00
|
|
|
/// Split a string by a separator character.
|
|
|
|
wcstring_list_t split_string(const wcstring &val, wchar_t sep);
|
|
|
|
|
|
|
|
/// Join a list of strings by a separator character.
|
|
|
|
wcstring join_strings(const wcstring_list_t &vals, wchar_t sep);
|
|
|
|
|
|
|
|
/// Convert a long to a wcstring using format_long_safe().
/// Relies on format_long_safe() leaving a NUL-terminated string in the 64-element buffer.
inline wcstring to_string(long x) {
    wchar_t digits[64];
    format_long_safe(digits, x);
    wcstring result(digits);
    return result;
}
|
|
|
|
|
|
|
|
/// Convert an unsigned long long to a wcstring using format_ullong_safe().
/// Relies on format_ullong_safe() leaving a NUL-terminated string in the 64-element buffer.
inline wcstring to_string(unsigned long long x) {
    wchar_t digits[64];
    format_ullong_safe(digits, x);
    wcstring result(digits);
    return result;
}
|
|
|
|
|
|
|
|
/// Convert an int to a wcstring by widening it to long and delegating to to_string(long).
inline wcstring to_string(int x) {
    const long widened = x;
    return to_string(widened);
}
|
|
|
|
|
|
|
|
/// Convert a size_t to a wcstring by widening it to unsigned long long and delegating to
/// to_string(unsigned long long).
inline wcstring to_string(size_t x) {
    const unsigned long long widened = x;
    return to_string(widened);
}
|
|
|
|
|
|
|
|
/// Interpret a narrow string as a boolean by looking only at its first character.
/// \return true if \p x is non-empty and begins with one of 'Y', 'T', 'y', 't' or '1';
/// false otherwise (including for the empty string).
inline bool bool_from_string(const std::string &x) {
    if (x.empty()) return false;
    const char first = x.front();
    return first == 'Y' || first == 'T' || first == 'y' || first == 't' || first == '1';
}
|
|
|
|
|
|
|
|
inline bool bool_from_string(const wcstring &x) {
|
|
|
|
return !x.empty() && std::wcschr(L"YTyt1", x.at(0));
|
|
|
|
}
|
|
|
|
|
2016-06-06 12:30:24 +08:00
|
|
|
/// @typedef wcstring_range represents a range in a wcstring.
|
|
|
|
/// The first element is the location, the second is the count.
|
2014-09-22 10:18:56 +08:00
|
|
|
typedef std::pair<wcstring::size_type, wcstring::size_type> wcstring_range;
|
|
|
|
|
2016-05-04 06:18:24 +08:00
|
|
|
/// wcstring equivalent of wcstok(). Supports NUL. For convenience and wcstok() compatibility, the
|
|
|
|
/// first character of each token separator is replaced with NUL.
|
2016-06-06 12:30:24 +08:00
|
|
|
/// @return Returns a pair of (pos, count).
|
|
|
|
/// This will be (npos, npos) when it's done. In the form of (pos, npos)
|
|
|
|
/// when the token is already known to be the final token.
|
|
|
|
/// @note The final token may not necessarily return (pos, npos).
|
2019-05-05 18:09:25 +08:00
|
|
|
wcstring_range wcstring_tok(wcstring &str, const wcstring &needle,
|
2016-05-04 06:18:24 +08:00
|
|
|
wcstring_range last = wcstring_range(0, 0));
|
2014-09-22 10:18:56 +08:00
|
|
|
|
2017-07-27 21:05:35 +08:00
|
|
|
/// Given iterators into a string (forward or reverse), splits the haystack iterators
|
|
|
|
/// about the needle sequence, up to max times. Inserts splits into the output array.
|
|
|
|
/// If the iterators are forward, this does the normal thing.
|
|
|
|
/// If the iterators are backward, this returns reversed strings, in reversed order!
|
|
|
|
/// If the needle is empty, split on individual elements (characters).
|
2018-04-17 19:57:33 +08:00
|
|
|
/// Max output entries will be max + 1 (after max splits)
|
2017-07-27 21:05:35 +08:00
|
|
|
template <typename ITER>
|
|
|
|
void split_about(ITER haystack_start, ITER haystack_end, ITER needle_start, ITER needle_end,
|
2019-05-05 18:09:25 +08:00
|
|
|
wcstring_list_t *output, long max = LONG_MAX, bool no_empty = false) {
|
2017-07-27 21:05:35 +08:00
|
|
|
long remaining = max;
|
|
|
|
ITER haystack_cursor = haystack_start;
|
|
|
|
while (remaining > 0 && haystack_cursor != haystack_end) {
|
|
|
|
ITER split_point;
|
|
|
|
if (needle_start == needle_end) { // empty needle, we split on individual elements
|
|
|
|
split_point = haystack_cursor + 1;
|
|
|
|
} else {
|
|
|
|
split_point = std::search(haystack_cursor, haystack_end, needle_start, needle_end);
|
|
|
|
}
|
|
|
|
if (split_point == haystack_end) { // not found
|
|
|
|
break;
|
|
|
|
}
|
2018-04-17 10:49:26 +08:00
|
|
|
if (!no_empty || haystack_cursor != split_point) {
|
|
|
|
output->emplace_back(haystack_cursor, split_point);
|
2018-03-29 21:12:08 +08:00
|
|
|
}
|
2017-07-27 21:05:35 +08:00
|
|
|
remaining--;
|
|
|
|
// Need to skip over the needle for the next search note that the needle may be empty.
|
|
|
|
haystack_cursor = split_point + std::distance(needle_start, needle_end);
|
|
|
|
}
|
|
|
|
// Trailing component, possibly empty.
|
2018-04-17 10:49:26 +08:00
|
|
|
if (!no_empty || haystack_cursor != haystack_end) {
|
|
|
|
output->emplace_back(haystack_cursor, haystack_end);
|
|
|
|
}
|
2017-07-27 21:05:35 +08:00
|
|
|
}
|
2018-03-10 04:52:12 +08:00
|
|
|
|
|
|
|
enum class ellipsis_type {
    None = 0,
    // Favor a nice-looking result over the most compact one.
    Prettiest = 1,
    // Favor compactness: make every character count ($ instead of ...).
    Shortest = 2,
};
|
|
|
|
|
2019-05-05 18:09:25 +08:00
|
|
|
wcstring truncate(const wcstring &input, int max_len,
|
|
|
|
ellipsis_type etype = ellipsis_type::Prettiest);
|
2019-08-26 04:37:06 +08:00
|
|
|
wcstring trim(wcstring input);
|
|
|
|
wcstring trim(wcstring input, const wchar_t *any_of);
|
2018-03-10 04:52:12 +08:00
|
|
|
|
2019-09-23 06:33:08 +08:00
|
|
|
/// Converts a string to lowercase.
|
|
|
|
wcstring wcstolower(wcstring input);
|
|
|
|
|
2020-07-30 08:16:51 +08:00
|
|
|
// Out-of-line helper for wcs2string_callback.
|
|
|
|
void wcs2string_bad_char(wchar_t);
|
|
|
|
|
|
|
|
/// Implementation of wcs2string that accepts a callback.
|
|
|
|
/// This invokes \p func with (const char*, size_t) pairs.
|
|
|
|
/// If \p func returns false, it stops; otherwise it continues.
|
|
|
|
/// \return false if the callback returned false, otherwise true.
|
|
|
|
template <typename Func>
|
|
|
|
bool wcs2string_callback(const wchar_t *input, size_t len, const Func &func) {
|
|
|
|
mbstate_t state = {};
|
|
|
|
char converted[MB_LEN_MAX];
|
|
|
|
|
|
|
|
for (size_t i = 0; i < len; i++) {
|
|
|
|
wchar_t wc = input[i];
|
|
|
|
// TODO: this doesn't seem sound.
|
|
|
|
if (wc == INTERNAL_SEPARATOR) {
|
|
|
|
// do nothing
|
|
|
|
} else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256) {
|
|
|
|
converted[0] = wc - ENCODE_DIRECT_BASE;
|
|
|
|
if (!func(converted, 1)) return false;
|
|
|
|
} else if (MB_CUR_MAX == 1) { // single-byte locale (C/POSIX/ISO-8859)
|
|
|
|
// If `wc` contains a wide character we emit a question-mark.
|
|
|
|
if (wc & ~0xFF) {
|
|
|
|
wc = '?';
|
|
|
|
}
|
|
|
|
converted[0] = wc;
|
|
|
|
if (!func(converted, 1)) return false;
|
|
|
|
} else {
|
|
|
|
std::memset(converted, 0, sizeof converted);
|
|
|
|
size_t len = std::wcrtomb(converted, wc, &state);
|
|
|
|
if (len == static_cast<size_t>(-1)) {
|
|
|
|
wcs2string_bad_char(wc);
|
|
|
|
std::memset(&state, 0, sizeof(state));
|
|
|
|
} else {
|
|
|
|
if (!func(converted, len)) return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-01-16 05:16:43 +08:00
|
|
|
/// Support for iterating over a newline-separated string, one line at a time.
/// The iterated collection is held by reference, so it must outlive this object.
template <typename Collection>
class line_iterator_t {
    using const_iter_t = typename Collection::const_iterator;

    // Copy of the most recently read line, without its newline.
    Collection line_buf_;

    // The collection being iterated. Held by reference, not copied.
    const Collection &source_;

    // Position in source_ where the next line starts.
    const_iter_t pos_;

   public:
    /// Construct from a collection (presumably std::string or std::wcstring).
    line_iterator_t(const Collection &coll) : source_(coll), pos_(coll.cbegin()) {}

    /// Access the storage in which the last line was stored.
    const Collection &line() const { return line_buf_; }

    /// Advances to the next line. \return true on success, false if we have exhausted the string.
    bool next() {
        const const_iter_t finish = source_.cend();
        if (pos_ == finish) return false;

        // The line runs up to the next newline, or to the end if there is none.
        const const_iter_t line_end = std::find(pos_, finish, '\n');
        line_buf_.assign(pos_, line_end);

        // Continue after the newline, if we stopped on one.
        pos_ = line_end;
        if (pos_ != finish) ++pos_;
        return true;
    }
};
|
|
|
|
|
2014-09-22 10:18:56 +08:00
|
|
|
#endif
|