diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 565e2cb39..95eb8b7e2 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -64,6 +64,7 @@ #include "signal.h" #include "tokenizer.h" #include "utf8.h" +#include "util.h" #include "wcstringutil.h" #include "wildcard.h" #include "wutil.h" // IWYU pragma: keep @@ -907,31 +908,97 @@ static void test_indents() { } } -static void test_utils() { - say(L"Testing utils"); +static void test_parse_util_cmdsubst_extent() { const wchar_t *a = L"echo (echo (echo hi"; - const wchar_t *begin = NULL, *end = NULL; + parse_util_cmdsubst_extent(a, 0, &begin, &end); - if (begin != a || end != begin + wcslen(begin)) + if (begin != a || end != begin + wcslen(begin)) { err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__); + } parse_util_cmdsubst_extent(a, 1, &begin, &end); - if (begin != a || end != begin + wcslen(begin)) + if (begin != a || end != begin + wcslen(begin)) { err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__); + } parse_util_cmdsubst_extent(a, 2, &begin, &end); - if (begin != a || end != begin + wcslen(begin)) + if (begin != a || end != begin + wcslen(begin)) { err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__); + } parse_util_cmdsubst_extent(a, 3, &begin, &end); - if (begin != a || end != begin + wcslen(begin)) + if (begin != a || end != begin + wcslen(begin)) { err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__); + } parse_util_cmdsubst_extent(a, 8, &begin, &end); - if (begin != a + wcslen(L"echo (")) + if (begin != a + wcslen(L"echo (")) { err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__); + } parse_util_cmdsubst_extent(a, 17, &begin, &end); - if (begin != a + wcslen(L"echo (echo (")) + if (begin != a + wcslen(L"echo (echo (")) { err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__); + } +} + +static struct wcsfilecmp_test { + const wchar_t *str1; + const wchar_t *str2; + int expected_rc; +} wcsfilecmp_tests[] = {{L"", L"", 0}, + {L"", L"def", -1}, + {L"abc", L"", 1}, + {L"abc", L"def", -1}, + {L"abc", L"DEF", -1}, + {L"DEF", L"abc", 1}, + {L"abc", L"abc", 0}, + {L"ABC", L"ABC", 0}, + {L"AbC", L"abc", -1}, + {L"AbC", L"ABC", 1}, + {L"def", L"abc", 1}, + {L"1ghi", L"1gHi", 1}, + {L"1ghi", L"2ghi", -1}, + {L"1ghi", L"01ghi", 1}, + {L"1ghi", L"02ghi", -1}, + {L"01ghi", L"1ghi", -1}, + {L"1ghi", L"002ghi", -1}, + {L"002ghi", L"1ghi", 1}, + {L"abc01def", L"abc1def", -1}, + {L"abc1def", L"abc01def", 1}, + {L"abc12", L"abc5", 1}, + {L"51abc", L"050abc", 1}, + {L"abc5", L"abc12", -1}, + {L"5abc", L"12ABC", -1}, + {L"abc0789", L"abc789", -1}, + {L"abc0xA789", L"abc0xA0789", 1}, + {L"abc002", L"abc2", -1}, + {L"abc002g", L"abc002", 1}, + {L"abc002g", L"abc02g", -1}, + {L"abc002.txt", L"abc02.txt", -1}, + {L"abc005", L"abc012", -1}, + {L"abc02", L"abc002", 1}, + {L"abc002.txt", L"abc02.txt", -1}, + {L"GHI1abc2.txt", L"ghi1abc2.txt", -1}, + {L"a0", L"a00", -1}, + {L"a00b", L"a0b", -1}, + {L"a0b", L"a00b", 1}, + {NULL, NULL, 0}}; + +/// Verify the behavior of the `wcsfilecmp()` function. +static void test_wcsfilecmp() { + for (auto test = wcsfilecmp_tests; test->str1; test++) { + int rc = wcsfilecmp(test->str1, test->str2); + if (rc != test->expected_rc) { + err(L"New failed on line %lu: [\"%ls\" <=> \"%ls\"]: " + L"expected return code %d but got %d", + __LINE__, test->str1, test->str2, test->expected_rc, rc); + } + } +} + +static void test_utility_functions() { + say(L"Testing utility functions"); + test_wcsfilecmp(); + test_parse_util_cmdsubst_extent(); } // UTF8 tests taken from Alexey Vatchenko's utf8 library. See http://www.bsdua.org/libbsdua.html. @@ -4252,7 +4319,6 @@ int main(int argc, char **argv) { if (should_test_function("parser")) test_parser(); if (should_test_function("cancellation")) test_cancellation(); if (should_test_function("indents")) test_indents(); - if (should_test_function("utils")) test_utils(); if (should_test_function("utf8")) test_utf8(); if (should_test_function("escape_sequences")) test_escape_sequences(); if (should_test_function("lru")) test_lru(); @@ -4283,6 +4349,7 @@ int main(int argc, char **argv) { if (should_test_function("string")) test_string(); if (should_test_function("env_vars")) test_env_vars(); if (should_test_function("illegal_command_exit_code")) test_illegal_command_exit_code(); + if (should_test_function("utility_functions")) test_utility_functions(); // history_tests_t::test_history_speed(); say(L"Encountered %d errors in low-level tests", err_count); diff --git a/src/util.cpp b/src/util.cpp index 600ef9e36..98ff50988 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -1,10 +1,8 @@ // Generic utilities library. -// -// Contains data structures such as automatically growing array lists, priority queues, etc. #include "config.h" // IWYU pragma: keep #include -#include +#include #include #include #include @@ -14,58 +12,90 @@ #include "util.h" #include "wutil.h" // IWYU pragma: keep -int wcsfilecmp(const wchar_t *a, const wchar_t *b) { - CHECK(a, 0); - CHECK(b, 0); +// Compare the strings to see if they begin with an integer that can be compared and return the +// result of that comparison. +static int wcsfilecmp_leading_digits(const wchar_t **a, const wchar_t **b) { + const wchar_t *a_end, *b_end; - if (*a == 0) { - if (*b == 0) return 0; - return -1; - } - if (*b == 0) { - return 1; - } + long a_num = fish_wcstol(*a, &a_end, 10); + if (errno > 0) return 0; // invalid number -- fallback to simple string compare + long b_num = fish_wcstol(*b, &b_end, 10); + if (errno > 0) return 0; // invalid number -- fallback to simple string compare - long secondary_diff = 0; - if (iswdigit(*a) && iswdigit(*b)) { - const wchar_t *aend, *bend; - long al; - long bl; - long diff; - - al = fish_wcstol(a, &aend); - int a1_errno = errno; - bl = fish_wcstol(b, &bend); - if (a1_errno || errno) { - // Huge numbers - fall back to regular string comparison. - return wcscmp(a, b); - } - - diff = al - bl; - if (diff) return diff > 0 ? 2 : -2; - - secondary_diff = (aend - a) - (bend - b); - - a = aend - 1; //!OCLINT(parameter reassignment) - b = bend - 1; //!OCLINT(parameter reassignment) - } else { - int diff = towlower(*a) - towlower(*b); - if (diff != 0) return diff > 0 ? 2 : -2; - - secondary_diff = *a - *b; - } - - int res = wcsfilecmp(a + 1, b + 1); - - // If no primary difference in rest of string use secondary difference on this element if - // found. - if (abs(res) < 2 && secondary_diff) { - return secondary_diff > 0 ? 1 : -1; - } - - return res; + if (a_num < b_num) return -1; + if (a_num > b_num) return 1; + *a = a_end; + *b = b_end; + return 0; } +/// Compare two strings, representing file names, using "natural" ordering. This means that letter +/// case is ignored. It also means that integers in each string are compared based on the decimal +/// value rather than the string representation. It only handles base 10 integers and they can +/// appear anywhere in each string, including multiple integers. This means that a file name like +/// "0xAF0123" is treated as the literal "0xAF" followed by the integer 123. +/// +/// The intent is to ensure that file names like "file23" and "file5" are sorted so that the latter +/// appears before the former. +/// +/// This does not handle esoterica like Unicode combining characters. Nor does it use collating +/// sequences. Which means that an ASCII "A" will be less than an equivalent character with a higher +/// Unicode code point. In part because doing so is really hard without the help of something like +/// the ICU library. But also because file names might be in a different encoding than is used by +/// the current fish process which results in weird situations. This is basically a best effort +/// implementation that will do the right thing 99.99% of the time. +/// +/// Returns: -1 if a < b, 0 if a == b, 1 if a > b. +int wcsfilecmp(const wchar_t *a, const wchar_t *b) { + CHECK(a, NULL); + CHECK(b, NULL); + const wchar_t *orig_a = a; + const wchar_t *orig_b = b; + int retval = 0; // assume the strings will be equal + + while (*a && *b) { + if (iswdigit(*a) && iswdigit(*b)) { + retval = wcsfilecmp_leading_digits(&a, &b); + // If we know the strings aren't logically equal or we've reached the end of one or both + // strings we can stop iterating over the chars in each string. + if (retval || *a == 0 || *b == 0) break; + } + + wint_t al = towlower(*a); + wint_t bl = towlower(*b); + if (al < bl) { + retval = -1; + break; + } else if (al > bl) { + retval = 1; + break; + } else { + a++; + b++; + } + } + + if (retval != 0) return retval; // we already know the strings aren't logically equal + + if (*a == 0) { + if (*b == 0) { + // The strings are logically equal. They may or may not be the same length depending on + // whether numbers were present but that doesn't matter. Disambiguate strings that + // differ by letter case or length. We don't bother optimizing the case where the file + // names are literally identical because that won't occur given how this function is + // used. And even if it were to occur (due to being reused in some other context) it + // would be so rare that it isn't worth optimizing for. + retval = wcscmp(orig_a, orig_b); + return retval < 0 ? -1 : retval == 0 ? 0 : 1; + } + return -1; // string a is a prefix of b and b is longer + } + + assert(*b == 0); + return 1; // string b is a prefix of a and a is longer +} + +/// Return microseconds since the epoch. long long get_time() { struct timeval time_struct; gettimeofday(&time_struct, 0);