From 0b976a18433fc734b60ab92593979ba64ada4c1f Mon Sep 17 00:00:00 2001
From: Kurtis Rader <krader@skepticism.us>
Date: Fri, 12 May 2017 20:15:24 -0700
Subject: [PATCH] fix `wcsfilecmp()`

This started out as a refactoring to eliminate the lint warnings. Adding
unit tests revealed the current implementation does not behave as
implied. So this is a complete rewrite of the implementation. With the
addition of unit tests so that it doesn't break in the future and anyone
who thinks this new version behaves wrong can update the unit tests to
help ensure we're testing for the correct behavior.

Fixes #4027
---
 src/fish_tests.cpp |  87 ++++++++++++++++++++++++++----
 src/util.cpp       | 132 +++++++++++++++++++++++++++------------------
 2 files changed, 158 insertions(+), 61 deletions(-)

diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index 565e2cb39..95eb8b7e2 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -64,6 +64,7 @@
 #include "signal.h"
 #include "tokenizer.h"
 #include "utf8.h"
+#include "util.h"
 #include "wcstringutil.h"
 #include "wildcard.h"
 #include "wutil.h"  // IWYU pragma: keep
@@ -907,31 +908,97 @@ static void test_indents() {
     }
 }
 
-static void test_utils() {
-    say(L"Testing utils");
+static void test_parse_util_cmdsubst_extent() {
     const wchar_t *a = L"echo (echo (echo hi";
-
     const wchar_t *begin = NULL, *end = NULL;
+
     parse_util_cmdsubst_extent(a, 0, &begin, &end);
-    if (begin != a || end != begin + wcslen(begin))
+    if (begin != a || end != begin + wcslen(begin)) {
         err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
+    }
     parse_util_cmdsubst_extent(a, 1, &begin, &end);
-    if (begin != a || end != begin + wcslen(begin))
+    if (begin != a || end != begin + wcslen(begin)) {
         err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
+    }
     parse_util_cmdsubst_extent(a, 2, &begin, &end);
-    if (begin != a || end != begin + wcslen(begin))
+    if (begin != a || end != begin + wcslen(begin)) {
         err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
+    }
     parse_util_cmdsubst_extent(a, 3, &begin, &end);
-    if (begin != a || end != begin + wcslen(begin))
+    if (begin != a || end != begin + wcslen(begin)) {
         err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
+    }
 
     parse_util_cmdsubst_extent(a, 8, &begin, &end);
-    if (begin != a + wcslen(L"echo ("))
+    if (begin != a + wcslen(L"echo (")) {
         err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
+    }
 
     parse_util_cmdsubst_extent(a, 17, &begin, &end);
-    if (begin != a + wcslen(L"echo (echo ("))
+    if (begin != a + wcslen(L"echo (echo (")) {
         err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
+    }
+}
+
+static struct wcsfilecmp_test {
+    const wchar_t *str1;
+    const wchar_t *str2;
+    int expected_rc;
+} wcsfilecmp_tests[] = {{L"", L"", 0},
+                        {L"", L"def", -1},
+                        {L"abc", L"", 1},
+                        {L"abc", L"def", -1},
+                        {L"abc", L"DEF", -1},
+                        {L"DEF", L"abc", 1},
+                        {L"abc", L"abc", 0},
+                        {L"ABC", L"ABC", 0},
+                        {L"AbC", L"abc", -1},
+                        {L"AbC", L"ABC", 1},
+                        {L"def", L"abc", 1},
+                        {L"1ghi", L"1gHi", 1},
+                        {L"1ghi", L"2ghi", -1},
+                        {L"1ghi", L"01ghi", 1},
+                        {L"1ghi", L"02ghi", -1},
+                        {L"01ghi", L"1ghi", -1},
+                        {L"1ghi", L"002ghi", -1},
+                        {L"002ghi", L"1ghi", 1},
+                        {L"abc01def", L"abc1def", -1},
+                        {L"abc1def", L"abc01def", 1},
+                        {L"abc12", L"abc5", 1},
+                        {L"51abc", L"050abc", 1},
+                        {L"abc5", L"abc12", -1},
+                        {L"5abc", L"12ABC", -1},
+                        {L"abc0789", L"abc789", -1},
+                        {L"abc0xA789", L"abc0xA0789", 1},
+                        {L"abc002", L"abc2", -1},
+                        {L"abc002g", L"abc002", 1},
+                        {L"abc002g", L"abc02g", -1},
+                        {L"abc002.txt", L"abc02.txt", -1},
+                        {L"abc005", L"abc012", -1},
+                        {L"abc02", L"abc002", 1},
+                        {L"abc002.txt", L"abc02.txt", -1},
+                        {L"GHI1abc2.txt", L"ghi1abc2.txt", -1},
+                        {L"a0", L"a00", -1},
+                        {L"a00b", L"a0b", -1},
+                        {L"a0b", L"a00b", 1},
+                        {NULL, NULL, 0}};
+
+/// Verify the behavior of the `wcsfilecmp()` function.
+static void test_wcsfilecmp() {
+    for (auto test = wcsfilecmp_tests; test->str1; test++) {
+        int rc = wcsfilecmp(test->str1, test->str2);
+        if (rc != test->expected_rc) {
+            err(L"New failed on line %lu: [\"%ls\" <=> \"%ls\"]: "
+                L"expected return code %d but got %d",
+                __LINE__, test->str1, test->str2, test->expected_rc, rc);
+        }
+    }
+}
+
+static void test_utility_functions() {
+    say(L"Testing utility functions");
+    test_wcsfilecmp();
+    test_parse_util_cmdsubst_extent();
 }
 
 // UTF8 tests taken from Alexey Vatchenko's utf8 library. See http://www.bsdua.org/libbsdua.html.
@@ -4252,7 +4319,6 @@ int main(int argc, char **argv) {
     if (should_test_function("parser")) test_parser();
     if (should_test_function("cancellation")) test_cancellation();
     if (should_test_function("indents")) test_indents();
-    if (should_test_function("utils")) test_utils();
     if (should_test_function("utf8")) test_utf8();
     if (should_test_function("escape_sequences")) test_escape_sequences();
     if (should_test_function("lru")) test_lru();
@@ -4283,6 +4349,7 @@ int main(int argc, char **argv) {
     if (should_test_function("string")) test_string();
     if (should_test_function("env_vars")) test_env_vars();
     if (should_test_function("illegal_command_exit_code")) test_illegal_command_exit_code();
+    if (should_test_function("utility_functions")) test_utility_functions();
     // history_tests_t::test_history_speed();
 
     say(L"Encountered %d errors in low-level tests", err_count);
diff --git a/src/util.cpp b/src/util.cpp
index 600ef9e36..98ff50988 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -1,10 +1,8 @@
 // Generic utilities library.
-//
-// Contains data structures such as automatically growing array lists, priority queues, etc.
 #include "config.h"  // IWYU pragma: keep
 
 #include <errno.h>
-#include <stdlib.h>
+#include <stddef.h>
 #include <sys/time.h>
 #include <wchar.h>
 #include <wctype.h>
@@ -14,58 +12,90 @@
 #include "util.h"
 #include "wutil.h"  // IWYU pragma: keep
 
-int wcsfilecmp(const wchar_t *a, const wchar_t *b) {
-    CHECK(a, 0);
-    CHECK(b, 0);
+// Compare the strings to see if they begin with an integer that can be compared and return the
+// result of that comparison.
+static int wcsfilecmp_leading_digits(const wchar_t **a, const wchar_t **b) {
+    const wchar_t *a_end, *b_end;
 
-    if (*a == 0) {
-        if (*b == 0) return 0;
-        return -1;
-    }
-    if (*b == 0) {
-        return 1;
-    }
+    long a_num = fish_wcstol(*a, &a_end, 10);
+    if (errno > 0) return 0;  // invalid number -- fallback to simple string compare
+    long b_num = fish_wcstol(*b, &b_end, 10);
+    if (errno > 0) return 0;  // invalid number -- fallback to simple string compare
 
-    long secondary_diff = 0;
-    if (iswdigit(*a) && iswdigit(*b)) {
-        const wchar_t *aend, *bend;
-        long al;
-        long bl;
-        long diff;
-
-        al = fish_wcstol(a, &aend);
-        int a1_errno = errno;
-        bl = fish_wcstol(b, &bend);
-        if (a1_errno || errno) {
-            // Huge numbers - fall back to regular string comparison.
-            return wcscmp(a, b);
-        }
-
-        diff = al - bl;
-        if (diff) return diff > 0 ? 2 : -2;
-
-        secondary_diff = (aend - a) - (bend - b);
-
-        a = aend - 1;  //!OCLINT(parameter reassignment)
-        b = bend - 1;  //!OCLINT(parameter reassignment)
-    } else {
-        int diff = towlower(*a) - towlower(*b);
-        if (diff != 0) return diff > 0 ? 2 : -2;
-
-        secondary_diff = *a - *b;
-    }
-
-    int res = wcsfilecmp(a + 1, b + 1);
-
-    // If no primary difference in rest of string use secondary difference on this element if
-    // found.
-    if (abs(res) < 2 && secondary_diff) {
-        return secondary_diff > 0 ? 1 : -1;
-    }
-
-    return res;
+    if (a_num < b_num) return -1;
+    if (a_num > b_num) return 1;
+    *a = a_end;
+    *b = b_end;
+    return 0;
 }
 
+/// Compare two strings, representing file names, using "natural" ordering. This means that letter
+/// case is ignored. It also means that integers in each string are compared based on the decimal
+/// value rather than the string representation. It only handles base 10 integers and they can
+/// appear anywhere in each string, including multiple integers. This means that a file name like
+/// "0xAF0123" is treated as the literal "0xAF" followed by the integer 123.
+///
+/// The intent is to ensure that file names like "file23" and "file5" are sorted so that the latter
+/// appears before the former.
+///
+/// This does not handle esoterica like Unicode combining characters. Nor does it use collating
+/// sequences. Which means that an ASCII "A" will be less than an equivalent character with a higher
+/// Unicode code point. In part because doing so is really hard without the help of something like
+/// the ICU library. But also because file names might be in a different encoding than is used by
+/// the current fish process which results in weird situations. This is basically a best effort
+/// implementation that will do the right thing 99.99% of the time.
+///
+/// Returns: -1 if a < b, 0 if a == b, 1 if a > b.
+int wcsfilecmp(const wchar_t *a, const wchar_t *b) {
+    CHECK(a, NULL);
+    CHECK(b, NULL);
+    const wchar_t *orig_a = a;
+    const wchar_t *orig_b = b;
+    int retval = 0;  // assume the strings will be equal
+
+    while (*a && *b) {
+        if (iswdigit(*a) && iswdigit(*b)) {
+            retval = wcsfilecmp_leading_digits(&a, &b);
+            // If we know the strings aren't logically equal or we've reached the end of one or both
+            // strings we can stop iterating over the chars in each string.
+            if (retval || *a == 0 || *b == 0) break;
+        }
+
+        wint_t al = towlower(*a);
+        wint_t bl = towlower(*b);
+        if (al < bl) {
+            retval = -1;
+            break;
+        } else if (al > bl) {
+            retval = 1;
+            break;
+        } else {
+            a++;
+            b++;
+        }
+    }
+
+    if (retval != 0) return retval;  // we already know the strings aren't logically equal
+
+    if (*a == 0) {
+        if (*b == 0) {
+            // The strings are logically equal. They may or may not be the same length depending on
+            // whether numbers were present but that doesn't matter. Disambiguate strings that
+            // differ by letter case or length. We don't bother optimizing the case where the file
+            // names are literally identical because that won't occur given how this function is
+            // used. And even if it were to occur (due to being reused in some other context) it
+            // would be so rare that it isn't worth optimizing for.
+            retval = wcscmp(orig_a, orig_b);
+            return retval < 0 ? -1 : retval == 0 ? 0 : 1;
+        }
+        return -1;  // string a is a prefix of b and b is longer
+    }
+
+    assert(*b == 0);
+    return 1;  // string b is a prefix of a and a is longer
+}
+
+/// Return microseconds since the epoch.
 long long get_time() {
     struct timeval time_struct;
     gettimeofday(&time_struct, 0);