fish-shell/src/util.cpp
Fabian Homborg 6af3896217 wcsfilecmp: Stop actually computing the numbers
This just compares two longs as strings on the go.

We can simply

1. ignore leading zeroes - they have no influence on the value
2. compare the digits char-by-char
3. keep the comparison for the first differing digit
4. if one number is longer than the other, that is larger
5. if the numbers have the same length, the one larger in the first
differing digit is larger

This makes this comparison quite a bit faster, which makes globs in
directories with numbered files up to 20% faster.

Note that, for historical reasons, this still ignores whitespace right
after the numbers!
2021-10-07 17:57:52 +02:00

201 lines
7.1 KiB
C++

// Generic utilities library.
#include "config.h" // IWYU pragma: keep
#include "util.h"
#include <errno.h>
#include <stddef.h>
#include <sys/time.h>
#include <wctype.h>
#include <cwchar>
#include "common.h"
#include "fallback.h" // IWYU pragma: keep
#include "wutil.h" // IWYU pragma: keep
// Numerically compare the runs of decimal digits at the front of *a and *b. The digits are
// compared as characters; no integer value is ever computed.
//
// Leading zeroes are dropped, then both runs are walked in lockstep. A run that still has digits
// left when the other one ends is the larger number; for runs of equal length the first differing
// digit decides.
//
// On return *a and *b point to where scanning stopped, after also stepping over any whitespace
// found there. If one run is longer, its pointer is left on the first digit that had no
// counterpart. Returns 1 if a's number is larger, -1 if b's is larger and 0 if they are equal.
static int wcsfilecmp_leading_digits(const wchar_t **a, const wchar_t **b) {
    const wchar_t *lhs = *a;
    const wchar_t *rhs = *b;

    // Leading zeroes have no influence on the value.
    for (; *lhs == L'0'; ++lhs) {
    }
    for (; *rhs == L'0'; ++rhs) {
    }

    int order = 0;
    for (;; ++lhs, ++rhs) {
        const bool lhs_digit = iswdigit(*lhs) != 0;
        const bool rhs_digit = iswdigit(*rhs) != 0;
        if (!lhs_digit || !rhs_digit) {
            // At least one run has ended. There are no signs and the leading zeroes are already
            // gone, so a run that still has digits is the larger number and overrides any
            // per-digit result recorded so far.
            if (lhs_digit) {
                order = 1;
            } else if (rhs_digit) {
                order = -1;
            }
            break;
        }
        // Only the first differing digit matters; it is the answer if both runs turn out to have
        // the same length. Digit characters order the same way as their numeric values.
        if (order == 0 && *lhs != *rhs) {
            order = (*lhs > *rhs) ? 1 : -1;
        }
    }

    // For historical reasons whitespace right after the number is skipped, like fish_wcstol does.
    // This is used for sorting globs, whose order is supposed to stay stable.
    while (iswspace(*lhs)) ++lhs;
    while (iswspace(*rhs)) ++rhs;

    *a = lhs;
    *b = rhs;
    return order;
}
/// Order two file names "naturally": letter case is ignored and any run of decimal digits is
/// ordered by its numeric value instead of character by character. Only base 10 is understood.
/// Numbers may appear anywhere in a name and a name may contain several of them, so "0xAF0123" is
/// the literal text "0xAF" followed by the integer 123.
///
/// The point is that "file5" sorts before "file23".
///
/// Limitations: Unicode combining characters are not handled and no collation sequence is used,
/// so an ASCII "A" sorts before an equivalent character with a higher code point. Doing better
/// is hard without something like ICU, and file names may not even be in the encoding the current
/// fish process uses. Treat this as a best-effort comparison that is right 99.99% of the time.
///
/// Returns: -1 if a < b, 0 if a == b, 1 if a > b.
int wcsfilecmp(const wchar_t *a, const wchar_t *b) {
    assert(a && b && "Null parameter");
    const wchar_t *const a_start = a;
    const wchar_t *const b_start = b;

    while (*a != 0 && *b != 0) {
        if (iswdigit(*a) && iswdigit(*b)) {
            const int number_order = wcsfilecmp_leading_digits(&a, &b);
            // A difference between the numbers settles the comparison.
            if (number_order != 0) return number_order;
            // Equal numbers: stop scanning if either name ended right after its number.
            if (*a == 0 || *b == 0) break;
        }

        // Identical characters need no case folding; only fold when they differ.
        if (*a != *b) {
            wint_t key_a = towupper(*a);
            wint_t key_b = towupper(*b);
            // Sort dashes after Z - see #5634
            if (key_a == L'-') key_a = L'[';
            if (key_b == L'-') key_b = L'[';
            if (key_a != key_b) return key_a < key_b ? -1 : 1;
        }
        ++a;
        ++b;
    }

    if (*a != 0) {
        // Only b ran out: b is a prefix of a and a is longer.
        assert(*b == 0);
        return 1;
    }
    if (*b != 0) {
        // Only a ran out: a is a prefix of b and b is longer.
        return -1;
    }

    // Both names were consumed, so they are logically equal. They may still differ in letter case
    // or in length (when numbers were present), so fall back to an exact comparison of the
    // original strings to disambiguate. Literally identical names are not special-cased: they are
    // not expected given how this function is used, and would be too rare to be worth optimizing.
    const int tiebreak = std::wcscmp(a_start, b_start);
    return (tiebreak > 0) - (tiebreak < 0);
}
/// wcsfilecmp, but frozen in time for glob usage.
///
/// Runs of digits are ordered numerically via wcsfilecmp_leading_digits; all other characters are
/// ordered by their towlower() value. Unlike wcsfilecmp, dashes get no special treatment here.
///
/// Returns: -1 if a < b, 0 if a == b, 1 if a > b.
int wcsfilecmp_glob(const wchar_t *a, const wchar_t *b) {
    assert(a && b && "Null parameter");
    const wchar_t *const first_a = a;
    const wchar_t *const first_b = b;

    for (; *a != 0 && *b != 0; ++a, ++b) {
        if (iswdigit(*a) && iswdigit(*b)) {
            const int numeric = wcsfilecmp_leading_digits(&a, &b);
            // Differing numbers decide the result right away.
            if (numeric != 0) return numeric;
            // The numbers matched; nothing is left to scan if either name ends here.
            if (*a == 0 || *b == 0) break;
        }

        // Characters that are already identical skip the towlower calls.
        if (*a == *b) continue;

        const wint_t folded_a = towlower(*a);
        const wint_t folded_b = towlower(*b);
        if (folded_a < folded_b) return -1;
        if (folded_a > folded_b) return 1;
    }

    if (*a == 0 && *b == 0) {
        // Logically equal names. They can still differ in letter case or in length (when numbers
        // were present), so disambiguate with an exact comparison of the original strings.
        // Literally identical names are not special-cased because that is not expected to happen
        // given how this function is used.
        const int exact = wcscmp(first_a, first_b);
        if (exact < 0) return -1;
        return exact == 0 ? 0 : 1;
    }
    if (*a == 0) {
        return -1;  // string a is a prefix of b and b is longer
    }
    assert(*b == 0);
    return 1;  // string b is a prefix of a and a is longer
}
/// Return the current wall-clock time, as reported by gettimeofday(), in microseconds since the
/// epoch.
long long get_time() {
    const long long usec_per_sec = 1000000LL;
    struct timeval now;
    gettimeofday(&now, nullptr);
    // Whole seconds scaled to microseconds, plus the sub-second remainder.
    return usec_per_sec * now.tv_sec + now.tv_usec;
}