Optimize away a str2wcs_internal check

str2wcs_internal is one of the worst hot paths in the codebase, and this
particular check can be optimized away at compile time on hosts whose
mbrtowc is not affected (in practice, non-macOS hosts).
This commit is contained in:
Mahmoud Al-Qudsi 2020-09-07 18:03:22 -05:00
parent bf31333622
commit 1365379518
4 changed files with 33 additions and 1 deletions

View File

@ -224,3 +224,11 @@ LIBATOMIC_NOT_NEEDED)
# Request libatomic unless LIBATOMIC_NOT_NEEDED evaluates to true.
if(NOT LIBATOMIC_NOT_NEEDED)
  set(ATOMIC_LIBRARY "atomic")
endif()
# Check whether the mbrtowc implementation accepts (rather than rejects) invalid
# UTF-8 sequences.
# Known culprits: at least some versions of macOS (confirmed Snow Leopard and Yosemite)
#
# The probe program exits with 1 when mbrtowc accepts the invalid input and with 0
# when it reports an error, so HAVE_BROKEN_MBRTOWC_UTF8 is set only when the probe
# both compiled and exited with 1.
#
# CMake separates command arguments with whitespace, not commas: a trailing comma
# becomes part of the argument, so the result variables would get a different name
# than the one tested below.
try_run(mbrtowc_invalid_utf8_exit mbrtowc_invalid_utf8_compiles
  "${CMAKE_CURRENT_BINARY_DIR}"
  "${CMAKE_CURRENT_SOURCE_DIR}/cmake/checks/mbrtowc_invalid_utf8.cpp")
if(mbrtowc_invalid_utf8_compiles AND ("${mbrtowc_invalid_utf8_exit}" EQUAL 1))
  set(HAVE_BROKEN_MBRTOWC_UTF8 1)
endif()

View File

@ -0,0 +1,18 @@
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
// Configure-time probe: check whether the runtime mbrtowc implementation
// accepts invalid UTF-8 values instead of rejecting them.
//
// Exit status: 1 if mbrtowc converted the invalid input, 0 if it reported an error.
int main() {
    // TODO: I'm not sure how to enforce a UTF-8 locale without overriding the language
    char sample[] = "hello world";
    // 'h' (0x68) | 0xF8 == 0xF8, which is not a valid lead byte in UTF-8.
    sample[0] |= 0xF8;
    wchar_t wsample[100]{};
    std::mbstate_t state = std::mbstate_t();
    // mbrtowc returns a size_t; failures are signalled with the sentinel values
    // (size_t)-1 (invalid sequence) and (size_t)-2 (incomplete sequence). Compare
    // against them directly instead of narrowing the result to a signed int.
    const std::size_t res = std::mbrtowc(wsample, sample, std::strlen(sample), &state);
    const bool rejected =
        res == static_cast<std::size_t>(-1) || res == static_cast<std::size_t>(-2);
    return rejected ? 0 : 1;
}

View File

@ -167,6 +167,9 @@
# define _DARWIN_USE_64_BIT_INODE 1
#endif
/* Define to 1 if mbrtowc attempts to convert invalid UTF-8 sequences.
   Set by the build-system probe of mbrtowc; when defined, str2wcs_internal
   keeps its guard against such byte sequences, otherwise that guard is
   compiled out. */
#cmakedefine HAVE_BROKEN_MBRTOWC_UTF8 1
#if __GNUC__ >= 3
#ifndef __warn_unused
#define __warn_unused __attribute__ ((warn_unused_result))

View File

@ -266,10 +266,13 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) {
size_t ret = 0;
wchar_t wc = 0;
if ((in[in_pos] & 0xF8) == 0xF8) {
if (false) {
#if defined(HAVE_BROKEN_MBRTOWC_UTF8)
} else if ((in[in_pos] & 0xF8) == 0xF8) {
// Protect against broken std::mbrtowc() implementations which attempt to encode UTF-8
// sequences longer than four bytes (e.g., OS X Snow Leopard).
use_encode_direct = true;
#endif
} else if (sizeof(wchar_t) == 2 && //!OCLINT(constant if expression)
(in[in_pos] & 0xF8) == 0xF0) {
// Assume we are in a UTF-16 environment (e.g., Cygwin) using a UTF-8 encoding.