From 92dd6de73ccdb37facc43f768653e11941448a6b Mon Sep 17 00:00:00 2001 From: Kurtis Rader Date: Tue, 27 Sep 2016 21:07:10 -0700 Subject: [PATCH] deal with broken unicode implementations Both GNU and BSD have bugs regarding the classification of non-characters and private use area characters. Provide wrappers around iswalnum(), iswalpha(), and iswgraph() to provide a consistent experience. We don't bother to autoconf the use of these wrappers for several reasons, including the fact that a binary built for one distro release should behave correctly on another release (e.g., FreeBSD 10 does the right thing while FreeBSD 11 and 12 do not with respect to iswalnum() of code points in the range 0xFDD0..0xFDFF). Also move a few functions from common.* to wutil.* because they are wide char specific and really belong in the latter module. Fixes #3050 --- src/common.cpp | 20 ------------- src/common.h | 21 -------------- src/wutil.cpp | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/wutil.h | 25 ++++++++++++++++ 4 files changed, 104 insertions(+), 41 deletions(-) diff --git a/src/common.cpp b/src/common.cpp index 7154f0ecb..539cd2133 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -428,26 +428,6 @@ void append_format(wcstring &str, const wchar_t *format, ...) 
{ va_end(va); } -const wchar_t *wcsvarname(const wchar_t *str) { - while (*str) { - if ((!iswalnum(*str)) && (*str != L'_')) { - return str; - } - str++; - } - return NULL; -} - -const wchar_t *wcsvarname(const wcstring &str) { return wcsvarname(str.c_str()); } - -const wchar_t *wcsfuncname(const wcstring &str) { return wcschr(str.c_str(), L'/'); } - -bool wcsvarchr(wchar_t chr) { return iswalnum(chr) || chr == L'_'; } - -int fish_wcswidth(const wchar_t *str) { return fish_wcswidth(str, wcslen(str)); } - -int fish_wcswidth(const wcstring &str) { return fish_wcswidth(str.c_str(), str.size()); } - wchar_t *quote_end(const wchar_t *pos) { wchar_t c = *pos; diff --git a/src/common.h b/src/common.h index 4449f898f..50100af8b 100644 --- a/src/common.h +++ b/src/common.h @@ -617,27 +617,6 @@ wcstring vformat_string(const wchar_t *format, va_list va_orig); void append_format(wcstring &str, const wchar_t *format, ...); void append_formatv(wcstring &str, const wchar_t *format, va_list ap); -/// Test if the given string is a valid variable name. -/// -/// \return null if this is a valid name, and a pointer to the first invalid character otherwise. -const wchar_t *wcsvarname(const wchar_t *str); -const wchar_t *wcsvarname(const wcstring &str); - -/// Test if the given string is a valid function name. -/// -/// \return null if this is a valid name, and a pointer to the first invalid character otherwise. -const wchar_t *wcsfuncname(const wcstring &str); - -/// Test if the given string is valid in a variable name. -/// -/// \return true if this is a valid name, false otherwise. -bool wcsvarchr(wchar_t chr); - -/// Convenience variants on fish_wcwswidth(). -/// -/// See fallback.h for the normal definitions. -int fish_wcswidth(const wchar_t *str); -int fish_wcswidth(const wcstring &str); /// This functions returns the end of the quoted substring beginning at \c in. The type of quoting /// character is detemrined by examining \c in. Returns 0 on error. 
diff --git a/src/wutil.cpp b/src/wutil.cpp index c7b826fd4..5a08b6aca 100644 --- a/src/wutil.cpp +++ b/src/wutil.cpp @@ -1,4 +1,5 @@ // Wide character equivalents of various standard unix functions. +#define FISH_NO_ISW_WRAPPERS #include "config.h" #include @@ -470,6 +471,84 @@ int wrename(const wcstring &old, const wcstring &newv) { return rename(old_narrow.c_str(), new_narrow.c_str()); } +/// Return one if the code point is in the range we reserve for internal use. +int fish_is_reserved_codepoint(wint_t wc) { + if (RESERVED_CHAR_BASE <= wc && wc < RESERVED_CHAR_END) return 1; + if (EXPAND_RESERVED_BASE <= wc && wc < EXPAND_RESERVED_END) return 1; + if (WILDCARD_RESERVED_BASE <= wc && wc < WILDCARD_RESERVED_END) return 1; + return 0; +} + +/// Return one if the code point is in a Unicode private use area. +int fish_is_pua(wint_t wc) { + if (PUA1_START <= wc && wc < PUA1_END) return 1; + if (PUA2_START <= wc && wc < PUA2_END) return 1; + if (PUA3_START <= wc && wc < PUA3_END) return 1; + return 0; +} + +/// We need this because there are too many implementations that don't return the proper answer for +/// some code points. See issue #3050. +int fish_iswalnum(wint_t wc) { + if (fish_is_reserved_codepoint(wc)) return 0; + if (fish_is_pua(wc)) return 0; + return iswalnum(wc); +} + +/// We need this because there are too many implementations that don't return the proper answer for +/// some code points. See issue #3050. +int fish_iswalpha(wint_t wc) { + if (fish_is_reserved_codepoint(wc)) return 0; + if (fish_is_pua(wc)) return 0; + return iswalpha(wc); +} + +/// We need this because there are too many implementations that don't return the proper answer for +/// some code points. See issue #3050. +int fish_iswgraph(wint_t wc) { + if (fish_is_reserved_codepoint(wc)) return 0; + if (fish_is_pua(wc)) return 1; + return iswgraph(wc); +} + +/// Test if the given string is a valid variable name. 
+/// +/// \return null if this is a valid name, and a pointer to the first invalid character otherwise. +const wchar_t *wcsvarname(const wchar_t *str) { + while (*str) { + if ((!fish_iswalnum(*str)) && (*str != L'_')) { + return str; + } + str++; + } + return NULL; +} + +/// Test if the given string is a valid variable name. +/// +/// \return null if this is a valid name, and a pointer to the first invalid character otherwise. +const wchar_t *wcsvarname(const wcstring &str) { return wcsvarname(str.c_str()); } + +/// Test if the given string is a valid function name. +/// +/// \return null if this is a valid name, and a pointer to the first invalid character otherwise. +const wchar_t *wcsfuncname(const wcstring &str) { return wcschr(str.c_str(), L'/'); } + +/// Test if the given character is valid in a variable name. +/// +/// \return true if this is a valid character, false otherwise. +bool wcsvarchr(wchar_t chr) { return fish_iswalnum(chr) || chr == L'_'; } + +/// Convenience variants on fish_wcswidth(). +/// +/// See fallback.h for the normal definitions. +int fish_wcswidth(const wchar_t *str) { return fish_wcswidth(str, wcslen(str)); } + +/// Convenience variants on fish_wcswidth(). +/// +/// See fallback.h for the normal definitions. +int fish_wcswidth(const wcstring &str) { return fish_wcswidth(str.c_str(), str.size()); } + file_id_t file_id_t::file_id_from_stat(const struct stat *buf) { assert(buf != NULL); diff --git a/src/wutil.h b/src/wutil.h index d0f1a7944..304c3b23c 100644 --- a/src/wutil.h +++ b/src/wutil.h @@ -59,6 +59,31 @@ int wmkdir(const wcstring &dir, int mode); int wrename(const wcstring &oldName, const wcstring &newName); +#define PUA1_START 0xE000 +#define PUA1_END 0xF900 +#define PUA2_START 0xF0000 +#define PUA2_END 0xFFFFE +#define PUA3_START 0x100000 +#define PUA3_END 0x10FFFE + +// We need this because there are too many implementations that don't return the proper answer for +// some code points. See issue #3050. 
+#ifndef FISH_NO_ISW_WRAPPERS +#define iswalnum fish_iswalnum +#define iswalpha fish_iswalpha +#define iswgraph fish_iswgraph +#endif +int fish_iswalnum(wint_t wc); +int fish_iswalpha(wint_t wc); +int fish_iswgraph(wint_t wc); + +const wchar_t *wcsvarname(const wchar_t *str); +const wchar_t *wcsvarname(const wcstring &str); +const wchar_t *wcsfuncname(const wcstring &str); +bool wcsvarchr(wchar_t chr); +int fish_wcswidth(const wchar_t *str); +int fish_wcswidth(const wcstring &str); + /// Class for representing a file's inode. We use this to detect and avoid symlink loops, among /// other things. While an inode / dev pair is sufficient to distinguish co-existing files, Linux /// seems to aggressively re-use inodes, so it cannot determine if a file has been deleted (ABA