fix handling of non-ASCII chars in C locale

The relevant standards allow the mbtowc/mbrtowc functions to reject
non-ASCII characters (i.e., chars with the high bit set) when the locale
is C or POSIX.  The BSD libraries (e.g., on OS X) don't do this but
the GNU libraries (e.g., on Linux) do. Like most programs we need the
C/POSIX locales to allow arbitrary bytes. So explicitly check if we're
in a single-byte locale (which would also include ISO-8859 variants)
and simply pass-thru the chars without encoding or decoding.

Fixes #2802.
This commit is contained in:
Kurtis Rader 2016-03-10 18:17:39 -08:00
parent fb0921249f
commit c2f1df1d4a
14 changed files with 215 additions and 165 deletions

View File

@ -1907,18 +1907,18 @@ static int builtin_echo(parser_t &parser, io_streams_t &streams, wchar_t **argv)
return STATUS_BUILTIN_OK;
}
/** The pwd builtin. We don't respect -P to resolve symbolic links because we try to always resolve them. */
// The pwd builtin. We don't respect -P to resolve symbolic links because we
// try to always resolve them.
static int builtin_pwd(parser_t &parser, io_streams_t &streams, wchar_t **argv)
{
wchar_t dir_path[4096];
wchar_t *res = wgetcwd(dir_path, 4096);
if (res == NULL)
wcstring res = wgetcwd();
if (res.empty())
{
return STATUS_BUILTIN_ERROR;
}
else
{
streams.out.append(dir_path);
streams.out.append(res);
streams.out.push_back(L'\n');
return STATUS_BUILTIN_OK;
}
@ -2699,9 +2699,8 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)
while (1)
{
int finished=0;
wchar_t res=0;
int finished = 0;
wchar_t res = 0;
mbstate_t state = {};
while (!finished)
@ -2713,24 +2712,26 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)
break;
}
size_t sz = mbrtowc(&res, &b, 1, &state);
switch (sz)
if (MB_CUR_MAX == 1) // single-byte locale
{
case (size_t)(-1):
memset(&state, '\0', sizeof(state));
break;
res = (unsigned char)b;
finished = 1;
}
else {
size_t sz = mbrtowc(&res, &b, 1, &state);
switch (sz)
{
case (size_t)-1:
memset(&state, 0, sizeof(state));
break;
case (size_t)(-2):
break;
case 0:
finished = 1;
break;
default:
finished=1;
break;
case (size_t)-2:
break;
default:
finished = 1;
break;
}
}
}

View File

@ -103,8 +103,7 @@ int fgetws2(wcstring *s, FILE *f)
{
errno=0;
c = getwc(f);
c = fgetwc(f);
if (errno == EILSEQ || errno == EINTR)
{
continue;
@ -148,8 +147,19 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
wcstring result;
result.reserve(in_len);
mbstate_t state = {};
size_t in_pos = 0;
if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
{
while (in_pos < in_len)
{
result.push_back((unsigned char)in[in_pos]);
in_pos++;
}
return result;
}
mbstate_t state = {};
while (in_pos < in_len)
{
wchar_t wc = 0;
@ -165,12 +175,12 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
{
use_encode_direct = true;
}
else if (ret == (size_t)(-2))
else if (ret == (size_t)-2)
{
/* Incomplete sequence */
use_encode_direct = true;
}
else if (ret == (size_t)(-1))
else if (ret == (size_t)-1)
{
/* Invalid data */
use_encode_direct = true;
@ -266,9 +276,7 @@ std::string wcs2string(const wcstring &input)
std::string result;
result.reserve(input.size());
mbstate_t state;
memset(&state, 0, sizeof(state));
mbstate_t state = {};
char converted[MB_LEN_MAX + 1];
for (size_t i=0; i < input.size(); i++)
@ -276,12 +284,22 @@ std::string wcs2string(const wcstring &input)
wchar_t wc = input[i];
if (wc == INTERNAL_SEPARATOR)
{
// Do nothing.
}
else if ((wc >= ENCODE_DIRECT_BASE) &&
(wc < ENCODE_DIRECT_BASE+256))
else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256)
{
result.push_back(wc - ENCODE_DIRECT_BASE);
}
else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
{
// If `wc` contains a wide character we emit a question-mark.
if (wc & ~0xFF)
{
wc = '?';
}
converted[0] = wc;
result.append(converted, 1);
}
else
{
memset(converted, 0, sizeof converted);
@ -311,38 +329,47 @@ std::string wcs2string(const wcstring &input)
*/
static char *wcs2str_internal(const wchar_t *in, char *out)
{
size_t res=0;
size_t in_pos=0;
size_t out_pos = 0;
mbstate_t state;
CHECK(in, 0);
CHECK(out, 0);
memset(&state, 0, sizeof(state));
size_t in_pos = 0;
size_t out_pos = 0;
mbstate_t state = {};
while (in[in_pos])
{
if (in[in_pos] == INTERNAL_SEPARATOR)
{
// Do nothing.
}
else if ((in[in_pos] >= ENCODE_DIRECT_BASE) &&
(in[in_pos] < ENCODE_DIRECT_BASE+256))
else if (in[in_pos] >= ENCODE_DIRECT_BASE &&
in[in_pos] < ENCODE_DIRECT_BASE + 256)
{
out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE;
}
else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
{
// If `wc` contains a wide character we emit a question-mark.
if (in[in_pos] & ~0xFF)
{
out[out_pos++] = '?';
}
else
{
out[out_pos++] = (unsigned char)in[in_pos];
}
}
else
{
res = wcrtomb(&out[out_pos], in[in_pos], &state);
if (res == (size_t)(-1))
size_t len = wcrtomb(&out[out_pos], in[in_pos], &state);
if (len == (size_t)-1)
{
debug(1, L"Wide character %d has no narrow representation", in[in_pos]);
memset(&state, 0, sizeof(state));
}
else
{
out_pos += res;
out_pos += len;
}
}
in_pos++;

View File

@ -377,14 +377,13 @@ static void setup_path()
int env_set_pwd()
{
wchar_t dir_path[4096];
wchar_t *res = wgetcwd(dir_path, 4096);
if (!res)
wcstring res = wgetcwd();
if (res.empty())
{
debug(0, _(L"Could not determine current working directory. Is your locale set correctly?"));
return 0;
}
env_set(L"PWD", dir_path, ENV_EXPORT | ENV_GLOBAL);
env_set(L"PWD", res.c_str(), ENV_EXPORT | ENV_GLOBAL);
return 1;
}

View File

@ -606,7 +606,7 @@ static FILE *fw_data;
static void fw_writer(wchar_t c)
{
putwc(c, fw_data);
fputwc(c, fw_data);
}
/*
@ -648,33 +648,30 @@ int wprintf(const wchar_t *filter, ...)
#endif
#ifndef HAVE_FGETWC
wint_t fgetwc(FILE *stream)
{
wchar_t res=0;
mbstate_t state;
memset(&state, '\0', sizeof(state));
wchar_t res;
mbstate_t state = {};
while (1)
{
int b = fgetc(stream);
char bb;
int sz;
if (b == EOF)
{
return WEOF;
}
bb=b;
sz = mbrtowc(&res, &bb, 1, &state);
if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
{
return b;
}
char bb = b;
size_t sz = mbrtowc(&res, &bb, 1, &state);
switch (sz)
{
case -1:
memset(&state, '\0', sizeof(state));
return WEOF;
case -2:
break;
case 0:
@ -683,35 +680,40 @@ wint_t fgetwc(FILE *stream)
return res;
}
}
}
wint_t getwc(FILE *stream)
{
return fgetwc(stream);
}
#endif
#ifndef HAVE_FPUTWC
wint_t fputwc(wchar_t wc, FILE *stream)
{
int res;
char s[MB_CUR_MAX+1];
memset(s, 0, MB_CUR_MAX+1);
wctomb(s, wc);
res = fputs(s, stream);
return res==EOF?WEOF:wc;
}
int res = 0;
mbstate_t state = {};
char s[MB_CUR_MAX + 1] = {};
wint_t putwc(wchar_t wc, FILE *stream)
{
return fputwc(wc, stream);
}
if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
{
// If `wc` contains a wide character we emit a question-mark.
if (wc & ~0xFF)
{
wc = '?';
}
s[0] = (char)wc;
res = fputs(s, stream);
}
else
{
size_t len = wcrtomb(s, wc, &state);
if (len == (size_t)-1)
{
debug(1, L"Wide character %d has no narrow representation", wc);
}
else {
res = fputs(s, stream);
}
}
return res == EOF ? WEOF : wc;
}
#endif
#ifndef HAVE_WCSTOK

View File

@ -158,29 +158,13 @@ int vswprintf(wchar_t *out, size_t n, const wchar_t *filter, va_list va);
#endif
#ifndef HAVE_FGETWC
/**
Fallback implementation of fgetwc
*/
// Fallback implementation of fgetwc.
wint_t fgetwc(FILE *stream);
/**
Fallback implementation of getwc
*/
wint_t getwc(FILE *stream);
#endif
#ifndef HAVE_FPUTWC
/**
Fallback implementation of fputwc
*/
// Fallback implementation of fputwc.
wint_t fputwc(wchar_t wc, FILE *stream);
/**
Fallback implementation of putwc
*/
wint_t putwc(wchar_t wc, FILE *stream);
#endif
#ifndef HAVE_WCSTOK

View File

@ -926,10 +926,9 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
{
const char *end = begin + length;
const char *pos=begin;
bool was_backslash = 0;
const char *pos = begin;
wcstring out;
bool was_backslash = false;
bool first_char = true;
bool timestamp_mode = false;
time_t timestamp = 0;
@ -937,12 +936,18 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
while (1)
{
wchar_t c;
mbstate_t state;
size_t res;
mbstate_t state = {};
memset(&state, 0, sizeof(state));
res = mbrtowc(&c, pos, end-pos, &state);
if (MB_CUR_MAX == 1) // single-byte locale
{
c = (unsigned char)*pos;
res = 1;
}
else
{
res = mbrtowc(&c, pos, end - pos, &state);
}
if (res == (size_t)-1)
{

View File

@ -263,16 +263,17 @@ wchar_t input_common_readch(int timed)
while (1)
{
wint_t b = readb();
char bb;
size_t sz;
if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
{
return (unsigned char)b;
}
if ((b >= R_NULL) && (b < R_NULL + 1000))
return b;
bb=b;
sz = mbrtowc(&res, &bb, 1, &state);
char bb = b;
size_t sz = mbrtowc(&res, &bb, 1, &state);
switch (sz)
{

View File

@ -386,32 +386,35 @@ int writeb(tputs_arg_t b)
int writech(wint_t ch)
{
mbstate_t state;
size_t i;
char buff[MB_LEN_MAX+1];
size_t bytes;
size_t len;
if ((ch >= ENCODE_DIRECT_BASE) &&
(ch < ENCODE_DIRECT_BASE+256))
if (ch >= ENCODE_DIRECT_BASE && ch < ENCODE_DIRECT_BASE + 256)
{
buff[0] = ch - ENCODE_DIRECT_BASE;
bytes=1;
len = 1;
}
else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
{
// If `wc` contains a wide character we emit a question-mark.
if (ch & ~0xFF)
{
ch = '?';
}
buff[0] = ch;
len = 1;
}
else
{
memset(&state, 0, sizeof(state));
bytes= wcrtomb(buff, ch, &state);
switch (bytes)
mbstate_t state = {};
len = wcrtomb(buff, ch, &state);
if (len == (size_t)-1)
{
case (size_t)(-1):
{
return 1;
}
return 1;
}
}
for (i=0; i<bytes; i++)
for (size_t i = 0; i < len; i++)
{
out(buff[i]);
}
@ -420,29 +423,26 @@ int writech(wint_t ch)
void writestr(const wchar_t *str)
{
char *pos;
CHECK(str,);
// while( *str )
// writech( *str++ );
/*
Check amount of needed space
*/
size_t len = wcstombs(0, str, 0);
if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
{
while( *str )
{
writech( *str++ );
}
return;
}
size_t len = wcstombs(0, str, 0); // figure amount of space needed
if (len == (size_t)-1)
{
debug(1, L"Tried to print invalid wide character string");
return;
}
// Convert the string.
len++;
/*
Convert
*/
char *buffer, static_buffer[256];
if (len <= sizeof static_buffer)
buffer = static_buffer;
@ -456,7 +456,7 @@ void writestr(const wchar_t *str)
/*
Write
*/
for (pos = buffer; *pos; pos++)
for (char *pos = buffer; *pos; pos++)
{
out(*pos);
}

View File

@ -145,30 +145,23 @@ bool wreaddir_for_dirs(DIR *dir, wcstring *out_name)
}
wchar_t *wgetcwd(wchar_t *buff, size_t sz)
const wcstring wgetcwd()
{
char *buffc = (char *)malloc(sz*MAX_UTF8_BYTES);
char *res;
wchar_t *ret = 0;
wcstring retval;
if (!buffc)
{
errno = ENOMEM;
return 0;
}
res = getcwd(buffc, sz*MAX_UTF8_BYTES);
char *res = getcwd(NULL, 0);
if (res)
{
if ((size_t)-1 != mbstowcs(buff, buffc, sizeof(wchar_t) * sz))
{
ret = buff;
}
retval = str2wcstring(res);
free(res);
}
else
{
debug(0, _(L"getcwd() failed with errno %d/%s"), errno, strerror(errno));
retval = wcstring();
}
free(buffc);
return ret;
return retval;
}
int wchdir(const wcstring &dir)

View File

@ -70,10 +70,8 @@ void safe_perror(const char *message);
*/
const char *safe_strerror(int err);
/**
Wide character version of getcwd().
*/
wchar_t *wgetcwd(wchar_t *buff, size_t sz);
// Wide character version of getcwd().
const wcstring wgetcwd();
/**
Wide character version of chdir()

0
tests/c-locale.err Normal file
View File

35
tests/c-locale.in Normal file
View File

@ -0,0 +1,35 @@
# Verify that fish can pass through non-ASCII characters in the C/POSIX
# locale. This is to prevent regression of
# https://github.com/fish-shell/fish-shell/issues/2802.
#
# These tests are needed because the relevant standards allow the functions
# mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid
# or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid.
# Other libc implementations (e.g., BSD) treat them as valid. We want fish to
# always treat those bytes as valid.
# The fish in the middle of the pipeline should be receiving a UTF-8 encoded
# version of the unicode from the echo. It should pass those bytes thru
# literally since it is in the C locale. We verify this by first passing the
# echo output directly to the `xxd` program then via a fish instance. The
# output should be "58c3bb58" for the first statement and "58c3bc58" for the
# second.
echo -n X\u00fbX | \
xxd --plain
echo X\u00fcX | env LC_ALL=C ../test/root/bin/fish -c 'read foo; echo -n $foo' | \
xxd --plain
# This test is subtle. Despite the presence of the \u00fc unicode char (a "u"
# with an umlaut) the fact the locale is C/POSIX will cause the \xfc byte to
# be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's because the
# few single-byte unicode chars (that are not ASCII) are generally in the
# ISO-8859-1 char set which is encompased by the C locale. The output should
# be "59fc59".
env LC_ALL=C ../test/root/bin/fish -c 'echo -n Y\u00fcY' | \
xxd --plain
# The user can specify a wide unicode character (one requiring more than a
# single byte). In the C/POSIX locales we substitute a question-mark for the
# unencodable wide char. The output should be "543f54".
env LC_ALL=C ../test/root/bin/fish -c 'echo -n T\u01fdT' | \
xxd --plain

4
tests/c-locale.out Normal file
View File

@ -0,0 +1,4 @@
58c3bb58
58c3bc58
59fc59
543f54

1
tests/c-locale.status Normal file
View File

@ -0,0 +1 @@
0