From 595d5937328bd148cc382090a5df42c4a15ca92d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20H=C3=B8rl=C3=BCck=20Berg?= <36937807+henrikhorluck@users.noreply.github.com> Date: Sun, 2 Jul 2023 01:38:53 +0200 Subject: [PATCH] Fully migrate to Rust escape string tests and code Co-Authored-By: Mahmoud Al-Qudsi --- fish-rust/src/common.rs | 42 +++++++- src/common.cpp | 223 ++-------------------------------------- src/fish_tests.cpp | 75 -------------- 3 files changed, 49 insertions(+), 291 deletions(-) diff --git a/fish-rust/src/common.rs b/fish-rust/src/common.rs index 9d64fa428..bda45217b 100644 --- a/fish-rust/src/common.rs +++ b/fish-rust/src/common.rs @@ -2375,16 +2375,31 @@ mod common_ffi { type escape_string_style_t = crate::ffi::escape_string_style_t; } extern "Rust" { - fn rust_unescape_string( + #[cxx_name = "rust_unescape_string"] + fn unescape_string_ffi( input: *const wchar_t, len: usize, escape_special: u32, style: escape_string_style_t, ) -> UniquePtr; + + #[cxx_name = "rust_escape_string_script"] + fn escape_string_script_ffi( + input: *const wchar_t, + len: usize, + flags: u32, + ) -> UniquePtr; + + #[cxx_name = "rust_escape_string_url"] + fn escape_string_url_ffi(input: *const wchar_t, len: usize) -> UniquePtr; + + #[cxx_name = "rust_escape_string_var"] + fn escape_string_var_ffi(input: *const wchar_t, len: usize) -> UniquePtr; + } } -fn rust_unescape_string( +fn unescape_string_ffi( input: *const ffi::wchar_t, len: usize, escape_special: u32, @@ -2405,3 +2420,26 @@ fn rust_unescape_string( None => UniquePtr::null(), } } + +fn escape_string_script_ffi( + input: *const ffi::wchar_t, + len: usize, + flags: u32, +) -> UniquePtr { + let input = unsafe { slice::from_raw_parts(input, len) }; + escape_string_script( + wstr::from_slice(input).unwrap(), + EscapeFlags::from_bits(flags).unwrap(), + ) + .to_ffi() +} + +fn escape_string_var_ffi(input: *const ffi::wchar_t, len: usize) -> UniquePtr { + let input = unsafe { slice::from_raw_parts(input, len) }; + escape_string_var(wstr::from_slice(input).unwrap()).to_ffi() +} + +fn escape_string_url_ffi(input: *const ffi::wchar_t, len: usize) -> UniquePtr { + let input = unsafe { slice::from_raw_parts(input, len) }; + escape_string_url(wstr::from_slice(input).unwrap()).to_ffi() +} diff --git a/src/common.cpp b/src/common.cpp index d93e2cf0a..282bb2438 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -117,9 +117,6 @@ long convert_digit(wchar_t d, int base) { return res; } -/// Test whether the char is a valid hex digit as used by the `escape_string_*()` functions. -static bool is_hex_digit(int c) { return std::strchr("0123456789ABCDEF", c) != nullptr; } - bool is_windows_subsystem_for_linux() { #if defined(WSL) return true; @@ -723,51 +720,17 @@ wcstring reformat_for_screen(const wcstring &msg, const termsize_t &termsize) { /// Escape a string in a fashion suitable for using as a URL. Store the result in out_str. static void escape_string_url(const wcstring &in, wcstring &out) { - const std::string narrow = wcs2string(in); - for (auto &c1 : narrow) { - // This silliness is so we get the correct result whether chars are signed or unsigned. - unsigned int c2 = static_cast(c1) & 0xFF; - if (!(c2 & 0x80) && - (isalnum(c2) || c2 == '/' || c2 == '.' || c2 == '~' || c2 == '-' || c2 == '_')) { - // The above characters don't need to be encoded. - out.push_back(static_cast(c2)); - } else { - // All other chars need to have their UTF-8 representation encoded in hex. - wchar_t buf[4]; - swprintf(buf, sizeof buf / sizeof buf[0], L"%%%02X", c2); - out.append(buf); - } + auto result = rust_escape_string_url(in.c_str(), in.size()); + if (result) { + out = *result; } } /// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str. static void escape_string_var(const wcstring &in, wcstring &out) { - bool prev_was_hex_encoded = false; - const std::string narrow = wcs2string(in); - for (auto c1 : narrow) { - // This silliness is so we get the correct result whether chars are signed or unsigned. - unsigned int c2 = static_cast(c1) & 0xFF; - if (!(c2 & 0x80) && isalnum(c2) && (!prev_was_hex_encoded || !is_hex_digit(c2))) { - // ASCII alphanumerics don't need to be encoded. - if (prev_was_hex_encoded) { - out.push_back(L'_'); - prev_was_hex_encoded = false; - } - out.push_back(static_cast(c2)); - } else if (c2 == '_') { - // Underscores are encoded by doubling them. - out.append(L"__"); - prev_was_hex_encoded = false; - } else { - // All other chars need to have their UTF-8 representation encoded in hex. - wchar_t buf[4]; - swprintf(buf, sizeof buf / sizeof buf[0], L"_%02X", c2); - out.append(buf); - prev_was_hex_encoded = true; - } - } - if (prev_was_hex_encoded) { - out.push_back(L'_'); + auto result = rust_escape_string_var(in.c_str(), in.size()); + if (result) { + out = *result; } } @@ -790,177 +753,9 @@ wcstring escape_string_for_double_quotes(wcstring in) { /// Escape a string in a fashion suitable for using in fish script. Store the result in out_str. static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out, escape_flags_t flags) { - const wchar_t *in = orig_in; - const bool escape_printables = !(flags & ESCAPE_NO_PRINTABLES); - const bool no_quoted = static_cast(flags & ESCAPE_NO_QUOTED); - const bool no_tilde = static_cast(flags & ESCAPE_NO_TILDE); - const bool no_qmark = feature_test(feature_flag_t::qmark_noglob); - const bool symbolic = static_cast(flags & ESCAPE_SYMBOLIC) && (MB_CUR_MAX > 1); - assert((!symbolic || !escape_printables) && "symbolic implies escape-no-printables"); - - bool need_escape = false; - bool need_complex_escape = false; - - if (!no_quoted && in_len == 0) { - out.assign(L"''"); - return; - } - - for (size_t i = 0; i < in_len; i++) { - if ((*in >= ENCODE_DIRECT_BASE) && (*in < ENCODE_DIRECT_BASE + 256)) { - int val = *in - ENCODE_DIRECT_BASE; - int tmp; - - out += L'\\'; - out += L'X'; - - tmp = val / 16; - out += tmp > 9 ? L'a' + (tmp - 10) : L'0' + tmp; - - tmp = val % 16; - out += tmp > 9 ? L'a' + (tmp - 10) : L'0' + tmp; - need_escape = need_complex_escape = true; - - } else { - wchar_t c = *in; - switch (c) { - case L'\t': { - if (symbolic) - out += L'␉'; - else - out += L"\\t"; - need_escape = need_complex_escape = true; - break; - } - case L'\n': { - if (symbolic) - out += L'␤'; - else - out += L"\\n"; - need_escape = need_complex_escape = true; - break; - } - case L'\b': { - if (symbolic) - out += L'␈'; - else - out += L"\\b"; - need_escape = need_complex_escape = true; - break; - } - case L'\r': { - if (symbolic) - out += L'␍'; - else - out += L"\\r"; - need_escape = need_complex_escape = true; - break; - } - case L'\x1B': { - if (symbolic) - out += L'␛'; - else - out += L"\\e"; - need_escape = need_complex_escape = true; - break; - } - case L'\x7F': { - if (symbolic) - out += L'␡'; - else - out += L"\\x7f"; - need_escape = need_complex_escape = true; - break; - } - case L'\\': - case L'\'': { - need_escape = need_complex_escape = true; - if (escape_printables || (c == L'\\' && !symbolic)) out += L'\\'; - out += *in; - break; - } - case ANY_CHAR: { - // See #1614 - out += L'?'; - break; - } - case ANY_STRING: { - out += L'*'; - break; - } - case ANY_STRING_RECURSIVE: { - out += L"**"; - break; - } - - case L'&': - case L'$': - case L' ': - case L'#': - case L'<': - case L'>': - case L'(': - case L')': - case L'[': - case L']': - case L'{': - case L'}': - case L'?': - case L'*': - case L'|': - case L';': - case L'"': - case L'%': - case L'~': { - bool char_is_normal = (c == L'~' && no_tilde) || (c == L'?' && no_qmark); - if (!char_is_normal) { - need_escape = true; - if (escape_printables) out += L'\\'; - } - out += *in; - break; - } - - default: { - if (*in >= 0 && *in < 32) { - need_escape = need_complex_escape = true; - - if (symbolic) { - out += L'\u2400' + *in; - break; - } - - if (*in < 27 && *in != 0) { - out += L'\\'; - out += L'c'; - out += L'a' + *in - 1; - break; - } - - int tmp = (*in) % 16; - out += L'\\'; - out += L'x'; - out += ((*in > 15) ? L'1' : L'0'); - out += tmp > 9 ? L'a' + (tmp - 10) : L'0' + tmp; - } else { - out += *in; - } - break; - } - } - } - - in++; - } - - // Use quoted escaping if possible, since most people find it easier to read. - if (!no_quoted && need_escape && !need_complex_escape && escape_printables) { - wchar_t single_quote = L'\''; - out.clear(); - out.reserve(2 + in_len); - out.push_back(single_quote); - out.append(orig_in, in_len); - out.push_back(single_quote); + auto result = rust_escape_string_script(orig_in, in_len, flags); + if (result) { + out = *result; } } diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 230306571..571720a14 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -367,79 +367,6 @@ static void test_enum_array() { do_test(es.at(test_enum::gamma) == "def"); } -/// Test sane escapes. -static void test_unescape_sane() { - const struct test_t { - const wchar_t *input; - const wchar_t *expected; - } tests[] = { - {L"abcd", L"abcd"}, {L"'abcd'", L"abcd"}, - {L"'abcd\\n'", L"abcd\\n"}, {L"\"abcd\\n\"", L"abcd\\n"}, - {L"\"abcd\\n\"", L"abcd\\n"}, {L"\\143", L"c"}, - {L"'\\143'", L"\\143"}, {L"\\n", L"\n"} // \n normally becomes newline - }; - for (const auto &test : tests) { - auto output = unescape_string(test.input, UNESCAPE_DEFAULT); - if (!output) { - err(L"Failed to unescape '%ls'\n", test.input); - } else if (*output != test.expected) { - err(L"In unescaping '%ls', expected '%ls' but got '%ls'\n", test.input, test.expected, - output->c_str()); - } - } - - // Test for overflow. - if (unescape_string(L"echo \\UFFFFFF", UNESCAPE_DEFAULT)) { - err(L"Should not have been able to unescape \\UFFFFFF\n"); - } - if (unescape_string(L"echo \\U110000", UNESCAPE_DEFAULT)) { - err(L"Should not have been able to unescape \\U110000\n"); - } -#if WCHAR_MAX != 0xffff - // TODO: Make this work on MS Windows. - if (!unescape_string(L"echo \\U10FFFF", UNESCAPE_DEFAULT)) { - err(L"Should have been able to unescape \\U10FFFF\n"); - } -#endif -} - -/// Test the escaping/unescaping code by escaping/unescaping random strings and verifying that the -/// original string comes back. -static void test_escape_crazy() { - say(L"Testing escaping and unescaping"); - wcstring random_string; - wcstring escaped_string; - for (size_t i = 0; i < ESCAPE_TEST_COUNT; i++) { - random_string.clear(); - while (random() % ESCAPE_TEST_LENGTH) { - random_string.push_back((random() % ESCAPE_TEST_CHAR) + 1); - } - - escaped_string = escape_string(random_string); - auto unescaped_string = unescape_string(escaped_string, UNESCAPE_DEFAULT); - - if (!unescaped_string) { - err(L"Failed to unescape string <%ls>", escaped_string.c_str()); - break; - } else if (*unescaped_string != random_string) { - err(L"Escaped and then unescaped string '%ls', but got back a different string '%ls'", - random_string.c_str(), unescaped_string->c_str()); - break; - } - } - - // Verify that ESCAPE_NO_PRINTABLES also escapes backslashes so we don't regress on issue #3892. - random_string = L"line 1\\n\nline 2"; - escaped_string = escape_string(random_string, ESCAPE_NO_PRINTABLES | ESCAPE_NO_QUOTED); - auto unescaped_string = unescape_string(escaped_string, UNESCAPE_DEFAULT); - if (!unescaped_string) { - err(L"Failed to unescape string <%ls>", escaped_string.c_str()); - } else if (*unescaped_string != random_string) { - err(L"Escaped and then unescaped string '%ls', but got back a different string '%ls'", - random_string.c_str(), unescaped_string->c_str()); - } -} - static void test_format() { say(L"Testing formatting functions"); struct { @@ -6216,8 +6143,6 @@ static const test_t s_tests[]{ {TEST_GROUP("new_parser_ad_hoc"), test_new_parser_ad_hoc}, {TEST_GROUP("new_parser_errors"), test_new_parser_errors}, {TEST_GROUP("error_messages"), test_error_messages}, - {TEST_GROUP("escape"), test_unescape_sane}, - {TEST_GROUP("escape"), test_escape_crazy}, {TEST_GROUP("format"), test_format}, {TEST_GROUP("convert"), test_convert}, {TEST_GROUP("convert"), test_convert_private_use},