From 595d5937328bd148cc382090a5df42c4a15ca92d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20H=C3=B8rl=C3=BCck=20Berg?=
 <36937807+henrikhorluck@users.noreply.github.com>
Date: Sun, 2 Jul 2023 01:38:53 +0200
Subject: [PATCH] Fully migrate to Rust escape string tests and code

Co-Authored-By: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
---
 fish-rust/src/common.rs |  42 +++++++-
 src/common.cpp          | 223 ++--------------------------------------
 src/fish_tests.cpp      |  75 --------------
 3 files changed, 49 insertions(+), 291 deletions(-)
diff --git a/fish-rust/src/common.rs b/fish-rust/src/common.rs
index 9d64fa428..bda45217b 100644
--- a/fish-rust/src/common.rs
+++ b/fish-rust/src/common.rs
@@ -2375,16 +2375,30 @@ mod common_ffi {
         type escape_string_style_t = crate::ffi::escape_string_style_t;
     }
     extern "Rust" {
-        fn rust_unescape_string(
+        #[cxx_name = "rust_unescape_string"]
+        fn unescape_string_ffi(
             input: *const wchar_t,
             len: usize,
             escape_special: u32,
             style: escape_string_style_t,
         ) -> UniquePtr<CxxWString>;
+
+        #[cxx_name = "rust_escape_string_script"]
+        fn escape_string_script_ffi(
+            input: *const wchar_t,
+            len: usize,
+            flags: u32,
+        ) -> UniquePtr<CxxWString>;
+
+        #[cxx_name = "rust_escape_string_url"]
+        fn escape_string_url_ffi(input: *const wchar_t, len: usize) -> UniquePtr<CxxWString>;
+
+        #[cxx_name = "rust_escape_string_var"]
+        fn escape_string_var_ffi(input: *const wchar_t, len: usize) -> UniquePtr<CxxWString>;
     }
 }
 
-fn rust_unescape_string(
+fn unescape_string_ffi(
     input: *const ffi::wchar_t,
     len: usize,
     escape_special: u32,
@@ -2405,3 +2419,45 @@ fn rust_unescape_string(
         None => UniquePtr::null(),
     }
 }
+
+/// FFI entry point that C++ calls as `rust_escape_string_script`.
+///
+/// Escapes the `len` wide characters starting at `input` via [`escape_string_script`], treating
+/// `flags` as raw [`EscapeFlags`] bits. Panics if the characters cannot be viewed as a `wstr` or
+/// if `flags` has bits that `EscapeFlags::from_bits` rejects.
+fn escape_string_script_ffi(
+    input: *const ffi::wchar_t,
+    len: usize,
+    flags: u32,
+) -> UniquePtr<CxxWString> {
+    // SAFETY: sound as long as the C++ caller passes a non-null pointer to `len` initialized wide
+    // chars that stay valid and unmodified until this function returns.
+    let input = unsafe { slice::from_raw_parts(input, len) };
+    let input = wstr::from_slice(input).expect("wide string from C++ should convert to wstr");
+    let flags = EscapeFlags::from_bits(flags).expect("flags should only hold EscapeFlags bits");
+    escape_string_script(input, flags).to_ffi()
+}
+
+/// FFI entry point that C++ calls as `rust_escape_string_var`.
+///
+/// Escapes the `len` wide characters starting at `input` via [`escape_string_var`]. Panics if the
+/// characters cannot be viewed as a `wstr`.
+fn escape_string_var_ffi(input: *const ffi::wchar_t, len: usize) -> UniquePtr<CxxWString> {
+    // SAFETY: sound as long as the C++ caller passes a non-null pointer to `len` initialized wide
+    // chars that stay valid and unmodified until this function returns.
+    let input = unsafe { slice::from_raw_parts(input, len) };
+    let input = wstr::from_slice(input).expect("wide string from C++ should convert to wstr");
+    escape_string_var(input).to_ffi()
+}
+
+/// FFI entry point that C++ calls as `rust_escape_string_url`.
+///
+/// Escapes the `len` wide characters starting at `input` via [`escape_string_url`]. Panics if the
+/// characters cannot be viewed as a `wstr`.
+fn escape_string_url_ffi(input: *const ffi::wchar_t, len: usize) -> UniquePtr<CxxWString> {
+    // SAFETY: sound as long as the C++ caller passes a non-null pointer to `len` initialized wide
+    // chars that stay valid and unmodified until this function returns.
+    let input = unsafe { slice::from_raw_parts(input, len) };
+    let input = wstr::from_slice(input).expect("wide string from C++ should convert to wstr");
+    escape_string_url(input).to_ffi()
+}
diff --git a/src/common.cpp b/src/common.cpp
index d93e2cf0a..282bb2438 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -117,9 +117,6 @@ long convert_digit(wchar_t d, int base) {
     return res;
 }
 
-/// Test whether the char is a valid hex digit as used by the `escape_string_*()` functions.
-static bool is_hex_digit(int c) { return std::strchr("0123456789ABCDEF", c) != nullptr; }
-
 bool is_windows_subsystem_for_linux() {
 #if defined(WSL)
     return true;
@@ -723,51 +720,17 @@ wcstring reformat_for_screen(const wcstring &msg, const termsize_t &termsize) {
 
 /// Escape a string in a fashion suitable for using as a URL. Store the result in out_str.
 static void escape_string_url(const wcstring &in, wcstring &out) {
-    const std::string narrow = wcs2string(in);
-    for (auto &c1 : narrow) {
-        // This silliness is so we get the correct result whether chars are signed or unsigned.
-        unsigned int c2 = static_cast<unsigned int>(c1) & 0xFF;
-        if (!(c2 & 0x80) &&
-            (isalnum(c2) || c2 == '/' || c2 == '.' || c2 == '~' || c2 == '-' || c2 == '_')) {
-            // The above characters don't need to be encoded.
-            out.push_back(static_cast<wchar_t>(c2));
-        } else {
-            // All other chars need to have their UTF-8 representation encoded in hex.
-            wchar_t buf[4];
-            swprintf(buf, sizeof buf / sizeof buf[0], L"%%%02X", c2);
-            out.append(buf);
-        }
+    auto result = rust_escape_string_url(in.c_str(), in.size());
+    if (result) {       // Null result: leave `out` untouched.
+        out = *result;  // Overwrites whatever `out` held before.
     }
 }
 
 /// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str.
 static void escape_string_var(const wcstring &in, wcstring &out) {
-    bool prev_was_hex_encoded = false;
-    const std::string narrow = wcs2string(in);
-    for (auto c1 : narrow) {
-        // This silliness is so we get the correct result whether chars are signed or unsigned.
-        unsigned int c2 = static_cast<unsigned int>(c1) & 0xFF;
-        if (!(c2 & 0x80) && isalnum(c2) && (!prev_was_hex_encoded || !is_hex_digit(c2))) {
-            // ASCII alphanumerics don't need to be encoded.
-            if (prev_was_hex_encoded) {
-                out.push_back(L'_');
-                prev_was_hex_encoded = false;
-            }
-            out.push_back(static_cast<wchar_t>(c2));
-        } else if (c2 == '_') {
-            // Underscores are encoded by doubling them.
-            out.append(L"__");
-            prev_was_hex_encoded = false;
-        } else {
-            // All other chars need to have their UTF-8 representation encoded in hex.
-            wchar_t buf[4];
-            swprintf(buf, sizeof buf / sizeof buf[0], L"_%02X", c2);
-            out.append(buf);
-            prev_was_hex_encoded = true;
-        }
-    }
-    if (prev_was_hex_encoded) {
-        out.push_back(L'_');
+    auto result = rust_escape_string_var(in.c_str(), in.size());
+    if (result) {       // Null result: leave `out` untouched.
+        out = *result;  // Overwrites whatever `out` held before.
     }
 }
 
@@ -790,177 +753,9 @@ wcstring escape_string_for_double_quotes(wcstring in) {
 /// Escape a string in a fashion suitable for using in fish script. Store the result in out_str.
 static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out,
                                  escape_flags_t flags) {
-    const wchar_t *in = orig_in;
-    const bool escape_printables = !(flags & ESCAPE_NO_PRINTABLES);
-    const bool no_quoted = static_cast<bool>(flags & ESCAPE_NO_QUOTED);
-    const bool no_tilde = static_cast<bool>(flags & ESCAPE_NO_TILDE);
-    const bool no_qmark = feature_test(feature_flag_t::qmark_noglob);
-    const bool symbolic = static_cast<bool>(flags & ESCAPE_SYMBOLIC) && (MB_CUR_MAX > 1);
-    assert((!symbolic || !escape_printables) && "symbolic implies escape-no-printables");
-
-    bool need_escape = false;
-    bool need_complex_escape = false;
-
-    if (!no_quoted && in_len == 0) {
-        out.assign(L"''");
-        return;
-    }
-
-    for (size_t i = 0; i < in_len; i++) {
-        if ((*in >= ENCODE_DIRECT_BASE) && (*in < ENCODE_DIRECT_BASE + 256)) {
-            int val = *in - ENCODE_DIRECT_BASE;
-            int tmp;
-
-            out += L'\\';
-            out += L'X';
-
-            tmp = val / 16;
-            out += tmp > 9 ? L'a' + (tmp - 10) : L'0' + tmp;
-
-            tmp = val % 16;
-            out += tmp > 9 ? L'a' + (tmp - 10) : L'0' + tmp;
-            need_escape = need_complex_escape = true;
-
-        } else {
-            wchar_t c = *in;
-            switch (c) {
-                case L'\t': {
-                    if (symbolic)
-                        out += L'␉';
-                    else
-                        out += L"\\t";
-                    need_escape = need_complex_escape = true;
-                    break;
-                }
-                case L'\n': {
-                    if (symbolic)
-                        out += L'␤';
-                    else
-                        out += L"\\n";
-                    need_escape = need_complex_escape = true;
-                    break;
-                }
-                case L'\b': {
-                    if (symbolic)
-                        out += L'␈';
-                    else
-                        out += L"\\b";
-                    need_escape = need_complex_escape = true;
-                    break;
-                }
-                case L'\r': {
-                    if (symbolic)
-                        out += L'␍';
-                    else
-                        out += L"\\r";
-                    need_escape = need_complex_escape = true;
-                    break;
-                }
-                case L'\x1B': {
-                    if (symbolic)
-                        out += L'␛';
-                    else
-                        out += L"\\e";
-                    need_escape = need_complex_escape = true;
-                    break;
-                }
-                case L'\x7F': {
-                    if (symbolic)
-                        out += L'␡';
-                    else
-                        out += L"\\x7f";
-                    need_escape = need_complex_escape = true;
-                    break;
-                }
-                case L'\\':
-                case L'\'': {
-                    need_escape = need_complex_escape = true;
-                    if (escape_printables || (c == L'\\' && !symbolic)) out += L'\\';
-                    out += *in;
-                    break;
-                }
-                case ANY_CHAR: {
-                    // See #1614
-                    out += L'?';
-                    break;
-                }
-                case ANY_STRING: {
-                    out += L'*';
-                    break;
-                }
-                case ANY_STRING_RECURSIVE: {
-                    out += L"**";
-                    break;
-                }
-
-                case L'&':
-                case L'$':
-                case L' ':
-                case L'#':
-                case L'<':
-                case L'>':
-                case L'(':
-                case L')':
-                case L'[':
-                case L']':
-                case L'{':
-                case L'}':
-                case L'?':
-                case L'*':
-                case L'|':
-                case L';':
-                case L'"':
-                case L'%':
-                case L'~': {
-                    bool char_is_normal = (c == L'~' && no_tilde) || (c == L'?' && no_qmark);
-                    if (!char_is_normal) {
-                        need_escape = true;
-                        if (escape_printables) out += L'\\';
-                    }
-                    out += *in;
-                    break;
-                }
-
-                default: {
-                    if (*in >= 0 && *in < 32) {
-                        need_escape = need_complex_escape = true;
-
-                        if (symbolic) {
-                            out += L'\u2400' + *in;
-                            break;
-                        }
-
-                        if (*in < 27 && *in != 0) {
-                            out += L'\\';
-                            out += L'c';
-                            out += L'a' + *in - 1;
-                            break;
-                        }
-
-                        int tmp = (*in) % 16;
-                        out += L'\\';
-                        out += L'x';
-                        out += ((*in > 15) ? L'1' : L'0');
-                        out += tmp > 9 ? L'a' + (tmp - 10) : L'0' + tmp;
-                    } else {
-                        out += *in;
-                    }
-                    break;
-                }
-            }
-        }
-
-        in++;
-    }
-
-    // Use quoted escaping if possible, since most people find it easier to read.
-    if (!no_quoted && need_escape && !need_complex_escape && escape_printables) {
-        wchar_t single_quote = L'\'';
-        out.clear();
-        out.reserve(2 + in_len);
-        out.push_back(single_quote);
-        out.append(orig_in, in_len);
-        out.push_back(single_quote);
+    auto result = rust_escape_string_script(orig_in, in_len, flags);
+    if (result) {       // Null result: leave `out` untouched.
+        out = *result;  // Overwrites whatever `out` held before.
     }
 }
 
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index 230306571..571720a14 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -367,79 +367,6 @@ static void test_enum_array() {
     do_test(es.at(test_enum::gamma) == "def");
 }
 
-/// Test sane escapes.
-static void test_unescape_sane() {
-    const struct test_t {
-        const wchar_t *input;
-        const wchar_t *expected;
-    } tests[] = {
-        {L"abcd", L"abcd"},           {L"'abcd'", L"abcd"},
-        {L"'abcd\\n'", L"abcd\\n"},   {L"\"abcd\\n\"", L"abcd\\n"},
-        {L"\"abcd\\n\"", L"abcd\\n"}, {L"\\143", L"c"},
-        {L"'\\143'", L"\\143"},       {L"\\n", L"\n"}  // \n normally becomes newline
-    };
-    for (const auto &test : tests) {
-        auto output = unescape_string(test.input, UNESCAPE_DEFAULT);
-        if (!output) {
-            err(L"Failed to unescape '%ls'\n", test.input);
-        } else if (*output != test.expected) {
-            err(L"In unescaping '%ls', expected '%ls' but got '%ls'\n", test.input, test.expected,
-                output->c_str());
-        }
-    }
-
-    // Test for overflow.
-    if (unescape_string(L"echo \\UFFFFFF", UNESCAPE_DEFAULT)) {
-        err(L"Should not have been able to unescape \\UFFFFFF\n");
-    }
-    if (unescape_string(L"echo \\U110000", UNESCAPE_DEFAULT)) {
-        err(L"Should not have been able to unescape \\U110000\n");
-    }
-#if WCHAR_MAX != 0xffff
-    // TODO: Make this work on MS Windows.
-    if (!unescape_string(L"echo \\U10FFFF", UNESCAPE_DEFAULT)) {
-        err(L"Should have been able to unescape \\U10FFFF\n");
-    }
-#endif
-}
-
-/// Test the escaping/unescaping code by escaping/unescaping random strings and verifying that the
-/// original string comes back.
-static void test_escape_crazy() {
-    say(L"Testing escaping and unescaping");
-    wcstring random_string;
-    wcstring escaped_string;
-    for (size_t i = 0; i < ESCAPE_TEST_COUNT; i++) {
-        random_string.clear();
-        while (random() % ESCAPE_TEST_LENGTH) {
-            random_string.push_back((random() % ESCAPE_TEST_CHAR) + 1);
-        }
-
-        escaped_string = escape_string(random_string);
-        auto unescaped_string = unescape_string(escaped_string, UNESCAPE_DEFAULT);
-
-        if (!unescaped_string) {
-            err(L"Failed to unescape string <%ls>", escaped_string.c_str());
-            break;
-        } else if (*unescaped_string != random_string) {
-            err(L"Escaped and then unescaped string '%ls', but got back a different string '%ls'",
-                random_string.c_str(), unescaped_string->c_str());
-            break;
-        }
-    }
-
-    // Verify that ESCAPE_NO_PRINTABLES also escapes backslashes so we don't regress on issue #3892.
-    random_string = L"line 1\\n\nline 2";
-    escaped_string = escape_string(random_string, ESCAPE_NO_PRINTABLES | ESCAPE_NO_QUOTED);
-    auto unescaped_string = unescape_string(escaped_string, UNESCAPE_DEFAULT);
-    if (!unescaped_string) {
-        err(L"Failed to unescape string <%ls>", escaped_string.c_str());
-    } else if (*unescaped_string != random_string) {
-        err(L"Escaped and then unescaped string '%ls', but got back a different string '%ls'",
-            random_string.c_str(), unescaped_string->c_str());
-    }
-}
-
 static void test_format() {
     say(L"Testing formatting functions");
     struct {
@@ -6216,8 +6143,6 @@ static const test_t s_tests[]{
     {TEST_GROUP("new_parser_ad_hoc"), test_new_parser_ad_hoc},
     {TEST_GROUP("new_parser_errors"), test_new_parser_errors},
     {TEST_GROUP("error_messages"), test_error_messages},
-    {TEST_GROUP("escape"), test_unescape_sane},
-    {TEST_GROUP("escape"), test_escape_crazy},
     {TEST_GROUP("format"), test_format},
     {TEST_GROUP("convert"), test_convert},
     {TEST_GROUP("convert"), test_convert_private_use},