diff --git a/src/builtin_string.cpp b/src/builtin_string.cpp
index 0915e4911..ca9a1b52b 100644
--- a/src/builtin_string.cpp
+++ b/src/builtin_string.cpp
@@ -196,6 +196,8 @@ static int handle_flag_1(wchar_t **argv, parser_t &parser, io_streams_t &streams
             opts->escape_style = STRING_STYLE_URL;
         } else if (wcscmp(w.woptarg, L"var") == 0) {
             opts->escape_style = STRING_STYLE_VAR;
+        } else if (wcscmp(w.woptarg, L"pcre2") == 0) {
+            opts->escape_style = STRING_STYLE_PCRE2;
         } else {
             string_error(streams, _(L"%ls: Invalid escape style '%ls'\n"), cmd, w.woptarg);
             return STATUS_INVALID_ARGS;
diff --git a/src/common.cpp b/src/common.cpp
index 7160ef978..59a1e77e9 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -1096,6 +1096,44 @@ static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring
     }
 }
 
+/// Escapes a string for use in a regex string. Not safe for use with `eval` as only
+/// characters reserved by PCRE2 are escaped, i.e. it relies on fish's automatic escaping
+/// of subshell output in subsequent concatenation or for use as an argument.
+/// \param in is the raw string to be searched for literally when substituted in a PCRE2 expression.
+/// \return a copy of \p in with every character listed below preceded by a backslash.
+static wcstring escape_string_pcre2(const wcstring &in) {
+    wcstring out;
+    out.reserve(in.size() * 1.3);  // a wild guess
+
+    for (auto c : in) {
+        switch (c) {
+            case L'.':
+            case L'^':
+            case L'$':
+            case L'*':
+            case L'+':
+            case L'(':
+            case L')':
+            case L'?':
+            case L'[':
+            case L'{':
+            case L'}':
+            case L'\\':
+            case L'|':
+            // these two only *need* to be escaped within a character class, and technically it makes
+            // no sense to ever use command substitution output to compose a character class, but...
+            case L'-':
+            case L']':
+                out.push_back(L'\\');
+                // fall through - the escaped character itself is appended by the default case
+            default:
+                out.push_back(c);
+        }
+    }
+
+    return out;
+}
+
 wcstring escape_string(const wchar_t *in, escape_flags_t flags, escape_string_style_t style) {
     wcstring result;
 
@@ -1112,6 +1150,10 @@ wcstring escape_string(const wchar_t *in, escape_flags_t flags, escape_string_st
             escape_string_var(in, result);
             break;
         }
+        case STRING_STYLE_PCRE2: {
+            result = escape_string_pcre2(in);
+            break;
+        }
     }
 
     return result;
@@ -1133,6 +1175,10 @@ wcstring escape_string(const wcstring &in, escape_flags_t flags, escape_string_s
             escape_string_var(in, result);
             break;
         }
+        case STRING_STYLE_PCRE2: {
+            result = escape_string_pcre2(in);
+            break;
+        }
     }
 
     return result;
@@ -1617,6 +1663,11 @@ bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t es
             success = unescape_string_var(input, output);
             break;
         }
+        case STRING_STYLE_PCRE2: {
+            // unescaping PCRE2 is not needed/supported, the PCRE2 engine is responsible for that
+            success = false;
+            break;
+        }
     }
     if (!success) output->clear();
     return success;
@@ -1638,6 +1689,11 @@ bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t e
             success = unescape_string_var(input.c_str(), output);
             break;
         }
+        case STRING_STYLE_PCRE2: {
+            // unescaping PCRE2 is not needed/supported, the PCRE2 engine is responsible for that
+            success = false;
+            break;
+        }
     }
     if (!success) output->clear();
     return success;
diff --git a/src/common.h b/src/common.h
index 4e3b61ca2..da354be9e 100644
--- a/src/common.h
+++ b/src/common.h
@@ -118,7 +118,12 @@ static_assert(false, "Neither NAME_MAX nor MAXNAMELEN is defined!");
 #endif
 #endif
 
-enum escape_string_style_t { STRING_STYLE_SCRIPT, STRING_STYLE_URL, STRING_STYLE_VAR };
+enum escape_string_style_t {
+    STRING_STYLE_SCRIPT,
+    STRING_STYLE_URL,
+    STRING_STYLE_VAR,
+    STRING_STYLE_PCRE2,
+};
 
 // Flags for unescape_string functions.
 enum {
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index 77e646b7f..4ee81280d 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -4349,6 +4349,34 @@ static void test_wcstring_tok() {
     }
 }
 
+/// Checks that STRING_STYLE_PCRE2 escaping leaves plain text alone and escapes regex metacharacters.
+static void test_pcre2_escape() {
+    say(L"Testing escaping strings as pcre2 literals");
+    // plain text should not be needlessly escaped
+    auto input = L"hello world!";
+    auto escaped = escape_string(input, 0, STRING_STYLE_PCRE2);
+    if (escaped != input) {
+        err(L"Input string %ls unnecessarily PCRE2 escaped as %ls", input, escaped.c_str());
+    }
+
+    // all the following are intended to be ultimately matched literally - even if they don't look
+    // like that's the intent - so we escape them.
+    const wchar_t *tests[][2] = {
+        {L".ext", L"\\.ext"},
+        {L"{word}", L"\\{word\\}"},
+        {L"hola-mundo", L"hola\\-mundo"},
+        {L"$17.42 is your total?", L"\\$17\\.42 is your total\\?"},
+        {L"not really escaped\\?", L"not really escaped\\\\\\?"},
+    };
+
+    for (auto &test : tests) {
+        auto escaped = escape_string(test[0], 0, STRING_STYLE_PCRE2);
+        if (escaped != test[1]) {
+            err(L"pcre2_escape error: pcre2_escape(%ls) -> %ls, expected %ls", test[0], escaped.c_str(), test[1]);
+        }
+    }
+}
+
 int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv);
 static void run_one_string_test(const wchar_t *const *argv, int expected_rc,
                                 const wchar_t *expected_out) {
@@ -4961,6 +4989,7 @@ int main(int argc, char **argv) {
     if (should_test_function("utf8")) test_utf8();
     if (should_test_function("feature_flags")) test_feature_flags();
     if (should_test_function("escape_sequences")) test_escape_sequences();
+    if (should_test_function("pcre2_escape")) test_pcre2_escape();
     if (should_test_function("lru")) test_lru();
     if (should_test_function("expand")) test_expand();
     if (should_test_function("fuzzy_match")) test_fuzzy_match();
diff --git a/tests/string.err b/tests/string.err
index b016d450d..0b2106c73 100644
--- a/tests/string.err
+++ b/tests/string.err
@@ -92,6 +92,9 @@
 ####################
 # string escape with multibyte chars
 
+####################
+# string escape for literal pcre2 searching
+
 ####################
 # set x (string unescape (echo \x07 | string escape))
 
@@ -182,7 +185,7 @@ string match: ^
 ####################
 # string invalidarg
 string: Subcommand 'invalidarg' is not valid
-Standard input (line 205):
+Standard input (line 211):
 string invalidarg; and echo "unexpected exit 0"
 ^
 
@@ -267,7 +270,7 @@ string repeat: Expected argument
 ####################
 # string repeat -l fakearg 2>&1
 string repeat: Unknown option '-l'
-Standard input (line 281):
+Standard input (line 287):
 string repeat -l fakearg
 ^
 
diff --git a/tests/string.in b/tests/string.in
index fc6be1bbf..2b4b955fb 100644
--- a/tests/string.in
+++ b/tests/string.in
@@ -101,6 +101,12 @@ string escape --style=var 中
 string escape --style=var aöb | string unescape --style=var
 string escape --style=var 中 | string unescape --style=var
 
+# test regex escaping
+logmsg 'string escape for literal pcre2 searching'
+string escape --style=pcre2 ".ext"
+string escape --style=pcre2 "bonjour, amigo"
+string escape --style=pcre2 "^this is a literal string"
+
 # The following tests verify that we can correctly unescape the same strings
 # we tested escaping above.
 
diff --git a/tests/string.out b/tests/string.out
index a10e6010d..a8a8ecf2f 100644
--- a/tests/string.out
+++ b/tests/string.out
@@ -140,6 +140,12 @@ _E4_B8_AD_
 aöb
 中
 
+####################
+# string escape for literal pcre2 searching
+\.ext
+bonjour, amigo
+\^this is a literal string
+
 ####################
 # set x (string unescape (echo \x07 | string escape))
 success