implement string escape --style=xxx

We need a way to encode arbitrary strings into valid fish variable
names. It would also be nice if we could convert strings to valid URLs
without using the slow and hard to understand `__fish_urlencode` function.
In particular, this eliminates the need to manipulate the locale.

Fixes #4150
This commit is contained in:
Kurtis Rader 2017-06-20 21:55:16 -07:00
parent 30368d5526
commit 60bca14b37
8 changed files with 263 additions and 34 deletions

View File

@ -4,6 +4,7 @@
- The `COLUMNS` and `LINES` env vars are now correctly set the first time `fish_prompt` is run (#4141).
- New `status is-breakpoint` command that is true when a prompt is displayed in response to a `breakpoint` command (#1310).
- Invalid array indexes are now silently ignored (#826, #4127).
- `string escape` has a new `--style=xxx` flag where `xxx` can be `script`, `var`, or `url` (#4150).
## Other significant changes

View File

@ -2,7 +2,7 @@
\subsection string-synopsis Synopsis
\fish{synopsis}
string escape [(-n | --no-quoted)] [STRING...]
string escape [(-n | --no-quoted)] [--style=xxx] [STRING...]
string join [(-q | --quiet)] SEP [STRING...]
string length [(-q | --quiet)] [STRING...]
string lower [(-q | --quiet)] [STRING...]
@ -36,7 +36,11 @@ The following subcommands are available.
\subsection string-escape "escape" subcommand
`string escape` escapes each STRING such that it can be passed back to `eval` to produce the original argument again. By default, all special characters are escaped, and quotes are used to simplify the output when possible. If `-n` or `--no-quoted` is given, the simplifying quoted format is not used. Exit status: 0 if at least one string was escaped, or 1 otherwise.
`string escape` escapes each STRING in one of three ways. The first is `--style=script`. This is the default. It alters the string such that it can be passed back to `eval` to produce the original argument again. By default, all special characters are escaped, and quotes are used to simplify the output when possible. If `-n` or `--no-quoted` is given, the simplifying quoted format is not used. Exit status: 0 if at least one string was escaped, or 1 otherwise.
The second is `--style=var` which ensures the string can be used as a variable name by hex encoding any non-alphanumeric characters. The string is first converted to UTF-8 before being encoded.
The third is `--style=url` which ensures the string can be used as a URL by hex encoding any character which is not legal in a URL. The string is first converted to UTF-8 before being encoded.
\subsection string-join "join" subcommand
@ -159,6 +163,11 @@ In general, special characters are special by default, so `a+` matches one or mo
<bs>cg</bs>
\endfish
\fish{cli-dark}
>_ string escape --style=var 'a1 b2'\u6161
<bs>a1_20_62_32_E6_85_A1_</bs>
\endfish
\subsection string-example-match-glob Match Glob Examples
\fish{cli-dark}

View File

@ -116,6 +116,7 @@ typedef struct { //!OCLINT(too many fields)
bool regex_valid = false;
bool right_valid = false;
bool start_valid = false;
bool style_valid = false;
bool all = false;
bool entire = false;
@ -138,8 +139,34 @@ typedef struct { //!OCLINT(too many fields)
const wchar_t *chars_to_trim = L" \f\n\r\t";
const wchar_t *arg1 = NULL;
const wchar_t *arg2 = NULL;
escape_string_style_t escape_style = STRING_STYLE_SCRIPT;
} options_t;
/// Handler for the long-only `--style=xxx` flag (option code 1).
///
/// When the current subcommand has not enabled the flag (`opts->style_valid` is false) the flag is
/// reported as an unknown option. Otherwise the option argument is matched against the names
/// "script", "url" and "var" and the corresponding style is stored in `opts->escape_style`.
///
/// \return STATUS_CMD_OK when a known style name was given, STATUS_INVALID_ARGS otherwise.
static int handle_flag_1(wchar_t **argv, parser_t &parser, io_streams_t &streams, wgetopter_t &w,
                         options_t *opts) {
    const wchar_t *cmd = argv[0];

    // Subcommands that did not opt in to --style treat it like any other unknown option.
    if (!opts->style_valid) {
        string_unknown_option(parser, streams, cmd, argv[w.woptind - 1]);
        return STATUS_INVALID_ARGS;
    }

    const wchar_t *style_name = w.woptarg;
    if (wcscmp(style_name, L"script") == 0) {
        opts->escape_style = STRING_STYLE_SCRIPT;
        return STATUS_CMD_OK;
    }
    if (wcscmp(style_name, L"url") == 0) {
        opts->escape_style = STRING_STYLE_URL;
        return STATUS_CMD_OK;
    }
    if (wcscmp(style_name, L"var") == 0) {
        opts->escape_style = STRING_STYLE_VAR;
        return STATUS_CMD_OK;
    }

    // Anything else is not a style we know how to produce.
    string_error(streams, _(L"%ls: Invalid escape style '%ls'\n"), cmd, style_name);
    return STATUS_INVALID_ARGS;
}
static int handle_flag_N(wchar_t **argv, parser_t &parser, io_streams_t &streams, wgetopter_t &w,
options_t *opts) {
if (opts->no_newline_valid) {
@ -349,13 +376,14 @@ static const struct woption long_options[] = {
{L"max", required_argument, NULL, 'm'}, {L"no-newline", no_argument, NULL, 'N'},
{L"no-quoted", no_argument, NULL, 'n'}, {L"quiet", no_argument, NULL, 'q'},
{L"regex", no_argument, NULL, 'r'}, {L"right", no_argument, NULL, 'r'},
{L"start", required_argument, NULL, 's'}, {NULL, 0, NULL, 0}};
{L"start", required_argument, NULL, 's'}, {L"style", required_argument, NULL, 1},
{NULL, 0, NULL, 0}};
static std::map<char, decltype(*handle_flag_N)> flag_to_function = {
{'N', handle_flag_N}, {'a', handle_flag_a}, {'c', handle_flag_c}, {'e', handle_flag_e},
{'f', handle_flag_f}, {'i', handle_flag_i}, {'l', handle_flag_l}, {'m', handle_flag_m},
{'n', handle_flag_n}, {'q', handle_flag_q}, {'r', handle_flag_r}, {'s', handle_flag_s},
{'v', handle_flag_v}};
{'v', handle_flag_v}, {1, handle_flag_1}};
/// Parse the arguments for flags recognized by a specific string subcommand.
static int parse_opts(options_t *opts, int *optind, int n_req_args, int argc, wchar_t **argv,
@ -408,21 +436,15 @@ static int parse_opts(options_t *opts, int *optind, int n_req_args, int argc, wc
return STATUS_CMD_OK;
}
static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
options_t opts;
opts.no_quoted_valid = true;
int optind;
int retval = parse_opts(&opts, &optind, 0, argc, argv, parser, streams);
if (retval != STATUS_CMD_OK) return retval;
/// Escape a string so that it can be used in a fish script without further word splitting.
static int string_escape_script(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) {
wcstring storage;
int nesc = 0;
escape_flags_t flags = ESCAPE_ALL;
if (opts.no_quoted) flags |= ESCAPE_NO_QUOTED;
int nesc = 0;
wcstring storage;
const wchar_t *arg;
while ((arg = string_get_arg(&optind, argv, &storage, streams)) != 0) {
streams.out.append(escape_string(arg, flags));
while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) {
streams.out.append(escape_string(arg, flags, STRING_STYLE_SCRIPT));
streams.out.append(L'\n');
nesc++;
}
@ -430,6 +452,61 @@ static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wcha
return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
}
/// Implements `string escape --style=url`: writes the URL-escaped form of every argument to
/// `streams.out`, one per line.
///
/// \param opts Parsed options; not consulted by this style.
/// \param optind Index of the first argument to process.
/// \return STATUS_CMD_OK if at least one argument was escaped, STATUS_CMD_ERROR otherwise.
static int string_escape_url(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) {
    UNUSED(opts);

    // The escape flags only apply to the script style, so none are passed here.
    const escape_flags_t no_flags = 0;
    wcstring storage;
    int escaped_count = 0;

    // Keep pulling arguments until string_get_arg() reports there are no more.
    for (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams); arg != NULL;
         arg = string_get_arg(&optind, argv, &storage, streams)) {
        streams.out.append(escape_string(arg, no_flags, STRING_STYLE_URL));
        streams.out.append(L'\n');
        ++escaped_count;
    }

    if (escaped_count == 0) {
        return STATUS_CMD_ERROR;
    }
    return STATUS_CMD_OK;
}
/// Implements `string escape --style=var`: writes each argument, encoded so that it is usable as a
/// fish variable name, to `streams.out`, one per line.
///
/// \param opts Parsed options; not consulted by this style.
/// \param optind Index of the first argument to process.
/// \return STATUS_CMD_OK if at least one argument was escaped, STATUS_CMD_ERROR otherwise.
static int string_escape_var(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) {
    UNUSED(opts);

    wcstring storage;
    int escaped_count = 0;
    const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams);
    while (arg != NULL) {
        // No escape flags: they are only meaningful for the script style.
        const wcstring encoded = escape_string(arg, 0, STRING_STYLE_VAR);
        streams.out.append(encoded);
        streams.out.append(L'\n');
        ++escaped_count;
        arg = string_get_arg(&optind, argv, &storage, streams);
    }

    return escaped_count > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
}
/// Entry point for the `string escape` subcommand.
///
/// Enables the `--no-quoted` and `--style` flags, parses the options and then hands the remaining
/// arguments to the helper that implements the selected escape style.
///
/// \return The status from option parsing if that failed, otherwise the status of the selected
/// style helper.
static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
    options_t opts;
    opts.no_quoted_valid = true;
    opts.style_valid = true;

    int optind;
    const int parse_status = parse_opts(&opts, &optind, 0, argc, argv, parser, streams);
    if (parse_status != STATUS_CMD_OK) {
        return parse_status;
    }

    const escape_string_style_t style = opts.escape_style;
    if (style == STRING_STYLE_SCRIPT) {
        return string_escape_script(opts, optind, argv, streams);
    }
    if (style == STRING_STYLE_URL) {
        return string_escape_url(opts, optind, argv, streams);
    }
    if (style == STRING_STYLE_VAR) {
        return string_escape_var(opts, optind, argv, streams);
    }

    // Every enumerator is handled above; getting here means escape_style holds a bogus value.
    DIE("should never reach this statement");
}
static int string_join(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
options_t opts;
opts.quiet_valid = true;

View File

@ -1,6 +1,7 @@
// Various functions, mostly string utilities, that are used by most parts of fish.
#include "config.h"
#include <ctype.h>
#include <cxxabi.h>
#include <dlfcn.h>
#include <errno.h>
@ -745,11 +746,62 @@ wcstring reformat_for_screen(const wcstring &msg) {
return buff;
}
/// Escape a string, storing the result in out_str.
static void escape_string_internal(const wchar_t *orig_in, size_t in_len, wcstring *out_str,
escape_flags_t flags) {
assert(orig_in != NULL);
/// Append to `out` a percent-encoded copy of `orig_in` suitable for use in a URL.
///
/// The input is first converted to a narrow string with wcs2string() (presumably UTF-8 -- confirm
/// against that helper). ASCII alphanumerics and the characters `/`, `.`, `~`, `-` and `_` are
/// copied through; every other byte is written as `%XX` with two uppercase hex digits.
static void escape_string_url(const wchar_t *orig_in, wcstring &out) {
    static const wchar_t hex_digits[] = L"0123456789ABCDEF";

    const std::string &narrow = wcs2string(orig_in);
    for (char raw : narrow) {
        // Mask to a single byte so the value is the same whether plain char is signed or not.
        const unsigned int byte = (unsigned int)raw & 0xFF;
        const bool is_ascii = (byte & 0x80) == 0;
        const bool passes_through =
            is_ascii && (isalnum(byte) || byte == '/' || byte == '.' || byte == '~' ||
                         byte == '-' || byte == '_');

        if (passes_through) {
            out.push_back((wchar_t)byte);
            continue;
        }

        // Everything else is emitted as a percent sign followed by the byte's two hex digits.
        out.push_back(L'%');
        out.push_back(hex_digits[byte >> 4]);
        out.push_back(hex_digits[byte & 0x0F]);
    }
}
/// Return true if `c` is one of the ASCII hexadecimal digits `0-9`, `a-f` or `A-F`.
///
/// Explicit range checks are used rather than strchr(): strchr() treats the terminating NUL of the
/// digit string as a match (so 0 would be reported as a hex digit) and converts its argument to
/// char (so out-of-range values such as '0' + 256 could alias a real digit).
static bool is_hex_digit(int c) {
    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}
/// Append to `out` an encoding of `orig_in` that is usable as a fish variable name.
///
/// The input is first converted to a narrow string with wcs2string() (presumably UTF-8 -- confirm
/// against that helper) and then processed byte by byte:
///   - An ASCII alphanumeric is copied as is, unless it directly follows a hex-encoded byte. In
///     that position a hex digit is itself hex-encoded, and any other alphanumeric is preceded by
///     a `_` that closes the hex run.
///   - An underscore is written as `__`.
///   - Every other byte is written as `_XX` with two uppercase hex digits.
/// If the last byte was hex-encoded a closing `_` is appended.
static void escape_string_var(const wchar_t *orig_in, wcstring &out) {
    bool prev_was_hex_encoded = false;
    const std::string &in = wcs2string(orig_in);
    for (auto c1 : in) {
        // Mask to a single byte so the value is the same whether plain char is signed or not.
        unsigned int c2 = (unsigned int)c1 & 0xFF;
        if (!(c2 & 0x80) && isalnum(c2) && (!prev_was_hex_encoded || !is_hex_digit(c2))) {
            // ASCII alphanumerics don't need to be encoded. If we are leaving a run of hex-encoded
            // bytes, terminate that run first.
            if (prev_was_hex_encoded) {
                out.push_back(L'_');
                prev_was_hex_encoded = false;
            }
            out.push_back((wchar_t)c2);
        } else if (c2 == '_') {
            // Underscores are encoded by doubling them.
            out.append(L"__");
            prev_was_hex_encoded = false;
        } else {
            // All other bytes are encoded as an underscore followed by two hex digits; the buffer
            // holds those three characters plus the terminating NUL.
            wchar_t buf[4];
            swprintf(buf, sizeof buf / sizeof buf[0], L"_%02X", c2);
            out.append(buf);
            prev_was_hex_encoded = true;
        }
    }
    // Close a hex run that extends to the end of the input.
    if (prev_was_hex_encoded) {
        out.push_back(L'_');
    }
}
/// Escape a string in a fashion suitable for using in fish script. Store the result in out_str.
static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out,
escape_flags_t flags) {
const wchar_t *in = orig_in;
bool escape_all = static_cast<bool>(flags & ESCAPE_ALL);
bool no_quoted = static_cast<bool>(flags & ESCAPE_NO_QUOTED);
@ -758,9 +810,6 @@ static void escape_string_internal(const wchar_t *orig_in, size_t in_len, wcstri
int need_escape = 0;
int need_complex_escape = 0;
// Avoid dereferencing all over the place.
wcstring &out = *out_str;
if (!no_quoted && in_len == 0) {
out.assign(L"''");
return;
@ -903,15 +952,45 @@ static void escape_string_internal(const wchar_t *orig_in, size_t in_len, wcstri
}
}
wcstring escape_string(const wchar_t *in, escape_flags_t flags) {
/// Escape the NUL-terminated string `in` according to `style` and return the result.
///
/// \param in The string to be escaped.
/// \param flags Escape flags; only passed on for STRING_STYLE_SCRIPT.
/// \param style Which escaping scheme to apply.
/// \return The escaped string.
wcstring escape_string(const wchar_t *in, escape_flags_t flags, escape_string_style_t style) {
    wcstring result;
    // Each style helper writes its output into `result`.
    switch (style) {
        case STRING_STYLE_SCRIPT: {
            escape_string_script(in, wcslen(in), result, flags);
            break;
        }
        case STRING_STYLE_URL: {
            escape_string_url(in, result);
            break;
        }
        case STRING_STYLE_VAR: {
            escape_string_var(in, result);
            break;
        }
    }
    return result;
}
wcstring escape_string(const wcstring &in, escape_flags_t flags) {
/// Escape the string `in` according to `style` and return the result.
///
/// The script style is given the full length of `in`. The URL and var styles receive only
/// `in.c_str()`, so they presumably stop at the first embedded NUL -- confirm against wcs2string().
///
/// \param in The string to be escaped.
/// \param flags Escape flags; only passed on for STRING_STYLE_SCRIPT.
/// \param style Which escaping scheme to apply.
/// \return The escaped string.
wcstring escape_string(const wcstring &in, escape_flags_t flags, escape_string_style_t style) {
    wcstring result;
    switch (style) {
        case STRING_STYLE_SCRIPT: {
            escape_string_script(in.c_str(), in.size(), result, flags);
            break;
        }
        case STRING_STYLE_URL: {
            // Forward to the same helper the `const wchar_t *` overload uses instead of aborting.
            escape_string_url(in.c_str(), result);
            break;
        }
        case STRING_STYLE_VAR: {
            escape_string_var(in.c_str(), result);
            break;
        }
    }
    return result;
}

View File

@ -89,6 +89,12 @@ typedef std::vector<wcstring> wcstring_list_t;
#define INPUT_COMMON_BASE (wchar_t)0xF700
#define INPUT_COMMON_END (INPUT_COMMON_BASE + 64)
/// The escaping schemes understood by escape_string().
enum escape_string_style_t {
    /// Escape for use in fish script.
    STRING_STYLE_SCRIPT,
    /// Percent-encode for use as a URL.
    STRING_STYLE_URL,
    /// Encode for use as a fish variable name.
    STRING_STYLE_VAR,
};
// Flags for unescape_string functions.
enum {
UNESCAPE_DEFAULT = 0, // default behavior
@ -97,15 +103,14 @@ enum {
};
typedef unsigned int unescape_flags_t;
// Flags for the escape_string() and escape_string() functions.
// Flags for the escape_string() overloads. These are only applicable when the escape style is
// "script" (i.e., STRING_STYLE_SCRIPT).
enum {
    /// Escape every character, magic characters such as the semicolon included.
    ESCAPE_ALL = 0x1,
    /// Never use the "simplified" quoted form, and never represent the empty string as empty
    /// quotes.
    ESCAPE_NO_QUOTED = 0x2,
    /// Leave tildes unescaped.
    ESCAPE_NO_TILDE = 0x4,
};
@ -692,8 +697,10 @@ ssize_t read_loop(int fd, void *buff, size_t count);
/// \param in The string to be escaped
/// \param flags Flags to control the escaping
/// \return The escaped string
wcstring escape_string(const wchar_t *in, escape_flags_t flags);
wcstring escape_string(const wcstring &in, escape_flags_t flags);
wcstring escape_string(const wchar_t *in, escape_flags_t flags,
escape_string_style_t style=STRING_STYLE_SCRIPT);
wcstring escape_string(const wcstring &in, escape_flags_t flags,
escape_string_style_t style=STRING_STYLE_SCRIPT);
/// Expand backslashed escapes and substitute them with their unescaped counterparts. Also
/// optionally change the wildcards, the tilde character and a few more into constants which are

View File

@ -5,7 +5,7 @@ string match: ^
# string invalidarg
string: Subcommand 'invalidarg' is not valid
Standard input (line 183):
Standard input (line 215):
string invalidarg; and echo "unexpected exit 0" >&2
^
@ -29,6 +29,6 @@ string repeat: Expected argument
# string repeat -l fakearg 2>&1
string repeat: Unknown option '-l'
Standard input (line 284):
Standard input (line 316):
string repeat -l fakearg
^

View File

@ -94,6 +94,38 @@ echo
echo '# echo \x07 | string escape'
echo \x07 | string escape
echo
echo '# string escape --style=script \'a b#c"\\\'d\''
string escape --style=script 'a b#c"\'d'
echo
echo '# string escape --style=url \'a b#c"\\\'d\''
string escape --style=url 'a b#c"\'d'
echo
echo '# string escape --style=url \\na\\nb%c~d\\n'
string escape --style=url \na\nb%c~d\n
echo
echo '# string escape --style=var \'a b#c"\\\'d\''
string escape --style=var 'a b#c"\'d'
echo
echo '# string escape --style=script a\nghi_'
string escape --style=var a\nghi_
echo
echo '# string escape --style=var \'abc\''
string escape --style=var 'abc'
echo
echo '# string escape --style=var \'_a_b_c_\''
string escape --style=var '_a_b_c_'
echo
echo '# string escape --style=var -- -'
string escape --style=var -- -
echo
echo '# string match "?" a'
string match "?" a

View File

@ -74,6 +74,30 @@ zan
# echo \x07 | string escape
\cg
# string escape --style=script 'a b#c"\'d'
a\ b\#c\"\'d
# string escape --style=url 'a b#c"\'d'
a%20b%23c%22%27d
# string escape --style=url \na\nb%c~d\n
%0Aa%0Ab%25c~d%0A
# string escape --style=var 'a b#c"\'d'
a_20_62_23_63_22_27_64_
# string escape --style=script a\nghi_
a_0A_ghi__
# string escape --style=var 'abc'
abc
# string escape --style=var '_a_b_c_'
__a__b__c__
# string escape --style=var -- -
_2D_
# string match "?" a
a