Add read --tokenize

This splits a string into variables according to the shell's tokenization rules, considering quoting, escaping etc. This runs an automatic `unescape` on the string so it's presented like it would be passed to the command. E.g. `printf '%s\n' a\ b` returns the tokens `printf`, `%s\n` and `a b`. It might be useful to add another mode "--tokenize-raw" that doesn't do that, but this seems to be the more useful of the two. Fixes #3823.
2025-01-19 04:52:44 +08:00 · 2019-11-29 20:05:31 +01:00 · 2019-11-29 20:05:31 +01:00 · 86133b0a2b
commit 86133b0a2b
parent 2fd1e4ab75
3 changed files with 84 additions and 3 deletions
--- a/sphinx_doc_src/cmds/read.rst
+++ b/sphinx_doc_src/cmds/read.rst
@ -38,6 +38,8 @@ The following options are available:

 - ``-S`` or ``--shell`` enables syntax highlighting, tab completions and command termination suitable for entering shellscript code in the interactive mode. NOTE: Prior to fish 3.0, the short opt for ``--shell`` was ``-s``, but it has been changed for compatibility with bash's ``-s`` short opt for ``--silent``.

+- ``-t`` or ``--tokenize`` causes read to split the input into variables by the shell's tokenization rules. This means it will honor quotes and escaping. This option is of course incompatible with other options to control splitting like ``--delimiter`` and does not honor $IFS (like fish's tokenizer). It saves the tokens in the manner they'd be passed to commands on the commandline, so e.g. ``a\ b`` is stored as ``a b``. Note that currently it leaves command substitutions intact along with the parentheses.
+
 - ``-u`` or ``--unexport`` prevents the variables from being exported to child processes (default behaviour).

 - ``-U`` or ``--universal`` causes the specified shell variable to be made universal.
@ -53,7 +55,7 @@ The following options are available:

 Without the ``--line`` option, ``read`` reads a single line of input from standard input, breaks it into tokens, and then assigns one token to each variable specified in ``VARIABLES``. If there are more tokens than variables, the complete remainder is assigned to the last variable.

-If the ``--delimiter`` argument is not given, the variable ``IFS`` is used as a list of characters to split on. Relying on the use of ``IFS`` is deprecated and this behaviour will be removed in future versions. The default value of ``IFS`` contains space, tab and newline characters. As a special case, if ``IFS`` is set to the empty string, each character of the input is considered a separate token.
+If no option that determines how to split the input (such as ``--delimiter``, ``--line`` or ``--tokenize``) is given, the variable ``IFS`` is used as a list of characters to split on. Relying on the use of ``IFS`` is deprecated and this behaviour will be removed in future versions. The default value of ``IFS`` contains space, tab and newline characters. As a special case, if ``IFS`` is set to the empty string, each character of the input is considered a separate token.

 With the ``--line`` option, ``read`` reads a line of input from standard input into each provided variable, stopping when each variable has been filled. The line is not tokenized.

@ -101,3 +103,12 @@ The following code stores the value 'hello' in the shell variable ``$foo``.
    echo $a # a
    echo $b # b
    echo $c # c
+
+    # --tokenize honors quotes and escaping like the shell's argument passing:
+    echo 'a\ b' | read -t first second
+    echo $first # outputs "a b", $second is empty
+
+    echo 'a"foo bar"b (command echo wurst)*" "{a,b}' | read -lt -l a b c
+    echo $a # outputs 'afoo barb' (without the quotes)
+    echo $b # outputs '(command echo wurst)* {a,b}' (without the quotes)
+    echo $c # nothing
--- a/src/builtin_read.cpp
+++ b/src/builtin_read.cpp
@ -2,7 +2,6 @@
 #include "config.h"  // IWYU pragma: keep

 #include "builtin_read.h"
-
 #include <unistd.h>

 #include <algorithm>
@ -46,6 +45,7 @@ struct read_cmd_opts_t {
    // empty string and a given empty delimiter.
    bool have_delimiter = false;
    wcstring delimiter;
+    bool tokenize = false;
    bool shell = false;
    bool array = false;
    bool silent = false;
@ -55,7 +55,7 @@ struct read_cmd_opts_t {
    bool one_line = false;
 };

-static const wchar_t *const short_options = L":ac:d:ghiLlm:n:p:sSuxzP:UR:LB";
+static const wchar_t *const short_options = L":ac:d:ghiLlm:n:p:sStuxzP:UR:LB";
 static const struct woption long_options[] = {{L"array", no_argument, nullptr, 'a'},
                                              {L"command", required_argument, nullptr, 'c'},
                                              {L"delimiter", required_argument, nullptr, 'd'},
@ -72,6 +72,7 @@ static const struct woption long_options[] = {{L"array", no_argument, nullptr, '
                                              {L"right-prompt", required_argument, nullptr, 'R'},
                                              {L"shell", no_argument, nullptr, 'S'},
                                              {L"silent", no_argument, nullptr, 's'},
+                                              {L"tokenize", no_argument, nullptr, 't'},
                                              {L"unexport", no_argument, nullptr, 'u'},
                                              {L"universal", no_argument, nullptr, 'U'},
                                              {nullptr, 0, nullptr, 0}};
@ -154,6 +155,10 @@ static int parse_cmd_opts(read_cmd_opts_t &opts, int *optind,  //!OCLINT(high nc
                opts.shell = true;
                break;
            }
+            case L't': {
+                opts.tokenize = true;
+                break;
+            }
            case L'U': {
                opts.place |= ENV_UNIVERSAL;
                break;
@ -397,6 +402,20 @@ static int validate_read_args(const wchar_t *cmd, read_cmd_opts_t &opts, int arg
        return STATUS_INVALID_ARGS;
    }

+    if (opts.tokenize && opts.have_delimiter) {
+        streams.err.append_format(
+            BUILTIN_ERR_COMBO2, cmd,
+            L"--delimiter and --tokenize can not be used together");
+        return STATUS_INVALID_ARGS;
+    }
+
+    if (opts.tokenize && opts.one_line) {
+        streams.err.append_format(
+            BUILTIN_ERR_COMBO2, cmd,
+            L"--line and --tokenize can not be used together");
+        return STATUS_INVALID_ARGS;
+    }
+
    // Verify all variable names.
    for (int i = 0; i < argc; i++) {
        if (!valid_var_name(argv[i])) {
@ -486,6 +505,43 @@ int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv) {
            return exit_res;
        }

+        if (opts.tokenize) {
+            tokenizer_t tok{buff.c_str(), TOK_ACCEPT_UNFINISHED};
+            wcstring out;
+            if (opts.array) {
+                // Array mode: assign each token as a separate element of the sole var.
+                wcstring_list_t tokens;
+                while (auto t = tok.next()) {
+                    auto text = tok.text_of(*t);
+                    if (unescape_string(text, &out, UNESCAPE_DEFAULT)) {
+                        tokens.push_back(out);
+                    } else {
+                        tokens.push_back(text);
+                    }
+                }
+
+                vars.set(*var_ptr++, opts.place, tokens);
+            } else {
+                maybe_t<tok_t> t;
+                while ((vars_left() - 1 > 0) && (t = tok.next())) {
+                    auto text = tok.text_of(*t);
+                    if (unescape_string(text, &out, UNESCAPE_DEFAULT)) {
+                        vars.set_one(*var_ptr++, opts.place, out);
+                    } else {
+                        vars.set_one(*var_ptr++, opts.place, text);
+                    }
+                }
+
+                // If we still have tokens, set the last variable to them.
+                if (t = tok.next()) {
+                    wcstring rest = wcstring(buff, t->offset);
+                    vars.set_one(*var_ptr++, opts.place, rest);
+                }
+            }
+            // The rest of the loop is other split-modes, we don't care about those.
+            continue;
+        }
+
        if (!opts.have_delimiter) {
            auto ifs = parser.vars().get(L"IFS");
            if (!ifs.missing_or_empty()) opts.delimiter = ifs->as_string();
--- a/tests/checks/read.fish
+++ b/tests/checks/read.fish
@ -0,0 +1,14 @@
+# RUN: %fish %s
+echo 'a | b' | read -lt a b c
+
+set -l
+# CHECK: a a
+# CHECK: b '|'
+# CHECK: c b
+
+echo 'a"foo bar"b' | read -lt a b c
+
+set -l
+# CHECK: a 'afoo barb'
+# CHECK: b
+# CHECK: c