Implement read --delimiter

This takes a string on which the input is then split, like `string split`. Unlike $IFS, the string is used as a whole, not as a set of characters. If no delimiter is given, there is still a fallback to IFS, which behaves exactly as before. Fixes #4156.
2024-11-26 10:43:47 +08:00 · 2017-07-27 15:06:01 +02:00 · 2017-07-27 15:06:01 +02:00 · b1866b18dc
commit b1866b18dc
parent 78889cc034
5 changed files with 137 additions and 18 deletions
--- a/doc_src/read.txt
+++ b/doc_src/read.txt
@ -13,6 +13,8 @@ The following options are available:

 - `-c CMD` or `--command=CMD` sets the initial string in the interactive mode command buffer to `CMD`.

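+- `-d DELIMITER` or `--delimiter=DELIMITER` splits the input on DELIMITER. The delimiter is used as one complete string (like `string split`), not as a set of characters. An empty DELIMITER makes each character a separate token.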
+
 - `-g` or `--global` makes the variables global.

 - `-i` or `--silent` makes the characters typed obfuscated. This is useful for reading things like passwords or other sensitive information. Note that in bash the short flag is `-s`. We can't use that due to the existing use as an alias for `--shell`.
@ -39,7 +41,7 @@ The following options are available:

 - `-z` or `--null` reads up to NUL instead of newline. Disables interactive mode.

-`read` reads a single line of input from stdin, breaks it into tokens based on the `IFS` shell variable, and then assigns one token to each variable specified in `VARIABLES`. If there are more tokens than variables, the complete remainder is assigned to the last variable. As a special case, if `IFS` is set to the empty string, each character of the input is considered a separate token.
+`read` reads a single line of input from stdin and breaks it into tokens. If a delimiter was set via `-d`/`--delimiter`, the line is split on that delimiter as a complete string (like `string split`); if not, it is split on the (deprecated) `IFS` shell variable, treated as a set of characters. `read` then assigns one token to each variable specified in `VARIABLES`. If there are more tokens than variables, the complete remainder is assigned to the last variable. As a special case, if `IFS` is set to the empty string, each character of the input is considered a separate token.

 If `-a` or `--array` is provided, only one variable name is allowed and the tokens are stored as an array in this variable.

@ -64,4 +66,11 @@ echo hello|read foo
 printf '%s\n' line1 line2 line3 line4 | while read -l foo
                  echo "This is another line: $foo"
              end
+
+# Delimiters given via "-d" are taken as one string
+echo a==b==c | read -d == -l a b c
+echo $a # a
+echo $b # b
+echo $c # c
+
 \endfish
--- a/doc_src/string.txt
+++ b/doc_src/string.txt
@ -91,6 +91,8 @@ Exit status: 0 if at least one replacement was performed, or 1 otherwise.

 `string split` splits each STRING on the separator SEP, which can be an empty string. If `-m` or `--max` is specified, at most MAX splits are done on each STRING. If `-r` or `--right` is given, splitting is performed right-to-left. This is useful in combination with `-m` or `--max`. Exit status: 0 if at least one split was performed, or 1 otherwise.

+See also `read --delimiter`.
+
 \subsection string-sub "sub" subcommand

 `string sub` prints a substring of each string argument. The start of the substring can be specified with `-s` or `--start` followed by a 1-based index value. Positive index values are relative to the start of the string and negative index values are relative to the end of the string. The default start value is 1. The length of the substring can be specified with `-l` or `--length`. If the length is not specified, the substring continues to the end of each STRING. Exit status: 0 if at least one substring operation was performed, 1 otherwise.
--- a/src/builtin_read.cpp
+++ b/src/builtin_read.cpp
@ -2,6 +2,7 @@
 #include "config.h"  // IWYU pragma: keep

 #include <errno.h>
+#include <limits.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <string.h>
@ -37,6 +38,10 @@ struct read_cmd_opts_t {
    const wchar_t *prompt_str = NULL;
    const wchar_t *right_prompt = L"";
    const wchar_t *commandline = L"";
+    // If a delimiter was given. Used to distinguish between the default
+    // empty string and a given empty delimiter.
+    bool have_delimiter = false;
+    wcstring delimiter;
    bool shell = false;
    bool array = false;
    bool silent = false;
@ -44,7 +49,7 @@ struct read_cmd_opts_t {
    int nchars = 0;
 };

-static const wchar_t *short_options = L":ac:ghilm:n:p:suxzP:UR:";
+static const wchar_t *short_options = L":ac:ghilm:n:p:d:suxzP:UR:";
 static const struct woption long_options[] = {{L"export", no_argument, NULL, 'x'},
                                              {L"global", no_argument, NULL, 'g'},
                                              {L"local", no_argument, NULL, 'l'},
@ -57,6 +62,7 @@ static const struct woption long_options[] = {{L"export", no_argument, NULL, 'x'
                                              {L"mode-name", required_argument, NULL, 'm'},
                                              {L"silent", no_argument, NULL, 'i'},
                                              {L"nchars", required_argument, NULL, 'n'},
+                                              {L"delimiter", required_argument, NULL, 'd'},
                                              {L"shell", no_argument, NULL, 's'},
                                              {L"array", no_argument, NULL, 'a'},
                                              {L"null", no_argument, NULL, 'z'},
@ -129,6 +135,11 @@ static int parse_cmd_opts(read_cmd_opts_t &opts, int *optind,  //!OCLINT(high nc
                }
                break;
            }
+            case 'd': {
+                opts.have_delimiter = true;
+                opts.delimiter = w.woptarg;
+                break;
+            }
            case 's': {
                opts.shell = true;
                break;
@ -415,8 +426,13 @@ int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv) {
        return exit_res;
    }

-    env_var_t ifs = env_get_string(L"IFS");
-    if (ifs.missing_or_empty()) {
+    if (!opts.have_delimiter) {
+        env_var_t ifs = env_get_string(L"IFS");
+        if (!ifs.missing_or_empty()) {
+            opts.delimiter = ifs;
+        }
+    }
+    if (opts.delimiter.empty()) {
        // Every character is a separate token.
        size_t bufflen = buff.size();
        if (opts.array) {
@ -446,23 +462,42 @@ int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv) {
            if (i < argc) env_set(argv[i], &buff[j], opts.place);
        }
    } else if (opts.array) {
-        wcstring tokens;
-        tokens.reserve(buff.size());
-        bool empty = true;
+        if (!opts.have_delimiter) {
+            // We're using IFS, so we'll split on every character in that,
+            // not on the entire thing at once.
+            wcstring tokens;
+            tokens.reserve(buff.size());

-        for (wcstring_range loc = wcstring_tok(buff, ifs); loc.first != wcstring::npos;
-             loc = wcstring_tok(buff, ifs, loc)) {
-            if (!empty) tokens.push_back(ARRAY_SEP);
-            tokens.append(buff, loc.first, loc.second);
-            empty = false;
+            for (wcstring_range loc = wcstring_tok(buff, opts.delimiter);
+                 loc.first != wcstring::npos; loc = wcstring_tok(buff, opts.delimiter, loc)) {
+                if (!tokens.empty()) tokens.push_back(ARRAY_SEP);
+                tokens.append(buff, loc.first, loc.second);
+            }
+            env_set(argv[0], tokens.empty() ? NULL : tokens.c_str(), opts.place);
+        } else {
+            wcstring_list_t splits;
+            split_about(buff.begin(), buff.end(), opts.delimiter.begin(), opts.delimiter.end(),
+                        &splits, LONG_MAX);
+            auto val = list_to_array_val(splits);
+            env_set(argv[0], *val == ENV_NULL ? NULL : val->c_str(), opts.place);
        }
-        env_set(argv[0], empty ? NULL : tokens.c_str(), opts.place);
    } else {  // not array
-        wcstring_range loc = wcstring_range(0, 0);
-        for (int i = 0; i < argc; i++) {
-            loc = wcstring_tok(buff, (i + 1 < argc) ? ifs : wcstring(), loc);
-            env_set(argv[i], loc.first == wcstring::npos ? L"" : &buff.c_str()[loc.first],
-                    opts.place);
+        if (!opts.have_delimiter) {
+            wcstring_range loc = wcstring_range(0, 0);
+            for (int i = 0; i < argc; i++) {
+                loc = wcstring_tok(buff, (i + 1 < argc) ? opts.delimiter : wcstring(), loc);
+                env_set(argv[i], loc.first == wcstring::npos ? L"" : &buff.c_str()[loc.first],
+                        opts.place);
+            }
+        } else {
+            wcstring_list_t splits;
+            // We're making at most argc - 1 splits so the last variable
+            // is set to the remaining string.
+            split_about(buff.begin(), buff.end(), opts.delimiter.begin(), opts.delimiter.end(),
+                        &splits, argc - 1);
+            for (size_t i = 0; i < (size_t)argc && i < splits.size(); i++) {
+                env_set(argv[i], splits[i].c_str(), opts.place);
+            }
        }
    }

--- a/tests/read.in
+++ b/tests/read.in
@ -208,3 +208,46 @@ end

 echo '# Confirm reading non-interactively works (#4206 regression)'
 echo abc\ndef | ../test/root/bin/fish -i -c 'read a; read b; show a; show b'
+### Test --delimiter (and $IFS, for now)
+echo a=b | read -l foo bar
+echo $foo
+echo $bar
+echo Delimiter =
+echo a=b | read -l -d = foo bar
+echo $foo
+echo $bar
+echo Delimiter empty
+echo a=b | read -l -d '' foo bar baz
+echo $foo
+echo $bar
+echo $baz
+echo IFS empty string
+set -l IFS ''
+echo a=b | read -l foo bar baz
+echo $foo
+echo $bar
+echo $baz
+echo IFS unset
+set -e IFS
+echo a=b | read -l foo bar baz
+echo $foo
+echo $bar
+echo $baz
+echo Delimiter =
+echo a=b | read -l -d = foo bar baz
+echo $foo
+echo $bar
+echo $baz
+echo
+
+echo 'Multi-char delimiters with -d'
+echo a...b...c | read -l -d "..." a b c
+echo $a
+echo $b
+echo $c
+echo 'Multi-char delimiters with IFS'
+begin
+    set IFS "..."
+    echo a...b...c | read -l a b c
+    echo $a; echo $b; echo $c
+end
--- a/tests/read.out
+++ b/tests/read.out
@ -65,3 +65,33 @@ $a count=1
 $a[1]=|abc|
 $b count=1
 $b[1]=|def|
+a=b
+
+Delimiter =
+a
+b
+Delimiter empty
+a
+=
+b
+IFS empty string
+a
+=
+b
+IFS unset
+a=b
+
+
+Delimiter =
+a
+b
+
+
+Multi-char delimiters with -d
+a
+b
+c
+Multi-char delimiters with IFS
+a
+b
+..c