Port echo builtin to Rust

2025-02-21 02:16:32 +08:00 · 2023-02-05 22:08:32 +01:00 · 2023-02-05 22:08:32 +01:00 · a16e2ecb1b
commit a16e2ecb1b
parent 4b85c2f6db
9 changed files with 268 additions and 257 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -103,7 +103,7 @@ set(FISH_BUILTIN_SRCS
    src/builtins/bg.cpp src/builtins/bind.cpp src/builtins/block.cpp
    src/builtins/builtin.cpp src/builtins/cd.cpp src/builtins/command.cpp
    src/builtins/commandline.cpp src/builtins/complete.cpp src/builtins/contains.cpp
-    src/builtins/disown.cpp src/builtins/echo.cpp src/builtins/emit.cpp
+    src/builtins/disown.cpp src/builtins/emit.cpp
    src/builtins/eval.cpp src/builtins/exit.cpp src/builtins/fg.cpp
    src/builtins/function.cpp src/builtins/functions.cpp src/builtins/history.cpp
    src/builtins/jobs.cpp src/builtins/math.cpp src/builtins/printf.cpp src/builtins/path.cpp
--- a/fish-rust/src/builtins/echo.rs
+++ b/fish-rust/src/builtins/echo.rs
@ -0,0 +1,232 @@
+//! Implementation of the echo builtin.
+
+use libc::c_int;
+
+use super::shared::{builtin_missing_argument, io_streams_t, STATUS_CMD_OK, STATUS_INVALID_ARGS};
+use crate::ffi::parser_t;
+use crate::wchar::{wchar_literal_byte, wstr, WString, L};
+use crate::wgetopt::{wgetopter_t, woption};
+
+/// Settings that control how `echo` renders its arguments.
+#[derive(Debug, Clone, Copy)]
+struct Options {
+    /// Terminate the output with a newline; the `-n` flag turns this off.
+    print_newline: bool,
+    /// Separate consecutive arguments with a single space; the `-s` flag turns this off.
+    print_spaces: bool,
+    /// Expand backslash escapes in the arguments; enabled by `-e`, disabled again by `-E`.
+    interpret_special_chars: bool,
+}
+
+impl Default for Options {
+    /// The behavior of `echo` when no flags are given: newline and spaces are printed, escapes
+    /// are left untouched.
+    fn default() -> Self {
+        Options {
+            interpret_special_chars: false,
+            print_spaces: true,
+            print_newline: true,
+        }
+    }
+}
+
+/// Parse the leading flags of an `echo` invocation.
+///
+/// `args[0]` is the command name. On success the parsed [`Options`] are returned together with
+/// the index of the first argument that should be echoed. If the option parser reports a missing
+/// option argument, the error is printed via `builtin_missing_argument` and
+/// `Err(STATUS_INVALID_ARGS)` is returned.
+fn parse_options(
+    args: &mut [&wstr],
+    parser: &mut parser_t,
+    streams: &mut io_streams_t,
+) -> Result<(Options, usize), Option<c_int>> {
+    const SHORT_OPTS: &wstr = L!("+:Eens");
+    const LONG_OPTS: &[woption] = &[];
+
+    let cmd = args[0];
+
+    // `current` accumulates every flag seen so far. `committed` lags behind it and is what we
+    // fall back to when an unknown flag turns up.
+    let mut current = Options::default();
+    let mut committed = current;
+    let mut committed_optind = 0;
+
+    let mut w = wgetopter_t::new(SHORT_OPTS, LONG_OPTS, args);
+    while let Some(flag) = w.wgetopt_long() {
+        match flag {
+            'n' => current.print_newline = false,
+            'e' => current.interpret_special_chars = true,
+            's' => current.print_spaces = false,
+            'E' => current.interpret_special_chars = false,
+            ':' => {
+                builtin_missing_argument(parser, streams, cmd, args[w.woptind - 1], true);
+                return Err(STATUS_INVALID_ARGS);
+            }
+            '?' => {
+                // Unknown flag: discard the uncommitted flags and start echoing at the
+                // argument before `woptind`.
+                return Ok((committed, w.woptind - 1));
+            }
+            _ => {
+                panic!("unexpected retval from wgetopter::wgetopt_long()");
+            }
+        }
+
+        // Admittedly a hack: a snapshot of the option state is kept so that it can be restored
+        // when an argument such as "-n foo" shows up. The snapshot deliberately trails by one
+        // step so that the *last* option can be ignored. (This may well be a wgetopt quirk, but
+        // it only surfaces with this unusual "put it back" style of option parsing.)
+        if w.woptind == committed_optind + 2 {
+            committed = current;
+            committed_optind = w.woptind;
+        }
+    }
+
+    Ok((current, w.woptind))
+}
+
+/// Parse a numeric escape sequence from `chars`, which must start right after the backslash.
+///
+/// Returns the number of characters consumed (including the `x` of a hex escape) and the
+/// resulting byte value, or `None` if `chars` does not start with a numeric escape. Supported
+/// escape sequences:
+///
+/// - `0nnn`: octal value, zero to three digits
+/// - `nnn`: octal value, one to three digits
+/// - `xhh`: hex value, one to two digits
+///
+/// Octal values above `0o377` do not fit into a byte; they wrap around modulo 256, matching the
+/// `unsigned char` arithmetic of the original C++ implementation.
+fn parse_numeric_sequence<I>(chars: I) -> Option<(usize, u8)>
+where
+    I: IntoIterator<Item = char>,
+{
+    let mut chars = chars.into_iter().peekable();
+
+    // The first character selects the radix. `prefix_len` counts the characters that precede the
+    // digits themselves (only the `x` of a hex escape).
+    let first = *chars.peek()?;
+    let (base, max_digits, prefix_len): (u8, usize, usize) = if first.is_digit(8) {
+        // Octal escape. If the first digit is a 0, we allow four digits (including that zero);
+        // otherwise, we allow 3.
+        (8, if first == '0' { 4 } else { 3 }, 0)
+    } else if first == 'x' {
+        // Hex escape; the x itself is skipped below.
+        (16, 2, 1)
+    } else {
+        return None;
+    };
+
+    let mut val: u8 = 0;
+    let mut digits = 0;
+    for digit in chars
+        .skip(prefix_len)
+        .take(max_digits)
+        .map_while(|c| c.to_digit(base.into()))
+    {
+        let digit = u8::try_from(digit).expect("a digit is smaller than its base (at most 16)");
+
+        // Wrap instead of overflowing: `\777` must not panic in debug builds.
+        val = val.wrapping_mul(base).wrapping_add(digit);
+
+        digits += 1;
+    }
+
+    // We succeeded only if at least one actual digit was consumed; a lone `x` is not an escape.
+    if digits > 0 {
+        Some((prefix_len + digits, val))
+    } else {
+        None
+    }
+}
+
+/// The echo builtin: joins its arguments and appends them to `streams.out`.
+///
+/// Like Bash, `-n` is only honored when it comes before the text to print. In addition there is
+/// the fish-specific `-s` flag, which means "no spaces" between arguments.
+///
+/// Returns `STATUS_CMD_OK`, or the error status produced while parsing the options.
+pub fn echo(
+    parser: &mut parser_t,
+    streams: &mut io_streams_t,
+    args: &mut [&wstr],
+) -> Option<c_int> {
+    let (opts, first_operand) = match parse_options(args, parser, streams) {
+        Ok(parsed) => parsed,
+        Err(err @ Some(_)) if err != STATUS_CMD_OK => return err,
+        Err(err) => panic!("Illogical exit code from parse_options(): {err:?}"),
+    };
+
+    // Cleared when a `\c` escape is seen, which ends all output including the final newline.
+    let mut newline_wanted = opts.print_newline;
+
+    // Everything is collected here first so that the stream receives a single write; that makes
+    // a difference when the output goes to an fd.
+    let mut out = WString::new();
+
+    'args: for (idx, arg) in args[first_operand..].iter().enumerate() {
+        if idx > 0 && opts.print_spaces {
+            out.push(' ');
+        }
+
+        let mut rest = arg.chars().peekable();
+        while let Some(c) = rest.next() {
+            if c != '\\' || !opts.interpret_special_chars {
+                // An ordinary character.
+                out.push(c);
+                continue;
+            }
+
+            // Work out which character the escape stands for and how many characters after the
+            // backslash belong to it: one for the simple escapes, possibly more for the numeric
+            // ones, and none when the escape is not recognized.
+            let (escaped, skip) = match rest.peek().copied() {
+                None => {
+                    // A trailing backslash is printed as is.
+                    out.push('\\');
+                    break;
+                }
+                Some('c') => {
+                    newline_wanted = false;
+                    break 'args;
+                }
+                Some('a') => ('\x07', 1),
+                Some('b') => ('\x08', 1),
+                Some('e') => ('\x1B', 1),
+                Some('f') => ('\x0C', 1),
+                Some('n') => ('\n', 1),
+                Some('r') => ('\r', 1),
+                Some('t') => ('\t', 1),
+                Some('v') => ('\x0B', 1),
+                Some('\\') => ('\\', 1),
+                // Octal and hex escape sequences.
+                Some(_) => match parse_numeric_sequence(rest.clone()) {
+                    // The value is a literal byte that we want to output (#1894).
+                    Some((len, byte)) => (wchar_literal_byte(byte), len),
+                    None => ('\\', 0),
+                },
+            };
+
+            // Drop the characters that made up the escape; the backslash itself is already gone.
+            rest.by_ref().take(skip).for_each(drop);
+
+            out.push(escaped);
+        }
+    }
+
+    if newline_wanted {
+        out.push('\n');
+    }
+
+    if !out.is_empty() {
+        streams.out.append(out);
+    }
+
+    STATUS_CMD_OK
+}
--- a/fish-rust/src/builtins/mod.rs
+++ b/fish-rust/src/builtins/mod.rs
@ -1,2 +1,4 @@
 pub mod shared;
+
+pub mod echo;
 pub mod wait;
--- a/fish-rust/src/builtins/shared.rs
+++ b/fish-rust/src/builtins/shared.rs
@ -108,6 +108,7 @@ pub fn run_builtin(
    builtin: RustBuiltin,
 ) -> Option<c_int> {
    match builtin {
+        RustBuiltin::Echo => super::echo::echo(parser, streams, args),
        RustBuiltin::Wait => wait::wait(parser, streams, args),
    }
 }
--- a/fish-rust/src/wchar.rs
+++ b/fish-rust/src/wchar.rs
@ -33,3 +33,30 @@ pub use widestring_suffix::widestrs;

 /// Pull in our extensions.
 pub use crate::wchar_ext::{CharPrefixSuffix, WExt};
+
+// These are in the Unicode private-use range. We really shouldn't use this
+// range but have little choice in the matter given how our lexer/parser works.
+// We can't use non-characters for these two ranges because there are only 66 of
+// them and we need at least 256 + 64.
+//
+// If sizeof(wchar_t)==4 we could avoid using private-use chars; however, that
+// would result in fish having different behavior on machines with 16 versus 32
+// bit wchar_t. It's better that fish behave the same on both types of systems.
+//
+// Note: We don't use the highest 8 bit range (0xF800 - 0xF8FF) because we know
+// of at least one use of a codepoint in that range: the Apple symbol (0xF8FF)
+// on Mac OS X. See http://www.unicode.org/faq/private_use.html.
+const ENCODE_DIRECT_BASE: u32 = 0xF600;
+const ENCODE_DIRECT_END: u32 = ENCODE_DIRECT_BASE + 256;
+
+/// Wrap a literal byte in a single character by mapping it to the code point
+/// `ENCODE_DIRECT_BASE + byte`.
+///
+/// The echo builtin, for example, needs this: its escape sequences can spell out raw byte
+/// sequences that the terminal then interprets as e.g. UTF-8. Treating each such byte as a code
+/// point of its own and encoding it as a UTF-32 character would print several characters where a
+/// single UTF-8 character was intended.
+///
+/// See https://github.com/fish-shell/fish-shell/issues/1894.
+pub fn wchar_literal_byte(byte: u8) -> char {
+    let codepoint = ENCODE_DIRECT_BASE + u32::from(byte);
+    char::from_u32(codepoint).expect("private-use codepoint should be valid char")
+}
--- a/src/builtin.cpp
+++ b/src/builtin.cpp
@ -41,7 +41,6 @@
 #include "builtins/complete.h"
 #include "builtins/contains.h"
 #include "builtins/disown.h"
-#include "builtins/echo.h"
 #include "builtins/emit.h"
 #include "builtins/eval.h"
 #include "builtins/exit.h"
@ -384,7 +383,7 @@ static constexpr builtin_data_t builtin_datas[] = {
    {L"continue", &builtin_break_continue, N_(L"Skip over remaining innermost loop")},
    {L"count", &builtin_count, N_(L"Count the number of arguments")},
    {L"disown", &builtin_disown, N_(L"Remove job from job list")},
-    {L"echo", &builtin_echo, N_(L"Print arguments")},
+    {L"echo", &implemented_in_rust, N_(L"Print arguments")},
    {L"else", &builtin_generic, N_(L"Evaluate block if condition is false")},
    {L"emit", &builtin_emit, N_(L"Emit an event")},
    {L"end", &builtin_generic, N_(L"End a block of commands")},
@ -529,6 +528,9 @@ const wchar_t *builtin_get_desc(const wcstring &name) {
 }

 static maybe_t<RustBuiltin> try_get_rust_builtin(const wcstring &cmd) {
+    if (cmd == L"echo") {
+        return RustBuiltin::Echo;
+    }
    if (cmd == L"wait") {
        return RustBuiltin::Wait;
    }
--- a/src/builtin.h
+++ b/src/builtin.h
@ -109,6 +109,7 @@ int parse_help_only_cmd_opts(help_only_cmd_opts_t &opts, int *optind, int argc,

 /// An enum of the builtins implemented in Rust.
 enum RustBuiltin : int32_t {
+    Echo,
    Wait,
 };
 #endif
--- a/src/builtins/echo.cpp
+++ b/src/builtins/echo.cpp
@ -1,243 +0,0 @@
-// Implementation of the echo builtin.
-#include "config.h"  // IWYU pragma: keep
-
-#include "echo.h"
-
-#include <cstddef>
-
-#include "../builtin.h"
-#include "../common.h"
-#include "../fallback.h"  // IWYU pragma: keep
-#include "../io.h"
-#include "../maybe.h"
-#include "../wgetopt.h"
-#include "../wutil.h"  // IWYU pragma: keep
-
-struct echo_cmd_opts_t {
-    bool print_newline = true;
-    bool print_spaces = true;
-    bool interpret_special_chars = false;
-};
-static const wchar_t *const short_options = L"+:Eens";
-static const struct woption *const long_options = nullptr;
-
-static int parse_cmd_opts(echo_cmd_opts_t &opts, int *optind, int argc, const wchar_t **argv,
-                          parser_t &parser, io_streams_t &streams) {
-    UNUSED(parser);
-    UNUSED(streams);
-    const wchar_t *cmd = argv[0];
-    int opt;
-    wgetopter_t w;
-    echo_cmd_opts_t oldopts = opts;
-    int oldoptind = 0;
-    while ((opt = w.wgetopt_long(argc, argv, short_options, long_options, nullptr)) != -1) {
-        switch (opt) {
-            case 'n': {
-                opts.print_newline = false;
-                break;
-            }
-            case 'e': {
-                opts.interpret_special_chars = true;
-                break;
-            }
-            case 's': {
-                opts.print_spaces = false;
-                break;
-            }
-            case 'E': {
-                opts.interpret_special_chars = false;
-                break;
-            }
-            case ':': {
-                builtin_missing_argument(parser, streams, cmd, argv[w.woptind - 1]);
-                return STATUS_INVALID_ARGS;
-            }
-            case '?': {
-                opts = oldopts;
-                *optind = w.woptind - 1;
-                return STATUS_CMD_OK;
-            }
-            default: {
-                DIE("unexpected retval from wgetopt_long");
-            }
-        }
-
-        // Super cheesy: We keep an old copy of the option state around,
-        // so we can revert it in case we get an argument like
-        // "-n foo".
-        // We need to keep it one out-of-date so we can ignore the *last* option.
-        // (this might be an issue in wgetopt, but that's a whole other can of worms
-        //  and really only occurs with our weird "put it back" option parsing)
-        if (w.woptind == oldoptind + 2) {
-            oldopts = opts;
-            oldoptind = w.woptind;
-        }
-    }
-
-    *optind = w.woptind;
-    return STATUS_CMD_OK;
-}
-
-/// Parse a numeric escape sequence in str, returning whether we succeeded. Also return the number
-/// of characters consumed and the resulting value. Supported escape sequences:
-///
-/// \0nnn: octal value, zero to three digits
-/// \nnn: octal value, one to three digits
-/// \xhh: hex value, one to two digits
-static bool builtin_echo_parse_numeric_sequence(const wchar_t *str, size_t *consumed,
-                                                unsigned char *out_val) {
-    bool success = false;
-    unsigned int start = 0;  // the first character of the numeric part of the sequence
-
-    unsigned int base = 0, max_digits = 0;
-    if (convert_digit(str[0], 8) != -1) {
-        // Octal escape
-        base = 8;
-
-        // If the first digit is a 0, we allow four digits (including that zero); otherwise, we
-        // allow 3.
-        max_digits = (str[0] == L'0' ? 4 : 3);
-    } else if (str[0] == L'x') {
-        // Hex escape
-        base = 16;
-        max_digits = 2;
-
-        // Skip the x
-        start = 1;
-    }
-
-    if (base == 0) {
-        return success;
-    }
-
-    unsigned int idx;
-    unsigned char val = 0;  // resulting character
-    for (idx = start; idx < start + max_digits; idx++) {
-        int digit = convert_digit(str[idx], base);
-        if (digit == -1) break;
-        val = val * base + digit;
-    }
-
-    // We succeeded if we consumed at least one digit.
-    if (idx > start) {
-        *consumed = idx;
-        *out_val = val;
-        success = true;
-    }
-    return success;
-}
-
-/// The echo builtin.
-///
-/// Bash only respects -n if it's the first argument. We'll do the same. We also support a new,
-/// fish specific, option -s to mean "no spaces".
-maybe_t<int> builtin_echo(parser_t &parser, io_streams_t &streams, const wchar_t **argv) {
-    const wchar_t *cmd = argv[0];
-    UNUSED(cmd);
-    int argc = builtin_count_args(argv);
-    echo_cmd_opts_t opts;
-    int optind;
-    int retval = parse_cmd_opts(opts, &optind, argc, argv, parser, streams);
-    if (retval != STATUS_CMD_OK) return retval;
-
-    // The special character \c can be used to indicate no more output.
-    bool continue_output = true;
-
-    const wchar_t *const *args_to_echo = argv + optind;
-    // We buffer output so we can write in one go,
-    // this matters when writing to an fd.
-    wcstring out;
-    for (size_t idx = 0; continue_output && args_to_echo[idx] != nullptr; idx++) {
-        if (opts.print_spaces && idx > 0) {
-            out.push_back(' ');
-        }
-
-        const wchar_t *str = args_to_echo[idx];
-        for (size_t j = 0; continue_output && str[j]; j++) {
-            if (!opts.interpret_special_chars || str[j] != L'\\') {
-                // Not an escape.
-                out.push_back(str[j]);
-            } else {
-                // Most escapes consume one character in addition to the backslash; the numeric
-                // sequences may consume more, while an unrecognized escape sequence consumes none.
-                wchar_t wc;
-                size_t consumed = 1;
-                switch (str[j + 1]) {
-                    case L'a': {
-                        wc = L'\a';
-                        break;
-                    }
-                    case L'b': {
-                        wc = L'\b';
-                        break;
-                    }
-                    case L'e': {
-                        wc = L'\x1B';
-                        break;
-                    }
-                    case L'f': {
-                        wc = L'\f';
-                        break;
-                    }
-                    case L'n': {
-                        wc = L'\n';
-                        break;
-                    }
-                    case L'r': {
-                        wc = L'\r';
-                        break;
-                    }
-                    case L't': {
-                        wc = L'\t';
-                        break;
-                    }
-                    case L'v': {
-                        wc = L'\v';
-                        break;
-                    }
-                    case L'\\': {
-                        wc = L'\\';
-                        break;
-                    }
-                    case L'c': {
-                        wc = 0;
-                        continue_output = false;
-                        break;
-                    }
-                    default: {
-                        // Octal and hex escape sequences.
-                        unsigned char narrow_val = 0;
-                        if (builtin_echo_parse_numeric_sequence(str + j + 1, &consumed,
-                                                                &narrow_val)) {
-                            // Here consumed must have been set to something. The narrow_val is a
-                            // literal byte that we want to output (#1894).
-                            wc = ENCODE_DIRECT_BASE + narrow_val % 256;
-                        } else {
-                            // Not a recognized escape. We consume only the backslash.
-                            wc = L'\\';
-                            consumed = 0;
-                        }
-                        break;
-                    }
-                }
-
-                // Skip over characters that were part of this escape sequence (but not the
-                // backslash, which will be handled by the loop increment.
-                j += consumed;
-
-                if (continue_output) {
-                    out.push_back(wc);
-                }
-            }
-        }
-    }
-    if (opts.print_newline && continue_output) {
-        out.push_back('\n');
-    }
-
-    if (!out.empty()) {
-        streams.out.append(out);
-    }
-
-    return STATUS_CMD_OK;
-}
--- a/src/builtins/echo.h
+++ b/src/builtins/echo.h
@ -1,11 +0,0 @@
-// Prototypes for executing builtin_echo function.
-#ifndef FISH_BUILTIN_ECHO_H
-#define FISH_BUILTIN_ECHO_H
-
-#include "../maybe.h"
-
-class parser_t;
-struct io_streams_t;
-
-maybe_t<int> builtin_echo(parser_t &parser, io_streams_t &streams, const wchar_t **argv);
-#endif