From a16e2ecb1b25cc23c91a0e5ae041148d2d5505fb Mon Sep 17 00:00:00 2001 From: Xiretza Date: Sun, 5 Feb 2023 22:08:32 +0100 Subject: [PATCH] Port echo builtin to Rust --- CMakeLists.txt | 2 +- fish-rust/src/builtins/echo.rs | 232 +++++++++++++++++++++++++++++ fish-rust/src/builtins/mod.rs | 2 + fish-rust/src/builtins/shared.rs | 1 + fish-rust/src/wchar.rs | 27 ++++ src/builtin.cpp | 6 +- src/builtin.h | 1 + src/builtins/echo.cpp | 243 ------------------------------- src/builtins/echo.h | 11 -- 9 files changed, 268 insertions(+), 257 deletions(-) create mode 100644 fish-rust/src/builtins/echo.rs delete mode 100644 src/builtins/echo.cpp delete mode 100644 src/builtins/echo.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 22d4c6af0..b99e9bd20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ set(FISH_BUILTIN_SRCS src/builtins/bg.cpp src/builtins/bind.cpp src/builtins/block.cpp src/builtins/builtin.cpp src/builtins/cd.cpp src/builtins/command.cpp src/builtins/commandline.cpp src/builtins/complete.cpp src/builtins/contains.cpp - src/builtins/disown.cpp src/builtins/echo.cpp src/builtins/emit.cpp + src/builtins/disown.cpp src/builtins/emit.cpp src/builtins/eval.cpp src/builtins/exit.cpp src/builtins/fg.cpp src/builtins/function.cpp src/builtins/functions.cpp src/builtins/history.cpp src/builtins/jobs.cpp src/builtins/math.cpp src/builtins/printf.cpp src/builtins/path.cpp diff --git a/fish-rust/src/builtins/echo.rs b/fish-rust/src/builtins/echo.rs new file mode 100644 index 000000000..9b251cd87 --- /dev/null +++ b/fish-rust/src/builtins/echo.rs @@ -0,0 +1,232 @@ +//! Implementation of the echo builtin. 
+
+use libc::c_int;
+
+use super::shared::{builtin_missing_argument, io_streams_t, STATUS_CMD_OK, STATUS_INVALID_ARGS};
+use crate::ffi::parser_t;
+use crate::wchar::{wchar_literal_byte, wstr, WString, L};
+use crate::wgetopt::{wgetopter_t, woption};
+
+#[derive(Debug, Clone, Copy)]
+struct Options {
+    print_newline: bool,
+    print_spaces: bool,
+    interpret_special_chars: bool,
+}
+
+impl Default for Options {
+    fn default() -> Self {
+        Self {
+            print_newline: true,
+            print_spaces: true,
+            interpret_special_chars: false,
+        }
+    }
+}
+
+fn parse_options(
+    args: &mut [&wstr],
+    parser: &mut parser_t,
+    streams: &mut io_streams_t,
+) -> Result<(Options, usize), Option<c_int>> {
+    let cmd = args[0];
+
+    const SHORT_OPTS: &wstr = L!("+:Eens");
+    const LONG_OPTS: &[woption] = &[];
+
+    let mut opts = Options::default();
+
+    let mut oldopts = opts;
+    let mut oldoptind = 0;
+
+    let mut w = wgetopter_t::new(SHORT_OPTS, LONG_OPTS, args);
+    while let Some(c) = w.wgetopt_long() {
+        match c {
+            'n' => opts.print_newline = false,
+            'e' => opts.interpret_special_chars = true,
+            's' => opts.print_spaces = false,
+            'E' => opts.interpret_special_chars = false,
+            ':' => {
+                builtin_missing_argument(parser, streams, cmd, args[w.woptind - 1], true);
+                return Err(STATUS_INVALID_ARGS);
+            }
+            '?' => {
+                return Ok((oldopts, w.woptind - 1));
+            }
+            _ => {
+                panic!("unexpected retval from wgetopter::wgetopt_long()");
+            }
+        }
+
+        // Super cheesy: We keep an old copy of the option state around,
+        // so we can revert it in case we get an argument like
+        // "-n foo".
+        // We need to keep it one out-of-date so we can ignore the *last* option.
+        // (this might be an issue in wgetopt, but that's a whole other can of worms
+        // and really only occurs with our weird "put it back" option parsing)
+        if w.woptind == oldoptind + 2 {
+            oldopts = opts;
+            oldoptind = w.woptind;
+        }
+    }
+
+    Ok((opts, w.woptind))
+}
+
+/// Parse a numeric escape sequence at the start of `chars`, returning the number of characters
+/// consumed and the resulting value (`None` if no digit is present). Supported escape sequences:
+///
+/// - `0nnn`: octal value, zero to three digits
+/// - `nnn`: octal value, one to three digits
+/// - `xhh`: hex value, one to two digits
+fn parse_numeric_sequence<I>(chars: I) -> Option<(usize, u8)>
+where
+    I: IntoIterator<Item = char>,
+{
+    let mut chars = chars.into_iter().peekable();
+
+    // the first character of the numeric part of the sequence
+    let mut start = 0;
+
+    let mut base: u8 = 0;
+    let mut max_digits = 0;
+
+    let first = *chars.peek()?;
+    if first.is_digit(8) {
+        // Octal escape
+        base = 8;
+
+        // If the first digit is a 0, we allow four digits (including that zero); otherwise, we
+        // allow 3.
+        max_digits = if first == '0' { 4 } else { 3 };
+    } else if first == 'x' {
+        // Hex escape
+        base = 16;
+        max_digits = 2;
+
+        // Skip the x
+        start = 1;
+    };
+
+    if base == 0 {
+        return None;
+    }
+
+    let mut val: u8 = 0;
+    let mut consumed = start;
+    for digit in chars
+        .skip(start)
+        .take(max_digits)
+        .map_while(|c| c.to_digit(base.into()))
+    {
+        // base is either 8 or 16, so digit can never be >255
+        let digit = u8::try_from(digit).unwrap();
+        // Wrap on overflow like the C++ `unsigned char` did, so e.g. `\777` cannot panic.
+        val = val.wrapping_mul(base).wrapping_add(digit);
+
+        consumed += 1;
+    }
+
+    // We succeeded if we consumed at least one digit; a lone `x` prefix does not count.
+    if consumed > start {
+        Some((consumed, val))
+    } else {
+        None
+    }
+}
+
+/// The echo builtin.
+///
+/// Bash only respects `-n` if it's the first argument. We'll do the same. We also support a new,
+/// fish specific, option `-s` to mean "no spaces".
+pub fn echo(
+    parser: &mut parser_t,
+    streams: &mut io_streams_t,
+    args: &mut [&wstr],
+) -> Option<c_int> {
+    let (opts, optind) = match parse_options(args, parser, streams) {
+        Ok((opts, optind)) => (opts, optind),
+        Err(err @ Some(_)) if err != STATUS_CMD_OK => return err,
+        Err(err) => panic!("Illogical exit code from parse_options(): {err:?}"),
+    };
+
+    // The special character \c can be used to indicate no more output.
+    let mut output_stopped = false;
+
+    // We buffer output so we can write in one go,
+    // this matters when writing to an fd.
+    let mut out = WString::new();
+    let args_to_echo = &args[optind..];
+    'outer: for (idx, arg) in args_to_echo.iter().enumerate() {
+        if opts.print_spaces && idx > 0 {
+            out.push(' ');
+        }
+
+        let mut chars = arg.chars().peekable();
+        while let Some(c) = chars.next() {
+            if !opts.interpret_special_chars || c != '\\' {
+                // Not an escape.
+                out.push(c);
+                continue;
+            }
+
+            let Some(next_char) = chars.peek() else {
+                // Incomplete escape sequence is echoed verbatim
+                out.push('\\');
+                break;
+            };
+
+            // Most escapes consume one character in addition to the backslash; the numeric
+            // sequences may consume more, while an unrecognized escape sequence consumes none.
+            let mut consumed = 1;
+
+            let escaped = match next_char {
+                'a' => '\x07',
+                'b' => '\x08',
+                'e' => '\x1B',
+                'f' => '\x0C',
+                'n' => '\n',
+                'r' => '\r',
+                't' => '\t',
+                'v' => '\x0B',
+                '\\' => '\\',
+                'c' => {
+                    output_stopped = true;
+                    break 'outer;
+                }
+                _ => {
+                    // Octal and hex escape sequences.
+                    if let Some((digits_consumed, narrow_val)) =
+                        parse_numeric_sequence(chars.clone())
+                    {
+                        consumed = digits_consumed;
+                        // The narrow_val is a literal byte that we want to output (#1894).
+                        wchar_literal_byte(narrow_val)
+                    } else {
+                        consumed = 0;
+                        '\\'
+                    }
+                }
+            };
+
+            // Skip over characters that were part of this escape sequence (after the backslash
+            // that was consumed by the `while` loop).
+            // TODO: `Iterator::advance_by()`: https://github.com/rust-lang/rust/issues/77404
+            for _ in 0..consumed {
+                let _ = chars.next();
+            }
+
+            out.push(escaped);
+        }
+    }
+
+    if opts.print_newline && !output_stopped {
+        out.push('\n');
+    }
+
+    if !out.is_empty() {
+        streams.out.append(out);
+    }
+
+    STATUS_CMD_OK
+}
diff --git a/fish-rust/src/builtins/mod.rs b/fish-rust/src/builtins/mod.rs
index 9ae08c6e6..6fab413aa 100644
--- a/fish-rust/src/builtins/mod.rs
+++ b/fish-rust/src/builtins/mod.rs
@@ -1,2 +1,4 @@
 pub mod shared;
+
+pub mod echo;
 pub mod wait;
diff --git a/fish-rust/src/builtins/shared.rs b/fish-rust/src/builtins/shared.rs
index a6e05454d..e770e2c56 100644
--- a/fish-rust/src/builtins/shared.rs
+++ b/fish-rust/src/builtins/shared.rs
@@ -108,6 +108,7 @@ pub fn run_builtin(
     builtin: RustBuiltin,
 ) -> Option<c_int> {
     match builtin {
+        RustBuiltin::Echo => super::echo::echo(parser, streams, args),
         RustBuiltin::Wait => wait::wait(parser, streams, args),
     }
 }
diff --git a/fish-rust/src/wchar.rs b/fish-rust/src/wchar.rs
index 59680df78..fd91fb6de 100644
--- a/fish-rust/src/wchar.rs
+++ b/fish-rust/src/wchar.rs
@@ -33,3 +33,30 @@ pub use widestring_suffix::widestrs;
 
 /// Pull in our extensions.
 pub use crate::wchar_ext::{CharPrefixSuffix, WExt};
+
+// These are in the Unicode private-use range. We really shouldn't use this
+// range but have little choice in the matter given how our lexer/parser works.
+// We can't use non-characters for these two ranges because there are only 66 of
+// them and we need at least 256 + 64.
+//
+// If sizeof(wchar_t)==4 we could avoid using private-use chars; however, that
+// would result in fish having different behavior on machines with 16 versus 32
+// bit wchar_t. It's better that fish behave the same on both types of systems.
+//
+// Note: We don't use the highest 8 bit range (0xF800 - 0xF8FF) because we know
+// of at least one use of a codepoint in that range: the Apple symbol (0xF8FF)
+// on Mac OS X. See http://www.unicode.org/faq/private_use.html.
+const ENCODE_DIRECT_BASE: u32 = 0xF600;
+const ENCODE_DIRECT_END: u32 = ENCODE_DIRECT_BASE + 256;
+
+/// Encode a literal byte in a UTF-32 character. This is required for e.g. the echo builtin, whose
+/// escape sequences can be used to construct raw byte sequences which are then interpreted as e.g.
+/// UTF-8 by the terminal. If we were to interpret each of those bytes as a codepoint and encode it
+/// as a UTF-32 character, printing them would result in several characters instead of one UTF-8
+/// character.
+///
+/// See https://github.com/fish-shell/fish-shell/issues/1894.
+pub fn wchar_literal_byte(byte: u8) -> char {
+    char::from_u32(ENCODE_DIRECT_BASE + u32::from(byte))
+        .expect("private-use codepoint should be valid char")
+}
diff --git a/src/builtin.cpp b/src/builtin.cpp
index 4f8fa7b6c..b4405af23 100644
--- a/src/builtin.cpp
+++ b/src/builtin.cpp
@@ -41,7 +41,6 @@
 #include "builtins/complete.h"
 #include "builtins/contains.h"
 #include "builtins/disown.h"
-#include "builtins/echo.h"
 #include "builtins/emit.h"
 #include "builtins/eval.h"
 #include "builtins/exit.h"
@@ -384,7 +383,7 @@ static constexpr builtin_data_t builtin_datas[] = {
     {L"continue", &builtin_break_continue, N_(L"Skip over remaining innermost loop")},
     {L"count", &builtin_count, N_(L"Count the number of arguments")},
     {L"disown", &builtin_disown, N_(L"Remove job from job list")},
-    {L"echo", &builtin_echo, N_(L"Print arguments")},
+    {L"echo", &implemented_in_rust, N_(L"Print arguments")},
     {L"else", &builtin_generic, N_(L"Evaluate block if condition is false")},
     {L"emit", &builtin_emit, N_(L"Emit an event")},
     {L"end", &builtin_generic, N_(L"End a block of commands")},
@@ -529,6 +528,9 @@ const wchar_t *builtin_get_desc(const wcstring &name) {
 }
 
 static maybe_t<RustBuiltin> try_get_rust_builtin(const wcstring &cmd) {
+    if (cmd == L"echo") {
+        return RustBuiltin::Echo;
+    }
     if (cmd == L"wait") {
         return RustBuiltin::Wait;
     }
diff --git a/src/builtin.h
b/src/builtin.h index a24ea3665..54582475e 100644 --- a/src/builtin.h +++ b/src/builtin.h @@ -109,6 +109,7 @@ int parse_help_only_cmd_opts(help_only_cmd_opts_t &opts, int *optind, int argc, /// An enum of the builtins implemented in Rust. enum RustBuiltin : int32_t { + Echo, Wait, }; #endif diff --git a/src/builtins/echo.cpp b/src/builtins/echo.cpp deleted file mode 100644 index 0f15e36b8..000000000 --- a/src/builtins/echo.cpp +++ /dev/null @@ -1,243 +0,0 @@ -// Implementation of the echo builtin. -#include "config.h" // IWYU pragma: keep - -#include "echo.h" - -#include - -#include "../builtin.h" -#include "../common.h" -#include "../fallback.h" // IWYU pragma: keep -#include "../io.h" -#include "../maybe.h" -#include "../wgetopt.h" -#include "../wutil.h" // IWYU pragma: keep - -struct echo_cmd_opts_t { - bool print_newline = true; - bool print_spaces = true; - bool interpret_special_chars = false; -}; -static const wchar_t *const short_options = L"+:Eens"; -static const struct woption *const long_options = nullptr; - -static int parse_cmd_opts(echo_cmd_opts_t &opts, int *optind, int argc, const wchar_t **argv, - parser_t &parser, io_streams_t &streams) { - UNUSED(parser); - UNUSED(streams); - const wchar_t *cmd = argv[0]; - int opt; - wgetopter_t w; - echo_cmd_opts_t oldopts = opts; - int oldoptind = 0; - while ((opt = w.wgetopt_long(argc, argv, short_options, long_options, nullptr)) != -1) { - switch (opt) { - case 'n': { - opts.print_newline = false; - break; - } - case 'e': { - opts.interpret_special_chars = true; - break; - } - case 's': { - opts.print_spaces = false; - break; - } - case 'E': { - opts.interpret_special_chars = false; - break; - } - case ':': { - builtin_missing_argument(parser, streams, cmd, argv[w.woptind - 1]); - return STATUS_INVALID_ARGS; - } - case '?': { - opts = oldopts; - *optind = w.woptind - 1; - return STATUS_CMD_OK; - } - default: { - DIE("unexpected retval from wgetopt_long"); - } - } - - // Super cheesy: We keep an old copy of 
the option state around, - // so we can revert it in case we get an argument like - // "-n foo". - // We need to keep it one out-of-date so we can ignore the *last* option. - // (this might be an issue in wgetopt, but that's a whole other can of worms - // and really only occurs with our weird "put it back" option parsing) - if (w.woptind == oldoptind + 2) { - oldopts = opts; - oldoptind = w.woptind; - } - } - - *optind = w.woptind; - return STATUS_CMD_OK; -} - -/// Parse a numeric escape sequence in str, returning whether we succeeded. Also return the number -/// of characters consumed and the resulting value. Supported escape sequences: -/// -/// \0nnn: octal value, zero to three digits -/// \nnn: octal value, one to three digits -/// \xhh: hex value, one to two digits -static bool builtin_echo_parse_numeric_sequence(const wchar_t *str, size_t *consumed, - unsigned char *out_val) { - bool success = false; - unsigned int start = 0; // the first character of the numeric part of the sequence - - unsigned int base = 0, max_digits = 0; - if (convert_digit(str[0], 8) != -1) { - // Octal escape - base = 8; - - // If the first digit is a 0, we allow four digits (including that zero); otherwise, we - // allow 3. - max_digits = (str[0] == L'0' ? 4 : 3); - } else if (str[0] == L'x') { - // Hex escape - base = 16; - max_digits = 2; - - // Skip the x - start = 1; - } - - if (base == 0) { - return success; - } - - unsigned int idx; - unsigned char val = 0; // resulting character - for (idx = start; idx < start + max_digits; idx++) { - int digit = convert_digit(str[idx], base); - if (digit == -1) break; - val = val * base + digit; - } - - // We succeeded if we consumed at least one digit. - if (idx > start) { - *consumed = idx; - *out_val = val; - success = true; - } - return success; -} - -/// The echo builtin. -/// -/// Bash only respects -n if it's the first argument. We'll do the same. We also support a new, -/// fish specific, option -s to mean "no spaces". 
-maybe_t builtin_echo(parser_t &parser, io_streams_t &streams, const wchar_t **argv) { - const wchar_t *cmd = argv[0]; - UNUSED(cmd); - int argc = builtin_count_args(argv); - echo_cmd_opts_t opts; - int optind; - int retval = parse_cmd_opts(opts, &optind, argc, argv, parser, streams); - if (retval != STATUS_CMD_OK) return retval; - - // The special character \c can be used to indicate no more output. - bool continue_output = true; - - const wchar_t *const *args_to_echo = argv + optind; - // We buffer output so we can write in one go, - // this matters when writing to an fd. - wcstring out; - for (size_t idx = 0; continue_output && args_to_echo[idx] != nullptr; idx++) { - if (opts.print_spaces && idx > 0) { - out.push_back(' '); - } - - const wchar_t *str = args_to_echo[idx]; - for (size_t j = 0; continue_output && str[j]; j++) { - if (!opts.interpret_special_chars || str[j] != L'\\') { - // Not an escape. - out.push_back(str[j]); - } else { - // Most escapes consume one character in addition to the backslash; the numeric - // sequences may consume more, while an unrecognized escape sequence consumes none. - wchar_t wc; - size_t consumed = 1; - switch (str[j + 1]) { - case L'a': { - wc = L'\a'; - break; - } - case L'b': { - wc = L'\b'; - break; - } - case L'e': { - wc = L'\x1B'; - break; - } - case L'f': { - wc = L'\f'; - break; - } - case L'n': { - wc = L'\n'; - break; - } - case L'r': { - wc = L'\r'; - break; - } - case L't': { - wc = L'\t'; - break; - } - case L'v': { - wc = L'\v'; - break; - } - case L'\\': { - wc = L'\\'; - break; - } - case L'c': { - wc = 0; - continue_output = false; - break; - } - default: { - // Octal and hex escape sequences. - unsigned char narrow_val = 0; - if (builtin_echo_parse_numeric_sequence(str + j + 1, &consumed, - &narrow_val)) { - // Here consumed must have been set to something. The narrow_val is a - // literal byte that we want to output (#1894). 
- wc = ENCODE_DIRECT_BASE + narrow_val % 256; - } else { - // Not a recognized escape. We consume only the backslash. - wc = L'\\'; - consumed = 0; - } - break; - } - } - - // Skip over characters that were part of this escape sequence (but not the - // backslash, which will be handled by the loop increment. - j += consumed; - - if (continue_output) { - out.push_back(wc); - } - } - } - } - if (opts.print_newline && continue_output) { - out.push_back('\n'); - } - - if (!out.empty()) { - streams.out.append(out); - } - - return STATUS_CMD_OK; -} diff --git a/src/builtins/echo.h b/src/builtins/echo.h deleted file mode 100644 index ed4ae2d13..000000000 --- a/src/builtins/echo.h +++ /dev/null @@ -1,11 +0,0 @@ -// Prototypes for executing builtin_echo function. -#ifndef FISH_BUILTIN_ECHO_H -#define FISH_BUILTIN_ECHO_H - -#include "../maybe.h" - -class parser_t; -struct io_streams_t; - -maybe_t builtin_echo(parser_t &parser, io_streams_t &streams, const wchar_t **argv); -#endif