Port common.{h,cpp} to Rust

Most of it is duplicated, hence untested.

Functions like mbrtowc are not exposed by the libc crate, so declare them
ourselves.
Since we don't know the definition of C macros, add two big hacks to make
this work:
1. Replace MB_LEN_MAX and mbstate_t with values (resp. types) that should
   be large enough for any implementation.
2. Detect the definition of MB_CUR_MAX in the build script. This requires
   more changes for each new libc. We could also use this approach for 1.

Additionally, this commit brings a small behavior change to
read_unquoted_escape(): we cannot decode surrogate code points like \UDE01
into a Rust char, so use � (\UFFFD, replacement character) instead.
Previously, we added such code points to a wcstring; looks like they were
ignored when printed.
This commit is contained in:
Johannes Altmanninger 2023-03-26 17:23:05 +02:00
parent 998cb7f1cd
commit 05bad5eda1
33 changed files with 1837 additions and 556 deletions

1
fish-rust/Cargo.lock generated
View File

@ -368,6 +368,7 @@ dependencies = [
"autocxx",
"autocxx-build",
"bitflags",
"cc",
"cxx",
"cxx-build",
"cxx-gen",

View File

@ -26,6 +26,7 @@ widestring = "1.0.2"
[build-dependencies]
autocxx-build = "0.23.1"
cc = { git = "https://github.com/mqudsi/cc-rs", branch = "fish" }
cxx-build = { git = "https://github.com/fish-shell/cxx", branch = "fish" }
cxx-gen = { git = "https://github.com/fish-shell/cxx", branch = "fish" }
miette = { version = "5", features = ["fancy"] }

View File

@ -1,6 +1,8 @@
use miette::miette;
fn main() -> miette::Result<()> {
cc::Build::new().file("src/compat.c").compile("libcompat.a");
let rust_dir = std::env::var("CARGO_MANIFEST_DIR").expect("Env var CARGO_MANIFEST_DIR missing");
let target_dir =
std::env::var("FISH_RUST_TARGET_DIR").unwrap_or(format!("{}/{}", rust_dir, "target/"));
@ -25,6 +27,7 @@ fn main() -> miette::Result<()> {
let source_files = vec![
"src/abbrs.rs",
"src/event.rs",
"src/common.rs",
"src/fd_monitor.rs",
"src/fd_readable_set.rs",
"src/fds.rs",

File diff suppressed because it is too large Load Diff

3
fish-rust/src/compat.c Normal file
View File

@ -0,0 +1,3 @@
#include <stdlib.h>
/* MB_CUR_MAX is a macro and therefore cannot be referenced through FFI; expose its value as a
 * real function instead. The explicit `void` makes this a proper prototype (an empty parameter
 * list declares a function with unspecified parameters in C). */
size_t C_MB_CUR_MAX(void) { return MB_CUR_MAX; }

8
fish-rust/src/compat.rs Normal file
View File

@ -0,0 +1,8 @@
extern "C" {
    /// Shim defined in `compat.c` that returns the value of the C `MB_CUR_MAX` macro.
    fn C_MB_CUR_MAX() -> usize;
}

/// Returns the value of the C `MB_CUR_MAX` macro, obtained through the `C_MB_CUR_MAX` shim.
// The name deliberately mirrors the C macro, hence the lint exception.
#[allow(non_snake_case)]
pub fn MB_CUR_MAX() -> usize {
    // SAFETY: the shim takes no arguments and only returns the macro's value.
    unsafe { C_MB_CUR_MAX() }
}

View File

@ -38,6 +38,11 @@ pub mod flags {
c_int(i32::from(val.bits()))
}
}
/// Converts an `EnvMode` into its raw bit representation, as returned by `bits()`.
impl From<EnvMode> for u16 {
fn from(val: EnvMode) -> Self {
val.bits()
}
}
}
/// Return values for `env_stack_t::set()`.

View File

@ -1,39 +1,34 @@
use crate::wchar::{EXPAND_RESERVED_BASE, EXPAND_RESERVED_END};
use crate::common::{char_offset, EXPAND_RESERVED_BASE, EXPAND_RESERVED_END};
use crate::wchar::wstr;
use widestring_suffix::widestrs;
/// Private use area characters used in expansions
#[repr(u32)]
pub enum ExpandChars {
/// Character representing a home directory.
HomeDirectory = EXPAND_RESERVED_BASE as u32,
/// Character representing process expansion for %self.
ProcessExpandSelf,
/// Character representing variable expansion.
VariableExpand,
/// Character representing variable expansion into a single element.
VariableExpandSingle,
/// Character representing the start of a bracket expansion.
BraceBegin,
/// Character representing the end of a bracket expansion.
BraceEnd,
/// Character representing separation between two bracket elements.
BraceSep,
/// Character that takes the place of any whitespace within non-quoted text in braces
BraceSpace,
/// Separate subtokens in a token with this character.
InternalSeparator,
/// Character representing an empty variable expansion. Only used transitively while expanding
/// variables.
VariableExpandEmpty,
}
/// Character representing a home directory.
pub const HOME_DIRECTORY: char = char_offset(EXPAND_RESERVED_BASE, 0);
/// Character representing process expansion for %self.
pub const PROCESS_EXPAND_SELF: char = char_offset(EXPAND_RESERVED_BASE, 1);
/// Character representing variable expansion.
pub const VARIABLE_EXPAND: char = char_offset(EXPAND_RESERVED_BASE, 2);
/// Character representing variable expansion into a single element.
pub const VARIABLE_EXPAND_SINGLE: char = char_offset(EXPAND_RESERVED_BASE, 3);
/// Character representing the start of a bracket expansion.
pub const BRACE_BEGIN: char = char_offset(EXPAND_RESERVED_BASE, 4);
/// Character representing the end of a bracket expansion.
pub const BRACE_END: char = char_offset(EXPAND_RESERVED_BASE, 5);
/// Character representing separation between two bracket elements.
pub const BRACE_SEP: char = char_offset(EXPAND_RESERVED_BASE, 6);
/// Character that takes the place of any whitespace within non-quoted text in braces
pub const BRACE_SPACE: char = char_offset(EXPAND_RESERVED_BASE, 7);
/// Separate subtokens in a token with this character.
pub const INTERNAL_SEPARATOR: char = char_offset(EXPAND_RESERVED_BASE, 8);
/// Character representing an empty variable expansion. Only used transitively while expanding
/// variables.
pub const VARIABLE_EXPAND_EMPTY: char = char_offset(EXPAND_RESERVED_BASE, 9);
const _: () = assert!(
EXPAND_RESERVED_END as u32 > ExpandChars::VariableExpandEmpty as u32,
EXPAND_RESERVED_END as u32 > VARIABLE_EXPAND_EMPTY as u32,
"Characters used in expansions must stay within private use area"
);
impl From<ExpandChars> for char {
fn from(val: ExpandChars) -> Self {
// We know this is safe because we limit the the range of this enum
unsafe { char::from_u32_unchecked(val as _) }
}
}
/// The string represented by PROCESS_EXPAND_SELF
#[widestrs]
pub const PROCESS_EXPAND_SELF_STR: &wstr = "%self"L;

View File

@ -53,8 +53,6 @@ include_cpp! {
generate!("env_var_t")
generate!("make_pipes_ffi")
generate!("valid_var_name_char")
generate!("get_flog_file_fd")
generate!("log_extra_to_flog_file")
@ -100,9 +98,6 @@ include_cpp! {
generate!("re::regex_t")
generate!("re::regex_result_ffi")
generate!("re::try_compile_ffi")
generate!("wcs2string")
generate!("wcs2zstring")
generate!("str2wcstring")
generate!("signal_handle")
generate!("signal_check_cancel")

View File

@ -188,7 +188,15 @@ macro_rules! FLOG {
}
};
}
pub(crate) use FLOG;
// TODO implement.
// For now this is a thin wrapper: it takes a category identifier followed by one or more
// comma-separated expressions and forwards all of them unchanged to `FLOG!`.
macro_rules! FLOGF {
($category:ident, $($elem:expr),+) => {
crate::flog::FLOG!($category, $($elem),*);
}
}
pub(crate) use {FLOG, FLOGF};
/// For each category, if its name matches the wildcard, set its enabled to the given sense.
fn apply_one_wildcard(wc_esc: &wstr, sense: bool) {

View File

@ -12,6 +12,7 @@ mod common;
mod abbrs;
mod builtins;
mod color;
mod compat;
mod env;
mod event;
mod expand;
@ -51,6 +52,7 @@ mod wchar_ext;
mod wchar_ffi;
mod wcstringutil;
mod wgetopt;
mod wildcard;
mod wutil;
// Don't use `#[cfg(test)]` here to make sure ffi tests are built and tested

View File

@ -1,5 +1,5 @@
use crate::{
expand::ExpandChars::HomeDirectory,
expand::HOME_DIRECTORY,
wchar::{wstr, WExt, WString, L},
};
@ -12,7 +12,7 @@ pub fn path_apply_working_directory(path: &wstr, working_directory: &wstr) -> WS
// We're going to make sure that if we want to prepend the wd, that the string has no leading
// "/".
let prepend_wd = path.char_at(0) != '/' && path.char_at(0) != HomeDirectory.into();
let prepend_wd = path.char_at(0) != '/' && path.char_at(0) != HOME_DIRECTORY;
if !prepend_wd {
// No need to prepend the wd, so just return the path we were given.

View File

@ -1,7 +1,8 @@
//! A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be
//! extended to support marks, tokenizing multiple strings and disposing of unused string segments.
use crate::ffi::{valid_var_name_char, wcharz_t};
use crate::common::valid_var_name_char;
use crate::ffi::wcharz_t;
use crate::future_feature_flags::{feature_test, FeatureFlag};
use crate::parse_constants::SOURCE_OFFSET_INVALID;
use crate::redirection::RedirectionMode;
@ -1357,7 +1358,7 @@ pub fn variable_assignment_equals_pos(txt: &wstr) -> Option<usize> {
// TODO bracket indexing
for (i, c) in txt.chars().enumerate() {
if !found_potential_variable {
if !valid_var_name_char(c as wchar_t) {
if !valid_var_name_char(c) {
return None;
}
found_potential_variable = true;
@ -1365,7 +1366,7 @@ pub fn variable_assignment_equals_pos(txt: &wstr) -> Option<usize> {
if c == '=' {
return Some(i);
}
if !valid_var_name_char(c as wchar_t) {
if !valid_var_name_char(c) {
return None;
}
}

View File

@ -4,6 +4,7 @@
//! - wstr: a string slice without a nul terminator. Like `&str` but wide chars.
//! - WString: an owning string without a nul terminator. Like `String` but wide chars.
use crate::common::{ENCODE_DIRECT_BASE, ENCODE_DIRECT_END};
pub use widestring::{Utf32Str as wstr, Utf32String as WString};
/// Pull in our extensions.
@ -30,43 +31,6 @@ pub(crate) use L;
/// Note: the resulting string is NOT nul-terminated.
pub use widestring_suffix::widestrs;
// Use Unicode "non-characters" for internal characters as much as we can. This
// gives us 32 "characters" for internal use that we can guarantee should not
// appear in our input stream. See http://www.unicode.org/faq/private_use.html.
pub const RESERVED_CHAR_BASE: char = '\u{FDD0}';
pub const RESERVED_CHAR_END: char = '\u{FDF0}';
// Split the available non-character values into two ranges to ensure there are
// no conflicts among the places we use these special characters.
pub const EXPAND_RESERVED_BASE: char = RESERVED_CHAR_BASE;
pub const EXPAND_RESERVED_END: char = match char::from_u32(EXPAND_RESERVED_BASE as u32 + 16u32) {
Some(c) => c,
None => panic!("private use codepoint in expansion region should be valid char"),
};
pub const WILDCARD_RESERVED_BASE: char = EXPAND_RESERVED_END;
pub const WILDCARD_RESERVED_END: char = match char::from_u32(WILDCARD_RESERVED_BASE as u32 + 16u32)
{
Some(c) => c,
None => panic!("private use codepoint in wildcard region should be valid char"),
};
// These are in the Unicode private-use range. We really shouldn't use this
// range but have little choice in the matter given how our lexer/parser works.
// We can't use non-characters for these two ranges because there are only 66 of
// them and we need at least 256 + 64.
//
// If sizeof(wchar_t)==4 we could avoid using private-use chars; however, that
// would result in fish having different behavior on machines with 16 versus 32
// bit wchar_t. It's better that fish behave the same on both types of systems.
//
// Note: We don't use the highest 8 bit range (0xF800 - 0xF8FF) because we know
// of at least one use of a codepoint in that range: the Apple symbol (0xF8FF)
// on Mac OS X. See http://www.unicode.org/faq/private_use.html.
pub const ENCODE_DIRECT_BASE: char = '\u{F600}';
pub const ENCODE_DIRECT_END: char = match char::from_u32(ENCODE_DIRECT_BASE as u32 + 256) {
Some(c) => c,
None => panic!("private use codepoint in encode direct region should be valid char"),
};
/// Encode a literal byte in a UTF-32 character. This is required for e.g. the echo builtin, whose
/// escape sequences can be used to construct raw byte sequences which are then interpreted as e.g.
/// UTF-8 by the terminal. If we were to interpret each of those bytes as a codepoint and encode it
@ -78,3 +42,16 @@ pub fn encode_byte_to_char(byte: u8) -> char {
char::from_u32(u32::from(ENCODE_DIRECT_BASE) + u32::from(byte))
.expect("private-use codepoint should be valid char")
}
/// Decode a literal byte from a UTF-32 character.
///
/// Returns `Some(byte)` if `c` lies in the half-open range
/// `[ENCODE_DIRECT_BASE, ENCODE_DIRECT_END)`, where the byte is the offset of `c` from
/// `ENCODE_DIRECT_BASE`; returns `None` for every other character.
pub fn decode_byte_from_char(c: char) -> Option<u8> {
    if !(ENCODE_DIRECT_BASE..ENCODE_DIRECT_END).contains(&c) {
        return None;
    }
    let offset = u32::from(c) - u32::from(ENCODE_DIRECT_BASE);
    Some(u8::try_from(offset).unwrap())
}

View File

@ -1,6 +1,66 @@
//! Helper functions for working with wcstring.
use crate::wchar::{wstr, WString};
use crate::compat::MB_CUR_MAX;
use crate::expand::INTERNAL_SEPARATOR;
use crate::flog::FLOGF;
use crate::wchar::{decode_byte_from_char, wstr, WString, L};
use crate::wutil::encoding::{wcrtomb, zero_mbstate, AT_LEAST_MB_LEN_MAX};
/// Implementation of wcs2string that accepts a callback.
///
/// Every character of `input` is converted to bytes and the resulting byte slice is handed to
/// `func`. If `func` returns false, conversion stops immediately; otherwise it continues.
/// Returns false if the callback returned false, otherwise true.
pub fn wcs2string_callback(input: &wstr, mut func: impl FnMut(&[u8]) -> bool) -> bool {
    let mut state = zero_mbstate();
    let mut buf = [0_u8; AT_LEAST_MB_LEN_MAX];

    for c in input.chars() {
        // TODO: this doesn't seem sound.
        if c == INTERNAL_SEPARATOR {
            // Internal separators produce no output.
            continue;
        }

        if let Some(byte) = decode_byte_from_char(c) {
            // A directly-encoded byte is passed through verbatim.
            buf[0] = byte;
            if !func(&buf[..1]) {
                return false;
            }
            continue;
        }

        if MB_CUR_MAX() == 1 {
            // single-byte locale (C/POSIX/ISO-8859)
            // Anything that does not fit into one byte is emitted as a question-mark.
            let narrow = if u32::from(c) > 0xFF { '?' } else { c };
            buf[0] = narrow as u8;
            if !func(&buf[..1]) {
                return false;
            }
            continue;
        }

        // Multibyte locale: let libc do the conversion into a freshly zeroed buffer.
        buf = [0; AT_LEAST_MB_LEN_MAX];
        let len = unsafe { wcrtomb(buf.as_mut_ptr().cast(), c as libc::wchar_t, &mut state) };
        if len == usize::MAX {
            // (size_t)-1 signals a character without narrow representation; report it and
            // start over with a clean conversion state.
            wcs2string_bad_char(c);
            state = zero_mbstate();
        } else if !func(&buf[..len]) {
            return false;
        }
    }
    true
}
/// Logs, under the `char_encoding` category, that the wide character `c` has no narrow
/// representation.
fn wcs2string_bad_char(c: char) {
FLOGF!(
char_encoding,
L!("Wide character U+%4X has no narrow representation"),
c
);
}
/// Joins strings with a separator.
pub fn join_strings(strs: &[&wstr], sep: char) -> WString {

13
fish-rust/src/wildcard.rs Normal file
View File

@ -0,0 +1,13 @@
// Enumeration of all wildcard types.
use crate::common::{char_offset, WILDCARD_RESERVED_BASE};
/// Character representing any character except '/' (slash).
pub const ANY_CHAR: char = char_offset(WILDCARD_RESERVED_BASE, 0);
/// Character representing any character string not containing '/' (slash).
pub const ANY_STRING: char = char_offset(WILDCARD_RESERVED_BASE, 1);
/// Character representing any character string.
pub const ANY_STRING_RECURSIVE: char = char_offset(WILDCARD_RESERVED_BASE, 2);
/// This is a special pseudo-char that is not used other than to mark the
/// end of the special characters so we can sanity check the range.
pub const ANY_SENTINEL: char = char_offset(WILDCARD_RESERVED_BASE, 3);

View File

@ -0,0 +1,19 @@
// Declarations of C library multibyte conversion functions, declared by hand here.
// The conversion state is passed as a pointer to this module's own `mbstate_t` stand-in.
extern "C" {
// Converts the wide character `wc` into bytes stored at `s`. Callers in this crate treat a
// return value of `usize::MAX` (i.e. `(size_t)-1`) as "no narrow representation" and
// otherwise use the return value as the number of bytes written.
pub fn wcrtomb(s: *mut libc::c_char, wc: libc::wchar_t, ps: *mut mbstate_t) -> usize;
// Counterpart of `wcrtomb`: converts up to `n` bytes starting at `s` into a wide character
// stored at `pwc`. NOTE(review): see the C standard for the meaning of the return value.
pub fn mbrtowc(
pwc: *mut libc::wchar_t,
s: *const libc::c_char,
n: usize,
p: *mut mbstate_t,
) -> usize;
}
// HACK: the real `mbstate_t` lives in libc but is not exposed to us. Because only libc ever
// writes to it, an opaque blob that should be big enough for every implementation suffices.
pub type mbstate_t = [u64; 16];

/// Returns a conversion state with every word set to zero.
pub fn zero_mbstate() -> mbstate_t {
    Default::default()
}

// HACK: stand-in for libc's MB_LEN_MAX macro, which is not easy to get at from Rust.
pub const AT_LEAST_MB_LEN_MAX: usize = 32;

View File

@ -1,3 +1,4 @@
pub mod encoding;
pub mod errors;
pub mod gettext;
mod normalize_path;
@ -6,6 +7,7 @@ pub mod wcstod;
pub mod wcstoi;
mod wrealpath;
use crate::common::fish_reserved_codepoint;
pub(crate) use gettext::{wgettext, wgettext_fmt};
pub use normalize_path::*;
pub(crate) use printf::sprintf;
@ -28,3 +30,21 @@ pub fn perror(s: &str) {
let _ = stderr.write_all(slice);
let _ = stderr.write_all(b"\n");
}
const PUA1_START: char = '\u{E000}';
const PUA1_END: char = '\u{F900}';
const PUA2_START: char = '\u{F0000}';
const PUA2_END: char = '\u{FFFFE}';
const PUA3_START: char = '\u{100000}';
const PUA3_END: char = '\u{10FFFE}';
/// Return one if the code point is in a Unicode private use area.
fn fish_is_pua(c: char) -> bool {
PUA1_START <= c && c < PUA1_END
}
/// Alphanumeric test that never accepts fish's reserved code points or private use characters.
/// We need this because there are too many implementations that don't return the proper answer
/// for some code points. See issue #3050.
pub fn fish_iswalnum(c: char) -> bool {
    if fish_reserved_codepoint(c) || fish_is_pua(c) {
        return false;
    }
    c.is_alphanumeric()
}

View File

@ -4,13 +4,8 @@ use std::{
os::unix::prelude::{OsStrExt, OsStringExt},
};
use cxx::let_cxx_string;
use crate::{
ffi::{str2wcstring, wcs2zstring},
wchar::{wstr, WString},
wchar_ffi::{WCharFromFFI, WCharToFFI},
};
use crate::common::{str2wcstring, wcs2zstring};
use crate::wchar::{wstr, WString};
/// Wide character realpath. The last path component does not need to be valid. If an error occurs,
/// `wrealpath()` returns `None`
@ -19,7 +14,7 @@ pub fn wrealpath(pathname: &wstr) -> Option<WString> {
return None;
}
let mut narrow_path: Vec<u8> = wcs2zstring(&pathname.to_ffi()).from_ffi();
let mut narrow_path: Vec<u8> = wcs2zstring(pathname).into();
// Strip trailing slashes. This treats "/a//" as equivalent to "/a" if /a is a non-directory.
while narrow_path.len() > 1 && narrow_path[narrow_path.len() - 1] == b'/' {
@ -68,7 +63,5 @@ pub fn wrealpath(pathname: &wstr) -> Option<WString> {
}
};
let_cxx_string!(s = real_path);
Some(str2wcstring(&s).from_ffi())
Some(str2wcstring(&real_path))
}

View File

@ -67,9 +67,8 @@ static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token
if (!needs_expand) {
result = keyword_with_name(token);
} else {
wcstring storage;
if (unescape_string(token, &storage, 0)) {
result = keyword_with_name(storage);
if (auto unescaped = unescape_string(token, 0)) {
result = keyword_with_name(*unescaped);
}
}
}

View File

@ -204,12 +204,11 @@ maybe_t<int> builtin_complete(parser_t &parser, io_streams_t &streams, const wch
}
case 'p':
case 'c': {
wcstring tmp;
if (unescape_string(w.woptarg, &tmp, UNESCAPE_SPECIAL)) {
if (auto tmp = unescape_string(w.woptarg, UNESCAPE_SPECIAL)) {
if (opt == 'p')
path.push_back(tmp);
path.push_back(*tmp);
else
cmd_to_complete.push_back(tmp);
cmd_to_complete.push_back(*tmp);
} else {
streams.err.append_format(_(L"%ls: Invalid token '%ls'\n"), cmd, w.woptarg);
return STATUS_INVALID_ARGS;

View File

@ -531,14 +531,13 @@ maybe_t<int> builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t
if (opts.tokenize) {
auto tok = new_tokenizer(buff.c_str(), TOK_ACCEPT_UNFINISHED);
wcstring out;
if (opts.array) {
// Array mode: assign each token as a separate element of the sole var.
wcstring_list_t tokens;
while (auto t = tok->next()) {
auto text = *tok->text_of(*t);
if (unescape_string(text, &out, UNESCAPE_DEFAULT)) {
tokens.push_back(out);
if (auto out = unescape_string(text, UNESCAPE_DEFAULT)) {
tokens.push_back(*out);
} else {
tokens.push_back(text);
}
@ -549,8 +548,8 @@ maybe_t<int> builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t
std::unique_ptr<tok_t> t;
while ((vars_left() - 1 > 0) && (t = tok->next())) {
auto text = *tok->text_of(*t);
if (unescape_string(text, &out, UNESCAPE_DEFAULT)) {
parser.set_var_and_fire(*var_ptr++, opts.place, out);
if (auto out = unescape_string(text, UNESCAPE_DEFAULT)) {
parser.set_var_and_fire(*var_ptr++, opts.place, *out);
} else {
parser.set_var_and_fire(*var_ptr++, opts.place, text);
}

View File

@ -737,10 +737,9 @@ static int string_unescape(parser_t &parser, io_streams_t &streams, int argc,
arg_iterator_t aiter(argv, optind, streams);
while (const wcstring *arg = aiter.nextstr()) {
wcstring result;
wcstring sep = aiter.want_newline() ? L"\n" : L"";
if (unescape_string(*arg, &result, flags, opts.escape_style)) {
streams.out.append(result + sep);
if (auto result = unescape_string(*arg, flags, opts.escape_style)) {
streams.out.append(*result + sep);
nesc++;
}
}

View File

@ -33,6 +33,7 @@
#include <memory>
#include "common.h"
#include "common.rs.h"
#include "expand.h"
#include "fallback.h" // IWYU pragma: keep
#include "flog.h"
@ -119,17 +120,6 @@ long convert_digit(wchar_t d, int base) {
/// Test whether the char is a valid hex digit as used by the `escape_string_*()` functions.
static bool is_hex_digit(int c) { return std::strchr("0123456789ABCDEF", c) != nullptr; }
/// This is a specialization of `convert_digit()` that only handles base 16 and only uppercase.
/// \return the value of the digit (0-15), or -1 if \p d is not one of 0-9 or A-F.
static long convert_hex_digit(wchar_t d) {
    if (d >= L'0' && d <= L'9') {
        return d - L'0';
    }
    // Only A-F are hex digits. Letters beyond F must be rejected instead of being mapped to
    // values 16..35, which would silently produce garbage when combined into a byte.
    if (d >= L'A' && d <= L'F') {
        return 10 + d - L'A';
    }
    return -1;
}
bool is_windows_subsystem_for_linux() {
#if defined(WSL)
return true;
@ -749,38 +739,6 @@ static void escape_string_url(const wcstring &in, wcstring &out) {
}
}
/// Reverse the effects of `escape_string_url()`. By definition the string consists of just ASCII
/// chars.
static bool unescape_string_url(const wchar_t *in, wcstring *out) {
std::string result;
result.reserve(out->size());
for (wchar_t c = *in; c; c = *++in) {
if (c > 0x7F) return false; // invalid character means we can't decode the string
if (c == '%') {
int c1 = in[1];
if (c1 == 0) return false; // found unexpected end of string
if (c1 == '%') {
result.push_back('%');
in++;
} else {
int c2 = in[2];
if (c2 == 0) return false; // string ended prematurely
long d1 = convert_digit(c1, 16);
if (d1 < 0) return false;
long d2 = convert_digit(c2, 16);
if (d2 < 0) return false;
result.push_back(16 * d1 + d2);
in += 2;
}
} else {
result.push_back(c);
}
}
*out = str2wcstring(result);
return true;
}
/// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str.
static void escape_string_var(const wcstring &in, wcstring &out) {
bool prev_was_hex_encoded = false;
@ -812,46 +770,6 @@ static void escape_string_var(const wcstring &in, wcstring &out) {
}
}
/// Reverse the effects of `escape_string_var()`. By definition the string consists of just ASCII
/// chars.
static bool unescape_string_var(const wchar_t *in, wcstring *out) {
std::string result;
result.reserve(out->size());
bool prev_was_hex_encoded = false;
for (wchar_t c = *in; c; c = *++in) {
if (c > 0x7F) return false; // invalid character means we can't decode the string
if (c == '_') {
int c1 = in[1];
if (c1 == 0) {
if (prev_was_hex_encoded) break;
return false; // found unexpected escape char at end of string
}
if (c1 == '_') {
result.push_back('_');
in++;
} else if (is_hex_digit(c1)) {
int c2 = in[2];
if (c2 == 0) return false; // string ended prematurely
long d1 = convert_hex_digit(c1);
if (d1 < 0) return false;
long d2 = convert_hex_digit(c2);
if (d2 < 0) return false;
result.push_back(16 * d1 + d2);
in += 2;
prev_was_hex_encoded = true;
}
// No "else" clause because if the first char after an underscore is not another
// underscore or a valid hex character then the underscore is there to improve
// readability after we've encoded a character not valid in a var name.
} else {
result.push_back(c);
}
}
*out = str2wcstring(result);
return true;
}
wcstring escape_string_for_double_quotes(wcstring in) {
// We need to escape backslashes, double quotes, and dollars only.
wcstring result = std::move(in);
@ -1130,12 +1048,6 @@ wcstring escape_string(const wcstring &in, escape_flags_t flags, escape_string_s
return result;
}
/// Helper to return the last character in a string, or none.
static maybe_t<wchar_t> string_last_char(const wcstring &str) {
if (str.empty()) return none();
return str.back();
}
/// Given a null terminated string starting with a backslash, read the escape as if it is unquoted,
/// appending to result. Return the number of characters consumed, or none on error.
maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, bool allow_incomplete,
@ -1329,320 +1241,30 @@ maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, boo
return in_pos;
}
/// Returns the unescaped version of input_str into output_str (by reference). Returns true if
/// successful. If false, the contents of output_str are unchanged.
static bool unescape_string_internal(const wchar_t *const input, const size_t input_len,
wcstring *output_str, unescape_flags_t flags) {
// Set up result string, which we'll swap with the output on success.
wcstring result;
result.reserve(input_len);
const bool unescape_special = static_cast<bool>(flags & UNESCAPE_SPECIAL);
const bool allow_incomplete = static_cast<bool>(flags & UNESCAPE_INCOMPLETE);
const bool ignore_backslashes = static_cast<bool>(flags & UNESCAPE_NO_BACKSLASHES);
// The positions of open braces.
std::vector<size_t> braces;
// The positions of variable expansions or brace ","s.
// We only read braces as expanders if there's a variable expansion or "," in them.
std::vector<size_t> vars_or_seps;
int brace_count = 0;
bool errored = false;
enum {
mode_unquoted,
mode_single_quotes,
mode_double_quotes,
} mode = mode_unquoted;
for (size_t input_position = 0; input_position < input_len && !errored; input_position++) {
const wchar_t c = input[input_position];
// Here's the character we'll append to result, or none() to suppress it.
maybe_t<wchar_t> to_append_or_none = c;
if (mode == mode_unquoted) {
switch (c) {
case L'\\': {
if (!ignore_backslashes) {
// Backslashes (escapes) are complicated and may result in errors, or
// appending INTERNAL_SEPARATORs, so we have to handle them specially.
auto escape_chars = read_unquoted_escape(
input + input_position, &result, allow_incomplete, unescape_special);
if (!escape_chars.has_value()) {
// A none() return indicates an error.
errored = true;
} else {
// Skip over the characters we read, minus one because the outer loop
// will increment it.
assert(*escape_chars > 0);
input_position += *escape_chars - 1;
}
// We've already appended, don't append anything else.
to_append_or_none = none();
}
break;
}
case L'~': {
if (unescape_special && (input_position == 0)) {
to_append_or_none = HOME_DIRECTORY;
}
break;
}
case L'%': {
// Note that this only recognizes %self if the string is literally %self.
// %self/foo will NOT match this.
if (unescape_special && input_position == 0 &&
!std::wcscmp(input, PROCESS_EXPAND_SELF_STR)) {
to_append_or_none = PROCESS_EXPAND_SELF;
input_position += PROCESS_EXPAND_SELF_STR_LEN - 1; // skip over 'self's
}
break;
}
case L'*': {
if (unescape_special) {
// In general, this is ANY_STRING. But as a hack, if the last appended char
// is ANY_STRING, delete the last char and store ANY_STRING_RECURSIVE to
// reflect the fact that ** is the recursive wildcard.
if (string_last_char(result) == ANY_STRING) {
assert(!result.empty());
result.resize(result.size() - 1);
to_append_or_none = ANY_STRING_RECURSIVE;
} else {
to_append_or_none = ANY_STRING;
}
}
break;
}
case L'?': {
if (unescape_special && !feature_test(feature_flag_t::qmark_noglob)) {
to_append_or_none = ANY_CHAR;
}
break;
}
case L'$': {
if (unescape_special) {
bool is_cmdsub =
input_position + 1 < input_len && input[input_position + 1] == L'(';
if (!is_cmdsub) {
to_append_or_none = VARIABLE_EXPAND;
vars_or_seps.push_back(input_position);
}
}
break;
}
case L'{': {
if (unescape_special) {
brace_count++;
to_append_or_none = BRACE_BEGIN;
// We need to store where the brace *ends up* in the output.
braces.push_back(result.size());
}
break;
}
case L'}': {
if (unescape_special) {
// HACK: The completion machinery sometimes hands us partial tokens.
// We can't parse them properly, but it shouldn't hurt,
// so we don't assert here.
// See #4954.
// assert(brace_count > 0 && "imbalanced brackets are a tokenizer error, we
// shouldn't be able to get here");
brace_count--;
to_append_or_none = BRACE_END;
if (!braces.empty()) {
// HACK: To reduce accidental use of brace expansion, treat a brace
// with zero or one items as literal input. See #4632. (The hack is
// doing it here and like this.)
if (vars_or_seps.empty() || vars_or_seps.back() < braces.back()) {
result[braces.back()] = L'{';
// We also need to turn all spaces back.
for (size_t i = braces.back() + 1; i < result.size(); i++) {
if (result[i] == BRACE_SPACE) result[i] = L' ';
}
to_append_or_none = L'}';
}
// Remove all seps inside the current brace pair, so if we have a
// surrounding pair we only get seps inside *that*.
if (!vars_or_seps.empty()) {
while (!vars_or_seps.empty() && vars_or_seps.back() > braces.back())
vars_or_seps.pop_back();
}
braces.pop_back();
}
}
break;
}
case L',': {
if (unescape_special && brace_count > 0) {
to_append_or_none = BRACE_SEP;
vars_or_seps.push_back(input_position);
}
break;
}
case L' ': {
if (unescape_special && brace_count > 0) {
to_append_or_none = BRACE_SPACE;
}
break;
}
case L'\'': {
mode = mode_single_quotes;
to_append_or_none =
unescape_special ? maybe_t<wchar_t>(INTERNAL_SEPARATOR) : none();
break;
}
case L'\"': {
mode = mode_double_quotes;
to_append_or_none =
unescape_special ? maybe_t<wchar_t>(INTERNAL_SEPARATOR) : none();
break;
}
default: {
break;
}
}
} else if (mode == mode_single_quotes) {
if (c == L'\\') {
// A backslash may or may not escape something in single quotes.
switch (input[input_position + 1]) {
case '\\':
case L'\'': {
to_append_or_none = input[input_position + 1];
input_position += 1; // skip over the backslash
break;
}
case L'\0': {
if (!allow_incomplete) {
errored = true;
} else {
// PCA this line had the following cryptic comment: 'We may ever escape
// a NULL character, but still appending a \ in case I am wrong.' Not
// sure what it means or the importance of this.
input_position += 1; /* Skip over the backslash */
to_append_or_none = L'\\';
}
break;
}
default: {
// Literal backslash that doesn't escape anything! Leave things alone; we'll
// append the backslash itself.
break;
}
}
} else if (c == L'\'') {
to_append_or_none =
unescape_special ? maybe_t<wchar_t>(INTERNAL_SEPARATOR) : none();
mode = mode_unquoted;
}
} else if (mode == mode_double_quotes) {
switch (c) {
case L'"': {
mode = mode_unquoted;
to_append_or_none =
unescape_special ? maybe_t<wchar_t>(INTERNAL_SEPARATOR) : none();
break;
}
case '\\': {
switch (input[input_position + 1]) {
case L'\0': {
if (!allow_incomplete) {
errored = true;
} else {
to_append_or_none = L'\0';
}
break;
}
case '\\':
case L'$':
case '"': {
to_append_or_none = input[input_position + 1];
input_position += 1; /* Skip over the backslash */
break;
}
case '\n': {
/* Swallow newline */
to_append_or_none = none();
input_position += 1; /* Skip over the backslash */
break;
}
default: {
/* Literal backslash that doesn't escape anything! Leave things alone;
* we'll append the backslash itself */
break;
}
}
break;
}
case '$': {
if (unescape_special) {
to_append_or_none = VARIABLE_EXPAND_SINGLE;
vars_or_seps.push_back(input_position);
}
break;
}
default: {
break;
}
}
}
// Now maybe append the char.
if (to_append_or_none.has_value()) {
result.push_back(*to_append_or_none);
}
}
// Return the string by reference, and then success.
if (!errored) {
*output_str = std::move(result);
}
return !errored;
}
bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special) {
assert(str != nullptr);
wcstring output;
bool success = unescape_string_internal(str->c_str(), str->size(), &output, escape_special);
if (success) {
*str = std::move(output);
if (auto unescaped = unescape_string(str->c_str(), str->size(), escape_special)) {
*str = *unescaped;
return true;
}
return success;
return false;
}
bool unescape_string(const wchar_t *input, size_t len, wcstring *output,
unescape_flags_t escape_special, escape_string_style_t style) {
bool success = false;
switch (style) {
case STRING_STYLE_SCRIPT: {
success = unescape_string_internal(input, len, output, escape_special);
break;
}
case STRING_STYLE_URL: {
success = unescape_string_url(input, output);
break;
}
case STRING_STYLE_VAR: {
success = unescape_string_var(input, output);
break;
}
case STRING_STYLE_REGEX: {
// unescaping PCRE2 is not needed/supported, the PCRE2 engine is responsible for that
success = false;
break;
}
}
if (!success) output->clear();
return success;
std::unique_ptr<wcstring> unescape_string(const wchar_t *input, unescape_flags_t escape_special,
escape_string_style_t style) {
return unescape_string(input, std::wcslen(input), escape_special, style);
}
bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special,
escape_string_style_t style) {
return unescape_string(input, std::wcslen(input), output, escape_special, style);
std::unique_ptr<wcstring> unescape_string(const wchar_t *input, size_t len,
unescape_flags_t escape_special,
escape_string_style_t style) {
return rust_unescape_string(input, len, escape_special, style);
}
/// Unescape \p input into \p output.
///
/// Forwards the string's buffer and size to the pointer-and-length overload and returns that
/// overload's result unchanged.
bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special,
                     escape_string_style_t style) {
    return unescape_string(input.c_str(), input.size(), output, escape_special, style);
}

/// Unescape \p input and return the result.
///
/// Forwards the string's buffer and size to the pointer-and-length overload and returns that
/// overload's result unchanged.
std::unique_ptr<wcstring> unescape_string(const wcstring &input, unescape_flags_t escape_special,
                                          escape_string_style_t style) {
    return unescape_string(input.c_str(), input.size(), escape_special, style);
}
wcstring format_size(long long sz) {

View File

@ -521,15 +521,15 @@ bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special);
/// Reverse the effects of calling `escape_string`. Returns the unescaped value by reference. On
/// failure, the output is set to an empty string.
bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special,
escape_string_style_t style = STRING_STYLE_SCRIPT);
/// Reverse the effects of calling `escape_string` on a NUL-terminated string, returning the
/// unescaped value instead of writing it through an output parameter.
/// NOTE(review): callers test the returned pointer for null to detect failure.
std::unique_ptr<wcstring> unescape_string(const wchar_t *input, unescape_flags_t escape_special,
escape_string_style_t style = STRING_STYLE_SCRIPT);
/// As the output-parameter overload above, but with the length of \p input given by \p len.
bool unescape_string(const wchar_t *input, size_t len, wcstring *output,
unescape_flags_t escape_special,
escape_string_style_t style = STRING_STYLE_SCRIPT);
/// As the returning overload above, but with the length of \p input given by \p len.
std::unique_ptr<wcstring> unescape_string(const wchar_t *input, size_t len,
unescape_flags_t escape_special,
escape_string_style_t style = STRING_STYLE_SCRIPT);
/// As the output-parameter overload above, taking the input as a wcstring.
bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special,
escape_string_style_t style = STRING_STYLE_SCRIPT);
/// As the returning overload above, taking the input as a wcstring.
std::unique_ptr<wcstring> unescape_string(const wcstring &input, unescape_flags_t escape_special,
escape_string_style_t style = STRING_STYLE_SCRIPT);
/// Write the given paragraph of output, redoing linebreaks to fit \p termsize.
wcstring reformat_for_screen(const wcstring &msg, const termsize_t &termsize);

View File

@ -1469,8 +1469,8 @@ void completer_t::escape_opening_brackets(const wcstring &argument) {
if (!have_unquoted_unescaped_bracket) return;
// Since completion_apply_to_command_line will escape the completion, we need to provide an
// unescaped version.
wcstring unescaped_argument;
if (!unescape_string(argument, &unescaped_argument, UNESCAPE_INCOMPLETE)) return;
auto unescaped_argument = unescape_string(argument, UNESCAPE_INCOMPLETE);
if (!unescaped_argument) return;
for (completion_t &comp : completions.get_list()) {
if (comp.flags & COMPLETE_REPLACES_TOKEN) continue;
comp.flags |= COMPLETE_REPLACES_TOKEN;
@ -1482,7 +1482,7 @@ void completer_t::escape_opening_brackets(const wcstring &argument) {
if (comp.flags & COMPLETE_DONT_ESCAPE) {
FLOG(warning, L"unexpected completion flag");
}
comp.completion = unescaped_argument + comp.completion;
comp.completion = *unescaped_argument + comp.completion;
}
}
@ -1494,9 +1494,8 @@ void completer_t::mark_completions_duplicating_arguments(const wcstring &cmd,
wcstring_list_t arg_strs;
for (const auto &arg : args) {
wcstring argstr = *arg.get_source(cmd);
wcstring argstr_unesc;
if (unescape_string(argstr, &argstr_unesc, UNESCAPE_DEFAULT)) {
arg_strs.push_back(std::move(argstr_unesc));
if (auto argstr_unesc = unescape_string(argstr, UNESCAPE_DEFAULT)) {
arg_strs.push_back(std::move(*argstr_unesc));
}
}
std::sort(arg_strs.begin(), arg_strs.end());
@ -1668,11 +1667,14 @@ void completer_t::perform_for_commandline(wcstring cmdline) {
source_range_t command_range = {cmd_tok.offset - bias, cmd_tok.length};
wcstring exp_command = *cmd_tok.get_source(cmdline);
bool unescaped =
expand_command_token(ctx, exp_command) &&
unescape_string(previous_argument, &arg_data.previous_argument, UNESCAPE_DEFAULT) &&
unescape_string(current_argument, &arg_data.current_argument, UNESCAPE_INCOMPLETE);
std::unique_ptr<wcstring> prev;
std::unique_ptr<wcstring> cur;
bool unescaped = expand_command_token(ctx, exp_command) &&
(prev = unescape_string(previous_argument, UNESCAPE_DEFAULT)) &&
(cur = unescape_string(current_argument, UNESCAPE_INCOMPLETE));
if (unescaped) {
arg_data.previous_argument = *prev;
arg_data.current_argument = *cur;
// Have to walk over the command and its entire wrap chain. If any command
// disables do_file, then they all do.
walk_wrap_chain(exp_command, *effective_cmdline, command_range, &arg_data);

View File

@ -472,11 +472,11 @@ void env_init(const struct config_paths_t *paths, bool do_uvars, bool default_pa
for (const auto &kv : table) {
if (string_prefixes_string(prefix, kv.first)) {
wcstring escaped_name = kv.first.substr(prefix_len);
wcstring name;
if (unescape_string(escaped_name, &name, unescape_flags_t{}, STRING_STYLE_VAR)) {
wcstring key = name;
if (auto name =
unescape_string(escaped_name, unescape_flags_t{}, STRING_STYLE_VAR)) {
wcstring key = *name;
wcstring replacement = join_strings(kv.second.as_list(), L' ');
abbrs->add(std::move(name), std::move(key), std::move(replacement),
abbrs->add(std::move(*name), std::move(key), std::move(replacement),
abbrs_position_t::command, from_universal);
}
}

View File

@ -800,9 +800,11 @@ bool env_universal_t::populate_1_variable(const wchar_t *input, env_var_t::env_v
// Parse out the value into storage, and decode it into a variable.
storage->clear();
if (!unescape_string(colon + 1, storage, 0)) {
auto unescaped = unescape_string(colon + 1, 0);
if (!unescaped) {
return false;
}
*storage = *unescaped;
env_var_t var{decode_serialized(*storage), flags};
// Parse out the key and write into the map.

View File

@ -971,7 +971,8 @@ expand_result_t expander_t::stage_variables(wcstring input, completion_receiver_
// We accept incomplete strings here, since complete uses expand_string to expand incomplete
// strings from the commandline.
wcstring next;
unescape_string(input, &next, UNESCAPE_SPECIAL | UNESCAPE_INCOMPLETE);
if (auto unescaped = unescape_string(input, UNESCAPE_SPECIAL | UNESCAPE_INCOMPLETE))
next = *unescaped;
if (flags & expand_flag::skip_variables) {
for (auto &i : next) {

View File

@ -376,27 +376,26 @@ static void test_unescape_sane() {
{L"\"abcd\\n\"", L"abcd\\n"}, {L"\\143", L"c"},
{L"'\\143'", L"\\143"}, {L"\\n", L"\n"} // \n normally becomes newline
};
wcstring output;
for (const auto &test : tests) {
bool ret = unescape_string(test.input, &output, UNESCAPE_DEFAULT);
if (!ret) {
auto output = unescape_string(test.input, UNESCAPE_DEFAULT);
if (!output) {
err(L"Failed to unescape '%ls'\n", test.input);
} else if (output != test.expected) {
} else if (*output != test.expected) {
err(L"In unescaping '%ls', expected '%ls' but got '%ls'\n", test.input, test.expected,
output.c_str());
output->c_str());
}
}
// Test for overflow.
if (unescape_string(L"echo \\UFFFFFF", &output, UNESCAPE_DEFAULT)) {
if (unescape_string(L"echo \\UFFFFFF", UNESCAPE_DEFAULT)) {
err(L"Should not have been able to unescape \\UFFFFFF\n");
}
if (unescape_string(L"echo \\U110000", &output, UNESCAPE_DEFAULT)) {
if (unescape_string(L"echo \\U110000", UNESCAPE_DEFAULT)) {
err(L"Should not have been able to unescape \\U110000\n");
}
#if WCHAR_MAX != 0xffff
// TODO: Make this work on MS Windows.
if (!unescape_string(L"echo \\U10FFFF", &output, UNESCAPE_DEFAULT)) {
if (!unescape_string(L"echo \\U10FFFF", UNESCAPE_DEFAULT)) {
err(L"Should have been able to unescape \\U10FFFF\n");
}
#endif
@ -408,8 +407,6 @@ static void test_escape_crazy() {
say(L"Testing escaping and unescaping");
wcstring random_string;
wcstring escaped_string;
wcstring unescaped_string;
bool unescaped_success;
for (size_t i = 0; i < ESCAPE_TEST_COUNT; i++) {
random_string.clear();
while (random() % ESCAPE_TEST_LENGTH) {
@ -417,14 +414,14 @@ static void test_escape_crazy() {
}
escaped_string = escape_string(random_string);
unescaped_success = unescape_string(escaped_string, &unescaped_string, UNESCAPE_DEFAULT);
auto unescaped_string = unescape_string(escaped_string, UNESCAPE_DEFAULT);
if (!unescaped_success) {
if (!unescaped_string) {
err(L"Failed to unescape string <%ls>", escaped_string.c_str());
break;
} else if (unescaped_string != random_string) {
} else if (*unescaped_string != random_string) {
err(L"Escaped and then unescaped string '%ls', but got back a different string '%ls'",
random_string.c_str(), unescaped_string.c_str());
random_string.c_str(), unescaped_string->c_str());
break;
}
}
@ -432,12 +429,12 @@ static void test_escape_crazy() {
// Verify that ESCAPE_NO_PRINTABLES also escapes backslashes so we don't regress on issue #3892.
random_string = L"line 1\\n\nline 2";
escaped_string = escape_string(random_string, ESCAPE_NO_PRINTABLES | ESCAPE_NO_QUOTED);
unescaped_success = unescape_string(escaped_string, &unescaped_string, UNESCAPE_DEFAULT);
if (!unescaped_success) {
auto unescaped_string = unescape_string(escaped_string, UNESCAPE_DEFAULT);
if (!unescaped_string) {
err(L"Failed to unescape string <%ls>", escaped_string.c_str());
} else if (unescaped_string != random_string) {
} else if (*unescaped_string != random_string) {
err(L"Escaped and then unescaped string '%ls', but got back a different string '%ls'",
random_string.c_str(), unescaped_string.c_str());
random_string.c_str(), unescaped_string->c_str());
}
}

View File

@ -960,8 +960,8 @@ parser_test_error_bits_t parse_util_detect_errors_in_argument(const ast::argumen
parser_test_error_bits_t err = 0;
auto check_subtoken = [&arg_src, &out_errors, source_start](size_t begin, size_t end) -> int {
wcstring unesc;
if (!unescape_string(arg_src.c_str() + begin, end - begin, &unesc, UNESCAPE_SPECIAL)) {
auto maybe_unesc = unescape_string(arg_src.c_str() + begin, end - begin, UNESCAPE_SPECIAL);
if (!maybe_unesc) {
if (out_errors) {
const wchar_t *fmt = L"Invalid token '%ls'";
if (arg_src.length() == 2 && arg_src[0] == L'\\' &&
@ -975,6 +975,7 @@ parser_test_error_bits_t parse_util_detect_errors_in_argument(const ast::argumen
}
return 1;
}
const wcstring &unesc = *maybe_unesc;
parser_test_error_bits_t err = 0;
// Check for invalid variable expansions.

View File

@ -60,7 +60,9 @@ bool wildcard_has(const wchar_t *str, size_t len) {
return false;
}
wcstring unescaped;
unescape_string(str, len, &unescaped, UNESCAPE_SPECIAL);
if (auto tmp = unescape_string(wcstring{str, len}, UNESCAPE_SPECIAL)) {
unescaped = *tmp;
}
return wildcard_has_internal(unescaped);
}

View File

@ -158,6 +158,9 @@ echo -e 'abc\x211def'
#CHECK: abc!def
#CHECK: abc!1def
echo \UDE01
#CHECK: <20>
# Comments allowed in between lines (#1987)
echo before comment \
# comment