From 2ef12af60efd4422982c7f58f818f920d6752ae6 Mon Sep 17 00:00:00 2001 From: Shay Aviv Date: Fri, 4 Feb 2022 22:44:45 +0200 Subject: [PATCH] Fix comment parsing inside command substitutions and brackets --- CHANGELOG.rst | 1 + src/common.cpp | 7 +++++++ src/common.h | 5 +++++ src/parse_util.cpp | 12 +++++++++--- src/tokenizer.cpp | 13 ++++++++++--- src/tokenizer.h | 3 +++ tests/checks/basic.fish | 17 +++++++++++++++++ 7 files changed, 52 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 290387d93..55d4f3f2b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -93,6 +93,7 @@ Scripting improvements - ``abbr -q`` returns the correct exit status when given multiple abbreviation names as arguments (:issue:`8431`). - ``command -v`` returns an exit status of 127 instead of 1 if no command was found (:issue:`8547`). - ``argparse`` with ``--ignore-unknown`` no longer breaks with multiple unknown options in a short option group (:issue:`8637`). +- Comments inside command substitutions or brackets now correctly ignore parentheses, quotes, and brackets (:issue:`7866`, :issue:`8022`). Interactive improvements ------------------------ diff --git a/src/common.cpp b/src/common.cpp index 0d7af786d..ccd8d803d 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -523,6 +523,13 @@ const wchar_t *quote_end(const wchar_t *pos, wchar_t quote) { return nullptr; } +const wchar_t *comment_end(const wchar_t *pos) { + do { + pos++; + } while (*pos && *pos != L'\n'); + return pos; +} + void fish_setlocale() { // Use various Unicode symbols if they can be encoded using the current locale, else a simple // ASCII char alternative. All of the can_be_encoded() invocations should return the same diff --git a/src/common.h b/src/common.h index 5b46e3636..178e339a1 100644 --- a/src/common.h +++ b/src/common.h @@ -451,6 +451,11 @@ std::unique_ptr make_unique(Args &&...args) { /// \param quote the quote to use, usually pointed to by \c pos. const wchar_t *quote_end(const wchar_t *pos, wchar_t quote); +/// This functions returns the end of the comment substring beginning at \c pos. +/// +/// \param pos the position where the comment starts, including the '#' symbol. +const wchar_t *comment_end(const wchar_t *pos); + /// This function should be called after calling `setlocale()` to perform fish specific locale /// initialization. void fish_setlocale(); diff --git a/src/parse_util.cpp b/src/parse_util.cpp index 271bf1c1c..c626804d0 100644 --- a/src/parse_util.cpp +++ b/src/parse_util.cpp @@ -90,6 +90,8 @@ size_t parse_util_get_offset(const wcstring &str, int line, long line_offset) { static int parse_util_locate_cmdsub(const wchar_t *in, const wchar_t **begin, const wchar_t **end, bool allow_incomplete, bool *inout_is_quoted) { bool escaped = false; + bool is_first = true; + bool is_token_begin = true; bool syntax_error = false; int paran_count = 0; std::vector quoted_cmdsubs; @@ -122,6 +124,10 @@ static int parse_util_locate_cmdsub(const wchar_t *in, const wchar_t **begin, co if (!escaped) { if (*pos == L'\'' || *pos == L'"') { if (!process_opening_quote(*pos)) break; + } else if (*pos == L'\\') { + escaped = true; + } else if (*pos == L'#' && is_token_begin) { + pos = comment_end(pos) - 1; } else { if (*pos == L'(') { if ((paran_count == 0) && (paran_begin == nullptr)) { @@ -161,12 +167,12 @@ static int parse_util_locate_cmdsub(const wchar_t *in, const wchar_t **begin, co } } } - } - if (*pos == '\\') { - escaped = !escaped; + is_token_begin = is_token_delimiter(pos[0], is_first, pos[1]); } else { escaped = false; + is_token_begin = false; } + is_first = false; } syntax_error |= (paran_count < 0); diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 21b93bdb2..861c57f61 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -154,6 +154,7 @@ tok_t tokenizer_t::read_string() { int slice_offset = 0; const wchar_t *const buff_start = this->token_cursor; bool is_first = true; + bool is_token_begin = true; auto process_opening_quote = [&](wchar_t quote) -> const wchar_t * { const wchar_t *end = quote_end(this->token_cursor, quote); @@ -192,6 +193,8 @@ tok_t tokenizer_t::read_string() { // has been explicitly ignored (escaped). else if (c == L'\\') { mode |= tok_modes::char_escape; + } else if (c == L'#' && is_token_begin) { + this->token_cursor = comment_end(this->token_cursor) - 1; } else if (c == L'(') { paran_offsets.push_back(this->token_cursor - this->start); expecting.push_back(L')'); @@ -278,8 +281,9 @@ tok_t tokenizer_t::read_string() { FLOGF(error, msg.c_str(), c, c, int(mode_begin), int(mode)); #endif - this->token_cursor++; + is_token_begin = is_token_delimiter(this->token_cursor[0], is_first, this->token_cursor[1]); is_first = false; + this->token_cursor++; } if (!this->accept_unfinished && (mode != tok_modes::regular_text)) { @@ -540,8 +544,7 @@ maybe_t tokenizer_t::next() { while (*this->token_cursor == L'#') { // We have a comment, walk over the comment. const wchar_t *comment_start = this->token_cursor; - while (this->token_cursor[0] != L'\n' && this->token_cursor[0] != L'\0') - this->token_cursor++; + this->token_cursor = comment_end(this->token_cursor); size_t comment_len = this->token_cursor - comment_start; // If we are going to continue after the comment, skip any trailing newline. @@ -679,6 +682,10 @@ maybe_t tokenizer_t::next() { return result; } +bool is_token_delimiter(wchar_t c, bool is_first, maybe_t next) { + return c == L'(' || !tok_is_string_character(c, is_first, next); +} + wcstring tok_first(const wcstring &str) { tokenizer_t t(str.c_str(), 0); if (auto token = t.next()) { diff --git a/src/tokenizer.h b/src/tokenizer.h index f0504db02..fccff61db 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -133,6 +133,9 @@ class tokenizer_t : noncopyable_t { } }; +/// Tests if this character can delimit tokens. +bool is_token_delimiter(wchar_t c, bool is_first, maybe_t next); + /// Returns only the first token from the specified string. This is a convenience function, used to /// retrieve the first token of a string. This can be useful for error messages, etc. On failure, /// returns the empty string. diff --git a/tests/checks/basic.fish b/tests/checks/basic.fish index 5c8d8e517..3befee0ed 100644 --- a/tests/checks/basic.fish +++ b/tests/checks/basic.fish @@ -533,6 +533,23 @@ echo banana echo (echo hello\\) # CHECK: hello\ +# This used to be a parse error - #7866. +echo (echo foo;#) + ) +# CHECK: foo +echo (echo bar #' + ) +# CHECK: bar +echo (#" + echo baz) +# CHECK: baz + +# Make sure we don't match up brackets within comments (#8022). +$fish -c 'echo f[oo # not valid, no matching ]' +# CHECKERR: fish: Unexpected end of string, square brackets do not match +# CHECKERR: echo f[oo # not valid, no matching ] +# CHECKERR: {{ }}^ + # Should fail because $PWD is read-only. for PWD in foo bar true