From 2ef12af60efd4422982c7f58f818f920d6752ae6 Mon Sep 17 00:00:00 2001
From: Shay Aviv <shay.aviv@gmail.com>
Date: Fri, 4 Feb 2022 22:44:45 +0200
Subject: [PATCH] Fix comment parsing inside command substitutions and brackets

---
 CHANGELOG.rst           |  1 +
 src/common.cpp          |  7 +++++++
 src/common.h            |  5 +++++
 src/parse_util.cpp      | 12 +++++++++---
 src/tokenizer.cpp       | 13 ++++++++++---
 src/tokenizer.h         |  3 +++
 tests/checks/basic.fish | 17 +++++++++++++++++
 7 files changed, 52 insertions(+), 6 deletions(-)
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 290387d93..55d4f3f2b 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -93,6 +93,7 @@ Scripting improvements
 - ``abbr -q`` returns the correct exit status when given multiple abbreviation names as arguments (:issue:`8431`).
 - ``command -v`` returns an exit status of 127 instead of 1 if no command was found (:issue:`8547`).
 - ``argparse`` with ``--ignore-unknown`` no longer breaks with multiple unknown options in a short option group (:issue:`8637`).
+- Comments inside command substitutions or brackets now correctly ignore parentheses, quotes, and brackets (:issue:`7866`, :issue:`8022`).
 
 Interactive improvements
 ------------------------
diff --git a/src/common.cpp b/src/common.cpp
index 0d7af786d..ccd8d803d 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -523,6 +523,13 @@ const wchar_t *quote_end(const wchar_t *pos, wchar_t quote) {
     return nullptr;
 }
 
+const wchar_t *comment_end(const wchar_t *pos) {
+    do {
+        pos++;
+    } while (*pos && *pos != L'\n');
+    return pos;
+}
+
 void fish_setlocale() {
     // Use various Unicode symbols if they can be encoded using the current locale, else a simple
     // ASCII char alternative. All of the can_be_encoded() invocations should return the same
diff --git a/src/common.h b/src/common.h
index 5b46e3636..178e339a1 100644
--- a/src/common.h
+++ b/src/common.h
@@ -451,6 +451,11 @@ std::unique_ptr<T> make_unique(Args &&...args) {
 /// \param quote the quote to use, usually pointed to by \c pos.
 const wchar_t *quote_end(const wchar_t *pos, wchar_t quote);
 
+/// This functions returns the end of the comment substring beginning at \c pos.
+///
+/// \param pos the position where the comment starts, including the '#' symbol.
+const wchar_t *comment_end(const wchar_t *pos);
+
 /// This function should be called after calling `setlocale()` to perform fish specific locale
 /// initialization.
 void fish_setlocale();
diff --git a/src/parse_util.cpp b/src/parse_util.cpp
index 271bf1c1c..c626804d0 100644
--- a/src/parse_util.cpp
+++ b/src/parse_util.cpp
@@ -90,6 +90,8 @@ size_t parse_util_get_offset(const wcstring &str, int line, long line_offset) {
 static int parse_util_locate_cmdsub(const wchar_t *in, const wchar_t **begin, const wchar_t **end,
                                     bool allow_incomplete, bool *inout_is_quoted) {
     bool escaped = false;
+    bool is_first = true;
+    bool is_token_begin = true;
     bool syntax_error = false;
     int paran_count = 0;
     std::vector<int> quoted_cmdsubs;
@@ -122,6 +124,10 @@ static int parse_util_locate_cmdsub(const wchar_t *in, const wchar_t **begin, co
         if (!escaped) {
             if (*pos == L'\'' || *pos == L'"') {
                 if (!process_opening_quote(*pos)) break;
+            } else if (*pos == L'\\') {
+                escaped = true;
+            } else if (*pos == L'#' && is_token_begin) {
+                pos = comment_end(pos) - 1;
             } else {
                 if (*pos == L'(') {
                     if ((paran_count == 0) && (paran_begin == nullptr)) {
@@ -161,12 +167,12 @@ static int parse_util_locate_cmdsub(const wchar_t *in, const wchar_t **begin, co
                     }
                 }
             }
-        }
-        if (*pos == '\\') {
-            escaped = !escaped;
+            is_token_begin = is_token_delimiter(pos[0], is_first, pos[1]);
         } else {
             escaped = false;
+            is_token_begin = false;
         }
+        is_first = false;
     }
 
     syntax_error |= (paran_count < 0);
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 21b93bdb2..861c57f61 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -154,6 +154,7 @@ tok_t tokenizer_t::read_string() {
     int slice_offset = 0;
     const wchar_t *const buff_start = this->token_cursor;
     bool is_first = true;
+    bool is_token_begin = true;
 
     auto process_opening_quote = [&](wchar_t quote) -> const wchar_t * {
         const wchar_t *end = quote_end(this->token_cursor, quote);
@@ -192,6 +193,8 @@ tok_t tokenizer_t::read_string() {
         // has been explicitly ignored (escaped).
         else if (c == L'\\') {
             mode |= tok_modes::char_escape;
+        } else if (c == L'#' && is_token_begin) {
+            this->token_cursor = comment_end(this->token_cursor) - 1;
         } else if (c == L'(') {
             paran_offsets.push_back(this->token_cursor - this->start);
             expecting.push_back(L')');
@@ -278,8 +281,9 @@ tok_t tokenizer_t::read_string() {
         FLOGF(error, msg.c_str(), c, c, int(mode_begin), int(mode));
 #endif
 
-        this->token_cursor++;
+        is_token_begin = is_token_delimiter(this->token_cursor[0], is_first, this->token_cursor[1]);
         is_first = false;
+        this->token_cursor++;
     }
 
     if (!this->accept_unfinished && (mode != tok_modes::regular_text)) {
@@ -540,8 +544,7 @@ maybe_t<tok_t> tokenizer_t::next() {
     while (*this->token_cursor == L'#') {
         // We have a comment, walk over the comment.
         const wchar_t *comment_start = this->token_cursor;
-        while (this->token_cursor[0] != L'\n' && this->token_cursor[0] != L'\0')
-            this->token_cursor++;
+        this->token_cursor = comment_end(this->token_cursor);
         size_t comment_len = this->token_cursor - comment_start;
 
         // If we are going to continue after the comment, skip any trailing newline.
@@ -679,6 +682,10 @@ maybe_t<tok_t> tokenizer_t::next() {
     return result;
 }
 
+bool is_token_delimiter(wchar_t c, bool is_first, maybe_t<wchar_t> next) {
+    return c == L'(' || !tok_is_string_character(c, is_first, next);
+}
+
 wcstring tok_first(const wcstring &str) {
     tokenizer_t t(str.c_str(), 0);
     if (auto token = t.next()) {
diff --git a/src/tokenizer.h b/src/tokenizer.h
index f0504db02..fccff61db 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -133,6 +133,9 @@ class tokenizer_t : noncopyable_t {
     }
 };
 
+/// Tests if this character can delimit tokens.
+bool is_token_delimiter(wchar_t c, bool is_first, maybe_t<wchar_t> next);
+
 /// Returns only the first token from the specified string. This is a convenience function, used to
 /// retrieve the first token of a string. This can be useful for error messages, etc. On failure,
 /// returns the empty string.
diff --git a/tests/checks/basic.fish b/tests/checks/basic.fish
index 5c8d8e517..3befee0ed 100644
--- a/tests/checks/basic.fish
+++ b/tests/checks/basic.fish
@@ -533,6 +533,23 @@ echo banana
 echo (echo hello\\)
 # CHECK: hello\
 
+# This used to be a parse error - #7866.
+echo (echo foo;#)
+     )
+# CHECK: foo
+echo (echo bar #'
+     )
+# CHECK: bar
+echo (#"
+      echo baz)
+# CHECK: baz
+
+# Make sure we don't match up brackets within comments (#8022).
+$fish -c 'echo f[oo # not valid, no matching ]'
+# CHECKERR: fish: Unexpected end of string, square brackets do not match
+# CHECKERR: echo f[oo # not valid, no matching ]
+# CHECKERR: {{      }}^
+
 # Should fail because $PWD is read-only.
 for PWD in foo bar
     true