From 00f95a978e1caade5addda4c368feaeb26e5c8a4 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 19:36:10 -0500 Subject: [PATCH] Make { and } valid, first-class tokenizer elements --- src/expand.cpp | 4 +- src/parse_tree.cpp | 29 +------------- src/tokenizer.cpp | 92 +++++++++++++++++++++----------------------- src/tokenizer.h | 34 +++++++++------- src/wcstringutil.cpp | 18 +++++++++ src/wcstringutil.h | 1 + 6 files changed, 89 insertions(+), 89 deletions(-) diff --git a/src/expand.cpp b/src/expand.cpp index 3ead4819a..8aa7fc540 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -47,6 +47,7 @@ #include "proc.h" #include "reader.h" #include "wildcard.h" +#include "wcstringutil.h" #include "wutil.h" // IWYU pragma: keep #ifdef KERN_PROCARGS2 #else @@ -941,7 +942,8 @@ static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags, whole_item.append(in, length_preceding_braces); whole_item.append(item_begin, item_len); whole_item.append(brace_end + 1); - debug(0, L"Found brace item: %ls\n", whole_item.c_str()); + auto whole_item2 = trim(whole_item); + debug(0, L"Found brace item: %ls\n", whole_item2.c_str()); expand_braces(whole_item, flags, out, errors); item_begin = pos + 1; diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp index 9c51025f4..9ddcebd09 100644 --- a/src/parse_tree.cpp +++ b/src/parse_tree.cpp @@ -668,35 +668,10 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta } void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) { - parse_error_code_t parse_error_code; - switch (tok.error) { - case TOK_UNTERMINATED_QUOTE: { - parse_error_code = parse_error_tokenizer_unterminated_quote; - break; - } - case TOK_UNTERMINATED_SUBSHELL: { - parse_error_code = parse_error_tokenizer_unterminated_subshell; - break; - } - case TOK_UNTERMINATED_SLICE: { - parse_error_code = parse_error_tokenizer_unterminated_slice; - break; - } - case TOK_UNTERMINATED_ESCAPE: { - parse_error_code = parse_error_tokenizer_unterminated_escape; - break; - } - case TOK_INVALID_REDIRECT: - case TOK_INVALID_PIPE: - default: { - parse_error_code = parse_error_tokenizer_other; - break; - } - } - + parse_error_code_t parse_error_code = tok.error->parser_error; this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset, parse_error_code, L"%ls", - error_message_for_code(tok.error).c_str()); + tok.error->Message); } void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) { diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index f2ff61267..ae804bb59 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -16,56 +16,22 @@ #include "tokenizer.h" #include "wutil.h" // IWYU pragma: keep -/// Error string for unexpected end of string. -#define QUOTE_ERROR _(L"Unexpected end of string, quotes are not balanced") - -/// Error string for mismatched parenthesis. -#define PARAN_ERROR _(L"Unexpected end of string, parenthesis do not match") - -/// Error string for mismatched square brackets. -#define SQUARE_BRACKET_ERROR _(L"Unexpected end of string, square brackets do not match") - -/// Error string for unterminated escape (backslash without continuation). -#define UNTERMINATED_ESCAPE_ERROR _(L"Unexpected end of string, incomplete escape sequence") - -/// Error string for invalid redirections. -#define REDIRECT_ERROR _(L"Invalid input/output redirection") - -/// Error string for when trying to pipe from fd 0. 
-#define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output") - -/// Error for when ) is encountered with no matching ( -#define ERROR_CLOSING_UNOPENED_PARENTHESIS _(L"Unexpected ')' for unopened parenthesis") - -/// Error for when [ is encountered while already in bracket mode -#define ERROR_UNEXPECTED_BRACKET _(L"Unexpected '[' at this location") - -wcstring error_message_for_code(tokenizer_error err) { - switch (err) { - case TOK_UNTERMINATED_QUOTE: - return QUOTE_ERROR; - case TOK_UNTERMINATED_SUBSHELL: - return PARAN_ERROR; - case TOK_UNTERMINATED_SLICE: - return SQUARE_BRACKET_ERROR; - case TOK_UNTERMINATED_ESCAPE: - return UNTERMINATED_ESCAPE_ERROR; - case TOK_INVALID_REDIRECT: - return REDIRECT_ERROR; - case TOK_INVALID_PIPE: - return PIPE_ERROR; - case TOK_CLOSING_UNOPENED_SUBSHELL: - return ERROR_CLOSING_UNOPENED_PARENTHESIS; - case TOK_ILLEGAL_SLICE: - return ERROR_UNEXPECTED_BRACKET; - default: - assert(0 && "Unknown error type"); - return {}; - } -} +tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L""); +tokenizer_error *TOK_UNTERMINATED_QUOTE = new tokenizer_error((L"Unexpected end of string, quotes are not balanced"), parse_error_tokenizer_unterminated_quote); +tokenizer_error *TOK_UNTERMINATED_SUBSHELL = new tokenizer_error((L"Unexpected end of string, expecting ')'"), parse_error_tokenizer_unterminated_subshell); +tokenizer_error *TOK_UNTERMINATED_SLICE = new tokenizer_error((L"Unexpected end of string, square brackets do not match"), parse_error_tokenizer_unterminated_slice); +tokenizer_error *TOK_UNTERMINATED_ESCAPE = new tokenizer_error((L"Unexpected end of string, incomplete escape sequence"), parse_error_tokenizer_unterminated_escape); +tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error((L"Invalid input/output redirection")); +tokenizer_error *TOK_INVALID_PIPE = new tokenizer_error((L"Cannot use stdin (fd 0) as pipe output")); +tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL = new tokenizer_error((L"Unexpected ')' for unopened parenthesis")); +tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error((L"Unexpected '[' at this location")); +tokenizer_error *TOK_CLOSING_UNOPENED_BRACE = new tokenizer_error((L"Unexpected '}' for unopened brace expansion")); +tokenizer_error *TOK_UNTERMINATED_BRACE = new tokenizer_error((L"Unexpected end of string, incomplete parameter expansion")); +tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE = new tokenizer_error((L"Unexpected '}' found, expecting ')'")); +tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE = new tokenizer_error((L"Unexpected ')' found, expecting '}'")); /// Return an error token and mark that we no longer have a next token. 
-tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start, +tok_t tokenizer_t::call_error(tokenizer_error *error_type, const wchar_t *token_start, const wchar_t *error_loc) { assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error"); assert(error_loc >= token_start && "Invalid error location"); @@ -143,6 +109,7 @@ ENUM_FLAGS(tok_mode) { tok_t tokenizer_t::read_string() { tok_mode mode { tok_mode::regular_text }; std::vector paran_offsets; + std::vector expecting; int slice_offset = 0; const wchar_t *const buff_start = this->buff; bool is_first = true; @@ -175,9 +142,18 @@ tok_t tokenizer_t::read_string() { } else if (c == L'(') { paran_offsets.push_back(this->buff - this->start); + expecting.push_back(L')'); mode |= tok_mode::subshell; } + else if (c == L'{') { + paran_offsets.push_back(this->buff - this->start); + expecting.push_back(L'}'); + mode |= tok_mode::curly_braces; + } else if (c == L')') { + if (expecting.size() > 0 && expecting.back() == L'}') { + return this->call_error(TOK_EXPECTED_BCLOSE_FOUND_PCLOSE, this->start, this->buff); + } switch (paran_offsets.size()) { case 0: return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, this->start, this->buff); @@ -187,6 +163,19 @@ tok_t tokenizer_t::read_string() { paran_offsets.pop_back(); } } + else if (c == L'}') { + if (expecting.size() > 0 && expecting.back() == L')') { + return this->call_error(TOK_EXPECTED_PCLOSE_FOUND_BCLOSE, this->start, this->buff); + } + switch (paran_offsets.size()) { + case 0: + return this->call_error(TOK_CLOSING_UNOPENED_BRACE, this->start, this->buff); + case 1: + mode &= ~(tok_mode::curly_braces); + default: + paran_offsets.pop_back(); + } + } else if (c == L'[') { if (this->buff != buff_start) { if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) { @@ -257,6 +246,13 @@ tok_t tokenizer_t::read_string() { error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start, this->start + offset_of_open_paran); } + else if ((mode & tok_mode::curly_braces) == tok_mode::curly_braces) { + assert(paran_offsets.size() > 0); + size_t offset_of_open_brace = paran_offsets.back(); + + error = this->call_error(TOK_UNTERMINATED_BRACE, buff_start, + this->start + offset_of_open_brace); + } return error; } diff --git a/src/tokenizer.h b/src/tokenizer.h index 1110e86ab..8ce6618a7 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -7,6 +7,7 @@ #include "common.h" #include "maybe.h" +#include "parse_constants.h" /// Token types. enum token_type { @@ -22,19 +23,26 @@ enum token_type { TOK_COMMENT /// comment token }; -/// Tokenizer error types. 
-enum tokenizer_error { - TOK_ERROR_NONE, - TOK_UNTERMINATED_QUOTE, - TOK_UNTERMINATED_SUBSHELL, - TOK_UNTERMINATED_SLICE, - TOK_UNTERMINATED_ESCAPE, - TOK_INVALID_REDIRECT, - TOK_INVALID_PIPE, - TOK_CLOSING_UNOPENED_SUBSHELL, - TOK_ILLEGAL_SLICE, +struct tokenizer_error { + const wchar_t *Message; + enum parse_error_code_t parser_error; //the parser error associated with this tokenizer error + tokenizer_error(const wchar_t *msg, enum parse_error_code_t perr = parse_error_tokenizer_other) + : Message(msg), parser_error(perr) {} + tokenizer_error(const tokenizer_error&) = delete; }; +extern tokenizer_error *TOK_ERROR_NONE; +extern tokenizer_error *TOK_UNTERMINATED_QUOTE; +extern tokenizer_error *TOK_UNTERMINATED_SUBSHELL; +extern tokenizer_error *TOK_UNTERMINATED_SLICE; +extern tokenizer_error *TOK_UNTERMINATED_ESCAPE; +extern tokenizer_error *TOK_UNTERMINATED_BRACE; +extern tokenizer_error *TOK_INVALID_REDIRECT; +extern tokenizer_error *TOK_INVALID_PIPE; +extern tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL; +extern tokenizer_error *TOK_CLOSING_UNOPENED_BRACE; +extern tokenizer_error *TOK_ILLEGAL_SLICE; + enum class redirection_type_t { overwrite, // normal redirection: > file.txt append, // appending redirection: >> file.txt @@ -69,7 +77,7 @@ struct tok_t { maybe_t redirected_fd{}; // If an error, this is the error code. - enum tokenizer_error error { TOK_ERROR_NONE }; + tokenizer_error *error { TOK_ERROR_NONE }; // If an error, this is the offset of the error within the token. A value of 0 means it occurred // at 'offset'. @@ -99,7 +107,7 @@ class tokenizer_t { /// Whether to continue the previous line after the comment. bool continue_line_after_comment{false}; - tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start, + tok_t call_error(tokenizer_error *error_type, const wchar_t *token_start, const wchar_t *error_loc); tok_t read_string(); maybe_t tok_next(); diff --git a/src/wcstringutil.cpp b/src/wcstringutil.cpp index 79209c1c5..5ed6d7b74 100644 --- a/src/wcstringutil.cpp +++ b/src/wcstringutil.cpp @@ -45,3 +45,21 @@ wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) { output.push_back(ellipsis_char); return output; } + +wcstring trim(const wcstring &input) { + debug(0, "trimming '%ls'", input.c_str()); + + // auto begin = input.cbegin(); + // for (begin; *begin == L' '; ++begin); + // auto end = input.cbegin() + input.size(); + // for (end; end > begin && *end == L' '; ++end); + + auto begin_offset = input.find_first_not_of(whitespace); + if (begin_offset == wcstring::npos) { + return wcstring{}; + } + auto end = input.cbegin() + input.find_last_not_of(whitespace); + + wcstring result(input.begin() + begin_offset, end + 1); + return result; +} diff --git a/src/wcstringutil.h b/src/wcstringutil.h index 878771f25..8665c0024 100644 --- a/src/wcstringutil.h +++ b/src/wcstringutil.h @@ -59,5 +59,6 @@ enum class ellipsis_type { }; wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype = ellipsis_type::Prettiest); +wcstring trim(const wcstring &input); #endif
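The sketches below illustrate the techniques this patch uses; they are standalone C++, not fish sources, and any names that do not appear in the diff above are hypothetical.

The tokenizer_error enum, together with the two switch statements that mapped it to a message (error_message_for_code) and to a parse_error_code_t (report_tokenizer_error), is replaced by error objects that carry both pieces of data: reporting becomes two field reads, and a new error is a one-line definition instead of two extra switch cases. A minimal sketch of that pattern, assuming a two-value stand-in for parse_error_code_t:

#include <cstdio>

// Stand-in for fish's parse_error_code_t.
enum parse_error_code { parse_error_tokenizer_other, parse_error_tokenizer_unterminated_quote };

// Each tokenizer error carries its user-visible message and its parser error code.
struct tok_error {
    const wchar_t *message;
    parse_error_code parser_error;
};

// Hypothetical instances, mirroring the extern TOK_* pointers in the patch.
static const tok_error unterminated_quote{L"Unexpected end of string, quotes are not balanced",
                                          parse_error_tokenizer_unterminated_quote};
static const tok_error invalid_pipe{L"Cannot use stdin (fd 0) as pipe output",
                                    parse_error_tokenizer_other};

// The parser forwards both pieces without switching on an enum value.
static void report(const tok_error &err) {
    std::printf("%d: %ls\n", static_cast<int>(err.parser_error), err.message);
}

int main() {
    report(unterminated_quote);
    report(invalid_pipe);
    return 0;
}

The patch itself allocates its instances with new and hands them around as non-owning pointers; const objects as above express the same idea, so the sketch should not be read as the patch's exact layout.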
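In read_string(), every '(' or '{' now pushes the closing character it expects, so a '}' seen while ')' is pending (or the reverse) is reported as a mismatch at the offending offset, a closer with no opener is reported immediately, and an unterminated construct is reported at the offset of the opener still open when the input ends. A standalone sketch of that bookkeeping, with a separate offset stack and its own result enum rather than fish's tok_t:

#include <cstdio>
#include <string>
#include <vector>

enum class match_result { ok, unopened_closer, mismatched_closer, unterminated };

// Check that '(' ... ')' and '{' ... '}' nest correctly in `input`. On failure,
// *where is the offset of the offending closer, or of the still-open opener in
// the unterminated case.
static match_result check_pairs(const std::wstring &input, size_t *where) {
    std::vector<wchar_t> expecting;    // closer expected by each open construct
    std::vector<size_t> open_offsets;  // offset of each unmatched opener
    for (size_t i = 0; i < input.size(); i++) {
        const wchar_t c = input[i];
        if (c == L'(' || c == L'{') {
            expecting.push_back(c == L'(' ? L')' : L'}');
            open_offsets.push_back(i);
        } else if (c == L')' || c == L'}') {
            if (expecting.empty()) {
                *where = i;
                return match_result::unopened_closer;
            }
            if (expecting.back() != c) {
                *where = i;
                return match_result::mismatched_closer;
            }
            expecting.pop_back();
            open_offsets.pop_back();
        }
    }
    if (!expecting.empty()) {
        *where = open_offsets.back();
        return match_result::unterminated;
    }
    return match_result::ok;
}

int main() {
    size_t where = 0;
    // Correctly nested input: prints 0 (ok).
    std::printf("%d\n", static_cast<int>(check_pairs(L"(echo {a,b})", &where)));
    // '}' while ')' is expected: prints 2 (mismatched_closer) at offset 6.
    const match_result r = check_pairs(L"(echo }", &where);
    std::printf("%d at %zu\n", static_cast<int>(r), where);
    return 0;
}

The diff reuses a single paran_offsets vector for both parentheses and braces and keeps the per-kind state in the tok_mode bit flags; the two-stack form above is only for readability.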
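expand_braces() now trims the assembled item before printing it via debug(), using the new trim() helper in wcstringutil; the untrimmed string is still what gets re-expanded. A minimal standalone equivalent of such a helper, assuming space and tab as the trimmable set (the whitespace constant the real helper references is not shown in the diff):

#include <cstdio>
#include <string>

// Drop leading and trailing characters from `any_of` (here space and tab; the
// set used by the real helper is an assumption on our part).
static std::wstring trim(const std::wstring &input, const wchar_t *any_of = L" \t") {
    const size_t begin = input.find_first_not_of(any_of);
    if (begin == std::wstring::npos) return std::wstring{};  // nothing but whitespace
    const size_t end = input.find_last_not_of(any_of);       // always >= begin here
    return input.substr(begin, end - begin + 1);
}

int main() {
    std::printf("[%ls]\n", trim(L"  a, b ,c  ").c_str());  // prints [a, b ,c]
    std::printf("[%ls]\n", trim(L" \t ").c_str());         // prints []
    return 0;
}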