From 00f95a978e1caade5addda4c368feaeb26e5c8a4 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 19:36:10 -0500 Subject: [PATCH] Make { and } valid, first-class tokenizer elements --- src/expand.cpp | 4 +- src/parse_tree.cpp | 29 +------------- src/tokenizer.cpp | 92 +++++++++++++++++++++----------------------- src/tokenizer.h | 34 +++++++++------- src/wcstringutil.cpp | 18 +++++++++ src/wcstringutil.h | 1 + 6 files changed, 89 insertions(+), 89 deletions(-) diff --git a/src/expand.cpp b/src/expand.cpp index 3ead4819a..8aa7fc540 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -47,6 +47,7 @@ #include "proc.h" #include "reader.h" #include "wildcard.h" +#include "wcstringutil.h" #include "wutil.h" // IWYU pragma: keep #ifdef KERN_PROCARGS2 #else @@ -941,7 +942,8 @@ static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags, whole_item.append(in, length_preceding_braces); whole_item.append(item_begin, item_len); whole_item.append(brace_end + 1); - debug(0, L"Found brace item: %ls\n", whole_item.c_str()); + auto whole_item2 = trim(whole_item); + debug(0, L"Found brace item: %ls\n", whole_item2.c_str()); expand_braces(whole_item, flags, out, errors); item_begin = pos + 1; diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp index 9c51025f4..9ddcebd09 100644 --- a/src/parse_tree.cpp +++ b/src/parse_tree.cpp @@ -668,35 +668,10 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta } void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) { - parse_error_code_t parse_error_code; - switch (tok.error) { - case TOK_UNTERMINATED_QUOTE: { - parse_error_code = parse_error_tokenizer_unterminated_quote; - break; - } - case TOK_UNTERMINATED_SUBSHELL: { - parse_error_code = parse_error_tokenizer_unterminated_subshell; - break; - } - case TOK_UNTERMINATED_SLICE: { - parse_error_code = parse_error_tokenizer_unterminated_slice; - break; - } - case TOK_UNTERMINATED_ESCAPE: { - parse_error_code = parse_error_tokenizer_unterminated_escape; - break; - } - case TOK_INVALID_REDIRECT: - case TOK_INVALID_PIPE: - default: { - parse_error_code = parse_error_tokenizer_other; - break; - } - } - + parse_error_code_t parse_error_code = tok.error->parser_error; this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset, parse_error_code, L"%ls", - error_message_for_code(tok.error).c_str()); + tok.error->Message); } void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) { diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index f2ff61267..ae804bb59 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -16,56 +16,22 @@ #include "tokenizer.h" #include "wutil.h" // IWYU pragma: keep -/// Error string for unexpected end of string. -#define QUOTE_ERROR _(L"Unexpected end of string, quotes are not balanced") - -/// Error string for mismatched parenthesis. -#define PARAN_ERROR _(L"Unexpected end of string, parenthesis do not match") - -/// Error string for mismatched square brackets. -#define SQUARE_BRACKET_ERROR _(L"Unexpected end of string, square brackets do not match") - -/// Error string for unterminated escape (backslash without continuation). -#define UNTERMINATED_ESCAPE_ERROR _(L"Unexpected end of string, incomplete escape sequence") - -/// Error string for invalid redirections. -#define REDIRECT_ERROR _(L"Invalid input/output redirection") - -/// Error string for when trying to pipe from fd 0. 
-#define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output") - -/// Error for when ) is encountered with no matching ( -#define ERROR_CLOSING_UNOPENED_PARENTHESIS _(L"Unexpected ')' for unopened parenthesis") - -/// Error for when [ is encountered while already in bracket mode -#define ERROR_UNEXPECTED_BRACKET _(L"Unexpected '[' at this location") - -wcstring error_message_for_code(tokenizer_error err) { - switch (err) { - case TOK_UNTERMINATED_QUOTE: - return QUOTE_ERROR; - case TOK_UNTERMINATED_SUBSHELL: - return PARAN_ERROR; - case TOK_UNTERMINATED_SLICE: - return SQUARE_BRACKET_ERROR; - case TOK_UNTERMINATED_ESCAPE: - return UNTERMINATED_ESCAPE_ERROR; - case TOK_INVALID_REDIRECT: - return REDIRECT_ERROR; - case TOK_INVALID_PIPE: - return PIPE_ERROR; - case TOK_CLOSING_UNOPENED_SUBSHELL: - return ERROR_CLOSING_UNOPENED_PARENTHESIS; - case TOK_ILLEGAL_SLICE: - return ERROR_UNEXPECTED_BRACKET; - default: - assert(0 && "Unknown error type"); - return {}; - } -} +tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L""); +tokenizer_error *TOK_UNTERMINATED_QUOTE = new tokenizer_error((L"Unexpected end of string, quotes are not balanced"), parse_error_tokenizer_unterminated_quote); +tokenizer_error *TOK_UNTERMINATED_SUBSHELL = new tokenizer_error((L"Unexpected end of string, expecting ')'"), parse_error_tokenizer_unterminated_subshell); +tokenizer_error *TOK_UNTERMINATED_SLICE = new tokenizer_error((L"Unexpected end of string, square brackets do not match"), parse_error_tokenizer_unterminated_slice); +tokenizer_error *TOK_UNTERMINATED_ESCAPE = new tokenizer_error((L"Unexpected end of string, incomplete escape sequence"), parse_error_tokenizer_unterminated_escape); +tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error((L"Invalid input/output redirection")); +tokenizer_error *TOK_INVALID_PIPE = new tokenizer_error((L"Cannot use stdin (fd 0) as pipe output")); +tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL = new tokenizer_error((L"Unexpected ')' for unopened parenthesis")); +tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error((L"Unexpected '[' at this location")); +tokenizer_error *TOK_CLOSING_UNOPENED_BRACE = new tokenizer_error((L"Unexpected '}' for unopened brace expansion")); +tokenizer_error *TOK_UNTERMINATED_BRACE = new tokenizer_error((L"Unexpected end of string, incomplete parameter expansion")); +tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE = new tokenizer_error((L"Unexpected '}' found, expecting ')'")); +tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE = new tokenizer_error((L"Unexpected ')' found, expecting '}'")); /// Return an error token and mark that we no longer have a next token. 
-tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start, +tok_t tokenizer_t::call_error(tokenizer_error *error_type, const wchar_t *token_start, const wchar_t *error_loc) { assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error"); assert(error_loc >= token_start && "Invalid error location"); @@ -143,6 +109,7 @@ ENUM_FLAGS(tok_mode) { tok_t tokenizer_t::read_string() { tok_mode mode { tok_mode::regular_text }; std::vector paran_offsets; + std::vector expecting; int slice_offset = 0; const wchar_t *const buff_start = this->buff; bool is_first = true; @@ -175,9 +142,18 @@ tok_t tokenizer_t::read_string() { } else if (c == L'(') { paran_offsets.push_back(this->buff - this->start); + expecting.push_back(L')'); mode |= tok_mode::subshell; } + else if (c == L'{') { + paran_offsets.push_back(this->buff - this->start); + expecting.push_back(L'}'); + mode |= tok_mode::curly_braces; + } else if (c == L')') { + if (expecting.size() > 0 && expecting.back() == L'}') { + return this->call_error(TOK_EXPECTED_BCLOSE_FOUND_PCLOSE, this->start, this->buff); + } switch (paran_offsets.size()) { case 0: return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, this->start, this->buff); @@ -187,6 +163,19 @@ tok_t tokenizer_t::read_string() { paran_offsets.pop_back(); } } + else if (c == L'}') { + if (expecting.size() > 0 && expecting.back() == L')') { + return this->call_error(TOK_EXPECTED_PCLOSE_FOUND_BCLOSE, this->start, this->buff); + } + switch (paran_offsets.size()) { + case 0: + return this->call_error(TOK_CLOSING_UNOPENED_BRACE, this->start, this->buff); + case 1: + mode &= ~(tok_mode::curly_braces); + default: + paran_offsets.pop_back(); + } + } else if (c == L'[') { if (this->buff != buff_start) { if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) { @@ -257,6 +246,13 @@ tok_t tokenizer_t::read_string() { error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start, this->start + offset_of_open_paran); } + else if ((mode & tok_mode::curly_braces) == tok_mode::curly_braces) { + assert(paran_offsets.size() > 0); + size_t offset_of_open_brace = paran_offsets.back(); + + error = this->call_error(TOK_UNTERMINATED_BRACE, buff_start, + this->start + offset_of_open_brace); + } return error; } diff --git a/src/tokenizer.h b/src/tokenizer.h index 1110e86ab..8ce6618a7 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -7,6 +7,7 @@ #include "common.h" #include "maybe.h" +#include "parse_constants.h" /// Token types. enum token_type { @@ -22,19 +23,26 @@ enum token_type { TOK_COMMENT /// comment token }; -/// Tokenizer error types. 
-enum tokenizer_error { - TOK_ERROR_NONE, - TOK_UNTERMINATED_QUOTE, - TOK_UNTERMINATED_SUBSHELL, - TOK_UNTERMINATED_SLICE, - TOK_UNTERMINATED_ESCAPE, - TOK_INVALID_REDIRECT, - TOK_INVALID_PIPE, - TOK_CLOSING_UNOPENED_SUBSHELL, - TOK_ILLEGAL_SLICE, +struct tokenizer_error { + const wchar_t *Message; + enum parse_error_code_t parser_error; //the parser error associated with this tokenizer error + tokenizer_error(const wchar_t *msg, enum parse_error_code_t perr = parse_error_tokenizer_other) + : Message(msg), parser_error(perr) {} + tokenizer_error(const tokenizer_error&) = delete; }; +extern tokenizer_error *TOK_ERROR_NONE; +extern tokenizer_error *TOK_UNTERMINATED_QUOTE; +extern tokenizer_error *TOK_UNTERMINATED_SUBSHELL; +extern tokenizer_error *TOK_UNTERMINATED_SLICE; +extern tokenizer_error *TOK_UNTERMINATED_ESCAPE; +extern tokenizer_error *TOK_UNTERMINATED_BRACE; +extern tokenizer_error *TOK_INVALID_REDIRECT; +extern tokenizer_error *TOK_INVALID_PIPE; +extern tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL; +extern tokenizer_error *TOK_CLOSING_UNOPENED_BRACE; +extern tokenizer_error *TOK_ILLEGAL_SLICE; + enum class redirection_type_t { overwrite, // normal redirection: > file.txt append, // appending redirection: >> file.txt @@ -69,7 +77,7 @@ struct tok_t { maybe_t redirected_fd{}; // If an error, this is the error code. - enum tokenizer_error error { TOK_ERROR_NONE }; + tokenizer_error *error { TOK_ERROR_NONE }; // If an error, this is the offset of the error within the token. A value of 0 means it occurred // at 'offset'. @@ -99,7 +107,7 @@ class tokenizer_t { /// Whether to continue the previous line after the comment. bool continue_line_after_comment{false}; - tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start, + tok_t call_error(tokenizer_error *error_type, const wchar_t *token_start, const wchar_t *error_loc); tok_t read_string(); maybe_t tok_next(); diff --git a/src/wcstringutil.cpp b/src/wcstringutil.cpp index 79209c1c5..5ed6d7b74 100644 --- a/src/wcstringutil.cpp +++ b/src/wcstringutil.cpp @@ -45,3 +45,21 @@ wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) { output.push_back(ellipsis_char); return output; } + +wcstring trim(const wcstring &input) { + debug(0, "trimming '%ls'", input.c_str()); + + // auto begin = input.cbegin(); + // for (begin; *begin == L' '; ++begin); + // auto end = input.cbegin() + input.size(); + // for (end; end > begin && *end == L' '; ++end); + + auto begin_offset = input.find_first_not_of(whitespace); + if (begin_offset == wcstring::npos) { + return wcstring{}; + } + auto end = input.cbegin() + input.find_last_not_of(whitespace); + + wcstring result(input.begin() + begin_offset, end + 1); + return result; +} diff --git a/src/wcstringutil.h b/src/wcstringutil.h index 878771f25..8665c0024 100644 --- a/src/wcstringutil.h +++ b/src/wcstringutil.h @@ -59,5 +59,6 @@ enum class ellipsis_type { }; wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype = ellipsis_type::Prettiest); +wcstring trim(const wcstring &input); #endif
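The sketches below illustrate the techniques this patch uses; they are standalone C++, not fish sources, and any names that do not appear in the diff above are hypothetical.

The tokenizer_error enum, together with the two switch statements that mapped it to a message (error_message_for_code) and to a parse_error_code_t (report_tokenizer_error), is replaced by error objects that carry both pieces of data: reporting becomes two field reads, and a new error is a one-line definition instead of two extra switch cases. A minimal sketch of that pattern, assuming a two-value stand-in for parse_error_code_t:

#include <cstdio>

// Stand-in for fish's parse_error_code_t.
enum parse_error_code { parse_error_tokenizer_other, parse_error_tokenizer_unterminated_quote };

// Each tokenizer error carries its user-visible message and its parser error code.
struct tok_error {
    const wchar_t *message;
    parse_error_code parser_error;
};

// Hypothetical instances, mirroring the extern TOK_* pointers in the patch.
static const tok_error unterminated_quote{L"Unexpected end of string, quotes are not balanced",
                                          parse_error_tokenizer_unterminated_quote};
static const tok_error invalid_pipe{L"Cannot use stdin (fd 0) as pipe output",
                                    parse_error_tokenizer_other};

// The parser forwards both pieces without switching on an enum value.
static void report(const tok_error &err) {
    std::printf("%d: %ls\n", static_cast<int>(err.parser_error), err.message);
}

int main() {
    report(unterminated_quote);
    report(invalid_pipe);
    return 0;
}

The patch itself allocates its instances with new and hands them around as non-owning pointers; const objects as above express the same idea, so the sketch should not be read as the patch's exact layout.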
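In read_string(), every '(' or '{' now pushes the closing character it expects, so a '}' seen while ')' is pending (or the reverse) is reported as a mismatch at the offending offset, a closer with no opener is reported immediately, and an unterminated construct is reported at the offset of the opener still open when the input ends. A standalone sketch of that bookkeeping, with a separate offset stack and its own result enum rather than fish's tok_t:

#include <cstdio>
#include <string>
#include <vector>

enum class match_result { ok, unopened_closer, mismatched_closer, unterminated };

// Check that '(' ... ')' and '{' ... '}' nest correctly in `input`. On failure,
// *where is the offset of the offending closer, or of the still-open opener in
// the unterminated case.
static match_result check_pairs(const std::wstring &input, size_t *where) {
    std::vector<wchar_t> expecting;    // closer expected by each open construct
    std::vector<size_t> open_offsets;  // offset of each unmatched opener
    for (size_t i = 0; i < input.size(); i++) {
        const wchar_t c = input[i];
        if (c == L'(' || c == L'{') {
            expecting.push_back(c == L'(' ? L')' : L'}');
            open_offsets.push_back(i);
        } else if (c == L')' || c == L'}') {
            if (expecting.empty()) {
                *where = i;
                return match_result::unopened_closer;
            }
            if (expecting.back() != c) {
                *where = i;
                return match_result::mismatched_closer;
            }
            expecting.pop_back();
            open_offsets.pop_back();
        }
    }
    if (!expecting.empty()) {
        *where = open_offsets.back();
        return match_result::unterminated;
    }
    return match_result::ok;
}

int main() {
    size_t where = 0;
    // Correctly nested input: prints 0 (ok).
    std::printf("%d\n", static_cast<int>(check_pairs(L"(echo {a,b})", &where)));
    // '}' while ')' is expected: prints 2 (mismatched_closer) at offset 6.
    const match_result r = check_pairs(L"(echo }", &where);
    std::printf("%d at %zu\n", static_cast<int>(r), where);
    return 0;
}

The diff reuses a single paran_offsets vector for both parentheses and braces and keeps the per-kind state in the tok_mode bit flags; the two-stack form above is only for readability.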
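expand_braces() now trims the assembled item before printing it via debug(), using the new trim() helper in wcstringutil; the untrimmed string is still what gets re-expanded. A minimal standalone equivalent of such a helper, assuming space and tab as the trimmable set (the whitespace constant the real helper references is not shown in the diff):

#include <cstdio>
#include <string>

// Drop leading and trailing characters from `any_of` (here space and tab; the
// set used by the real helper is an assumption on our part).
static std::wstring trim(const std::wstring &input, const wchar_t *any_of = L" \t") {
    const size_t begin = input.find_first_not_of(any_of);
    if (begin == std::wstring::npos) return std::wstring{};  // nothing but whitespace
    const size_t end = input.find_last_not_of(any_of);       // always >= begin here
    return input.substr(begin, end - begin + 1);
}

int main() {
    std::printf("[%ls]\n", trim(L"  a, b ,c  ").c_str());  // prints [a, b ,c]
    std::printf("[%ls]\n", trim(L" \t ").c_str());         // prints []
    return 0;
}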