diff --git a/src/builtin_commandline.cpp b/src/builtin_commandline.cpp
index 2586e5a15..161470f46 100644
--- a/src/builtin_commandline.cpp
+++ b/src/builtin_commandline.cpp
@@ -143,10 +143,10 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
         tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED);
         tok_t token;
         while (tok.next(&token)) {
-            if ((cut_at_cursor) && (token.offset + token.text.size() >= pos)) break;
+            if ((cut_at_cursor) && (token.offset + token.length >= pos)) break;
 
             if (token.type == TOK_STRING) {
-                wcstring tmp = token.text;
+                wcstring tmp = tok.text_of(token);
                 unescape_string_in_place(&tmp, UNESCAPE_INCOMPLETE);
                 out.append(tmp);
                 out.push_back(L'\n');
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index 6904a3e2c..880bf29ef 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -519,14 +519,14 @@ static void test_tokenizer() {
         do_test(token.type == TOK_STRING);
         do_test(token.offset == 0);
         do_test(token.length == 5);
-        do_test(token.text == L"alpha");
+        do_test(t.text_of(token) == L"alpha");
 
         got = t.next(&token);  // beta
         do_test(got);
         do_test(token.type == TOK_STRING);
         do_test(token.offset == 6);
         do_test(token.length == 4);
-        do_test(token.text == L"beta");
+        do_test(t.text_of(token) == L"beta");
 
         got = t.next(&token);
         do_test(!got);
diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp
index bb36a927e..0438e8d3b 100644
--- a/src/parse_tree.cpp
+++ b/src/parse_tree.cpp
@@ -477,7 +477,7 @@ class parse_ll_t {
     void accept_tokens(parse_token_t token1, parse_token_t token2);
 
     /// Report tokenizer errors.
-    void report_tokenizer_error(const tok_t &tok);
+    void report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok);
 
     /// Indicate if we hit a fatal error.
     bool has_fatal_error() const { return this->fatal_errored; }
@@ -711,7 +711,7 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta
     }
 }
 
-void parse_ll_t::report_tokenizer_error(const tok_t &tok) {
+void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) {
     parse_error_code_t parse_error_code;
     switch (tok.error) {
         case TOK_UNTERMINATED_QUOTE: {
@@ -738,7 +738,7 @@ void parse_ll_t::report_tokenizer_error(const tok_t &tok) {
         }
     }
     this->parse_error_at_location(tok.offset + tok.error_offset, parse_error_code, L"%ls",
-                                  tok.text.c_str());
+                                  tokenizer.text_of(tok).c_str());
 }
 
 void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) {
@@ -1067,10 +1067,11 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token) {
     // this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it
     // even starts to look like a feature.
     result.type = parse_token_type_from_tokenizer_token(token->type);
-    result.keyword = keyword_for_token(token->type, token->text);
-    result.has_dash_prefix = !token->text.empty() && token->text.at(0) == L'-';
-    result.is_help_argument = result.has_dash_prefix && is_help_argument(token->text);
-    result.is_newline = (result.type == parse_token_type_end && token->text == L"\n");
+    wcstring text = tok->text_of(*token);
+    result.keyword = keyword_for_token(token->type, text);
+    result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
+    result.is_help_argument = result.has_dash_prefix && is_help_argument(text);
+    result.is_newline = (result.type == parse_token_type_end && text == L"\n");
 
     // These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
     // uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
@@ -1128,7 +1129,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
         // Handle tokenizer errors. This is a hack because really the parser should report this for
         // itself; but it has no way of getting the tokenizer message.
         if (queue[1].type == parse_special_type_tokenizer_error) {
-            parser.report_tokenizer_error(tokenizer_token);
+            parser.report_tokenizer_error(tok, tokenizer_token);
         }
 
         if (!parser.has_fatal_error()) {
diff --git a/src/parse_util.cpp b/src/parse_util.cpp
index 507162d69..9c8b681bc 100644
--- a/src/parse_util.cpp
+++ b/src/parse_util.cpp
@@ -379,7 +379,7 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
 
         // Calculate end of token.
         if (token.type == TOK_STRING) {
-            tok_end += token.text.size();
+            tok_end += token.length;
         }
 
         // Cursor was before beginning of this token, means that the cursor is between two tokens,
@@ -393,14 +393,14 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
         // and break.
         if (token.type == TOK_STRING && tok_end >= offset_within_cmdsubst) {
             a = cmdsubst_begin + token.offset;
-            b = a + token.text.size();
+            b = a + token.length;
             break;
         }
 
         // Remember previous string token.
         if (token.type == TOK_STRING) {
             pa = cmdsubst_begin + token.offset;
-            pb = pa + token.text.size();
+            pb = pa + token.length;
         }
     }
 
@@ -479,7 +479,8 @@ void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_
     while (tok.next(&token)) {
         if (token.offset > pos) break;
 
-        if (token.type == TOK_STRING) last_quote = get_quote(token.text, pos - token.offset);
+        if (token.type == TOK_STRING)
+            last_quote = get_quote(tok.text_of(token), pos - token.offset);
 
         if (out_type != NULL) *out_type = token.type;
 
diff --git a/src/reader.cpp b/src/reader.cpp
index 4559decd2..04b26704d 100644
--- a/src/reader.cpp
+++ b/src/reader.cpp
@@ -1744,13 +1744,14 @@ static void handle_token_history(history_search_direction_t dir, bool reset = fa
             tok_t token;
             while (tok.next(&token)) {
                 if (token.type != TOK_STRING) continue;
-                if (token.text.find(data->search_buff) == wcstring::npos) continue;
+                wcstring text = tok.text_of(token);
+                if (text.find(data->search_buff) == wcstring::npos) continue;
                 if (token.offset >= current_pos) continue;
 
-                auto found = find(data->search_prev.begin(), data->search_prev.end(), token.text);
+                auto found = find(data->search_prev.begin(), data->search_prev.end(), text);
                 if (found == data->search_prev.end()) {
                     data->token_history_pos = token.offset;
-                    str = token.text;
+                    str = text;
                 }
             }
         }
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index ffce52d9c..810d68ffa 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -34,39 +34,46 @@
 /// Error string for when trying to pipe from fd 0.
 #define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output")
 
-/// Set the latest tokens string to be the specified error message.
-void tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *where) {
+/// Return an error token and mark that we no longer have a next token.
+tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start,
+                              const wchar_t *error_loc) {
     assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error");
-    this->last_type = TOK_ERROR;
-    this->error = error_type;
+    assert(error_loc >= token_start && "Invalid error location");
+    assert(this->buff >= token_start && "Invalid buff location");
     this->has_next = false;
-    this->global_error_offset = where ? where - this->start : 0;
-    if (this->squash_errors) {
-        this->last_token.clear();
-    } else {
+
+    tok_t result;
+    result.type = TOK_ERROR;
+    result.error = error_type;
+    result.offset = token_start - this->start;
+    result.length = this->buff - token_start;
+    result.error_offset = error_loc - token_start;
+    if (!this->squash_errors) {
         switch (error_type) {
             case TOK_UNTERMINATED_QUOTE:
-                this->last_token = QUOTE_ERROR;
+                result.error_text = QUOTE_ERROR;
                 break;
             case TOK_UNTERMINATED_SUBSHELL:
-                this->last_token = PARAN_ERROR;
+                result.error_text = PARAN_ERROR;
                 break;
             case TOK_UNTERMINATED_SLICE:
-                this->last_token = SQUARE_BRACKET_ERROR;
+                result.error_text = SQUARE_BRACKET_ERROR;
                 break;
             case TOK_UNTERMINATED_ESCAPE:
-                this->last_token = UNTERMINATED_ESCAPE_ERROR;
+                result.error_text = UNTERMINATED_ESCAPE_ERROR;
                 break;
             case TOK_INVALID_REDIRECT:
-                this->last_token = REDIRECT_ERROR;
+                result.error_text = REDIRECT_ERROR;
                 break;
             case TOK_INVALID_PIPE:
-                this->last_token = PIPE_ERROR;
+                result.error_text = PIPE_ERROR;
                 break;
             default:
                 assert(0 && "Unknown error type");
         }
     }
+    return result;
 }
 
 tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
@@ -80,34 +87,11 @@ tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start),
 
 bool tokenizer_t::next(struct tok_t *result) {
     assert(result != NULL);
-    if (!this->tok_next()) {
+    maybe_t<tok_t> tok = this->tok_next();
+    if (!tok) {
         return false;
     }
-
-    const size_t current_pos = this->buff - this->start;
-
-    // We want to copy our last_token into result->text. If we just do this naively via =, we are
-    // liable to trigger std::string's CoW implementation: result->text's storage will be
-    // deallocated and instead will acquire a reference to last_token's storage. But last_token will
-    // be overwritten soon, which will trigger a new allocation and a copy. So our attempt to re-use
-    // result->text's storage will have failed. To ensure that doesn't happen, use assign() with
-    // wchar_t.
-    result->text.assign(this->last_token.data(), this->last_token.size());
-
-    result->type = this->last_type;
-    result->offset = this->last_pos;
-    result->error = this->last_type == TOK_ERROR ? this->error : TOK_ERROR_NONE;
-    assert(this->buff >= this->start);
-
-    // Compute error offset.
-    result->error_offset = 0;
-    if (this->last_type == TOK_ERROR && this->global_error_offset >= this->last_pos &&
-        this->global_error_offset < current_pos) {
-        result->error_offset = this->global_error_offset - this->last_pos;
-    }
-
-    assert(this->buff >= this->start);
-    result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0;
+    *result = std::move(*tok);
     return true;
 }
@@ -143,9 +127,8 @@ static bool tok_is_string_character(wchar_t c, bool is_first) {
 static int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }
 
 /// Read the next token as a string.
-void tokenizer_t::read_string() {
-    long len;
-    int do_loop = 1;
+tok_t tokenizer_t::read_string() {
+    bool do_loop = true;
     size_t paran_count = 0;
     // Up to 96 open parens, before we give up on good error reporting.
     const size_t paran_offsets_max = 96;
@@ -170,8 +153,8 @@ void tokenizer_t::read_string() {
                 this->buff++;
                 if (*this->buff == L'\0') {
                     if ((!this->accept_unfinished)) {
-                        this->call_error(TOK_UNTERMINATED_ESCAPE, error_location);
-                        return;
+                        return this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
+                                                error_location);
                     }
                     // Since we are about to increment tok->buff, decrement it first so the
                     // increment doesn't go past the end of the buffer. See issue #389.
@@ -209,8 +192,8 @@ void tokenizer_t::read_string() {
                     this->buff += wcslen(this->buff);
 
                     if (!this->accept_unfinished) {
-                        this->call_error(TOK_UNTERMINATED_QUOTE, error_loc);
-                        return;
+                        return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
+                                                error_loc);
                     }
                     do_loop = 0;
                 }
@@ -238,8 +221,8 @@ void tokenizer_t::read_string() {
                         const wchar_t *error_loc = this->buff;
                         this->buff += wcslen(this->buff);
                         if ((!this->accept_unfinished)) {
-                            this->call_error(TOK_UNTERMINATED_QUOTE, error_loc);
-                            return;
+                            return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
+                                                    error_loc);
                        }
                         do_loop = 0;
                     }
@@ -305,6 +288,7 @@ void tokenizer_t::read_string() {
     }
 
     if ((!this->accept_unfinished) && (mode != mode_regular_text)) {
+        tok_t error;
         switch (mode) {
             case mode_subshell: {
                 // Determine the innermost opening paran offset by interrogating paran_offsets.
@@ -314,12 +298,14 @@ void tokenizer_t::read_string() {
                     offset_of_open_paran = paran_offsets[paran_count - 1];
                 }
 
-                this->call_error(TOK_UNTERMINATED_SUBSHELL, this->start + offset_of_open_paran);
+                error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
+                                         this->start + offset_of_open_paran);
                 break;
             }
             case mode_array_brackets:
             case mode_array_brackets_and_subshell: {
-                this->call_error(TOK_UNTERMINATED_SLICE, this->start + offset_of_bracket);
+                error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
+                                         this->start + offset_of_bracket);
                 break;
             }
             default: {
@@ -327,13 +313,14 @@ void tokenizer_t::read_string() {
                 break;
             }
         }
-        return;
+        return error;
     }
 
-    len = this->buff - buff_start;
-
-    this->last_token.assign(buff_start, len);
-    this->last_type = TOK_STRING;
+    tok_t result;
+    result.type = TOK_STRING;
+    result.offset = buff_start - this->start;
+    result.length = this->buff - buff_start;
+    return result;
 }
 
 /// Reads a redirection or an "fd pipe" (like 2>|) from a string. Returns how many characters were
@@ -482,9 +469,9 @@ static bool iswspace_not_nl(wchar_t c) {
     }
 }
 
-bool tokenizer_t::tok_next() {
+maybe_t<tok_t> tokenizer_t::tok_next() {
     if (!this->has_next) {
-        return false;
+        return none();
     }
 
     // Consume non-newline whitespace. If we get an escaped newline, mark it and continue past it.
@@ -510,30 +497,31 @@ bool tokenizer_t::tok_next() {
 
         // Maybe return the comment.
         if (this->show_comments) {
-            this->last_pos = comment_start - this->start;
-            this->last_token.assign(comment_start, comment_len);
-            this->last_type = TOK_COMMENT;
-            return true;
+            tok_t result;
+            result.type = TOK_COMMENT;
+            result.offset = comment_start - this->start;
+            result.length = comment_len;
+            return result;
         }
         while (iswspace_not_nl(this->buff[0])) this->buff++;
     }
 
     // We made it past the comments and ate any trailing newlines we wanted to ignore.
     this->continue_line_after_comment = false;
 
-    this->last_pos = this->buff - this->start;
+    size_t start_pos = this->buff - this->start;
+    tok_t result;
+    result.offset = start_pos;
     switch (*this->buff) {
         case L'\0': {
-            this->last_type = TOK_END;
             this->has_next = false;
-            this->last_token.clear();
-            return false;
+            return none();
         }
         case L'\r':  // carriage-return
         case L'\n':  // newline
         case L';': {
-            this->last_type = TOK_END;
-            this->last_token.assign(1, *this->buff);
+            result.type = TOK_END;
+            result.length = 1;
             this->buff++;
             // Hack: when we get a newline, swallow as many as we can. This compresses multiple
             // subsequent newlines into a single one.
@@ -546,13 +534,15 @@ bool tokenizer_t::tok_next() {
             break;
         }
         case L'&': {
-            this->last_type = TOK_BACKGROUND;
+            result.type = TOK_BACKGROUND;
+            result.length = 1;
             this->buff++;
             break;
         }
         case L'|': {
-            this->last_token = L"1";
-            this->last_type = TOK_PIPE;
+            result.type = TOK_PIPE;
+            result.redirected_fd = 1;
+            result.length = 1;
             this->buff++;
             break;
         }
@@ -565,12 +555,12 @@ bool tokenizer_t::tok_next() {
             int fd = -1;
             size_t consumed = read_redirection_or_fd_pipe(this->buff, &mode, &fd);
             if (consumed == 0 || fd < 0) {
-                this->call_error(TOK_INVALID_REDIRECT, this->buff);
-            } else {
-                this->buff += consumed;
-                this->last_type = mode;
-                this->last_token = to_string(fd);
+                return this->call_error(TOK_INVALID_REDIRECT, this->buff, this->buff);
             }
+            result.type = mode;
+            result.redirected_fd = fd;
+            result.length = consumed;
+            this->buff += consumed;
             break;
         }
         default: {
@@ -588,30 +578,29 @@ bool tokenizer_t::tok_next() {
                 // that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer
                 // error.
                 if (mode == TOK_PIPE && fd == 0) {
-                    this->call_error(TOK_INVALID_PIPE, error_location);
-                } else {
-                    this->buff += consumed;
-                    this->last_type = mode;
-                    this->last_token = to_string(fd);
+                    return this->call_error(TOK_INVALID_PIPE, error_location, error_location);
                 }
+                result.type = mode;
+                result.redirected_fd = fd;
+                result.length = consumed;
+                this->buff += consumed;
             } else {
                 // Not a redirection or pipe, so just a string.
-                this->read_string();
+                result = this->read_string();
             }
             break;
         }
     }
-    return true;
+    return result;
 }
 
 wcstring tok_first(const wcstring &str) {
-    wcstring result;
-    tokenizer_t t(str.data(), TOK_SQUASH_ERRORS);
+    tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS);
     tok_t token;
     if (t.next(&token) && token.type == TOK_STRING) {
-        result = std::move(token.text);
+        return t.text_of(token);
     }
-    return result;
+    return {};
 }
 
 bool move_word_state_machine_t::consume_char_punctuation(wchar_t c) {
diff --git a/src/tokenizer.h b/src/tokenizer.h
index c90d4c9fc..21c8d038f 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -6,6 +6,7 @@
 #include <stddef.h>
 
 #include "common.h"
+#include "maybe.h"
 
 /// Token types.
 enum token_type {
@@ -52,21 +53,26 @@ enum tokenizer_error {
 typedef unsigned int tok_flags_t;
 
 struct tok_t {
-    // The text of the token, or an error message for type error.
-    wcstring text;
     // The type of the token.
-    token_type type;
+    token_type type{TOK_NONE};
+
+    // Offset of the token.
+    size_t offset{0};
+    // Length of the token.
+    size_t length{0};
+
+    // If the token represents a redirection, the redirected fd.
+    maybe_t<int> redirected_fd{};
+
     // If an error, this is the error code.
-    enum tokenizer_error error;
+    enum tokenizer_error error { TOK_ERROR_NONE };
     // If an error, this is the offset of the error within the token. A value of 0 means it occurred
     // at 'offset'.
-    size_t error_offset;
-    // Offset of the token.
-    size_t offset;
-    // Length of the token.
-    size_t length;
+    size_t error_offset{size_t(-1)};
+    // If there is an error, the text of the error; otherwise empty.
+    wcstring error_text{};
 
-    tok_t() : type(TOK_NONE), error(TOK_ERROR_NONE), error_offset(-1), offset(-1), length(-1) {}
+    tok_t() = default;
 };
 
 /// The tokenizer struct.
@@ -79,13 +85,7 @@ class tokenizer_t {
     const wchar_t *buff;
     /// The start of the original string.
    const wchar_t *const start;
-    /// The last token.
-    wcstring last_token;
-    /// Type of last token.
-    enum token_type last_type { TOK_NONE };
-    /// Offset of last token.
-    size_t last_pos{0};
-    /// Whether there are more tokens.
+    /// Whether we have additional tokens.
     bool has_next{true};
     /// Whether incomplete tokens are accepted.
     bool accept_unfinished{false};
@@ -93,18 +93,15 @@ class tokenizer_t {
     bool show_comments{false};
     /// Whether all blank lines are returned.
     bool show_blank_lines{false};
-    /// Last error.
-    tokenizer_error error{TOK_ERROR_NONE};
-    /// Last error offset, in "global" coordinates (relative to orig_buff).
-    size_t global_error_offset{size_t(-1)};
     /// Whether we are squashing errors.
     bool squash_errors{false};
     /// Whether to continue the previous line after the comment.
     bool continue_line_after_comment{false};
 
-    void call_error(enum tokenizer_error error_type, const wchar_t *where);
-    void read_string();
-    bool tok_next();
+    tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start,
+                     const wchar_t *error_loc);
+    tok_t read_string();
+    maybe_t<tok_t> tok_next();
 
   public:
     /// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
@@ -118,6 +115,9 @@ class tokenizer_t {
 
     /// Returns the next token by reference. Returns true if we got one, false if we're at the end.
     bool next(struct tok_t *result);
+
+    /// Returns the text of a token, as a string.
+    wcstring text_of(const tok_t &tok) const { return wcstring(start + tok.offset, tok.length); }
 };
 
 /// Returns only the first token from the specified string. This is a convenience function, used to
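
A note on the shape of the new API, with a hedged usage sketch (not part of the patch): after this change a tok_t is a typed view into the tokenizer's original buffer — type, offset, length — and text is materialized only on demand via tokenizer_t::text_of(). The sketch below mirrors the updated call sites in builtin_commandline.cpp and parse_util.cpp; dump_string_tokens is an illustrative name, not a function added by this diff.

    // Sketch only: iterate string tokens and materialize their text lazily.
    #include <cstdio>
    #include "tokenizer.h"

    static void dump_string_tokens(const wcstring &cmdline) {
        tokenizer_t tok(cmdline.c_str(), TOK_ACCEPT_UNFINISHED);
        tok_t token;
        while (tok.next(&token)) {
            if (token.type != TOK_STRING) continue;
            // The token is the range [offset, offset + length) over `cmdline`;
            // text_of() copies that range out as a wcstring.
            wcstring text = tok.text_of(token);
            std::fwprintf(stderr, L"[%zu..%zu) %ls\n", token.offset,
                          token.offset + token.length, text.c_str());
        }
    }

Because tokens no longer own a wcstring, callers that only need positions (parse_util_token_extent, for example) never pay for a copy at all.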
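Error reporting becomes positional as well. call_error() now records where the token started (offset), how far the tokenizer got (length), and where within the token the problem lies (error_offset); parse_ll_t::report_tokenizer_error composes these as tok.offset + tok.error_offset. A hedged sketch of what a caller sees for an unterminated subshell — the input and commentary are illustrative, assuming TOK_SQUASH_ERRORS is not set:

    tokenizer_t t(L"echo (seq 1 5", 0);  // no flags: unfinished input is an error
    tok_t token;
    while (t.next(&token)) {
        if (token.type == TOK_ERROR) {
            // token.offset is the start of the offending token ("(seq 1 5"),
            // token.error_offset is relative to that start (here it points at
            // the unmatched '('), and token.error_text carries PARAN_ERROR
            // because squashing is off.
            size_t absolute_error_pos = token.offset + token.error_offset;
            (void)absolute_error_pos;
        }
    }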
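One design choice worth calling out: the fd attached to a pipe or redirection used to round-trip through the token text (last_token = to_string(fd), i.e. L"1" for a bare |); it is now the typed field maybe_t<int> redirected_fd. A hedged sketch of reading it back, again assuming fish's own headers rather than code from this patch:

    tokenizer_t t(L"seq 3 | cat", 0);
    tok_t token;
    while (t.next(&token)) {
        if (token.type == TOK_PIPE && token.redirected_fd) {
            // A bare '|' pipes stdout, so the tokenizer stores fd 1 here,
            // where it previously stored the string L"1".
            int fd = *token.redirected_fd;
            (void)fd;
        }
    }

This removes a string parse on the consumer side and makes "no fd" (the maybe_t is empty) distinguishable from any real fd value.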