diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 712d12ee0..a7c3d1dfb 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -63,13 +63,13 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token
                               const wchar_t *error_loc, maybe_t<size_t> token_length) {
     assert(error_type != tokenizer_error_t::none && "tokenizer_error_t::none passed to call_error");
     assert(error_loc >= token_start && "Invalid error location");
-    assert(this->buff >= token_start && "Invalid buff location");
+    assert(this->token_cursor >= token_start && "Invalid buff location");
 
     // If continue_after_error is set and we have a real token length, then skip past it.
     // Otherwise give up.
     if (token_length.has_value() && continue_after_error) {
-        assert(this->buff < error_loc + *token_length && "Unable to continue past error");
-        this->buff = error_loc + *token_length;
+        assert(this->token_cursor < error_loc + *token_length && "Unable to continue past error");
+        this->token_cursor = error_loc + *token_length;
     } else {
         this->has_next = false;
     }
@@ -78,12 +78,13 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token
     result.error = error_type;
     result.offset = token_start - this->start;
     // If we are passed a token_length, then use it; otherwise infer it from the buffer.
-    result.length = token_length ? *token_length : this->buff - token_start;
+    result.length = token_length ? *token_length : this->token_cursor - token_start;
     result.error_offset = error_loc - token_start;
     return result;
 }
 
-tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
+tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags)
+    : token_cursor(start), start(start) {
     assert(start != nullptr && "Invalid start");
 
     this->accept_unfinished = static_cast<bool>(flags & TOK_ACCEPT_UNFINISHED);
@@ -145,11 +146,11 @@ tok_t tokenizer_t::read_string() {
     std::vector<int> brace_offsets;
     std::vector<char> expecting;
     int slice_offset = 0;
-    const wchar_t *const buff_start = this->buff;
+    const wchar_t *const buff_start = this->token_cursor;
     bool is_first = true;
 
     while (true) {
-        wchar_t c = *this->buff;
+        wchar_t c = *this->token_cursor;
 #if false
         wcstring msg = L"Handling 0x%x (%lc)";
         tok_mode mode_begin = mode;
@@ -173,22 +174,22 @@ tok_t tokenizer_t::read_string() {
         else if (c == L'\\') {
             mode |= tok_modes::char_escape;
         } else if (c == L'(') {
-            paran_offsets.push_back(this->buff - this->start);
+            paran_offsets.push_back(this->token_cursor - this->start);
             expecting.push_back(L')');
             mode |= tok_modes::subshell;
         } else if (c == L'{') {
-            brace_offsets.push_back(this->buff - this->start);
+            brace_offsets.push_back(this->token_cursor - this->start);
             expecting.push_back(L'}');
             mode |= tok_modes::curly_braces;
         } else if (c == L')') {
             if (expecting.size() > 0 && expecting.back() == L'}') {
                 return this->call_error(tokenizer_error_t::expected_bclose_found_pclose,
-                                        this->start, this->buff, 1);
+                                        this->start, this->token_cursor, 1);
             }
             switch (paran_offsets.size()) {
                 case 0:
                     return this->call_error(tokenizer_error_t::closing_unopened_subshell,
-                                            this->start, this->buff, 1);
+                                            this->start, this->token_cursor, 1);
                 case 1:
                     mode &= ~(tok_modes::subshell);
                 default:
@@ -198,12 +199,12 @@ tok_t tokenizer_t::read_string() {
         } else if (c == L'}') {
             if (expecting.size() > 0 && expecting.back() == L')') {
                 return this->call_error(tokenizer_error_t::expected_pclose_found_bclose,
-                                        this->start, this->buff, 1);
+                                        this->start, this->token_cursor, 1);
             }
             switch (brace_offsets.size()) {
                 case 0:
-                    return this->call_error(tokenizer_error_t::closing_unopened_brace, this->buff,
-                                            this->start + wcslen(this->start));
+                    return this->call_error(tokenizer_error_t::closing_unopened_brace,
+                                            this->token_cursor, this->start + wcslen(this->start));
                 case 1:
                     mode &= ~(tok_modes::curly_braces);
                 default:
@@ -211,9 +212,9 @@ tok_t tokenizer_t::read_string() {
             }
             expecting.pop_back();
         } else if (c == L'[') {
-            if (this->buff != buff_start) {
+            if (this->token_cursor != buff_start) {
                 mode |= tok_modes::array_brackets;
-                slice_offset = this->buff - this->start;
+                slice_offset = this->token_cursor - this->start;
             } else {
                 // This is actually allowed so the test operator `[` can be used as the head of a
                 // command
@@ -225,12 +226,12 @@ tok_t tokenizer_t::read_string() {
         else if (c == L']' && ((mode & tok_modes::array_brackets) == tok_modes::array_brackets)) {
             mode &= ~(tok_modes::array_brackets);
         } else if (c == L'\'' || c == L'"') {
-            const wchar_t *end = quote_end(this->buff);
+            const wchar_t *end = quote_end(this->token_cursor);
             if (end) {
-                this->buff = end;
+                this->token_cursor = end;
             } else {
-                const wchar_t *error_loc = this->buff;
-                this->buff += std::wcslen(this->buff);
+                const wchar_t *error_loc = this->token_cursor;
+                this->token_cursor += std::wcslen(this->token_cursor);
                 if ((!this->accept_unfinished)) {
                     return this->call_error(tokenizer_error_t::unterminated_quote, buff_start,
                                             error_loc);
@@ -250,14 +251,14 @@ tok_t tokenizer_t::read_string() {
         FLOGF(error, msg.c_str(), c, c, int(mode_begin), int(mode));
 #endif
 
-        this->buff++;
+        this->token_cursor++;
         is_first = false;
     }
 
     if ((!this->accept_unfinished) && (mode != tok_modes::regular_text)) {
         if (mode & tok_modes::char_escape) {
             return this->call_error(tokenizer_error_t::unterminated_escape, buff_start,
-                                    this->buff - 1, 1);
+                                    this->token_cursor - 1, 1);
         } else if (mode & tok_modes::array_brackets) {
             return this->call_error(tokenizer_error_t::unterminated_slice, buff_start,
                                     this->start + slice_offset);
@@ -280,7 +281,7 @@ tok_t tokenizer_t::read_string() {
 
     tok_t result(token_type_t::string);
     result.offset = buff_start - this->start;
-    result.length = this->buff - buff_start;
+    result.length = this->token_cursor - buff_start;
     return result;
 }
 
@@ -492,25 +493,27 @@ maybe_t<tok_t> tokenizer_t::next() {
     // it.
     bool preceding_escaped_nl = false;
     for (;;) {
-        if (this->buff[0] == L'\\' && this->buff[1] == L'\n') {
-            this->buff += 2;
+        if (this->token_cursor[0] == L'\\' && this->token_cursor[1] == L'\n') {
+            this->token_cursor += 2;
             this->continue_line_after_comment = true;
             preceding_escaped_nl = true;
-        } else if (iswspace_not_nl(this->buff[0])) {
-            this->buff++;
+        } else if (iswspace_not_nl(this->token_cursor[0])) {
+            this->token_cursor++;
         } else {
             break;
         }
     }
 
-    while (*this->buff == L'#') {
+    while (*this->token_cursor == L'#') {
         // We have a comment, walk over the comment.
-        const wchar_t *comment_start = this->buff;
-        while (this->buff[0] != L'\n' && this->buff[0] != L'\0') this->buff++;
-        size_t comment_len = this->buff - comment_start;
+        const wchar_t *comment_start = this->token_cursor;
+        while (this->token_cursor[0] != L'\n' && this->token_cursor[0] != L'\0')
+            this->token_cursor++;
+        size_t comment_len = this->token_cursor - comment_start;
 
         // If we are going to continue after the comment, skip any trailing newline.
-        if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
+        if (this->token_cursor[0] == L'\n' && this->continue_line_after_comment)
+            this->token_cursor++;
 
         // Maybe return the comment.
         if (this->show_comments) {
@@ -520,15 +523,15 @@ maybe_t<tok_t> tokenizer_t::next() {
             result.preceding_escaped_nl = preceding_escaped_nl;
             return result;
         }
-        while (iswspace_not_nl(this->buff[0])) this->buff++;
+        while (iswspace_not_nl(this->token_cursor[0])) this->token_cursor++;
     }
 
     // We made it past the comments and ate any trailing newlines we wanted to ignore.
    this->continue_line_after_comment = false;
-    const size_t start_pos = this->buff - this->start;
+    const size_t start_pos = this->token_cursor - this->start;
 
     maybe_t<tok_t> result{};
-    switch (*this->buff) {
+    switch (*this->token_cursor) {
         case L'\0': {
             this->has_next = false;
             return none();
@@ -539,60 +542,60 @@ maybe_t<tok_t> tokenizer_t::next() {
             result.emplace(token_type_t::end);
             result->offset = start_pos;
             result->length = 1;
-            this->buff++;
+            this->token_cursor++;
             // Hack: when we get a newline, swallow as many as we can. This compresses multiple
             // subsequent newlines into a single one.
             if (!this->show_blank_lines) {
-                while (*this->buff == L'\n' || *this->buff == 13 /* CR */ || *this->buff == ' ' ||
-                       *this->buff == '\t') {
-                    this->buff++;
+                while (*this->token_cursor == L'\n' || *this->token_cursor == 13 /* CR */ ||
+                       *this->token_cursor == ' ' || *this->token_cursor == '\t') {
+                    this->token_cursor++;
                 }
             }
             break;
         }
         case L'&': {
-            if (this->buff[1] == L'&') {
+            if (this->token_cursor[1] == L'&') {
                 // && is and.
                 result.emplace(token_type_t::andand);
                 result->offset = start_pos;
                 result->length = 2;
-                this->buff += 2;
-            } else if (this->buff[1] == L'>' || this->buff[1] == L'|') {
+                this->token_cursor += 2;
+            } else if (this->token_cursor[1] == L'>' || this->token_cursor[1] == L'|') {
                 // &> and &| redirect both stdout and stderr.
-                auto redir = pipe_or_redir_t::from_string(buff);
+                auto redir = pipe_or_redir_t::from_string(this->token_cursor);
                 assert(redir.has_value() &&
                        "Should always succeed to parse a &> or &| redirection");
                 result.emplace(redir->token_type());
                 result->offset = start_pos;
                 result->length = redir->consumed;
-                this->buff += redir->consumed;
+                this->token_cursor += redir->consumed;
             } else {
                 result.emplace(token_type_t::background);
                 result->offset = start_pos;
                 result->length = 1;
-                this->buff++;
+                this->token_cursor++;
             }
             break;
         }
         case L'|': {
-            if (this->buff[1] == L'|') {
+            if (this->token_cursor[1] == L'|') {
                 // || is or.
                 result.emplace(token_type_t::oror);
                 result->offset = start_pos;
                 result->length = 2;
-                this->buff += 2;
-            } else if (this->buff[1] == L'&') {
+                this->token_cursor += 2;
+            } else if (this->token_cursor[1] == L'&') {
                 // |& is a bashism; in fish it's &|.
-                return this->call_error(tokenizer_error_t::invalid_pipe_ampersand, this->buff,
-                                        this->buff, 2);
+                return this->call_error(tokenizer_error_t::invalid_pipe_ampersand,
+                                        this->token_cursor, this->token_cursor, 2);
             } else {
-                auto pipe = pipe_or_redir_t::from_string(buff);
+                auto pipe = pipe_or_redir_t::from_string(this->token_cursor);
                 assert(pipe.has_value() && pipe->is_pipe &&
                        "Should always succeed to parse a | pipe");
                 result.emplace(pipe->token_type());
                 result->offset = start_pos;
                 result->length = pipe->consumed;
-                this->buff += pipe->consumed;
+                this->token_cursor += pipe->consumed;
             }
             break;
         }
@@ -601,23 +604,24 @@ maybe_t<tok_t> tokenizer_t::next() {
             // There's some duplication with the code in the default case below. The key
             // difference here is that we must never parse these as a string; a failed
             // redirection is an error!
-            auto redir_or_pipe = pipe_or_redir_t::from_string(this->buff);
+            auto redir_or_pipe = pipe_or_redir_t::from_string(this->token_cursor);
             if (!redir_or_pipe || redir_or_pipe->fd < 0) {
-                return this->call_error(tokenizer_error_t::invalid_redirect, this->buff, this->buff,
+                return this->call_error(tokenizer_error_t::invalid_redirect, this->token_cursor,
+                                        this->token_cursor,
                                         redir_or_pipe ? redir_or_pipe->consumed : 0);
             }
             result.emplace(redir_or_pipe->token_type());
             result->offset = start_pos;
             result->length = redir_or_pipe->consumed;
-            this->buff += redir_or_pipe->consumed;
+            this->token_cursor += redir_or_pipe->consumed;
             break;
         }
         default: {
             // Maybe a redirection like '2>&1', maybe a pipe like 2>|, maybe just a string.
-            const wchar_t *error_location = this->buff;
+            const wchar_t *error_location = this->token_cursor;
             maybe_t<pipe_or_redir_t> redir_or_pipe{};
-            if (iswdigit(*this->buff) || (*this->buff == L'^' && caret_redirs())) {
-                redir_or_pipe = pipe_or_redir_t::from_string(this->buff);
+            if (iswdigit(*this->token_cursor) || (*this->token_cursor == L'^' && caret_redirs())) {
+                redir_or_pipe = pipe_or_redir_t::from_string(this->token_cursor);
             }
 
             if (redir_or_pipe) {
@@ -631,7 +635,7 @@ maybe_t<tok_t> tokenizer_t::next() {
                 result.emplace(redir_or_pipe->token_type());
                 result->offset = start_pos;
                 result->length = redir_or_pipe->consumed;
-                this->buff += redir_or_pipe->consumed;
+                this->token_cursor += redir_or_pipe->consumed;
             } else {
                 // Not a redirection or pipe, so just a string.
                 result = this->read_string();
diff --git a/src/tokenizer.h b/src/tokenizer.h
index d1689f0a5..992e92f8c 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -103,7 +103,7 @@ class tokenizer_t {
     void operator=(const tokenizer_t &) = delete;
 
     /// A pointer into the original string, showing where the next token begins.
-    const wchar_t *buff;
+    const wchar_t *token_cursor;
     /// The start of the original string.
     const wchar_t *const start;
     /// Whether we have additional tokens.
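Reviewer note (not part of the patch): the rename is purely mechanical and leaves the public surface unchanged; callers still see only `tok_t::offset` and `tok_t::length`, while `token_cursor` advances privately inside the tokenizer. A minimal sketch of the caller's view, assuming fish's `tokenizer.h` is on the include path and that `maybe_t` is optional-like, as the code above suggests; `dump_tokens` is a hypothetical helper, not code from this commit:

```cpp
#include <cstdio>
#include <cwchar>

#include "tokenizer.h"

// Hypothetical helper: tokenize a command line and print each token's extent.
// tok_t::offset and tok_t::length index into the original input string; the
// tokenizer's private token_cursor only ever advances internally.
static void dump_tokens(const wchar_t *input) {
    tokenizer_t tok(input, 0);  // flags as accepted by the constructor above
    while (auto t = tok.next()) {  // next() returns maybe_t<tok_t>; empty when exhausted
        std::fwprintf(stdout, L"[%zu..%zu) %.*ls\n", t->offset, t->offset + t->length,
                      static_cast<int>(t->length), input + t->offset);
    }
}
```

Running this over `L"echo hi | grep h"` should print the extents of `echo`, `hi`, `|`, `grep`, and `h` in order, which is a quick way to confirm the cursor rename did not disturb the offset/length bookkeeping.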