Rename tokenizer_t::buff to token_cursor

That should clarify its role vis-à-vis 'start'.
This commit is contained in:
ridiculousfish 2019-11-08 16:40:15 -08:00
parent 41c42c86e3
commit eb2386f3e3
2 changed files with 64 additions and 60 deletions

View File

@ -63,13 +63,13 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token
const wchar_t *error_loc, maybe_t<size_t> token_length) {
assert(error_type != tokenizer_error_t::none && "tokenizer_error_t::none passed to call_error");
assert(error_loc >= token_start && "Invalid error location");
assert(this->buff >= token_start && "Invalid buff location");
assert(this->token_cursor >= token_start && "Invalid buff location");
// If continue_after_error is set and we have a real token length, then skip past it.
// Otherwise give up.
if (token_length.has_value() && continue_after_error) {
assert(this->buff < error_loc + *token_length && "Unable to continue past error");
this->buff = error_loc + *token_length;
assert(this->token_cursor < error_loc + *token_length && "Unable to continue past error");
this->token_cursor = error_loc + *token_length;
} else {
this->has_next = false;
}
@ -78,12 +78,13 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token
result.error = error_type;
result.offset = token_start - this->start;
// If we are passed a token_length, then use it; otherwise infer it from the buffer.
result.length = token_length ? *token_length : this->buff - token_start;
result.length = token_length ? *token_length : this->token_cursor - token_start;
result.error_offset = error_loc - token_start;
return result;
}
tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags)
: token_cursor(start), start(start) {
assert(start != nullptr && "Invalid start");
this->accept_unfinished = static_cast<bool>(flags & TOK_ACCEPT_UNFINISHED);
@ -145,11 +146,11 @@ tok_t tokenizer_t::read_string() {
std::vector<int> brace_offsets;
std::vector<char> expecting;
int slice_offset = 0;
const wchar_t *const buff_start = this->buff;
const wchar_t *const buff_start = this->token_cursor;
bool is_first = true;
while (true) {
wchar_t c = *this->buff;
wchar_t c = *this->token_cursor;
#if false
wcstring msg = L"Handling 0x%x (%lc)";
tok_mode mode_begin = mode;
@ -173,22 +174,22 @@ tok_t tokenizer_t::read_string() {
else if (c == L'\\') {
mode |= tok_modes::char_escape;
} else if (c == L'(') {
paran_offsets.push_back(this->buff - this->start);
paran_offsets.push_back(this->token_cursor - this->start);
expecting.push_back(L')');
mode |= tok_modes::subshell;
} else if (c == L'{') {
brace_offsets.push_back(this->buff - this->start);
brace_offsets.push_back(this->token_cursor - this->start);
expecting.push_back(L'}');
mode |= tok_modes::curly_braces;
} else if (c == L')') {
if (expecting.size() > 0 && expecting.back() == L'}') {
return this->call_error(tokenizer_error_t::expected_bclose_found_pclose,
this->start, this->buff, 1);
this->start, this->token_cursor, 1);
}
switch (paran_offsets.size()) {
case 0:
return this->call_error(tokenizer_error_t::closing_unopened_subshell,
this->start, this->buff, 1);
this->start, this->token_cursor, 1);
case 1:
mode &= ~(tok_modes::subshell);
default:
@ -198,12 +199,12 @@ tok_t tokenizer_t::read_string() {
} else if (c == L'}') {
if (expecting.size() > 0 && expecting.back() == L')') {
return this->call_error(tokenizer_error_t::expected_pclose_found_bclose,
this->start, this->buff, 1);
this->start, this->token_cursor, 1);
}
switch (brace_offsets.size()) {
case 0:
return this->call_error(tokenizer_error_t::closing_unopened_brace, this->buff,
this->start + wcslen(this->start));
return this->call_error(tokenizer_error_t::closing_unopened_brace,
this->token_cursor, this->start + wcslen(this->start));
case 1:
mode &= ~(tok_modes::curly_braces);
default:
@ -211,9 +212,9 @@ tok_t tokenizer_t::read_string() {
}
expecting.pop_back();
} else if (c == L'[') {
if (this->buff != buff_start) {
if (this->token_cursor != buff_start) {
mode |= tok_modes::array_brackets;
slice_offset = this->buff - this->start;
slice_offset = this->token_cursor - this->start;
} else {
// This is actually allowed so the test operator `[` can be used as the head of a
// command
@ -225,12 +226,12 @@ tok_t tokenizer_t::read_string() {
else if (c == L']' && ((mode & tok_modes::array_brackets) == tok_modes::array_brackets)) {
mode &= ~(tok_modes::array_brackets);
} else if (c == L'\'' || c == L'"') {
const wchar_t *end = quote_end(this->buff);
const wchar_t *end = quote_end(this->token_cursor);
if (end) {
this->buff = end;
this->token_cursor = end;
} else {
const wchar_t *error_loc = this->buff;
this->buff += std::wcslen(this->buff);
const wchar_t *error_loc = this->token_cursor;
this->token_cursor += std::wcslen(this->token_cursor);
if ((!this->accept_unfinished)) {
return this->call_error(tokenizer_error_t::unterminated_quote, buff_start,
error_loc);
@ -250,14 +251,14 @@ tok_t tokenizer_t::read_string() {
FLOGF(error, msg.c_str(), c, c, int(mode_begin), int(mode));
#endif
this->buff++;
this->token_cursor++;
is_first = false;
}
if ((!this->accept_unfinished) && (mode != tok_modes::regular_text)) {
if (mode & tok_modes::char_escape) {
return this->call_error(tokenizer_error_t::unterminated_escape, buff_start,
this->buff - 1, 1);
this->token_cursor - 1, 1);
} else if (mode & tok_modes::array_brackets) {
return this->call_error(tokenizer_error_t::unterminated_slice, buff_start,
this->start + slice_offset);
@ -280,7 +281,7 @@ tok_t tokenizer_t::read_string() {
tok_t result(token_type_t::string);
result.offset = buff_start - this->start;
result.length = this->buff - buff_start;
result.length = this->token_cursor - buff_start;
return result;
}
@ -492,25 +493,27 @@ maybe_t<tok_t> tokenizer_t::next() {
// it.
bool preceding_escaped_nl = false;
for (;;) {
if (this->buff[0] == L'\\' && this->buff[1] == L'\n') {
this->buff += 2;
if (this->token_cursor[0] == L'\\' && this->token_cursor[1] == L'\n') {
this->token_cursor += 2;
this->continue_line_after_comment = true;
preceding_escaped_nl = true;
} else if (iswspace_not_nl(this->buff[0])) {
this->buff++;
} else if (iswspace_not_nl(this->token_cursor[0])) {
this->token_cursor++;
} else {
break;
}
}
while (*this->buff == L'#') {
while (*this->token_cursor == L'#') {
// We have a comment, walk over the comment.
const wchar_t *comment_start = this->buff;
while (this->buff[0] != L'\n' && this->buff[0] != L'\0') this->buff++;
size_t comment_len = this->buff - comment_start;
const wchar_t *comment_start = this->token_cursor;
while (this->token_cursor[0] != L'\n' && this->token_cursor[0] != L'\0')
this->token_cursor++;
size_t comment_len = this->token_cursor - comment_start;
// If we are going to continue after the comment, skip any trailing newline.
if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
if (this->token_cursor[0] == L'\n' && this->continue_line_after_comment)
this->token_cursor++;
// Maybe return the comment.
if (this->show_comments) {
@ -520,15 +523,15 @@ maybe_t<tok_t> tokenizer_t::next() {
result.preceding_escaped_nl = preceding_escaped_nl;
return result;
}
while (iswspace_not_nl(this->buff[0])) this->buff++;
while (iswspace_not_nl(this->token_cursor[0])) this->token_cursor++;
}
// We made it past the comments and ate any trailing newlines we wanted to ignore.
this->continue_line_after_comment = false;
const size_t start_pos = this->buff - this->start;
const size_t start_pos = this->token_cursor - this->start;
maybe_t<tok_t> result{};
switch (*this->buff) {
switch (*this->token_cursor) {
case L'\0': {
this->has_next = false;
return none();
@ -539,60 +542,60 @@ maybe_t<tok_t> tokenizer_t::next() {
result.emplace(token_type_t::end);
result->offset = start_pos;
result->length = 1;
this->buff++;
this->token_cursor++;
// Hack: when we get a newline, swallow as many as we can. This compresses multiple
// subsequent newlines into a single one.
if (!this->show_blank_lines) {
while (*this->buff == L'\n' || *this->buff == 13 /* CR */ || *this->buff == ' ' ||
*this->buff == '\t') {
this->buff++;
while (*this->token_cursor == L'\n' || *this->token_cursor == 13 /* CR */ ||
*this->token_cursor == ' ' || *this->token_cursor == '\t') {
this->token_cursor++;
}
}
break;
}
case L'&': {
if (this->buff[1] == L'&') {
if (this->token_cursor[1] == L'&') {
// && is and.
result.emplace(token_type_t::andand);
result->offset = start_pos;
result->length = 2;
this->buff += 2;
} else if (this->buff[1] == L'>' || this->buff[1] == L'|') {
this->token_cursor += 2;
} else if (this->token_cursor[1] == L'>' || this->token_cursor[1] == L'|') {
// &> and &| redirect both stdout and stderr.
auto redir = pipe_or_redir_t::from_string(buff);
auto redir = pipe_or_redir_t::from_string(this->token_cursor);
assert(redir.has_value() &&
"Should always succeed to parse a &> or &| redirection");
result.emplace(redir->token_type());
result->offset = start_pos;
result->length = redir->consumed;
this->buff += redir->consumed;
this->token_cursor += redir->consumed;
} else {
result.emplace(token_type_t::background);
result->offset = start_pos;
result->length = 1;
this->buff++;
this->token_cursor++;
}
break;
}
case L'|': {
if (this->buff[1] == L'|') {
if (this->token_cursor[1] == L'|') {
// || is or.
result.emplace(token_type_t::oror);
result->offset = start_pos;
result->length = 2;
this->buff += 2;
} else if (this->buff[1] == L'&') {
this->token_cursor += 2;
} else if (this->token_cursor[1] == L'&') {
// |& is a bashism; in fish it's &|.
return this->call_error(tokenizer_error_t::invalid_pipe_ampersand, this->buff,
this->buff, 2);
return this->call_error(tokenizer_error_t::invalid_pipe_ampersand,
this->token_cursor, this->token_cursor, 2);
} else {
auto pipe = pipe_or_redir_t::from_string(buff);
auto pipe = pipe_or_redir_t::from_string(this->token_cursor);
assert(pipe.has_value() && pipe->is_pipe &&
"Should always succeed to parse a | pipe");
result.emplace(pipe->token_type());
result->offset = start_pos;
result->length = pipe->consumed;
this->buff += pipe->consumed;
this->token_cursor += pipe->consumed;
}
break;
}
@ -601,23 +604,24 @@ maybe_t<tok_t> tokenizer_t::next() {
// There's some duplication with the code in the default case below. The key
// difference here is that we must never parse these as a string; a failed
// redirection is an error!
auto redir_or_pipe = pipe_or_redir_t::from_string(this->buff);
auto redir_or_pipe = pipe_or_redir_t::from_string(this->token_cursor);
if (!redir_or_pipe || redir_or_pipe->fd < 0) {
return this->call_error(tokenizer_error_t::invalid_redirect, this->buff, this->buff,
return this->call_error(tokenizer_error_t::invalid_redirect, this->token_cursor,
this->token_cursor,
redir_or_pipe ? redir_or_pipe->consumed : 0);
}
result.emplace(redir_or_pipe->token_type());
result->offset = start_pos;
result->length = redir_or_pipe->consumed;
this->buff += redir_or_pipe->consumed;
this->token_cursor += redir_or_pipe->consumed;
break;
}
default: {
// Maybe a redirection like '2>&1', maybe a pipe like 2>|, maybe just a string.
const wchar_t *error_location = this->buff;
const wchar_t *error_location = this->token_cursor;
maybe_t<pipe_or_redir_t> redir_or_pipe{};
if (iswdigit(*this->buff) || (*this->buff == L'^' && caret_redirs())) {
redir_or_pipe = pipe_or_redir_t::from_string(this->buff);
if (iswdigit(*this->token_cursor) || (*this->token_cursor == L'^' && caret_redirs())) {
redir_or_pipe = pipe_or_redir_t::from_string(this->token_cursor);
}
if (redir_or_pipe) {
@ -631,7 +635,7 @@ maybe_t<tok_t> tokenizer_t::next() {
result.emplace(redir_or_pipe->token_type());
result->offset = start_pos;
result->length = redir_or_pipe->consumed;
this->buff += redir_or_pipe->consumed;
this->token_cursor += redir_or_pipe->consumed;
} else {
// Not a redirection or pipe, so just a string.
result = this->read_string();

View File

@ -103,7 +103,7 @@ class tokenizer_t {
void operator=(const tokenizer_t &) = delete;
/// A pointer into the original string, showing where the next token begins.
const wchar_t *buff;
const wchar_t *token_cursor;
/// The start of the original string.
const wchar_t *const start;
/// Whether we have additional tokens.