Rename tokenizer_t::buff to token_cursor

That should clarify its role vis-à-vis 'start'.
This commit is contained in:
ridiculousfish 2019-11-08 16:40:15 -08:00
parent 41c42c86e3
commit eb2386f3e3
2 changed files with 64 additions and 60 deletions

View File

@ -63,13 +63,13 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token
const wchar_t *error_loc, maybe_t<size_t> token_length) {
assert(error_type != tokenizer_error_t::none && "tokenizer_error_t::none passed to call_error");
assert(error_loc >= token_start && "Invalid error location");
assert(this->buff >= token_start && "Invalid buff location");
assert(this->token_cursor >= token_start && "Invalid buff location");
// If continue_after_error is set and we have a real token length, then skip past it.
// Otherwise give up.
if (token_length.has_value() && continue_after_error) {
assert(this->buff < error_loc + *token_length && "Unable to continue past error");
this->buff = error_loc + *token_length;
assert(this->token_cursor < error_loc + *token_length && "Unable to continue past error");
this->token_cursor = error_loc + *token_length;
} else {
this->has_next = false;
}
@ -78,12 +78,13 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token
result.error = error_type;
result.offset = token_start - this->start;
// If we are passed a token_length, then use it; otherwise infer it from the buffer.
result.length = token_length ? *token_length : this->buff - token_start;
result.length = token_length ? *token_length : this->token_cursor - token_start;
result.error_offset = error_loc - token_start;
return result;
}
tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags)
: token_cursor(start), start(start) {
assert(start != nullptr && "Invalid start");
this->accept_unfinished = static_cast<bool>(flags & TOK_ACCEPT_UNFINISHED);
@ -145,11 +146,11 @@ tok_t tokenizer_t::read_string() {
std::vector<int> brace_offsets;
std::vector<char> expecting;
int slice_offset = 0;
const wchar_t *const buff_start = this->buff;
const wchar_t *const buff_start = this->token_cursor;
bool is_first = true;
while (true) {
wchar_t c = *this->buff;
wchar_t c = *this->token_cursor;
#if false
wcstring msg = L"Handling 0x%x (%lc)";
tok_mode mode_begin = mode;
@ -173,22 +174,22 @@ tok_t tokenizer_t::read_string() {
else if (c == L'\\') {
mode |= tok_modes::char_escape;
} else if (c == L'(') {
paran_offsets.push_back(this->buff - this->start);
paran_offsets.push_back(this->token_cursor - this->start);
expecting.push_back(L')');
mode |= tok_modes::subshell;
} else if (c == L'{') {
brace_offsets.push_back(this->buff - this->start);
brace_offsets.push_back(this->token_cursor - this->start);
expecting.push_back(L'}');
mode |= tok_modes::curly_braces;
} else if (c == L')') {
if (expecting.size() > 0 && expecting.back() == L'}') {
return this->call_error(tokenizer_error_t::expected_bclose_found_pclose,
this->start, this->buff, 1);
this->start, this->token_cursor, 1);
}
switch (paran_offsets.size()) {
case 0:
return this->call_error(tokenizer_error_t::closing_unopened_subshell,
this->start, this->buff, 1);
this->start, this->token_cursor, 1);
case 1:
mode &= ~(tok_modes::subshell);
default:
@ -198,12 +199,12 @@ tok_t tokenizer_t::read_string() {
} else if (c == L'}') {
if (expecting.size() > 0 && expecting.back() == L')') {
return this->call_error(tokenizer_error_t::expected_pclose_found_bclose,
this->start, this->buff, 1);
this->start, this->token_cursor, 1);
}
switch (brace_offsets.size()) {
case 0:
return this->call_error(tokenizer_error_t::closing_unopened_brace, this->buff,
this->start + wcslen(this->start));
return this->call_error(tokenizer_error_t::closing_unopened_brace,
this->token_cursor, this->start + wcslen(this->start));
case 1:
mode &= ~(tok_modes::curly_braces);
default:
@ -211,9 +212,9 @@ tok_t tokenizer_t::read_string() {
}
expecting.pop_back();
} else if (c == L'[') {
if (this->buff != buff_start) {
if (this->token_cursor != buff_start) {
mode |= tok_modes::array_brackets;
slice_offset = this->buff - this->start;
slice_offset = this->token_cursor - this->start;
} else {
// This is actually allowed so the test operator `[` can be used as the head of a
// command
@ -225,12 +226,12 @@ tok_t tokenizer_t::read_string() {
else if (c == L']' && ((mode & tok_modes::array_brackets) == tok_modes::array_brackets)) {
mode &= ~(tok_modes::array_brackets);
} else if (c == L'\'' || c == L'"') {
const wchar_t *end = quote_end(this->buff);
const wchar_t *end = quote_end(this->token_cursor);
if (end) {
this->buff = end;
this->token_cursor = end;
} else {
const wchar_t *error_loc = this->buff;
this->buff += std::wcslen(this->buff);
const wchar_t *error_loc = this->token_cursor;
this->token_cursor += std::wcslen(this->token_cursor);
if ((!this->accept_unfinished)) {
return this->call_error(tokenizer_error_t::unterminated_quote, buff_start,
error_loc);
@ -250,14 +251,14 @@ tok_t tokenizer_t::read_string() {
FLOGF(error, msg.c_str(), c, c, int(mode_begin), int(mode));
#endif
this->buff++;
this->token_cursor++;
is_first = false;
}
if ((!this->accept_unfinished) && (mode != tok_modes::regular_text)) {
if (mode & tok_modes::char_escape) {
return this->call_error(tokenizer_error_t::unterminated_escape, buff_start,
this->buff - 1, 1);
this->token_cursor - 1, 1);
} else if (mode & tok_modes::array_brackets) {
return this->call_error(tokenizer_error_t::unterminated_slice, buff_start,
this->start + slice_offset);
@ -280,7 +281,7 @@ tok_t tokenizer_t::read_string() {
tok_t result(token_type_t::string);
result.offset = buff_start - this->start;
result.length = this->buff - buff_start;
result.length = this->token_cursor - buff_start;
return result;
}
@ -492,25 +493,27 @@ maybe_t<tok_t> tokenizer_t::next() {
// it.
bool preceding_escaped_nl = false;
for (;;) {
if (this->buff[0] == L'\\' && this->buff[1] == L'\n') {
this->buff += 2;
if (this->token_cursor[0] == L'\\' && this->token_cursor[1] == L'\n') {
this->token_cursor += 2;
this->continue_line_after_comment = true;
preceding_escaped_nl = true;
} else if (iswspace_not_nl(this->buff[0])) {
this->buff++;
} else if (iswspace_not_nl(this->token_cursor[0])) {
this->token_cursor++;
} else {
break;
}
}
while (*this->buff == L'#') {
while (*this->token_cursor == L'#') {
// We have a comment, walk over the comment.
const wchar_t *comment_start = this->buff;
while (this->buff[0] != L'\n' && this->buff[0] != L'\0') this->buff++;
size_t comment_len = this->buff - comment_start;
const wchar_t *comment_start = this->token_cursor;
while (this->token_cursor[0] != L'\n' && this->token_cursor[0] != L'\0')
this->token_cursor++;
size_t comment_len = this->token_cursor - comment_start;
// If we are going to continue after the comment, skip any trailing newline.
if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
if (this->token_cursor[0] == L'\n' && this->continue_line_after_comment)
this->token_cursor++;
// Maybe return the comment.
if (this->show_comments) {
@ -520,15 +523,15 @@ maybe_t<tok_t> tokenizer_t::next() {
result.preceding_escaped_nl = preceding_escaped_nl;
return result;
}
while (iswspace_not_nl(this->buff[0])) this->buff++;
while (iswspace_not_nl(this->token_cursor[0])) this->token_cursor++;
}
// We made it past the comments and ate any trailing newlines we wanted to ignore.
this->continue_line_after_comment = false;
const size_t start_pos = this->buff - this->start;
const size_t start_pos = this->token_cursor - this->start;
maybe_t<tok_t> result{};
switch (*this->buff) {
switch (*this->token_cursor) {
case L'\0': {
this->has_next = false;
return none();
@ -539,60 +542,60 @@ maybe_t<tok_t> tokenizer_t::next() {
result.emplace(token_type_t::end);
result->offset = start_pos;
result->length = 1;
this->buff++;
this->token_cursor++;
// Hack: when we get a newline, swallow as many as we can. This compresses multiple
// subsequent newlines into a single one.
if (!this->show_blank_lines) {
while (*this->buff == L'\n' || *this->buff == 13 /* CR */ || *this->buff == ' ' ||
*this->buff == '\t') {
this->buff++;
while (*this->token_cursor == L'\n' || *this->token_cursor == 13 /* CR */ ||
*this->token_cursor == ' ' || *this->token_cursor == '\t') {
this->token_cursor++;
}
}
break;
}
case L'&': {
if (this->buff[1] == L'&') {
if (this->token_cursor[1] == L'&') {
// && is and.
result.emplace(token_type_t::andand);
result->offset = start_pos;
result->length = 2;
this->buff += 2;
} else if (this->buff[1] == L'>' || this->buff[1] == L'|') {
this->token_cursor += 2;
} else if (this->token_cursor[1] == L'>' || this->token_cursor[1] == L'|') {
// &> and &| redirect both stdout and stderr.
auto redir = pipe_or_redir_t::from_string(buff);
auto redir = pipe_or_redir_t::from_string(this->token_cursor);
assert(redir.has_value() &&
"Should always succeed to parse a &> or &| redirection");
result.emplace(redir->token_type());
result->offset = start_pos;
result->length = redir->consumed;
this->buff += redir->consumed;
this->token_cursor += redir->consumed;
} else {
result.emplace(token_type_t::background);
result->offset = start_pos;
result->length = 1;
this->buff++;
this->token_cursor++;
}
break;
}
case L'|': {
if (this->buff[1] == L'|') {
if (this->token_cursor[1] == L'|') {
// || is or.
result.emplace(token_type_t::oror);
result->offset = start_pos;
result->length = 2;
this->buff += 2;
} else if (this->buff[1] == L'&') {
this->token_cursor += 2;
} else if (this->token_cursor[1] == L'&') {
// |& is a bashism; in fish it's &|.
return this->call_error(tokenizer_error_t::invalid_pipe_ampersand, this->buff,
this->buff, 2);
return this->call_error(tokenizer_error_t::invalid_pipe_ampersand,
this->token_cursor, this->token_cursor, 2);
} else {
auto pipe = pipe_or_redir_t::from_string(buff);
auto pipe = pipe_or_redir_t::from_string(this->token_cursor);
assert(pipe.has_value() && pipe->is_pipe &&
"Should always succeed to parse a | pipe");
result.emplace(pipe->token_type());
result->offset = start_pos;
result->length = pipe->consumed;
this->buff += pipe->consumed;
this->token_cursor += pipe->consumed;
}
break;
}
@ -601,23 +604,24 @@ maybe_t<tok_t> tokenizer_t::next() {
// There's some duplication with the code in the default case below. The key
// difference here is that we must never parse these as a string; a failed
// redirection is an error!
auto redir_or_pipe = pipe_or_redir_t::from_string(this->buff);
auto redir_or_pipe = pipe_or_redir_t::from_string(this->token_cursor);
if (!redir_or_pipe || redir_or_pipe->fd < 0) {
return this->call_error(tokenizer_error_t::invalid_redirect, this->buff, this->buff,
return this->call_error(tokenizer_error_t::invalid_redirect, this->token_cursor,
this->token_cursor,
redir_or_pipe ? redir_or_pipe->consumed : 0);
}
result.emplace(redir_or_pipe->token_type());
result->offset = start_pos;
result->length = redir_or_pipe->consumed;
this->buff += redir_or_pipe->consumed;
this->token_cursor += redir_or_pipe->consumed;
break;
}
default: {
// Maybe a redirection like '2>&1', maybe a pipe like 2>|, maybe just a string.
const wchar_t *error_location = this->buff;
const wchar_t *error_location = this->token_cursor;
maybe_t<pipe_or_redir_t> redir_or_pipe{};
if (iswdigit(*this->buff) || (*this->buff == L'^' && caret_redirs())) {
redir_or_pipe = pipe_or_redir_t::from_string(this->buff);
if (iswdigit(*this->token_cursor) || (*this->token_cursor == L'^' && caret_redirs())) {
redir_or_pipe = pipe_or_redir_t::from_string(this->token_cursor);
}
if (redir_or_pipe) {
@ -631,7 +635,7 @@ maybe_t<tok_t> tokenizer_t::next() {
result.emplace(redir_or_pipe->token_type());
result->offset = start_pos;
result->length = redir_or_pipe->consumed;
this->buff += redir_or_pipe->consumed;
this->token_cursor += redir_or_pipe->consumed;
} else {
// Not a redirection or pipe, so just a string.
result = this->read_string();

View File

@ -103,7 +103,7 @@ class tokenizer_t {
void operator=(const tokenizer_t &) = delete;
/// A pointer into the original string, showing where the next token begins.
const wchar_t *buff;
const wchar_t *token_cursor;
/// The start of the original string.
const wchar_t *const start;
/// Whether we have additional tokens.