Clean up tokenizer implementation

Rather than storing a bunch of "next_foo" fields, simply populate the tok_t directly.

This commit is contained in:
parent  e9a4875a6b
commit  6673fe5457
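The shape of the change, as a minimal standalone sketch (hypothetical names, not the actual fish API): instead of each read routine scattering its results across mutable "last_*" members that a later step copies out, it builds and returns a complete token value.

    // Before: results smeared across members that must be kept in sync.
    class scanner_v1 {
        size_t last_pos{0};
        std::wstring last_token;
        token_type last_type{TOK_NONE};
        // ...next() copies these into the caller's tok_t afterwards.
    };

    // After: the read routine populates and returns the value directly.
    tok_t read_word(const wchar_t *start, const wchar_t *begin, const wchar_t *end) {
        tok_t result;
        result.type = TOK_STRING;
        result.offset = begin - start;  // position within the source buffer
        result.length = end - begin;    // no separate member state to sync
        return result;
    }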
@@ -143,10 +143,10 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
     tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED);
     tok_t token;
     while (tok.next(&token)) {
-        if ((cut_at_cursor) && (token.offset + token.text.size() >= pos)) break;
+        if ((cut_at_cursor) && (token.offset + token.length >= pos)) break;

         if (token.type == TOK_STRING) {
-            wcstring tmp = token.text;
+            wcstring tmp = tok.text_of(token);
             unescape_string_in_place(&tmp, UNESCAPE_INCOMPLETE);
             out.append(tmp);
             out.push_back(L'\n');
@@ -519,14 +519,14 @@ static void test_tokenizer() {
     do_test(token.type == TOK_STRING);
     do_test(token.offset == 0);
     do_test(token.length == 5);
-    do_test(token.text == L"alpha");
+    do_test(t.text_of(token) == L"alpha");

     got = t.next(&token);  // beta
     do_test(got);
     do_test(token.type == TOK_STRING);
     do_test(token.offset == 6);
     do_test(token.length == 4);
-    do_test(token.text == L"beta");
+    do_test(t.text_of(token) == L"beta");

     got = t.next(&token);
     do_test(!got);
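The rewritten assertions state the new invariant directly: a token carries only its (offset, length) span, and its text is reconstituted from the source buffer on demand. A sketch of that relationship, mirroring the text_of definition that appears later in this diff:

    // For the input L"alpha beta", the second token has offset 6 and
    // length 4, so rebuilding its text from the buffer yields L"beta".
    wcstring text_of_sketch(const wchar_t *start, const tok_t &tok) {
        return wcstring(start + tok.offset, tok.length);
    }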
@@ -477,7 +477,7 @@ class parse_ll_t {
     void accept_tokens(parse_token_t token1, parse_token_t token2);

     /// Report tokenizer errors.
-    void report_tokenizer_error(const tok_t &tok);
+    void report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok);

     /// Indicate if we hit a fatal error.
     bool has_fatal_error() const { return this->fatal_errored; }
@@ -711,7 +711,7 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta
     }
 }

-void parse_ll_t::report_tokenizer_error(const tok_t &tok) {
+void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) {
     parse_error_code_t parse_error_code;
     switch (tok.error) {
         case TOK_UNTERMINATED_QUOTE: {
@@ -738,7 +738,7 @@ void parse_ll_t::report_tokenizer_error(const tok_t &tok) {
         }
     }
     this->parse_error_at_location(tok.offset + tok.error_offset, parse_error_code, L"%ls",
-                                  tok.text.c_str());
+                                  tokenizer.text_of(tok).c_str());
 }

 void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) {
@@ -1067,10 +1067,11 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token) {
     // this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it
     // even starts to look like a feature.
     result.type = parse_token_type_from_tokenizer_token(token->type);
-    result.keyword = keyword_for_token(token->type, token->text);
-    result.has_dash_prefix = !token->text.empty() && token->text.at(0) == L'-';
-    result.is_help_argument = result.has_dash_prefix && is_help_argument(token->text);
-    result.is_newline = (result.type == parse_token_type_end && token->text == L"\n");
+    wcstring text = tok->text_of(*token);
+    result.keyword = keyword_for_token(token->type, text);
+    result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
+    result.is_help_argument = result.has_dash_prefix && is_help_argument(text);
+    result.is_newline = (result.type == parse_token_type_end && text == L"\n");

     // These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
     // uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
@@ -1128,7 +1129,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
         // Handle tokenizer errors. This is a hack because really the parser should report this for
         // itself; but it has no way of getting the tokenizer message.
         if (queue[1].type == parse_special_type_tokenizer_error) {
-            parser.report_tokenizer_error(tokenizer_token);
+            parser.report_tokenizer_error(tok, tokenizer_token);
         }

         if (!parser.has_fatal_error()) {
@@ -379,7 +379,7 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar

         // Calculate end of token.
         if (token.type == TOK_STRING) {
-            tok_end += token.text.size();
+            tok_end += token.length;
         }

         // Cursor was before beginning of this token, means that the cursor is between two tokens,
@@ -393,14 +393,14 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
         // and break.
         if (token.type == TOK_STRING && tok_end >= offset_within_cmdsubst) {
             a = cmdsubst_begin + token.offset;
-            b = a + token.text.size();
+            b = a + token.length;
             break;
         }

         // Remember previous string token.
         if (token.type == TOK_STRING) {
             pa = cmdsubst_begin + token.offset;
-            pb = pa + token.text.size();
+            pb = pa + token.length;
         }
     }

@@ -479,7 +479,8 @@ void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_
     while (tok.next(&token)) {
         if (token.offset > pos) break;

-        if (token.type == TOK_STRING) last_quote = get_quote(token.text, pos - token.offset);
+        if (token.type == TOK_STRING)
+            last_quote = get_quote(tok.text_of(token), pos - token.offset);

         if (out_type != NULL) *out_type = token.type;

@@ -1744,13 +1744,14 @@ static void handle_token_history(history_search_direction_t dir, bool reset = fa
         tok_t token;
         while (tok.next(&token)) {
             if (token.type != TOK_STRING) continue;
-            if (token.text.find(data->search_buff) == wcstring::npos) continue;
+            wcstring text = tok.text_of(token);
+            if (text.find(data->search_buff) == wcstring::npos) continue;
             if (token.offset >= current_pos) continue;

-            auto found = find(data->search_prev.begin(), data->search_prev.end(), token.text);
+            auto found = find(data->search_prev.begin(), data->search_prev.end(), text);
             if (found == data->search_prev.end()) {
                 data->token_history_pos = token.offset;
-                str = token.text;
+                str = text;
             }
         }
     }
@@ -34,39 +34,46 @@
 /// Error string for when trying to pipe from fd 0.
 #define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output")

-/// Set the latest tokens string to be the specified error message.
-void tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *where) {
+/// Return an error token and mark that we no longer have a next token.
+tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start,
+                              const wchar_t *error_loc) {
     assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error");
-    this->last_type = TOK_ERROR;
-    this->error = error_type;
+    assert(error_loc >= token_start && "Invalid error location");
+    assert(this->buff >= token_start && "Invalid buff location");
+
     this->has_next = false;
-    this->global_error_offset = where ? where - this->start : 0;
-    if (this->squash_errors) {
-        this->last_token.clear();
-    } else {
+
+    tok_t result;
+    result.type = TOK_ERROR;
+    result.error = error_type;
+    result.offset = token_start - this->start;
+    result.length = this->buff - token_start;
+    result.error_offset = error_loc - token_start;
+    if (!this->squash_errors) {
         switch (error_type) {
             case TOK_UNTERMINATED_QUOTE:
-                this->last_token = QUOTE_ERROR;
+                result.error_text = QUOTE_ERROR;
                 break;
             case TOK_UNTERMINATED_SUBSHELL:
-                this->last_token = PARAN_ERROR;
+                result.error_text = PARAN_ERROR;
                 break;
             case TOK_UNTERMINATED_SLICE:
-                this->last_token = SQUARE_BRACKET_ERROR;
+                result.error_text = SQUARE_BRACKET_ERROR;
                 break;
             case TOK_UNTERMINATED_ESCAPE:
-                this->last_token = UNTERMINATED_ESCAPE_ERROR;
+                result.error_text = UNTERMINATED_ESCAPE_ERROR;
                 break;
             case TOK_INVALID_REDIRECT:
-                this->last_token = REDIRECT_ERROR;
+                result.error_text = REDIRECT_ERROR;
                 break;
             case TOK_INVALID_PIPE:
-                this->last_token = PIPE_ERROR;
+                result.error_text = PIPE_ERROR;
                 break;
             default:
                 assert(0 && "Unknown error type");
         }
     }
+    return result;
 }

 tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
@@ -80,34 +87,11 @@ tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start),

 bool tokenizer_t::next(struct tok_t *result) {
     assert(result != NULL);
-    if (!this->tok_next()) {
+    maybe_t<tok_t> tok = this->tok_next();
+    if (!tok) {
         return false;
     }
-
-    const size_t current_pos = this->buff - this->start;
-
-    // We want to copy our last_token into result->text. If we just do this naively via =, we are
-    // liable to trigger std::string's CoW implementation: result->text's storage will be
-    // deallocated and instead will acquire a reference to last_token's storage. But last_token will
-    // be overwritten soon, which will trigger a new allocation and a copy. So our attempt to re-use
-    // result->text's storage will have failed. To ensure that doesn't happen, use assign() with
-    // wchar_t.
-    result->text.assign(this->last_token.data(), this->last_token.size());
-
-    result->type = this->last_type;
-    result->offset = this->last_pos;
-    result->error = this->last_type == TOK_ERROR ? this->error : TOK_ERROR_NONE;
-    assert(this->buff >= this->start);
-
-    // Compute error offset.
-    result->error_offset = 0;
-    if (this->last_type == TOK_ERROR && this->global_error_offset >= this->last_pos &&
-        this->global_error_offset < current_pos) {
-        result->error_offset = this->global_error_offset - this->last_pos;
-    }
-
-    assert(this->buff >= this->start);
-    result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0;
+    *result = std::move(*tok);
     return true;
 }
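next() shrinks to a thin adapter because tok_next() now returns maybe_t<tok_t>: a single value that is either a complete token or nothing, replacing the old bool-return-plus-member-state protocol and making the long CoW workaround above unnecessary. Assuming maybe_t behaves like a basic optional (which is what fish's maybe.h provides), the consuming shape is:

    // Hypothetical caller shape; in the real class tok_next() is private
    // and only next() calls it.
    while (maybe_t<tok_t> tok = tok_next()) {
        tok_t token = std::move(*tok);  // the whole result travels as one value
        // ...use token.type, token.offset, token.length...
    }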
@@ -143,9 +127,8 @@ static bool tok_is_string_character(wchar_t c, bool is_first) {
 static int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }

 /// Read the next token as a string.
-void tokenizer_t::read_string() {
-    long len;
-    int do_loop = 1;
+tok_t tokenizer_t::read_string() {
+    bool do_loop = true;
     size_t paran_count = 0;
     // Up to 96 open parens, before we give up on good error reporting.
     const size_t paran_offsets_max = 96;
@@ -170,8 +153,8 @@ void tokenizer_t::read_string() {
                 this->buff++;
                 if (*this->buff == L'\0') {
                     if ((!this->accept_unfinished)) {
-                        this->call_error(TOK_UNTERMINATED_ESCAPE, error_location);
-                        return;
+                        return this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
+                                                error_location);
                     }
                     // Since we are about to increment tok->buff, decrement it first so the
                     // increment doesn't go past the end of the buffer. See issue #389.
@@ -209,8 +192,8 @@ void tokenizer_t::read_string() {
                     this->buff += wcslen(this->buff);

                     if (!this->accept_unfinished) {
-                        this->call_error(TOK_UNTERMINATED_QUOTE, error_loc);
-                        return;
+                        return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
+                                                error_loc);
                     }
                     do_loop = 0;
                 }
@@ -238,8 +221,8 @@ void tokenizer_t::read_string() {
                     const wchar_t *error_loc = this->buff;
                     this->buff += wcslen(this->buff);
                     if ((!this->accept_unfinished)) {
-                        this->call_error(TOK_UNTERMINATED_QUOTE, error_loc);
-                        return;
+                        return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
+                                                error_loc);
                     }
                     do_loop = 0;
                 }
@@ -305,6 +288,7 @@ void tokenizer_t::read_string() {
     }

     if ((!this->accept_unfinished) && (mode != mode_regular_text)) {
+        tok_t error;
         switch (mode) {
             case mode_subshell: {
                 // Determine the innermost opening paran offset by interrogating paran_offsets.
@@ -314,12 +298,14 @@ void tokenizer_t::read_string() {
                     offset_of_open_paran = paran_offsets[paran_count - 1];
                 }

-                this->call_error(TOK_UNTERMINATED_SUBSHELL, this->start + offset_of_open_paran);
+                error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
+                                         this->start + offset_of_open_paran);
                 break;
             }
             case mode_array_brackets:
             case mode_array_brackets_and_subshell: {
-                this->call_error(TOK_UNTERMINATED_SLICE, this->start + offset_of_bracket);
+                error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
+                                         this->start + offset_of_bracket);
                 break;
             }
             default: {
@@ -327,13 +313,14 @@ void tokenizer_t::read_string() {
                 break;
             }
         }
-        return;
+        return error;
     }

-    len = this->buff - buff_start;
-
-    this->last_token.assign(buff_start, len);
-    this->last_type = TOK_STRING;
+    tok_t result;
+    result.type = TOK_STRING;
+    result.offset = buff_start - this->start;
+    result.length = this->buff - buff_start;
+    return result;
 }

 /// Reads a redirection or an "fd pipe" (like 2>|) from a string. Returns how many characters were
@@ -482,9 +469,9 @@ static bool iswspace_not_nl(wchar_t c) {
     }
 }

-bool tokenizer_t::tok_next() {
+maybe_t<tok_t> tokenizer_t::tok_next() {
     if (!this->has_next) {
-        return false;
+        return none();
     }

     // Consume non-newline whitespace. If we get an escaped newline, mark it and continue past it.
@@ -510,30 +497,31 @@ bool tokenizer_t::tok_next() {

         // Maybe return the comment.
         if (this->show_comments) {
-            this->last_pos = comment_start - this->start;
-            this->last_token.assign(comment_start, comment_len);
-            this->last_type = TOK_COMMENT;
-            return true;
+            tok_t result;
+            result.type = TOK_COMMENT;
+            result.offset = comment_start - this->start;
+            result.length = comment_len;
+            return result;
         }
         while (iswspace_not_nl(this->buff[0])) this->buff++;
     }

     // We made it past the comments and ate any trailing newlines we wanted to ignore.
     this->continue_line_after_comment = false;
-    this->last_pos = this->buff - this->start;
+    size_t start_pos = this->buff - this->start;

+    tok_t result;
+    result.offset = start_pos;
     switch (*this->buff) {
         case L'\0': {
-            this->last_type = TOK_END;
             this->has_next = false;
-            this->last_token.clear();
-            return false;
+            return none();
         }
         case L'\r':  // carriage-return
         case L'\n':  // newline
         case L';': {
-            this->last_type = TOK_END;
-            this->last_token.assign(1, *this->buff);
+            result.type = TOK_END;
+            result.length = 1;
             this->buff++;
             // Hack: when we get a newline, swallow as many as we can. This compresses multiple
             // subsequent newlines into a single one.
@@ -546,13 +534,15 @@ bool tokenizer_t::tok_next() {
             break;
         }
         case L'&': {
-            this->last_type = TOK_BACKGROUND;
+            result.type = TOK_BACKGROUND;
+            result.length = 1;
             this->buff++;
             break;
         }
         case L'|': {
-            this->last_token = L"1";
-            this->last_type = TOK_PIPE;
+            result.type = TOK_PIPE;
+            result.redirected_fd = 1;
+            result.length = 1;
             this->buff++;
             break;
         }
@@ -565,12 +555,12 @@ bool tokenizer_t::tok_next() {
             int fd = -1;
             size_t consumed = read_redirection_or_fd_pipe(this->buff, &mode, &fd);
             if (consumed == 0 || fd < 0) {
-                this->call_error(TOK_INVALID_REDIRECT, this->buff);
-            } else {
-                this->buff += consumed;
-                this->last_type = mode;
-                this->last_token = to_string(fd);
+                return this->call_error(TOK_INVALID_REDIRECT, this->buff, this->buff);
             }
+            result.type = mode;
+            result.redirected_fd = fd;
+            result.length = consumed;
+            this->buff += consumed;
             break;
         }
         default: {
@@ -588,30 +578,29 @@ bool tokenizer_t::tok_next() {
                 // that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer
                 // error.
                 if (mode == TOK_PIPE && fd == 0) {
-                    this->call_error(TOK_INVALID_PIPE, error_location);
-                } else {
-                    this->buff += consumed;
-                    this->last_type = mode;
-                    this->last_token = to_string(fd);
+                    return this->call_error(TOK_INVALID_PIPE, error_location, error_location);
                 }
+                result.type = mode;
+                result.redirected_fd = fd;
+                result.length = consumed;
+                this->buff += consumed;
             } else {
                 // Not a redirection or pipe, so just a string.
-                this->read_string();
+                result = this->read_string();
             }
             break;
         }
     }
-    return true;
+    return result;
 }

 wcstring tok_first(const wcstring &str) {
-    wcstring result;
-    tokenizer_t t(str.data(), TOK_SQUASH_ERRORS);
+    tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS);
     tok_t token;
     if (t.next(&token) && token.type == TOK_STRING) {
-        result = std::move(token.text);
+        return t.text_of(token);
     }
-    return result;
+    return {};
 }

 bool move_word_state_machine_t::consume_char_punctuation(wchar_t c) {
@@ -6,6 +6,7 @@
 #include <stddef.h>

 #include "common.h"
+#include "maybe.h"

 /// Token types.
 enum token_type {
@@ -52,21 +53,26 @@ enum tokenizer_error {
 typedef unsigned int tok_flags_t;

 struct tok_t {
-    // The text of the token, or an error message for type error.
-    wcstring text;
     // The type of the token.
-    token_type type;
+    token_type type{TOK_NONE};
+
+    // Offset of the token.
+    size_t offset{0};
+    // Length of the token.
+    size_t length{0};
+
+    // If the token represents a redirection, the redirected fd.
+    maybe_t<int> redirected_fd{};
+
     // If an error, this is the error code.
-    enum tokenizer_error error;
+    enum tokenizer_error error { TOK_ERROR_NONE };
     // If an error, this is the offset of the error within the token. A value of 0 means it occurred
     // at 'offset'.
-    size_t error_offset;
-    // Offset of the token.
-    size_t offset;
-    // Length of the token.
-    size_t length;
+    size_t error_offset{size_t(-1)};
+    // If there is an error, the text of the error; otherwise empty.
+    wcstring error_text{};

-    tok_t() : type(TOK_NONE), error(TOK_ERROR_NONE), error_offset(-1), offset(-1), length(-1) {}
+    tok_t() = default;
 };

 /// The tokenizer struct.
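With every member given an in-class initializer, tok_t() = default produces a fully specified value, and tok_t becomes a small, cheaply copyable value type whose text is derived rather than stored. What a default-constructed token now guarantees (a sketch assuming maybe_t is empty by default, as an optional would be):

    tok_t tok;                            // type TOK_NONE, offset 0, length 0
    assert(tok.error == TOK_ERROR_NONE);  // no error until a read routine sets one
    assert(!tok.redirected_fd);           // fd present only for redirections/pipes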
@@ -79,13 +85,7 @@ class tokenizer_t {
     const wchar_t *buff;
     /// The start of the original string.
     const wchar_t *const start;
-    /// The last token.
-    wcstring last_token;
-    /// Type of last token.
-    enum token_type last_type { TOK_NONE };
-    /// Offset of last token.
-    size_t last_pos{0};
-    /// Whether there are more tokens.
+    /// Whether we have additional tokens.
     bool has_next{true};
     /// Whether incomplete tokens are accepted.
     bool accept_unfinished{false};
@@ -93,18 +93,15 @@ class tokenizer_t {
     bool show_comments{false};
     /// Whether all blank lines are returned.
     bool show_blank_lines{false};
-    /// Last error.
-    tokenizer_error error{TOK_ERROR_NONE};
-    /// Last error offset, in "global" coordinates (relative to orig_buff).
-    size_t global_error_offset{size_t(-1)};
     /// Whether we are squashing errors.
     bool squash_errors{false};
     /// Whether to continue the previous line after the comment.
     bool continue_line_after_comment{false};

-    void call_error(enum tokenizer_error error_type, const wchar_t *where);
-    void read_string();
-    bool tok_next();
+    tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start,
+                     const wchar_t *error_loc);
+    tok_t read_string();
+    maybe_t<tok_t> tok_next();

    public:
     /// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
@@ -118,6 +115,9 @@ class tokenizer_t {
     /// Returns the next token by reference. Returns true if we got one, false if we're at the end.
     bool next(struct tok_t *result);

+    /// Returns the text of a token, as a string.
+    wcstring text_of(const tok_t &tok) const { return wcstring(start + tok.offset, tok.length); }
 };

 /// Returns only the first token from the specified string. This is a convenience function, used to
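Putting the public API together, a hedged end-to-end sketch based only on the declarations visible in this diff (dump_tokens is illustrative, not part of fish):

    #include <cstdio>
    #include "tokenizer.h"

    void dump_tokens(const wcstring &cmd) {
        tokenizer_t tok(cmd.c_str(), TOK_ACCEPT_UNFINISHED);
        tok_t token;
        while (tok.next(&token)) {
            // Tokens store only a span; the text is materialized on demand.
            wcstring text = tok.text_of(token);
            fwprintf(stdout, L"%zu +%zu: %ls\n", token.offset, token.length,
                     text.c_str());
        }
    }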