Rationalize how the parser reports tokenizer errors

Remove the unnecessary TOK_SQUASH_ERRORS flag and correctly report errors generated by the tokenizer.
2025-03-27 14:45:13 +08:00 · 2018-02-23 17:28:12 -08:00 · 2018-02-23 17:28:12 -08:00 · c4d903ff98
commit c4d903ff98
parent 0950c35eb2
5 changed files with 42 additions and 49 deletions
--- a/src/parse_tree.cpp
+++ b/src/parse_tree.cpp
@ -374,8 +374,8 @@ class parse_ll_t {
    void parse_error_unexpected_token(const wchar_t *expected, parse_token_t token);
    void parse_error(parse_token_t token, parse_error_code_t code, const wchar_t *format, ...);
-    void parse_error_at_location(size_t location, parse_error_code_t code, const wchar_t *format,
+    void parse_error_at_location(size_t source_start, size_t source_length, size_t error_location,
-                                 ...);
+                                 parse_error_code_t code, const wchar_t *format, ...);
    void parse_error_failed_production(struct parse_stack_element_t &elem, parse_token_t token);
    void parse_error_unbalancing_token(parse_token_t token);
@ -608,7 +608,8 @@ void parse_ll_t::parse_error(parse_token_t token, parse_error_code_t code, const
    }
 }
-void parse_ll_t::parse_error_at_location(size_t source_location, parse_error_code_t code,
+void parse_ll_t::parse_error_at_location(size_t source_start, size_t source_length,
                                         size_t error_location, parse_error_code_t code,
                                         const wchar_t *fmt, ...) {
    this->fatal_errored = true;
    if (this->should_generate_error_messages) {
@ -621,8 +622,8 @@ void parse_ll_t::parse_error_at_location(size_t source_location, parse_error_cod
        err.code = code;
        va_end(va);
-        err.source_start = source_location;
+        err.source_start = source_start;
-        err.source_length = 0;
+        err.source_length = source_length;
        this->errors.push_back(err);
    }
 }
@ -733,8 +734,10 @@ void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_
            break;
        }
    }
-    this->parse_error_at_location(tok.offset + tok.error_offset, parse_error_code, L"%ls",
+
-                                  tokenizer.text_of(tok).c_str());
+    this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset,
                                  parse_error_code, L"%ls",
                                  error_message_for_code(tok.error).c_str());
 }
 void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) {
@ -811,8 +814,9 @@ bool parse_ll_t::report_error_for_unclosed_block() {
    }
    if (cursor->source_start != NODE_OFFSET_INVALID) {
        const wcstring node_desc = block_type_user_presentable_description(block_node->type);
-        this->parse_error_at_location(cursor->source_start, parse_error_generic,
+        this->parse_error_at_location(cursor->source_start, 0, cursor->source_start,
-                                      L"Missing end to balance this %ls", node_desc.c_str());
+                                      parse_error_generic, L"Missing end to balance this %ls",
                                      node_desc.c_str());
        reported_error = true;
    }
    return reported_error;
@ -1098,8 +1102,6 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
    if (parse_flags & parse_flag_show_blank_lines) tok_options |= TOK_SHOW_BLANK_LINES;
    if (errors == NULL) tok_options |= TOK_SQUASH_ERRORS;
    tokenizer_t tok(str.c_str(), tok_options);
    // We are an LL(2) parser. We pass two tokens at a time. New tokens come in at index 1. Seed our
--- a/src/parse_util.cpp
+++ b/src/parse_util.cpp
@ -371,7 +371,7 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
    const wcstring buffcpy = wcstring(cmdsubst_begin, cmdsubst_end - cmdsubst_begin);
-    tokenizer_t tok(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SQUASH_ERRORS);
+    tokenizer_t tok(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED);
    tok_t token;
    while (tok.next(&token)) {
        size_t tok_begin = token.offset;
@ -474,7 +474,7 @@ void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_
    size_t prev_pos = 0;
    wchar_t last_quote = L'\0';
-    tokenizer_t tok(cmd.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SQUASH_ERRORS);
+    tokenizer_t tok(cmd.c_str(), TOK_ACCEPT_UNFINISHED);
    tok_t token;
    while (tok.next(&token)) {
        if (token.offset > pos) break;
--- a/src/reader.cpp
+++ b/src/reader.cpp
@ -2316,7 +2316,7 @@ static wchar_t unescaped_quote(const wcstring &str, size_t pos) {
 /// Returns true if the last token is a comment.
 static bool text_ends_in_comment(const wcstring &text) {
-    tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS | TOK_SQUASH_ERRORS);
+    tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS);
    tok_t token;
    while (tok.next(&token)) {
        ;  // pass
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@ -34,6 +34,26 @@
 /// Error string for when trying to pipe from fd 0.
 #define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output")
/// Returns the error message string corresponding to a tokenizer error code.
///
/// \param err the tokenizer error code to describe.
/// \return the message macro associated with \p err (e.g. PIPE_ERROR for TOK_INVALID_PIPE). For a
/// code not listed below, the assertion fires; if assertions are compiled out, an empty string is
/// returned instead.
///
/// NOTE(review): PIPE_ERROR is wrapped in _(), presumably a translation lookup, and the other
/// message macros may be as well - confirm this is safe for every caller's thread.
wcstring error_message_for_code(tokenizer_error err) {
    switch (err) {
        case TOK_UNTERMINATED_QUOTE:
            return QUOTE_ERROR;
        case TOK_UNTERMINATED_SUBSHELL:
            return PARAN_ERROR;
        case TOK_UNTERMINATED_SLICE:
            return SQUARE_BRACKET_ERROR;
        case TOK_UNTERMINATED_ESCAPE:
            return UNTERMINATED_ESCAPE_ERROR;
        case TOK_INVALID_REDIRECT:
            return REDIRECT_ERROR;
        case TOK_INVALID_PIPE:
            return PIPE_ERROR;
        default:
            // Any code without a message above is treated as a programming error.
            assert(0 && "Unknown error type");
            // Reached only when assert() is disabled; yields an empty message.
            return {};
    }
 }
 /// Return an error token and mark that we no longer have a next token.
 tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start,
                              const wchar_t *error_loc) {
@ -49,30 +69,6 @@ tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *to
    result.offset = token_start - this->start;
    result.length = this->buff - token_start;
    result.error_offset = error_loc - token_start;
    if (!this->squash_errors) {
        switch (error_type) {
            case TOK_UNTERMINATED_QUOTE:
                result.error_text = QUOTE_ERROR;
                break;
            case TOK_UNTERMINATED_SUBSHELL:
                result.error_text = PARAN_ERROR;
                break;
            case TOK_UNTERMINATED_SLICE:
                result.error_text = SQUARE_BRACKET_ERROR;
                break;
            case TOK_UNTERMINATED_ESCAPE:
                result.error_text = UNTERMINATED_ESCAPE_ERROR;
                break;
            case TOK_INVALID_REDIRECT:
                result.error_text = REDIRECT_ERROR;
                break;
            case TOK_INVALID_PIPE:
                result.error_text = PIPE_ERROR;
                break;
            default:
                assert(0 && "Unknown error type");
        }
    }
    return result;
 }
@ -81,7 +77,6 @@ tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start),
    this->accept_unfinished = static_cast<bool>(flags & TOK_ACCEPT_UNFINISHED);
    this->show_comments = static_cast<bool>(flags & TOK_SHOW_COMMENTS);
    this->squash_errors = static_cast<bool>(flags & TOK_SQUASH_ERRORS);
    this->show_blank_lines = static_cast<bool>(flags & TOK_SHOW_BLANK_LINES);
 }
@ -590,7 +585,7 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
 }
 wcstring tok_first(const wcstring &str) {
-    tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS);
+    tokenizer_t t(str.c_str(), 0);
    tok_t token;
    if (t.next(&token) && token.type == TOK_STRING) {
        return t.text_of(token);
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@ -46,13 +46,9 @@ enum class redirection_type_t {
 /// Flag telling the tokenizer not to remove comments. Useful for syntax highlighting.
 #define TOK_SHOW_COMMENTS 2
 /// Flag telling the tokenizer to not generate error messages, which we need to do when tokenizing
 /// off of the main thread (since wgettext is not thread safe).
 #define TOK_SQUASH_ERRORS 4
 /// Ordinarily, the tokenizer ignores newlines following a newline, or a semicolon. This flag tells
 /// the tokenizer to return each of them as a separate END.
-#define TOK_SHOW_BLANK_LINES 8
+#define TOK_SHOW_BLANK_LINES 4
 typedef unsigned int tok_flags_t;
@ -70,11 +66,10 @@ struct tok_t {
    // If an error, this is the error code.
    enum tokenizer_error error { TOK_ERROR_NONE };
    // If an error, this is the offset of the error within the token. A value of 0 means it occurred
    // at 'offset'.
    size_t error_offset{size_t(-1)};
    // If there is an error, the text of the error; otherwise empty.
    wcstring error_text{};
    tok_t() = default;
 };
@ -97,8 +92,6 @@ class tokenizer_t {
    bool show_comments{false};
    /// Whether all blank lines are returned.
    bool show_blank_lines{false};
    /// Whether we are squashing errors.
    bool squash_errors{false};
    /// Whether to continue the previous line after the comment.
    bool continue_line_after_comment{false};
@ -145,6 +138,9 @@ int fd_redirected_by_pipe(const wcstring &str);
 /// Helper function to return oflags (as in open(2)) for a redirection type.
 int oflags_for_redirection_type(redirection_type_t type);
 /// Returns an error message for an error code.
 wcstring error_message_for_code(tokenizer_error err);
 enum move_word_style_t {
    move_word_style_punctuation,      // stop at punctuation
    move_word_style_path_components,  // stops at path components