fish-shell/src/ast.cpp
Aaron Gyes daf5e11179 Spelling fixes
Found with scspell
2022-10-28 20:10:09 -07:00

1387 lines
55 KiB
C++
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "config.h" // IWYU pragma: keep
#include "ast.h"
#include <algorithm>
#include <array>
#include <cstdarg>
#include <cstdlib>
#include <string>
#include "common.h"
#include "enum_map.h"
#include "flog.h"
#include "parse_constants.h"
#include "parse_tree.h"
#include "tokenizer.h"
#include "wutil.h" // IWYU pragma: keep
namespace {
/// Translate parse tree flags into tokenizer flags.
/// \return the tokenizer flags implied by \p flags; flags with no tokenizer counterpart are
/// dropped.
static tok_flags_t tokenizer_flags_from_parse_flags(parse_tree_flags_t flags) {
    // parse_flag_show_blank_lines is intentionally not translated: no clients are interested in
    // blank lines.
    tok_flags_t result = 0;
    if (flags & parse_flag_include_comments) {
        result |= TOK_SHOW_COMMENTS;
    }
    if (flags & parse_flag_accept_incomplete_tokens) {
        result |= TOK_ACCEPT_UNFINISHED;
    }
    if (flags & parse_flag_continue_after_error) {
        result |= TOK_CONTINUE_AFTER_ERROR;
    }
    return result;
}
// Look up an already-expanded string in the keyword table.
// \return whatever str_to_enum yields for \p name against keyword_enum_map.
static parse_keyword_t keyword_with_name(const wcstring &name) {
    const wchar_t *const raw_name = name.c_str();
    return str_to_enum(raw_name, keyword_enum_map, keyword_enum_map_len);
}
/// \return whether \p c may appear in a token that spells a keyword: an ASCII letter or digit, one
/// of the quoting characters (single quote, double quote, backslash), a newline, or '!'.
static bool is_keyword_char(wchar_t c) {
    const bool is_lower = (L'a' <= c && c <= L'z');
    const bool is_upper = (L'A' <= c && c <= L'Z');
    const bool is_digit = (L'0' <= c && c <= L'9');
    if (is_lower || is_upper || is_digit) {
        return true;
    }
    switch (c) {
        case L'\'':
        case L'"':
        case L'\\':
        case L'\n':
        case L'!':
            return true;
        default:
            return false;
    }
}
/// Determine which keyword, if any, a token spells.
/// \param tok the tokenizer type of the token; only string tokens can be keywords.
/// \param token the raw text of the token.
/// \return the matching keyword, or parse_keyword_t::none.
static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token) {
    if (tok != token_type_t::string) {
        return parse_keyword_t::none;
    }

    // Most tokens are "clean" and can be compared directly. We run our own cheap cleanliness
    // check instead of a full expansion (we do not want expensive expansions like tilde
    // expansion): any character outside the allowed set means this is not a keyword, and if no
    // quoting character shows up there is nothing to unescape. Note that the allowed set could
    // be shrunk to just the characters that occur in keywords.
    bool saw_quoting = false;
    for (wchar_t ch : token) {
        if (!is_keyword_char(ch)) {
            return parse_keyword_t::none;
        }
        if (ch == L'"' || ch == L'\'' || ch == L'\\') {
            saw_quoting = true;
        }
    }

    if (!saw_quoting) {
        return keyword_with_name(token);
    }

    // Quotes or backslashes present: unescape first, then look up the result.
    wcstring unescaped;
    if (!unescape_string(token, &unescaped, 0)) {
        return parse_keyword_t::none;
    }
    return keyword_with_name(unescaped);
}
/// Map a tokenizer token type onto the parser's token type.
/// \return the corresponding parse_token_type_t. A value outside the known enumerators is logged
/// and then handled by DIE().
static parse_token_type_t parse_token_type_from_tokenizer_token(
    enum token_type_t tokenizer_token_type) {
    using from_t = token_type_t;
    using to_t = parse_token_type_t;
    // No default case on purpose, so that every enumerator is listed explicitly.
    switch (tokenizer_token_type) {
        case from_t::string:
            return to_t::string;
        case from_t::pipe:
            return to_t::pipe;
        case from_t::andand:
            return to_t::andand;
        case from_t::oror:
            return to_t::oror;
        case from_t::end:
            return to_t::end;
        case from_t::background:
            return to_t::background;
        case from_t::redirect:
            return to_t::redirection;
        case from_t::error:
            return to_t::tokenizer_error;
        case from_t::comment:
            return to_t::comment;
    }
    const int raw_value = static_cast<int>(tokenizer_token_type);
    FLOGF(error, L"Bad token type %d passed to %s", raw_value, __FUNCTION__);
    DIE("bad token type");
    return to_t::invalid;
}
/// A token stream turns a tokenizer into a sequence of parser tokens and lets the parser look a
/// bounded number of tokens ahead. Comment tokens are never handed to the caller; their source
/// ranges are appended to the comment list supplied at construction.
class token_stream_t {
   public:
    /// \param src the source text; only a reference to it is stored.
    /// \param flags parse tree flags, translated into tokenizer flags.
    /// \param comments receives the source range of every comment token encountered.
    explicit token_stream_t(const wcstring &src, parse_tree_flags_t flags,
                            std::vector<source_range_t> &comments)
        : src_(src),
          tok_(src_.c_str(), tokenizer_flags_from_parse_flags(flags)),
          comment_ranges(comments) {}

    /// \return the token \p idx positions ahead without consuming it: idx = 0 is the next token,
    /// idx = 1 the one after that, and so on. Once the tokenizer is exhausted the token has type
    /// parse_token_type_t::terminate.
    /// \p idx must be less than kMaxLookahead.
    const parse_token_t &peek(size_t idx = 0) {
        assert(idx < kMaxLookahead && "Trying to look too far ahead");
        // Top up the ring buffer until it holds the requested entry.
        for (; count_ <= idx; ++count_) {
            lookahead_.at(mask(start_ + count_)) = next_from_tok();
        }
        return lookahead_.at(mask(start_ + idx));
    }

    /// Consume and return the next token.
    parse_token_t pop() {
        // Nothing buffered: hand out a fresh token straight from the tokenizer.
        if (count_ == 0) {
            return next_from_tok();
        }
        // Otherwise take the head of the ring buffer and advance past it.
        const size_t head = start_;
        start_ = mask(head + 1);
        --count_;
        return lookahead_[head];
    }

    /// \return the original source code.
    const wcstring &source() const { return src_; }

   private:
    // Wrap an index into our circular buffer.
    static constexpr size_t mask(size_t idx) { return idx % kMaxLookahead; }

    /// \return the next non-comment parse token from the tokenizer.
    /// Comment tokens are skipped, and their ranges are stored in comment_ranges.
    parse_token_t next_from_tok() {
        parse_token_t res = advance_1();
        while (res.type == parse_token_type_t::comment) {
            comment_ranges.push_back(res.range());
            res = advance_1();
        }
        return res;
    }

    /// Advance the tokenizer by one token and convert it to a parse token.
    /// \return the new token, which may be a comment, or a terminate token if the tokenizer has
    /// nothing left.
    parse_token_t advance_1() {
        auto maybe_tok = tok_.next();
        if (!maybe_tok.has_value()) {
            return parse_token_t{parse_token_type_t::terminate};
        }
        const tok_t &raw = *maybe_tok;

        // Set the type, keyword, and whether there's a dash prefix. Note that this is quite
        // sketchy, because it ignores quotes. This is the historical behavior. For example,
        // `builtin --names` lists builtins, but `builtin "--names"` attempts to run --names as a
        // command. Amazingly as of this writing (10/12/13) nobody seems to have noticed this.
        // Squint at it really hard and it even starts to look like a feature.
        parse_token_t parsed{parse_token_type_from_tokenizer_token(raw.type)};
        const wcstring &text = tok_.copy_text_of(raw, &storage_);
        parsed.keyword = keyword_for_token(raw.type, text);
        parsed.has_dash_prefix = !text.empty() && text.at(0) == L'-';
        parsed.is_help_argument = (text == L"-h" || text == L"--help");
        parsed.is_newline = (parsed.type == parse_token_type_t::end && text == L"\n");
        parsed.may_be_variable_assignment = variable_assignment_equals_pos(text).has_value();
        parsed.tok_error = raw.error;

        // These assertions are totally bogus. Basically our tokenizer works in size_t but we work
        // in uint32_t to save some space. If we have a source file larger than 4 GB, we'll
        // probably just crash.
        assert(raw.offset < SOURCE_OFFSET_INVALID);
        parsed.source_start = static_cast<source_offset_t>(raw.offset);
        assert(raw.length <= SOURCE_OFFSET_INVALID);
        parsed.source_length = static_cast<source_offset_t>(raw.length);

        if (raw.error != tokenizer_error_t::none) {
            // Narrow the reported range to the erroneous part of the token.
            const auto error_offset = static_cast<source_offset_t>(raw.error_offset_within_token);
            // Skip invalid tokens that have a zero length, especially if they are at EOF.
            if (error_offset < parsed.source_length) {
                parsed.source_start += error_offset;
                parsed.source_length = static_cast<source_offset_t>(raw.error_length);
            }
        }
        return parsed;
    }

    // The maximum number of lookahead supported.
    static constexpr size_t kMaxLookahead = 2;

    // The lookahead queue, implemented as a simple circular buffer.
    // peek() hands out references into this storage, so elements that have been peek'd must not
    // move. That rules out vector (which may reallocate); deque would work but is too heavyweight
    // for just 2 items.
    std::array<parse_token_t, kMaxLookahead> lookahead_ = {
        {parse_token_type_t::invalid, parse_token_type_t::invalid}};

    // Index of the "first" (oldest) token in the lookahead buffer.
    size_t start_ = 0;

    // Number of tokens currently held in the lookahead buffer.
    size_t count_ = 0;

    // A reference to the original source.
    const wcstring &src_;

    // The tokenizer which produces new tokens.
    tokenizer_t tok_;

    /// Ranges of comment tokens are collected here.
    /// These are only collected if parse_flag_include_comments is set.
    std::vector<source_range_t> &comment_ranges;

    // Temporary storage.
    wcstring storage_;
};
} // namespace
namespace ast {
/// Given a node which we believe to be some sort of block statement, attempt to return a source
/// range for the block's keyword (for, if, etc) and a user-presentable description. This is used to
/// provide better error messages. \return {nullptr, nullptr} if we couldn't find it. Note at this
/// point the parse tree is incomplete; in particular parent nodes are not set.
static std::pair<source_range_t, const wchar_t *> find_block_open_keyword(const node_t *node) {
const node_t *cursor = node;
while (cursor != nullptr) {
switch (cursor->type) {
case type_t::block_statement:
cursor = cursor->as<block_statement_t>()->header.contents.get();
break;
case type_t::for_header: {
const auto *h = cursor->as<for_header_t>();
return {h->kw_for.range, L"for loop"};
}
case type_t::while_header: {
const auto *h = cursor->as<while_header_t>();
return {h->kw_while.range, L"while loop"};
}
case type_t::function_header: {
const auto *h = cursor->as<function_header_t>();
return {h->kw_function.range, L"function definition"};
}
case type_t::begin_header: {
const auto *h = cursor->as<begin_header_t>();
return {h->kw_begin.range, L"begin"};
}
case type_t::if_statement: {
const auto *h = cursor->as<if_statement_t>();
return {h->if_clause.kw_if.range, L"if statement"};
}
case type_t::switch_statement: {
const auto *h = cursor->as<switch_statement_t>();
return {h->kw_switch.range, L"switch statement"};
}
default:
return {source_range_t{}, nullptr};
}
}
return {source_range_t{}, nullptr};
}
/// \return the decoration for this statement, or statement_decoration_t::none if the statement
/// carries no decoration keyword.
statement_decoration_t decorated_statement_t::decoration() const {
    statement_decoration_t result = statement_decoration_t::none;
    if (opt_decoration) {
        switch (opt_decoration->kw) {
            case parse_keyword_t::kw_command:
                result = statement_decoration_t::command;
                break;
            case parse_keyword_t::kw_builtin:
                result = statement_decoration_t::builtin;
                break;
            case parse_keyword_t::kw_exec:
                result = statement_decoration_t::exec;
                break;
            default:
                // Any other keyword here is a programming error.
                assert(0 && "Unexpected keyword in statement decoration");
                break;
        }
    }
    return result;
}
/// \return a string literal name for an ast type.
/// The switch cases are generated by the ELEM macro: for a type name T it emits a case for
/// type_t::T which returns the stringized name as a wide literal (L"" #T).
/// Presumably ast_node_types.inc invokes ELEM once per node type - confirm against that file.
const wchar_t *ast_type_to_string(type_t type) {
switch (type) {
#define ELEM(T) \
case type_t::T: \
return L"" #T;
#include "ast_node_types.inc"
}
// Only reached if \p type matched none of the generated cases.
assert(0 && "unreachable");
return L"(unknown)";
}
/// Delete an untyped node.
/// The node's dynamic type is recovered from its type tag, and the node is deleted through a
/// pointer to that concrete type. A null \p n is ignored.
void node_deleter_t::operator()(node_t *n) {
if (!n) return;
switch (n->type) {
// For a type name T, ELEM emits a case for type_t::T which deletes the node as a T_t.
// Presumably ast_node_types.inc invokes ELEM once per node type - confirm against that file.
#define ELEM(T) \
case type_t::T: \
delete n->as<T##_t>(); \
break;
#include "ast_node_types.inc"
}
}
/// \return a description of this node: its ast type name, followed by the quoted token type
/// description if this is a token node, or the quoted keyword description if this is a keyword
/// node.
wcstring node_t::describe() const {
    wcstring description = ast_type_to_string(this->type);
    if (const auto *tok_node = this->try_as<token_base_t>()) {
        append_format(description, L" '%ls'", token_type_description(tok_node->type));
        return description;
    }
    if (const auto *kw_node = this->try_as<keyword_base_t>()) {
        append_format(description, L" '%ls'", keyword_description(kw_node->kw));
    }
    return description;
}
/// Local equivalent of C++14's std::enable_if_t: names the nested type of std::enable_if<B, T>,
/// which exists only when \p B is true.
template <bool B, typename T = void>
using enable_if_t = typename std::enable_if<B, T>::type;
namespace {
/// A visitor which accumulates the union of the source ranges of all leaf nodes it visits, and
/// records whether any of those leaves was unsourced.
struct source_range_visitor_t {
    /// Leaf nodes contribute their own range.
    template <typename Node>
    enable_if_t<Node::Category == category_t::leaf> visit(const Node &node) {
        if (node.unsourced) {
            any_unsourced = true;
        }
        // Empty ranges contribute nothing to the union.
        if (!(node.range.length > 0)) {
            return;
        }
        // The first non-empty range is adopted as-is.
        if (total.length == 0) {
            total = node.range;
            return;
        }
        // Otherwise widen the running range so that it covers both.
        const auto merged_end =
            std::max(total.start + total.length, node.range.start + node.range.length);
        total.start = std::min(total.start, node.range.start);
        total.length = merged_end - total.start;
    }

    /// Non-leaf nodes simply recurse into their children.
    template <typename Node>
    enable_if_t<Node::Category != category_t::leaf> visit(const Node &node) {
        node_visitor(*this).accept_children_of(node);
    }

    // Union of the ranges encountered so far; a zero length means none yet.
    source_range_t total{0, 0};

    // Whether any visited leaf was unsourced.
    bool any_unsourced{false};
};
} // namespace
/// \return the union of the source ranges of all leaves beneath this node, or none() if any of
/// those leaves is unsourced.
maybe_t<source_range_t> node_t::try_source_range() const {
    source_range_visitor_t range_collector;
    node_visitor(range_collector).accept(this);
    if (range_collector.any_unsourced) {
        return none();
    }
    return range_collector.total;
}
// Build a user-presentable description of a non-empty list of keywords.
// A single keyword yields "keyword 'x'"; several yield "keywords 'x' or 'y' ...".
// TODO: these need to be localized properly.
static wcstring keywords_user_presentable_description(std::initializer_list<parse_keyword_t> kws) {
    assert(kws.size() > 0 && "Should not be empty list");
    if (kws.size() == 1) {
        return format_string(L"keyword '%ls'", keyword_description(*kws.begin()));
    }
    wcstring description = L"keywords ";
    for (auto it = kws.begin(); it != kws.end(); ++it) {
        // Every entry but the first is preceded by a separator.
        const wchar_t *separator = (it == kws.begin()) ? L"" : L" or ";
        append_format(description, L"%ls'%ls'", separator, keyword_description(*it));
    }
    return description;
}
// Build a user-presentable description of a non-empty list of token types.
// A single type yields its own description; several are joined with " or ".
// TODO: these need to be localized properly.
static wcstring token_types_user_presentable_description(
    std::initializer_list<parse_token_type_t> types) {
    assert(types.size() > 0 && "Should not be empty list");
    if (types.size() == 1) {
        return token_type_user_presentable_description(*types.begin());
    }
    wcstring description;
    for (auto it = types.begin(); it != types.end(); ++it) {
        // Every entry but the first is preceded by a separator.
        const wchar_t *separator = (it == types.begin()) ? L"" : L" or ";
        append_format(description, L"%ls%ls", separator,
                      token_type_user_presentable_description(*it).c_str());
    }
    return description;
}
namespace {
using namespace ast;
struct populator_t {
// Shorthand so that members can write unique_ptr<T> for std::unique_ptr<T>.
template <typename T>
using unique_ptr = std::unique_ptr<T>;
// Construct from a source, flags, top type, and out_errors, which may be null.
// \p src and \p flags are forwarded to the token stream, which is also handed extras_.comments
// as the place to record comment ranges.
populator_t(const wcstring &src, parse_tree_flags_t flags, type_t top_type,
parse_error_list_t *out_errors)
: flags_(flags),
tokens_(src, flags, extras_.comments),
top_type_(top_type),
out_errors_(out_errors) {}
// Allocate a node of the given type via its default constructor, logging the allocation.
// \return the resulting Node pointer. It is never null.
template <typename Node>
unique_ptr<Node> allocate() {
    unique_ptr<Node> fresh = make_unique<Node>();
    FLOGF(ast_construction, L"%*smake %ls %p", spaces(), "", ast_type_to_string(Node::AstType),
          fresh.get());
    return fresh;
}
// Allocate a node of the given type via its default constructor, then visit the new node as a
// field.
// \return the resulting Node pointer. It is never null.
template <typename Node>
unique_ptr<Node> allocate_visit() {
    unique_ptr<Node> fresh = allocate<Node>();
    Node &target = *fresh;
    this->visit_node_field(target);
    return fresh;
}
/// Helper for FLOGF. \return the indentation width for the current nesting depth (two columns
/// per entry on the visit stack), suitable as the width argument of a '%*s' format.
int spaces() const {
    const auto depth = visit_stack_.size();
    return static_cast<int>(depth * 2);
}
/// The status of our parser.
enum class status_t {
// Parsing is going just fine, thanks for asking.
ok,
// We have exhausted the token stream, but the caller was OK with an incomplete parse tree.
// All further leaf nodes should have the unsourced flag set.
unsourcing,
// We encountered a parse error and are "unwinding."
// Do not consume any tokens until we get back to a list type which stops unwinding.
unwinding,
};
/// \return the parser's status: unwinding takes precedence; otherwise unsourcing if incomplete
/// parses are permitted and the next token is the terminate token; otherwise ok.
status_t status() {
    if (unwinding_) {
        return status_t::unwinding;
    }
    // Only peek at the token stream when incomplete parses are permitted.
    const bool leave_unterminated = flags_ & parse_flag_leave_unterminated;
    if (leave_unterminated && peek_type() == parse_token_type_t::terminate) {
        return status_t::unsourcing;
    }
    return status_t::ok;
}
/// \return whether we are currently unwinding after a parse error.
/// This is more efficient than checking the status directly.
bool is_unwinding() const {
    return unwinding_;
}
/// \return whether any leaf nodes we visit should be marked as unsourced.
bool unsource_leaves() {
status_t s = status();
return s == status_t::unsourcing || s == status_t::unwinding;
}
/// \return whether we permit an incomplete parse tree, i.e. whether
/// parse_flag_leave_unterminated is set.
bool allow_incomplete() const {
    return static_cast<bool>(flags_ & parse_flag_leave_unterminated);
}
/// This indicates a bug in fish code.
void internal_error(const char *func, const wchar_t *fmt, ...) const {
va_list va;
va_start(va, fmt);
wcstring msg = vformat_string(fmt, va);
va_end(va);
FLOG(debug, "Internal parse error from", func, "- this indicates a bug in fish.", msg);
FLOG(debug, "Encountered while parsing:<<<\n%ls\n>>>", tokens_.source().c_str());
abort();
}
/// \return whether a list type \p type allows arbitrary newlines in it.
/// A type which is not a known list type is reported as an internal error.
bool list_type_chomps_newlines(type_t type) const {
    switch (type) {
        case type_t::argument_list:
            // Hackish. Only a freestanding argument list allows newlines, for hysterical
            // raisins.
            return top_type_ == type_t::freestanding_argument_list;

        // Lists whose items may be separated by blank lines:
        case type_t::job_list:            // Like echo a \n \n echo b
        case type_t::case_item_list:      // Like switch foo \n \n \n case a \n end
        case type_t::andor_job_list:      // Like while true ; \n \n and true ; end
        case type_t::elseif_clause_list:  // Like if true ; \n \n else if false; end
            return true;

        // Lists which do not allow newlines:
        case type_t::argument_or_redirection_list:  // No newlines inside arguments.
        case type_t::variable_assignment_list:      // Nor inside variable assignment lists.
        // The next two would be like `echo a && echo b \n && echo c` and `echo a \n | echo b`.
        // We could conceivably support these but do not now.
        case type_t::job_conjunction_continuation_list:
        case type_t::job_continuation_list:
            return false;

        default:
            internal_error(__FUNCTION__, L"Type %ls not handled", ast_type_to_string(type));
            return false;
    }
}
/// \return whether a list type \p type allows arbitrary semicolons in it.
/// A type which is not a known list type is reported as an internal error.
bool list_type_chomps_semis(type_t type) const {
    switch (type) {
        case type_t::argument_list:
            // Hackish. If we are producing a freestanding argument list, then it allows
            // semicolons, for hysterical raisins.
            // That is, this is OK: complete -c foo -a 'x ; y ; z'
            // But this is not: foo x ; y ; z
            return top_type_ == type_t::freestanding_argument_list;

        // Lists whose items may be separated by extra semicolons:
        case type_t::job_list:        // Like echo a ; ; echo b
        case type_t::case_item_list:  // Like switch foo ; ; ; case a \n end (historically allowed)
        case type_t::andor_job_list:  // Like while true ; ; ; and true ; end
            return true;

        // Lists which do not allow extra semicolons:
        case type_t::argument_or_redirection_list:
        case type_t::variable_assignment_list:
        case type_t::elseif_clause_list:                 // Like if true ; ; ; else if false; end
        case type_t::job_conjunction_continuation_list:  // Like echo a ; ; && echo b
        // This would be like echo a ; | echo b
        // We could conceivably support this but do not now.
        case type_t::job_continuation_list:
            return false;

        default:
            internal_error(__FUNCTION__, L"Type %ls not handled", ast_type_to_string(type));
            return false;
    }
}
// Consume the run of newline and semicolon tokens which the list type \p type permits between
// its items. Skipped semicolons are recorded in extras_.semis if parse_flag_show_extra_semis is
// set; skipped newlines are not saved.
void chomp_extras(type_t type) {
    const bool skip_semis = list_type_chomps_semis(type);
    const bool skip_newlines = list_type_chomps_newlines(type);
    while (true) {
        const parse_token_t &next = this->tokens_.peek();
        // Only 'end' tokens (newlines and semicolons) are candidates.
        if (next.type != parse_token_type_t::end) {
            break;
        }
        const bool is_newline = next.is_newline;
        const bool skippable = is_newline ? skip_newlines : skip_semis;
        if (!skippable) {
            break;
        }
        const parse_token_t popped = this->tokens_.pop();
        // Perhaps save this extra semi. Newlines are just skipped.
        if (!is_newline && (flags_ & parse_flag_show_extra_semis)) {
            extras_.semis.push_back(popped.range());
        }
    }
}
/// \return whether a list type should recover from errors.
/// That is, whether we should stop unwinding when we encounter this type. Only job lists do so,
/// and only when parse_flag_continue_after_error is set.
bool list_type_stops_unwind(type_t type) const {
    if (type != type_t::job_list) {
        return false;
    }
    return static_cast<bool>(flags_ & parse_flag_continue_after_error);
}
/// Report an error based on \p fmt (formatted with \p va) for the source range \p range, and
/// begin unwinding. If we are already unwinding, the error is only noted in any_error_.
void parse_error_impl(source_range_t range, parse_error_code_t code, const wchar_t *fmt,
                      va_list va) {
    any_error_ = true;

    // Ignore additional parse errors while unwinding.
    // These may come about e.g. from `true | and`.
    if (unwinding_) {
        return;
    }
    unwinding_ = true;
    FLOGF(ast_construction, L"%*sparse error - begin unwinding", spaces(), "");

    // TODO: can store this conditionally dependent on flags.
    const bool range_is_valid = (range.start != SOURCE_OFFSET_INVALID);
    if (range_is_valid) {
        extras_.errors.push_back(range);
    }

    // The formatted error is only built if the caller supplied somewhere to put it.
    if (!out_errors_) {
        return;
    }
    parse_error_t report;
    report.text = vformat_string(fmt, va);
    report.code = code;
    report.source_start = range.start;
    report.source_length = range.length;
    out_errors_->push_back(std::move(report));
}
/// Report an error for the source range \p range, with a message built from the printf-style
/// format \p fmt and its trailing arguments.
void parse_error(source_range_t range, parse_error_code_t code, const wchar_t *fmt, ...) {
    va_list args;
    va_start(args, fmt);
    parse_error_impl(range, code, fmt, args);
    va_end(args);
}
/// Report an error for the source range of \p token, with a message built from the printf-style
/// format \p fmt and its trailing arguments.
void parse_error(const parse_token_t &token, parse_error_code_t code, const wchar_t *fmt, ...) {
    va_list args;
    va_start(args, fmt);
    parse_error_impl(token.range(), code, fmt, args);
    va_end(args);
}
// \return a reference to the non-comment token \p idx positions ahead, without consuming it.
const parse_token_t &peek_token(size_t idx = 0) {
    return tokens_.peek(idx);
}
// \return the type of the non-comment token \p idx positions ahead.
parse_token_type_t peek_type(size_t idx = 0) {
    const parse_token_t &upcoming = peek_token(idx);
    return upcoming.type;
}
// Consume the next token, whatever its type; comments have already been chomped by the stream.
// It is an error to call this unless we know there is a non-terminate token available.
// \return the token.
parse_token_t consume_any_token() {
    parse_token_t consumed = tokens_.pop();
    assert(consumed.type != parse_token_type_t::comment && "Should not be a comment");
    assert(consumed.type != parse_token_type_t::terminate &&
           "Cannot consume terminate token, caller should check status first");
    return consumed;
}
// Consume the next token, which is expected to be of the given type.
// \return the token's source range, or an empty range {0, 0} after reporting a parse error if
// the token has a different type. The token is consumed either way.
source_range_t consume_token_type(parse_token_type_t type) {
    assert(type != parse_token_type_t::terminate &&
           "Should not attempt to consume terminate token");
    const parse_token_t consumed = consume_any_token();
    if (consumed.type == type) {
        return consumed.range();
    }
    parse_error(consumed, parse_error_generic, _(L"Expected %ls, but found %ls"),
                token_type_user_presentable_description(type).c_str(),
                consumed.user_presentable_description().c_str());
    return source_range_t{0, 0};
}
// The next token could not be parsed at the top level.
// For example a trailing end like `begin ; end ; end`
// Or an unexpected redirection like `>`
// Consume it and add an error.
void consume_excess_token_generating_error() {
auto tok = consume_any_token();
// In the rare case that we are parsing a freestanding argument list and not a job list,
// generate a generic error.
// TODO: this is a crummy message if we get a tokenizer error, for example:
// complete -c foo -a "'abc"
if (this->top_type_ == type_t::freestanding_argument_list) {
this->parse_error(
tok, parse_error_generic, _(L"Expected %ls, but found %ls"),
token_type_user_presentable_description(parse_token_type_t::string).c_str(),
tok.user_presentable_description().c_str());
return;
}
assert(this->top_type_ == type_t::job_list);
switch (tok.type) {
case parse_token_type_t::string:
// There are three keywords which end a job list.
switch (tok.keyword) {
case parse_keyword_t::kw_end:
this->parse_error(tok, parse_error_unbalancing_end,
_(L"'end' outside of a block"));
break;
case parse_keyword_t::kw_else:
this->parse_error(tok, parse_error_unbalancing_else,
_(L"'else' builtin not inside of if block"));
break;
case parse_keyword_t::kw_case:
this->parse_error(tok, parse_error_unbalancing_case,
_(L"'case' builtin not inside of switch block"));
break;
default:
internal_error(__FUNCTION__,
L"Token %ls should not have prevented parsing a job list",
tok.user_presentable_description().c_str());
break;
}
break;
case parse_token_type_t::pipe:
case parse_token_type_t::redirection:
case parse_token_type_t::background:
case parse_token_type_t::andand:
case parse_token_type_t::oror:
parse_error(tok, parse_error_generic, _(L"Expected a string, but found %ls"),
tok.user_presentable_description().c_str());
break;
case parse_token_type_t::tokenizer_error:
parse_error(tok, parse_error_from_tokenizer_error(tok.tok_error), L"%ls",
tokenizer_get_error_message(tok.tok_error));
break;
case parse_token_type_t::end:
internal_error(__FUNCTION__, L"End token should never be excess");
break;
case parse_token_type_t::terminate:
internal_error(__FUNCTION__, L"Terminate token should never be excess");
break;
default:
internal_error(__FUNCTION__, L"Unexpected excess token type: %ls",
tok.user_presentable_description().c_str());
break;
}
}
// Our can_parse implementations are for optional values and for lists.
// A true return means we should descend into the production, false means stop.
// Note that the argument is always nullptr and should be ignored. It is provided strictly for
// overloading purposes.
//
// A job conjunction can start at any string token, except for the keywords which end a job list.
bool can_parse(job_conjunction_t *) {
    const parse_token_t &next = peek_token();
    if (next.type != parse_token_type_t::string) {
        return false;
    }
    const parse_keyword_t kw = next.keyword;
    // These end a job list.
    const bool ends_job_list = (kw == parse_keyword_t::kw_end ||
                                kw == parse_keyword_t::kw_else ||
                                kw == parse_keyword_t::kw_case);
    return !ends_job_list;
}
// An argument is any string token.
bool can_parse(argument_t *) {
    return peek_type() == parse_token_type_t::string;
}
// A redirection is any redirection token.
bool can_parse(redirection_t *) {
    return peek_type() == parse_token_type_t::redirection;
}
// Parseable if the next token could start either an argument or a redirection.
bool can_parse(argument_or_redirection_t *) {
    if (can_parse(static_cast<argument_t *>(nullptr))) {
        return true;
    }
    return can_parse(static_cast<redirection_t *>(nullptr));
}
// A variable assignment must look like one, and must be followed by a string (or by the end of
// input, if incomplete parses are permitted).
bool can_parse(variable_assignment_t *) {
    // Do we have a variable assignment at all?
    if (!peek_token(0).may_be_variable_assignment) {
        return false;
    }
    // What is the token after it?
    const parse_token_type_t following = peek_type(1);
    if (following == parse_token_type_t::string) {
        // We have `a= cmd` and should treat it as a variable assignment.
        return true;
    }
    if (following == parse_token_type_t::terminate) {
        // We have `a=` which is OK if we are allowing incomplete, an error otherwise.
        return allow_incomplete();
    }
    // We have e.g. `a= >` which is an error.
    // Note that we do not produce an error here. Instead we return false so that the token
    // will be seen by allocate_populate_statement_contents.
    return false;
}
// A token node is parseable if the next token's type is one that the node allows.
// NOTE(review): the can_parse overloads are documented as always receiving nullptr, so this call
// is only well-defined if allows_token is a static member - confirm against its declaration.
template <parse_token_type_t... Tok>
bool can_parse(token_t<Tok...> *tok) {
    const parse_token_type_t upcoming = peek_token().type;
    return tok->allows_token(upcoming);
}
// Note we have specific overloads for our keyword nodes, as they need custom logic.
//
// This is for a job conjunction like `and stuff`.
// But if it's `and --help` then we treat it as an ordinary command.
bool can_parse(job_conjunction_t::decorator_t *) {
    if (!job_conjunction_t::decorator_t::allows_keyword(peek_token(0).keyword)) {
        return false;
    }
    return !peek_token(1).is_help_argument;
}
// Decide whether the next token is a statement decoration.
// Here the keyword is 'command' or 'builtin' or 'exec'.
// `command stuff` executes a command called stuff.
// `command -n` passes the -n argument to the 'command' builtin.
// `command` by itself is a command.
// \return true only if the keyword is a decorator and is followed by a string which does not
// look like an option.
bool can_parse(decorated_statement_t::decorator_t *) {
    if (!decorated_statement_t::decorator_t::allows_keyword(peek_token(0).keyword)) {
        return false;
    }
    // Is it like `command --stuff` or `command` by itself?
    // Bind by reference: the token is only inspected, so there is no need to copy it.
    const auto &tok1 = peek_token(1);
    return tok1.type == parse_token_type_t::string && !tok1.is_dash_prefix_string();
}
// Time keyword is only the time builtin if the next argument doesn't have a dash.
bool can_parse(keyword_t<parse_keyword_t::kw_time> *) {
    if (!keyword_t<parse_keyword_t::kw_time>::allows_keyword(peek_token(0).keyword)) {
        return false;
    }
    return !peek_token(1).is_dash_prefix_string();
}
// A job continuation begins with a pipe token.
bool can_parse(job_continuation_t *) {
    return peek_type() == parse_token_type_t::pipe;
}
// A job conjunction continuation begins with an && or || token.
bool can_parse(job_conjunction_continuation_t *) {
    switch (peek_type()) {
        case parse_token_type_t::andand:
        case parse_token_type_t::oror:
            return true;
        default:
            return false;
    }
}
// An and/or job begins with the 'and' or 'or' keyword followed by a string which is not a help
// argument.
bool can_parse(andor_job_t *) {
    const parse_keyword_t kw = peek_token().keyword;
    if (kw != parse_keyword_t::kw_and && kw != parse_keyword_t::kw_or) {
        return false;
    }
    // Check that the argument to and/or is a string that's not help. Otherwise it's
    // either 'and --help' or a naked 'and', and not part of this list.
    const auto &following = peek_token(1);
    return following.type == parse_token_type_t::string && !following.is_help_argument;
}
// An elseif clause begins with the keyword pair `else if`.
bool can_parse(elseif_clause_t *) {
    if (peek_token(0).keyword != parse_keyword_t::kw_else) {
        return false;
    }
    return peek_token(1).keyword == parse_keyword_t::kw_if;
}
// An else clause begins with the 'else' keyword.
bool can_parse(else_clause_t *) {
    return peek_token().keyword == parse_keyword_t::kw_else;
}
// A case item begins with the 'case' keyword.
bool can_parse(case_item_t *) {
    return peek_token().keyword == parse_keyword_t::kw_case;
}
// Given that we are a list of type ListType, whose contents type is ContentsNode, populate
// as many elements as we can.
// If exhaust_stream is set, then keep going until we get parse_token_type_t::terminate; tokens
// which cannot start a ContentsNode are consumed and reported as errors along the way.
// On return, if any nodes were parsed, list.contents points to a heap-allocated array holding
// them and list.length is their count. If we are already unwinding on entry, the list is left
// empty.
template <type_t ListType, typename ContentsNode>
void populate_list(list_t<ListType, ContentsNode> &list, bool exhaust_stream = false) {
assert(list.contents == nullptr && "List is not initially empty");
// Do not attempt to parse a list if we are unwinding.
if (is_unwinding()) {
assert(!exhaust_stream &&
"exhaust_stream should only be set at top level, and so we should not be "
"unwinding");
// Mark in the list that it was unwound.
FLOGF(ast_construction, L"%*sunwinding %ls", spaces(), "",
ast_type_to_string(ListType));
assert(list.empty() && "Should be an empty list");
return;
}
// We're going to populate a vector with our nodes.
// Later on we will copy this to the heap with a single allocation.
std::vector<std::unique_ptr<ContentsNode>> contents;
for (;;) {
// If we are unwinding, then either we recover or we break the loop, dependent on the
// loop type.
if (is_unwinding()) {
if (!list_type_stops_unwind(ListType)) {
break;
}
// We are going to stop unwinding.
// Rather hackish. Just chomp until we get to a string or end node.
// Each chomped token's range is recorded in extras_.errors.
for (auto type = peek_type();
type != parse_token_type_t::string && type != parse_token_type_t::terminate &&
type != parse_token_type_t::end;
type = peek_type()) {
parse_token_t tok = tokens_.pop();
extras_.errors.push_back(tok.range());
// Note the two values logged are the token's start offset and its length.
FLOGF(ast_construction, L"%*schomping range %u-%u", spaces(), "",
tok.source_start, tok.source_length);
}
FLOGF(ast_construction, L"%*sdone unwinding", spaces(), "");
unwinding_ = false;
}
// Chomp semis and newlines.
chomp_extras(ListType);
// Now try parsing a node.
if (auto node = this->try_parse<ContentsNode>()) {
// #7201: Minimize reallocations of contents vector
if (contents.empty()) {
contents.reserve(64);
}
contents.emplace_back(std::move(node));
} else if (exhaust_stream && peek_type() != parse_token_type_t::terminate) {
// We aren't allowed to stop. Produce an error and keep going.
consume_excess_token_generating_error();
} else {
// We either stop once we can't parse any more of this contents node, or we
// exhausted the stream as requested.
break;
}
}
// Populate our list from our contents.
if (!contents.empty()) {
assert(contents.size() <= UINT32_MAX && "Contents size out of bounds");
assert(list.contents == nullptr && "List should still be empty");
// We're going to heap-allocate our array.
// The nodes are moved out of the temporary vector into the new array, which is then handed
// to the list.
using contents_ptr_t = typename list_t<ListType, ContentsNode>::contents_ptr_t;
auto *array = new contents_ptr_t[contents.size()];
std::move(contents.begin(), contents.end(), array);
list.length = static_cast<uint32_t>(contents.size());
list.contents = array;
}
FLOGF(ast_construction, L"%*s%ls size: %lu", spaces(), "", ast_type_to_string(ListType),
(unsigned long)list.count());
}
/// Decide which kind of statement begins at the current token, then allocate and populate it.
/// The result is never null: whenever an error is reported here, a decorated statement is
/// returned in its place.
statement_t::contents_ptr_t allocate_populate_statement_contents() {
    using pkt = parse_keyword_t;
    using ptt = parse_token_type_t;

    // After reporting an error we still owe the caller a non-null node. A decorated statement
    // is used as the placeholder; all of its leaf nodes will end up unsourced.
    auto placeholder_after_error = [this] {
        assert(unwinding_ && "Should have produced an error");
        return this->allocate_visit<decorated_statement_t>();
    };

    const auto &first = peek_token(0);
    if (first.type == ptt::terminate && allow_incomplete()) {
        // This may happen if we just have a 'time' prefix.
        // The decorated statement constructed here will be unsourced.
        return this->allocate_visit<decorated_statement_t>();
    }
    if (first.type != ptt::string) {
        // Note we may already be unwinding when we get here, for example in `true | and`.
        parse_error(first, parse_error_generic, _(L"Expected a command, but found %ls"),
                    first.user_presentable_description().c_str());
        return placeholder_after_error();
    }
    if (first.may_be_variable_assignment) {
        // This token looks like a variable assignment, but it was not parsed as one because no
        // string followed it. Consume it so that we do not arrive here again at the same place.
        parse_error(consume_any_token(), parse_error_bare_variable_assignment, L"");
        return placeholder_after_error();
    }

    // Every guard above returns unless `first` is a string token, so it is one from here on.
    //
    // The only block-like builtin that takes any parameters is 'function'. The rules for
    // falling back to a decorated statement are subtle:
    //
    //   - 'function' is a non-block when invoked with -h or --help.
    //   - Any other command is a non-block when the next token starts with a dash.
    //   - Anything except 'begin' and 'end' requires an argument, so it is also a non-block when
    //     the next token terminates the statement (e.g. a "naked if").
    const auto &second = peek_token(1);
    const bool is_function = (first.keyword == pkt::kw_function);
    const bool second_is_option = (is_function && second.is_help_argument) ||
                                  (!is_function && second.has_dash_prefix);
    if (second_is_option) {
        return allocate_visit<decorated_statement_t>();
    }

    const bool second_ends_statement =
        (second.type == ptt::end || second.type == ptt::terminate);
    const bool naked_invocation_invokes_help =
        !(first.keyword == pkt::kw_begin || first.keyword == pkt::kw_end);
    if (naked_invocation_invokes_help && second_ends_statement) {
        return allocate_visit<decorated_statement_t>();
    }

    // Dispatch on the leading keyword.
    switch (first.keyword) {
        case pkt::kw_if:
            return allocate_visit<if_statement_t>();
        case pkt::kw_switch:
            return allocate_visit<switch_statement_t>();
        case pkt::kw_not:
        case pkt::kw_exclam:
            return allocate_visit<not_statement_t>();
        case pkt::kw_begin:
        case pkt::kw_function:
        case pkt::kw_while:
        case pkt::kw_for:
            return allocate_visit<block_statement_t>();
        case pkt::kw_end:
            // 'end' is forbidden as a command; `if end` or `while end` land here.
            // We must still produce a decorated statement since the pointer may not be null.
            parse_error(first, parse_error_generic, _(L"Expected a command, but found %ls"),
                        first.user_presentable_description().c_str());
            return placeholder_after_error();
        default:
            return allocate_visit<decorated_statement_t>();
    }
}
/// Allocate and populate the header of a block statement, chosen by the keyword of the current
/// token ('for', 'while', 'function' or 'begin').
/// This must never return null; any other keyword is treated as an internal error.
block_statement_t::header_ptr_t allocate_populate_block_header() {
    const auto opener = peek_token().keyword;
    if (opener == parse_keyword_t::kw_for) {
        return allocate_visit<for_header_t>();
    }
    if (opener == parse_keyword_t::kw_while) {
        return allocate_visit<while_header_t>();
    }
    if (opener == parse_keyword_t::kw_function) {
        return allocate_visit<function_header_t>();
    }
    if (opener == parse_keyword_t::kw_begin) {
        return allocate_visit<begin_header_t>();
    }
    // No other keyword opens a block.
    internal_error(__FUNCTION__, L"should not have descended into block_header");
    DIE("Unreachable");
}
/// Attempt to parse an AstNode at the current position.
/// \return the allocated and visited node, or nullptr if can_parse() rejects this node type here.
template <typename AstNode>
unique_ptr<AstNode> try_parse() {
    // can_parse() is selected by overload on the node type; the pointer carries no value.
    if (can_parse(static_cast<AstNode *>(nullptr))) {
        return allocate_visit<AstNode>();
    }
    return nullptr;
}
/// Populate an argument leaf: either mark it unsourced, or consume one string token and record
/// its range.
void visit_node_field(argument_t &arg) {
    if (!unsource_leaves()) {
        arg.range = consume_token_type(parse_token_type_t::string);
    } else {
        arg.unsourced = true;
    }
}
/// Populate a variable assignment leaf: either mark it unsourced, or consume one string token
/// and record its range. It is an internal error if the current token is not flagged as a
/// possible variable assignment.
void visit_node_field(variable_assignment_t &varas) {
    if (unsource_leaves()) {
        varas.unsourced = true;
    } else {
        const bool plausible = peek_token().may_be_variable_assignment;
        if (!plausible) {
            internal_error(__FUNCTION__,
                           L"Should not have created variable_assignment_t from this token");
        }
        varas.range = consume_token_type(parse_token_type_t::string);
    }
}
/// Populate a job continuation. Before visiting its fields, report a dedicated error when the
/// token at lookahead index 1 is 'and' or 'or', as in `true | and false`.
void visit_node_field(job_continuation_t &node) {
    const auto &next = peek_token(1);
    const bool is_and = (next.keyword == parse_keyword_t::kw_and);
    const bool is_or = (next.keyword == parse_keyword_t::kw_or);
    if (is_and || is_or) {
        const wchar_t *offender = is_and ? L"and" : L"or";
        parse_error(next, parse_error_andor_in_pipeline, INVALID_PIPELINE_CMD_ERR_MSG, offender);
    }
    // The fields are visited regardless of whether an error was reported.
    node.accept(*this);
}
// Visit branch nodes by just calling accept() to visit their fields.
// This overload is only enabled for node types whose Category is category_t::branch.
template <typename Node>
enable_if_t<Node::Category == category_t::branch> visit_node_field(Node &node) {
    // This field is a direct embedding of an AST value, so there is nothing to allocate:
    // hand ourselves to the node so that it visits its own fields in place.
    node.accept(*this);
    return;
}
// Overload for token fields.
// Consumes the current token if its type is one of TokTypes, recording its type and range.
// Otherwise an error is reported and the field is marked unsourced, except that an unterminated
// quote or subshell under parse_flag_leave_unterminated leaves the field untouched and silent.
template <parse_token_type_t... TokTypes>
void visit_node_field(token_t<TokTypes...> &token) {
    if (unsource_leaves()) {
        token.unsourced = true;
        return;
    }

    const auto &next = peek_token();
    if (token.allows_token(next.type)) {
        // The expected case: take the token.
        parse_token_t taken = consume_any_token();
        token.type = taken.type;
        token.range = taken.range();
        return;
    }

    const bool unterminated = (next.tok_error == tokenizer_error_t::unterminated_quote ||
                               next.tok_error == tokenizer_error_t::unterminated_subshell);
    if ((flags_ & parse_flag_leave_unterminated) && unterminated) {
        // Note that the token is not marked unsourced on this path.
        return;
    }

    parse_error(next, parse_error_generic, L"Expected %ls, but found %ls",
                token_types_user_presentable_description({TokTypes...}).c_str(),
                next.user_presentable_description().c_str());
    token.unsourced = true;
}
// Overload for keyword fields.
// Consumes the current token if its keyword is one of KWs, recording the keyword and range.
// Otherwise the field is marked unsourced and, unless we were asked to tolerate an unterminated
// quote or subshell, an error is reported.
template <parse_keyword_t... KWs>
void visit_node_field(keyword_t<KWs...> &keyword) {
    if (unsource_leaves()) {
        keyword.unsourced = true;
        return;
    }

    const auto &next = peek_token();
    if (keyword.allows_keyword(next.keyword)) {
        // The expected case: take the token.
        parse_token_t taken = consume_any_token();
        keyword.kw = taken.keyword;
        keyword.range = taken.range();
        return;
    }

    keyword.unsourced = true;
    const bool unterminated = (next.tok_error == tokenizer_error_t::unterminated_quote ||
                               next.tok_error == tokenizer_error_t::unterminated_subshell);
    if ((flags_ & parse_flag_leave_unterminated) && unterminated) {
        return;
    }

    // Special error reporting for keyword_t<kw_end>: point at the keyword that opened the block,
    // when one can be found for the node on top of the visit stack.
    const std::array<parse_keyword_t, sizeof...(KWs)> accepted = {{KWs...}};
    if (accepted.size() == 1 && accepted[0] == parse_keyword_t::kw_end) {
        assert(!visit_stack_.empty() && "Visit stack should not be empty");
        const auto opener = find_block_open_keyword(visit_stack_.back());
        const source_range_t opener_range = opener.first;
        const wchar_t *opener_name = opener.second;
        if (opener_name) {
            this->parse_error(opener_range, parse_error_generic,
                              L"Missing end to balance this %ls", opener_name);
        }
    }

    // The generic error follows any "missing end" error reported above.
    parse_error(next, parse_error_generic, L"Expected %ls, but found %ls",
                keywords_user_presentable_description({KWs...}).c_str(),
                next.user_presentable_description().c_str());
}
// Overload for maybe_newlines.
// Consumes every consecutive newline token and records a single range spanning all of them.
// If there are none, the range is left as {0, 0}.
void visit_node_field(maybe_newlines_t &nls) {
    if (unsource_leaves()) {
        nls.unsourced = true;
        return;
    }
    // TODO: it would be nice to have the start offset be the current position in the token
    // stream, even if there are no newlines.
    nls.range = {0, 0};
    while (peek_token().is_newline) {
        const auto consumed = consume_token_type(parse_token_type_t::end);
        if (nls.range.length != 0) {
            // Stretch the existing range so that it ends where this newline ends.
            nls.range.length = consumed.start + consumed.length - nls.range.start;
        } else {
            // Nothing recorded yet: this newline becomes the range.
            nls.range = consumed;
        }
    }
}
/// Populate an optional field by attempting to parse an AstNode at the current position.
/// The contents are whatever try_parse() returns, which is null if the node cannot be parsed.
template <typename AstNode>
void visit_optional_field(optional_t<AstNode> &ptr) {
    // This field is an optional node.
    ptr.contents = this->try_parse<AstNode>();
}
/// Populate a list field by delegating to populate_list(), leaving its second argument
/// (exhaust_stream) unspecified.
template <type_t ListNodeType, typename ContentsNode>
void visit_list_field(list_t<ListNodeType, ContentsNode> &list) {
    // This field is an embedding of an array of (pointers to) ContentsNode.
    // Parse as many as we can.
    populate_list(list);
}
// We currently only have a handful of union pointer types.
// Handle them directly.

/// Populate the contents of a statement_t.
/// The pointer is asserted to be non-null afterwards.
void visit_union_field(statement_t::contents_ptr_t &ptr) {
    ptr = this->allocate_populate_statement_contents();
    assert(ptr && "Statement contents must never be null");
}
/// Populate the contents of an argument_or_redirection_t.
/// An argument is tried first, then a redirection; if neither can be parsed an internal error is
/// reported. The contents are asserted to be non-null afterwards.
void visit_union_field(argument_or_redirection_t::contents_ptr_t &contents) {
    if (auto arg = try_parse<argument_t>()) {
        contents = std::move(arg);
    } else if (auto redir = try_parse<redirection_t>()) {
        contents = std::move(redir);
    } else {
        internal_error(__FUNCTION__, L"Unable to parse argument or redirection");
    }
    // The message names this union explicitly so a failure here is not mistaken for the
    // statement_t overload's assertion.
    assert(contents && "Argument or redirection contents must never be null");
}
/// Populate the header of a block_statement_t.
/// The pointer is asserted to be non-null afterwards.
void visit_union_field(block_statement_t::header_ptr_t &ptr) {
    ptr = this->allocate_populate_block_header();
    assert(ptr && "Header pointer must never be null");
}
/// Hook invoked before the fields of \p node are visited.
/// Logs the node under the ast_construction category and pushes it onto the visit stack.
void will_visit_fields_of(const node_t &node) {
    FLOGF(ast_construction, L"%*swill_visit %ls %p", spaces(), "", node.describe().c_str(),
          (const void *)&node);
    visit_stack_.push_back(&node);
}
/// Hook invoked after the fields of \p node have been visited.
/// Pops the node off the visit stack, asserting that it is the topmost entry.
void did_visit_fields_of(const node_t &node) {
    assert(!visit_stack_.empty() && visit_stack_.back() == &node &&
           "Node was not at the top of the visit stack");
    visit_stack_.pop_back();
}
/// Flags controlling parsing.
parse_tree_flags_t flags_{};
/// Extra stuff like comment ranges.
ast_t::extras_t extras_{};
/// Stream of tokens which we consume.
token_stream_t tokens_;
/// The type which we are attempting to parse, typically job_list but may be
/// freestanding_argument_list.
const type_t top_type_;
/// If set, we are unwinding due to error recovery.
bool unwinding_{false};
/// If set, we have encountered an error.
bool any_error_{false};
/// A stack containing the nodes whose fields we are visiting.
/// Maintained by will_visit_fields_of() and did_visit_fields_of().
std::vector<const node_t *> visit_stack_{};
/// If non-null, populate with errors.
parse_error_list_t *out_errors_{};
};
} // namespace
// Set the parent fields of all nodes in the tree rooted at \p node.
static void set_parents(const node_t *top) {
struct parent_setter_t {
void visit(const node_t &node) {
const_cast<node_t &>(node).parent = parent_;
const node_t *saved = parent_;
parent_ = &node;
node_visitor(*this).accept_children_of(&node);
parent_ = saved;
}
const node_t *parent_{nullptr};
};
struct parent_setter_t ps;
node_visitor(ps).accept(top);
}
// static
// Parse \p src into a tree whose root is of type \p top_type, which must be either job_list or
// freestanding_argument_list. Errors are handed to the populator via \p out_errors.
ast_t ast_t::parse_from_top(const wcstring &src, parse_tree_flags_t parse_flags,
                            parse_error_list_t *out_errors, type_t top_type) {
    assert((top_type == type_t::job_list || top_type == type_t::freestanding_argument_list) &&
           "Invalid top type");
    ast_t ast;
    populator_t populator(src, parse_flags, top_type, out_errors);

    // The top-level list must consume the whole token stream.
    const bool exhaust_stream = true;
    if (top_type != type_t::job_list) {
        std::unique_ptr<freestanding_argument_list_t> arg_root =
            populator.allocate<freestanding_argument_list_t>();
        populator.populate_list(arg_root->arguments, exhaust_stream);
        ast.top_.reset(arg_root.release());
    } else {
        std::unique_ptr<job_list_t> job_root = populator.allocate<job_list_t>();
        populator.populate_list(*job_root, exhaust_stream);
        ast.top_.reset(job_root.release());
    }

    // Chomp trailing extras, etc.
    populator.chomp_extras(type_t::job_list);

    // Carry the populator's results over to the tree.
    ast.any_error_ = populator.any_error_;
    ast.extras_ = std::move(populator.extras_);

    // Set all parent nodes.
    // It turns out to be more convenient to do this after the parse phase.
    set_parents(ast.top());
    return ast;
}
// static
// Parse \p src with a job_list as the top-level node.
// \p flags and \p out_errors are forwarded unchanged to parse_from_top().
ast_t ast_t::parse(const wcstring &src, parse_tree_flags_t flags, parse_error_list_t *out_errors) {
    return parse_from_top(src, flags, out_errors, type_t::job_list);
}
// static
// Parse \p src with a freestanding_argument_list as the top-level node.
// \p flags and \p out_errors are forwarded unchanged to parse_from_top().
ast_t ast_t::parse_argument_list(const wcstring &src, parse_tree_flags_t flags,
                                 parse_error_list_t *out_errors) {
    return parse_from_top(src, flags, out_errors, type_t::freestanding_argument_list);
}
// \return the depth of a node, i.e. number of parent links.
static int get_depth(const node_t *node) {
int result = 0;
for (const node_t *cursor = node->parent; cursor; cursor = cursor->parent) {
result += 1;
}
return result;
}
// Render the tree as text, one node per line, each line prefixed according to the node's depth.
// \p orig is passed to try_source() to show the source text of arguments, strings and
// redirections.
wcstring ast_t::dump(const wcstring &orig) const {
    wcstring output;

    // Build the indentation prefix: the two-character unit "! " repeated \p levels times.
    auto indent_prefix = [](int levels) {
        std::string prefix;
        prefix.reserve(levels * 2);
        for (int remaining = levels; remaining > 0; --remaining) prefix.append("! ");
        return prefix;
    };

    traversal_t walker = this->walk();
    for (const auto *cur = walker.next(); cur != nullptr; cur = walker.next()) {
        // Indentation first, then a description which depends on the kind of node.
        append_format(output, L"%s", indent_prefix(get_depth(cur)).c_str());

        if (const auto *arg = cur->try_as<argument_t>()) {
            append_format(output, L"argument");
            if (auto text = arg->try_source(orig)) {
                append_format(output, L": '%ls'", text->c_str());
            }
        } else if (const auto *kw = cur->try_as<keyword_base_t>()) {
            append_format(output, L"keyword: %ls", keyword_description(kw->kw));
        } else if (const auto *tok = cur->try_as<token_base_t>()) {
            wcstring label;
            // Strings and redirections additionally show their source text, when available.
            bool with_source = false;
            switch (tok->type) {
                case parse_token_type_t::string:
                    label = format_string(L"string");
                    with_source = true;
                    break;
                case parse_token_type_t::redirection:
                    label = L"redirection";
                    with_source = true;
                    break;
                case parse_token_type_t::end:
                    label = L"<;>";
                    break;
                case parse_token_type_t::invalid:
                    // This may occur with errors, e.g. we expected to see a string but saw a
                    // redirection.
                    label = L"<error>";
                    break;
                default:
                    label = token_type_user_presentable_description(tok->type);
                    break;
            }
            if (with_source) {
                if (auto text = tok->try_source(orig)) {
                    append_format(label, L": '%ls'", text->c_str());
                }
            }
            append_format(output, L"%ls", label.c_str());
        } else {
            append_format(output, L"%ls", cur->describe().c_str());
        }
        append_format(output, L"\n");
    }
    return output;
}
} // namespace ast