fish-shell/src/parse_tree.cpp

1588 lines
63 KiB
C++
Raw Normal View History

// Programmatic representation of fish code.
2015-07-25 23:14:25 +08:00
#include <assert.h>
#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <wchar.h>
#include <algorithm>
#include <cwchar>
#include <string>
#include <vector>
2015-07-25 23:14:25 +08:00
#include "common.h"
#include "fallback.h" // IWYU pragma: keep
2015-07-25 23:14:25 +08:00
#include "parse_constants.h"
2013-07-26 06:24:22 +08:00
#include "parse_productions.h"
2015-07-25 23:14:25 +08:00
#include "parse_tree.h"
#include "proc.h"
2013-06-09 10:20:26 +08:00
#include "tokenizer.h"
#include "wutil.h" // IWYU pragma: keep
2013-06-02 13:14:47 +08:00
// This array provides strings for each symbol in enum parse_token_type_t in parse_constants.h.
const wchar_t *const token_type_map[] = {
L"token_type_invalid",
L"symbol_job_list",
L"symbol_job",
L"symbol_job_continuation",
L"symbol_statement",
L"symbol_block_statement",
L"symbol_block_header",
L"symbol_for_header",
L"symbol_while_header",
L"symbol_begin_header",
L"symbol_function_header",
L"symbol_if_statement",
L"symbol_if_clause",
L"symbol_else_clause",
L"symbol_else_continuation",
L"symbol_switch_statement",
L"symbol_case_item_list",
L"symbol_case_item",
L"symbol_boolean_statement",
L"symbol_decorated_statement",
L"symbol_plain_statement",
L"symbol_arguments_or_redirections_list",
L"symbol_argument_or_redirection",
L"symbol_andor_job_list",
L"symbol_argument_list",
L"symbol_freestanding_argument_list",
L"symbol_argument",
L"symbol_redirection",
L"symbol_optional_background",
L"symbol_end_command",
L"parse_token_type_string",
L"parse_token_type_pipe",
L"parse_token_type_redirection",
L"parse_token_type_background",
L"parse_token_type_end",
L"parse_token_type_terminate",
L"parse_special_type_parse_error",
L"parse_special_type_tokenizer_error",
L"parse_special_type_comment",
};
2013-07-29 06:19:38 +08:00
using namespace parse_productions;
2013-06-02 13:14:47 +08:00
static bool production_is_empty(const production_t *production) {
return (*production)[0] == token_type_invalid;
}
/// Returns a string description of this parse error.
wcstring parse_error_t::describe_with_prefix(const wcstring &src, const wcstring &prefix,
bool is_interactive, bool skip_caret) const {
wcstring result = text;
if (!skip_caret && source_start < src.size() && source_start + source_length <= src.size()) {
// Locate the beginning of this line of source.
size_t line_start = 0;
// Look for a newline prior to source_start. If we don't find one, start at the beginning of
// the string; otherwise start one past the newline. Note that source_start may itself point
// at a newline; we want to find the newline before it.
if (source_start > 0) {
size_t newline = src.find_last_of(L'\n', source_start - 1);
if (newline != wcstring::npos) {
line_start = newline + 1;
}
}
// Look for the newline after the source range. If the source range itself includes a
// newline, that's the one we want, so start just before the end of the range.
size_t last_char_in_range =
(source_length == 0 ? source_start : source_start + source_length - 1);
size_t line_end = src.find(L'\n', last_char_in_range);
if (line_end == wcstring::npos) {
line_end = src.size();
}
assert(line_end >= line_start);
assert(source_start >= line_start);
// Don't include the caret and line if we're interactive this is the first line, because
// then it's obvious.
bool skip_caret = (is_interactive && source_start == 0);
if (!skip_caret) {
// Append the line of text.
if (!result.empty()) {
result.push_back(L'\n');
}
result.append(prefix);
result.append(src, line_start, line_end - line_start);
// Append the caret line. The input source may include tabs; for that reason we
// construct a "caret line" that has tabs in corresponding positions.
const wcstring line_to_measure =
prefix + wcstring(src, line_start, source_start - line_start);
wcstring caret_space_line;
caret_space_line.reserve(source_start - line_start);
for (size_t i = 0; i < line_to_measure.size(); i++) {
wchar_t wc = line_to_measure.at(i);
if (wc == L'\t') {
caret_space_line.push_back(L'\t');
} else if (wc == L'\n') {
// It's possible that the source_start points at a newline itself. In that case,
// pretend it's a space. We only expect this to be at the end of the string.
caret_space_line.push_back(L' ');
} else {
int width = fish_wcwidth(wc);
if (width > 0) {
caret_space_line.append(static_cast<size_t>(width), L' ');
}
}
}
result.push_back(L'\n');
result.append(caret_space_line);
result.push_back(L'^');
}
}
return result;
}
2013-06-02 13:14:47 +08:00
wcstring parse_error_t::describe(const wcstring &src) const {
return this->describe_with_prefix(src, wcstring(), get_is_interactive(), false);
}
void parse_error_offset_source_start(parse_error_list_t *errors, size_t amt) {
assert(errors != NULL);
if (amt > 0) {
size_t i, max = errors->size();
for (i = 0; i < max; i++) {
parse_error_t *error = &errors->at(i);
// Preserve the special meaning of -1 as 'unknown'.
if (error->source_start != SOURCE_LOCATION_UNKNOWN) {
error->source_start += amt;
}
}
}
}
/// Returns a string description for the given token type.
const wchar_t *token_type_description(parse_token_type_t type) {
if (type >= 0 && type <= LAST_TOKEN_TYPE) return token_type_map[type];
// This leaks memory but it should never be run unless we have a bug elsewhere in the code.
const wcstring d = format_string(L"unknown_token_type_%ld", static_cast<long>(type));
wchar_t *d2 = new wchar_t[d.size() + 1];
// cppcheck-suppress memleak
return std::wcscpy(d2, d.c_str());
2013-06-09 10:20:26 +08:00
}
#define LONGIFY(x) L##x
#define KEYWORD_MAP(x) \
{ parse_keyword_##x, LONGIFY(#x) }
static const struct {
const parse_keyword_t keyword;
const wchar_t *const name;
}
keyword_map[] =
{
// Note that these must be sorted (except for the first), so that we can do binary search.
KEYWORD_MAP(none),
KEYWORD_MAP(and),
KEYWORD_MAP(begin),
KEYWORD_MAP(builtin),
KEYWORD_MAP(case),
KEYWORD_MAP(command),
KEYWORD_MAP(else),
KEYWORD_MAP(end),
KEYWORD_MAP(exec),
KEYWORD_MAP(for),
KEYWORD_MAP(function),
KEYWORD_MAP(if),
KEYWORD_MAP(in),
KEYWORD_MAP(not),
KEYWORD_MAP(or),
KEYWORD_MAP(switch),
KEYWORD_MAP(while)
};
const wchar_t *keyword_description(parse_keyword_t type) {
if (type >= 0 && type <= LAST_KEYWORD) return keyword_map[type].name;
// This leaks memory but it should never be run unless we have a bug elsewhere in the code.
const wcstring d = format_string(L"unknown_keyword_%ld", static_cast<long>(type));
wchar_t *d2 = new wchar_t[d.size() + 1];
// cppcheck-suppress memleak
return std::wcscpy(d2, d.c_str());
2013-06-23 17:09:46 +08:00
}
static wcstring token_type_user_presentable_description(
parse_token_type_t type, parse_keyword_t keyword = parse_keyword_none) {
if (keyword != parse_keyword_none) {
return format_string(L"keyword '%ls'", keyword_description(keyword));
}
switch (type) {
// Hackish. We only support the following types.
case symbol_statement: {
return L"a command";
}
case symbol_argument: {
return L"an argument";
}
case parse_token_type_string: {
return L"a string";
}
case parse_token_type_pipe: {
return L"a pipe";
}
case parse_token_type_redirection: {
return L"a redirection";
}
case parse_token_type_background: {
return L"a '&'";
}
case parse_token_type_end: {
return L"end of the statement";
}
case parse_token_type_terminate: {
return L"end of the input";
}
default: { return format_string(L"a %ls", token_type_description(type)); }
}
}
static wcstring block_type_user_presentable_description(parse_token_type_t type) {
switch (type) {
case symbol_for_header: {
return L"for loop";
}
case symbol_while_header: {
return L"while loop";
}
case symbol_function_header: {
return L"function definition";
}
case symbol_begin_header: {
return L"begin";
}
case symbol_if_statement: {
return L"if statement";
}
case symbol_switch_statement: {
return L"switch statement";
}
default: { return token_type_description(type); }
}
}
/// Returns a string description of the given parse node.
wcstring parse_node_t::describe() const {
wcstring result = token_type_description(this->type);
return result;
}
/// Returns a string description of the given parse token.
wcstring parse_token_t::describe() const {
wcstring result = token_type_description(type);
if (keyword != parse_keyword_none) {
append_format(result, L" <%ls>", keyword_description(keyword));
}
return result;
}
/// A string description appropriate for presentation to the user.
wcstring parse_token_t::user_presentable_description() const {
return token_type_user_presentable_description(type, keyword);
}
2013-06-23 17:09:46 +08:00
/// Convert from tokenizer_t's token type to a parse_token_t type.
static inline parse_token_type_t parse_token_type_from_tokenizer_token(
enum token_type tokenizer_token_type) {
2013-08-11 15:35:00 +08:00
parse_token_type_t result = token_type_invalid;
switch (tokenizer_token_type) {
case TOK_STRING: {
2013-08-11 15:35:00 +08:00
result = parse_token_type_string;
2013-06-02 13:14:47 +08:00
break;
}
case TOK_PIPE: {
2013-08-11 15:35:00 +08:00
result = parse_token_type_pipe;
2013-06-02 13:14:47 +08:00
break;
}
case TOK_END: {
2013-08-11 15:35:00 +08:00
result = parse_token_type_end;
2013-06-02 13:14:47 +08:00
break;
}
case TOK_BACKGROUND: {
2013-08-11 15:35:00 +08:00
result = parse_token_type_background;
2013-06-02 13:14:47 +08:00
break;
}
case TOK_REDIRECT_OUT:
case TOK_REDIRECT_APPEND:
case TOK_REDIRECT_IN:
case TOK_REDIRECT_FD:
case TOK_REDIRECT_NOCLOB: {
2013-08-11 15:35:00 +08:00
result = parse_token_type_redirection;
break;
}
case TOK_ERROR: {
2013-08-11 15:35:00 +08:00
result = parse_special_type_tokenizer_error;
2013-08-09 06:06:46 +08:00
break;
}
case TOK_COMMENT: {
2013-08-11 15:35:00 +08:00
result = parse_special_type_comment;
2013-08-09 06:06:46 +08:00
break;
}
default: {
fprintf(stderr, "Bad token type %d passed to %s\n", (int)tokenizer_token_type,
__FUNCTION__);
2013-06-02 13:14:47 +08:00
assert(0);
break;
}
2013-06-02 13:14:47 +08:00
}
return result;
}
/// Helper function for dump_tree.
static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &src,
node_offset_t node_idx, size_t indent, wcstring *result,
size_t *line, node_offset_t *inout_first_node_not_dumped) {
assert(node_idx < nodes.size());
// Update first_node_not_dumped. This takes a bit of explanation. While it's true that a parse
// tree may be a "forest", its individual trees are "compact," meaning they are not
// interleaved. Thus we keep track of the largest node index as we descend a tree. One past the
// largest is the start of the next tree.
if (*inout_first_node_not_dumped <= node_idx) {
*inout_first_node_not_dumped = node_idx + 1;
}
const parse_node_t &node = nodes.at(node_idx);
const size_t spacesPerIndent = 2;
// Unindent statement lists by 1 to flatten them.
if (node.type == symbol_job_list || node.type == symbol_arguments_or_redirections_list) {
if (indent > 0) indent -= 1;
}
append_format(*result, L"%2lu - %l2u ", *line, node_idx);
result->append(indent * spacesPerIndent, L' ');
;
2013-06-09 10:20:26 +08:00
result->append(node.describe());
if (node.child_count > 0) {
2013-06-09 10:20:26 +08:00
append_format(*result, L" <%lu children>", node.child_count);
2013-06-02 13:14:47 +08:00
}
if (node.has_comments()) {
append_format(*result, L" <has_comments>", node.child_count);
}
if (node.has_source() && node.type == parse_token_type_string) {
result->append(L": \"");
result->append(src, node.source_start, node.source_length);
result->append(L"\"");
}
if (node.type != parse_token_type_string) {
if (node.has_source()) {
append_format(*result, L" [%ld, %ld]", (long)node.source_start,
(long)node.source_length);
} else {
append_format(*result, L" [no src]", (long)node.source_start,
(long)node.source_length);
2013-08-09 06:06:46 +08:00
}
2013-06-02 13:14:47 +08:00
}
2013-06-09 10:20:26 +08:00
result->push_back(L'\n');
++*line;
for (node_offset_t child_idx = node.child_start;
child_idx < node.child_start + node.child_count; child_idx++) {
dump_tree_recursive(nodes, src, child_idx, indent + 1, result, line,
inout_first_node_not_dumped);
}
2013-06-09 10:20:26 +08:00
}
/// Gives a debugging textual description of a parse tree. Note that this supports "parse forests"
/// too. That is, our tree may not really be a tree, but instead a collection of trees.
wcstring parse_dump_tree(const parse_node_tree_t &nodes, const wcstring &src) {
if (nodes.empty()) return L"(empty!)";
node_offset_t first_node_not_dumped = 0;
2013-06-09 10:20:26 +08:00
size_t line = 0;
wcstring result;
while (first_node_not_dumped < nodes.size()) {
if (first_node_not_dumped > 0) {
result.append(L"---New Tree---\n");
}
dump_tree_recursive(nodes, src, first_node_not_dumped, 0, &result, &line,
&first_node_not_dumped);
}
2013-06-09 10:20:26 +08:00
return result;
}
/// Struct representing elements of the symbol stack, used in the internal state of the LL parser.
struct parse_stack_element_t {
enum parse_token_type_t type;
enum parse_keyword_t keyword;
2013-06-09 10:20:26 +08:00
node_offset_t node_idx;
explicit parse_stack_element_t(parse_token_type_t t, node_offset_t idx)
: type(t), keyword(parse_keyword_none), node_idx(idx) {}
2013-08-11 15:35:00 +08:00
explicit parse_stack_element_t(production_element_t e, node_offset_t idx)
: type(production_element_type(e)), keyword(production_element_keyword(e)), node_idx(idx) {}
wcstring describe(void) const {
2013-06-28 06:12:27 +08:00
wcstring result = token_type_description(type);
if (keyword != parse_keyword_none) {
append_format(result, L" <%ls>", keyword_description(keyword));
2013-06-28 06:12:27 +08:00
}
return result;
}
/// Returns a name that we can show to the user, e.g. "a command".
wcstring user_presentable_description(void) const {
return token_type_user_presentable_description(type, keyword);
}
2013-06-02 13:14:47 +08:00
};
/// The parser itself, private implementation of class parse_t. This is a hand-coded table-driven LL
/// parser. Most hand-coded LL parsers are recursive descent, but recursive descent parsers are
/// difficult to "pause", unlike table-driven parsers.
class parse_ll_t {
// Traditional symbol stack of the LL parser.
2013-08-11 15:35:00 +08:00
std::vector<parse_stack_element_t> symbol_stack;
// Parser output. This is a parse tree, but stored in an array.
parse_node_tree_t nodes;
// Whether we ran into a fatal error, including parse errors or tokenizer errors.
bool fatal_errored;
// Whether we should collect error messages or not.
2013-10-07 07:23:45 +08:00
bool should_generate_error_messages;
// List of errors we have encountered.
parse_error_list_t errors;
// The symbol stack can contain terminal types or symbols. Symbols go on to do productions, but
// terminal types are just matched against input tokens.
2013-08-11 15:35:00 +08:00
bool top_node_handle_terminal_types(parse_token_t token);
void parse_error_unexpected_token(const wchar_t *expected, parse_token_t token);
void parse_error(parse_token_t token, parse_error_code_t code, const wchar_t *format, ...);
void parse_error_at_location(size_t location, parse_error_code_t code, const wchar_t *format,
...);
void parse_error_failed_production(struct parse_stack_element_t &elem, parse_token_t token);
void parse_error_unbalancing_token(parse_token_t token);
// Reports an error for an unclosed block, e.g. 'begin;'. Returns true on success, false on
// failure (e.g. it is not an unclosed block).
bool report_error_for_unclosed_block();
2013-06-28 06:12:27 +08:00
void dump_stack(void) const;
/// Get the node corresponding to the top element of the stack.
parse_node_t &node_for_top_symbol() {
PARSE_ASSERT(!symbol_stack.empty());
2013-06-09 10:20:26 +08:00
const parse_stack_element_t &top_symbol = symbol_stack.back();
2014-04-28 08:23:19 +08:00
PARSE_ASSERT(top_symbol.node_idx != NODE_OFFSET_INVALID);
2013-06-09 10:20:26 +08:00
PARSE_ASSERT(top_symbol.node_idx < nodes.size());
return nodes.at(top_symbol.node_idx);
}
/// Pop from the top of the symbol stack, then push the given production, updating node counts.
/// Note that production_t has type "pointer to array" so some care is required.
inline void symbol_stack_pop_push_production(const production_t *production) {
2013-07-29 06:19:38 +08:00
bool logit = false;
if (logit) {
2013-07-29 06:19:38 +08:00
size_t count = 0;
fprintf(stderr, "Applying production:\n");
for (size_t i = 0; i < MAX_SYMBOLS_PER_PRODUCTION; i++) {
2013-07-29 06:19:38 +08:00
production_element_t elem = (*production)[i];
if (production_element_is_valid(elem)) {
2013-07-29 06:19:38 +08:00
parse_token_type_t type = production_element_type(elem);
parse_keyword_t keyword = production_element_keyword(elem);
fprintf(stderr, "\t%ls <%ls>\n", token_type_description(type),
keyword_description(keyword));
2013-07-29 06:19:38 +08:00
count++;
}
}
if (!count) fprintf(stderr, "\t<empty>\n");
}
// Get the parent index. But we can't get the parent parse node yet, since it may be made
// invalid by adding children.
const node_offset_t parent_node_idx = symbol_stack.back().node_idx;
2013-08-11 15:35:00 +08:00
// Add the children. Confusingly, we want our nodes to be in forwards order (last token
// last, so dumps look nice), but the symbols should be reverse order (last token first, so
// it's lowest on the stack)
const size_t child_start_big = nodes.size();
assert(child_start_big < NODE_OFFSET_INVALID);
node_offset_t child_start = static_cast<node_offset_t>(child_start_big);
2014-04-01 01:01:39 +08:00
// To avoid constructing multiple nodes, we make a single one that we modify.
parse_node_t representative_child(token_type_invalid);
representative_child.parent = parent_node_idx;
2014-04-01 01:01:39 +08:00
node_offset_t child_count = 0;
for (size_t i = 0; i < MAX_SYMBOLS_PER_PRODUCTION; i++) {
2013-07-29 06:19:38 +08:00
production_element_t elem = (*production)[i];
if (!production_element_is_valid(elem)) {
break; // all done, bail out
2013-08-11 15:35:00 +08:00
}
// Append the parse node.
representative_child.type = production_element_type(elem);
nodes.push_back(representative_child);
child_count++;
2013-07-29 06:19:38 +08:00
}
2013-08-11 15:35:00 +08:00
// Update the parent.
2013-07-29 06:19:38 +08:00
parse_node_t &parent_node = nodes.at(parent_node_idx);
2013-08-11 15:35:00 +08:00
// Should have no children yet.
2013-07-29 06:19:38 +08:00
PARSE_ASSERT(parent_node.child_count == 0);
// Tell the node about its children.
2013-07-29 06:19:38 +08:00
parent_node.child_start = child_start;
parent_node.child_count = child_count;
2013-08-11 15:35:00 +08:00
// Replace the top of the stack with new stack elements corresponding to our new nodes. Note
// that these go in reverse order.
symbol_stack.pop_back();
2013-07-29 06:19:38 +08:00
symbol_stack.reserve(symbol_stack.size() + child_count);
node_offset_t idx = child_count;
while (idx--) {
2013-07-29 06:19:38 +08:00
production_element_t elem = (*production)[idx];
PARSE_ASSERT(production_element_is_valid(elem));
symbol_stack.push_back(parse_stack_element_t(elem, child_start + idx));
2013-07-11 14:45:09 +08:00
}
}
2013-07-22 06:22:11 +08:00
public:
// Constructor
explicit parse_ll_t(enum parse_token_type_t goal)
: fatal_errored(false), should_generate_error_messages(true) {
2013-08-11 15:35:00 +08:00
this->symbol_stack.reserve(16);
this->nodes.reserve(64);
this->reset_symbols_and_nodes(goal);
2013-08-11 15:35:00 +08:00
}
// Input
void accept_tokens(parse_token_t token1, parse_token_t token2);
/// Report tokenizer errors.
void report_tokenizer_error(const tok_t &tok);
/// Indicate if we hit a fatal error.
bool has_fatal_error(void) const { return this->fatal_errored; }
/// Indicate whether we want to generate error messages.
void set_should_generate_error_messages(bool flag) {
2013-10-07 07:23:45 +08:00
this->should_generate_error_messages = flag;
}
/// Clear the parse symbol stack (but not the node tree). Add a node of the given type as the
/// goal node. This is called from the constructor.
void reset_symbols(enum parse_token_type_t goal);
2013-08-11 15:35:00 +08:00
/// Clear the parse symbol stack and the node tree. Add a node of the given type as the goal
/// node. This is called from the constructor.
void reset_symbols_and_nodes(enum parse_token_type_t goal);
/// Once parsing is complete, determine the ranges of intermediate nodes.
2013-08-11 15:35:00 +08:00
void determine_node_ranges();
/// Acquire output after parsing. This transfers directly from within self.
2013-08-11 15:35:00 +08:00
void acquire_output(parse_node_tree_t *output, parse_error_list_t *errors);
2013-06-02 13:14:47 +08:00
};
void parse_ll_t::dump_stack(void) const {
// Walk backwards from the top, looking for parents.
2013-06-28 06:12:27 +08:00
wcstring_list_t lines;
if (symbol_stack.empty()) {
2013-06-28 06:12:27 +08:00
lines.push_back(L"(empty)");
} else {
2013-06-28 06:12:27 +08:00
node_offset_t child = symbol_stack.back().node_idx;
node_offset_t cursor = child;
lines.push_back(nodes.at(cursor).describe());
while (cursor--) {
2013-06-28 06:12:27 +08:00
const parse_node_t &node = nodes.at(cursor);
if (node.child_start <= child && node.child_start + node.child_count > child) {
2013-06-28 06:12:27 +08:00
lines.push_back(node.describe());
child = cursor;
}
}
}
fprintf(stderr, "Stack dump (%zu elements):\n", symbol_stack.size());
for (size_t idx = 0; idx < lines.size(); idx++) {
2013-06-28 06:12:27 +08:00
fprintf(stderr, " %ls\n", lines.at(idx).c_str());
}
}
// Give each node a source range equal to the union of the ranges of its children. Terminal nodes
// already have source ranges (and no children). Since children always appear after their parents,
// we can implement this very simply by walking backwards. We then do a second pass to give empty
// nodes an empty source range (but with a valid offset). We do this by walking forward. If a child
// of a node has an invalid source range, we set it equal to the end of the source range of its
// previous child.
void parse_ll_t::determine_node_ranges(void) {
2013-08-09 06:06:46 +08:00
size_t idx = nodes.size();
while (idx--) {
parse_node_t *parent = &nodes[idx];
2013-08-11 15:35:00 +08:00
2013-08-09 06:06:46 +08:00
// Skip nodes that already have a source range. These are terminal nodes.
if (parent->source_start != SOURCE_OFFSET_INVALID) continue;
2013-08-11 15:35:00 +08:00
2013-08-09 06:06:46 +08:00
// Ok, this node needs a source range. Get all of its children, and then set its range.
source_offset_t min_start = SOURCE_OFFSET_INVALID,
max_end = 0; // note SOURCE_OFFSET_INVALID is huge
for (node_offset_t i = 0; i < parent->child_count; i++) {
2013-08-09 06:06:46 +08:00
const parse_node_t &child = nodes.at(parent->child_offset(i));
if (child.has_source()) {
min_start = std::min(min_start, child.source_start);
max_end = std::max(max_end, child.source_start + child.source_length);
}
2013-08-09 06:06:46 +08:00
}
2013-08-11 15:35:00 +08:00
if (min_start != SOURCE_OFFSET_INVALID) {
2013-08-09 06:06:46 +08:00
assert(max_end >= min_start);
parent->source_start = min_start;
parent->source_length = max_end - min_start;
}
}
// Forward pass.
size_t size = nodes.size();
for (idx = 0; idx < size; idx++) {
// Since we populate the source range based on the sibling node, it's simpler to walk over
// the children of each node. We keep a running "child_source_cursor" which is meant to be
// the end of the child's source range. It's initially set to the beginning of the parent'
// source range.
parse_node_t *parent = &nodes[idx];
// If the parent doesn't have a valid source range, then none of its children will either;
// skip it entirely.
if (parent->source_start == SOURCE_OFFSET_INVALID) {
continue;
}
source_offset_t child_source_cursor = parent->source_start;
for (size_t child_idx = 0; child_idx < parent->child_count; child_idx++) {
parse_node_t *child = &nodes[parent->child_start + child_idx];
if (child->source_start == SOURCE_OFFSET_INVALID) {
child->source_start = child_source_cursor;
}
child_source_cursor = child->source_start + child->source_length;
}
}
2013-08-09 06:06:46 +08:00
}
void parse_ll_t::acquire_output(parse_node_tree_t *output, parse_error_list_t *errors) {
if (output != NULL) {
output->swap(this->nodes);
2013-08-11 15:35:00 +08:00
}
this->nodes.clear();
if (errors != NULL) {
errors->swap(this->errors);
2013-08-11 15:35:00 +08:00
}
this->errors.clear();
this->symbol_stack.clear();
}
void parse_ll_t::parse_error(parse_token_t token, parse_error_code_t code, const wchar_t *fmt,
...) {
this->fatal_errored = true;
if (this->should_generate_error_messages) {
// this->dump_stack();
parse_error_t err;
va_list va;
va_start(va, fmt);
err.text = vformat_string(fmt, va);
err.code = code;
va_end(va);
2013-10-07 07:23:45 +08:00
err.source_start = token.source_start;
err.source_length = token.source_length;
this->errors.push_back(err);
}
2013-06-02 13:14:47 +08:00
}
void parse_ll_t::parse_error_at_location(size_t source_location, parse_error_code_t code,
const wchar_t *fmt, ...) {
this->fatal_errored = true;
if (this->should_generate_error_messages) {
// this->dump_stack();
parse_error_t err;
va_list va;
va_start(va, fmt);
err.text = vformat_string(fmt, va);
err.code = code;
va_end(va);
err.source_start = source_location;
err.source_length = 0;
this->errors.push_back(err);
}
}
// Unbalancing token. This includes 'else' or 'case' or 'end' outside of the appropriate block
// This essentially duplicates some logic from resolving the production for symbol_statement_list -
// yuck.
void parse_ll_t::parse_error_unbalancing_token(parse_token_t token) {
this->fatal_errored = true;
if (this->should_generate_error_messages) {
switch (token.keyword) {
case parse_keyword_end: {
this->parse_error(token, parse_error_unbalancing_end, L"'end' outside of a block");
break;
}
case parse_keyword_else: {
this->parse_error(token, parse_error_unbalancing_else,
L"'else' builtin not inside of if block");
break;
}
case parse_keyword_case: {
this->parse_error(token, parse_error_unbalancing_case,
L"'case' builtin not inside of switch block");
break;
}
default: {
// At the moment, this case should only be hit if you parse a
// freestanding_argument_list. For example, 'complete -c foo -a 'one & three'.
// Hackish error message for that case.
if (!symbol_stack.empty() &&
symbol_stack.back().type == symbol_freestanding_argument_list) {
this->parse_error(
token, parse_error_generic, L"Expected %ls, but found %ls",
token_type_user_presentable_description(symbol_argument).c_str(),
token.user_presentable_description().c_str());
} else {
this->parse_error(token, parse_error_generic, L"Did not expect %ls",
token.user_presentable_description().c_str());
}
break;
}
}
}
}
/// This is a 'generic' parse error when we can't match the top of the stack element.
void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &stack_elem,
parse_token_t token) {
fatal_errored = true;
if (this->should_generate_error_messages) {
bool done = false;
// Check for ||.
if (token.type == parse_token_type_pipe && token.source_start > 0) {
// Here we wanted a statement and instead got a pipe. See if this is a double pipe: foo
// || bar. If so, we have a special error message.
const parse_node_t *prev_pipe = nodes.find_node_matching_source_location(
parse_token_type_pipe, token.source_start - 1, NULL);
if (prev_pipe != NULL) {
// The pipe of the previous job abuts our current token. So we have ||.
this->parse_error(token, parse_error_double_pipe, ERROR_BAD_OR);
done = true;
}
}
// Check for &&.
if (!done && token.type == parse_token_type_background && token.source_start > 0) {
// Check to see if there was a previous token_background.
const parse_node_t *prev_background = nodes.find_node_matching_source_location(
parse_token_type_background, token.source_start - 1, NULL);
if (prev_background != NULL) {
// We have &&.
this->parse_error(token, parse_error_double_background, ERROR_BAD_AND);
done = true;
}
}
if (!done) {
const wcstring expected = stack_elem.user_presentable_description();
this->parse_error_unexpected_token(expected.c_str(), token);
}
}
}
void parse_ll_t::report_tokenizer_error(const tok_t &tok) {
parse_error_code_t parse_error_code;
switch (tok.error) {
case TOK_UNTERMINATED_QUOTE: {
parse_error_code = parse_error_tokenizer_unterminated_quote;
break;
}
case TOK_UNTERMINATED_SUBSHELL: {
parse_error_code = parse_error_tokenizer_unterminated_subshell;
break;
}
case TOK_UNTERMINATED_SLICE: {
parse_error_code = parse_error_tokenizer_unterminated_slice;
break;
}
case TOK_UNTERMINATED_ESCAPE: {
parse_error_code = parse_error_tokenizer_unterminated_escape;
break;
}
case TOK_OTHER:
default: {
parse_error_code = parse_error_tokenizer_other;
break;
}
}
this->parse_error_at_location(tok.offset + tok.error_offset, parse_error_code, L"%ls",
tok.text.c_str());
}
void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) {
fatal_errored = true;
if (this->should_generate_error_messages) {
this->parse_error(token, parse_error_generic, L"Expected %ls, but instead found %ls",
expected, token.user_presentable_description().c_str());
2013-10-07 07:23:45 +08:00
}
2013-06-02 13:14:47 +08:00
}
void parse_ll_t::reset_symbols(enum parse_token_type_t goal) {
// Add a new goal node, and then reset our symbol list to point at it.
node_offset_t where = static_cast<node_offset_t>(nodes.size());
nodes.push_back(parse_node_t(goal));
2013-08-11 15:35:00 +08:00
2013-08-09 06:06:46 +08:00
symbol_stack.clear();
symbol_stack.push_back(parse_stack_element_t(goal, where)); // goal token
2013-08-09 06:06:46 +08:00
this->fatal_errored = false;
}
/// Reset both symbols and nodes.
void parse_ll_t::reset_symbols_and_nodes(enum parse_token_type_t goal) {
2013-08-11 15:35:00 +08:00
nodes.clear();
this->reset_symbols(goal);
2013-08-11 15:35:00 +08:00
}
static bool type_is_terminal_type(parse_token_type_t type) {
switch (type) {
2013-08-11 15:35:00 +08:00
case parse_token_type_string:
case parse_token_type_pipe:
case parse_token_type_redirection:
case parse_token_type_background:
case parse_token_type_end:
case parse_token_type_terminate: {
2013-08-11 15:35:00 +08:00
return true;
}
default: { return false; }
2013-08-11 15:35:00 +08:00
}
}
2013-08-09 06:06:46 +08:00
bool parse_ll_t::report_error_for_unclosed_block() {
bool reported_error = false;
// Unclosed block, for example, 'while true ; '. We want to show the block node that opened it.
const parse_node_t &top_node = this->node_for_top_symbol();
// Hacktastic. We want to point at the source location of the block, but our block doesn't have
// a source range yet - only the terminal tokens do. So get the block statement corresponding to
// this end command. In general this block may be of a variety of types: if_statement,
// switch_statement, etc., each with different node structures. But keep descending the first
// child and eventually you hit a keyword: begin, if, etc. That's the keyword we care about.
const parse_node_t *end_command = this->nodes.get_parent(top_node, symbol_end_command);
const parse_node_t *block_node = end_command ? this->nodes.get_parent(*end_command) : NULL;
if (block_node && block_node->type == symbol_block_statement) {
// Get the header.
block_node = this->nodes.get_child(*block_node, 0, symbol_block_header);
block_node = this->nodes.get_child(*block_node, 0); // specific statement
}
if (block_node != NULL) {
// block_node is now an if_statement, switch_statement, for_header, while_header,
// function_header, or begin_header.
//
// Hackish: descend down the first node until we reach the bottom. This will be a keyword
// node like SWITCH, which will have the source range. Ordinarily the source range would be
// known by the parent node too, but we haven't completed parsing yet, so we haven't yet
// propagated source ranges.
const parse_node_t *cursor = block_node;
while (cursor->child_count > 0) {
cursor = this->nodes.get_child(*cursor, 0);
assert(cursor != NULL);
}
if (cursor->source_start != NODE_OFFSET_INVALID) {
const wcstring node_desc = block_type_user_presentable_description(block_node->type);
this->parse_error_at_location(cursor->source_start, parse_error_generic,
L"Missing end to balance this %ls", node_desc.c_str());
reported_error = true;
}
}
return reported_error;
}
bool parse_ll_t::top_node_handle_terminal_types(parse_token_t token) {
PARSE_ASSERT(!symbol_stack.empty());
2013-06-09 10:20:26 +08:00
PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
2013-08-11 15:35:00 +08:00
bool handled = false;
2013-06-09 10:20:26 +08:00
parse_stack_element_t &stack_top = symbol_stack.back();
if (type_is_terminal_type(stack_top.type)) {
// The top of the stack is terminal. We are going to handle this (because we can't produce
// from a terminal type).
2013-08-11 15:35:00 +08:00
handled = true;
// Now see if we actually matched
bool matched = false;
if (stack_top.type == token.type) {
switch (stack_top.type) {
case parse_token_type_string: {
// We matched if the keywords match, or no keyword was required.
matched = (stack_top.keyword == parse_keyword_none ||
stack_top.keyword == token.keyword);
2013-08-11 15:35:00 +08:00
break;
}
default: {
// For other types, we only require that the types match.
2013-08-11 15:35:00 +08:00
matched = true;
break;
}
2013-08-11 15:35:00 +08:00
}
}
if (matched) {
// Success. Tell the node that it matched this token, and what its source range is in
// the parse phase, we only set source ranges for terminal types. We propagate ranges to
// parent nodes afterwards.
2013-06-09 10:20:26 +08:00
parse_node_t &node = node_for_top_symbol();
node.keyword = token.keyword;
2013-06-09 10:20:26 +08:00
node.source_start = token.source_start;
node.source_length = token.source_length;
} else {
2013-08-11 15:35:00 +08:00
// Failure
if (stack_top.type == parse_token_type_string &&
token.type == parse_token_type_string) {
// Keyword failure. We should unify this with the 'matched' computation above.
assert(stack_top.keyword != parse_keyword_none &&
stack_top.keyword != token.keyword);
// Check to see which keyword we got which was considered wrong.
switch (token.keyword) {
// Some keywords are only valid in certain contexts. If this cascaded all the
// way down through the outermost job_list, it was not in a valid context.
case parse_keyword_case:
case parse_keyword_end:
case parse_keyword_else: {
this->parse_error_unbalancing_token(token);
break;
}
case parse_keyword_none: {
// This is a random other string (not a keyword).
const wcstring expected = keyword_description(stack_top.keyword);
this->parse_error(token, parse_error_generic, L"Expected keyword '%ls'",
expected.c_str());
break;
}
default: {
// Got a real keyword we can report.
const wcstring actual = (token.keyword == parse_keyword_none
? token.describe()
: keyword_description(token.keyword));
const wcstring expected = keyword_description(stack_top.keyword);
this->parse_error(token, parse_error_generic,
L"Expected keyword '%ls', instead got keyword '%ls'",
expected.c_str(), actual.c_str());
break;
}
}
} else if (stack_top.keyword == parse_keyword_end &&
token.type == parse_token_type_terminate &&
this->report_error_for_unclosed_block()) {
// Handled by report_error_for_unclosed_block.
} else {
const wcstring expected = stack_top.user_presentable_description();
this->parse_error_unexpected_token(expected.c_str(), token);
}
2013-06-23 17:09:46 +08:00
}
2013-08-11 15:35:00 +08:00
// We handled the token, so pop the symbol stack.
2013-08-11 15:35:00 +08:00
symbol_stack.pop_back();
2013-06-02 13:14:47 +08:00
}
2013-08-11 15:35:00 +08:00
return handled;
2013-06-02 13:14:47 +08:00
}
void parse_ll_t::accept_tokens(parse_token_t token1, parse_token_t token2) {
bool logit = false;
if (logit) {
fprintf(stderr, "Accept token %ls\n", token1.describe().c_str());
}
PARSE_ASSERT(token1.type >= FIRST_PARSE_TOKEN_TYPE);
2013-08-11 15:35:00 +08:00
2013-06-09 10:20:26 +08:00
bool consumed = false;
2013-08-11 15:35:00 +08:00
// Handle special types specially. Note that these are the only types that can be pushed if the
// symbol stack is empty.
if (token1.type == parse_special_type_parse_error ||
token1.type == parse_special_type_tokenizer_error ||
token1.type == parse_special_type_comment) {
// We set the special node's parent to the top of the stack. This means that we have an
// asymmetric relationship: the special node has a parent (which is the node we were trying
// to generate when we encountered the special node), but the parent node does not have the
// special node as a child. This means for example that parents don't have to worry about
// tracking any comment nodes, but we can still recover the parent from the comment.
parse_node_t special_node(token1.type);
special_node.parent = symbol_stack.back().node_idx;
special_node.source_start = token1.source_start;
special_node.source_length = token1.source_length;
nodes.push_back(special_node);
2013-08-09 06:06:46 +08:00
consumed = true;
// Mark special flags.
if (token1.type == parse_special_type_comment) {
this->node_for_top_symbol().flags |= parse_node_flag_has_comments;
}
// Tokenizer errors are fatal.
if (token1.type == parse_special_type_tokenizer_error) this->fatal_errored = true;
2013-08-09 06:06:46 +08:00
}
while (!consumed && !this->fatal_errored) {
PARSE_ASSERT(!symbol_stack.empty());
2013-08-11 15:35:00 +08:00
if (top_node_handle_terminal_types(token1)) {
if (logit) {
fprintf(stderr, "Consumed token %ls\n", token1.describe().c_str());
2013-06-23 17:09:46 +08:00
}
consumed = true;
2013-06-09 10:20:26 +08:00
break;
}
2013-08-11 15:35:00 +08:00
// top_node_match_token may indicate an error if our stack is empty.
if (this->fatal_errored) break;
2013-08-11 15:35:00 +08:00
// Get the production for the top of the stack.
2013-07-29 06:19:38 +08:00
parse_stack_element_t &stack_elem = symbol_stack.back();
parse_node_t &node = nodes.at(stack_elem.node_idx);
parse_node_tag_t tag = 0;
const production_t *production =
production_for_token(stack_elem.type, token1, token2, &tag);
node.tag = tag;
if (production == NULL) {
parse_error_failed_production(stack_elem, token1);
// The above sets fatal_errored, which ends the loop.
} else {
bool is_terminate = (token1.type == parse_token_type_terminate);
// When a job_list encounters something like 'else', it returns an empty production to
// return control to the outer block. But if it's unbalanced, then we'll end up with an
// empty stack! So make sure that doesn't happen. This is the primary mechanism by which
// we detect e.g. unbalanced end. However, if we get a true terminate token, then we
// allow (expect) this to empty the stack.
if (symbol_stack.size() == 1 && production_is_empty(production) && !is_terminate) {
this->parse_error_unbalancing_token(token1);
break;
}
// Manipulate the symbol stack. Note that stack_elem is invalidated by popping the
// stack.
2013-07-29 06:44:09 +08:00
symbol_stack_pop_push_production(production);
// Expect to not have an empty stack, unless this was the terminate type. Note we may
// not have an empty stack with the terminate type (i.e. incomplete input).
assert(is_terminate || !symbol_stack.empty());
if (symbol_stack.empty()) {
break;
}
2013-07-29 06:44:09 +08:00
}
2013-06-09 10:20:26 +08:00
}
}
2013-06-02 13:14:47 +08:00
// Given an expanded string, returns any keyword it matches.
static parse_keyword_t keyword_with_name(const wchar_t *name) {
// Binary search on keyword_map. Start at 1 since 0 is keyword_none.
parse_keyword_t result = parse_keyword_none;
size_t left = 1, right = sizeof keyword_map / sizeof *keyword_map;
while (left < right) {
size_t mid = left + (right - left) / 2;
int cmp = wcscmp(name, keyword_map[mid].name);
if (cmp < 0) {
right = mid; // name was smaller than mid
} else if (cmp > 0) {
left = mid + 1; // name was larger than mid
} else {
result = keyword_map[mid].keyword; // found it
break;
}
}
return result;
}
static bool is_keyword_char(wchar_t c) {
return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') || (c >= L'0' && c <= L'9') ||
c == L'\'' || c == L'"' || c == L'\\' || c == '\n';
}
/// Given a token, returns the keyword it matches, or parse_keyword_none.
static parse_keyword_t keyword_for_token(token_type tok, const wcstring &token) {
/* Only strings can be keywords */
if (tok != TOK_STRING) {
return parse_keyword_none;
}
// If tok_txt is clean (which most are), we can compare it directly. Otherwise we have to expand
// it. We only expand quotes, and we don't want to do expensive expansions like tilde
// expansions. So we do our own "cleanliness" check; if we find a character not in our allowed
// set we know it's not a keyword, and if we never find a quote we don't have to expand! Note
// that this lowercase set could be shrunk to be just the characters that are in keywords.
2013-06-09 10:20:26 +08:00
parse_keyword_t result = parse_keyword_none;
bool needs_expand = false, all_chars_valid = true;
const wchar_t *tok_txt = token.c_str();
for (size_t i = 0; tok_txt[i] != L'\0'; i++) {
wchar_t c = tok_txt[i];
if (!is_keyword_char(c)) {
all_chars_valid = false;
break;
}
// If we encounter a quote, we need expansion.
needs_expand = needs_expand || c == L'"' || c == L'\'' || c == L'\\';
}
if (all_chars_valid) {
// Expand if necessary.
if (!needs_expand) {
result = keyword_with_name(tok_txt);
} else {
wcstring storage;
if (unescape_string(tok_txt, &storage, 0)) {
result = keyword_with_name(storage.c_str());
2013-06-09 10:20:26 +08:00
}
}
}
2013-06-09 10:20:26 +08:00
return result;
}
/// Placeholder invalid token.
static const parse_token_t kInvalidToken = {
token_type_invalid, parse_keyword_none, false, false, SOURCE_OFFSET_INVALID, 0};
/// Terminal token.
static const parse_token_t kTerminalToken = {
parse_token_type_terminate, parse_keyword_none, false, false, SOURCE_OFFSET_INVALID, 0};
static inline bool is_help_argument(const wcstring &txt) { return contains(txt, L"-h", L"--help"); }
/// Return a new parse token, advancing the tokenizer.
static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token) {
if (!tok->next(token)) {
return kTerminalToken;
}
parse_token_t result;
// Set the type, keyword, and whether there's a dash prefix. Note that this is quite sketchy,
// because it ignores quotes. This is the historical behavior. For example, `builtin --names`
// lists builtins, but `builtin "--names"` attempts to run --names as a command. Amazingly as of
// this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it
// even starts to look like a feature.
result.type = parse_token_type_from_tokenizer_token(token->type);
result.keyword = keyword_for_token(token->type, token->text);
result.has_dash_prefix = !token->text.empty() && token->text.at(0) == L'-';
result.is_help_argument = result.has_dash_prefix && is_help_argument(token->text);
// These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
// uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
// crash.
assert(token->offset < SOURCE_OFFSET_INVALID);
result.source_start = (source_offset_t)token->offset;
assert(token->length <= SOURCE_OFFSET_INVALID);
result.source_length = (source_offset_t)token->length;
return result;
}
bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
parse_node_tree_t *output, parse_error_list_t *errors,
parse_token_type_t goal) {
parse_ll_t parser(goal);
2014-01-13 14:39:12 +08:00
parser.set_should_generate_error_messages(errors != NULL);
// Construct the tokenizer.
tok_flags_t tok_options = 0;
if (parse_flags & parse_flag_include_comments) tok_options |= TOK_SHOW_COMMENTS;
if (parse_flags & parse_flag_accept_incomplete_tokens) tok_options |= TOK_ACCEPT_UNFINISHED;
if (parse_flags & parse_flag_show_blank_lines) tok_options |= TOK_SHOW_BLANK_LINES;
if (errors == NULL) tok_options |= TOK_SQUASH_ERRORS;
tokenizer_t tok(str.c_str(), tok_options);
// We are an LL(2) parser. We pass two tokens at a time. New tokens come in at index 1. Seed our
// queue with an initial token at index 1.
parse_token_t queue[2] = {kInvalidToken, kInvalidToken};
// Loop until we have a terminal token.
tok_t tokenizer_token;
for (size_t token_count = 0; queue[0].type != parse_token_type_terminate; token_count++) {
// Push a new token onto the queue.
queue[0] = queue[1];
queue[1] = next_parse_token(&tok, &tokenizer_token);
// If we are leaving things unterminated, then don't pass parse_token_type_terminate.
if (queue[0].type == parse_token_type_terminate &&
(parse_flags & parse_flag_leave_unterminated)) {
break;
}
// Pass these two tokens, unless we're still loading the queue. We know that queue[0] is
// valid; queue[1] may be invalid.
if (token_count > 0) {
parser.accept_tokens(queue[0], queue[1]);
}
// Handle tokenizer errors. This is a hack because really the parser should report this for
// itself; but it has no way of getting the tokenizer message.
if (queue[1].type == parse_special_type_tokenizer_error) {
parser.report_tokenizer_error(tokenizer_token);
}
// Handle errors.
if (parser.has_fatal_error()) {
if (parse_flags & parse_flag_continue_after_error) {
// Hack. Typically the parse error is due to the first token. However, if it's a
// tokenizer error, then has_fatal_error was set due to the check above; in that
// case the second token is what matters.
size_t error_token_idx = 0;
if (queue[1].type == parse_special_type_tokenizer_error) {
error_token_idx = (queue[1].type == parse_special_type_tokenizer_error ? 1 : 0);
token_count = -1; // so that it will be 0 after incrementing, and our tokenizer
// error will be ignored
}
// Mark a special error token, and then keep going.
const parse_token_t token = {parse_special_type_parse_error,
parse_keyword_none,
false,
false,
queue[error_token_idx].source_start,
queue[error_token_idx].source_length};
2014-01-13 14:39:12 +08:00
parser.accept_tokens(token, kInvalidToken);
parser.reset_symbols(goal);
} else {
break; // bail out
2013-08-09 06:06:46 +08:00
}
}
}
// Teach each node where its source range is.
2014-01-13 14:39:12 +08:00
parser.determine_node_ranges();
// Acquire the output from the parser.
2014-01-13 14:39:12 +08:00
parser.acquire_output(output, errors);
#if 0
//wcstring result = dump_tree(this->parser->nodes, str);
//fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", output->size(), sizeof(parse_node_t), output->size() * sizeof(parse_node_t));
#endif
// Indicate if we had a fatal error.
return !parser.has_fatal_error();
2013-06-02 13:14:47 +08:00
}
2013-08-09 06:06:46 +08:00
const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, node_offset_t which,
parse_token_type_t expected_type) const {
2013-08-09 06:06:46 +08:00
const parse_node_t *result = NULL;
// We may get nodes with no children if we had an incomplete parse. Don't consider than an
// error.
if (parent.child_count > 0) {
2013-10-07 16:04:37 +08:00
PARSE_ASSERT(which < parent.child_count);
node_offset_t child_offset = parent.child_offset(which);
if (child_offset < this->size()) {
2013-10-07 16:04:37 +08:00
result = &this->at(child_offset);
// If we are given an expected type, then the node must be null or that type.
2013-10-07 16:04:37 +08:00
assert(expected_type == token_type_invalid || expected_type == result->type);
}
2013-08-09 06:06:46 +08:00
}
2013-08-11 15:35:00 +08:00
2013-08-09 06:06:46 +08:00
return result;
}
const parse_node_t &parse_node_tree_t::find_child(const parse_node_t &parent,
parse_token_type_t type) const {
for (node_offset_t i = 0; i < parent.child_count; i++) {
const parse_node_t *child = this->get_child(parent, i);
if (child->type == type) {
return *child;
}
}
PARSE_ASSERT(0);
return *(parse_node_t *)(NULL); // unreachable
}
const parse_node_t *parse_node_tree_t::get_parent(const parse_node_t &node,
parse_token_type_t expected_type) const {
2013-10-07 16:04:37 +08:00
const parse_node_t *result = NULL;
if (node.parent != NODE_OFFSET_INVALID) {
2013-10-07 16:04:37 +08:00
PARSE_ASSERT(node.parent < this->size());
const parse_node_t &parent = this->at(node.parent);
if (expected_type == token_type_invalid || expected_type == parent.type) {
// The type matches (or no type was requested).
2013-10-07 16:04:37 +08:00
result = &parent;
}
}
return result;
2013-10-07 07:23:45 +08:00
}
static void find_nodes_recursive(const parse_node_tree_t &tree, const parse_node_t &parent,
parse_token_type_t type,
parse_node_tree_t::parse_node_list_t *result, size_t max_count) {
if (result->size() < max_count) {
if (parent.type == type) result->push_back(&parent);
for (node_offset_t i = 0; i < parent.child_count; i++) {
const parse_node_t *child = tree.get_child(parent, i);
assert(child != NULL);
find_nodes_recursive(tree, *child, type, result, max_count);
}
}
}
parse_node_tree_t::parse_node_list_t parse_node_tree_t::find_nodes(const parse_node_t &parent,
parse_token_type_t type,
size_t max_count) const {
2013-08-09 06:06:46 +08:00
parse_node_list_t result;
find_nodes_recursive(*this, parent, type, &result, max_count);
2013-08-09 06:06:46 +08:00
return result;
}
/// Return true if the given node has the proposed ancestor as an ancestor (or is itself that
/// ancestor).
static bool node_has_ancestor(const parse_node_tree_t &tree, const parse_node_t &node,
const parse_node_t &proposed_ancestor) {
if (&node == &proposed_ancestor) {
return true; // found it
} else if (node.parent == NODE_OFFSET_INVALID) {
return false; // no more parents
2013-10-09 06:05:30 +08:00
}
// Recurse to the parent.
return node_has_ancestor(tree, tree.at(node.parent), proposed_ancestor);
2013-10-09 06:05:30 +08:00
}
const parse_node_t *parse_node_tree_t::find_last_node_of_type(parse_token_type_t type,
const parse_node_t *parent) const {
2013-10-09 06:05:30 +08:00
const parse_node_t *result = NULL;
// Find nodes of the given type in the tree, working backwards.
2013-10-09 06:05:30 +08:00
size_t idx = this->size();
while (idx--) {
2013-10-09 06:05:30 +08:00
const parse_node_t &node = this->at(idx);
if (node.type == type) {
// Types match. Check if it has the right parent.
if (parent == NULL || node_has_ancestor(*this, node, *parent)) {
2013-10-09 06:05:30 +08:00
// Success
result = &node;
break;
}
}
}
return result;
}
const parse_node_t *parse_node_tree_t::find_node_matching_source_location(
parse_token_type_t type, size_t source_loc, const parse_node_t *parent) const {
2013-10-13 09:17:03 +08:00
const parse_node_t *result = NULL;
// Find nodes of the given type in the tree, working backwards.
2013-10-13 09:17:03 +08:00
const size_t len = this->size();
for (size_t idx = 0; idx < len; idx++) {
2013-10-13 09:17:03 +08:00
const parse_node_t &node = this->at(idx);
// Types must match.
if (node.type != type) continue;
2013-10-13 09:17:03 +08:00
// Must contain source location.
if (!node.location_in_or_at_end_of_source_range(source_loc)) continue;
// If a parent is given, it must be an ancestor.
if (parent != NULL && !node_has_ancestor(*this, node, *parent)) continue;
// Found it.
2013-10-13 09:17:03 +08:00
result = &node;
break;
}
return result;
}
bool parse_node_tree_t::argument_list_is_root(const parse_node_t &node) const {
bool result = true;
assert(node.type == symbol_argument_list || node.type == symbol_arguments_or_redirections_list);
const parse_node_t *parent = this->get_parent(node);
if (parent != NULL) {
// We have a parent - check to make sure it's not another list!
result = parent->type != symbol_arguments_or_redirections_list &&
parent->type != symbol_argument_list;
}
return result;
}
enum parse_statement_decoration_t parse_node_tree_t::decoration_for_plain_statement(
const parse_node_t &node) const {
assert(node.type == symbol_plain_statement);
const parse_node_t *decorated_statement = this->get_parent(node, symbol_decorated_statement);
parse_node_tag_t tag =
decorated_statement ? decorated_statement->tag : parse_statement_decoration_none;
return static_cast<parse_statement_decoration_t>(tag);
}
bool parse_node_tree_t::command_for_plain_statement(const parse_node_t &node, const wcstring &src,
wcstring *out_cmd) const {
bool result = false;
assert(node.type == symbol_plain_statement);
const parse_node_t *cmd_node = this->get_child(node, 0, parse_token_type_string);
if (cmd_node != NULL && cmd_node->has_source()) {
out_cmd->assign(src, cmd_node->source_start, cmd_node->source_length);
result = true;
} else {
out_cmd->clear();
}
return result;
}
bool parse_node_tree_t::statement_is_in_pipeline(const parse_node_t &node,
bool include_first) const {
// Moderately nasty hack! Walk up our ancestor chain and see if we are in a job_continuation.
// This checks if we are in the second or greater element in a pipeline; if we are the first
// element we treat this as false. This accepts a few statement types.
bool result = false;
const parse_node_t *ancestor = &node;
// If we're given a plain statement, try to get its decorated statement parent.
if (ancestor && ancestor->type == symbol_plain_statement)
ancestor = this->get_parent(*ancestor, symbol_decorated_statement);
if (ancestor) ancestor = this->get_parent(*ancestor, symbol_statement);
if (ancestor) ancestor = this->get_parent(*ancestor);
if (ancestor) {
if (ancestor->type == symbol_job_continuation) {
// Second or more in a pipeline.
result = true;
} else if (ancestor->type == symbol_job && include_first) {
// Check to see if we have a job continuation that's not empty.
const parse_node_t *continuation =
this->get_child(*ancestor, 1, symbol_job_continuation);
result = (continuation != NULL && continuation->child_count > 0);
}
}
return result;
}
enum token_type parse_node_tree_t::type_for_redirection(const parse_node_t &redirection_node,
const wcstring &src, int *out_fd,
wcstring *out_target) const {
assert(redirection_node.type == symbol_redirection);
enum token_type result = TOK_NONE;
const parse_node_t *redirection_primitive =
this->get_child(redirection_node, 0, parse_token_type_redirection); // like 2>
const parse_node_t *redirection_target =
this->get_child(redirection_node, 1, parse_token_type_string); // like &1 or file path
if (redirection_primitive != NULL && redirection_primitive->has_source()) {
result = redirection_type_for_string(redirection_primitive->get_source(src), out_fd);
}
if (out_target != NULL) {
*out_target = redirection_target ? redirection_target->get_source(src) : L"";
}
return result;
}
const parse_node_t *parse_node_tree_t::header_node_for_block_statement(
const parse_node_t &node) const {
const parse_node_t *result = NULL;
if (node.type == symbol_block_statement) {
const parse_node_t *block_header = this->get_child(node, 0, symbol_block_header);
if (block_header != NULL) {
result = this->get_child(*block_header, 0);
}
}
return result;
}
parse_node_tree_t::parse_node_list_t parse_node_tree_t::specific_statements_for_job(
const parse_node_t &job) const {
assert(job.type == symbol_job);
parse_node_list_t result;
// Initial statement (non-specific).
result.push_back(get_child(job, 0, symbol_statement));
// Our cursor variable. Walk over the list of continuations.
const parse_node_t *continuation = get_child(job, 1, symbol_job_continuation);
while (continuation != NULL && continuation->child_count > 0) {
result.push_back(get_child(*continuation, 1, symbol_statement));
continuation = get_child(*continuation, 2, symbol_job_continuation);
}
// Result now contains a list of statements. But we want a list of specific statements e.g.
// symbol_switch_statement. So replace them in-place in the vector.
for (size_t i = 0; i < result.size(); i++) {
const parse_node_t *statement = result.at(i);
assert(statement->type == symbol_statement);
result.at(i) = this->get_child(*statement, 0);
}
return result;
}
parse_node_tree_t::parse_node_list_t parse_node_tree_t::comment_nodes_for_node(
const parse_node_t &parent) const {
parse_node_list_t result;
if (parent.has_comments()) {
// Walk all our nodes, looking for comment nodes that have the given node as a parent.
for (size_t i = 0; i < this->size(); i++) {
const parse_node_t &potential_comment = this->at(i);
if (potential_comment.type == parse_special_type_comment &&
this->get_parent(potential_comment) == &parent) {
result.push_back(&potential_comment);
}
}
}
return result;
}
enum parse_bool_statement_type_t parse_node_tree_t::statement_boolean_type(
const parse_node_t &node) {
assert(node.type == symbol_boolean_statement);
return static_cast<parse_bool_statement_type_t>(node.tag);
}
bool parse_node_tree_t::job_should_be_backgrounded(const parse_node_t &job) const {
assert(job.type == symbol_job);
const parse_node_t *opt_background = get_child(job, 2, symbol_optional_background);
bool result = opt_background != NULL && opt_background->tag == parse_background;
return result;
}
const parse_node_t *parse_node_tree_t::next_node_in_node_list(
const parse_node_t &node_list, parse_token_type_t entry_type,
const parse_node_t **out_list_tail) const {
parse_token_type_t list_type = node_list.type;
// Paranoia - it doesn't make sense for a list type to contain itself.
assert(list_type != entry_type);
const parse_node_t *list_cursor = &node_list;
const parse_node_t *list_entry = NULL;
// Loop while we don't have an item but do have a list. Note that some nodes may contain
// nothing; e.g. job_list contains blank lines as a production.
while (list_entry == NULL && list_cursor != NULL) {
const parse_node_t *next_cursor = NULL;
// Walk through the children.
for (node_offset_t i = 0; i < list_cursor->child_count; i++) {
const parse_node_t *child = this->get_child(*list_cursor, i);
if (child->type == entry_type) {
// This is the list entry.
list_entry = child;
} else if (child->type == list_type) {
// This is the next in the list.
next_cursor = child;
}
}
// Go to the next entry, even if it's NULL.
list_cursor = next_cursor;
}
// Return what we got.
assert(list_cursor == NULL || list_cursor->type == list_type);
assert(list_entry == NULL || list_entry->type == entry_type);
if (out_list_tail != NULL) *out_list_tail = list_cursor;
return list_entry;
}