mirror of
synced 2025-02-18 03:02:44 +08:00
Rejigger alts
Messing around
This commit is contained in:
@ -4,160 +4,269 @@
#include "parse_constants.h"
#include "tokenizer.h"
#include <array>
struct parse_token_t;
typedef uint8_t parse_node_tag_t;
using parse_node_tag_t = uint8_t;
struct parse_token_t;
namespace grammar {
using production_element_t = uint8_t;
// Forward declarations.
// Define primitive types.
template <enum parse_token_type_t Token>
struct primitive {
static constexpr production_element_t element() { return Token; }
using tok_end = primitive<parse_token_type_end>;
using tok_string = primitive<parse_token_type_string>;
using tok_pipe = primitive<parse_token_type_pipe>;
using tok_background = primitive<parse_token_type_background>;
using tok_redirection = primitive<parse_token_type_redirection>;
// Define keyword types.
template <parse_keyword_t Keyword>
struct keyword {
static constexpr production_element_t element() {
// Convert a parse_keyword_t enum to a production_element_t enum.
return Keyword + LAST_TOKEN_OR_SYMBOL + 1;
// Forward declare all the symbol types.
#define ELEM(T) struct T;
#include "parse_grammar_elements.inc"
// A production is a sequence of production elements.
template <int WHICH, uint8_t count = 0>
struct production_t {
production_element_t elems[count];
// +1 to hold the terminating token_type_invalid
template <size_t Count>
using production_t = std::array<const production_element_t, Count + 1>;
template <int TOKEN>
struct prim {};
using tok_end = prim<TOK_END>;
using tok_string = prim<TOK_STRING>;
template <int WHICH>
struct keyword {};
// A producer holds various productions.
// This is an ugly hack to avoid ODR violations
// Given some type, return a pointer to its production.
template <typename T>
struct producer {};
const production_element_t *production_for() {
static constexpr auto prod = T::production;
return prod.data();
// Get some production element.
template <typename T>
constexpr production_element_t element() {
return T::element();
// Partial specialization hack.
#define ELEM(T) \
template <> \
constexpr production_element_t element<T>() { \
return symbol_##T; \
#include "parse_grammar_elements.inc"
// Empty produces nothing.
struct empty : public producer<production_t<0>> {};
struct empty {
static constexpr production_t<0> production = {{token_type_invalid}};
static const production_element_t *resolve(const parse_token_t &, const parse_token_t &,
parse_node_tag_t *) {
return production_for<empty>();
// Not sure if we need this.
template <class A>
struct single {};
// Sequence represents a list of (at least two) productions.
template <class T0, class... Ts>
struct seq {
static constexpr production_t<1 + sizeof...(Ts)> production = {
{element<T0>(), element<Ts>()..., token_type_invalid}};
static const production_element_t *resolve(const parse_token_t &, const parse_token_t &,
parse_node_tag_t *) {
return production_for<seq>();
template <class A>
using produces_single = producer<single<A>>;
template <class... Args>
using produces_sequence = seq<Args...>;
// Ergonomic way to create a production for a single element.
template <class T>
using single = seq<T>;
template <class T>
using produces_single = single<T>;
// Alternative represents a choice.
template <class A1, class A2, class A3 = empty, class A4 = empty, class A5 = empty>
struct alternative {};
template <class... Args>
using produces_alternative = producer<alternative<Args...>>;
// Sequence represents a list of productions.
template <class A1, class A2, class A3 = empty, class A4 = empty, class A5 = empty,
class A6 = empty>
struct seq {};
template <class... Args>
using produces_sequence = producer<seq<Args...>>;
struct alternative {
// Following are the grammar productions.
#define BODY(T)
#define DEF(T) struct T : public
#define DEF_ALT(T) struct T : public alternative
#define ALT_BODY(T) \
static const production_element_t *resolve(const parse_token_t &, const parse_token_t &, \
parse_node_tag_t *);
// A job_list is a list of jobs, separated by semicolons or newlines
produces_alternative<empty, //
seq<job, job_list>, //
seq<job, job_list>>{};
DEF_ALT(job_list) {
using normal = seq<job, job_list>;
using empty_line = seq<tok_end, job_list>;
using empty = grammar::empty;
// A job is a non-empty list of statements, separated by pipes. (Non-empty is useful for cases
// like if statements, where we require a command). To represent "non-empty", we require a
// statement, followed by a possibly empty job_continuation, and then optionally a background
// specifier '&'
DEF(job) produces_sequence<statement, job_continuation, optional_background>{};
DEF(job) produces_sequence<statement, job_continuation, optional_background>{BODY(job)};
produces_alternative<empty, //
seq<prim<TOK_PIPE>, statement, job_continuation>>{};
DEF_ALT(job_continuation) {
using piped = seq<tok_pipe, statement, job_continuation>;
// A statement is a normal command, or an if / while / and etc
produces_alternative<boolean_statement, //
block_statement, //
if_statement, //
switch_statement, //
DEF_ALT(statement) {
using boolean = single<boolean_statement>;
using block = single<block_statement>;
using ifs = single<if_statement>;
using switchs = single<switch_statement>;
using decorated = single<decorated_statement>;
// A block is a conditional, loop, or begin/end
produces_sequence<if_clause, else_clause, end_command, arguments_or_redirections_list>{};
produces_sequence<if_clause, else_clause, end_command, arguments_or_redirections_list>{
produces_sequence<keyword<parse_keyword_if>, job, tok_end, andor_job_list, job_list>{};
produces_sequence<keyword<parse_keyword_if>, job, tok_end, andor_job_list, job_list>{
DEF(else_clause) produces_alternative<empty, seq<keyword<parse_keyword_else>, else_continuation>>{};
produces_alternative<seq<if_clause, else_clause>, //
seq<tok_end, job_list>>{};
DEF_ALT(else_clause) {
using empty = grammar::empty;
using else_cont = seq<keyword<parse_keyword_else>, else_continuation>;
DEF_ALT(else_continuation) {
using else_if = seq<if_clause, else_clause>;
using else_only = seq<tok_end, job_list>;
produces_sequence<keyword<parse_keyword_switch>, argument, tok_end, case_item_list, end_command,
produces_alternative<empty, //
seq<case_item, case_item_list>, //
seq<tok_end, case_item_list>>{};
DEF_ALT(case_item_list) {
using empty = grammar::empty;
using case_items = seq<case_item, case_item_list>;
using blank_line = seq<tok_end, case_item_list>;
DEF(case_item) produces_sequence<keyword<parse_keyword_case>, argument_list, tok_end, job_list>{};
DEF(case_item) produces_sequence<keyword<parse_keyword_case>, argument_list, tok_end, job_list> {
produces_sequence<block_header, job_list, end_command, arguments_or_redirections_list>{};
DEF(block_header) produces_alternative<for_header, while_header, function_header, begin_header>{};
DEF_ALT(block_header) {
using forh = single<for_header>;
using whileh = single<while_header>;
using funch = single<function_header>;
using beginh = single<begin_header>;
produces_sequence<keyword<parse_keyword_for>, tok_string, keyword<parse_keyword_in>, argument_list,
DEF(while_header) produces_sequence<keyword<parse_keyword_while>, job, tok_end, andor_job_list>{};
produces_sequence<keyword<parse_keyword_while>, job, tok_end, andor_job_list>{BODY(while_header)};
struct begin_header : produces_single<keyword<parse_keyword_begin>> {};
DEF(begin_header) produces_single<keyword<parse_keyword_begin>>{BODY(begin_header)};
// Functions take arguments, and require at least one (the name). No redirections allowed.
produces_sequence<keyword<parse_keyword_function>, argument, argument_list, tok_end>{};
// A boolean statement is AND or OR or NOT
produces_alternative<seq<keyword<parse_keyword_and>, statement>, //
seq<keyword<parse_keyword_or>, statement>, //
seq<keyword<parse_keyword_not>, statement>>{};
DEF_ALT(boolean_statement) {
using ands = seq<keyword<parse_keyword_and>, statement>;
using ors = seq<keyword<parse_keyword_or>, statement>;
using nots = seq<keyword<parse_keyword_not>, statement>;
// An andor_job_list is zero or more job lists, where each starts with an `and` or `or` boolean
// statement.
produces_alternative<empty, //
seq<job, andor_job_list>, //
seq<tok_end, andor_job_list>>{};
DEF_ALT(andor_job_list) {
using empty = grammar::empty;
using andor_job = seq<job, andor_job_list>;
using empty_line = seq<tok_end, andor_job_list>;
// A decorated_statement is a command with a list of arguments_or_redirections, possibly with
// "builtin" or "command" or "exec"
produces_alternative<plain_statement, //
seq<keyword<parse_keyword_command>, plain_statement>, //
seq<keyword<parse_keyword_builtin>, plain_statement>, //
seq<keyword<parse_keyword_exec>, plain_statement>>{};
DEF_ALT(decorated_statement) {
using plains = single<plain_statement>;
using cmds = seq<keyword<parse_keyword_command>, plain_statement>;
using builtins = seq<keyword<parse_keyword_builtin>, plain_statement>;
using execs = seq<keyword<parse_keyword_exec>, plain_statement>;
DEF(plain_statement) produces_sequence<tok_string, arguments_or_redirections_list>{};
produces_sequence<tok_string, arguments_or_redirections_list>{BODY(plain_statement)};
DEF(argument_list) produces_alternative<empty, seq<argument, argument_list>>{};
produces_alternative<empty, seq<argument_or_redirection, arguments_or_redirections_list>>{};
DEF_ALT(argument_list) {
using empty = grammar::empty;
using arg = seq<argument, argument_list>;
DEF(argument_or_redirection) produces_alternative<argument, redirection>{};
DEF(argument) produces_single<tok_string>{};
DEF(optional_background) produces_alternative<empty, prim<TOK_BACKGROUND>>{};
DEF(end_command) produces_single<keyword<parse_keyword_end>>{};
DEF_ALT(arguments_or_redirections_list) {
using empty = grammar::empty;
using value = seq<argument_or_redirection, arguments_or_redirections_list>;
DEF_ALT(argument_or_redirection) {
using arg = single<argument>;
using redir = single<redirection>;
DEF(argument) produces_single<tok_string>{BODY(argument)};
DEF(redirection) produces_sequence<tok_redirection, tok_string>{BODY(redirection)};
DEF_ALT(optional_background) {
using empty = grammar::empty;
using background = single<tok_background>;
DEF(end_command) produces_single<keyword<parse_keyword_end>>{BODY(end_command)};
// A freestanding_argument_list is equivalent to a normal argument list, except it may contain
// TOK_END (newlines, and even semicolons, for historical reasons)
produces_alternative<empty, //
seq<argument, freestanding_argument_list>, //
seq<tok_end, freestanding_argument_list>>{};
DEF_ALT(freestanding_argument_list) {
using empty = grammar::empty;
using arg = seq<argument, freestanding_argument_list>;
using semicolon = seq<tok_end, freestanding_argument_list>;
@ -9,6 +9,7 @@
#include "parse_tree.h"
using namespace parse_productions;
using namespace grammar;
@ -22,40 +23,14 @@ using namespace parse_productions;
// Productions are generally a static const array, and we return a pointer to the array (yes,
// really).
#define RESOLVE(sym) \
static const production_element_t *resolve_##sym( \
#define RESOLVE(SYM) \
const production_element_t *SYM::resolve( \
const parse_token_t &token1, const parse_token_t &token2, parse_node_tag_t *out_tag)
// Shorthand for symbols which always resolve to the same production sequence; using it avoids
// repeating the resolver boilerplate for each such symbol. For a symbol `sym` this expands to:
//   - a declaration of the array `sym_only`,
//   - a static resolver `resolve_sym` that ignores both tokens and the out-tag and simply returns
//     `sym_only`,
//   - the definition of `sym_only`: the supplied elements followed by `token_type_invalid`.
// The element list uses the standard `...` / __VA_ARGS__ form (not the GNU-only named variadic
// parameter), so at least one element must be supplied.
#define RESOLVE_ONLY(sym, ...)                                                                   \
    extern const production_element_t sym##_only[];                                              \
    static const production_element_t *resolve_##sym(                                            \
        const parse_token_t &token1, const parse_token_t &token2, parse_node_tag_t *out_tag) {   \
        UNUSED(token1);                                                                          \
        UNUSED(token2);                                                                          \
        UNUSED(out_tag);                                                                         \
        return sym##_only;                                                                       \
    }                                                                                            \
    const production_element_t sym##_only[] = {__VA_ARGS__, token_type_invalid}
// Convert a parse_keyword_t enum to a parse_token_type_t enum by offsetting it past
// LAST_TOKEN_OR_SYMBOL. The argument is parenthesized so that compound expressions (e.g. a
// conditional or bitwise-or) are evaluated as a whole before the offset is added.
#define KEYWORD(keyword) ((keyword) + LAST_TOKEN_OR_SYMBOL + 1)
/// Helper macro to define a production sequence: a static const array named `production_name`
/// holding the given elements. Such sequences must always end with enum `token_type_invalid`,
/// so the macro appends that terminator itself. Written with the standard `...` / __VA_ARGS__
/// form rather than the GNU-only named variadic parameter; at least one element is required.
#define P(production_name, ...) \
    static const production_element_t production_name[] = {__VA_ARGS__, token_type_invalid}
/// The empty production is used often enough that it's worth defining once at module scope.
/// It consists solely of the `token_type_invalid` terminator.
static const production_element_t empty[] = {token_type_invalid};
/// A job_list is a list of jobs, separated by semicolons or newlines.
RESOLVE(job_list) {
P(normal, symbol_job, symbol_job_list);
P(empty_line, parse_token_type_end, symbol_job_list);
switch (token1.type) {
case parse_token_type_string: {
@ -64,44 +39,38 @@ RESOLVE(job_list) {
case parse_keyword_end:
case parse_keyword_else:
case parse_keyword_case: {
return empty; // end this job list
return production_for<empty>(); // end this job list
default: {
return normal; // normal string
return production_for<normal>(); // normal string
case parse_token_type_pipe:
case parse_token_type_redirection:
case parse_token_type_background: {
return normal;
return production_for<normal>();
case parse_token_type_end: {
return empty_line;
return production_for<empty_line>();
case parse_token_type_terminate: {
return empty; // no more commands, just transition to empty
return production_for<empty>(); // no more commands, just transition to empty
default: { return NO_PRODUCTION; }
// A job is a non-empty list of statements, separated by pipes. (Non-empty is useful for cases like
// if statements, where we require a command). To represent "non-empty", we require a statement,
// followed by a possibly empty job_continuation.
RESOLVE_ONLY(job, symbol_statement, symbol_job_continuation, symbol_optional_background);
RESOLVE(job_continuation) {
P(piped, parse_token_type_pipe, symbol_statement, symbol_job_continuation);
switch (token1.type) {
case parse_token_type_pipe: {
return piped; // pipe, continuation
return production_for<piped>(); // pipe, continuation
default: {
return empty; // not a pipe, no job continuation
return production_for<empty>(); // not a pipe, no job continuation
@ -109,11 +78,6 @@ RESOLVE(job_continuation) {
// A statement is a normal command, or an if / while / and etc.
RESOLVE(statement) {
P(boolean, symbol_boolean_statement);
P(block, symbol_block_statement);
P(ifs, symbol_if_statement);
P(switchs, symbol_switch_statement);
P(decorated, symbol_decorated_statement);
// The only block-like builtin that takes any parameters is 'function'. So go to decorated
// statements if the subsequent token looks like '--'. The logic here is subtle:
@ -126,9 +90,9 @@ RESOLVE(statement) {
// If we are a function, then look for help arguments. Otherwise, if the next token looks
// like an option (starts with a dash), then parse it as a decorated statement.
if (token1.keyword == parse_keyword_function && token2.is_help_argument) {
return decorated;
return production_for<decorated>();
} else if (token1.keyword != parse_keyword_function && token2.has_dash_prefix) {
return decorated;
return production_for<decorated>();
// Likewise if the next token doesn't look like an argument at all. This corresponds to e.g.
@ -137,7 +101,7 @@ RESOLVE(statement) {
(token1.keyword != parse_keyword_begin && token1.keyword != parse_keyword_end);
if (naked_invocation_invokes_help &&
(token2.type == parse_token_type_end || token2.type == parse_token_type_terminate)) {
return decorated;
return production_for<decorated>();
@ -147,28 +111,28 @@ RESOLVE(statement) {
case parse_keyword_and:
case parse_keyword_or:
case parse_keyword_not: {
return boolean;
return production_for<boolean>();
case parse_keyword_for:
case parse_keyword_while:
case parse_keyword_function:
case parse_keyword_begin: {
return block;
return production_for<block>();
case parse_keyword_if: {
return ifs;
return production_for<ifs>();
case parse_keyword_else: {
case parse_keyword_switch: {
return switchs;
return production_for<switchs>();
case parse_keyword_end: {
// All other keywords fall through to decorated statement.
default: { return decorated; }
default: { return production_for<decorated>(); }
@ -182,255 +146,201 @@ RESOLVE(statement) {
RESOLVE_ONLY(if_statement, symbol_if_clause, symbol_else_clause, symbol_end_command,
RESOLVE_ONLY(if_clause, KEYWORD(parse_keyword_if), symbol_job, parse_token_type_end,
symbol_andor_job_list, symbol_job_list);
RESOLVE(else_clause) {
P(else_cont, KEYWORD(parse_keyword_else), symbol_else_continuation);
switch (token1.keyword) {
case parse_keyword_else: {
return else_cont;
return production_for<else_cont>();
default: { return empty; }
default: { return production_for<empty>(); }
RESOLVE(else_continuation) {
P(elseif, symbol_if_clause, symbol_else_clause);
P(elseonly, parse_token_type_end, symbol_job_list);
switch (token1.keyword) {
case parse_keyword_if: {
return elseif;
return production_for<else_if>();
default: { return elseonly; }
default: { return production_for<else_only>(); }
RESOLVE_ONLY(switch_statement, KEYWORD(parse_keyword_switch), symbol_argument, parse_token_type_end,
symbol_case_item_list, symbol_end_command, symbol_arguments_or_redirections_list);
RESOLVE(case_item_list) {
P(case_item, symbol_case_item, symbol_case_item_list);
P(blank_line, parse_token_type_end, symbol_case_item_list);
if (token1.keyword == parse_keyword_case)
return case_item;
return production_for<case_items>();
else if (token1.type == parse_token_type_end)
return blank_line;
return production_for<blank_line>();
return empty;
return production_for<empty>();
RESOLVE_ONLY(case_item, KEYWORD(parse_keyword_case), symbol_argument_list, parse_token_type_end,
RESOLVE(andor_job_list) {
P(andor_job, symbol_job, symbol_andor_job_list);
P(empty_line, parse_token_type_end, symbol_andor_job_list);
if (token1.type == parse_token_type_end) {
return empty_line;
return production_for<empty_line>();
} else if (token1.keyword == parse_keyword_and || token1.keyword == parse_keyword_or) {
// Check that the argument to and/or is a string that's not help. Otherwise it's either 'and
// --help' or a naked 'and', and not part of this list.
if (token2.type == parse_token_type_string && !token2.is_help_argument) {
return andor_job;
return production_for<andor_job>();
// All other cases end the list.
return empty;
return production_for<empty>();
RESOLVE(argument_list) {
P(arg, symbol_argument, symbol_argument_list);
switch (token1.type) {
case parse_token_type_string: {
return arg;
return production_for<arg>();
default: { return empty; }
default: { return production_for<empty>(); }
RESOLVE(freestanding_argument_list) {
P(arg, symbol_argument, symbol_freestanding_argument_list);
P(semicolon, parse_token_type_end, symbol_freestanding_argument_list);
switch (token1.type) {
case parse_token_type_string: {
return arg;
return production_for<arg>();
case parse_token_type_end: {
return semicolon;
return production_for<semicolon>();
default: { return empty; }
default: { return production_for<empty>(); }
RESOLVE_ONLY(block_statement, symbol_block_header, symbol_job_list, symbol_end_command,
RESOLVE(block_header) {
P(forh, symbol_for_header);
P(whileh, symbol_while_header);
P(funch, symbol_function_header);
P(beginh, symbol_begin_header);
switch (token1.keyword) {
case parse_keyword_for: {
return forh;
return production_for<forh>();
case parse_keyword_while: {
return whileh;
return production_for<whileh>();
case parse_keyword_function: {
return funch;
return production_for<funch>();
case parse_keyword_begin: {
return beginh;
return production_for<beginh>();
default: { return NO_PRODUCTION; }
RESOLVE_ONLY(for_header, KEYWORD(parse_keyword_for), parse_token_type_string,
KEYWORD(parse_keyword_in), symbol_argument_list, parse_token_type_end);
RESOLVE_ONLY(while_header, KEYWORD(parse_keyword_while), symbol_job, parse_token_type_end,
RESOLVE_ONLY(begin_header, KEYWORD(parse_keyword_begin));
RESOLVE_ONLY(function_header, KEYWORD(parse_keyword_function), symbol_argument,
symbol_argument_list, parse_token_type_end);
// A boolean statement is AND or OR or NOT.
RESOLVE(boolean_statement) {
P(ands, KEYWORD(parse_keyword_and), symbol_statement);
P(ors, KEYWORD(parse_keyword_or), symbol_statement);
P(nots, KEYWORD(parse_keyword_not), symbol_statement);
switch (token1.keyword) {
case parse_keyword_and: {
*out_tag = parse_bool_and;
return ands;
return production_for<ands>();
case parse_keyword_or: {
*out_tag = parse_bool_or;
return ors;
return production_for<ors>();
case parse_keyword_not: {
*out_tag = parse_bool_not;
return nots;
return production_for<nots>();
default: { return NO_PRODUCTION; }
RESOLVE(decorated_statement) {
P(plains, symbol_plain_statement);
P(cmds, KEYWORD(parse_keyword_command), symbol_plain_statement);
P(builtins, KEYWORD(parse_keyword_builtin), symbol_plain_statement);
P(execs, KEYWORD(parse_keyword_exec), symbol_plain_statement);
// If this is e.g. 'command --help' then the command is 'command' and not a decoration. If the
// second token is not a string, then this is a naked 'command' and we should execute it as
// undecorated.
if (token2.type != parse_token_type_string || token2.has_dash_prefix) {
return plains;
return production_for<plains>();
switch (token1.keyword) {
case parse_keyword_command: {
*out_tag = parse_statement_decoration_command;
return cmds;
return production_for<cmds>();
case parse_keyword_builtin: {
*out_tag = parse_statement_decoration_builtin;
return builtins;
return production_for<builtins>();
case parse_keyword_exec: {
*out_tag = parse_statement_decoration_exec;
return execs;
return production_for<execs>();
default: {
*out_tag = parse_statement_decoration_none;
return plains;
return production_for<plains>();
RESOLVE_ONLY(plain_statement, parse_token_type_string, symbol_arguments_or_redirections_list);
RESOLVE(arguments_or_redirections_list) {
P(value, symbol_argument_or_redirection, symbol_arguments_or_redirections_list);
switch (token1.type) {
case parse_token_type_string:
case parse_token_type_redirection: {
return value;
return production_for<value>();
default: { return empty; }
default: { return production_for<empty>(); }
RESOLVE(argument_or_redirection) {
P(arg, symbol_argument);
P(redir, symbol_redirection);
switch (token1.type) {
case parse_token_type_string: {
return arg;
return production_for<arg>();
case parse_token_type_redirection: {
return redir;
return production_for<redir>();
default: { return NO_PRODUCTION; }
RESOLVE_ONLY(argument, parse_token_type_string);
RESOLVE_ONLY(redirection, parse_token_type_redirection, parse_token_type_string);
RESOLVE(optional_background) {
P(background, parse_token_type_background);
switch (token1.type) {
case parse_token_type_background: {
*out_tag = parse_background;
return background;
return production_for<background>();
default: {
*out_tag = parse_no_background;
return empty;
return production_for<empty>();
RESOLVE_ONLY(end_command, KEYWORD(parse_keyword_end));
#define TEST(sym) \
case (symbol_##sym): \
resolver = resolve_##sym; \
#define TEST(SYM) \
case (symbol_##SYM): \
resolver = SYM::resolve; \
const production_element_t *parse_productions::production_for_token(parse_token_type_t node_type,
Reference in New Issue
Block a user