Fix to indent comments in blocks properly (#1710)

This makes two changes to parse trees: 1. Unmaterialized nodes no longer have an invalid source location For example, with the code `while false;end` there are no tokens associated with the while loop's job_list, and therefore it is unmaterialized. Previously it would have had a SOURCE_OFFSET_INVALID. But now it has a zero source length, but an offset equal to the end of the while loop (i.e. the semicolon), and a zero length. Correspondingly, the has_source function now checks the length instead of the offset. 2. Special (comment and error) nodes have always been "disconnected," meaning they are not the child of any other node. However, they now have their parent offsets set to whatever the top of the node stack was when the node was encountered. This gives us a sense of which node the comment is "in", e.g. if we are constructing a job list then the comment's parent will be the job list. This lets us determine the comment's indent.
2024-11-26 10:43:47 +08:00 · 2014-09-29 11:29:50 -07:00 · 2014-09-29 11:29:50 -07:00 · 27dd37ebb4
commit 27dd37ebb4
parent a186f04f15
4 changed files with 94 additions and 18 deletions
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@ -896,9 +896,17 @@ static void test_indents()
        {NULL, -1}
    };

+    const indent_component_t components12[] =
+    {
+        {L"while false", 0},
+        {L"# comment", 1}, //comment indentation handling
+        {L"command", 1}, //comment indentation handling
+        {L"# comment2", 1}, //comment indentation handling
+        {NULL, -1}
+    };


-    const indent_component_t *tests[] = {components1, components2, components3, components4, components5, components6, components7, components8, components9, components10, components11};
+    const indent_component_t *tests[] = {components1, components2, components3, components4, components5, components6, components7, components8, components9, components10, components11, components12};
    for (size_t which = 0; which < sizeof tests / sizeof *tests; which++)
    {
        const indent_component_t *components = tests[which];
--- a/parse_tree.cpp
+++ b/parse_tree.cpp
@ -13,13 +13,6 @@ static bool production_is_empty(const production_t *production)
    return (*production)[0] == token_type_invalid;
 }

-void swap2(parse_node_tree_t &a, parse_node_tree_t &b)
-{
-    fprintf(stderr, "Swapping!\n");
-    // This uses the base vector implementation
-    a.swap(b);
-}
-
 /** Returns a string description of this parse error */
 wcstring parse_error_t::describe_with_prefix(const wcstring &src, const wcstring &prefix, bool is_interactive, bool skip_caret) const
 {
@ -682,6 +675,8 @@ void parse_ll_t::dump_stack(void) const
 // Give each node a source range equal to the union of the ranges of its children
 // Terminal nodes already have source ranges (and no children)
 // Since children always appear after their parents, we can implement this very simply by walking backwards
+// We then do a second pass to give empty nodes an empty source range (but with a valid offset)
+// We do this by walking forward. If a child of a node has an invalid source range, we set it equal to the end of the source range of its previous child
 void parse_ll_t::determine_node_ranges(void)
 {
    size_t idx = nodes.size();
@ -712,6 +707,30 @@ void parse_ll_t::determine_node_ranges(void)
            parent->source_length = max_end - min_start;
        }
    }
+
+    /* Forwards pass */
+    size_t size = nodes.size();
+    for (idx = 0; idx < size; idx++)
+    {
+        /* Since we populate the source range based on the sibling node, it's simpler to walk over the children of each node.
+        We keep a running "child_source_cursor" which is meant to be the end of the child's source range. It's initially set to the beginning of the parent' source range. */
+        parse_node_t *parent = &nodes[idx];
+        // If the parent doesn't have a valid source range, then none of its children will either; skip it entirely
+        if (parent->source_start == SOURCE_OFFSET_INVALID)
+        {
+            continue;
+        }
+        source_offset_t child_source_cursor = parent->source_start;
+        for (size_t child_idx = 0; child_idx < parent->child_count; child_idx++)
+        {
+            parse_node_t *child = &nodes[parent->child_start + child_idx];
+            if (child->source_start == SOURCE_OFFSET_INVALID)
+            {
+                child->source_start = child_source_cursor;
+            }
+            child_source_cursor = child->source_start + child->source_length;
+        }
+    }
 }

 void parse_ll_t::acquire_output(parse_node_tree_t *output, parse_error_list_t *errors)
@ -1002,10 +1021,12 @@ void parse_ll_t::accept_tokens(parse_token_t token1, parse_token_t token2)
    // Handle special types specially. Note that these are the only types that can be pushed if the symbol stack is empty.
    if (token1.type == parse_special_type_parse_error || token1.type == parse_special_type_tokenizer_error || token1.type == parse_special_type_comment)
    {
-        parse_node_t err_node(token1.type);
-        err_node.source_start = token1.source_start;
-        err_node.source_length = token1.source_length;
-        nodes.push_back(err_node);
+        /* We set the special node's parent to the top of the stack. This means that we have an asymmetric relationship: the special node has a parent (which is the node we were trying to generate when we encountered the special node), but the parent node does not have the special node as a child. This means for example that parents don't have to worry about tracking any comment nodes, but we can still recover the parent from the comment. */
+        parse_node_t special_node(token1.type);
+        special_node.parent = symbol_stack.back().node_idx;
+        special_node.source_start = token1.source_start;
+        special_node.source_length = token1.source_length;
+        nodes.push_back(special_node);
        consumed = true;

        /* tokenizer errors are fatal */
--- a/parse_tree.h
+++ b/parse_tree.h
@ -111,7 +111,9 @@ public:
    /* Indicate if this node has a range of source code associated with it */
    bool has_source() const
    {
-        return source_start != SOURCE_OFFSET_INVALID;
+        /* Should never have a nonempty range with an invalid offset */
+        assert(this->source_start != SOURCE_OFFSET_INVALID || this->source_length == 0);
+        return this->source_length > 0;
    }

    /* Gets source for the node, or the empty string if it has no source */
--- a/parse_util.cpp
+++ b/parse_util.cpp
@ -840,7 +840,7 @@ static void compute_indents_recursive(const parse_node_tree_t &tree, node_offset
    if (node_idx > *max_visited_node_idx)
        *max_visited_node_idx = node_idx;

-    /* We could implement this by utilizing the fish grammar. But there's an easy trick instead: almost everything that wraps a job list should be indented by 1. So just find all of the job lists. One exception is switch; the other exception is job_list itself: a job_list is a job and a job_list, and we want that child list to be indented the same as the parent. So just find all job_lists whose parent is not a job_list, and increment their indent by 1. */
+    /* We could implement this by utilizing the fish grammar. But there's an easy trick instead: almost everything that wraps a job list should be indented by 1. So just find all of the job lists. One exception is switch, which wraps a case_item_list instead of a job_list. The other exception is job_list itself: a job_list is a job and a job_list, and we want that child list to be indented the same as the parent. So just find all job_lists whose parent is not a job_list, and increment their indent by 1. */

    const parse_node_t &node = tree.at(node_idx);
    const parse_token_type_t node_type = node.type;
@ -877,10 +877,39 @@ static void compute_indents_recursive(const parse_node_tree_t &tree, node_offset


    /* Store the indent into the indent array */
-    if (node.has_source())
+    if (node.source_start != SOURCE_OFFSET_INVALID && node.source_start < indents->size())
    {
-        assert(node.source_start < indents->size());
-        indents->at(node.source_start) = node_indent;
+        if (node.has_source())
+        {
+            /* A normal non-empty node. Store the indent unconditionally. */
+            indents->at(node.source_start) = node_indent;
+        }
+        else
+        {
+            /* An empty node. We have a source offset but no source length. This can come about when a node legitimately empty:
+
+                  while true; end
+
+               The job_list inside the while loop is empty. It still has a source offset (at the end of the while statement) but no source extent.
+               We still need to capture that indent, because there may be comments inside:
+                    while true
+                       # loop forever
+                    end
+
+               The 'loop forever' comment must be indented, by virtue of storing the indent.
+
+               Now consider what happens if we remove the end:
+
+                   while true
+                     # loop forever
+
+                Now both the job_list and end_command are unmaterialized. However, we want the indent to be of the job_list and not the end_command.  Therefore, we only store the indent if it's bigger.
+            */
+            if (node_indent > indents->at(node.source_start))
+            {
+                indents->at(node.source_start) = node_indent;
+            }
+        }
    }


@ -900,7 +929,7 @@ std::vector<int> parse_util_compute_indents(const wcstring &src)

    /* Parse the string. We pass continue_after_error to produce a forest; the trailing indent of the last node we visited becomes the input indent of the next. I.e. in the case of 'switch foo ; cas', we get an invalid parse tree (since 'cas' is not valid) but we indent it as if it were a case item list */
    parse_node_tree_t tree;
-    parse_tree_from_string(src, parse_flag_continue_after_error | parse_flag_accept_incomplete_tokens, &tree, NULL /* errors */);
+    parse_tree_from_string(src, parse_flag_continue_after_error | parse_flag_include_comments | parse_flag_accept_incomplete_tokens, &tree, NULL /* errors */);

    /* Start indenting at the first node. If we have a parse error, we'll have to start indenting from the top again */
    node_offset_t start_node_idx = 0;
@ -922,6 +951,22 @@ std::vector<int> parse_util_compute_indents(const wcstring &src)
        start_node_idx = max_visited_node_idx + 1;
    }

+    /* Handle comments. Each comment node has a parent (which is whatever the top of the symbol stack was when the comment was encountered). So the source range of the comment has the same indent as its parent. */
+    const size_t tree_size = tree.size();
+    for (node_offset_t i=0; i < tree_size; i++)
+    {
+        const parse_node_t &node = tree.at(i);
+        if (node.type == parse_special_type_comment && node.has_source() && node.parent < tree_size)
+        {
+            const parse_node_t &parent = tree.at(node.parent);
+            if (parent.source_start != SOURCE_OFFSET_INVALID)
+            {
+                indents.at(node.source_start) = indents.at(parent.source_start);
+            }
+        }
+    }
+
+    /* Now apply the indents. The indents array has -1 for places where the indent does not change, so start at each value and extend it along the run of -1s */
    int last_indent = 0;
    for (size_t i=0; i<src_size; i++)
    {