BookStack/app/Entities/Tools/PageIncludeParser.php

<?php

namespace BookStack\Entities\Tools;

use BookStack\Util\HtmlDocument;
use Closure;
use DOMNode;
use DOMText;

/**
 * Parses page HTML for include tags (text of the form "{{@...}}" whose inner
 * content starts with a digit) and replaces each tag with the content
 * supplied by a content-provider closure.
 *
 * Work in progress: only content reported as inline is inserted so far.
 * Tags whose content is not inline are isolated into their own text node
 * but otherwise remain in the output (see the TODO notes within parse()).
 */
class PageIncludeParser
{
    /**
     * Pattern for a single include tag: "{{@", one optional whitespace
     * character, then the inner content (capture group 1, which must start
     * with a digit and is matched lazily) up to the closing "}}".
     */
    protected static string $includeTagRegex = "/{{@\s?([0-9].*?)}}/";

    /**
     * @param string  $pageHtml         The HTML to search for include tags.
     * @param Closure $pageContentForId Called with the value of the tag's getPageId();
     *                                  its return value is handed to PageIncludeContent
     *                                  as the HTML for that tag. Note that the closure is
     *                                  invoked via Closure::call(), so within it `$this`
     *                                  is bound to this parser instance.
     */
    public function __construct(
        protected string $pageHtml,
        protected Closure $pageContentForId,
    ) {
    }

    /**
     * Parse the page HTML, replacing include tags with their fetched content.
     *
     * @return string The inner HTML of the resulting document body.
     */
    public function parse(): string
    {
        $doc = new HtmlDocument($this->pageHtml);

        $tags = $this->locateAndIsolateIncludeTags($doc);

        foreach ($tags as $tag) {
            $htmlContent = $this->pageContentForId->call($this, $tag->getPageId());
            $content = new PageIncludeContent($htmlContent, $tag);

            if ($content->isInline()) {
                // Place the adopted content nodes directly before the tag's own
                // text node, in order, then drop the tag text node itself.
                $tagNode = $tag->domNode;
                $parent = $tagNode->parentNode;
                foreach ($doc->adoptNodes($content->toDomNodes()) as $adoptedContentNode) {
                    $parent->insertBefore($adoptedContentNode, $tagNode);
                }
                $parent->removeChild($tagNode);
                continue;
            }

            // TODO - Non-inline
        }

        // TODO:
        // Hunt down the specific text nodes with matches
        // Split out tag text node from rest of content
        // Fetch tag content->
          // If range or top-block: delete tag text node, [Promote to top-block], delete old top-block if empty
          // If inline: Replace current text node with new text or elem
        // !! "Range" or "inline" status should come from tag parser and content fetcher, not guessed direct from content
        //     since we could have a range of inline elements

        // [Promote to top-block]
        // Tricky operation.
        // Can throw in before or after current top-block depending on relative position
        // Could [Split] top-block but complex past a single level depth.
        // Maybe [Split] if one level depth, otherwise default to before/after block
        // Should work for the vast majority of cases, and not for those which would
        // technically be invalid in-editor anyway.

        // [Split]
        // Copy original top-block node type and attrs (apart from ID)
        // Move nodes after promoted tag-node into copy
        // Insert copy after original (after promoted top-block eventually)

        // Notes: May want to eventually parse through backwards, which should avoid issues
        // in changes affecting the next tag, where tags may be in the same/adjacent nodes.


        return $doc->getBodyInnerHtml();
    }

    /**
     * Locate include tags within the given document, isolating them to their
     * own nodes in the DOM for future targeted manipulation.
     * Only text held by elements below the body is searched.
     * @return PageIncludeTag[]
     */
    protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array
    {
        // The predicate is applied to each text node individually. Using
        // contains(text(), ...) on the parent element instead would only ever
        // test the element's FIRST text node (XPath 1.0 converts a node-set to
        // the string value of its first node), missing tags which follow a
        // child element, as in "<p>Hello <b>x</b> {{@1}}</p>".
        $matchedTextNodes = $doc->queryXPath("//body//*/text()[contains(., '{{@')]");

        // Copy the matches to a plain array before splitting, so the DOM
        // changes made below can never interfere with the iteration.
        $textNodes = [];
        /** @var DOMNode $matchedTextNode */
        foreach ($matchedTextNodes as $matchedTextNode) {
            // The XPath text() test can also match CDATA sections;
            // only plain text nodes are searched for tags.
            if ($matchedTextNode->nodeName === '#text') {
                $textNodes[] = $matchedTextNode;
            }
        }

        $includeTags = [];
        foreach ($textNodes as $textNode) {
            array_push($includeTags, ...$this->splitTextNodesAtTags($textNode));
        }

        return $includeTags;
    }

    /**
     * Takes a text DOMNode and splits its text content at include tags
     * into multiple text nodes within the original parent.
     * Each tag, and each run of text preceding a tag, is inserted as a new text
     * node before the original node, which is left holding the text that
     * follows the last tag. The node is untouched if it contains no tags.
     * Returns found PageIncludeTag references.
     * @return PageIncludeTag[]
     */
    protected function splitTextNodesAtTags(DOMNode $textNode): array
    {
        $text = $textNode->textContent;

        // Nothing to split on no match (0) or on a regex failure (false).
        if (!preg_match_all(static::$includeTagRegex, $text, $matches, PREG_OFFSET_CAPTURE)) {
            return [];
        }

        $includeTags = [];
        $parent = $textNode->parentNode;

        // PREG_OFFSET_CAPTURE reports byte offsets, so the byte-wise
        // substr()/strlen() functions are intentionally used throughout.
        $currentOffset = 0;
        foreach ($matches[0] as $index => [$tagOuterContent, $tagStartOffset]) {
            $tagInnerContent = $matches[1][$index][0];

            // Text between the previous tag (or the start) and this tag.
            if ($currentOffset < $tagStartOffset) {
                $previousText = substr($text, $currentOffset, $tagStartOffset - $currentOffset);
                $parent->insertBefore(new DOMText($previousText), $textNode);
            }

            // The tag itself, within its own dedicated text node.
            $node = $parent->insertBefore(new DOMText($tagOuterContent), $textNode);
            $includeTags[] = new PageIncludeTag($tagInnerContent, $node);
            $currentOffset = $tagStartOffset + strlen($tagOuterContent);
        }

        // The original node retains whatever text followed the last tag.
        $textNode->textContent = substr($text, $currentOffset);

        return $includeTags;
    }
}
Includes: Started foundations for new include tag parser 2023-11-23 06:14:28 +08:00			`<?php`

			`namespace BookStack\Entities\Tools;`

			`use BookStack\Util\HtmlDocument;`
			`use Closure;`
Includes: Developed to get new system working with inline includes Adds logic for locating and splitting text nodes. Adds specific classes to offload tag/content specific logic. 2023-11-23 22:29:07 +08:00			`use DOMNode;`
			`use DOMText;`
Includes: Started foundations for new include tag parser 2023-11-23 06:14:28 +08:00
			`class PageIncludeParser`
			`{`
			`protected static string $includeTagRegex = "/{{@\s?([0-9].*?)}}/";`

			`public function __construct(`
			`protected string $pageHtml,`
			`protected Closure $pageContentForId,`
			`) {`
			`}`

			`public function parse(): string`
			`{`
Includes: Developed to get new system working with inline includes Adds logic for locating and splitting text nodes. Adds specific classes to offload tag/content specific logic. 2023-11-23 22:29:07 +08:00			`$doc = new HtmlDocument($this->pageHtml);`
Includes: Started foundations for new include tag parser 2023-11-23 06:14:28 +08:00
Includes: Developed to get new system working with inline includes Adds logic for locating and splitting text nodes. Adds specific classes to offload tag/content specific logic. 2023-11-23 22:29:07 +08:00			`$tags = $this->locateAndIsolateIncludeTags($doc);`
Includes: Started foundations for new include tag parser 2023-11-23 06:14:28 +08:00
Includes: Developed to get new system working with inline includes Adds logic for locating and splitting text nodes. Adds specific classes to offload tag/content specific logic. 2023-11-23 22:29:07 +08:00			`foreach ($tags as $tag) {`
			`$htmlContent = $this->pageContentForId->call($this, $tag->getPageId());`
			`$content = new PageIncludeContent($htmlContent, $tag);`

			`if ($content->isInline()) {`
			`$adopted = $doc->adoptNodes($content->toDomNodes());`
			`foreach ($adopted as $adoptedContentNode) {`
			`$tag->domNode->parentNode->insertBefore($adoptedContentNode, $tag->domNode);`
			`}`
			`$tag->domNode->parentNode->removeChild($tag->domNode);`
			`continue;`
			`}`

			`// TODO - Non-inline`
			`}`
Includes: Started foundations for new include tag parser 2023-11-23 06:14:28 +08:00
			`// TODO:`
			`// Hunt down the specific text nodes with matches`
			`// Split out tag text node from rest of content`
			`// Fetch tag content->`
			`// If range or top-block: delete tag text node, [Promote to top-block], delete old top-block if empty`
			`// If inline: Replace current text node with new text or elem`
			`// !! "Range" or "inline" status should come from tag parser and content fetcher, not guessed direct from content`
			`// since we could have a range of inline elements`

			`// [Promote to top-block]`
			`// Tricky operation.`
			`// Can throw in before or after current top-block depending on relative position`
			`// Could [Split] top-block but complex past a single level depth.`
			`// Maybe [Split] if one level depth, otherwise default to before/after block`
			`// Should work for the vast majority of cases, and not for those which would`
			`// technically be invalid in-editor anyway.`

			`// [Split]`
			`// Copy original top-block node type and attrs (apart from ID)`
			`// Move nodes after promoted tag-node into copy`
			`// Insert copy after original (after promoted top-block eventually)`

			`// Notes: May want to eventually parse through backwards, which should avoid issues`
			`// in changes affecting the next tag, where tags may be in the same/adjacent nodes.`


Includes: Developed to get new system working with inline includes Adds logic for locating and splitting text nodes. Adds specific classes to offload tag/content specific logic. 2023-11-23 22:29:07 +08:00			`return $doc->getBodyInnerHtml();`
			`}`

			`/**`
			`* Locate include tags within the given document, isolating them to their`
			`* own nodes in the DOM for future targeted manipulation.`
			`* @return PageIncludeTag[]`
			`*/`
			`protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array`
			`{`
			`$includeHosts = $doc->queryXPath("//body//*[contains(text(), '{{@')]");`
			`$includeTags = [];`

			`/** @var DOMNode $node */`
			`/** @var DOMNode $childNode */`
			`foreach ($includeHosts as $node) {`
			`foreach ($node->childNodes as $childNode) {`
			`if ($childNode->nodeName === '#text') {`
			`array_push($includeTags, ...$this->splitTextNodesAtTags($childNode));`
			`}`
			`}`
			`}`

			`return $includeTags;`
			`}`

			`/**`
			`* Takes a text DOMNode and splits its text content at include tags`
			`* into multiple text nodes within the original parent.`
			`* Returns found PageIncludeTag references.`
			`* @return PageIncludeTag[]`
			`*/`
			`protected function splitTextNodesAtTags(DOMNode $textNode): array`
			`{`
			`$includeTags = [];`
			`$text = $textNode->textContent;`
			`preg_match_all(static::$includeTagRegex, $text, $matches, PREG_OFFSET_CAPTURE);`

			`$currentOffset = 0;`
			`foreach ($matches[0] as $index => $fullTagMatch) {`
			`$tagOuterContent = $fullTagMatch[0];`
			`$tagInnerContent = $matches[1][$index][0];`
			`$tagStartOffset = $fullTagMatch[1];`

			`if ($currentOffset < $tagStartOffset) {`
			`$previousText = substr($text, $currentOffset, $tagStartOffset - $currentOffset);`
			`$textNode->parentNode->insertBefore(new DOMText($previousText), $textNode);`
			`}`

			`$node = $textNode->parentNode->insertBefore(new DOMText($tagOuterContent), $textNode);`
			`$includeTags[] = new PageIncludeTag($tagInnerContent, $node);`
			`$currentOffset = $tagStartOffset + strlen($tagOuterContent);`
			`}`

			`if ($currentOffset > 0) {`
			`$textNode->textContent = substr($text, $currentOffset);`
			`}`

			`return $includeTags;`
Includes: Started foundations for new include tag parser 2023-11-23 06:14:28 +08:00			`}`
			`}`