From 75936454cca139d0b226a95ee7a0070bc8702fdc Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Thu, 23 Nov 2023 14:29:07 +0000 Subject: [PATCH] Includes: Developed to get new system working with inline includes Adds logic for locating and splitting text nodes. Adds specific classes to offload tag/content specific logic. --- app/Entities/Tools/PageIncludeContent.php | 68 ++++++++++++++++++ app/Entities/Tools/PageIncludeParser.php | 85 +++++++++++++++++++++-- app/Entities/Tools/PageIncludeTag.php | 30 ++++++++ app/Util/HtmlDocument.php | 15 ++++ tests/Unit/PageIncludeParserTest.php | 2 +- 5 files changed, 192 insertions(+), 8 deletions(-) create mode 100644 app/Entities/Tools/PageIncludeContent.php create mode 100644 app/Entities/Tools/PageIncludeTag.php diff --git a/app/Entities/Tools/PageIncludeContent.php b/app/Entities/Tools/PageIncludeContent.php new file mode 100644 index 000000000..97c470c68 --- /dev/null +++ b/app/Entities/Tools/PageIncludeContent.php @@ -0,0 +1,68 @@ +parseHtml($html, $tag); + } + + protected function parseHtml(string $html, PageIncludeTag $tag): void + { + if (empty($html)) { + return; + } + + $doc = new HtmlDocument($html); + + $sectionId = $tag->getSectionId(); + if (!$sectionId) { + $this->contents = [...$doc->getBodyChildren()]; + $this->isTopLevel = true; + return; + } + + $section = $doc->getElementById($sectionId); + if (!$section) { + return; + } + + $isTopLevel = in_array(strtolower($section->nodeName), static::$topLevelTags); + $this->isTopLevel = $isTopLevel; + $this->contents = $isTopLevel ? [$section] : [...$section->childNodes]; + } + + public function isInline(): bool + { + return !$this->isTopLevel; + } + + public function isEmpty(): bool + { + return empty($this->contents); + } + + /** + * @return DOMNode[] + */ + public function toDomNodes(): array + { + return $this->contents; + } +} diff --git a/app/Entities/Tools/PageIncludeParser.php b/app/Entities/Tools/PageIncludeParser.php index 63d3ea8d6..070b0cc11 100644 --- a/app/Entities/Tools/PageIncludeParser.php +++ b/app/Entities/Tools/PageIncludeParser.php @@ -4,6 +4,8 @@ namespace BookStack\Entities\Tools; use BookStack\Util\HtmlDocument; use Closure; +use DOMNode; +use DOMText; class PageIncludeParser { @@ -17,14 +19,25 @@ class PageIncludeParser public function parse(): string { - $html = new HtmlDocument($this->pageHtml); + $doc = new HtmlDocument($this->pageHtml); - $includeHosts = $html->queryXPath("//body//*[contains(text(), '{{@')]"); - $node = $includeHosts->item(0); + $tags = $this->locateAndIsolateIncludeTags($doc); - // One of the direct child textnodes of the "$includeHosts" should be - // the one with the include tag within. - $textNode = $node->childNodes->item(0); + foreach ($tags as $tag) { + $htmlContent = $this->pageContentForId->call($this, $tag->getPageId()); + $content = new PageIncludeContent($htmlContent, $tag); + + if ($content->isInline()) { + $adopted = $doc->adoptNodes($content->toDomNodes()); + foreach ($adopted as $adoptedContentNode) { + $tag->domNode->parentNode->insertBefore($adoptedContentNode, $tag->domNode); + } + $tag->domNode->parentNode->removeChild($tag->domNode); + continue; + } + + // TODO - Non-inline + } // TODO: // Hunt down the specific text nodes with matches @@ -52,6 +65,64 @@ class PageIncludeParser // in changes affecting the next tag, where tags may be in the same/adjacent nodes. - return $html->getBodyInnerHtml(); + return $doc->getBodyInnerHtml(); + } + + /** + * Locate include tags within the given document, isolating them to their + * own nodes in the DOM for future targeted manipulation. + * @return PageIncludeTag[] + */ + protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array + { + $includeHosts = $doc->queryXPath("//body//*[contains(text(), '{{@')]"); + $includeTags = []; + + /** @var DOMNode $node */ + /** @var DOMNode $childNode */ + foreach ($includeHosts as $node) { + foreach ($node->childNodes as $childNode) { + if ($childNode->nodeName === '#text') { + array_push($includeTags, ...$this->splitTextNodesAtTags($childNode)); + } + } + } + + return $includeTags; + } + + /** + * Takes a text DOMNode and splits its text content at include tags + * into multiple text nodes within the original parent. + * Returns found PageIncludeTag references. + * @return PageIncludeTag[] + */ + protected function splitTextNodesAtTags(DOMNode $textNode): array + { + $includeTags = []; + $text = $textNode->textContent; + preg_match_all(static::$includeTagRegex, $text, $matches, PREG_OFFSET_CAPTURE); + + $currentOffset = 0; + foreach ($matches[0] as $index => $fullTagMatch) { + $tagOuterContent = $fullTagMatch[0]; + $tagInnerContent = $matches[1][$index][0]; + $tagStartOffset = $fullTagMatch[1]; + + if ($currentOffset < $tagStartOffset) { + $previousText = substr($text, $currentOffset, $tagStartOffset - $currentOffset); + $textNode->parentNode->insertBefore(new DOMText($previousText), $textNode); + } + + $node = $textNode->parentNode->insertBefore(new DOMText($tagOuterContent), $textNode); + $includeTags[] = new PageIncludeTag($tagInnerContent, $node); + $currentOffset = $tagStartOffset + strlen($tagOuterContent); + } + + if ($currentOffset > 0) { + $textNode->textContent = substr($text, $currentOffset); + } + + return $includeTags; } } diff --git a/app/Entities/Tools/PageIncludeTag.php b/app/Entities/Tools/PageIncludeTag.php new file mode 100644 index 000000000..05a532fb2 --- /dev/null +++ b/app/Entities/Tools/PageIncludeTag.php @@ -0,0 +1,30 @@ +tagContent, 2)[0])); + } + + /** + * Get the section ID that this tag references (if any) + */ + public function getSectionId(): string + { + return trim(explode('#', $this->tagContent, 2)[1] ?? ''); + } +} diff --git a/app/Util/HtmlDocument.php b/app/Util/HtmlDocument.php index b8c53d439..ad5dacd82 100644 --- a/app/Util/HtmlDocument.php +++ b/app/Util/HtmlDocument.php @@ -149,4 +149,19 @@ class HtmlDocument { return $this->document->saveHTML($node); } + + /** + * Adopt the given nodes into this document. + * @param DOMNode[] $nodes + * @return DOMNode[] + */ + public function adoptNodes(array $nodes): array + { + $adopted = []; + foreach ($nodes as $node) { + $adopted[] = $this->document->importNode($node, true); + } + + return $adopted; + } } diff --git a/tests/Unit/PageIncludeParserTest.php b/tests/Unit/PageIncludeParserTest.php index de31504ff..d1912270e 100644 --- a/tests/Unit/PageIncludeParserTest.php +++ b/tests/Unit/PageIncludeParserTest.php @@ -37,7 +37,7 @@ class PageIncludeParserTest extends TestCase protected function runParserTest(string $html, array $contentById, string $expected) { $parser = new PageIncludeParser($html, function (int $id) use ($contentById) { - return $contentById[strval($id)] ?? null; + return $contentById[strval($id)] ?? ''; }); $result = $parser->parse();