From f4449928f83748d015a633cdc1cef50fe822648c Mon Sep 17 00:00:00 2001
From: Dan Brown <ssddanbrown@googlemail.com>
Date: Fri, 14 Feb 2025 19:01:51 +0000
Subject: [PATCH] Searching: Added custom tokenizer that considers soft
 delimiters.

This changes indexing so that a.b now indexes as "a", "b" AND "a.b"
instead of just the first two, for periods and hypens, so terms
containing those characters can be searched within.

Adds hypens as a delimiter - #2095
---
 app/Search/SearchIndex.php          | 41 ++++++++++++++---
 app/Search/SearchOptions.php        |  2 +-
 app/Search/SearchTextTokenizer.php  | 70 +++++++++++++++++++++++++++++
 tests/Search/SearchIndexingTest.php | 16 +++++++
 4 files changed, 121 insertions(+), 8 deletions(-)
 create mode 100644 app/Search/SearchTextTokenizer.php

diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php
index c7d9d6502..a8bd2c4b2 100644
--- a/app/Search/SearchIndex.php
+++ b/app/Search/SearchIndex.php
@@ -16,7 +16,13 @@ class SearchIndex
     /**
      * A list of delimiter characters used to break-up parsed content into terms for indexing.
      */
-    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
+    public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"";
+
+    /**
+     * A list of delimiter which could be commonly used within a single term and also indicate a break between terms.
+     * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
+     */
+    public static string $softDelimiters = ".-";
 
     public function __construct(
         protected EntityProvider $entityProvider
@@ -196,15 +202,36 @@ class SearchIndex
     protected function textToTermCountMap(string $text): array
     {
         $tokenMap = []; // {TextToken => OccurrenceCount}
-        $splitChars = static::$delimiters;
-        $token = strtok($text, $splitChars);
+        $softDelims = static::$softDelimiters;
+        $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
+        $extendedToken = '';
+        $extendedLen = 0;
+
+        $token = $tokenizer->next();
 
         while ($token !== false) {
-            if (!isset($tokenMap[$token])) {
-                $tokenMap[$token] = 0;
+            $delim = $tokenizer->previousDelimiter();
+
+            if ($delim && str_contains($softDelims, $delim) && $token !== '') {
+                $extendedToken .= $delim . $token;
+                $extendedLen++;
+            } else {
+                if ($extendedLen > 1) {
+                    $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
+                }
+                $extendedToken = $token;
+                $extendedLen = 1;
             }
-            $tokenMap[$token]++;
-            $token = strtok($splitChars);
+
+            if ($token) {
+                $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
+            }
+
+            $token = $tokenizer->next();
+        }
+
+        if ($extendedLen > 1) {
+            $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
         }
 
         return $tokenMap;
diff --git a/app/Search/SearchOptions.php b/app/Search/SearchOptions.php
index a6f820299..bf527d9c3 100644
--- a/app/Search/SearchOptions.php
+++ b/app/Search/SearchOptions.php
@@ -181,7 +181,7 @@ class SearchOptions
     protected static function parseStandardTermString(string $termString): array
     {
         $terms = explode(' ', $termString);
-        $indexDelimiters = SearchIndex::$delimiters;
+        $indexDelimiters = implode('', array_diff(str_split(SearchIndex::$delimiters), str_split(SearchIndex::$softDelimiters)));
         $parsed = [
             'terms'  => [],
             'exacts' => [],
diff --git a/app/Search/SearchTextTokenizer.php b/app/Search/SearchTextTokenizer.php
new file mode 100644
index 000000000..f43fd56f1
--- /dev/null
+++ b/app/Search/SearchTextTokenizer.php
@@ -0,0 +1,70 @@
+<?php
+
+namespace BookStack\Search;
+
+/**
+ * A custom text tokenizer which records & provides insight needed for our search indexing.
+ * We used to use basic strtok() but this class does the following which that lacked:
+ * - Tracks and provides the current/previous delimiter that we've stopped at.
+ * - Returns empty tokens upon parsing a delimiter.
+ */
+class SearchTextTokenizer
+{
+    protected int $currentIndex = 0;
+    protected int $length;
+    protected string $currentDelimiter = '';
+    protected string $previousDelimiter = '';
+
+    public function __construct(
+        protected string $text,
+        protected string $delimiters = ' '
+    ) {
+        $this->length = strlen($this->text);
+    }
+
+    /**
+     * Get the current delimiter to be found.
+     */
+    public function currentDelimiter(): string
+    {
+        return $this->currentDelimiter;
+    }
+
+    /**
+     * Get the previous delimiter found.
+     */
+    public function previousDelimiter(): string
+    {
+        return $this->previousDelimiter;
+    }
+
+    /**
+     * Get the next token between delimiters.
+     * Returns false if there's no further tokens.
+     */
+    public function next(): string|false
+    {
+        $token = '';
+
+        for ($i = $this->currentIndex; $i < $this->length; $i++) {
+            $char = $this->text[$i];
+            if (str_contains($this->delimiters, $char)) {
+                $this->previousDelimiter = $this->currentDelimiter;
+                $this->currentDelimiter = $char;
+                $this->currentIndex = $i + 1;
+                return $token;
+            }
+
+            $token .= $char;
+        }
+
+        if ($token) {
+            $this->currentIndex = $this->length;
+            $this->previousDelimiter = $this->currentDelimiter;
+            $this->currentDelimiter = '';
+            return $token;
+        }
+
+        return false;
+    }
+}
diff --git a/tests/Search/SearchIndexingTest.php b/tests/Search/SearchIndexingTest.php
index 43219a4ed..6933813b6 100644
--- a/tests/Search/SearchIndexingTest.php
+++ b/tests/Search/SearchIndexingTest.php
@@ -74,4 +74,20 @@ class SearchIndexingTest extends TestCase
         $this->assertEquals(3, $scoreByTerm->get('Animal'));
         $this->assertEquals(3, $scoreByTerm->get('SuperImportant'));
     }
+
+    public function test_terms_containing_punctuation_within_retain_original_form_and_split_form_in_index()
+    {
+        $page = $this->entities->newPage(['html' => '<p>super.duper awesome-beans big- barry cheese.</p><p>biscuits</p><p>a-bs</p>']);
+
+        $scoreByTerm = $page->searchTerms()->pluck('score', 'term');
+        $expected = ['super', 'duper', 'super.duper', 'awesome-beans', 'awesome', 'beans', 'big', 'barry', 'cheese', 'biscuits', 'a-bs', 'a', 'bs'];
+        foreach ($expected as $term) {
+            $this->assertNotNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is indexed");
+        }
+
+        $nonExpected = ['big-', 'big-barry', 'cheese.', 'cheese.biscuits'];
+        foreach ($nonExpected as $term) {
+            $this->assertNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is not indexed");
+        }
+    }
 }