From f4449928f83748d015a633cdc1cef50fe822648c Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Fri, 14 Feb 2025 19:01:51 +0000 Subject: [PATCH] Searching: Added custom tokenizer that considers soft delimiters. This changes indexing so that a.b now indexes as "a", "b" AND "a.b" instead of just the first two, for periods and hyphens, so terms containing those characters can be searched within. Adds hyphens as a delimiter - #2095 --- app/Search/SearchIndex.php | 41 ++++++++++++++--- app/Search/SearchOptions.php | 2 +- app/Search/SearchTextTokenizer.php | 70 +++++++++++++++++++++++++++++ tests/Search/SearchIndexingTest.php | 16 +++++++ 4 files changed, 121 insertions(+), 8 deletions(-) create mode 100644 app/Search/SearchTextTokenizer.php diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php index c7d9d6502..a8bd2c4b2 100644 --- a/app/Search/SearchIndex.php +++ b/app/Search/SearchIndex.php @@ -16,7 +16,13 @@ class SearchIndex /** * A list of delimiter characters used to break-up parsed content into terms for indexing. */ - public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\""; + public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\""; + + /** + * A list of delimiter which could be commonly used within a single term and also indicate a break between terms. + * The indexer will index the full term with these delimiters, plus the terms split via these delimiters. 
+ */ + public static string $softDelimiters = ".-"; public function __construct( protected EntityProvider $entityProvider @@ -196,15 +202,36 @@ class SearchIndex protected function textToTermCountMap(string $text): array { $tokenMap = []; // {TextToken => OccurrenceCount} - $splitChars = static::$delimiters; - $token = strtok($text, $splitChars); + $softDelims = static::$softDelimiters; + $tokenizer = new SearchTextTokenizer($text, static::$delimiters); + $extendedToken = ''; + $extendedLen = 0; + + $token = $tokenizer->next(); while ($token !== false) { - if (!isset($tokenMap[$token])) { - $tokenMap[$token] = 0; + $delim = $tokenizer->previousDelimiter(); + + if ($delim && str_contains($softDelims, $delim) && $token !== '') { + $extendedToken .= $delim . $token; + $extendedLen++; + } else { + if ($extendedLen > 1) { + $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1; + } + $extendedToken = $token; + $extendedLen = 1; } - $tokenMap[$token]++; - $token = strtok($splitChars); + + if ($token) { + $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1; + } + + $token = $tokenizer->next(); + } + + if ($extendedLen > 1) { + $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 
0) + 1; } return $tokenMap; diff --git a/app/Search/SearchOptions.php b/app/Search/SearchOptions.php index a6f820299..bf527d9c3 100644 --- a/app/Search/SearchOptions.php +++ b/app/Search/SearchOptions.php @@ -181,7 +181,7 @@ class SearchOptions protected static function parseStandardTermString(string $termString): array { $terms = explode(' ', $termString); - $indexDelimiters = SearchIndex::$delimiters; + $indexDelimiters = implode('', array_diff(str_split(SearchIndex::$delimiters), str_split(SearchIndex::$softDelimiters))); $parsed = [ 'terms' => [], 'exacts' => [], diff --git a/app/Search/SearchTextTokenizer.php b/app/Search/SearchTextTokenizer.php new file mode 100644 index 000000000..f43fd56f1 --- /dev/null +++ b/app/Search/SearchTextTokenizer.php @@ -0,0 +1,70 @@ +length = strlen($this->text); + } + + /** + * Get the current delimiter to be found. + */ + public function currentDelimiter(): string + { + return $this->currentDelimiter; + } + + /** + * Get the previous delimiter found. + */ + public function previousDelimiter(): string + { + return $this->previousDelimiter; + } + + /** + * Get the next token between delimiters. + * Returns false if there's no further tokens. 
+ */ + public function next(): string|false + { + $token = ''; + + for ($i = $this->currentIndex; $i < $this->length; $i++) { + $char = $this->text[$i]; + if (str_contains($this->delimiters, $char)) { + $this->previousDelimiter = $this->currentDelimiter; + $this->currentDelimiter = $char; + $this->currentIndex = $i + 1; + return $token; + } + + $token .= $char; + } + + if ($token) { + $this->currentIndex = $this->length; + $this->previousDelimiter = $this->currentDelimiter; + $this->currentDelimiter = ''; + return $token; + } + + return false; + } +} diff --git a/tests/Search/SearchIndexingTest.php b/tests/Search/SearchIndexingTest.php index 43219a4ed..6933813b6 100644 --- a/tests/Search/SearchIndexingTest.php +++ b/tests/Search/SearchIndexingTest.php @@ -74,4 +74,20 @@ class SearchIndexingTest extends TestCase $this->assertEquals(3, $scoreByTerm->get('Animal')); $this->assertEquals(3, $scoreByTerm->get('SuperImportant')); } + + public function test_terms_containing_punctuation_within_retain_original_form_and_split_form_in_index() + { + $page = $this->entities->newPage(['html' => '

super.duper awesome-beans big- barry cheese.

biscuits

a-bs

']); + + $scoreByTerm = $page->searchTerms()->pluck('score', 'term'); + $expected = ['super', 'duper', 'super.duper', 'awesome-beans', 'awesome', 'beans', 'big', 'barry', 'cheese', 'biscuits', 'a-bs', 'a', 'bs']; + foreach ($expected as $term) { + $this->assertNotNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is indexed"); + } + + $nonExpected = ['big-', 'big-barry', 'cheese.', 'cheese.biscuits']; + foreach ($nonExpected as $term) { + $this->assertNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is not indexed"); + } + } }