Merge pull request #5488 from BookStackApp/search_index_updates
Some checks failed
analyse-php / build (push) Has been cancelled
lint-php / build (push) Has been cancelled
test-migrations / build (8.2) (push) Has been cancelled
test-migrations / build (8.3) (push) Has been cancelled
test-migrations / build (8.4) (push) Has been cancelled
test-php / build (8.2) (push) Has been cancelled
test-php / build (8.3) (push) Has been cancelled
test-php / build (8.4) (push) Has been cancelled

Search index improvements
This commit is contained in:
Dan Brown 2025-02-14 19:39:08 +00:00 committed by GitHub
commit 92ad81429f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 333 additions and 189 deletions

View File

@ -16,7 +16,13 @@ class SearchIndex
/**
* A list of delimiter characters used to break-up parsed content into terms for indexing.
*/
public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
/**
* A list of delimiters which may commonly be used within a single term, while also indicating a break between terms.
* The indexer will index the full term containing these delimiters, plus the terms produced by splitting on them.
*/
public static string $softDelimiters = ".-";
public function __construct(
protected EntityProvider $entityProvider
@ -196,15 +202,36 @@ class SearchIndex
protected function textToTermCountMap(string $text): array
{
$tokenMap = []; // {TextToken => OccurrenceCount}
$splitChars = static::$delimiters;
$token = strtok($text, $splitChars);
$softDelims = static::$softDelimiters;
$tokenizer = new SearchTextTokenizer($text, static::$delimiters);
$extendedToken = '';
$extendedLen = 0;
$token = $tokenizer->next();
while ($token !== false) {
if (!isset($tokenMap[$token])) {
$tokenMap[$token] = 0;
$delim = $tokenizer->previousDelimiter();
if ($delim && str_contains($softDelims, $delim) && $token !== '') {
$extendedToken .= $delim . $token;
$extendedLen++;
} else {
if ($extendedLen > 1) {
$tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
}
$extendedToken = $token;
$extendedLen = 1;
}
$tokenMap[$token]++;
$token = strtok($splitChars);
if ($token) {
$tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
}
$token = $tokenizer->next();
}
if ($extendedLen > 1) {
$tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
}
return $tokenMap;

View File

@ -181,7 +181,7 @@ class SearchOptions
protected static function parseStandardTermString(string $termString): array
{
$terms = explode(' ', $termString);
$indexDelimiters = SearchIndex::$delimiters;
$indexDelimiters = implode('', array_diff(str_split(SearchIndex::$delimiters), str_split(SearchIndex::$softDelimiters)));
$parsed = [
'terms' => [],
'exacts' => [],

View File

@ -0,0 +1,70 @@
<?php
namespace BookStack\Search;
/**
 * A custom text tokenizer which records & provides insight needed for our search indexing.
 * We used to use basic strtok() but this class does the following which that lacked:
 * - Tracks and provides the current/previous delimiter that we've stopped at.
 * - Returns empty tokens upon parsing a delimiter.
 * - Scans the text by UTF-8 character rather than by byte, so that multi-byte delimiters
 *   (such as guillemets) only match themselves and never split other characters which
 *   happen to share one of their bytes.
 */
class SearchTextTokenizer
{
    /** Byte offset within the text at which the next scan will start. */
    protected int $currentIndex = 0;

    /** Length of the text, in bytes. */
    protected int $length;

    /** The delimiter which ended the most recently returned token ('' if it was ended by the end of the text). */
    protected string $currentDelimiter = '';

    /** The delimiter which preceded the most recently returned token ('' if there was none). */
    protected string $previousDelimiter = '';

    /**
     * @param string $text       The text to break into tokens.
     * @param string $delimiters Every character in this string is treated as a delimiter.
     */
    public function __construct(
        protected string $text,
        protected string $delimiters = ' '
    ) {
        $this->length = strlen($this->text);
    }

    /**
     * Get the delimiter that ended the most recently returned token.
     * Is an empty string before any token has been read, or when that token was ended
     * by the end of the text instead of a delimiter.
     */
    public function currentDelimiter(): string
    {
        return $this->currentDelimiter;
    }

    /**
     * Get the delimiter that came directly before the most recently returned token.
     * Is an empty string when no delimiter had been found before that token.
     */
    public function previousDelimiter(): string
    {
        return $this->previousDelimiter;
    }

    /**
     * Get the next token between delimiters.
     * An empty string is returned where a delimiter is found with no content before it.
     * Returns false if there's no further tokens.
     */
    public function next(): string|false
    {
        $token = '';
        $index = $this->currentIndex;

        while ($index < $this->length) {
            $char = $this->characterAt($index);
            $index += strlen($char);

            if (str_contains($this->delimiters, $char)) {
                $this->previousDelimiter = $this->currentDelimiter;
                $this->currentDelimiter = $char;
                $this->currentIndex = $index;

                return $token;
            }

            $token .= $char;
        }

        // Strict comparison is needed here since a trailing token of "0" is falsy
        // in PHP and would otherwise be silently dropped from the results.
        if ($token !== '') {
            $this->currentIndex = $this->length;
            $this->previousDelimiter = $this->currentDelimiter;
            $this->currentDelimiter = '';

            return $token;
        }

        return false;
    }

    /**
     * Read the complete character which starts at the given byte offset of the text.
     * Well-formed multi-byte UTF-8 sequences are returned whole. Anything else (ASCII,
     * stray continuation bytes, truncated or malformed sequences) is returned as a single
     * byte so that scanning always advances and malformed input is handled byte-by-byte.
     */
    protected function characterAt(int $index): string
    {
        $lead = ord($this->text[$index]);

        // Sequence width as indicated by the UTF-8 lead byte.
        $width = match (true) {
            $lead >= 0xF0 && $lead <= 0xF4 => 4,
            $lead >= 0xE0 && $lead <= 0xEF => 3,
            $lead >= 0xC2 && $lead <= 0xDF => 2,
            default => 1,
        };

        if ($width === 1 || $index + $width > $this->length) {
            return $this->text[$index];
        }

        // Every following byte of the sequence must be a continuation byte (10xxxxxx).
        for ($offset = 1; $offset < $width; $offset++) {
            $byte = ord($this->text[$index + $offset]);
            if ($byte < 0x80 || $byte > 0xBF) {
                return $this->text[$index];
            }
        }

        return substr($this->text, $index, $width);
    }
}

View File

@ -1,12 +1,9 @@
<?php
namespace Tests\Entity;
namespace Search;
use BookStack\Activity\Models\Tag;
use BookStack\Entities\Models\Book;
use BookStack\Entities\Models\Bookshelf;
use BookStack\Entities\Models\Chapter;
use Illuminate\Support\Str;
use Tests\TestCase;
class EntitySearchTest extends TestCase
@ -312,113 +309,6 @@ class EntitySearchTest extends TestCase
$defaultListTest->assertDontSee($templatePage->name);
}
public function test_sibling_search_for_pages()
{
$chapter = $this->entities->chapterHasPages();
$this->assertGreaterThan(2, count($chapter->pages), 'Ensure we\'re testing with at least 1 sibling');
$page = $chapter->pages->first();
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page");
$search->assertSuccessful();
foreach ($chapter->pages as $page) {
$search->assertSee($page->name);
}
$search->assertDontSee($chapter->name);
}
public function test_sibling_search_for_pages_without_chapter()
{
$page = $this->entities->pageNotWithinChapter();
$bookChildren = $page->book->getDirectVisibleChildren();
$this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling');
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page");
$search->assertSuccessful();
foreach ($bookChildren as $child) {
$search->assertSee($child->name);
}
$search->assertDontSee($page->book->name);
}
public function test_sibling_search_for_chapters()
{
$chapter = $this->entities->chapter();
$bookChildren = $chapter->book->getDirectVisibleChildren();
$this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling');
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$chapter->id}&entity_type=chapter");
$search->assertSuccessful();
foreach ($bookChildren as $child) {
$search->assertSee($child->name);
}
$search->assertDontSee($chapter->book->name);
}
public function test_sibling_search_for_books()
{
$books = Book::query()->take(10)->get();
$book = $books->first();
$this->assertGreaterThan(2, count($books), 'Ensure we\'re testing with at least 1 sibling');
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$book->id}&entity_type=book");
$search->assertSuccessful();
foreach ($books as $expectedBook) {
$search->assertSee($expectedBook->name);
}
}
public function test_sibling_search_for_shelves()
{
$shelves = Bookshelf::query()->take(10)->get();
$shelf = $shelves->first();
$this->assertGreaterThan(2, count($shelves), 'Ensure we\'re testing with at least 1 sibling');
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$shelf->id}&entity_type=bookshelf");
$search->assertSuccessful();
foreach ($shelves as $expectedShelf) {
$search->assertSee($expectedShelf->name);
}
}
public function test_sibling_search_for_books_provides_results_in_alphabetical_order()
{
$contextBook = $this->entities->book();
$searchBook = $this->entities->book();
$searchBook->name = 'Zebras';
$searchBook->save();
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book");
$this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras');
$searchBook->name = '1AAAAAAArdvarks';
$searchBook->save();
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book");
$this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks');
}
public function test_sibling_search_for_shelves_provides_results_in_alphabetical_order()
{
$contextShelf = $this->entities->shelf();
$searchShelf = $this->entities->shelf();
$searchShelf->name = 'Zebras';
$searchShelf->save();
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf");
$this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras');
$searchShelf->name = '1AAAAAAArdvarks';
$searchShelf->save();
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf");
$this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks');
}
public function test_search_works_on_updated_page_content()
{
$page = $this->entities->page();
@ -453,75 +343,6 @@ class EntitySearchTest extends TestCase
$this->withHtml($search)->assertElementContains('.entity-list > .page:nth-child(2)', 'Test page A');
}
public function test_terms_in_headers_have_an_adjusted_index_score()
{
$page = $this->entities->newPage(['name' => 'Test page A', 'html' => '
<p>TermA</p>
<h1>TermB <strong>TermNested</strong></h1>
<h2>TermC</h2>
<h3>TermD</h3>
<h4>TermE</h4>
<h5>TermF</h5>
<h6>TermG</h6>
']);
$scoreByTerm = $page->searchTerms()->pluck('score', 'term');
$this->assertEquals(1, $scoreByTerm->get('TermA'));
$this->assertEquals(10, $scoreByTerm->get('TermB'));
$this->assertEquals(10, $scoreByTerm->get('TermNested'));
$this->assertEquals(5, $scoreByTerm->get('TermC'));
$this->assertEquals(4, $scoreByTerm->get('TermD'));
$this->assertEquals(3, $scoreByTerm->get('TermE'));
$this->assertEquals(2, $scoreByTerm->get('TermF'));
// Is 1.5 but stored as integer, rounding up
$this->assertEquals(2, $scoreByTerm->get('TermG'));
}
public function test_indexing_works_as_expected_for_page_with_lots_of_terms()
{
$this->markTestSkipped('Time consuming test');
$count = 100000;
$text = '';
$chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_#';
for ($i = 0; $i < $count; $i++) {
$text .= substr(str_shuffle($chars), 0, 5) . ' ';
}
$page = $this->entities->newPage(['name' => 'Test page A', 'html' => '<p>' . $text . '</p>']);
$termCount = $page->searchTerms()->count();
// Expect at least 90% unique rate
$this->assertGreaterThan($count * 0.9, $termCount);
}
public function test_name_and_content_terms_are_merged_to_single_score()
{
$page = $this->entities->newPage(['name' => 'TermA', 'html' => '
<p>TermA</p>
']);
$scoreByTerm = $page->searchTerms()->pluck('score', 'term');
// Scores 40 for being in the name then 1 for being in the content
$this->assertEquals(41, $scoreByTerm->get('TermA'));
}
public function test_tag_names_and_values_are_indexed_for_search()
{
$page = $this->entities->newPage(['name' => 'PageA', 'html' => '<p>content</p>', 'tags' => [
['name' => 'Animal', 'value' => 'MeowieCat'],
['name' => 'SuperImportant'],
]]);
$scoreByTerm = $page->searchTerms()->pluck('score', 'term');
$this->assertEquals(5, $scoreByTerm->get('MeowieCat'));
$this->assertEquals(3, $scoreByTerm->get('Animal'));
$this->assertEquals(3, $scoreByTerm->get('SuperImportant'));
}
public function test_matching_terms_in_search_results_are_highlighted()
{
$this->entities->newPage(['name' => 'My Meowie Cat', 'html' => '<p>A superimportant page about meowieable animals</p>', 'tags' => [

View File

@ -0,0 +1,109 @@
<?php
namespace Search;
use Tests\TestCase;
/**
 * Tests covering how entity content is broken into terms and scored within the search index.
 */
class SearchIndexingTest extends TestCase
{
    /**
     * Terms are expected to be scored differently depending on the heading level they sit within.
     */
    public function test_terms_in_headers_have_an_adjusted_index_score()
    {
        $html = '
<p>TermA</p>
<h1>TermB <strong>TermNested</strong></h1>
<h2>TermC</h2>
<h3>TermD</h3>
<h4>TermE</h4>
<h5>TermF</h5>
<h6>TermG</h6>
';
        $page = $this->entities->newPage(['name' => 'Test page A', 'html' => $html]);
        $scores = $page->searchTerms()->pluck('score', 'term');

        $expectedScores = [
            'TermA'      => 1,
            'TermB'      => 10,
            'TermNested' => 10,
            'TermC'      => 5,
            'TermD'      => 4,
            'TermE'      => 3,
            'TermF'      => 2,
            // Is 1.5 but stored as integer, rounding up
            'TermG'      => 2,
        ];

        foreach ($expectedScores as $term => $expectedScore) {
            $this->assertEquals($expectedScore, $scores->get($term));
        }
    }

    /**
     * Skipped by default due to its run time; indexes a page containing a very large number of random terms.
     */
    public function test_indexing_works_as_expected_for_page_with_lots_of_terms()
    {
        $this->markTestSkipped('Time consuming test');

        $termTotal = 100000;
        $alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_#';
        $content = '';
        $generated = 0;
        while ($generated < $termTotal) {
            $content .= substr(str_shuffle($alphabet), 0, 5) . ' ';
            $generated++;
        }

        $page = $this->entities->newPage(['name' => 'Test page A', 'html' => '<p>' . $content . '</p>']);

        // Expect at least 90% unique rate
        $this->assertGreaterThan($termTotal * 0.9, $page->searchTerms()->count());
    }

    /**
     * A term appearing in both the name and the content should result in a single combined score.
     */
    public function test_name_and_content_terms_are_merged_to_single_score()
    {
        $html = '
<p>TermA</p>
';
        $page = $this->entities->newPage(['name' => 'TermA', 'html' => $html]);
        $scores = $page->searchTerms()->pluck('score', 'term');

        // Scores 40 for being in the name then 1 for being in the content
        $this->assertEquals(41, $scores->get('TermA'));
    }

    /**
     * Both the names and the values of tags should be present in the index.
     */
    public function test_tag_names_and_values_are_indexed_for_search()
    {
        $tags = [
            ['name' => 'Animal', 'value' => 'MeowieCat'],
            ['name' => 'SuperImportant'],
        ];
        $page = $this->entities->newPage(['name' => 'PageA', 'html' => '<p>content</p>', 'tags' => $tags]);
        $scores = $page->searchTerms()->pluck('score', 'term');

        $expectedScores = [
            'MeowieCat'      => 5,
            'Animal'         => 3,
            'SuperImportant' => 3,
        ];

        foreach ($expectedScores as $term => $expectedScore) {
            $this->assertEquals($expectedScore, $scores->get($term));
        }
    }

    /**
     * Guillemets should split terms and never be indexed as terms themselves.
     */
    public function test_terms_containing_guillemets_handled()
    {
        $page = $this->entities->newPage(['html' => '<p>«Hello there» and « there »</p>']);
        $scores = $page->searchTerms()->pluck('score', 'term');

        $this->assertTermsIndexed(['Hello', 'there', 'and'], $scores);
        $this->assertTermsNotIndexed(['«', '»'], $scores);
    }

    /**
     * Terms joined by in-term punctuation should be indexed both whole and split,
     * while punctuation which merely trails a term should not produce a joined term.
     */
    public function test_terms_containing_punctuation_within_retain_original_form_and_split_form_in_index()
    {
        $page = $this->entities->newPage(['html' => '<p>super.duper awesome-beans big- barry cheese.</p><p>biscuits</p><p>a-bs</p>']);
        $scores = $page->searchTerms()->pluck('score', 'term');

        $this->assertTermsIndexed(
            ['super', 'duper', 'super.duper', 'awesome-beans', 'awesome', 'beans', 'big', 'barry', 'cheese', 'biscuits', 'a-bs', 'a', 'bs'],
            $scores
        );
        $this->assertTermsNotIndexed(['big-', 'big-barry', 'cheese.', 'cheese.biscuits'], $scores);
    }

    /**
     * Assert that each of the given terms has an entry in the given term => score collection.
     *
     * @param string[] $terms
     * @param mixed    $scoreByTerm Collection of scores keyed by term, as plucked from the page search terms.
     */
    private function assertTermsIndexed(array $terms, $scoreByTerm): void
    {
        foreach ($terms as $term) {
            $this->assertNotNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is indexed");
        }
    }

    /**
     * Assert that none of the given terms have an entry in the given term => score collection.
     *
     * @param string[] $terms
     * @param mixed    $scoreByTerm Collection of scores keyed by term, as plucked from the page search terms.
     */
    private function assertTermsNotIndexed(array $terms, $scoreByTerm): void
    {
        foreach ($terms as $term) {
            $this->assertNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is not indexed");
        }
    }
}

View File

@ -1,6 +1,6 @@
<?php
namespace Tests\Entity;
namespace Search;
use BookStack\Search\Options\ExactSearchOption;
use BookStack\Search\Options\FilterSearchOption;

View File

@ -0,0 +1,117 @@
<?php
namespace Search;
use BookStack\Entities\Models\Book;
use BookStack\Entities\Models\Bookshelf;
use Tests\TestCase;
/**
 * Tests for the sibling search endpoint, which lists the entities that sit alongside a given entity.
 */
class SiblingSearchTest extends TestCase
{
    /**
     * Pages within a chapter should list the pages of that chapter, but not the chapter itself.
     */
    public function test_sibling_search_for_pages()
    {
        $chapter = $this->entities->chapterHasPages();
        $this->assertGreaterThan(2, count($chapter->pages), "Ensure we're testing with at least 1 sibling");

        $response = $this->searchSiblingsOf($chapter->pages->first(), 'page');
        $response->assertSuccessful();

        foreach ($chapter->pages as $siblingPage) {
            $response->assertSee($siblingPage->name);
        }

        $response->assertDontSee($chapter->name);
    }

    /**
     * Pages directly within a book should list the direct children of that book, but not the book itself.
     */
    public function test_sibling_search_for_pages_without_chapter()
    {
        $page = $this->entities->pageNotWithinChapter();
        $siblings = $page->book->getDirectVisibleChildren();
        $this->assertGreaterThan(2, count($siblings), "Ensure we're testing with at least 1 sibling");

        $response = $this->searchSiblingsOf($page, 'page');
        $response->assertSuccessful();

        foreach ($siblings as $sibling) {
            $response->assertSee($sibling->name);
        }

        $response->assertDontSee($page->book->name);
    }

    /**
     * Chapters should list the direct children of their book, but not the book itself.
     */
    public function test_sibling_search_for_chapters()
    {
        $chapter = $this->entities->chapter();
        $siblings = $chapter->book->getDirectVisibleChildren();
        $this->assertGreaterThan(2, count($siblings), "Ensure we're testing with at least 1 sibling");

        $response = $this->searchSiblingsOf($chapter, 'chapter');
        $response->assertSuccessful();

        foreach ($siblings as $sibling) {
            $response->assertSee($sibling->name);
        }

        $response->assertDontSee($chapter->book->name);
    }

    /**
     * Books should list other books.
     */
    public function test_sibling_search_for_books()
    {
        $books = Book::query()->take(10)->get();
        $contextBook = $books->first();
        $this->assertGreaterThan(2, count($books), "Ensure we're testing with at least 1 sibling");

        $response = $this->searchSiblingsOf($contextBook, 'book');
        $response->assertSuccessful();

        foreach ($books as $listedBook) {
            $response->assertSee($listedBook->name);
        }
    }

    /**
     * Shelves should list other shelves.
     */
    public function test_sibling_search_for_shelves()
    {
        $shelves = Bookshelf::query()->take(10)->get();
        $contextShelf = $shelves->first();
        $this->assertGreaterThan(2, count($shelves), "Ensure we're testing with at least 1 sibling");

        $response = $this->searchSiblingsOf($contextShelf, 'bookshelf');
        $response->assertSuccessful();

        foreach ($shelves as $listedShelf) {
            $response->assertSee($listedShelf->name);
        }
    }

    public function test_sibling_search_for_books_provides_results_in_alphabetical_order()
    {
        $contextBook = $this->entities->book();
        $renamedBook = $this->entities->book();

        $this->assertRenamingChangesFirstResult($contextBook, $renamedBook, 'book');
    }

    public function test_sibling_search_for_shelves_provides_results_in_alphabetical_order()
    {
        $contextShelf = $this->entities->shelf();
        $renamedShelf = $this->entities->shelf();

        $this->assertRenamingChangesFirstResult($contextShelf, $renamedShelf, 'bookshelf');
    }

    /**
     * Perform a sibling search request, as a viewer user, for the given entity.
     * Returns the response of the request.
     */
    private function searchSiblingsOf(object $entity, string $entityType)
    {
        return $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$entity->id}&entity_type={$entityType}");
    }

    /**
     * Rename an entity to sort last, then first, checking on each rename whether
     * it appears as the first link of the sibling search results for the context entity.
     */
    private function assertRenamingChangesFirstResult(object $contextEntity, object $renamedEntity, string $entityType): void
    {
        $renamedEntity->name = 'Zebras';
        $renamedEntity->save();
        $response = $this->searchSiblingsOf($contextEntity, $entityType);
        $this->withHtml($response)->assertElementNotContains('a:first-child', 'Zebras');

        $renamedEntity->name = '1AAAAAAArdvarks';
        $renamedEntity->save();
        $response = $this->searchSiblingsOf($contextEntity, $entityType);
        $this->withHtml($response)->assertElementContains('a:first-child', '1AAAAAAArdvarks');
    }
}