From 6de1ea0194ac6c40e5775ffe9d8b7890c458c9b1 Mon Sep 17 00:00:00 2001 From: Sami Mazouz Date: Sat, 9 Apr 2022 23:04:15 +0100 Subject: [PATCH] fix: languages with combining characters cannot be searched (#3385) * test: searching other types of languages * fix: languages with combining characters cannot be searched * test: can search in discussion titles --- .../Search/Gambit/FulltextGambit.php | 2 +- .../ListWithFulltextSearchTest.php | 53 ++++++++++++++++++- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/framework/core/src/Discussion/Search/Gambit/FulltextGambit.php b/framework/core/src/Discussion/Search/Gambit/FulltextGambit.php index 2c1798eb0..ed0b16f2a 100644 --- a/framework/core/src/Discussion/Search/Gambit/FulltextGambit.php +++ b/framework/core/src/Discussion/Search/Gambit/FulltextGambit.php @@ -25,7 +25,7 @@ class FulltextGambit implements GambitInterface // Replace all non-word characters with spaces. // We do this to prevent MySQL fulltext search boolean mode from taking // effect: https://dev.mysql.com/doc/refman/5.7/en/fulltext-boolean.html - $bit = preg_replace('/[^\p{L}\p{N}_]+/u', ' ', $bit); + $bit = preg_replace('/[^\p{L}\p{N}\p{M}_]+/u', ' ', $bit); $query = $search->getQuery(); $grammar = $query->getGrammar(); diff --git a/framework/core/tests/integration/api/discussions/ListWithFulltextSearchTest.php b/framework/core/tests/integration/api/discussions/ListWithFulltextSearchTest.php index 1a4bf15c2..81694aefa 100644 --- a/framework/core/tests/integration/api/discussions/ListWithFulltextSearchTest.php +++ b/framework/core/tests/integration/api/discussions/ListWithFulltextSearchTest.php @@ -12,6 +12,7 @@ namespace Flarum\Tests\integration\api\discussions; use Carbon\Carbon; use Flarum\Testing\integration\RetrievesAuthorizedUsers; use Flarum\Testing\integration\TestCase; +use Illuminate\Support\Arr; class ListWithFulltextSearchTest extends TestCase { @@ -34,6 +35,8 @@ class ListWithFulltextSearchTest extends TestCase ['id' => 2, 'title' => 'lightsail in title too', 'created_at' => Carbon::createFromDate(2020, 01, 01)->toDateTimeString(), 'user_id' => 1, 'comment_count' => 1], ['id' => 3, 'title' => 'not in title either', 'created_at' => Carbon::now()->toDateTimeString(), 'user_id' => 1, 'comment_count' => 1], ['id' => 4, 'title' => 'not in title or text', 'created_at' => Carbon::now()->toDateTimeString(), 'user_id' => 1, 'comment_count' => 1], + ['id' => 5, 'title' => 'తెలుగు', 'created_at' => Carbon::now()->toDateTimeString(), 'user_id' => 1, 'comment_count' => 1], + ['id' => 6, 'title' => '支持中文吗', 'created_at' => Carbon::now()->toDateTimeString(), 'user_id' => 1, 'comment_count' => 1], ]); $this->database()->table('posts')->insert([ @@ -42,6 +45,8 @@ class ListWithFulltextSearchTest extends TestCase ['id' => 3, 'discussion_id' => 2, 'created_at' => Carbon::now()->toDateTimeString(), 'user_id' => 1, 'type' => 'comment', 'content' => '

another lightsail for discussion 2!

'], ['id' => 4, 'discussion_id' => 3, 'created_at' => Carbon::now()->toDateTimeString(), 'user_id' => 1, 'type' => 'comment', 'content' => '

just one lightsail for discussion 3.

'], ['id' => 5, 'discussion_id' => 4, 'created_at' => Carbon::now()->toDateTimeString(), 'user_id' => 1, 'type' => 'comment', 'content' => '

not in title or text

'], + ['id' => 6, 'discussion_id' => 4, 'created_at' => Carbon::now()->toDateTimeString(), 'user_id' => 1, 'type' => 'comment', 'content' => '

తెలుగు

'], + ['id' => 7, 'discussion_id' => 2, 'created_at' => Carbon::now()->toDateTimeString(), 'user_id' => 1, 'type' => 'comment', 'content' => '

支持中文吗

'], ]); // We need to call these again, since we rolled back the transaction started by `::app()`. @@ -79,7 +84,7 @@ class ListWithFulltextSearchTest extends TestCase return $row['id']; }, $data['data']); - $this->assertEquals(['2', '1', '3'], $ids, 'IDs do not match'); + $this->assertEqualsCanonicalizing(['2', '1', '3'], $ids, 'IDs do not match'); } /** @@ -100,7 +105,51 @@ class ListWithFulltextSearchTest extends TestCase return $row['id']; }, $data['data']); - $this->assertEquals(['2', '1', '3'], $ids, 'IDs do not match'); + $this->assertEqualsCanonicalizing(['2', '1', '3'], $ids, 'IDs do not match'); + } + + /** + * @test + */ + public function can_search_telugu_like_languages() + { + $response = $this->send( + $this->request('GET', '/api/discussions') + ->withQueryParams([ + 'filter' => ['q' => 'తెలుగు'], + 'include' => 'mostRelevantPost', + ]) + ); + + $data = json_decode($response->getBody()->getContents(), true); + $ids = array_map(function ($row) { + return $row['id']; + }, $data['data']); + + $this->assertEqualsCanonicalizing(['4', '5'], $ids, 'IDs do not match'); + $this->assertEqualsCanonicalizing(['6'], Arr::pluck($data['included'], 'id')); + } + + /** + * @test + */ + public function can_search_cjk_languages() + { + $response = $this->send( + $this->request('GET', '/api/discussions') + ->withQueryParams([ + 'filter' => ['q' => '支持中文吗'], + 'include' => 'mostRelevantPost', + ]) + ); + + $data = json_decode($response->getBody()->getContents(), true); + $ids = array_map(function ($row) { + return $row['id']; + }, $data['data']); + + $this->assertEqualsCanonicalizing(['2', '6'], $ids, 'IDs do not match'); + $this->assertEqualsCanonicalizing(['7'], Arr::pluck($data['included'], 'id')); } /**