From 7fdc0481538151d8a5ed3ec2a32639950f5d8ac6 Mon Sep 17 00:00:00 2001 From: 6543 Date: Sat, 9 Mar 2024 02:39:27 +0100 Subject: [PATCH] Patch in exact search for meilisearch (#29671) meilisearch does not have a search option to control fuzziness per query right now: - https://github.com/meilisearch/meilisearch/issues/1192 - https://github.com/orgs/meilisearch/discussions/377 - https://github.com/meilisearch/meilisearch/discussions/1096 so we have to create a workaround by post-filtering the search results in gitea until this is addressed. For future work, I added an option (backend only for now) to enable fuzziness for the issue indexer too. I also refactored the code so the fuzzy option follows the same logic as in the code indexer --- *Sponsored by Kithara Software GmbH* --- modules/indexer/code/bleve/bleve.go | 12 +-- .../code/elasticsearch/elasticsearch.go | 8 +- modules/indexer/code/indexer_test.go | 2 +- modules/indexer/code/internal/indexer.go | 4 +- modules/indexer/code/search.go | 5 +- modules/indexer/internal/bleve/query.go | 7 ++ modules/indexer/issues/bleve/bleve.go | 17 +++- .../issues/elasticsearch/elasticsearch.go | 12 ++- modules/indexer/issues/internal/model.go | 2 + .../indexer/issues/meilisearch/meilisearch.go | 91 +++++++++++++++++-- .../issues/meilisearch/meilisearch_test.go | 45 +++++++++ routers/web/explore/code.go | 4 +- routers/web/repo/search.go | 4 +- routers/web/user/code.go | 4 +- 14 files changed, 184 insertions(+), 33 deletions(-) diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 8ba50ed77c9..107dd23598d 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -233,21 +233,21 @@ func (b *Indexer) Delete(_ context.Context, repoID int64) error { // Search searches for files in the specified repo.
// Returns the matching file-paths -func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { +func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { var ( indexerQuery query.Query keywordQuery query.Query ) - if isMatch { - prefixQuery := bleve.NewPrefixQuery(keyword) - prefixQuery.FieldVal = "Content" - keywordQuery = prefixQuery - } else { + if isFuzzy { phraseQuery := bleve.NewMatchPhraseQuery(keyword) phraseQuery.FieldVal = "Content" phraseQuery.Analyzer = repoIndexerAnalyzer keywordQuery = phraseQuery + } else { + prefixQuery := bleve.NewPrefixQuery(keyword) + prefixQuery.FieldVal = "Content" + keywordQuery = prefixQuery } if len(repoIDs) > 0 { diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index 0f70f134855..065b0b20618 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -281,10 +281,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan } // Search searches for codes and language stats by given conditions. 
-func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { - searchType := esMultiMatchTypeBestFields - if isMatch { - searchType = esMultiMatchTypePhrasePrefix +func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { + searchType := esMultiMatchTypePhrasePrefix + if isFuzzy { + searchType = esMultiMatchTypeBestFields } kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType) diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go index 5eb8e61e3db..23dbd634105 100644 --- a/modules/indexer/code/indexer_test.go +++ b/modules/indexer/code/indexer_test.go @@ -70,7 +70,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { for _, kw := range keywords { t.Run(kw.Keyword, func(t *testing.T) { - total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, false) + total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, true) assert.NoError(t, err) assert.Len(t, kw.IDs, int(total)) assert.Len(t, langs, kw.Langs) diff --git a/modules/indexer/code/internal/indexer.go b/modules/indexer/code/internal/indexer.go index da3ac3623c9..c92419deb22 100644 --- a/modules/indexer/code/internal/indexer.go +++ b/modules/indexer/code/internal/indexer.go @@ -16,7 +16,7 @@ type Indexer interface { internal.Indexer Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error Delete(ctx context.Context, repoID int64) error - Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) + Search(ctx context.Context, repoIDs []int64, 
language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error) } // NewDummyIndexer returns a dummy indexer @@ -38,6 +38,6 @@ func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error { return fmt.Errorf("indexer is not ready") } -func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) { +func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error) { return 0, nil, nil, fmt.Errorf("indexer is not ready") } diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go index 2ddc2397fa1..89a62a8d3e2 100644 --- a/modules/indexer/code/search.go +++ b/modules/indexer/code/search.go @@ -124,12 +124,13 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res } // PerformSearch perform a search on a repository -func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int, []*Result, []*internal.SearchResultLanguages, error) { +// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2 +func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int, []*Result, []*internal.SearchResultLanguages, error) { if len(keyword) == 0 { return 0, nil, nil, nil } - total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isMatch) + total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isFuzzy) if err != nil { return 0, nil, nil, err } diff --git a/modules/indexer/internal/bleve/query.go b/modules/indexer/internal/bleve/query.go index c7d66538c12..2a427c40202 100644 --- 
a/modules/indexer/internal/bleve/query.go +++ b/modules/indexer/internal/bleve/query.go @@ -25,6 +25,13 @@ func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQue return q } +// PrefixQuery generates a match prefix query for the given prefix and field +func PrefixQuery(matchPrefix, field string) *query.PrefixQuery { + q := bleve.NewPrefixQuery(matchPrefix) + q.FieldVal = field + return q +} + // BoolFieldQuery generates a bool field query for the given value and field func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery { q := bleve.NewBoolFieldQuery(value) diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go index 6a5d65cb665..aaea854efa0 100644 --- a/modules/indexer/issues/bleve/bleve.go +++ b/modules/indexer/issues/bleve/bleve.go @@ -156,12 +156,19 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) ( var queries []query.Query if options.Keyword != "" { - keywordQueries := []query.Query{ - inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer), - inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer), - inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer), + if options.IsFuzzyKeyword { + queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ + inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer), + inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer), + inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer), + }...)) + } else { + queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ + inner_bleve.PrefixQuery(options.Keyword, "title"), + inner_bleve.PrefixQuery(options.Keyword, "content"), + inner_bleve.PrefixQuery(options.Keyword, "comments"), + }...)) } - queries = append(queries, bleve.NewDisjunctionQuery(keywordQueries...)) } if len(options.RepoIDs) > 0 || options.AllPublic { diff 
--git a/modules/indexer/issues/elasticsearch/elasticsearch.go b/modules/indexer/issues/elasticsearch/elasticsearch.go index 3acd3ade715..0077da263a7 100644 --- a/modules/indexer/issues/elasticsearch/elasticsearch.go +++ b/modules/indexer/issues/elasticsearch/elasticsearch.go @@ -19,6 +19,10 @@ import ( const ( issueIndexerLatestVersion = 1 + // multi-match-types, currently only 2 types are used + // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types + esMultiMatchTypeBestFields = "best_fields" + esMultiMatchTypePhrasePrefix = "phrase_prefix" ) var _ internal.Indexer = &Indexer{} @@ -141,7 +145,13 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) ( query := elastic.NewBoolQuery() if options.Keyword != "" { - query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments")) + + searchType := esMultiMatchTypePhrasePrefix + if options.IsFuzzyKeyword { + searchType = esMultiMatchTypeBestFields + } + + query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType)) } if len(options.RepoIDs) > 0 { diff --git a/modules/indexer/issues/internal/model.go b/modules/indexer/issues/internal/model.go index 947335d8ce9..d41fec4aba8 100644 --- a/modules/indexer/issues/internal/model.go +++ b/modules/indexer/issues/internal/model.go @@ -74,6 +74,8 @@ type SearchResult struct { type SearchOptions struct { Keyword string // keyword to search + IsFuzzyKeyword bool // if false the levenshtein distance is 0 + RepoIDs []int64 // repository IDs which the issues belong to AllPublic bool // if include all public repositories diff --git a/modules/indexer/issues/meilisearch/meilisearch.go b/modules/indexer/issues/meilisearch/meilisearch.go index 325883196bb..c4299200653 100644 --- a/modules/indexer/issues/meilisearch/meilisearch.go +++ b/modules/indexer/issues/meilisearch/meilisearch.go @@ -5,6 +5,7 @@ package meilisearch import ( 
"context" + "errors" "strconv" "strings" @@ -16,12 +17,15 @@ import ( ) const ( - issueIndexerLatestVersion = 2 + issueIndexerLatestVersion = 3 // TODO: make this configurable if necessary maxTotalHits = 10000 ) +// ErrMalformedResponse is never expected as we initialize the indexer ourself and so define the types. +var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content") + var _ internal.Indexer = &Indexer{} // Indexer implements Indexer interface @@ -47,6 +51,9 @@ func NewIndexer(url, apiKey, indexerName string) *Indexer { }, DisplayedAttributes: []string{ "id", + "title", + "content", + "comments", }, FilterableAttributes: []string{ "repo_id", @@ -221,11 +228,9 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) ( return nil, err } - hits := make([]internal.Match, 0, len(searchRes.Hits)) - for _, hit := range searchRes.Hits { - hits = append(hits, internal.Match{ - ID: int64(hit.(map[string]any)["id"].(float64)), - }) + hits, err := nonFuzzyWorkaround(searchRes, options.Keyword, options.IsFuzzyKeyword) + if err != nil { + return nil, err } return &internal.SearchResult{ @@ -241,3 +246,77 @@ func parseSortBy(sortBy internal.SortBy) string { } return field + ":asc" } + +// nonFuzzyWorkaround is needed as meilisearch does not have an exact search +// and you can only change "typo tolerance" per index. 
So we have to post-filter the results +// https://www.meilisearch.com/docs/learn/configuration/typo_tolerance#configuring-typo-tolerance +// TODO: remove once https://github.com/orgs/meilisearch/discussions/377 is addressed +func nonFuzzyWorkaround(searchRes *meilisearch.SearchResponse, keyword string, isFuzzy bool) ([]internal.Match, error) { + hits := make([]internal.Match, 0, len(searchRes.Hits)) + for _, hit := range searchRes.Hits { + hit, ok := hit.(map[string]any) + if !ok { + return nil, ErrMalformedResponse + } + + if !isFuzzy { + keyword = strings.ToLower(keyword) + + // declare a anon func to check if the title, content or at least one comment contains the keyword + found, err := func() (bool, error) { + // check if title match first + title, ok := hit["title"].(string) + if !ok { + return false, ErrMalformedResponse + } else if strings.Contains(strings.ToLower(title), keyword) { + return true, nil + } + + // check if content has a match + content, ok := hit["content"].(string) + if !ok { + return false, ErrMalformedResponse + } else if strings.Contains(strings.ToLower(content), keyword) { + return true, nil + } + + // now check for each comment if one has a match + // so we first try to cast and skip if there are no comments + comments, ok := hit["comments"].([]any) + if !ok { + return false, ErrMalformedResponse + } else if len(comments) == 0 { + return false, nil + } + + // now we iterate over all and report as soon as we detect one match + for i := range comments { + comment, ok := comments[i].(string) + if !ok { + return false, ErrMalformedResponse + } + if strings.Contains(strings.ToLower(comment), keyword) { + return true, nil + } + } + + // we got no match + return false, nil + }() + + if err != nil { + return nil, err + } else if !found { + continue + } + } + issueID, ok := hit["id"].(float64) + if !ok { + return nil, ErrMalformedResponse + } + hits = append(hits, internal.Match{ + ID: int64(issueID), + }) + } + return hits, nil +} diff --git 
a/modules/indexer/issues/meilisearch/meilisearch_test.go b/modules/indexer/issues/meilisearch/meilisearch_test.go index 3d7237268e1..ecce704236b 100644 --- a/modules/indexer/issues/meilisearch/meilisearch_test.go +++ b/modules/indexer/issues/meilisearch/meilisearch_test.go @@ -10,7 +10,11 @@ import ( "testing" "time" + "code.gitea.io/gitea/modules/indexer/issues/internal" "code.gitea.io/gitea/modules/indexer/issues/internal/tests" + + "github.com/meilisearch/meilisearch-go" + "github.com/stretchr/testify/assert" ) func TestMeilisearchIndexer(t *testing.T) { @@ -48,3 +52,44 @@ func TestMeilisearchIndexer(t *testing.T) { tests.TestIndexer(t, indexer) } + +func TestNonFuzzyWorkaround(t *testing.T) { + // get unexpected return + _, err := nonFuzzyWorkaround(&meilisearch.SearchResponse{ + Hits: []any{"aa", "bb", "cc", "dd"}, + }, "bowling", false) + assert.ErrorIs(t, err, ErrMalformedResponse) + + validResponse := &meilisearch.SearchResponse{ + Hits: []any{ + map[string]any{ + "id": float64(11), + "title": "a title", + "content": "issue body with no match", + "comments": []any{"hey whats up?", "I'm currently bowling", "nice"}, + }, + map[string]any{ + "id": float64(22), + "title": "Bowling as title", + "content": "", + "comments": []any{}, + }, + map[string]any{ + "id": float64(33), + "title": "Bowl-ing as fuzzy match", + "content": "", + "comments": []any{}, + }, + }, + } + + // nonFuzzy + hits, err := nonFuzzyWorkaround(validResponse, "bowling", false) + assert.NoError(t, err) + assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}}, hits) + + // fuzzy + hits, err = nonFuzzyWorkaround(validResponse, "bowling", true) + assert.NoError(t, err) + assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}, {ID: 33}}, hits) +} diff --git a/routers/web/explore/code.go b/routers/web/explore/code.go index 2cde8b655ee..a6bc71ac9cd 100644 --- a/routers/web/explore/code.go +++ b/routers/web/explore/code.go @@ -35,7 +35,7 @@ func Code(ctx *context.Context) { keyword := 
ctx.FormTrim("q") queryType := ctx.FormTrim("t") - isMatch := queryType == "match" + isFuzzy := queryType != "match" ctx.Data["Keyword"] = keyword ctx.Data["Language"] = language @@ -77,7 +77,7 @@ func Code(ctx *context.Context) { ) if (len(repoIDs) > 0) || isAdmin { - total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch) + total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy) if err != nil { if code_indexer.IsAvailable(ctx) { ctx.ServerError("SearchResults", err) diff --git a/routers/web/repo/search.go b/routers/web/repo/search.go index c53d8fd918a..766dd5726aa 100644 --- a/routers/web/repo/search.go +++ b/routers/web/repo/search.go @@ -25,7 +25,7 @@ func Search(ctx *context.Context) { keyword := ctx.FormTrim("q") queryType := ctx.FormTrim("t") - isMatch := queryType == "match" + isFuzzy := queryType != "match" ctx.Data["Keyword"] = keyword ctx.Data["Language"] = language @@ -43,7 +43,7 @@ func Search(ctx *context.Context) { } total, searchResults, searchResultLanguages, err := code_indexer.PerformSearch(ctx, []int64{ctx.Repo.Repository.ID}, - language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch) + language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy) if err != nil { if code_indexer.IsAvailable(ctx) { ctx.ServerError("SearchResults", err) diff --git a/routers/web/user/code.go b/routers/web/user/code.go index eb711b76ebb..8613d38b65a 100644 --- a/routers/web/user/code.go +++ b/routers/web/user/code.go @@ -40,7 +40,7 @@ func CodeSearch(ctx *context.Context) { keyword := ctx.FormTrim("q") queryType := ctx.FormTrim("t") - isMatch := queryType == "match" + isFuzzy := queryType != "match" ctx.Data["Keyword"] = keyword ctx.Data["Language"] = language @@ -75,7 +75,7 @@ func CodeSearch(ctx *context.Context) { ) if len(repoIDs) > 0 { - total, 
searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch) + total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy) if err != nil { if code_indexer.IsAvailable(ctx) { ctx.ServerError("SearchResults", err)