Patch in exact search for meilisearch (#29671)

meilisearch does not have an search option to contorl fuzzynes per query
right now:
 - https://github.com/meilisearch/meilisearch/issues/1192
 - https://github.com/orgs/meilisearch/discussions/377
 - https://github.com/meilisearch/meilisearch/discussions/1096

so we have to create a workaround by post-filter the search result in
gitea until this is addressed.

For future works I added an option in backend only atm, to enable
fuzzynes for issue indexer too.
And also refactored the code so the fuzzy option is equal in logic to
code indexer


---
*Sponsored by Kithara Software GmbH*
This commit is contained in:
6543 2024-03-09 02:39:27 +01:00 committed by GitHub
parent baeb251174
commit 7fdc048153
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 184 additions and 33 deletions

View File

@ -233,21 +233,21 @@ func (b *Indexer) Delete(_ context.Context, repoID int64) error {
// Search searches for files in the specified repo. // Search searches for files in the specified repo.
// Returns the matching file-paths // Returns the matching file-paths
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var ( var (
indexerQuery query.Query indexerQuery query.Query
keywordQuery query.Query keywordQuery query.Query
) )
if isMatch { if isFuzzy {
prefixQuery := bleve.NewPrefixQuery(keyword)
prefixQuery.FieldVal = "Content"
keywordQuery = prefixQuery
} else {
phraseQuery := bleve.NewMatchPhraseQuery(keyword) phraseQuery := bleve.NewMatchPhraseQuery(keyword)
phraseQuery.FieldVal = "Content" phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery keywordQuery = phraseQuery
} else {
prefixQuery := bleve.NewPrefixQuery(keyword)
prefixQuery.FieldVal = "Content"
keywordQuery = prefixQuery
} }
if len(repoIDs) > 0 { if len(repoIDs) > 0 {

View File

@ -281,10 +281,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
} }
// Search searches for codes and language stats by given conditions. // Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
searchType := esMultiMatchTypeBestFields searchType := esMultiMatchTypePhrasePrefix
if isMatch { if isFuzzy {
searchType = esMultiMatchTypePhrasePrefix searchType = esMultiMatchTypeBestFields
} }
kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType) kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)

View File

@ -70,7 +70,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
for _, kw := range keywords { for _, kw := range keywords {
t.Run(kw.Keyword, func(t *testing.T) { t.Run(kw.Keyword, func(t *testing.T) {
total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, false) total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, true)
assert.NoError(t, err) assert.NoError(t, err)
assert.Len(t, kw.IDs, int(total)) assert.Len(t, kw.IDs, int(total))
assert.Len(t, langs, kw.Langs) assert.Len(t, langs, kw.Langs)

View File

@ -16,7 +16,7 @@ type Indexer interface {
internal.Indexer internal.Indexer
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
Delete(ctx context.Context, repoID int64) error Delete(ctx context.Context, repoID int64) error
Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
} }
// NewDummyIndexer returns a dummy indexer // NewDummyIndexer returns a dummy indexer
@ -38,6 +38,6 @@ func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
return fmt.Errorf("indexer is not ready") return fmt.Errorf("indexer is not ready")
} }
func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) { func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
return 0, nil, nil, fmt.Errorf("indexer is not ready") return 0, nil, nil, fmt.Errorf("indexer is not ready")
} }

View File

@ -124,12 +124,13 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
} }
// PerformSearch perform a search on a repository // PerformSearch perform a search on a repository
func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int, []*Result, []*internal.SearchResultLanguages, error) { // if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
if len(keyword) == 0 { if len(keyword) == 0 {
return 0, nil, nil, nil return 0, nil, nil, nil
} }
total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isMatch) total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isFuzzy)
if err != nil { if err != nil {
return 0, nil, nil, err return 0, nil, nil, err
} }

View File

@ -25,6 +25,13 @@ func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQue
return q return q
} }
// PrefixQuery generates a match prefix query for the given prefix and field
func PrefixQuery(matchPrefix, field string) *query.PrefixQuery {
q := bleve.NewPrefixQuery(matchPrefix)
q.FieldVal = field
return q
}
// BoolFieldQuery generates a bool field query for the given value and field // BoolFieldQuery generates a bool field query for the given value and field
func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery { func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery {
q := bleve.NewBoolFieldQuery(value) q := bleve.NewBoolFieldQuery(value)

View File

@ -156,12 +156,19 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
var queries []query.Query var queries []query.Query
if options.Keyword != "" { if options.Keyword != "" {
keywordQueries := []query.Query{ if options.IsFuzzyKeyword {
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer), queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer), inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer), inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
}...))
} else {
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.PrefixQuery(options.Keyword, "title"),
inner_bleve.PrefixQuery(options.Keyword, "content"),
inner_bleve.PrefixQuery(options.Keyword, "comments"),
}...))
} }
queries = append(queries, bleve.NewDisjunctionQuery(keywordQueries...))
} }
if len(options.RepoIDs) > 0 || options.AllPublic { if len(options.RepoIDs) > 0 || options.AllPublic {

View File

@ -19,6 +19,10 @@ import (
const ( const (
issueIndexerLatestVersion = 1 issueIndexerLatestVersion = 1
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
esMultiMatchTypePhrasePrefix = "phrase_prefix"
) )
var _ internal.Indexer = &Indexer{} var _ internal.Indexer = &Indexer{}
@ -141,7 +145,13 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
query := elastic.NewBoolQuery() query := elastic.NewBoolQuery()
if options.Keyword != "" { if options.Keyword != "" {
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments"))
searchType := esMultiMatchTypePhrasePrefix
if options.IsFuzzyKeyword {
searchType = esMultiMatchTypeBestFields
}
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType))
} }
if len(options.RepoIDs) > 0 { if len(options.RepoIDs) > 0 {

View File

@ -74,6 +74,8 @@ type SearchResult struct {
type SearchOptions struct { type SearchOptions struct {
Keyword string // keyword to search Keyword string // keyword to search
IsFuzzyKeyword bool // if false the levenshtein distance is 0
RepoIDs []int64 // repository IDs which the issues belong to RepoIDs []int64 // repository IDs which the issues belong to
AllPublic bool // if include all public repositories AllPublic bool // if include all public repositories

View File

@ -5,6 +5,7 @@ package meilisearch
import ( import (
"context" "context"
"errors"
"strconv" "strconv"
"strings" "strings"
@ -16,12 +17,15 @@ import (
) )
const ( const (
issueIndexerLatestVersion = 2 issueIndexerLatestVersion = 3
// TODO: make this configurable if necessary // TODO: make this configurable if necessary
maxTotalHits = 10000 maxTotalHits = 10000
) )
// ErrMalformedResponse is never expected as we initialize the indexer ourself and so define the types.
var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content")
var _ internal.Indexer = &Indexer{} var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface // Indexer implements Indexer interface
@ -47,6 +51,9 @@ func NewIndexer(url, apiKey, indexerName string) *Indexer {
}, },
DisplayedAttributes: []string{ DisplayedAttributes: []string{
"id", "id",
"title",
"content",
"comments",
}, },
FilterableAttributes: []string{ FilterableAttributes: []string{
"repo_id", "repo_id",
@ -221,11 +228,9 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
return nil, err return nil, err
} }
hits := make([]internal.Match, 0, len(searchRes.Hits)) hits, err := nonFuzzyWorkaround(searchRes, options.Keyword, options.IsFuzzyKeyword)
for _, hit := range searchRes.Hits { if err != nil {
hits = append(hits, internal.Match{ return nil, err
ID: int64(hit.(map[string]any)["id"].(float64)),
})
} }
return &internal.SearchResult{ return &internal.SearchResult{
@ -241,3 +246,77 @@ func parseSortBy(sortBy internal.SortBy) string {
} }
return field + ":asc" return field + ":asc"
} }
// nonFuzzyWorkaround is needed as meilisearch does not have an exact search
// and you can only change "typo tolerance" per index. So we have to post-filter the results
// https://www.meilisearch.com/docs/learn/configuration/typo_tolerance#configuring-typo-tolerance
// TODO: remove once https://github.com/orgs/meilisearch/discussions/377 is addressed
func nonFuzzyWorkaround(searchRes *meilisearch.SearchResponse, keyword string, isFuzzy bool) ([]internal.Match, error) {
hits := make([]internal.Match, 0, len(searchRes.Hits))
for _, hit := range searchRes.Hits {
hit, ok := hit.(map[string]any)
if !ok {
return nil, ErrMalformedResponse
}
if !isFuzzy {
keyword = strings.ToLower(keyword)
// declare a anon func to check if the title, content or at least one comment contains the keyword
found, err := func() (bool, error) {
// check if title match first
title, ok := hit["title"].(string)
if !ok {
return false, ErrMalformedResponse
} else if strings.Contains(strings.ToLower(title), keyword) {
return true, nil
}
// check if content has a match
content, ok := hit["content"].(string)
if !ok {
return false, ErrMalformedResponse
} else if strings.Contains(strings.ToLower(content), keyword) {
return true, nil
}
// now check for each comment if one has a match
// so we first try to cast and skip if there are no comments
comments, ok := hit["comments"].([]any)
if !ok {
return false, ErrMalformedResponse
} else if len(comments) == 0 {
return false, nil
}
// now we iterate over all and report as soon as we detect one match
for i := range comments {
comment, ok := comments[i].(string)
if !ok {
return false, ErrMalformedResponse
}
if strings.Contains(strings.ToLower(comment), keyword) {
return true, nil
}
}
// we got no match
return false, nil
}()
if err != nil {
return nil, err
} else if !found {
continue
}
}
issueID, ok := hit["id"].(float64)
if !ok {
return nil, ErrMalformedResponse
}
hits = append(hits, internal.Match{
ID: int64(issueID),
})
}
return hits, nil
}

View File

@ -10,7 +10,11 @@ import (
"testing" "testing"
"time" "time"
"code.gitea.io/gitea/modules/indexer/issues/internal"
"code.gitea.io/gitea/modules/indexer/issues/internal/tests" "code.gitea.io/gitea/modules/indexer/issues/internal/tests"
"github.com/meilisearch/meilisearch-go"
"github.com/stretchr/testify/assert"
) )
func TestMeilisearchIndexer(t *testing.T) { func TestMeilisearchIndexer(t *testing.T) {
@ -48,3 +52,44 @@ func TestMeilisearchIndexer(t *testing.T) {
tests.TestIndexer(t, indexer) tests.TestIndexer(t, indexer)
} }
func TestNonFuzzyWorkaround(t *testing.T) {
// get unexpected return
_, err := nonFuzzyWorkaround(&meilisearch.SearchResponse{
Hits: []any{"aa", "bb", "cc", "dd"},
}, "bowling", false)
assert.ErrorIs(t, err, ErrMalformedResponse)
validResponse := &meilisearch.SearchResponse{
Hits: []any{
map[string]any{
"id": float64(11),
"title": "a title",
"content": "issue body with no match",
"comments": []any{"hey whats up?", "I'm currently bowling", "nice"},
},
map[string]any{
"id": float64(22),
"title": "Bowling as title",
"content": "",
"comments": []any{},
},
map[string]any{
"id": float64(33),
"title": "Bowl-ing as fuzzy match",
"content": "",
"comments": []any{},
},
},
}
// nonFuzzy
hits, err := nonFuzzyWorkaround(validResponse, "bowling", false)
assert.NoError(t, err)
assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}}, hits)
// fuzzy
hits, err = nonFuzzyWorkaround(validResponse, "bowling", true)
assert.NoError(t, err)
assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}, {ID: 33}}, hits)
}

View File

@ -35,7 +35,7 @@ func Code(ctx *context.Context) {
keyword := ctx.FormTrim("q") keyword := ctx.FormTrim("q")
queryType := ctx.FormTrim("t") queryType := ctx.FormTrim("t")
isMatch := queryType == "match" isFuzzy := queryType != "match"
ctx.Data["Keyword"] = keyword ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language ctx.Data["Language"] = language
@ -77,7 +77,7 @@ func Code(ctx *context.Context) {
) )
if (len(repoIDs) > 0) || isAdmin { if (len(repoIDs) > 0) || isAdmin {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch) total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
if err != nil { if err != nil {
if code_indexer.IsAvailable(ctx) { if code_indexer.IsAvailable(ctx) {
ctx.ServerError("SearchResults", err) ctx.ServerError("SearchResults", err)

View File

@ -25,7 +25,7 @@ func Search(ctx *context.Context) {
keyword := ctx.FormTrim("q") keyword := ctx.FormTrim("q")
queryType := ctx.FormTrim("t") queryType := ctx.FormTrim("t")
isMatch := queryType == "match" isFuzzy := queryType != "match"
ctx.Data["Keyword"] = keyword ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language ctx.Data["Language"] = language
@ -43,7 +43,7 @@ func Search(ctx *context.Context) {
} }
total, searchResults, searchResultLanguages, err := code_indexer.PerformSearch(ctx, []int64{ctx.Repo.Repository.ID}, total, searchResults, searchResultLanguages, err := code_indexer.PerformSearch(ctx, []int64{ctx.Repo.Repository.ID},
language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch) language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
if err != nil { if err != nil {
if code_indexer.IsAvailable(ctx) { if code_indexer.IsAvailable(ctx) {
ctx.ServerError("SearchResults", err) ctx.ServerError("SearchResults", err)

View File

@ -40,7 +40,7 @@ func CodeSearch(ctx *context.Context) {
keyword := ctx.FormTrim("q") keyword := ctx.FormTrim("q")
queryType := ctx.FormTrim("t") queryType := ctx.FormTrim("t")
isMatch := queryType == "match" isFuzzy := queryType != "match"
ctx.Data["Keyword"] = keyword ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language ctx.Data["Language"] = language
@ -75,7 +75,7 @@ func CodeSearch(ctx *context.Context) {
) )
if len(repoIDs) > 0 { if len(repoIDs) > 0 {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch) total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
if err != nil { if err != nil {
if code_indexer.IsAvailable(ctx) { if code_indexer.IsAvailable(ctx) {
ctx.ServerError("SearchResults", err) ctx.ServerError("SearchResults", err)