Allow code search by filename (#32210)
Some checks are pending
release-nightly / nightly-binary (push) Waiting to run
release-nightly / nightly-docker-rootful (push) Waiting to run
release-nightly / nightly-docker-rootless (push) Waiting to run

This is a large and complex PR, so let me explain in detail its changes.

First, I had to create new index mappings for Bleve and ElasticSerach as
the current ones do not support search by filename. This requires Gitea
to recreate the code search indexes (I do not know if this is a breaking
change, but I feel it deserves a heads-up).

I've used [this
approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html)
to model the filename index. It allows us to efficiently search for both
the full path and the name of a file. Bleve, however, does not support
this out-of-box, so I had to code a brand new [token
filter](https://blevesearch.com/docs/Token-Filters/) to generate the
search terms.

I also did an overhaul in the `indexer_test.go` file. It now asserts the
order of the expected results (this is important since matches based on
the name of a file are more relevant than those based on its content).
I've added new test scenarios that deal with searching by filename. They
use a new repo included in the Gitea fixture.

The screenshot below depicts how Gitea shows the search results. It
shows results based on content in the same way as the current version
does. In matches based on the filename, the first seven lines of the
file contents are shown (BTW, this is how GitHub does it).


![image](https://github.com/user-attachments/assets/9d938d86-1a8d-4f89-8644-1921a473e858)

Resolves #32096

---------

Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
This commit is contained in:
Bruno Sofiato 2024-10-11 20:35:04 -03:00 committed by GitHub
parent 0fe5e2b08c
commit 900ac62251
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
38 changed files with 721 additions and 50 deletions

View File

@ -712,3 +712,24 @@
type: 3 type: 3
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}" config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
created_unix: 946684810 created_unix: 946684810
-
id: 108
repo_id: 62
type: 1
config: "{}"
created_unix: 946684810
-
id: 109
repo_id: 62
type: 2
config: "{\"EnableTimetracker\":true,\"AllowOnlyContributorsToTrackTime\":true}"
created_unix: 946684810
-
id: 110
repo_id: 62
type: 3
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
created_unix: 946684810

View File

@ -1768,3 +1768,34 @@
size: 0 size: 0
is_fsck_enabled: true is_fsck_enabled: true
close_issues_via_commit_in_any_branch: false close_issues_via_commit_in_any_branch: false
-
id: 62
owner_id: 42
owner_name: org42
lower_name: search-by-path
name: search-by-path
default_branch: master
num_watches: 0
num_stars: 0
num_forks: 0
num_issues: 0
num_closed_issues: 0
num_pulls: 0
num_closed_pulls: 0
num_milestones: 0
num_closed_milestones: 0
num_projects: 0
num_closed_projects: 0
is_private: false
is_empty: false
is_archived: false
is_mirror: false
status: 0
is_fork: false
fork_id: 0
is_template: false
template_id: 0
size: 0
is_fsck_enabled: true
close_issues_via_commit_in_any_branch: false

View File

@ -1517,3 +1517,40 @@
repo_admin_change_team_access: false repo_admin_change_team_access: false
theme: "" theme: ""
keep_activity_private: false keep_activity_private: false
-
id: 42
lower_name: org42
name: org42
full_name: Org42
email: org42@example.com
keep_email_private: false
email_notifications_preference: onmention
passwd: ZogKvWdyEx:password
passwd_hash_algo: dummy
must_change_password: false
login_source: 0
login_name: org42
type: 1
salt: ZogKvWdyEx
max_repo_creation: -1
is_active: false
is_admin: false
is_restricted: false
allow_git_hook: false
allow_import_local: false
allow_create_organization: true
prohibit_login: false
avatar: avatar42
avatar_email: org42@example.com
use_custom_avatar: false
num_followers: 0
num_following: 0
num_stars: 0
num_repos: 1
num_teams: 0
num_members: 0
visibility: 0
repo_admin_change_team_access: false
theme: ""
keep_activity_private: false

View File

@ -138,12 +138,12 @@ func getTestCases() []struct {
{ {
name: "AllPublic/PublicRepositoriesOfUserIncludingCollaborative", name: "AllPublic/PublicRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, AllPublic: true, Template: optional.Some(false)}, opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, AllPublic: true, Template: optional.Some(false)},
count: 33, count: 34,
}, },
{ {
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborative", name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, Private: true, AllPublic: true, AllLimited: true, Template: optional.Some(false)}, opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, Private: true, AllPublic: true, AllLimited: true, Template: optional.Some(false)},
count: 38, count: 39,
}, },
{ {
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborativeByName", name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborativeByName",
@ -158,7 +158,7 @@ func getTestCases() []struct {
{ {
name: "AllPublic/PublicRepositoriesOfOrganization", name: "AllPublic/PublicRepositoriesOfOrganization",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 17, AllPublic: true, Collaborate: optional.Some(false), Template: optional.Some(false)}, opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 17, AllPublic: true, Collaborate: optional.Some(false), Template: optional.Some(false)},
count: 33, count: 34,
}, },
{ {
name: "AllTemplates", name: "AllTemplates",

View File

@ -92,7 +92,10 @@ func TestSearchUsers(t *testing.T) {
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 4, PageSize: 2}}, testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 4, PageSize: 2}},
[]int64{26, 41}) []int64{26, 41})
testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 5, PageSize: 2}}, testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
[]int64{42})
testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 6, PageSize: 2}},
[]int64{}) []int64{})
// test users // test users

View File

@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo" "code.gitea.io/gitea/modules/gitrepo"
path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
"code.gitea.io/gitea/modules/indexer/code/internal" "code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal" indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve" inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
@ -53,6 +54,7 @@ type RepoIndexerData struct {
RepoID int64 RepoID int64
CommitID string CommitID string
Content string Content string
Filename string
Language string Language string
UpdatedAt time.Time UpdatedAt time.Time
} }
@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string {
const ( const (
repoIndexerAnalyzer = "repoIndexerAnalyzer" repoIndexerAnalyzer = "repoIndexerAnalyzer"
filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
filenameIndexerTokenizer = "filenameIndexerTokenizer"
repoIndexerDocType = "repoIndexerDocType" repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 6 repoIndexerLatestVersion = 7
) )
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer // generateBleveIndexMapping generates a bleve index mapping for the repo indexer
@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
textFieldMapping.IncludeInAll = false textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping) docMapping.AddFieldMappingsAt("Content", textFieldMapping)
fileNamedMapping := bleve.NewTextFieldMapping()
fileNamedMapping.IncludeInAll = false
fileNamedMapping.Analyzer = filenameIndexerAnalyzer
docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
termFieldMapping := bleve.NewTextFieldMapping() termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name termFieldMapping.Analyzer = analyzer_keyword.Name
@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping) docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
mapping := bleve.NewIndexMapping() mapping := bleve.NewIndexMapping()
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil { if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
}); err != nil { }); err != nil {
return nil, err return nil, err
} }
if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
}); err != nil {
return nil, err
}
mapping.DefaultAnalyzer = repoIndexerAnalyzer mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping) mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping()) mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
return batch.Index(id, &RepoIndexerData{ return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID, RepoID: repo.ID,
CommitID: commitSha, CommitID: commitSha,
Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents), Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(), UpdatedAt: time.Now().UTC(),
@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
keywordQuery query.Query keywordQuery query.Query
) )
phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword) pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
phraseQuery.FieldVal = "Content" pathQuery.FieldVal = "Filename"
phraseQuery.Analyzer = repoIndexerAnalyzer pathQuery.SetBoost(10)
keywordQuery = phraseQuery
contentQuery := bleve.NewMatchQuery(opts.Keyword)
contentQuery.FieldVal = "Content"
if opts.IsKeywordFuzzy { if opts.IsKeywordFuzzy {
phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
} }
keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
if len(opts.RepoIDs) > 0 { if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs)) repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs { for _, repoID := range opts.RepoIDs {
@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
from, pageSize := opts.GetSkipTake() from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false) searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true searchRequest.IncludeLocations = true
if len(opts.Language) == 0 { if len(opts.Language) == 0 {
@ -307,6 +333,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
endIndex = locationEnd endIndex = locationEnd
} }
} }
if len(hit.Locations["Filename"]) > 0 {
startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
}
language := hit.Fields["Language"].(string) language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil { if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {

View File

@ -0,0 +1,101 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package path
import (
"slices"
"strings"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const (
Name = "gitea/path"
)
type TokenFilter struct{}
func NewTokenFilter() *TokenFilter {
return &TokenFilter{}
}
func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewTokenFilter(), nil
}
func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
if len(input) == 1 {
// if there is only one token, we dont need to generate the reversed chain
return generatePathTokens(input, false)
}
normal := generatePathTokens(input, false)
reversed := generatePathTokens(input, true)
return append(normal, reversed...)
}
// Generates path tokens from the input tokens.
// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component
// in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
//
// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
// to efficiently search for filenames without supplying the fullpath.
func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
terms := make([]string, 0, len(input))
longestTerm := 0
if reversed {
slices.Reverse(input)
}
for i := 0; i < len(input); i++ {
var sb strings.Builder
sb.WriteString(string(input[0].Term))
for j := 1; j < i; j++ {
sb.WriteString("/")
sb.WriteString(string(input[j].Term))
}
term := sb.String()
if longestTerm < len(term) {
longestTerm = len(term)
}
terms = append(terms, term)
}
output := make(analysis.TokenStream, 0, len(terms))
for _, term := range terms {
var start, end int
if reversed {
start = 0
end = len(term)
} else {
start = longestTerm - len(term)
end = longestTerm
}
token := analysis.Token{
Position: 1,
Start: start,
End: end,
Type: analysis.AlphaNumeric,
Term: []byte(term),
}
output = append(output, &token)
}
return output
}
func init() {
registry.RegisterTokenFilter(Name, TokenFilterConstructor)
}

View File

@ -0,0 +1,76 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package path
import (
"fmt"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/stretchr/testify/assert"
)
type Scenario struct {
Input string
Tokens []string
}
func TestTokenFilter(t *testing.T) {
scenarios := []struct {
Input string
Terms []string
}{
{
Input: "Dockerfile",
Terms: []string{"Dockerfile"},
},
{
Input: "Dockerfile.rootless",
Terms: []string{"Dockerfile.rootless"},
},
{
Input: "a/b/c/Dockerfile.rootless",
Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"},
},
{
Input: "",
Terms: []string{},
},
}
for _, scenario := range scenarios {
t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) {
terms := extractTerms(scenario.Input)
assert.Len(t, terms, len(scenario.Terms))
for _, term := range terms {
assert.Contains(t, scenario.Terms, term)
}
})
}
}
func extractTerms(input string) []string {
tokens := tokenize(input)
filteredTokens := filter(tokens)
terms := make([]string, 0, len(filteredTokens))
for _, token := range filteredTokens {
terms = append(terms, string(token.Term))
}
return terms
}
func filter(input analysis.TokenStream) analysis.TokenStream {
filter := NewTokenFilter()
return filter.Filter(input)
}
func tokenize(input string) analysis.TokenStream {
tokenizer := unicode.NewUnicodeTokenizer()
return tokenizer.Tokenize([]byte(input))
}

View File

@ -30,7 +30,7 @@ import (
) )
const ( const (
esRepoIndexerLatestVersion = 1 esRepoIndexerLatestVersion = 2
// multi-match-types, currently only 2 types are used // multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields" esMultiMatchTypeBestFields = "best_fields"
@ -57,12 +57,50 @@ func NewIndexer(url, indexerName string) *Indexer {
const ( const (
defaultMapping = `{ defaultMapping = `{
"settings": {
"analysis": {
"analyzer": {
"filename_path_analyzer": {
"tokenizer": "path_tokenizer"
},
"reversed_filename_path_analyzer": {
"tokenizer": "reversed_path_tokenizer"
}
},
"tokenizer": {
"path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/"
},
"reversed_path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/",
"reverse": true
}
}
}
},
"mappings": { "mappings": {
"properties": { "properties": {
"repo_id": { "repo_id": {
"type": "long", "type": "long",
"index": true "index": true
}, },
"filename": {
"type": "text",
"term_vector": "with_positions_offsets",
"index": true,
"fields": {
"path": {
"type": "text",
"analyzer": "reversed_filename_path_analyzer"
},
"path_reversed": {
"type": "text",
"analyzer": "filename_path_analyzer"
}
}
},
"content": { "content": {
"type": "text", "type": "text",
"term_vector": "with_positions_offsets", "term_vector": "with_positions_offsets",
@ -136,6 +174,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
Id(id). Id(id).
Doc(map[string]any{ Doc(map[string]any{
"repo_id": repo.ID, "repo_id": repo.ID,
"filename": update.Filename,
"content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), "content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
"commit_id": sha, "commit_id": sha,
"language": analyze.GetCodeLanguage(update.Filename, fileContents), "language": analyze.GetCodeLanguage(update.Filename, fileContents),
@ -231,11 +270,11 @@ func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
return err return err
} }
// indexPos find words positions for start and the following end on content. It will // contentMatchIndexPos find words positions for start and the following end on content. It will
// return the beginning position of the first start and the ending position of the // return the beginning position of the first start and the ending position of the
// first end following the start string. // first end following the start string.
// If not found any of the positions, it will return -1, -1. // If not found any of the positions, it will return -1, -1.
func indexPos(content, start, end string) (int, int) { func contentMatchIndexPos(content, start, end string) (int, int) {
startIdx := strings.Index(content, start) startIdx := strings.Index(content, start)
if startIdx < 0 { if startIdx < 0 {
return -1, -1 return -1, -1
@ -244,22 +283,29 @@ func indexPos(content, start, end string) (int, int) {
if endIdx < 0 { if endIdx < 0 {
return -1, -1 return -1, -1
} }
return startIdx, startIdx + len(start) + endIdx + len(end) return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // remove the length <em></em> since we give Content the original data
} }
func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
hits := make([]*internal.SearchResult, 0, pageSize) hits := make([]*internal.SearchResult, 0, pageSize)
for _, hit := range searchResult.Hits.Hits { for _, hit := range searchResult.Hits.Hits {
repoID, fileName := internal.ParseIndexerID(hit.Id)
res := make(map[string]any)
if err := json.Unmarshal(hit.Source, &res); err != nil {
return 0, nil, nil, err
}
// FIXME: There is no way to get the position the keyword on the content currently on the same request. // FIXME: There is no way to get the position the keyword on the content currently on the same request.
// So we get it from content, this may made the query slower. See // So we get it from content, this may made the query slower. See
// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291 // https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
var startIndex, endIndex int var startIndex, endIndex int
c, ok := hit.Highlight["content"] if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
if ok && len(c) > 0 { startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
} else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
// FIXME: Since the highlighting content will include <em> and </em> for the keywords, // FIXME: Since the highlighting content will include <em> and </em> for the keywords,
// now we should find the positions. But how to avoid html content which contains the // now we should find the positions. But how to avoid html content which contains the
// <em> and </em> tags? If elastic search has handled that? // <em> and </em> tags? If elastic search has handled that?
startIndex, endIndex = indexPos(c[0], "<em>", "</em>") startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
if startIndex == -1 { if startIndex == -1 {
panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0])) panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
} }
@ -267,12 +313,6 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
panic(fmt.Sprintf("2===%#v", hit.Highlight)) panic(fmt.Sprintf("2===%#v", hit.Highlight))
} }
repoID, fileName := internal.ParseIndexerID(hit.Id)
res := make(map[string]any)
if err := json.Unmarshal(hit.Source, &res); err != nil {
return 0, nil, nil, err
}
language := res["language"].(string) language := res["language"].(string)
hits = append(hits, &internal.SearchResult{ hits = append(hits, &internal.SearchResult{
@ -283,7 +323,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)), UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
Language: language, Language: language,
StartIndex: startIndex, StartIndex: startIndex,
EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data EndIndex: endIndex,
Color: enry.GetColor(language), Color: enry.GetColor(language),
}) })
} }
@ -315,7 +355,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
searchType = esMultiMatchTypeBestFields searchType = esMultiMatchTypeBestFields
} }
kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType) kwQuery := elastic.NewBoolQuery().Should(
elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType),
elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
)
query := elastic.NewBoolQuery() query := elastic.NewBoolQuery()
query = query.Must(kwQuery) query = query.Must(kwQuery)
if len(opts.RepoIDs) > 0 { if len(opts.RepoIDs) > 0 {
@ -341,6 +384,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
Highlight( Highlight(
elastic.NewHighlight(). elastic.NewHighlight().
Field("content"). Field("content").
Field("filename").
NumOfFragments(0). // return all highting content on fragments NumOfFragments(0). // return all highting content on fragments
HighlighterType("fvh"), HighlighterType("fvh"),
). ).
@ -373,6 +417,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
Highlight( Highlight(
elastic.NewHighlight(). elastic.NewHighlight().
Field("content"). Field("content").
Field("filename").
NumOfFragments(0). // return all highting content on fragments NumOfFragments(0). // return all highting content on fragments
HighlighterType("fvh"), HighlighterType("fvh"),
). ).

View File

@ -10,7 +10,7 @@ import (
) )
func TestIndexPos(t *testing.T) { func TestIndexPos(t *testing.T) {
startIdx, endIdx := indexPos("test index start and end", "start", "end") startIdx, endIdx := contentMatchIndexPos("test index start and end", "start", "end")
assert.EqualValues(t, 11, startIdx) assert.EqualValues(t, 11, startIdx)
assert.EqualValues(t, 24, endIdx) assert.EqualValues(t, 15, endIdx)
} }

View File

@ -6,6 +6,7 @@ package code
import ( import (
"context" "context"
"os" "os"
"slices"
"testing" "testing"
"code.gitea.io/gitea/models/db" "code.gitea.io/gitea/models/db"
@ -20,53 +21,166 @@ import (
_ "code.gitea.io/gitea/models/activities" _ "code.gitea.io/gitea/models/activities"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
_ "github.com/mattn/go-sqlite3"
) )
type codeSearchResult struct {
Filename string
Content string
}
func TestMain(m *testing.M) { func TestMain(m *testing.M) {
unittest.MainTest(m) unittest.MainTest(m)
} }
func testIndexer(name string, t *testing.T, indexer internal.Indexer) { func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
t.Run(name, func(t *testing.T) { t.Run(name, func(t *testing.T) {
var repoID int64 = 1 assert.NoError(t, setupRepositoryIndexes(git.DefaultContext, indexer))
err := index(git.DefaultContext, indexer, repoID)
assert.NoError(t, err)
keywords := []struct { keywords := []struct {
RepoIDs []int64 RepoIDs []int64
Keyword string Keyword string
IDs []int64
Langs int Langs int
Results []codeSearchResult
}{ }{
// Search for an exact match on the contents of a file
// This scenario yields a single result (the file README.md on the repo '1')
{ {
RepoIDs: nil, RepoIDs: nil,
Keyword: "Description", Keyword: "Description",
IDs: []int64{repoID},
Langs: 1, Langs: 1,
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
}, },
},
},
// Search for an exact match on the contents of a file within the repo '2'.
// This scenario yields no results
{ {
RepoIDs: []int64{2}, RepoIDs: []int64{2},
Keyword: "Description", Keyword: "Description",
IDs: []int64{},
Langs: 0, Langs: 0,
}, },
// Search for an exact match on the contents of a file
// This scenario yields a single result (the file README.md on the repo '1')
{ {
RepoIDs: nil, RepoIDs: nil,
Keyword: "repo1", Keyword: "repo1",
IDs: []int64{repoID},
Langs: 1, Langs: 1,
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
}, },
},
},
// Search for an exact match on the contents of a file within the repo '2'.
// This scenario yields no results
{ {
RepoIDs: []int64{2}, RepoIDs: []int64{2},
Keyword: "repo1", Keyword: "repo1",
IDs: []int64{},
Langs: 0, Langs: 0,
}, },
// Search for a non-existing term.
// This scenario yields no results
{ {
RepoIDs: nil, RepoIDs: nil,
Keyword: "non-exist", Keyword: "non-exist",
IDs: []int64{},
Langs: 0, Langs: 0,
}, },
// Search for an exact match on the contents of a file within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "pineaple",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for an exact match on the filename within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "avocado.md",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for an partial match on the filename within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "avo",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on both the contents and the filenames within the repo '62'.
// This scenario yields two results: the first result is baed on the file (cucumber.md) while the second is based on the contents
{
RepoIDs: []int64{62},
Keyword: "cucumber",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "cucumber.md",
Content: "Salad is good for your health",
},
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on the filenames within the repo '62'.
// This scenario yields two results (both are based on filename, the first one is an exact match)
{
RepoIDs: []int64{62},
Keyword: "ham",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "ham.md",
Content: "This is also not cheese",
},
{
Filename: "potato/ham.md",
Content: "This is not cheese",
},
},
},
// Search for matches on the contents of files within the repo '62'.
// This scenario yields two results (both are based on contents, the first one is an exact match where as the second is a 'fuzzy' one)
{
RepoIDs: []int64{62},
Keyword: "This is not cheese",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "potato/ham.md",
Content: "This is not cheese",
},
{
Filename: "ham.md",
Content: "This is also not cheese",
},
},
},
} }
for _, kw := range keywords { for _, kw := range keywords {
@ -81,19 +195,37 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
IsKeywordFuzzy: true, IsKeywordFuzzy: true,
}) })
assert.NoError(t, err) assert.NoError(t, err)
assert.Len(t, kw.IDs, int(total))
assert.Len(t, langs, kw.Langs) assert.Len(t, langs, kw.Langs)
ids := make([]int64, 0, len(res)) hits := make([]codeSearchResult, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID) if total > 0 {
assert.EqualValues(t, "# repo1\n\nDescription for repo1", hit.Content) assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results")
} }
assert.EqualValues(t, kw.IDs, ids)
for _, hit := range res {
hits = append(hits, codeSearchResult{
Filename: hit.Filename,
Content: hit.Content,
}) })
} }
assert.NoError(t, indexer.Delete(context.Background(), repoID)) lastIndex := -1
for _, expected := range kw.Results {
index := slices.Index(hits, expected)
if index == -1 {
assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits)
} else if lastIndex > index {
assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits)
} else {
lastIndex = index
}
}
})
}
assert.NoError(t, tearDownRepositoryIndexes(indexer))
}) })
} }
@ -136,3 +268,25 @@ func TestESIndexAndSearch(t *testing.T) {
testIndexer("elastic_search", t, indexer) testIndexer("elastic_search", t, indexer)
} }
func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
for _, repoID := range repositoriesToSearch() {
if err := index(ctx, indexer, repoID); err != nil {
return err
}
}
return nil
}
func tearDownRepositoryIndexes(indexer internal.Indexer) error {
for _, repoID := range repositoriesToSearch() {
if err := indexer.Delete(context.Background(), repoID); err != nil {
return err
}
}
return nil
}
func repositoriesToSearch() []int64 {
return []int64{1, 62}
}

View File

@ -10,6 +10,10 @@ import (
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
) )
const (
filenameMatchNumberOfLines = 7 // Copied from github search
)
func FilenameIndexerID(repoID int64, filename string) string { func FilenameIndexerID(repoID int64, filename string) string {
return internal.Base36(repoID) + "_" + filename return internal.Base36(repoID) + "_" + filename
} }
@ -30,3 +34,17 @@ func FilenameOfIndexerID(indexerID string) string {
} }
return indexerID[index+1:] return indexerID[index+1:]
} }
// Given the contents of file, returns the boundaries of its first seven lines.
func FilenameMatchIndexPos(content string) (int, int) {
count := 1
for i, c := range content {
if c == '\n' {
count++
if count == filenameMatchNumberOfLines {
return 0, i
}
}
}
return 0, len(content)
}

View File

@ -11,10 +11,15 @@ import (
"code.gitea.io/gitea/modules/util" "code.gitea.io/gitea/modules/util"
"github.com/blevesearch/bleve/v2" "github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/index/upsidedown" "github.com/blevesearch/bleve/v2/index/upsidedown"
"github.com/ethantkoenig/rupture" "github.com/ethantkoenig/rupture"
) )
const (
maxFuzziness = 2
)
// openIndexer open the index at the specified path, checking for metadata // openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or // updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil) // re-created), returns (nil, nil)
@ -48,7 +53,27 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
return index, 0, nil return index, 0, nil
} }
// This method test the GuessFuzzinessByKeyword method. The fuzziness is based on the levenshtein distance and determines how many chars
// may be different on two string and they still be considered equivalent.
// Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
func GuessFuzzinessByKeyword(s string) int { func GuessFuzzinessByKeyword(s string) int {
tokenizer := unicode.NewUnicodeTokenizer()
tokens := tokenizer.Tokenize([]byte(s))
if len(tokens) > 0 {
fuzziness := maxFuzziness
for _, token := range tokens {
fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term)))
}
return fuzziness
}
return 0
}
func guessFuzzinessByKeyword(s string) int {
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2 // according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword // magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot. // BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
@ -57,5 +82,5 @@ func GuessFuzzinessByKeyword(s string) int {
return 0 return 0
} }
} }
return min(2, len(s)/4) return min(maxFuzziness, len(s)/4)
} }

View File

@ -0,0 +1,45 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
)
func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
scenarios := []struct {
Input string
Fuzziness int // See util.go for the definition of fuzziness in this particular context
}{
{
Input: "",
Fuzziness: 0,
},
{
Input: "Avocado",
Fuzziness: 1,
},
{
Input: "Geschwindigkeit",
Fuzziness: 2,
},
{
Input: "non-exist",
Fuzziness: 0,
},
{
Input: "갃갃갃",
Fuzziness: 0,
},
}
for _, scenario := range scenarios {
t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
})
}
}

View File

@ -0,0 +1 @@
ref: refs/heads/master

View File

@ -0,0 +1,4 @@
[core]
repositoryformatversion = 0
filemode = true
bare = true

View File

@ -0,0 +1,8 @@
This repository will be used to test code search. The snippet below shows its directory structure
.
├── avocado.md
├── cucumber.md
├── ham.md
└── potato
└── ham.md

View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
ORI_DIR=`pwd`
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$ORI_DIR"
for i in `ls "$SHELL_FOLDER/post-receive.d"`; do
sh "$SHELL_FOLDER/post-receive.d/$i"
done

View File

@ -0,0 +1,2 @@
#!/usr/bin/env bash
"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" post-receive

View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
ORI_DIR=`pwd`
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$ORI_DIR"
for i in `ls "$SHELL_FOLDER/pre-receive.d"`; do
sh "$SHELL_FOLDER/pre-receive.d/$i"
done

View File

@ -0,0 +1,2 @@
#!/usr/bin/env bash
"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" pre-receive

View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
ORI_DIR=`pwd`
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$ORI_DIR"
for i in `ls "$SHELL_FOLDER/proc-receive.d"`; do
sh "$SHELL_FOLDER/proc-receive.d/$i"
done

View File

@ -0,0 +1,2 @@
#!/usr/bin/env bash
"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" proc-receive

View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
ORI_DIR=`pwd`
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$ORI_DIR"
for i in `ls "$SHELL_FOLDER/update.d"`; do
sh "$SHELL_FOLDER/update.d/$i" $1 $2 $3
done

View File

@ -0,0 +1,2 @@
#!/usr/bin/env bash
"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" update $1 $2 $3

View File

@ -0,0 +1,6 @@
# git ls-files --others --exclude-from=.git/info/exclude
# Lines that start with '#' are comments.
# For a project mostly in C, the following would be a good set of
# exclude patterns (uncomment them if you want to use them):
# *.[oa]
# *~

View File

@ -0,0 +1,13 @@
90c1019714259b24fb81711d4416ac0f18667dfa refs/heads/DefaultBranch
985f0301dba5e7b34be866819cd15ad3d8f508ee refs/heads/branch2
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1
78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check
3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master
62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update
4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check
3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits
4a357436d925b5c974181ff12a994538ddc5a269 refs/pull/2/head
5f22f7d0d95d614d25a5b68592adb345a4b5c7fd refs/pull/3/head
62fb502a7172d4453f0322a2cc85bddffa57f07a refs/pull/5/head
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/tags/v1.1

View File

@ -0,0 +1,2 @@
P pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack

View File

@ -0,0 +1,14 @@
# pack-refs with: peeled fully-peeled sorted
90c1019714259b24fb81711d4416ac0f18667dfa refs/heads/DefaultBranch
985f0301dba5e7b34be866819cd15ad3d8f508ee refs/heads/branch2
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1
78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check
3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master
62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update
4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check
3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits
4a357436d925b5c974181ff12a994538ddc5a269 refs/pull/2/head
5f22f7d0d95d614d25a5b68592adb345a4b5c7fd refs/pull/3/head
62fb502a7172d4453f0322a2cc85bddffa57f07a refs/pull/5/head
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/tags/v1.1

View File

@ -177,7 +177,7 @@ func TestAPIGetAll(t *testing.T) {
var apiOrgList []*api.Organization var apiOrgList []*api.Organization
DecodeJSON(t, resp, &apiOrgList) DecodeJSON(t, resp, &apiOrgList)
assert.Len(t, apiOrgList, 12) assert.Len(t, apiOrgList, 13)
assert.Equal(t, "Limited Org 36", apiOrgList[1].FullName) assert.Equal(t, "Limited Org 36", apiOrgList[1].FullName)
assert.Equal(t, "limited", apiOrgList[1].Visibility) assert.Equal(t, "limited", apiOrgList[1].Visibility)
@ -186,7 +186,7 @@ func TestAPIGetAll(t *testing.T) {
resp = MakeRequest(t, req, http.StatusOK) resp = MakeRequest(t, req, http.StatusOK)
DecodeJSON(t, resp, &apiOrgList) DecodeJSON(t, resp, &apiOrgList)
assert.Len(t, apiOrgList, 8) assert.Len(t, apiOrgList, 9)
assert.Equal(t, "org 17", apiOrgList[0].FullName) assert.Equal(t, "org 17", apiOrgList[0].FullName)
assert.Equal(t, "public", apiOrgList[0].Visibility) assert.Equal(t, "public", apiOrgList[0].Visibility)
} }

View File

@ -94,9 +94,9 @@ func TestAPISearchRepo(t *testing.T) {
}{ }{
{ {
name: "RepositoriesMax50", requestURL: "/api/v1/repos/search?limit=50&private=false", expectedResults: expectedResults{ name: "RepositoriesMax50", requestURL: "/api/v1/repos/search?limit=50&private=false", expectedResults: expectedResults{
nil: {count: 35}, nil: {count: 36},
user: {count: 35}, user: {count: 36},
user2: {count: 35}, user2: {count: 36},
}, },
}, },
{ {