Fix bug on elastic search (#12811)

* Fix bug on elastic search

* Add more comments for elastic search result startIndex and endIndex

* refactor indexPos

* refactor indexPos

* Fix bug
This commit is contained in:
Lunny Xiao 2020-09-12 20:31:52 +08:00 committed by GitHub
parent ae528d8321
commit 8ce10fb6e1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 15 deletions

View file

@ -90,6 +90,7 @@ const (
}, },
"content": { "content": {
"type": "text", "type": "text",
"term_vector": "with_positions_offsets",
"index": true "index": true
}, },
"commit_id": { "commit_id": {
@ -251,6 +252,22 @@ func (b *ElasticSearchIndexer) Delete(repoID int64) error {
return err return err
} }
// indexPos locates the first occurrence of start in content and then the first
// occurrence of end that follows it. It returns the byte offset at which start
// begins and the byte offset just past the matched end.
// If either start or a following end cannot be found, it returns -1, -1.
func indexPos(content, start, end string) (int, int) {
	begin := strings.Index(content, start)
	if begin == -1 {
		return -1, -1
	}

	// Only the text after the start marker is searched for the end marker.
	tailFrom := begin + len(start)
	rel := strings.Index(content[tailFrom:], end)
	if rel == -1 {
		return -1, -1
	}

	// rel is relative to the tail, so shift it back into content coordinates.
	return begin, tailFrom + rel + len(end)
}
func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) { func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
hits := make([]*SearchResult, 0, pageSize) hits := make([]*SearchResult, 0, pageSize)
for _, hit := range searchResult.Hits.Hits { for _, hit := range searchResult.Hits.Hits {
@ -260,18 +277,12 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
var startIndex, endIndex int = -1, -1 var startIndex, endIndex int = -1, -1
c, ok := hit.Highlight["content"] c, ok := hit.Highlight["content"]
if ok && len(c) > 0 { if ok && len(c) > 0 {
var subStr = make([]rune, 0, len(kw)) // FIXME: Since the highlighted content will include <em> and </em> around the keywords,
startIndex = strings.IndexFunc(c[0], func(r rune) bool { // we now have to find their positions. But how do we avoid HTML content that itself contains the
if len(subStr) >= len(kw) { // <em> and </em> tags? Does Elasticsearch already handle that?
subStr = subStr[1:] startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
} if startIndex == -1 {
subStr = append(subStr, r) panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
return strings.EqualFold(kw, string(subStr))
})
if startIndex > -1 {
endIndex = startIndex + len(kw)
} else {
panic(fmt.Sprintf("1===%#v", hit.Highlight))
} }
} else { } else {
panic(fmt.Sprintf("2===%#v", hit.Highlight)) panic(fmt.Sprintf("2===%#v", hit.Highlight))
@ -293,7 +304,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)), UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
Language: language, Language: language,
StartIndex: startIndex, StartIndex: startIndex,
EndIndex: endIndex, EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data
Color: enry.GetColor(language), Color: enry.GetColor(language),
}) })
} }
@ -347,7 +358,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string,
Index(b.indexerAliasName). Index(b.indexerAliasName).
Aggregation("language", aggregation). Aggregation("language", aggregation).
Query(query). Query(query).
Highlight(elastic.NewHighlight().Field("content")). Highlight(
elastic.NewHighlight().
Field("content").
NumOfFragments(0). // return the entire highlighted content instead of fragments
HighlighterType("fvh"),
).
Sort("repo_id", true). Sort("repo_id", true).
From(start).Size(pageSize). From(start).Size(pageSize).
Do(context.Background()) Do(context.Background())
@ -373,7 +389,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string,
searchResult, err := b.client.Search(). searchResult, err := b.client.Search().
Index(b.indexerAliasName). Index(b.indexerAliasName).
Query(query). Query(query).
Highlight(elastic.NewHighlight().Field("content")). Highlight(
elastic.NewHighlight().
Field("content").
NumOfFragments(0). // return the entire highlighted content instead of fragments
HighlighterType("fvh"),
).
Sort("repo_id", true). Sort("repo_id", true).
From(start).Size(pageSize). From(start).Size(pageSize).
Do(context.Background()) Do(context.Background())

View file

@ -34,3 +34,9 @@ func TestESIndexAndSearch(t *testing.T) {
testIndexer("elastic_search", t, indexer) testIndexer("elastic_search", t, indexer)
} }
// TestIndexPos checks that indexPos returns the offset where the start marker
// begins and the offset just past the first end marker that follows it, and
// that it returns -1, -1 whenever either marker cannot be found.
func TestIndexPos(t *testing.T) {
	startIdx, endIdx := indexPos("test index start and end", "start", "end")
	assert.EqualValues(t, 11, startIdx)
	assert.EqualValues(t, 24, endIdx)

	// Highlight markers as used on the search results.
	startIdx, endIdx = indexPos("a <em>kw</em> b", "<em>", "</em>")
	assert.EqualValues(t, 2, startIdx)
	assert.EqualValues(t, 13, endIdx)

	// The start marker is missing.
	startIdx, endIdx = indexPos("no markers here", "<em>", "</em>")
	assert.EqualValues(t, -1, startIdx)
	assert.EqualValues(t, -1, endIdx)

	// The start marker is present but no end marker follows it.
	startIdx, endIdx = indexPos("<em>unterminated", "<em>", "</em>")
	assert.EqualValues(t, -1, startIdx)
	assert.EqualValues(t, -1, endIdx)

	// An end marker that only appears before the start marker must not match.
	startIdx, endIdx = indexPos("</em>x<em>y", "<em>", "</em>")
	assert.EqualValues(t, -1, startIdx)
	assert.EqualValues(t, -1, endIdx)
}