Add detected file language to code search (#10256)

Move langauge detection to separate module to be more reusable

Add option to disable vendored file exclusion from file search

Allways show all language stats for search
This commit is contained in:
Lauris BH 2020-02-20 21:53:55 +02:00 committed by GitHub
parent efbd7ca39b
commit 3c45cf8494
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
20 changed files with 346 additions and 63 deletions

View file

@ -0,0 +1,36 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package analyze
import (
"path/filepath"
"github.com/src-d/enry/v2"
)
// GetCodeLanguageWithCallback detects code language based on file name and content using callback
func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, error)) string {
if language, ok := enry.GetLanguageByExtension(filename); ok {
return language
}
if language, ok := enry.GetLanguageByFilename(filename); ok {
return language
}
content, err := contentFunc()
if err != nil {
return enry.OtherLanguage
}
return enry.GetLanguage(filepath.Base(filename), content)
}
// GetCodeLanguage detects code language based on file name and content
func GetCodeLanguage(filename string, content []byte) string {
return GetCodeLanguageWithCallback(filename, func() ([]byte, error) {
return content, nil
})
}

View file

@ -9,7 +9,8 @@ import (
"io"
"io/ioutil"
"math"
"path/filepath"
"code.gitea.io/gitea/modules/analyze"
"github.com/src-d/enry/v2"
"gopkg.in/src-d/go-git.v4"
@ -51,25 +52,15 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
// TODO: Use .gitattributes file for linguist overrides
language, ok := enry.GetLanguageByExtension(f.Name)
if !ok {
if language, ok = enry.GetLanguageByFilename(f.Name); !ok {
content, err := readFile(f, fileSizeLimit)
if err != nil {
return nil
}
language = enry.GetLanguage(filepath.Base(f.Name), content)
if language == enry.OtherLanguage {
return nil
}
}
language := analyze.GetCodeLanguageWithCallback(f.Name, func() ([]byte, error) {
return readFile(f, fileSizeLimit)
})
if language == enry.OtherLanguage || language == "" {
return nil
}
if language != "" {
sizes[language] += f.Size
total += f.Size
}
sizes[language] += f.Size
total += f.Size
return nil
})

View file

@ -9,16 +9,20 @@ import (
"os"
"strconv"
"strings"
"time"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"github.com/blevesearch/bleve"
"github.com/blevesearch/bleve/analysis/analyzer/custom"
analyzer_custom "github.com/blevesearch/bleve/analysis/analyzer/custom"
analyzer_keyword "github.com/blevesearch/bleve/analysis/analyzer/keyword"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
@ -26,6 +30,7 @@ import (
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search/query"
"github.com/ethantkoenig/rupture"
"github.com/src-d/enry/v2"
)
const unicodeNormalizeName = "unicodeNormalize"
@ -86,8 +91,11 @@ func openIndexer(path string, latestVersion int) (bleve.Index, error) {
// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
RepoID int64
Content string
RepoID int64
CommitID string
Content string
Language string
UpdatedAt time.Time
}
// Type returns the document type, for bleve's mapping.Classifier interface.
@ -95,7 +103,11 @@ func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}
func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
return nil
}
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
@ -118,8 +130,11 @@ func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.Flushin
id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
Content: string(charset.ToUTF8DropErrors(fileContents)),
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}
@ -131,7 +146,7 @@ func addDelete(filename string, repo *models.Repository, batch rupture.FlushingB
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 4
repoIndexerLatestVersion = 5
)
// createRepoIndexer create a repo indexer if one does not already exist
@ -145,11 +160,21 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
timeFieldMapping := bleve.NewDateTimeFieldMapping()
timeFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
mapping := bleve.NewIndexMapping()
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
"type": custom.Name,
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, lowercase.Name},
@ -255,7 +280,7 @@ func (b *BleveIndexer) Index(repoID int64) error {
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
if err := addUpdate(update, repo, batch); err != nil {
if err := addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
@ -289,7 +314,7 @@ func (b *BleveIndexer) Delete(repoID int64) error {
// Search searches for files in the specified repo.
// Returns the matching file-paths
func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) {
func (b *BleveIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
@ -309,16 +334,35 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
indexerQuery = phraseQuery
}
// Save for reuse without language filter
facetQuery := indexerQuery
if len(language) > 0 {
languageQuery := bleve.NewMatchQuery(language)
languageQuery.FieldVal = "Language"
languageQuery.Analyzer = analyzer_keyword.Name
indexerQuery = bleve.NewConjunctionQuery(
indexerQuery,
languageQuery,
)
}
from := (page - 1) * pageSize
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "RepoID"}
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
if len(language) == 0 {
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
}
result, err := b.indexer.Search(searchRequest)
if err != nil {
return 0, nil, err
return 0, nil, nil, err
}
total := int64(result.Total)
searchResults := make([]*SearchResult, len(result.Hits))
for i, hit := range result.Hits {
var startIndex, endIndex int = -1, -1
@ -333,13 +377,47 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
endIndex = locationEnd
}
}
language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
updatedUnix = timeutil.TimeStamp(t.Unix())
}
searchResults[i] = &SearchResult{
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Filename: filenameOfIndexerID(hit.ID),
Content: hit.Fields["Content"].(string),
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Filename: filenameOfIndexerID(hit.ID),
Content: hit.Fields["Content"].(string),
CommitID: hit.Fields["CommitID"].(string),
UpdatedUnix: updatedUnix,
Language: language,
Color: enry.GetColor(language),
}
}
return int64(result.Total), searchResults, nil
searchResultLanguages := make([]*SearchResultLanguages, 0, 10)
if len(language) > 0 {
// Use separate query to go get all language counts
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
facetRequest.IncludeLocations = true
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
if result, err = b.indexer.Search(facetRequest); err != nil {
return 0, nil, nil, err
}
}
languagesFacet := result.Facets["languages"]
for _, term := range languagesFacet.Terms {
if len(term.Term) == 0 {
continue
}
searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
Language: term.Term,
Color: enry.GetColor(term.Term),
Count: term.Count,
})
}
return total, searchResults, searchResultLanguages, nil
}

View file

@ -49,27 +49,34 @@ func TestIndexAndSearch(t *testing.T) {
keywords = []struct {
Keyword string
IDs []int64
Langs int
}{
{
Keyword: "Description",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "repo1",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "non-exist",
IDs: []int64{},
Langs: 0,
},
}
)
for _, kw := range keywords {
total, res, err := idx.Search(nil, kw.Keyword, 1, 10)
total, res, langs, err := idx.Search(nil, "", kw.Keyword, 1, 10)
assert.NoError(t, err)
assert.EqualValues(t, len(kw.IDs), total)
assert.NotNil(t, langs)
assert.Len(t, langs, kw.Langs)
var ids = make([]int64, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID)

View file

@ -12,22 +12,34 @@ import (
"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
)
// SearchResult result of performing a search in a repo
type SearchResult struct {
RepoID int64
StartIndex int
EndIndex int
Filename string
Content string
RepoID int64
StartIndex int
EndIndex int
Filename string
Content string
CommitID string
UpdatedUnix timeutil.TimeStamp
Language string
Color string
}
// SearchResultLanguages result of top languages count in search results
type SearchResultLanguages struct {
Language string
Color string
Count int
}
// Indexer defines an interface to indexer issues contents
type Indexer interface {
Index(repoID int64) error
Delete(repoID int64) error
Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error)
Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error)
Close()
}

View file

@ -11,6 +11,7 @@ import (
"strings"
"code.gitea.io/gitea/modules/highlight"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/util"
)
@ -18,6 +19,10 @@ import (
type Result struct {
RepoID int64
Filename string
CommitID string
UpdatedUnix timeutil.TimeStamp
Language string
Color string
HighlightClass string
LineNumbers []int
FormattedLines gotemplate.HTML
@ -100,6 +105,10 @@ func searchResult(result *SearchResult, startIndex, endIndex int) (*Result, erro
return &Result{
RepoID: result.RepoID,
Filename: result.Filename,
CommitID: result.CommitID,
UpdatedUnix: result.UpdatedUnix,
Language: result.Language,
Color: result.Color,
HighlightClass: highlight.FileNameToHighlightClass(result.Filename),
LineNumbers: lineNumbers,
FormattedLines: gotemplate.HTML(formattedLinesBuffer.String()),
@ -107,14 +116,14 @@ func searchResult(result *SearchResult, startIndex, endIndex int) (*Result, erro
}
// PerformSearch perform a search on a repository
func PerformSearch(repoIDs []int64, keyword string, page, pageSize int) (int, []*Result, error) {
func PerformSearch(repoIDs []int64, language, keyword string, page, pageSize int) (int, []*Result, []*SearchResultLanguages, error) {
if len(keyword) == 0 {
return 0, nil, nil
return 0, nil, nil, nil
}
total, results, err := indexer.Search(repoIDs, keyword, page, pageSize)
total, results, resultLanguages, err := indexer.Search(repoIDs, language, keyword, page, pageSize)
if err != nil {
return 0, nil, err
return 0, nil, nil, err
}
displayResults := make([]*Result, len(results))
@ -123,8 +132,8 @@ func PerformSearch(repoIDs []int64, keyword string, page, pageSize int) (int, []
startIndex, endIndex := indices(result.Content, result.StartIndex, result.EndIndex)
displayResults[i], err = searchResult(result, startIndex, endIndex)
if err != nil {
return 0, nil, err
return 0, nil, nil, err
}
}
return int(total), displayResults, nil
return int(total), displayResults, resultLanguages, nil
}

View file

@ -71,12 +71,12 @@ func (w *wrappedIndexer) Delete(repoID int64) error {
return indexer.Delete(repoID)
}
func (w *wrappedIndexer) Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) {
func (w *wrappedIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
indexer, err := w.get()
if err != nil {
return 0, nil, err
return 0, nil, nil, err
}
return indexer.Search(repoIDs, keyword, page, pageSize)
return indexer.Search(repoIDs, language, keyword, page, pageSize)
}

View file

@ -41,6 +41,7 @@ var (
MaxIndexerFileSize int64
IncludePatterns []glob.Glob
ExcludePatterns []glob.Glob
ExcludeVendored bool
}{
IssueType: "bleve",
IssuePath: "indexers/issues.bleve",
@ -52,6 +53,7 @@ var (
IssueQueueBatchNumber: 20,
MaxIndexerFileSize: 1024 * 1024,
ExcludeVendored: true,
}
)
@ -77,6 +79,7 @@ func newIndexerService() {
}
Indexer.IncludePatterns = IndexerGlobFromString(sec.Key("REPO_INDEXER_INCLUDE").MustString(""))
Indexer.ExcludePatterns = IndexerGlobFromString(sec.Key("REPO_INDEXER_EXCLUDE").MustString(""))
Indexer.ExcludeVendored = sec.Key("REPO_INDEXER_EXCLUDE_VENDORED").MustBool(true)
Indexer.UpdateQueueLength = sec.Key("UPDATE_BUFFER_LEN").MustInt(20)
Indexer.MaxIndexerFileSize = sec.Key("MAX_FILE_SIZE").MustInt64(1024 * 1024)
Indexer.StartupTimeout = sec.Key("STARTUP_TIMEOUT").MustDuration(30 * time.Second)