Reduce repo indexer disk usage (#3452)

This commit is contained in:
Ethan Koenig 2018-02-05 10:29:17 -08:00 committed by Lauris BH
parent 283e87d814
commit a89592d4ab
14 changed files with 704 additions and 97 deletions

View file

@ -6,12 +6,17 @@ package indexer
import (
"fmt"
"os"
"strconv"
"code.gitea.io/gitea/modules/setting"
"github.com/blevesearch/bleve"
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/index/upsidedown"
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search/query"
"github.com/ethantkoenig/rupture"
)
// indexerID a bleve-compatible unique identifier for an integer id
@ -53,40 +58,36 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
})
}
// Update represents a single update to an indexer. An implementation adds
// itself to the supplied bleve batch and returns any error from doing so.
type Update interface {
addToBatch(batch *bleve.Batch) error
}
// maxBatchSize is the batch size at which pending updates are flushed: it is
// the threshold checked by Batch.flushIfFull and the size handed to
// rupture.NewFlushingBatch.
const maxBatchSize = 16
// Batch batch of indexer updates that automatically flushes once it
// reaches a certain size
type Batch struct {
// batch holds the operations that have not yet been applied to index.
batch *bleve.Batch
// index is the index the pending operations are applied to on flush.
index bleve.Index
}
// Add add update to batch, possibly flushing
func (batch *Batch) Add(update Update) error {
if err := update.addToBatch(batch.batch); err != nil {
return err
// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
_, err := os.Stat(setting.Indexer.IssuePath)
if err != nil && os.IsNotExist(err) {
return nil, nil
} else if err != nil {
return nil, err
}
return batch.flushIfFull()
}
func (batch *Batch) flushIfFull() error {
if batch.batch.Size() >= maxBatchSize {
return batch.Flush()
metadata, err := rupture.ReadIndexMetadata(path)
if err != nil {
return nil, err
}
if metadata.Version < latestVersion {
// the indexer is using a previous version, so we should delete it and
// re-populate
return nil, os.RemoveAll(path)
}
return nil
}
// Flush manually flush the batch, regardless of its size
func (batch *Batch) Flush() error {
if err := batch.index.Batch(batch.batch); err != nil {
return err
index, err := bleve.Open(path)
if err != nil && err == upsidedown.IncompatibleVersion {
// the indexer was built with a previous version of bleve, so we should
// delete it and re-populate
return nil, os.RemoveAll(path)
} else if err != nil {
return nil, err
}
batch.batch.Reset()
return nil
return index, nil
}

View file

@ -5,8 +5,6 @@
package indexer
import (
"os"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
@ -14,12 +12,19 @@ import (
"github.com/blevesearch/bleve/analysis/analyzer/custom"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/index/upsidedown"
"github.com/ethantkoenig/rupture"
)
// issueIndexer (thread-safe) index for searching issues
var issueIndexer bleve.Index
const (
// issueIndexerAnalyzer is the analyzer name set as the default analyzer of
// the issue index mapping.
issueIndexerAnalyzer = "issueIndexer"
// issueIndexerDocType is the document type reported by
// IssueIndexerData.Type and the name the issue document mapping is
// registered under.
issueIndexerDocType = "issueIndexerDocType"
// issueIndexerLatestVersion is the version passed to openIndexer; an
// existing index whose metadata version is lower is removed there.
issueIndexerLatestVersion = 1
)
// IssueIndexerData data stored in the issue indexer
type IssueIndexerData struct {
RepoID int64
@ -28,35 +33,33 @@ type IssueIndexerData struct {
Comments []string
}
// Type reports the document type of issue index entries, satisfying bleve's
// mapping.Classifier interface.
func (*IssueIndexerData) Type() string {
	return issueIndexerDocType
}
// IssueIndexerUpdate is an update to the issue indexer: it indexes Data under
// the document ID derived from IssueID.
type IssueIndexerUpdate struct {
// IssueID is converted with indexerID to form the document ID.
IssueID int64
// Data is the document indexed under that ID.
Data *IssueIndexerData
}
func (update IssueIndexerUpdate) addToBatch(batch *bleve.Batch) error {
return batch.Index(indexerID(update.IssueID), update.Data)
// AddToFlushingBatch adds the update to the given flushing batch.
func (i IssueIndexerUpdate) AddToFlushingBatch(batch rupture.FlushingBatch) error {
return batch.Index(indexerID(i.IssueID), i.Data)
}
const issueIndexerAnalyzer = "issueIndexer"
// InitIssueIndexer initialize issue indexer
func InitIssueIndexer(populateIndexer func() error) {
_, err := os.Stat(setting.Indexer.IssuePath)
if err != nil && !os.IsNotExist(err) {
var err error
issueIndexer, err = openIndexer(setting.Indexer.IssuePath, issueIndexerLatestVersion)
if err != nil {
log.Fatal(4, "InitIssueIndexer: %v", err)
} else if err == nil {
issueIndexer, err = bleve.Open(setting.Indexer.IssuePath)
if err == nil {
return
} else if err != upsidedown.IncompatibleVersion {
log.Fatal(4, "InitIssueIndexer, open index: %v", err)
}
log.Warn("Incompatible bleve version, deleting and recreating issue indexer")
if err = os.RemoveAll(setting.Indexer.IssuePath); err != nil {
log.Fatal(4, "InitIssueIndexer: remove index, %v", err)
}
}
if issueIndexer != nil {
return
}
if err = createIssueIndexer(); err != nil {
log.Fatal(4, "InitIssuesIndexer: create index, %v", err)
}
@ -70,9 +73,13 @@ func createIssueIndexer() error {
mapping := bleve.NewIndexMapping()
docMapping := bleve.NewDocumentMapping()
docMapping.AddFieldMappingsAt("RepoID", bleve.NewNumericFieldMapping())
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.Store = false
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Title", textFieldMapping)
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
docMapping.AddFieldMappingsAt("Comments", textFieldMapping)
@ -89,7 +96,8 @@ func createIssueIndexer() error {
}
mapping.DefaultAnalyzer = issueIndexerAnalyzer
mapping.AddDocumentMapping("issues", docMapping)
mapping.AddDocumentMapping(issueIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
var err error
issueIndexer, err = bleve.New(setting.Indexer.IssuePath, mapping)
@ -97,11 +105,8 @@ func createIssueIndexer() error {
}
// IssueIndexerBatch returns a new batch, bound to issueIndexer, to add
// updates to.
func IssueIndexerBatch() *Batch {
return &Batch{
batch: issueIndexer.NewBatch(),
index: issueIndexer,
}
func IssueIndexerBatch() rupture.FlushingBatch {
return rupture.NewFlushingBatch(issueIndexer, maxBatchSize)
}
// SearchIssuesByKeyword searches for issues by given conditions.

View file

@ -5,7 +5,6 @@
package indexer
import (
"os"
"strings"
"code.gitea.io/gitea/modules/log"
@ -15,10 +14,17 @@ import (
"github.com/blevesearch/bleve/analysis/analyzer/custom"
"github.com/blevesearch/bleve/analysis/token/camelcase"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/token/unique"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
"github.com/ethantkoenig/rupture"
)
const repoIndexerAnalyzer = "repoIndexerAnalyzer"
const (
// repoIndexerAnalyzer is the name the custom analyzer is registered under
// and the default analyzer of the repo index mapping.
repoIndexerAnalyzer = "repoIndexerAnalyzer"
// repoIndexerDocType is the document type reported by RepoIndexerData.Type
// and the name the repo document mapping is registered under.
repoIndexerDocType = "repoIndexerDocType"
// repoIndexerLatestVersion is the version passed to openIndexer; an
// existing index whose metadata version is lower is removed there.
repoIndexerLatestVersion = 1
)
// repoIndexer (thread-safe) index for repository contents
var repoIndexer bleve.Index
@ -40,6 +46,11 @@ type RepoIndexerData struct {
Content string
}
// Type reports the document type of repo index entries, satisfying bleve's
// mapping.Classifier interface.
func (*RepoIndexerData) Type() string {
	return repoIndexerDocType
}
// RepoIndexerUpdate an update to the repo indexer
type RepoIndexerUpdate struct {
Filepath string
@ -47,13 +58,14 @@ type RepoIndexerUpdate struct {
Data *RepoIndexerData
}
func (update RepoIndexerUpdate) addToBatch(batch *bleve.Batch) error {
// AddToFlushingBatch adds the update to the given flushing batch.
func (update RepoIndexerUpdate) AddToFlushingBatch(batch rupture.FlushingBatch) error {
id := filenameIndexerID(update.Data.RepoID, update.Filepath)
switch update.Op {
case RepoIndexerOpUpdate:
return batch.Index(id, update.Data)
case RepoIndexerOpDelete:
batch.Delete(id)
return batch.Delete(id)
default:
log.Error(4, "Unrecognized repo indexer op: %d", update.Op)
}
@ -62,48 +74,50 @@ func (update RepoIndexerUpdate) addToBatch(batch *bleve.Batch) error {
// InitRepoIndexer initialize repo indexer
func InitRepoIndexer(populateIndexer func() error) {
_, err := os.Stat(setting.Indexer.RepoPath)
var err error
repoIndexer, err = openIndexer(setting.Indexer.RepoPath, repoIndexerLatestVersion)
if err != nil {
if os.IsNotExist(err) {
if err = createRepoIndexer(); err != nil {
log.Fatal(4, "CreateRepoIndexer: %v", err)
}
if err = populateIndexer(); err != nil {
log.Fatal(4, "PopulateRepoIndex: %v", err)
}
} else {
log.Fatal(4, "InitRepoIndexer: %v", err)
}
} else {
repoIndexer, err = bleve.Open(setting.Indexer.RepoPath)
if err != nil {
log.Fatal(4, "InitRepoIndexer, open index: %v", err)
}
log.Fatal(4, "InitRepoIndexer: %v", err)
}
if repoIndexer != nil {
return
}
if err = createRepoIndexer(); err != nil {
log.Fatal(4, "CreateRepoIndexer: %v", err)
}
if err = populateIndexer(); err != nil {
log.Fatal(4, "PopulateRepoIndex: %v", err)
}
}
// createRepoIndexer create a repo indexer if one does not already exist
func createRepoIndexer() error {
var err error
docMapping := bleve.NewDocumentMapping()
docMapping.AddFieldMappingsAt("RepoID", bleve.NewNumericFieldMapping())
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
mapping := bleve.NewIndexMapping()
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
if err = addUnicodeNormalizeTokenFilter(mapping); err != nil {
return err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
} else if err = mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
"type": custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name, unique.Name},
}); err != nil {
return err
}
mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping("repo", docMapping)
var err error
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
repoIndexer, err = bleve.New(setting.Indexer.RepoPath, mapping)
return err
}
@ -121,11 +135,8 @@ func filenameOfIndexerID(indexerID string) string {
}
// RepoIndexerBatch returns a new batch, bound to repoIndexer, to add
// updates to.
func RepoIndexerBatch() *Batch {
return &Batch{
batch: repoIndexer.NewBatch(),
index: repoIndexer,
}
func RepoIndexerBatch() rupture.FlushingBatch {
return rupture.NewFlushingBatch(repoIndexer, maxBatchSize)
}
// DeleteRepoFromIndexer delete all of a repo's files from indexer
@ -138,8 +149,7 @@ func DeleteRepoFromIndexer(repoID int64) error {
}
batch := RepoIndexerBatch()
for _, hit := range result.Hits {
batch.batch.Delete(hit.ID)
if err = batch.flushIfFull(); err != nil {
if err = batch.Delete(hit.ID); err != nil {
return err
}
}