Reduce repo indexer disk usage (#3452)
This commit is contained in:
parent
283e87d814
commit
a89592d4ab
14 changed files with 704 additions and 97 deletions
|
@ -6,12 +6,17 @@ package indexer
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
|
||||
"github.com/blevesearch/bleve"
|
||||
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
|
||||
"github.com/blevesearch/bleve/index/upsidedown"
|
||||
"github.com/blevesearch/bleve/mapping"
|
||||
"github.com/blevesearch/bleve/search/query"
|
||||
"github.com/ethantkoenig/rupture"
|
||||
)
|
||||
|
||||
// indexerID a bleve-compatible unique identifier for an integer id
|
||||
|
@ -53,40 +58,36 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
|
|||
})
|
||||
}
|
||||
|
||||
// Update represents an update to an indexer
|
||||
type Update interface {
|
||||
addToBatch(batch *bleve.Batch) error
|
||||
}
|
||||
|
||||
const maxBatchSize = 16
|
||||
|
||||
// Batch batch of indexer updates that automatically flushes once it
|
||||
// reaches a certain size
|
||||
type Batch struct {
|
||||
batch *bleve.Batch
|
||||
index bleve.Index
|
||||
}
|
||||
|
||||
// Add add update to batch, possibly flushing
|
||||
func (batch *Batch) Add(update Update) error {
|
||||
if err := update.addToBatch(batch.batch); err != nil {
|
||||
return err
|
||||
// openIndexer open the index at the specified path, checking for metadata
|
||||
// updates and bleve version updates. If index needs to be created (or
|
||||
// re-created), returns (nil, nil)
|
||||
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
|
||||
_, err := os.Stat(setting.Indexer.IssuePath)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return batch.flushIfFull()
|
||||
}
|
||||
|
||||
func (batch *Batch) flushIfFull() error {
|
||||
if batch.batch.Size() >= maxBatchSize {
|
||||
return batch.Flush()
|
||||
metadata, err := rupture.ReadIndexMetadata(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if metadata.Version < latestVersion {
|
||||
// the indexer is using a previous version, so we should delete it and
|
||||
// re-populate
|
||||
return nil, os.RemoveAll(path)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Flush manually flush the batch, regardless of its size
|
||||
func (batch *Batch) Flush() error {
|
||||
if err := batch.index.Batch(batch.batch); err != nil {
|
||||
return err
|
||||
index, err := bleve.Open(path)
|
||||
if err != nil && err == upsidedown.IncompatibleVersion {
|
||||
// the indexer was built with a previous version of bleve, so we should
|
||||
// delete it and re-populate
|
||||
return nil, os.RemoveAll(path)
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
batch.batch.Reset()
|
||||
return nil
|
||||
return index, nil
|
||||
}
|
||||
|
|
|
@ -5,8 +5,6 @@
|
|||
package indexer
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
|
||||
|
@ -14,12 +12,19 @@ import (
|
|||
"github.com/blevesearch/bleve/analysis/analyzer/custom"
|
||||
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||
"github.com/blevesearch/bleve/index/upsidedown"
|
||||
"github.com/ethantkoenig/rupture"
|
||||
)
|
||||
|
||||
// issueIndexer (thread-safe) index for searching issues
|
||||
var issueIndexer bleve.Index
|
||||
|
||||
const (
	// issueIndexerAnalyzer is the name of the analyzer used as the issue
	// index's default analyzer.
	issueIndexerAnalyzer = "issueIndexer"
	// issueIndexerDocType is the document type reported by
	// IssueIndexerData.Type and registered in the index mapping.
	issueIndexerDocType = "issueIndexerDocType"

	// issueIndexerLatestVersion is the metadata version handed to
	// openIndexer; an on-disk index with a lower version is deleted so that
	// it gets rebuilt.
	issueIndexerLatestVersion = 1
)
|
||||
|
||||
// IssueIndexerData data stored in the issue indexer
|
||||
type IssueIndexerData struct {
|
||||
RepoID int64
|
||||
|
@ -28,35 +33,33 @@ type IssueIndexerData struct {
|
|||
Comments []string
|
||||
}
|
||||
|
||||
// Type returns the document type, for bleve's mapping.Classifier interface.
|
||||
func (i *IssueIndexerData) Type() string {
|
||||
return issueIndexerDocType
|
||||
}
|
||||
|
||||
// IssueIndexerUpdate an update to the issue indexer
|
||||
type IssueIndexerUpdate struct {
|
||||
IssueID int64
|
||||
Data *IssueIndexerData
|
||||
}
|
||||
|
||||
func (update IssueIndexerUpdate) addToBatch(batch *bleve.Batch) error {
|
||||
return batch.Index(indexerID(update.IssueID), update.Data)
|
||||
// AddToFlushingBatch adds the update to the given flushing batch.
|
||||
func (i IssueIndexerUpdate) AddToFlushingBatch(batch rupture.FlushingBatch) error {
|
||||
return batch.Index(indexerID(i.IssueID), i.Data)
|
||||
}
|
||||
|
||||
const issueIndexerAnalyzer = "issueIndexer"
|
||||
|
||||
// InitIssueIndexer initialize issue indexer
|
||||
func InitIssueIndexer(populateIndexer func() error) {
|
||||
_, err := os.Stat(setting.Indexer.IssuePath)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
var err error
|
||||
issueIndexer, err = openIndexer(setting.Indexer.IssuePath, issueIndexerLatestVersion)
|
||||
if err != nil {
|
||||
log.Fatal(4, "InitIssueIndexer: %v", err)
|
||||
} else if err == nil {
|
||||
issueIndexer, err = bleve.Open(setting.Indexer.IssuePath)
|
||||
if err == nil {
|
||||
return
|
||||
} else if err != upsidedown.IncompatibleVersion {
|
||||
log.Fatal(4, "InitIssueIndexer, open index: %v", err)
|
||||
}
|
||||
log.Warn("Incompatible bleve version, deleting and recreating issue indexer")
|
||||
if err = os.RemoveAll(setting.Indexer.IssuePath); err != nil {
|
||||
log.Fatal(4, "InitIssueIndexer: remove index, %v", err)
|
||||
}
|
||||
}
|
||||
if issueIndexer != nil {
|
||||
return
|
||||
}
|
||||
|
||||
if err = createIssueIndexer(); err != nil {
|
||||
log.Fatal(4, "InitIssuesIndexer: create index, %v", err)
|
||||
}
|
||||
|
@ -70,9 +73,13 @@ func createIssueIndexer() error {
|
|||
mapping := bleve.NewIndexMapping()
|
||||
docMapping := bleve.NewDocumentMapping()
|
||||
|
||||
docMapping.AddFieldMappingsAt("RepoID", bleve.NewNumericFieldMapping())
|
||||
numericFieldMapping := bleve.NewNumericFieldMapping()
|
||||
numericFieldMapping.IncludeInAll = false
|
||||
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
|
||||
|
||||
textFieldMapping := bleve.NewTextFieldMapping()
|
||||
textFieldMapping.Store = false
|
||||
textFieldMapping.IncludeInAll = false
|
||||
docMapping.AddFieldMappingsAt("Title", textFieldMapping)
|
||||
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
|
||||
docMapping.AddFieldMappingsAt("Comments", textFieldMapping)
|
||||
|
@ -89,7 +96,8 @@ func createIssueIndexer() error {
|
|||
}
|
||||
|
||||
mapping.DefaultAnalyzer = issueIndexerAnalyzer
|
||||
mapping.AddDocumentMapping("issues", docMapping)
|
||||
mapping.AddDocumentMapping(issueIndexerDocType, docMapping)
|
||||
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
|
||||
|
||||
var err error
|
||||
issueIndexer, err = bleve.New(setting.Indexer.IssuePath, mapping)
|
||||
|
@ -97,11 +105,8 @@ func createIssueIndexer() error {
|
|||
}
|
||||
|
||||
// IssueIndexerBatch batch to add updates to
|
||||
func IssueIndexerBatch() *Batch {
|
||||
return &Batch{
|
||||
batch: issueIndexer.NewBatch(),
|
||||
index: issueIndexer,
|
||||
}
|
||||
func IssueIndexerBatch() rupture.FlushingBatch {
|
||||
return rupture.NewFlushingBatch(issueIndexer, maxBatchSize)
|
||||
}
|
||||
|
||||
// SearchIssuesByKeyword searches for issues by given conditions.
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
package indexer
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
|
@ -15,10 +14,17 @@ import (
|
|||
"github.com/blevesearch/bleve/analysis/analyzer/custom"
|
||||
"github.com/blevesearch/bleve/analysis/token/camelcase"
|
||||
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/analysis/token/unique"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||
"github.com/ethantkoenig/rupture"
|
||||
)
|
||||
|
||||
const (
	// repoIndexerAnalyzer is the name of the custom analyzer registered for
	// the repo index and used as its default analyzer.
	repoIndexerAnalyzer = "repoIndexerAnalyzer"
	// repoIndexerDocType is the document type reported by
	// RepoIndexerData.Type and registered in the index mapping.
	repoIndexerDocType = "repoIndexerDocType"

	// repoIndexerLatestVersion is the metadata version handed to
	// openIndexer; an on-disk index with a lower version is deleted so that
	// it gets rebuilt.
	repoIndexerLatestVersion = 1
)
|
||||
|
||||
// repoIndexer (thread-safe) index for repository contents
|
||||
var repoIndexer bleve.Index
|
||||
|
@ -40,6 +46,11 @@ type RepoIndexerData struct {
|
|||
Content string
|
||||
}
|
||||
|
||||
// Type returns the document type, for bleve's mapping.Classifier interface.
|
||||
func (d *RepoIndexerData) Type() string {
|
||||
return repoIndexerDocType
|
||||
}
|
||||
|
||||
// RepoIndexerUpdate an update to the repo indexer
|
||||
type RepoIndexerUpdate struct {
|
||||
Filepath string
|
||||
|
@ -47,13 +58,14 @@ type RepoIndexerUpdate struct {
|
|||
Data *RepoIndexerData
|
||||
}
|
||||
|
||||
func (update RepoIndexerUpdate) addToBatch(batch *bleve.Batch) error {
|
||||
// AddToFlushingBatch adds the update to the given flushing batch.
|
||||
func (update RepoIndexerUpdate) AddToFlushingBatch(batch rupture.FlushingBatch) error {
|
||||
id := filenameIndexerID(update.Data.RepoID, update.Filepath)
|
||||
switch update.Op {
|
||||
case RepoIndexerOpUpdate:
|
||||
return batch.Index(id, update.Data)
|
||||
case RepoIndexerOpDelete:
|
||||
batch.Delete(id)
|
||||
return batch.Delete(id)
|
||||
default:
|
||||
log.Error(4, "Unrecognized repo indexer op: %d", update.Op)
|
||||
}
|
||||
|
@ -62,48 +74,50 @@ func (update RepoIndexerUpdate) addToBatch(batch *bleve.Batch) error {
|
|||
|
||||
// InitRepoIndexer initialize repo indexer
|
||||
func InitRepoIndexer(populateIndexer func() error) {
|
||||
_, err := os.Stat(setting.Indexer.RepoPath)
|
||||
var err error
|
||||
repoIndexer, err = openIndexer(setting.Indexer.RepoPath, repoIndexerLatestVersion)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
if err = createRepoIndexer(); err != nil {
|
||||
log.Fatal(4, "CreateRepoIndexer: %v", err)
|
||||
}
|
||||
if err = populateIndexer(); err != nil {
|
||||
log.Fatal(4, "PopulateRepoIndex: %v", err)
|
||||
}
|
||||
} else {
|
||||
log.Fatal(4, "InitRepoIndexer: %v", err)
|
||||
}
|
||||
} else {
|
||||
repoIndexer, err = bleve.Open(setting.Indexer.RepoPath)
|
||||
if err != nil {
|
||||
log.Fatal(4, "InitRepoIndexer, open index: %v", err)
|
||||
}
|
||||
log.Fatal(4, "InitRepoIndexer: %v", err)
|
||||
}
|
||||
if repoIndexer != nil {
|
||||
return
|
||||
}
|
||||
|
||||
if err = createRepoIndexer(); err != nil {
|
||||
log.Fatal(4, "CreateRepoIndexer: %v", err)
|
||||
}
|
||||
if err = populateIndexer(); err != nil {
|
||||
log.Fatal(4, "PopulateRepoIndex: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// createRepoIndexer create a repo indexer if one does not already exist
|
||||
func createRepoIndexer() error {
|
||||
var err error
|
||||
docMapping := bleve.NewDocumentMapping()
|
||||
docMapping.AddFieldMappingsAt("RepoID", bleve.NewNumericFieldMapping())
|
||||
numericFieldMapping := bleve.NewNumericFieldMapping()
|
||||
numericFieldMapping.IncludeInAll = false
|
||||
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
|
||||
|
||||
textFieldMapping := bleve.NewTextFieldMapping()
|
||||
textFieldMapping.IncludeInAll = false
|
||||
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
|
||||
|
||||
mapping := bleve.NewIndexMapping()
|
||||
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
|
||||
if err = addUnicodeNormalizeTokenFilter(mapping); err != nil {
|
||||
return err
|
||||
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
|
||||
} else if err = mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
|
||||
"type": custom.Name,
|
||||
"char_filters": []string{},
|
||||
"tokenizer": unicode.Name,
|
||||
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
|
||||
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name, unique.Name},
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
mapping.DefaultAnalyzer = repoIndexerAnalyzer
|
||||
mapping.AddDocumentMapping("repo", docMapping)
|
||||
var err error
|
||||
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
|
||||
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
|
||||
|
||||
repoIndexer, err = bleve.New(setting.Indexer.RepoPath, mapping)
|
||||
return err
|
||||
}
|
||||
|
@ -121,11 +135,8 @@ func filenameOfIndexerID(indexerID string) string {
|
|||
}
|
||||
|
||||
// RepoIndexerBatch batch to add updates to
|
||||
func RepoIndexerBatch() *Batch {
|
||||
return &Batch{
|
||||
batch: repoIndexer.NewBatch(),
|
||||
index: repoIndexer,
|
||||
}
|
||||
func RepoIndexerBatch() rupture.FlushingBatch {
|
||||
return rupture.NewFlushingBatch(repoIndexer, maxBatchSize)
|
||||
}
|
||||
|
||||
// DeleteRepoFromIndexer delete all of a repo's files from indexer
|
||||
|
@ -138,8 +149,7 @@ func DeleteRepoFromIndexer(repoID int64) error {
|
|||
}
|
||||
batch := RepoIndexerBatch()
|
||||
for _, hit := range result.Hits {
|
||||
batch.batch.Delete(hit.ID)
|
||||
if err = batch.flushIfFull(); err != nil {
|
||||
if err = batch.Delete(hit.ID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue