From 7ffa7f5bcef12a0cc05aff660cb5217dc7e4c886 Mon Sep 17 00:00:00 2001 From: Gusted Date: Fri, 5 Apr 2024 11:48:03 +0200 Subject: [PATCH] Refactor LFS GC functions - Remove options that currently aren't set on `GarbageCollectLFSMetaObjectsOptions` and `IterateLFSMetaObjectsForRepoOptions`. - Simplify `IterateRepositoryIDsWithLFSMetaObjects` and `IterateLFSMetaObjectsForRepo`. - `IterateLFSMetaObjectsForRepo` was previously able to get in a loop (`gc-lfs` doctor check was able to reproduce this) because the code expected that the records would be updated to not match the SQL query, but that wasn't the case. Simply enforce that only records higher than the latest `id` from the previous iteration are allowed. - For `gc-lfs` doctor check this was because `UpdatedLessRecentlyThan` option was not set, which caused that records just marked as active in the iteration weren't being filtered. - Add unit tests - Most likely a regression from 2cc3a6381cab34113fd2dc2c24ef0efc22c4336d. - The bug with `gc-lfs` was found on Codeberg. --- .../lfs_meta_object.yaml | 7 ++ models/git/lfs.go | 47 +++----- models/git/lfs_test.go | 101 ++++++++++++++++++ services/doctor/lfs.go | 1 + services/repository/lfs.go | 32 ++---- 5 files changed, 132 insertions(+), 56 deletions(-) create mode 100644 models/git/TestIterateRepositoryIDsWithLFSMetaObjects/lfs_meta_object.yaml create mode 100644 models/git/lfs_test.go diff --git a/models/git/TestIterateRepositoryIDsWithLFSMetaObjects/lfs_meta_object.yaml b/models/git/TestIterateRepositoryIDsWithLFSMetaObjects/lfs_meta_object.yaml new file mode 100644 index 000000000..fdfa66a21 --- /dev/null +++ b/models/git/TestIterateRepositoryIDsWithLFSMetaObjects/lfs_meta_object.yaml @@ -0,0 +1,7 @@ +- + + id: 1000 + oid: 9d172e5c64b4f0024b9901ec6afe9ea052f3c9b6ff9f4b07956d8c48c86fca82 + size: 25 + repository_id: 1 + created_unix: 1712309123 diff --git a/models/git/lfs.go b/models/git/lfs.go index 837dc9fd3..44b741c4c 100644 --- a/models/git/lfs.go +++ b/models/git/lfs.go @@ -337,58 +337,47 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) { func IterateRepositoryIDsWithLFSMetaObjects(ctx context.Context, f func(ctx context.Context, repoID, count int64) error) error { batchSize := setting.Database.IterateBufferSize sess := db.GetEngine(ctx) - id := int64(0) + var start int type RepositoryCount struct { RepositoryID int64 Count int64 } for { counts := make([]*RepositoryCount, 0, batchSize) - sess.Select("repository_id, COUNT(id) AS count"). + if err := sess.Select("repository_id, COUNT(id) AS count"). Table("lfs_meta_object"). - Where("repository_id > ?", id). GroupBy("repository_id"). - OrderBy("repository_id ASC") - - if err := sess.Limit(batchSize, 0).Find(&counts); err != nil { + OrderBy("repository_id ASC").Limit(batchSize, start).Find(&counts); err != nil { return err } if len(counts) == 0 { return nil } + start += len(counts) for _, count := range counts { if err := f(ctx, count.RepositoryID, count.Count); err != nil { return err } } - id = counts[len(counts)-1].RepositoryID } } // IterateLFSMetaObjectsForRepoOptions provides options for IterateLFSMetaObjectsForRepo type IterateLFSMetaObjectsForRepoOptions struct { - OlderThan timeutil.TimeStamp - UpdatedLessRecentlyThan timeutil.TimeStamp - OrderByUpdated bool - LoopFunctionAlwaysUpdates bool + OlderThan timeutil.TimeStamp + UpdatedLessRecentlyThan timeutil.TimeStamp } // IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo -func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(context.Context, *LFSMetaObject, int64) error, opts *IterateLFSMetaObjectsForRepoOptions) error { - var start int +func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(context.Context, *LFSMetaObject) error, opts *IterateLFSMetaObjectsForRepoOptions) error { batchSize := setting.Database.IterateBufferSize engine := db.GetEngine(ctx) - type CountLFSMetaObject struct { - Count int64 - LFSMetaObject `xorm:"extends"` - } - id := int64(0) for { - beans := make([]*CountLFSMetaObject, 0, batchSize) - sess := engine.Table("lfs_meta_object").Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`"). + beans := make([]*LFSMetaObject, 0, batchSize) + sess := engine.Table("lfs_meta_object").Select("`lfs_meta_object`.*"). Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid"). Where("`lfs_meta_object`.repository_id = ?", repoID) if !opts.OlderThan.IsZero() { @@ -397,25 +386,19 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont if !opts.UpdatedLessRecentlyThan.IsZero() { sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan) } - sess.GroupBy("`lfs_meta_object`.id") - if opts.OrderByUpdated { - sess.OrderBy("`lfs_meta_object`.updated_unix ASC") - } else { - sess.And("`lfs_meta_object`.id > ?", id) - sess.OrderBy("`lfs_meta_object`.id ASC") - } - if err := sess.Limit(batchSize, start).Find(&beans); err != nil { + sess.GroupBy("`lfs_meta_object`.id"). + And("`lfs_meta_object`.id > ?", id). + OrderBy("`lfs_meta_object`.id ASC") + + if err := sess.Limit(batchSize, 0).Find(&beans); err != nil { return err } if len(beans) == 0 { return nil } - if !opts.LoopFunctionAlwaysUpdates { - start += len(beans) - } for _, bean := range beans { - if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil { + if err := f(ctx, bean); err != nil { return err } } diff --git a/models/git/lfs_test.go b/models/git/lfs_test.go new file mode 100644 index 000000000..565b2e930 --- /dev/null +++ b/models/git/lfs_test.go @@ -0,0 +1,101 @@ +// Copyright 2024 The Forgejo Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package git + +import ( + "context" + "path/filepath" + "testing" + + "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/models/unittest" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" + + "github.com/stretchr/testify/assert" +) + +func TestIterateRepositoryIDsWithLFSMetaObjects(t *testing.T) { + defer unittest.OverrideFixtures( + unittest.FixturesOptions{ + Dir: filepath.Join(setting.AppWorkPath, "models/fixtures/"), + Base: setting.AppWorkPath, + Dirs: []string{"models/git/TestIterateRepositoryIDsWithLFSMetaObjects/"}, + }, + )() + assert.NoError(t, unittest.PrepareTestDatabase()) + + type repocount struct { + repoid int64 + count int64 + } + expected := []repocount{{1, 1}, {54, 4}} + + t.Run("Normal batch size", func(t *testing.T) { + defer test.MockVariableValue(&setting.Database.IterateBufferSize, 20)() + cases := []repocount{} + + err := IterateRepositoryIDsWithLFSMetaObjects(db.DefaultContext, func(ctx context.Context, repoID, count int64) error { + cases = append(cases, repocount{repoID, count}) + return nil + }) + assert.NoError(t, err) + assert.EqualValues(t, expected, cases) + }) + + t.Run("Low batch size", func(t *testing.T) { + defer test.MockVariableValue(&setting.Database.IterateBufferSize, 1)() + cases := []repocount{} + + err := IterateRepositoryIDsWithLFSMetaObjects(db.DefaultContext, func(ctx context.Context, repoID, count int64) error { + cases = append(cases, repocount{repoID, count}) + return nil + }) + assert.NoError(t, err) + assert.EqualValues(t, expected, cases) + }) +} + +func TestIterateLFSMetaObjectsForRepo(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + + expectedIDs := []int64{1, 2, 3, 4} + + t.Run("Normal batch size", func(t *testing.T) { + defer test.MockVariableValue(&setting.Database.IterateBufferSize, 20)() + actualIDs := []int64{} + + err := IterateLFSMetaObjectsForRepo(db.DefaultContext, 54, func(ctx context.Context, lo *LFSMetaObject) error { + actualIDs = append(actualIDs, lo.ID) + return nil + }, &IterateLFSMetaObjectsForRepoOptions{}) + assert.NoError(t, err) + assert.EqualValues(t, expectedIDs, actualIDs) + }) + + t.Run("Low batch size", func(t *testing.T) { + defer test.MockVariableValue(&setting.Database.IterateBufferSize, 1)() + actualIDs := []int64{} + + err := IterateLFSMetaObjectsForRepo(db.DefaultContext, 54, func(ctx context.Context, lo *LFSMetaObject) error { + actualIDs = append(actualIDs, lo.ID) + return nil + }, &IterateLFSMetaObjectsForRepoOptions{}) + assert.NoError(t, err) + assert.EqualValues(t, expectedIDs, actualIDs) + + t.Run("Batch handles updates", func(t *testing.T) { + actualIDs := []int64{} + + err := IterateLFSMetaObjectsForRepo(db.DefaultContext, 54, func(ctx context.Context, lo *LFSMetaObject) error { + actualIDs = append(actualIDs, lo.ID) + _, err := db.DeleteByID[LFSMetaObject](ctx, lo.ID) + assert.NoError(t, err) + return nil + }, &IterateLFSMetaObjectsForRepoOptions{}) + assert.NoError(t, err) + assert.EqualValues(t, expectedIDs, actualIDs) + }) + }) +} diff --git a/services/doctor/lfs.go b/services/doctor/lfs.go index 5f110b8f9..8531b7bbe 100644 --- a/services/doctor/lfs.go +++ b/services/doctor/lfs.go @@ -44,6 +44,7 @@ func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool OlderThan: time.Now().Add(-24 * time.Hour * 7), // We don't set the UpdatedLessRecentlyThan because we want to do a full GC }); err != nil { + logger.Error("Couldn't garabage collect LFS objects: %v", err) return err } diff --git a/services/repository/lfs.go b/services/repository/lfs.go index 4d48881b8..b49a4f71e 100644 --- a/services/repository/lfs.go +++ b/services/repository/lfs.go @@ -5,7 +5,6 @@ package repository import ( "context" - "errors" "fmt" "time" @@ -21,12 +20,10 @@ import ( // GarbageCollectLFSMetaObjectsOptions provides options for GarbageCollectLFSMetaObjects function type GarbageCollectLFSMetaObjectsOptions struct { - LogDetail func(format string, v ...any) - AutoFix bool - OlderThan time.Time - UpdatedLessRecentlyThan time.Time - NumberToCheckPerRepo int64 - ProportionToCheckPerRepo float64 + LogDetail func(format string, v ...any) + AutoFix bool + OlderThan time.Time + UpdatedLessRecentlyThan time.Time } // GarbageCollectLFSMetaObjects garbage collects LFS objects for all repositories @@ -49,9 +46,6 @@ func GarbageCollectLFSMetaObjects(ctx context.Context, opts GarbageCollectLFSMet return err } - if newMinimum := int64(float64(count) * opts.ProportionToCheckPerRepo); newMinimum > opts.NumberToCheckPerRepo && opts.NumberToCheckPerRepo != 0 { - opts.NumberToCheckPerRepo = newMinimum - } return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts) }) } @@ -78,13 +72,9 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R defer gitRepo.Close() store := lfs.NewContentStore() - errStop := errors.New("STOPERR") objectFormat := git.ObjectFormatFromName(repo.ObjectFormatName) - err = git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error { - if opts.NumberToCheckPerRepo > 0 && total > opts.NumberToCheckPerRepo { - return errStop - } + err = git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject) error { total++ pointerSha := git.ComputeBlobHash(objectFormat, []byte(metaObject.Pointer.StringContent())) @@ -123,16 +113,10 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R // // It is likely that a week is potentially excessive but it should definitely be enough that any // unassociated LFS object is genuinely unassociated. - OlderThan: timeutil.TimeStamp(opts.OlderThan.Unix()), - UpdatedLessRecentlyThan: timeutil.TimeStamp(opts.UpdatedLessRecentlyThan.Unix()), - OrderByUpdated: true, - LoopFunctionAlwaysUpdates: true, + OlderThan: timeutil.TimeStamp(opts.OlderThan.Unix()), + UpdatedLessRecentlyThan: timeutil.TimeStamp(opts.UpdatedLessRecentlyThan.Unix()), }) - - if err == errStop { - opts.LogDetail("Processing stopped at %d total LFSMetaObjects in %-v", total, repo) - return nil - } else if err != nil { + if err != nil { return err } return nil