From 13e4bbdbfa104a2384834b634285dce2f4dafe2e Mon Sep 17 00:00:00 2001 From: tobi <31960611+tsmethurst@users.noreply.github.com> Date: Tue, 14 Jun 2022 18:00:57 +0200 Subject: [PATCH] [chore] Duplicated media cleanup (#649) * add migration to clean up duplicated media * use /tmp/gotosocial for testrig storage path * defer remove storage tempdir * skip if not attached to status or status not found * log errors at error level * only log delete as else clause if successful * just return nil on down * reword delete logic a little bit * check if storage base path is defined * check for status id more thoroughly * don't log error if just no rows * go fmt * break statusIDLoop when found * break currentlyUsedLoop when found --- ...20220612091800_duplicated_media_cleanup.go | 164 ++++++++++++++++++ testrig/config.go | 5 +- testrig/storage.go | 3 + 3 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 internal/db/bundb/migrations/20220612091800_duplicated_media_cleanup.go diff --git a/internal/db/bundb/migrations/20220612091800_duplicated_media_cleanup.go b/internal/db/bundb/migrations/20220612091800_duplicated_media_cleanup.go new file mode 100644 index 0000000000..6139fd77dc --- /dev/null +++ b/internal/db/bundb/migrations/20220612091800_duplicated_media_cleanup.go @@ -0,0 +1,164 @@ +/* + GoToSocial + Copyright (C) 2021-2022 GoToSocial Authors admin@gotosocial.org + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package migrations + +import ( + "context" + "database/sql" + "fmt" + "path" + + "codeberg.org/gruf/go-store/kv" + "codeberg.org/gruf/go-store/storage" + "github.com/sirupsen/logrus" + "github.com/superseriousbusiness/gotosocial/internal/config" + "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" + "github.com/uptrace/bun" +) + +func init() { + deleteAttachment := func(ctx context.Context, l *logrus.Entry, a *gtsmodel.MediaAttachment, s *kv.KVStore, tx bun.Tx) { + if err := s.Delete(a.File.Path); err != nil && err != storage.ErrNotFound { + l.Errorf("error removing file %s: %s", a.File.Path, err) + } else { + l.Debugf("deleted %s", a.File.Path) + } + + if err := s.Delete(a.Thumbnail.Path); err != nil && err != storage.ErrNotFound { + l.Errorf("error removing file %s: %s", a.Thumbnail.Path, err) + } else { + l.Debugf("deleted %s", a.Thumbnail.Path) + } + + if _, err := tx.NewDelete(). + Model(a). + WherePK(). + Exec(ctx); err != nil { + l.Errorf("error deleting attachment with id %s: %s", a.ID, err) + } else { + l.Debugf("deleted attachment with id %s", a.ID) + } + } + + up := func(ctx context.Context, db *bun.DB) error { + l := logrus.WithField("migration", "20220612091800_duplicated_media_cleanup") + + storageBasePath := config.GetStorageLocalBasePath() + if storageBasePath == "" { + return fmt.Errorf("%s must be set to do storage migration", config.StorageLocalBasePathFlag()) + } + + return db.RunInTx(ctx, nil, func(ctx context.Context, tx bun.Tx) error { + s, err := kv.OpenFile(storageBasePath, &storage.DiskConfig{ + LockFile: path.Join(storageBasePath, "store.lock"), + }) + if err != nil { + return fmt.Errorf("error creating storage backend: %s", err) + } + defer s.Close() + + // step 1. 
select all media attachment remote URLs that have duplicates + var dupes int + dupedRemoteURLs := []*gtsmodel.MediaAttachment{} + if err := tx.NewSelect(). + Model(&dupedRemoteURLs). + ColumnExpr("remote_url", "count(*)"). + Where("remote_url IS NOT NULL"). + Group("remote_url"). + Having("count(*) > 1"). + Scan(ctx); err != nil { + return err + } + dupes = len(dupedRemoteURLs) + l.Infof("found %d attachments with duplicate remote URLs", dupes) + + for i, dupedRemoteURL := range dupedRemoteURLs { + if i%10 == 0 { + l.Infof("cleaning %d of %d", i, dupes) + } + + // step 2: select all media attachments associated with this url + dupedAttachments := []*gtsmodel.MediaAttachment{} + if err := tx.NewSelect(). + Model(&dupedAttachments). + Where("remote_url = ?", dupedRemoteURL.RemoteURL). + Scan(ctx); err != nil { + l.Errorf("error running same attachments query: %s", err) + continue + } + l.Debugf("found %d duplicates of attachment with remote url %s", len(dupedAttachments), dupedRemoteURL.RemoteURL) + + var statusID string + statusIDLoop: + for _, dupe := range dupedAttachments { + if dupe.StatusID != "" { + statusID = dupe.StatusID + break statusIDLoop + } + } + + if statusID == "" { + l.Debugf("%s not associated with a status, moving on", dupedRemoteURL.RemoteURL) + continue + } + l.Debugf("%s is associated with status %s", dupedRemoteURL.RemoteURL, statusID) + + // step 3: get the status that these attachments are supposedly associated with, bail if we can't get it + status := &gtsmodel.Status{} + if err := tx.NewSelect(). + Model(status). + Where("id = ?", statusID). 
+ Scan(ctx); err != nil { + if err != sql.ErrNoRows { + l.Errorf("error selecting status with id %s: %s", statusID, err) + } + continue + } + + // step 4: for each attachment, check if it's actually one that the status is currently set to use, and delete if not + for _, dupe := range dupedAttachments { + var currentlyUsed bool + currentlyUsedLoop: + for _, attachmentID := range status.AttachmentIDs { + if attachmentID == dupe.ID { + currentlyUsed = true + break currentlyUsedLoop + } + } + + if currentlyUsed { + l.Debugf("attachment with id %s is a correct current attachment, leaving it alone!", dupe.ID) + continue + } + + deleteAttachment(ctx, l, dupe, s, tx) + } + } + return nil + }) + } + + down := func(ctx context.Context, db *bun.DB) error { + return nil + } + + if err := Migrations.Register(up, down); err != nil { + panic(err) + } +} diff --git a/testrig/config.go b/testrig/config.go index e424670fa6..92d04c4531 100644 --- a/testrig/config.go +++ b/testrig/config.go @@ -19,6 +19,9 @@ package testrig import ( + "os" + "path" + "github.com/coreos/go-oidc/v3/oidc" "github.com/superseriousbusiness/gotosocial/internal/config" ) @@ -64,7 +67,7 @@ var TestDefaults = config.Configuration{ MediaRemoteCacheDays: 30, StorageBackend: "local", - StorageLocalBasePath: "/gotosocial/storage", + StorageLocalBasePath: path.Join(os.TempDir(), "gotosocial"), StatusesMaxChars: 5000, StatusesCWMaxChars: 100, diff --git a/testrig/storage.go b/testrig/storage.go index 0e91d7dbef..6662565935 100644 --- a/testrig/storage.go +++ b/testrig/storage.go @@ -21,6 +21,7 @@ package testrig import ( "fmt" "os" + "path" "codeberg.org/gruf/go-store/kv" "codeberg.org/gruf/go-store/storage" @@ -94,6 +95,8 @@ func StandardStorageSetup(s *kv.KVStore, relativePath string) { // StandardStorageTeardown deletes everything in storage so that it's clean for the next test func StandardStorageTeardown(s *kv.KVStore) { + defer os.RemoveAll(path.Join(os.TempDir(), "gotosocial")) + iter, err := s.Iterator(nil) 
if err != nil { panic(err)