-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added a new command to the CCN operator CLI to run a garbage collector on local storage. The new `gc run` command lists all the files that are not linked to any message or permanent pin and deletes them. Using the --verbose option, the command will print more details on the files it will preserve and delete. The --dry-run option allows to run the GC without actually deleting any file.
- Loading branch information
1 parent
fdf844f
commit 15b9a1e
Showing
4 changed files
with
171 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
""" | ||
This migration checks all the files stored in local storage (=GridFS) and compares them to the list | ||
of messages already on the node. The files that are not linked to any message are scheduled for | ||
deletion. | ||
""" | ||
import asyncio | ||
import datetime as dt | ||
from typing import Any, Dict, FrozenSet, List | ||
from typing import cast | ||
|
||
import pytz | ||
import typer | ||
from aleph_message.models import MessageType | ||
from configmanager import Config | ||
|
||
import aleph.model | ||
from aleph.ccn_cli.cli_config import CliConfig | ||
from aleph.config import get_defaults | ||
from aleph.model import init_db_globals | ||
from aleph.model.filepin import PermanentPin | ||
from aleph.model.hashes import delete_value as delete_gridfs_file | ||
from aleph.model.messages import Message | ||
|
||
gc_ns = typer.Typer() | ||
|
||
|
||
# Get all the messages that potentially store data in local storage: | ||
# * AGGREGATEs with item_type=="storage" | ||
# * POSTs with item_type=="storage" | ||
# * STOREs with content.item_type=="storage" | ||
async def get_hashes( | ||
msg_type: MessageType, item_type_field: str, item_hash_field: str | ||
) -> FrozenSet[str]: | ||
def rgetitem(dictionary: Any, fields: List[str]) -> Any: | ||
value = dictionary[fields[0]] | ||
if len(fields) > 1: | ||
return rgetitem(value, fields[1:]) | ||
return value | ||
|
||
return frozenset( | ||
[ | ||
rgetitem(msg, item_hash_field.split(".")) | ||
async for msg in Message.collection.find( | ||
{"type": msg_type, item_type_field: "storage"}, | ||
{item_hash_field: 1}, | ||
batch_size=1000, | ||
) | ||
] | ||
) | ||
|
||
|
||
def print_files_to_preserve(files_to_preserve: Dict[str, FrozenSet[str]]) -> None: | ||
typer.echo("The following files will be preserved:") | ||
for file_type, files in files_to_preserve.items(): | ||
typer.echo(f"* {len(files)} {file_type}") | ||
|
||
|
||
async def list_files_to_preserve( | ||
gridfs_files_dict: Dict[str, Dict], | ||
temporary_files_ttl: int, | ||
) -> Dict[str, FrozenSet[str]]: | ||
files_to_preserve_dict = {} | ||
|
||
# Preserve any file that was uploaded less than an hour ago | ||
current_datetime = pytz.utc.localize(dt.datetime.utcnow()) | ||
files_to_preserve_dict["temporary files"] = frozenset( | ||
[ | ||
file["filename"] | ||
for file in gridfs_files_dict.values() | ||
if file["uploadDate"] | ||
> current_datetime - dt.timedelta(seconds=temporary_files_ttl) | ||
] | ||
) | ||
|
||
# Get all the messages that potentially store data in local storage: | ||
# * AGGREGATEs with item_type=="storage" | ||
# * POSTs with item_type=="storage" | ||
# * STOREs with content.item_type=="storage" | ||
files_to_preserve_dict["aggregates"] = await get_hashes( | ||
MessageType.aggregate, "item_type", "item_hash" | ||
) | ||
files_to_preserve_dict["posts"] = await get_hashes( | ||
MessageType.post, "item_type", "item_hash" | ||
) | ||
files_to_preserve_dict["stores"] = await get_hashes( | ||
MessageType.store, "content.item_type", "content.item_hash" | ||
) | ||
|
||
# We also keep permanent pins, even if they are also stored on IPFS | ||
files_to_preserve_dict["file pins"] = frozenset( | ||
[ | ||
pin["multihash"] | ||
async for pin in PermanentPin.collection.find({}, {"multihash": 1}) | ||
] | ||
) | ||
|
||
return files_to_preserve_dict | ||
|
||
|
||
async def run(ctx: typer.Context, dry_run: bool): | ||
config = Config(schema=get_defaults()) | ||
cli_config = cast(CliConfig, ctx.obj) | ||
config.yaml.load(str(cli_config.config_file_path)) | ||
|
||
init_db_globals(config=config) | ||
if aleph.model.db is None: # for mypy | ||
raise ValueError("DB not initialized as expected.") | ||
|
||
# Get a set of all the files currently in GridFS | ||
gridfs_files_dict = { | ||
file["filename"]: file | ||
async for file in aleph.model.db["fs.files"].find( | ||
projection={"_id": 0, "filename": 1, "length": 1, "uploadDate": 1}, | ||
batch_size=1000, | ||
) | ||
} | ||
gridfs_files = frozenset(gridfs_files_dict.keys()) | ||
|
||
typer.echo(f"Found {len(gridfs_files_dict)} files in local storage.") | ||
|
||
files_to_preserve_dict = await list_files_to_preserve( | ||
gridfs_files_dict=gridfs_files_dict, | ||
temporary_files_ttl=config.storage.temporary_files_ttl.value, | ||
) | ||
files_to_preserve = frozenset().union(*files_to_preserve_dict.values()) | ||
files_to_delete = gridfs_files - files_to_preserve | ||
|
||
if cli_config.verbose: | ||
print_files_to_preserve(files_to_preserve_dict) | ||
|
||
restored_memory = sum( | ||
gridfs_files_dict[filename]["length"] for filename in files_to_delete | ||
) | ||
typer.echo( | ||
f"{len(files_to_delete)} will be deleted, totaling {restored_memory} bytes." | ||
) | ||
|
||
if dry_run: | ||
if cli_config.verbose: | ||
typer.echo("The following files will be deleted:") | ||
for file_to_delete in files_to_delete: | ||
typer.echo(f"* {file_to_delete}") | ||
|
||
else: | ||
for file_to_delete in files_to_delete: | ||
typer.echo(f"Deleting {file_to_delete}...") | ||
await delete_gridfs_file(file_to_delete) | ||
|
||
typer.echo("Done.") | ||
|
||
|
||
@gc_ns.command(name="run") | ||
def run_gc( | ||
ctx: typer.Context, | ||
dry_run: bool = typer.Option( | ||
False, help="If set, display files to delete without deleting them." | ||
), | ||
): | ||
asyncio.run(run(ctx, dry_run)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters