Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add validation task for supported filetype IDs #2

Merged
merged 4 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions marda_registry/utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import glob
import pathlib
from typing import Type
from typing import Any, Type

import mongomock as pymongo
import yaml


def load_registry_collection(
model: Type, database: pymongo.Database | None = None, validate: bool = True
) -> int:
) -> list[Any]:
"""Loads any entries of the specified model types from the corresponding data directory,
optionally validating and inserting them into the given database.

Expand All @@ -18,17 +18,22 @@ def load_registry_collection(
validate: Whether to validate the entries before inserting them into the database.

Returns:
The number of entries ingested for that type.
The entries ingested for that type.

"""
name = model.__name__.lower() + "s"
entries = glob.glob(str(pathlib.Path(__file__).parent / "data" / name / "*.yml"))
for entry in entries:
filenames = glob.glob(str(pathlib.Path(__file__).parent / "data" / name / "*.yml"))
entries = []
for entry in filenames:
with open(entry, "r") as f:
data = yaml.safe_load(f)

if validate:
model(**data)
entries.append(model(**data))
else:
entries.append(data)

if database:
database[name].insert_one(data)

return len(entries)
return entries
23 changes: 22 additions & 1 deletion tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,35 @@ def validate_entries(_):
from marda_registry.utils import load_registry_collection

counts = {}
errors = []
for type_ in (FileType, Extractor):
counts[type_] = load_registry_collection(
entries = load_registry_collection(
type_,
database=None,
validate=True,
)
counts[type_] = len(entries)
print(f"Loaded {counts[type_]} {type_.__name__} entries")

if type_ is Extractor:
filetype_ids = set(
d.stem
for d in Path(__file__).parent.glob(
"./marda_registry/data/filetypes/*.yml"
)
)

for extractor in entries:
for filetype in extractor.supported_filetypes:
if filetype.id not in filetype_ids:
breakpoint()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentional?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

of course...

errors.append(
f"Extractor {extractor.name=} has invalid filetype {filetype.id=}. Should be one of {filetype_ids=}"
)

if errors:
raise RuntimeError("\n".join(errors))

print("Done!")


Expand Down
Loading