diff --git a/resources/base.imsc.json.example b/resources/base.imsc.json.example index 6afc927..2033913 100644 --- a/resources/base.imsc.json.example +++ b/resources/base.imsc.json.example @@ -1,4 +1,5 @@ { + "order": 1, "id": "c5bed39a-4379-11ef-ba5a-ffbc783163b6", "name" : "Generic metadata schema", "instrument" : "", diff --git a/resources/coda.imsc.json.example b/resources/coda.imsc.json.example index f0efeb6..da2d12f 100644 --- a/resources/coda.imsc.json.example +++ b/resources/coda.imsc.json.example @@ -1,4 +1,5 @@ { + "order": 1, "id" : "715ce7ba-3f91-11ef-932f-37a5c6fd60b1", "name" : "Coda Metadata Schema", "instrument": "coda", diff --git a/resources/config.sample.json b/resources/config.sample.json index b6718ac..be1953a 100644 --- a/resources/config.sample.json +++ b/resources/config.sample.json @@ -27,7 +27,8 @@ "hash_file_extension": "b2b", "ingestor_files_directory": "../ingestor", "message_to_file": true, - "message_file_extension": "message.json" + "message_file_extension": "message.json", + "use_full_file_path": false } }, "kafka": { diff --git a/resources/dream.imsc.json.example b/resources/dream.imsc.json.example index 37ed6d4..6a0c36d 100644 --- a/resources/dream.imsc.json.example +++ b/resources/dream.imsc.json.example @@ -1,4 +1,5 @@ { + "order": 1, "id" : "72a991ee-437a-11ef-8fd2-1f95660accb7", "name" : "dream Metadata Schema", "instrument": "dream", diff --git a/resources/loki.imsc.json.example b/resources/loki.imsc.json.example index cc9e813..625e86b 100644 --- a/resources/loki.imsc.json.example +++ b/resources/loki.imsc.json.example @@ -1,4 +1,5 @@ { + "order": 1, "id" : "891322f6-437a-11ef-980a-7bdc756bd0b3", "name" : "Loki Metadata Schema", "instrument": "loki", diff --git a/src/scicat_communication.py b/src/scicat_communication.py index 1257564..f7f9f7a 100644 --- a/src/scicat_communication.py +++ b/src/scicat_communication.py @@ -2,6 +2,7 @@ # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) import json import logging +from dataclasses import asdict from typing import Any from urllib.parse import quote, urljoin @@ -107,15 +108,13 @@ def create_scicat_origdatablock( return result -def render_full_url( - url: str, - config: SciCatOptions, -) -> str: +def render_full_url(url: str, config: SciCatOptions) -> str: + urls = asdict(config.urls) if not url.startswith("http://") and not url.startswith("https://"): - for endpoint in config.urls.keys(): + for endpoint in urls.keys(): if url.startswith(endpoint): - url = url.replace(endpoint, config.urls[endpoint]) - break + return url.replace(endpoint, urls[endpoint]) + return url diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py index 12de863..129a10a 100644 --- a/src/scicat_configuration.py +++ b/src/scicat_configuration.py @@ -204,6 +204,7 @@ class FileHandlingOptions: ingestor_files_directory: str = "../ingestor" message_to_file: bool = True message_file_extension: str = "message.json" + use_full_file_path: bool = False @dataclass(kw_only=True) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index df81824..b10fad7 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -60,6 +60,15 @@ def to_date(value: Any) -> str | None: def to_dict(value: Any) -> dict: + if isinstance(value, str): + result = ast.literal_eval(value) + if isinstance(result, dict): + return result + else: + raise ValueError( + "Invalid value. Must be able to convert to a dictionary. Got ", value + ) + return dict(value) @@ -93,9 +102,20 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any: "join_with_space": lambda value: ", ".join( ast.literal_eval(value) if isinstance(value, str) else value ), - "evaluate": lambda value: ast.literal_eval(value), + # "evaluate": lambda value: ast.literal_eval(value), + # We are not adding the evaluate function here since + # ``evaluate`` function should be avoided if possible. + # It might seem easy to use, but it is very easy to break + # when the input is not as expected. + # It is better to use the specific converters for the types. + # However, if it is the only way to go, you can add it here. + # Please add a comment to explain why it is needed. "filename": lambda value: os.path.basename(value), + "dirname": lambda value: os.path.dirname(value), "dirname-2": lambda value: os.path.dirname(os.path.dirname(value)), + "getitem": lambda value, key: value[ + key + ], # The only operator that takes an argument } ) @@ -129,6 +149,7 @@ def extract_variables_values( config: OfflineIngestorConfig, ) -> dict: variable_map = { + "ingestor_run_id": str(uuid.uuid4()), "filepath": pathlib.Path(config.nexus_file), "now": datetime.datetime.now(tz=datetime.UTC).isoformat(), } @@ -137,12 +158,13 @@ def extract_variables_values( if isinstance(variable_recipe, NexusFileMetadataVariable): value = _retrieve_values_from_file(variable_recipe, h5file) elif isinstance(variable_recipe, ScicatMetadataVariable): + full_endpoint_url = render_full_url( + render_variable_value(variable_recipe.url, variable_map), + config.scicat, + ) value = retrieve_value_from_scicat( config=config.scicat, - scicat_endpoint_url=render_full_url( - render_variable_value(variable_recipe.url, variable_map), - config.scicat, - ), + scicat_endpoint_url=full_endpoint_url, field_name=variable_recipe.field, ) elif isinstance(variable_recipe, ValueMetadataVariable): @@ -152,7 +174,12 @@ def extract_variables_values( if isinstance(value, str) else value ) - value = _get_operator(variable_recipe.operator)(value) + _operator = _get_operator(variable_recipe.operator) + if variable_recipe.field: + value = _operator(value, variable_recipe.field) + else: + value = _operator(value) + else: raise Exception("Invalid variable source: ", source) variable_map[variable_name] = convert_to_type(value, variable_recipe.value_type) @@ -165,7 +192,7 @@ def extract_paths_from_h5_file( _path: list[str], ) -> list[str]: master_key = _path.pop(0) - output_paths = [master_key] + output_paths = [] if "*" in master_key: temp_keys = [k2 for k2 in _h5_object.keys() if re.search(master_key, k2)] for key in temp_keys: @@ -216,6 +243,8 @@ class ScicatDataset: proposalId: str | None = None ownerGroup: str | None = None accessGroups: list[str] | None = None + startTime: str | None = None + endTime: str | None = None @dataclass(kw_only=True) diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py index 9006249..eacaab0 100644 --- a/src/scicat_metadata.py +++ b/src/scicat_metadata.py @@ -67,6 +67,8 @@ class ValueMetadataVariable(MetadataSchemaVariable): operator: str = "" value: str + field: str | None = None + # We only allow one field(argument) for now @dataclass(kw_only=True) @@ -110,6 +112,7 @@ class MetadataSchema: name: str instrument: str selector: str | dict + order: int variables: dict[str, MetadataSchemaVariable] schema: dict[str, MetadataItem] @@ -138,6 +141,12 @@ def from_file(cls, schema_file_name: pathlib.Path) -> "MetadataSchema": def render_variable_value(var_value: str, variable_registry: dict) -> str: + # If it is only one variable, then it is a simple replacement + if (var_key := var_value.removesuffix(">").removeprefix("<")) in variable_registry: + return variable_registry[var_key] + + # If it is a complex variable, then it is a combination of variables + # similar to f-string in python for reg_var_name, reg_var_value in variable_registry.items(): var_value = var_value.replace("<" + reg_var_name + ">", str(reg_var_value)) @@ -158,11 +167,13 @@ def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]: MetadataSchema.from_file(schema_file_path) for schema_file_path in list_schema_file_names(dir_path) ], - key=lambda schema: schema.name, + key=lambda schema: (schema.order, schema.name.capitalize()), + # name is capitalized to make sure that the order is + # alphabetically sorted in a non-case-sensitive way ) schemas = OrderedDict() for metadata_schema in metadata_schemas: - schemas[metadata_schema.name] = metadata_schema + schemas[metadata_schema.id] = metadata_schema return schemas diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index d240688..abc2095 100755 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -146,11 +146,16 @@ def main() -> None: ) # Collect data-file descriptions + if not config.ingestion.file_handling.use_full_file_path: + source_folder = variable_map["source_folder"] + else: + source_folder = None + data_file_list = create_data_file_list( nexus_file=nexus_file_path, ingestor_directory=ingestor_directory, config=fh_options, - source_folder=variable_map["source_folder"], + source_folder=source_folder, logger=logger, # TODO: add done_writing_message_file and nexus_structure_file ) diff --git a/tests/test_scicat_metadata_schema.py b/tests/test_scicat_metadata_schema.py index 7d8876c..04c7a20 100644 --- a/tests/test_scicat_metadata_schema.py +++ b/tests/test_scicat_metadata_schema.py @@ -48,17 +48,25 @@ def test_collect_metadata_schema() -> None: assert len(schemas) == len(ALL_SCHEMA_EXAMPLES) for schema_name, schema in schemas.items(): assert isinstance(schema, MetadataSchema) - assert schema_name == schema.name + assert schema_name == schema.id assert isinstance(schemas, OrderedDict) - # Check if the schema is ordered by the schema name - assert list(schemas.keys()) == sorted(schemas.keys()) + # Check if the schema is ordered by the schema order and name. + # The expected keys are hardcoded on purpose. + # Always hardcode the expected keys to avoid the test being too flexible. + assert list(schemas.keys()) == [ + "715ce7ba-3f91-11ef-932f-37a5c6fd60b1", # Coda, 1, Coda Metadata Schema + "72a991ee-437a-11ef-8fd2-1f95660accb7", # Dream, 1, dream Metadata Schema + "c5bed39a-4379-11ef-ba5a-ffbc783163b6", # Base, 1, Generic metadata schema + "891322f6-437a-11ef-980a-7bdc756bd0b3", # Loki, 1, Loki Metadata Schema + ] def test_metadata_schema_selection() -> None: schemas = OrderedDict( { "schema1": MetadataSchema( + order=1, id="schema1", name="Schema 1", instrument="", @@ -67,6 +75,7 @@ def test_metadata_schema_selection() -> None: schema={}, ), "schema2": MetadataSchema( + order=2, id="schema2", name="Schema 2", instrument="", @@ -75,6 +84,7 @@ def test_metadata_schema_selection() -> None: schema={}, ), "schema3": MetadataSchema( + order=3, id="schema3", name="Schema 3", instrument="", @@ -96,6 +106,7 @@ def test_metadata_schema_selection_wrong_selector_target_name_raises() -> None: OrderedDict( { "schema1": MetadataSchema( + order=1, id="schema1", name="Schema 1", instrument="", @@ -115,6 +126,7 @@ def test_metadata_schema_selection_wrong_selector_function_name_raises() -> None OrderedDict( { "schema1": MetadataSchema( + order=1, id="schema1", name="Schema 1", instrument="",