Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

All the fixes introduced while testing the production deployments #93

Merged
merged 3 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
272 changes: 272 additions & 0 deletions resources/small-coda.imsc.json.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
{
"id" : "628b28d6-9c26-11ef-948d-0b2d405fc82f",
"name" : "Test Coda Metadata Schema",
"instrument": "coda",
"selector": "filename:starts_with:/ess/services/scicat-ingestor/software/test-data/small-coda",
"order": 110,
"variables" : {
"job_id": {
"source": "NXS",
"path": "/entry/entry_identifier_uuid",
"value_type": "string"
},
"pid": {
"source": "VALUE",
"value": "20.500.12269/<ingestor_run_id>",
"value_type": "string"
},
"proposal_id": {
"source": "NXS",
"path": "/entry/experiment_identifier",
"value_type": "string"
},
"proposal_data": {
"source": "SC",
"url": "proposals/<proposal_id>",
"field" : "",
"value_type": "dict"
},
"pi_firstname": {
"source": "VALUE",
"operator": "getitem",
"value": "<proposal_data>",
"field" : "pi_firstname",
"value_type": "string"
},
"pi_lastname": {
"source": "VALUE",
"operator": "getitem",
"value": "<proposal_data>",
"field" : "pi_lastname",
"value_type": "string"
},
"pi_email": {
"source": "VALUE",
"operator": "getitem",
"value": "<proposal_data>",
"field" : "pi_email",
"value_type": "string"
},
"dataset_original_name": {
"source": "NXS",
"path": "/entry/title",
"value_type": "string"
},
"dataset_name": {
"source": "VALUE",
"value" : "coda test - <dataset_original_name> - <ingestor_run_id>",
"value_type": "string"
},
"instrument_name": {
"source": "NXS",
"path": "/entry/instrument/name",
"value_type": "string"
},
"instruments_data": {
"source": "SC",
"url": "instruments?filter=%7B%22where%22%20%3A%20%7B%20%22name%22%20%3A%20%22coda%22%20%7D%20%7D",
"field": "",
"value_type": "list"
},
"instrument_data": {
"source": "VALUE",
"operator": "getitem",
"value": "<instruments_data>",
"field" : 0,
"value_type": "dict"
},
"instrument_id": {
"source": "VALUE",
"operator": "getitem",
"value": "<instrument_data>",
"field" : "id",
"value_type": "string"
},
"start_time": {
"source": "NXS",
"path": "/entry/start_time",
"value_type": "date"
},
"end_time": {
"source": "NXS",
"path": "/entry/end_time",
"value_type": "date"
},
"run_number": {
"source": "NXS",
"path": "/entry/entry_identifier",
"value_type": "integer"
},
"acquisition_team_members_list": {
"source": "NXS",
"path" : "/entry/user_*/name",
"value_type": "string[]"
},
"acquisition_team_members": {
"source": "VALUE",
"operator" : "join_with_space",
"value" : "<acquisition_team_members_list>",
"value_type": "string"
},
"owner_group": {
"source": "VALUE",
"value": "<proposal_id>",
"value_type": "string"
},
"access_groups": {
"source": "VALUE",
"value": ["scientific information management systems group"],
"value_type": "string[]"
},
"source_folder": {
"source": "VALUE",
"operator": "dirname",
"value": "<filepath>",
"value_type": "string"
},
"keywords" : {
"source": "VALUE",
"value": ["TEST CODA","Scicat Ingestor 05","TEST RUN","CODA","<instrument_name>","CODA <instrument_name>"],
"value_type": "string[]"
}
},
"schema": {
"pid": {
"field_type": "high_level",
"machine_name": "pid",
"value": "<pid>",
"type": "string"
},
"type" : {
"field_type": "high_level",
"machine_name": "type",
"value": "raw",
"type": "string"
},
"proposal_id": {
"field_type": "high_level",
"machine_name": "proposalId",
"value": "<proposal_id>",
"type": "string"
},
"dataset_name": {
"field_type": "high_level",
"machine_name": "datasetName",
"value": "<dataset_name>",
"type": "string"
},
"principal_investigator": {
"field_type": "high_level",
"machine_name": "principalInvestigator",
"value": "<pi_firstname> <pi_lastname>",
"type": "string"
},
"owner": {
"field_type": "high_level",
"machine_name": "owner",
"value": "<pi_firstname> <pi_lastname>",
"type": "string"
},
"owner_email": {
"field_type": "high_level",
"machine_name": "ownerEmail",
"value": "<pi_email>",
"type": "email"
},
"contact_email": {
"field_type": "high_level",
"machine_name": "contactEmail",
"value": "<pi_email>",
"type": "email"
},
"instrument_id": {
"field_type": "high_level",
"machine_name": "instrumentId",
"value": "<instrument_id>",
"type": "string"
},
"creation_location": {
"field_type": "high_level",
"machine_name": "creationLocation",
"value": "ESS:CODA:<instrument_name>",
"type": "string"
},
"start_time_hl": {
"field_type": "high_level",
"machine_name": "startTime",
"value": "<start_time>",
"type": "date"
},
"end_time_hl": {
"field_type": "high_level",
"machine_name": "endTime",
"value": "<end_time>",
"type": "date"
},
"start_time_sm": {
"field_type": "scientific_metadata",
"machine_name": "start_time",
"human_name": "Start Time",
"value": "<start_time>",
"type": "date"
},
"end_time_sm": {
"field_type": "scientific_metadata",
"machine_name": "end_time",
"human_name": "End Time",
"value": "<end_time>",
"type": "date"
},
"run_number_sm": {
"field_type": "scientific_metadata",
"machine_name": "run_number",
"human_name": "Run Number",
"value": "<run_number>",
"type": "integer"
},
"job_id": {
"field_type": "scientific_metadata",
"machine_name": "job_id",
"human_name": "ESS Data Collection Job Id",
"value": "<job_id>",
"type": "string"
},
"acquisition_team_members": {
"field_type": "scientific_metadata",
"machine_name": "acquisition_team_members",
"human_name": "Acquisition Team Members",
"value": "<acquisition_team_members>",
"type": "string"
},
"owner_group": {
"field_type": "high_level",
"machine_name": "ownerGroup",
"value": "<owner_group>",
"type": "string"
},
"access_groups": {
"field_type": "high_level",
"machine_name": "accessGroups",
"value": "<access_groups>",
"type": "string[]"
},
"source_folder": {
"field_type": "high_level",
"machine_name": "sourceFolder",
"value": "<source_folder>",
"type": "string"
},
"creation_time": {
"field_type": "high_level",
"machine_name": "creationTime",
"value": "<now>",
"type": "date"
},
"keywords": {
"field_type": "high_level",
"machine_name": "keywords",
"value": "<keywords>",
"type": "string[]"
}
}
}
13 changes: 6 additions & 7 deletions src/scicat_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,9 @@ def extract_variables_values(
) -> dict:
variable_map = {
"ingestor_run_id": str(uuid.uuid4()),
"filepath": pathlib.Path(config.nexus_file),
"data_file_path": pathlib.Path(config.nexus_file),
"now": datetime.datetime.now(tz=datetime.UTC).isoformat(),
"ingestor_files_directory": config.ingestion.file_handling.ingestor_files_directory,
}
for variable_name, variable_recipe in variables.items():
source = variable_recipe.source
Expand All @@ -187,11 +188,7 @@ def extract_variables_values(
)
elif isinstance(variable_recipe, ValueMetadataVariable):
value = variable_recipe.value
value = (
render_variable_value(value, variable_map)
if isinstance(value, str)
else value
)
value = render_variable_value(value, variable_map)
_operator = _get_operator(variable_recipe.operator)
if variable_recipe.field is not None:
value = _operator(value, variable_recipe.field)
Expand Down Expand Up @@ -265,6 +262,8 @@ class ScicatDataset:
accessGroups: list[str] | None = None
startTime: str | None = None
endTime: str | None = None
runNumber: str | None = None
keywords: list[str] | None = None


@dataclass(kw_only=True)
Expand Down Expand Up @@ -459,7 +458,7 @@ def _filter_by_field_type(
return [field for field in schemas if field.field_type == field_type]


def _render_variable_as_type(value: str, variable_map: dict, dtype: str) -> Any:
def _render_variable_as_type(value: Any, variable_map: dict, dtype: str) -> Any:
return convert_to_type(render_variable_value(value, variable_map), dtype)


Expand Down
2 changes: 1 addition & 1 deletion src/scicat_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def build_logger(

# Add graylog handler
if logging_options.graylog:
graylog_handler = graypy.GELFTCPHandler(
graylog_handler = graypy.GELFUDPHandler(
logging_options.graylog_host,
int(logging_options.graylog_port),
facility=logging_options.graylog_facility,
Expand Down
19 changes: 15 additions & 4 deletions src/scicat_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from collections.abc import Callable
from dataclasses import dataclass
from importlib.metadata import entry_points
from typing import Any

SCIENTIFIC_METADATA_TYPE = "scientific_metadata"
HIGH_LEVEL_METADATA_TYPE = "high_level"
Expand Down Expand Up @@ -140,20 +141,30 @@ def from_file(cls, schema_file_name: pathlib.Path) -> "MetadataSchema":
return cls.from_dict(_load_json_schema(schema_file_name))


def render_variable_value(var_value: str, variable_registry: dict) -> str:
def render_variable_value(var_value: Any, variable_registry: dict) -> str:
# if input is not a string it converts it to string
output_value = var_value if isinstance(var_value, str) else json.dumps(var_value)

# If it is only one variable, then it is a simple replacement
if (var_key := var_value.removesuffix(">").removeprefix("<")) in variable_registry:
if (
var_key := output_value.removesuffix(">").removeprefix("<")
) in variable_registry:
return variable_registry[var_key]

# If it is a complex variable, then it is a combination of variables
# similar to f-string in python
for reg_var_name, reg_var_value in variable_registry.items():
var_value = var_value.replace("<" + reg_var_name + ">", str(reg_var_value))
output_value = output_value.replace(
"<" + reg_var_name + ">", str(reg_var_value)
)

if "<" in var_value and ">" in var_value:
raise Exception(f"Unresolved variable: {var_value}")

return var_value
output_value = (
output_value if isinstance(var_value, str) else json.loads(output_value)
)
Comment on lines +164 to +166
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure about the intention here. Can you elaborate more? @nitrosx

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In some cases, the variable value (i.e. var_value) is a list or a dict rather than a string.
When that is the case, the value is serialized to a JSON string, the placeholder substitution is performed on that string, and the result is parsed back into its original type.
Basically, it is a trick to perform nested substitutions without traversing the whole nested structure.

Check also line 149.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, it's converting the JSON-dumped output_value back into a dictionary/list. I see...
It sounds like a bit of a hack, but it shouldn't be a problem, since < and > are unlikely to be confused with the symbols used in a serialized list or dictionary. We had better document it, though.
I'll do that in another PR, together with some unit tests.

return output_value


def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]:
Expand Down
5 changes: 5 additions & 0 deletions src/scicat_offline_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,11 @@ def main() -> None:
with h5py.File(nexus_file_path) as h5file:
# load instrument metadata configuration
metadata_schema = select_applicable_schema(nexus_file_path, schemas)
logger.info(
"Metadata Schema selected : %s (Id: %s)",
metadata_schema.name,
metadata_schema.id,
)

# define variables values
variable_map = extract_variables_values(
Expand Down
4 changes: 2 additions & 2 deletions test-data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ with h5py.File('copied_coda.hdf', 'r+') as f:
del instrument_gr[name]

# Copy the rest of the file
with h5py.File('small_coda.hdf', 'w') as new_f:
with h5py.File('small-coda.hdf', 'w') as new_f:
# copy everything
f.copy('entry', new_f)

Expand All @@ -39,7 +39,7 @@ with h5py.File('copied_ymir.hdf', 'r+') as f:
del instrument_gr[name]

# Copy the rest of the file
with h5py.File('small_ymir.hdf', 'w') as new_f:
with h5py.File('small-ymir.hdf', 'w') as new_f:
# copy everything
f.copy('entry', new_f)

Expand Down
File renamed without changes.
File renamed without changes.
Loading