diff --git a/dev-requirements.txt b/dev-requirements.txt index 4ff0ffb22..3ba04e52f 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --extra=dev --output-file=dev-requirements.txt pyproject.toml @@ -19,7 +19,6 @@ astroid==2.15.5 attrs==22.1.0 # via # cattrs - # pytest # requests-cache backcall==0.2.0 # via ipython @@ -191,6 +190,8 @@ matplotlib-scalebar==0.8.1 # via orix mccabe==0.7.0 # via pylint +mergedeep==1.3.4 + # via pynxtools (pyproject.toml) mpmath==1.2.1 # via sympy mypy==1.2.0 @@ -341,8 +342,6 @@ psutil==5.9.2 # pyxem ptyprocess==0.7.0 # via pexpect -py==1.11.0 - # via pytest pycifrw==4.4.5 # via diffpy-structure pycodestyle==2.9.1 @@ -511,7 +510,6 @@ typing-extensions==4.3.0 # astroid # mypy # numcodecs - # pylint tzdata==2023.3 # via pytz-deprecation-shim tzlocal==4.3 diff --git a/examples/json_map/README.md b/examples/json_map/README.md new file mode 100644 index 000000000..788cb6890 --- /dev/null +++ b/examples/json_map/README.md @@ -0,0 +1,36 @@ +# JSON Map Reader + +## What is this reader? + +This reader is designed to allow users of pynxtools to convert their existing data with the help of a map file. The map file tells the reader what to pick from your data files and convert them to FAIR NeXus files. The following formats are supported as input files: +* HDF5 (any extension works, e.g. h5, hdf5, nxs, etc.) +* JSON +* Python Dict Objects Pickled with [pickle](https://docs.python.org/3/library/pickle.html). These can contain [xarray.DataArray](https://docs.xarray.dev/en/stable/generated/xarray.DataArray.html) objects as well as regular Python types and Numpy types. + +It accepts any NXDL file that you like as long as your mapping file contains all the fields. +Please use the --generate-template function of the dataconverter to create a .mapping.json file. 
+ +```console +user@box:~$ dataconverter --nxdl NXmynxdl --generate-template > mynxdl.mapping.json +``` +##### Details on the [mapping.json](/pynxtools/dataconverter/readers/json_map/README.md#the-mappingjson-file) file. + +## How to run these examples? + +### Automatically merge partial NeXus files +```console +user@box:~$ dataconverter --nxdl NXiv_temp --input-file voltage_and_temperature.nxs --input-file current.nxs --output auto_merged.nxs +``` + +### Map and copy over data to new NeXus file +```console +user@box:~$ dataconverter --nxdl NXiv_temp --mapping merge_copied.mapping.json --input-file voltage_and_temperature.nxs --input-file current.nxs --output merged_copied.nxs +``` + +### Map and link over data to new NeXus file +```console +user@box:~$ dataconverter --nxdl NXiv_temp --mapping merge_linked.mapping.json --input-file voltage_and_temperature.nxs --input-file current.nxs --output merged_linked.nxs +``` + +## Contact person in FAIRmat for this reader +Sherjeel Shabih diff --git a/examples/json_map/auto_merged.nxs b/examples/json_map/auto_merged.nxs new file mode 100644 index 000000000..ee12a9bfa Binary files /dev/null and b/examples/json_map/auto_merged.nxs differ diff --git a/examples/json_map/current.nxs b/examples/json_map/current.nxs new file mode 100644 index 000000000..d96db6ccc Binary files /dev/null and b/examples/json_map/current.nxs differ diff --git a/examples/json_map/merge_copied.mapping.json b/examples/json_map/merge_copied.mapping.json new file mode 100644 index 000000000..bba897874 --- /dev/null +++ b/examples/json_map/merge_copied.mapping.json @@ -0,0 +1,35 @@ +{ + "/@default": "entry", + "/ENTRY[entry]/DATA[data]/current": "/entry/data/current", + "/ENTRY[entry]/DATA[data]/current_295C": "/entry/data/current_295C", + "/ENTRY[entry]/DATA[data]/current_300C": "/entry/data/current_300C", + "/ENTRY[entry]/DATA[data]/current_305C": "/entry/data/current_305C", + "/ENTRY[entry]/DATA[data]/current_310C": "/entry/data/current_310C", + 
"/ENTRY[entry]/DATA[data]/temperature": "/entry/data/temperature", + "/ENTRY[entry]/DATA[data]/voltage": "/entry/data/voltage", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/voltage_controller/calibration_time": "/entry/instrument/environment/voltage_controller/calibration_time", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/voltage_controller/run_control": "/entry/instrument/environment/voltage_controller/run_control", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/voltage_controller/value": "/entry/instrument/environment/voltage_controller/value", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/temperature_controller/calibration_time": "/entry/instrument/environment/temperature_controller/calibration_time", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/temperature_controller/run_control": "/entry/instrument/environment/temperature_controller/run_control", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/temperature_controller/value": "/entry/instrument/environment/temperature_controller/value", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/current_sensor/calibration_time": "/entry/instrument/environment/current_sensor/calibration_time", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/current_sensor/run_control": "/entry/instrument/environment/current_sensor/run_control", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/current_sensor/value": "/entry/instrument/environment/current_sensor/value", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/independent_controllers": ["voltage_controller", "temperature_control"], + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/measurement_sensors": ["current_sensor"], + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/NXpid[heating_pid]/description": "/entry/instrument/environment/heating_pid/description", + 
"/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/NXpid[heating_pid]/setpoint": "/entry/instrument/environment/heating_pid/setpoint", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/NXpid[heating_pid]/K_p_value": "/entry/instrument/environment/heating_pid/K_p_value", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/NXpid[heating_pid]/K_i_value": "/entry/instrument/environment/heating_pid/K_i_value", + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/NXpid[heating_pid]/K_d_value": "/entry/instrument/environment/heating_pid/K_d_value", + "/ENTRY[entry]/PROCESS[process]/program": "Bluesky", + "/ENTRY[entry]/PROCESS[process]/program/@version": "1.6.7", + "/ENTRY[entry]/SAMPLE[sample]/name": "super", + "/ENTRY[entry]/SAMPLE[sample]/atom_types": "Si, C", + "/ENTRY[entry]/definition": "NXiv_temp", + "/ENTRY[entry]/definition/@version": "1", + "/ENTRY[entry]/experiment_identifier": "dbdfed37-35ed-4aee-a465-aaa0577205b1", + "/ENTRY[entry]/experiment_description": "A simple IV temperature experiment.", + "/ENTRY[entry]/start_time": "2022-05-30T16:37:03.909201+02:00" +} \ No newline at end of file diff --git a/examples/json_map/merge_linked.mapping.json b/examples/json_map/merge_linked.mapping.json new file mode 100644 index 000000000..47ede8b92 --- /dev/null +++ b/examples/json_map/merge_linked.mapping.json @@ -0,0 +1,25 @@ +{ + "/@default": "entry", + "/ENTRY[entry]/DATA[data]/current": {"link": "current.nxs:/entry/data/current"}, + "/ENTRY[entry]/DATA[data]/current_295C": {"link": "current.nxs:/entry/data/current_295C"}, + "/ENTRY[entry]/DATA[data]/current_300C": {"link": "current.nxs:/entry/data/current_300C"}, + "/ENTRY[entry]/DATA[data]/current_305C": {"link": "current.nxs:/entry/data/current_305C"}, + "/ENTRY[entry]/DATA[data]/current_310C": {"link": "current.nxs:/entry/data/current_310C"}, + "/ENTRY[entry]/DATA[data]/temperature": {"link": "voltage_and_temperature.nxs:/entry/data/temperature"}, + 
"/ENTRY[entry]/DATA[data]/voltage": {"link": "voltage_and_temperature.nxs:/entry/data/voltage"}, + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/voltage_controller": {"link": "voltage_and_temperature.nxs:/entry/instrument/environment/voltage_controller"}, + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/temperature_controller": {"link": "voltage_and_temperature.nxs:/entry/instrument/environment/temperature_controller"}, + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/current_sensor": {"link": "current.nxs:/entry/instrument/environment/current_sensor"}, + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/independent_controllers": ["voltage_controller", "temperature_control"], + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/measurement_sensors": ["current_sensor"], + "/ENTRY[entry]/INSTRUMENT[instrument]/ENVIRONMENT[environment]/NXpid[heating_pid]": {"link": "voltage_and_temperature.nxs:/entry/instrument/environment/heating_pid"}, + "/ENTRY[entry]/PROCESS[process]/program": "Bluesky", + "/ENTRY[entry]/PROCESS[process]/program/@version": "1.6.7", + "/ENTRY[entry]/SAMPLE[sample]/name": "super", + "/ENTRY[entry]/SAMPLE[sample]/atom_types": "Si, C", + "/ENTRY[entry]/definition": "NXiv_temp", + "/ENTRY[entry]/definition/@version": "1", + "/ENTRY[entry]/experiment_identifier": "dbdfed37-35ed-4aee-a465-aaa0577205b1", + "/ENTRY[entry]/experiment_description": "A simple IV temperature experiment.", + "/ENTRY[entry]/start_time": "2022-05-30T16:37:03.909201+02:00" +} \ No newline at end of file diff --git a/examples/json_map/merged_copied.nxs b/examples/json_map/merged_copied.nxs new file mode 100644 index 000000000..564b448c4 Binary files /dev/null and b/examples/json_map/merged_copied.nxs differ diff --git a/examples/json_map/merged_linked.nxs b/examples/json_map/merged_linked.nxs new file mode 100644 index 000000000..6e7ed77b5 Binary files /dev/null and b/examples/json_map/merged_linked.nxs 
differ diff --git a/examples/json_map/voltage_and_temperature.nxs b/examples/json_map/voltage_and_temperature.nxs new file mode 100644 index 000000000..729b0876e Binary files /dev/null and b/examples/json_map/voltage_and_temperature.nxs differ diff --git a/pynxtools/dataconverter/README.md b/pynxtools/dataconverter/README.md index 937dabb1a..f8d600f41 100644 --- a/pynxtools/dataconverter/README.md +++ b/pynxtools/dataconverter/README.md @@ -37,9 +37,26 @@ Options: parameters the converter supports. --undocumented Shows a log output for all undocumented fields + --mapping TEXT Takes a .mapping.json file and + converts data from given input files. --help Show this message and exit. ``` +#### Merge partial NeXus files into one + +```console +user@box:~$ dataconverter --nxdl nxdl --input-file partial1.nxs --input-file partial2.nxs +``` + +#### Map an HDF5/JSON/(Python Dict pickled in a pickle file) + +```console +user@box:~$ dataconverter --nxdl nxdl --input-file any_data.hdf5 --mapping my_custom_map.mapping.json +``` + +#### You can find actual examples with data files at [`examples/json_map`](../../examples/json_map/). + + #### Use with multiple input files ```console diff --git a/pynxtools/dataconverter/convert.py b/pynxtools/dataconverter/convert.py index 7232e5644..3db435c78 100644 --- a/pynxtools/dataconverter/convert.py +++ b/pynxtools/dataconverter/convert.py @@ -65,7 +65,7 @@ def get_names_of_all_readers() -> List[str]: # pylint: disable=too-many-arguments,too-many-locals -def convert(input_file: Tuple[str], +def convert(input_file: Tuple[str, ...], reader: str, nxdl: str, output: str, @@ -153,7 +153,7 @@ def parse_params_file(params_file): ) @click.option( '--reader', - default='example', + default='json_map', type=click.Choice(get_names_of_all_readers(), case_sensitive=False), help='The reader to use. 
default="example"' ) @@ -192,15 +192,20 @@ def parse_params_file(params_file): default=False, help='Shows a log output for all undocumented fields' ) +@click.option( + '--mapping', + help='Takes a .mapping.json file and converts data from given input files.' +) # pylint: disable=too-many-arguments -def convert_cli(input_file: Tuple[str], +def convert_cli(input_file: Tuple[str, ...], reader: str, nxdl: str, output: str, generate_template: bool, fair: bool, params_file: str, - undocumented: bool): + undocumented: bool, + mapping: str): """The CLI entrypoint for the convert function""" if params_file: try: @@ -216,6 +221,10 @@ def convert_cli(input_file: Tuple[str], sys.tracebacklimit = 0 raise IOError("\nError: Please supply an NXDL file with the option:" " --nxdl ") + if mapping: + reader = "json_map" + if mapping: + input_file = input_file + tuple([mapping]) convert(input_file, reader, nxdl, output, generate_template, fair, undocumented) diff --git a/pynxtools/dataconverter/hdfdict.py b/pynxtools/dataconverter/hdfdict.py index 4edb68259..a4bbf87e6 100644 --- a/pynxtools/dataconverter/hdfdict.py +++ b/pynxtools/dataconverter/hdfdict.py @@ -123,7 +123,16 @@ def _recurse(hdfobject, datadict): elif isinstance(value, h5py.Dataset): if not lazy: value = unpacker(value) - datadict[key] = value + datadict[key] = ( + value.asstr()[...] + if h5py.check_string_dtype(value.dtype) + else value + ) + + if "attrs" in dir(value): + datadict[key + "@"] = {} + for attr, attrval in value.attrs.items(): + datadict[key + "@"][attr] = attrval return datadict diff --git a/pynxtools/dataconverter/readers/json_map/README.md b/pynxtools/dataconverter/readers/json_map/README.md index 4b4820c49..b81aec969 100644 --- a/pynxtools/dataconverter/readers/json_map/README.md +++ b/pynxtools/dataconverter/readers/json_map/README.md @@ -1,24 +1,63 @@ # JSON Map Reader -This reader allows you to convert either data from a .json file or an xarray exported as a .pickle using a flat .mapping.json file. 
+## What is this reader? + +This reader is designed to allow users of pynxtools to convert their existing data with the help of a map file. The map file tells the reader what to pick from your data files and convert them to FAIR NeXus files. The following formats are supported as input files: +* HDF5 (any extension works, e.g. h5, hdf5, nxs, etc.) +* JSON +* Python Dict Objects Pickled with [pickle](https://docs.python.org/3/library/pickle.html). These can contain [xarray.DataArray](https://docs.xarray.dev/en/stable/generated/xarray.DataArray.html) objects as well as regular Python types and Numpy types. It accepts any NXDL file that you like as long as your mapping file contains all the fields. Please use the --generate-template function of the dataconverter to create a .mapping.json file. ```console -user@box:~$ python convert.py --nxdl NXmynxdl --generate-template > mynxdl.mapping.json +user@box:~$ dataconverter --nxdl NXmynxdl --generate-template > mynxdl.mapping.json ``` There are some example files you can use: +[data.mapping.json](/tests/data/dataconverter/readers/json_map/data.mapping.json) -[data.mapping.json](/tests/data/tools/dataconverter/readers/json_map/data.mapping.json) - -[data.json](/tests/data/tools/dataconverter/readers/json_map/data.json) +[data.json](/tests/data/dataconverter/readers/json_map/data.json) ```console -user@box:~$ python convert.py --nxdl NXtest --input-file data.json --input-file data.mapping.json --reader json_map +user@box:~$ dataconverter --nxdl NXtest --input-file data.json --mapping data.mapping.json +``` + +##### [Example](/examples/json_map/) with HDF5 files. + +## The mapping.json file + +This file is designed to let you fill in the requirements of a NeXus Application Definition without writing any code. If you already have data in the formats listed above, you just need to use this mapping file to help the dataconverter pick your data correctly. 
+ +The mapping files will always be based on the Template the dataconverter generates. See above on how to generate a mapping file. +The right hand side values of the Template keys are what you can modify. + +Here are the three different ways you can fill the right hand side of the Template keys: +* Write the nested path in your datafile. This is indicated by a leading `/` before the word `entry` to make `/entry/data/current_295C` below. +Example: + +```json + "/ENTRY[entry]/DATA[data]/current_295C": "/entry/data/current_295C", + "/ENTRY[entry]/NXODD_name/posint_value": "/a_level_down/another_level_down/posint_value", +``` + +* Write the values directly in the mapping file for missing data from your data file. + +```json + + "/ENTRY[entry]/PROCESS[process]/program": "Bluesky", + "/ENTRY[entry]/PROCESS[process]/program/@version": "1.6.7" +``` + +* Write JSON objects with a link key. This follows the same link mechanism that the dataconverter implements. In the context of this reader, you can only use external links to your data files. In the example below, `current.nxs` is an already existing HDF5 file that we link to in our new NeXus file without copying over the data. The format is as follows: +`"link": "<filename>:<path_in_file>"` +Note: This only works for HDF5 files currently. 
+ +```json + "/ENTRY[entry]/DATA[data]/current_295C": {"link": "current.nxs:/entry/data/current_295C"}, + "/ENTRY[entry]/DATA[data]/current_300C": {"link": "current.nxs:/entry/data/current_300C"}, ``` ## Contact person in FAIRmat for this reader -Sherjeel Shabih \ No newline at end of file +Sherjeel Shabih diff --git a/pynxtools/dataconverter/readers/json_map/reader.py b/pynxtools/dataconverter/readers/json_map/reader.py index 65a2eb7c5..d17bb075b 100644 --- a/pynxtools/dataconverter/readers/json_map/reader.py +++ b/pynxtools/dataconverter/readers/json_map/reader.py @@ -21,6 +21,7 @@ import pickle import numpy as np import xarray +from mergedeep import merge from pynxtools.dataconverter.readers.base.reader import BaseReader from pynxtools.dataconverter.template import Template @@ -57,9 +58,26 @@ def get_val_nested_keystring_from_dict(keystring, data): return data[current_key] +def get_attrib_nested_keystring_from_dict(keystring, data): + """ + Fetches all attributes from the data dict using path strings without a leading '/': + 'path/to/data/in/dict' + """ + if isinstance(keystring, (list, dict)): + return keystring + + key_splits = keystring.split("/") + parents = key_splits[:-1] + target = key_splits[-1] + for key in parents: + data = data[key] + + return data[target + "@"] if target + "@" in data.keys() else None + + def is_path(keystring): """Checks whether a given value in the mapping is a mapping path or just data""" - return isinstance(keystring, str) and keystring[0] == "/" + return isinstance(keystring, str) and len(keystring) > 0 and keystring[0] == "/" def fill_undocumented(mapping, template, data): @@ -68,6 +86,7 @@ def fill_undocumented(mapping, template, data): if is_path(value): template["undocumented"][path] = get_val_nested_keystring_from_dict(value[1:], data) + fill_attributes(path, value[1:], data, template) else: template["undocumented"][path] = value @@ -81,6 +100,7 @@ def fill_documented(template, mapping, template_provided, data): if 
is_path(map_str): template[path] = get_val_nested_keystring_from_dict(map_str[1:], data) + fill_attributes(path, map_str[1:], data, template) else: template[path] = map_str @@ -89,6 +109,14 @@ def fill_documented(template, mapping, template_provided, data): pass +def fill_attributes(path, map_str, data, template): + """Fills in the template all attributes found in the data object""" + attribs = get_attrib_nested_keystring_from_dict(map_str, data) + if attribs: + for key, value in attribs.items(): + template[path + "/@" + key] = value + + def convert_shapes_to_slice_objects(mapping): """Converts shape slice strings to slice objects for indexing""" for key in mapping: @@ -97,6 +125,25 @@ def convert_shapes_to_slice_objects(mapping): mapping[key]["shape"] = parse_slice(mapping[key]["shape"]) +def get_map_from_partials(partials, template, data): + """Takes a list of partials and returns a mapping dictionary to fill partials in our template""" + mapping: dict = {} + for partial in partials: + path = "" + template_path = "" + for part in partial.split("/")[1:]: + path = path + "/" + part + attribs = get_attrib_nested_keystring_from_dict(path[1:], data) + if template_path + "/" + part in template.keys(): + template_path = template_path + "/" + part + else: + nx_name = f"{attribs['NX_class'][2:].upper()}[{part}]" if attribs and "NX_class" in attribs else part # pylint: disable=line-too-long + template_path = template_path + "/" + nx_name + mapping[template_path] = path + + return mapping + + class JsonMapReader(BaseReader): """A reader that takes a mapping json file and a data file/object to return a template.""" @@ -118,10 +165,10 @@ def read(self, The mapping is only accepted as file.mapping.json to the inputs. 
""" data: dict = {} - mapping: dict = {} + mapping: dict = None + partials: list = [] - if objects: - data = objects[0] + data = objects[0] if objects else data for file_path in file_paths: file_extension = file_path[file_path.rindex("."):] @@ -142,17 +189,22 @@ def read(self, if is_hdf5: hdf = hdfdict.load(file_path) hdf.unlazy() - data = dict(hdf) + merge(data, dict(hdf)) + if "entry@" in data and "partial" in data["entry@"]: + partials.extend(data["entry@"]["partial"]) if mapping is None: - template = Template({x: "/hierarchical/path/in/your/datafile" for x in template}) - raise IOError("Please supply a JSON mapping file: --input-file" - " my_nxdl_map.mapping.json\n\n You can use this " - "template for the required fields: \n" + str(template)) + if len(partials) > 0: + mapping = get_map_from_partials(partials, template, data) + else: + template = Template({x: "/hierarchical/path/in/your/datafile" for x in template}) + raise IOError("Please supply a JSON mapping file: --input-file" + " my_nxdl_map.mapping.json\n\n You can use this " + "template for the required fields: \n" + str(template)) + new_template = Template() convert_shapes_to_slice_objects(mapping) - new_template = Template() fill_documented(new_template, mapping, template, data) fill_undocumented(mapping, new_template, data) diff --git a/pyproject.toml b/pyproject.toml index 5d0dc3f83..8b1af5f0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,8 @@ dependencies = [ "lark>=1.1.5", "requests", "requests_cache", - "nanonispy@git+https://github.com/ramav87/nanonispy.git@a0da87c58482d29624a2bf5deecb763dd1274212" + "nanonispy@git+https://github.com/ramav87/nanonispy.git@a0da87c58482d29624a2bf5deecb763dd1274212", + "mergedeep" ] [project.urls] diff --git a/tests/dataconverter/test_convert.py b/tests/dataconverter/test_convert.py index f6702bf01..e0b17c5ad 100644 --- a/tests/dataconverter/test_convert.py +++ b/tests/dataconverter/test_convert.py @@ -61,6 +61,8 @@ def 
restore_xarray_file_from_tmp(tmp_path): ]) def test_find_nxdl(cli_inputs): """Unit test to check if dataconverter can find NXDLs in contributed/applications folder.""" + cli_inputs.extend(["--reader", "example"]) + runner = CliRunner() result = runner.invoke(dataconverter.convert_cli, cli_inputs) if "NXdoesnotexist" in cli_inputs: