From a4ab7d189a72e8d4ece326711118d1ddd1936306 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 19 Nov 2024 11:32:45 +0100 Subject: [PATCH 1/4] Custom group resolution for depends_on --- src/scippnexus/base.py | 12 +++++- src/scippnexus/nxtransformations.py | 60 ++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/src/scippnexus/base.py b/src/scippnexus/base.py index b3dd218a..4a7be631 100644 --- a/src/scippnexus/base.py +++ b/src/scippnexus/base.py @@ -222,7 +222,7 @@ def nx_class(self) -> type | None: return NXroot @cached_property - def attrs(self) -> dict[str, Any]: + def attrs(self) -> MappingProxyType[str, Any]: """The attributes of the group. Cannot be used for writing attributes, since they are cached for performance.""" @@ -479,6 +479,16 @@ def dims(self) -> tuple[str, ...]: def shape(self) -> tuple[int, ...]: return tuple(self.sizes.values()) + @property + def definitions(self) -> MappingProxyType[str, str | type] | None: + return ( + None if self._definitions is None else MappingProxyType(self._definitions) + ) + + @property + def underlying(self) -> H5Group: + return self._group + def _create_field_params_numpy(data: np.ndarray): return data, None, {} diff --git a/src/scippnexus/nxtransformations.py b/src/scippnexus/nxtransformations.py index ac18deb4..150592fb 100644 --- a/src/scippnexus/nxtransformations.py +++ b/src/scippnexus/nxtransformations.py @@ -37,13 +37,20 @@ import warnings from dataclasses import dataclass, field, replace +from pathlib import PurePosixPath from typing import Literal import numpy as np import scipp as sc from scipp.scipy import interpolate -from .base import Group, NexusStructureError, NXobject, base_definitions_dict +from .base import ( + Group, + NexusStructureError, + NXobject, + base_definitions_dict, + is_dataset, +) from .field import DependsOn, Field @@ -266,6 +273,55 @@ def compute(self) -> sc.Variable | sc.DataArray: return transform +def _resolve_path(cwd: str, path: str) -> str: + """Resolve a path as if in a working directory, based only on strings. + + ``base`` must be absolute. + ``path`` is resolved as if ``base`` is the current working directory. + Returns an absolute path. + """ + p = PurePosixPath(path) + if p.is_absolute(): + return p.as_posix() + + base_parts = list(PurePosixPath(cwd).parts) + path_parts = [segment for segment in p.parts if segment != '.'] + while path_parts[0] == '..': + if not path_parts: + raise ValueError(f"Relative path beyond root: '{p}'") + base_parts.pop(-1) + path_parts.pop(0) + return PurePosixPath(*base_parts, *path_parts).as_posix() + + +def _locate_depends_on_target(parent: Field | Group, depends_on: str) -> Field | Group: + """Find the target of a depends_on link. + + The returned object is equivalent to calling ``parent[depends_on]`` + in the context of transformations. + This function does not work in general because it does not handle NXdata attributes. + + The method used here uses the underlying h5py groups to find the target group or + field to avoid constructing expensive intermediate snx.Group objects. + """ + raw = parent.underlying + target_path = _resolve_path(raw.name, depends_on) + target = raw.file[target_path] + + if is_dataset(target): + from .base import _dtype_fromdataset, _squeezed_field_sizes + + res = Field( + target, + parent=Group(target.parent, definitions=parent.definitions), + sizes=_squeezed_field_sizes(target), + dtype=_dtype_fromdataset(target), + ) + else: + res = Group(target, definitions=parent.definitions) + return res + + def parse_depends_on_chain( parent: Field | Group, depends_on: DependsOn ) -> TransformationChain | None: @@ -274,7 +330,7 @@ def parse_depends_on_chain( depends_on = depends_on.value try: while depends_on != '.': - transform = parent[depends_on] + transform = _locate_depends_on_target(parent, depends_on) parent = transform.parent depends_on = transform.attrs['depends_on'] chain.transformations[transform.name] = transform[()] From 44ebeb6de29bf172f62b273891fb8901300cb2dc Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 19 Nov 2024 14:00:31 +0100 Subject: [PATCH 2/4] Operate entirely on h5py objects --- src/scippnexus/nxtransformations.py | 42 +++++++++++++++++------------ src/scippnexus/typing.py | 2 +- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/scippnexus/nxtransformations.py b/src/scippnexus/nxtransformations.py index 150592fb..4e221f6b 100644 --- a/src/scippnexus/nxtransformations.py +++ b/src/scippnexus/nxtransformations.py @@ -36,10 +36,12 @@ from __future__ import annotations import warnings +from collections.abc import Mapping from dataclasses import dataclass, field, replace from pathlib import PurePosixPath from typing import Literal +import h5py import numpy as np import scipp as sc from scipp.scipy import interpolate @@ -273,28 +275,32 @@ def compute(self) -> sc.Variable | sc.DataArray: return transform -def _resolve_path(cwd: str, path: str) -> str: +def _resolve_path(cwd: PurePosixPath, path: PurePosixPath) -> PurePosixPath: """Resolve a path as if in a working directory, based only on strings. ``base`` must be absolute. ``path`` is resolved as if ``base`` is the current working directory. Returns an absolute path. """ - p = PurePosixPath(path) - if p.is_absolute(): - return p.as_posix() + if path.is_absolute(): + return path - base_parts = list(PurePosixPath(cwd).parts) - path_parts = [segment for segment in p.parts if segment != '.'] + base_parts = list(cwd.parts) + path_parts = [segment for segment in path.parts if segment != '.'] while path_parts[0] == '..': if not path_parts: - raise ValueError(f"Relative path beyond root: '{p}'") + raise ValueError(f"Relative path beyond root: '{path}'") base_parts.pop(-1) path_parts.pop(0) - return PurePosixPath(*base_parts, *path_parts).as_posix() + return PurePosixPath(*base_parts, *path_parts) -def _locate_depends_on_target(parent: Field | Group, depends_on: str) -> Field | Group: +def _locate_depends_on_target( + file: h5py.File, + base_path: PurePosixPath, + depends_on: PurePosixPath, + definitions: Mapping[str, type] | None, +) -> tuple[Field | Group, PurePosixPath]: """Find the target of a depends_on link. The returned object is equivalent to calling ``parent[depends_on]`` @@ -304,22 +310,21 @@ def _locate_depends_on_target(parent: Field | Group, depends_on: str) -> Field | The method used here uses the underlying h5py groups to find the target group or field to avoid constructing expensive intermediate snx.Group objects. """ - raw = parent.underlying - target_path = _resolve_path(raw.name, depends_on) - target = raw.file[target_path] + target_path = _resolve_path(base_path, depends_on) + target = file[target_path.as_posix()] if is_dataset(target): from .base import _dtype_fromdataset, _squeezed_field_sizes res = Field( target, - parent=Group(target.parent, definitions=parent.definitions), + parent=Group(target.parent, definitions=definitions), sizes=_squeezed_field_sizes(target), dtype=_dtype_fromdataset(target), ) else: - res = Group(target, definitions=parent.definitions) - return res + res = Group(target, definitions=definitions) + return res, target_path.parent def parse_depends_on_chain( @@ -328,10 +333,13 @@ def parse_depends_on_chain( """Follow a depends_on chain and return the transformations.""" chain = TransformationChain(depends_on.parent, depends_on.value) depends_on = depends_on.value + file = parent.underlying.file + base_path = PurePosixPath(parent.underlying.name) try: while depends_on != '.': - transform = _locate_depends_on_target(parent, depends_on) - parent = transform.parent + transform, base_path = _locate_depends_on_target( + file, base_path, PurePosixPath(depends_on), parent.definitions + ) depends_on = transform.attrs['depends_on'] chain.transformations[transform.name] = transform[()] except KeyError as e: diff --git a/src/scippnexus/typing.py b/src/scippnexus/typing.py index 339071c7..75286cab 100644 --- a/src/scippnexus/typing.py +++ b/src/scippnexus/typing.py @@ -17,7 +17,7 @@ def name(self) -> str: """Name of dataset or group""" @property - def file(self) -> list[int]: + def file(self) -> Any: """File of dataset or group""" @property From f72aff4436181f80224733edbde128e0951df6bc Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Wed, 20 Nov 2024 09:09:41 +0100 Subject: [PATCH 3/4] Init field within Field class --- src/scippnexus/base.py | 33 ----------------------------- src/scippnexus/field.py | 31 +++++++++++++++++++++++++++ src/scippnexus/nxdata.py | 1 - src/scippnexus/nxtransformations.py | 4 ---- 4 files changed, 31 insertions(+), 38 deletions(-) diff --git a/src/scippnexus/base.py b/src/scippnexus/base.py index 4a7be631..ae7142f7 100644 --- a/src/scippnexus/base.py +++ b/src/scippnexus/base.py @@ -41,45 +41,12 @@ def is_dataset(obj: H5Group | H5Dataset) -> bool: return hasattr(obj, 'shape') -_scipp_dtype = { - np.dtype('int8'): sc.DType.int32, - np.dtype('int16'): sc.DType.int32, - np.dtype('uint8'): sc.DType.int32, - np.dtype('uint16'): sc.DType.int32, - np.dtype('uint32'): sc.DType.int32, - np.dtype('uint64'): sc.DType.int64, - np.dtype('int32'): sc.DType.int32, - np.dtype('int64'): sc.DType.int64, - np.dtype('float32'): sc.DType.float32, - np.dtype('float64'): sc.DType.float64, - np.dtype('bool'): sc.DType.bool, -} - - -def _dtype_fromdataset(dataset: H5Dataset) -> sc.DType: - return _scipp_dtype.get(dataset.dtype, sc.DType.string) - - -def _squeezed_field_sizes(dataset: H5Dataset) -> dict[str, int]: - if (shape := dataset.shape) == (1,): - return {} - return {f'dim_{i}': size for i, size in enumerate(shape)} - - class NXobject: - def _init_field(self, field: Field): - if field.sizes is None: - field.sizes = _squeezed_field_sizes(field.dataset) - field.dtype = _dtype_fromdataset(field.dataset) - def __init__(self, attrs: dict[str, Any], children: dict[str, Field | Group]): """Subclasses should call this in their __init__ method, or ensure that they initialize the fields in `children` with the correct sizes and dtypes.""" self._attrs = attrs self._children = children - for field in children.values(): - if isinstance(field, Field): - self._init_field(field) @property def unit(self) -> None | sc.Unit: diff --git a/src/scippnexus/field.py b/src/scippnexus/field.py index 395d2280..d0e65d6d 100644 --- a/src/scippnexus/field.py +++ b/src/scippnexus/field.py @@ -81,6 +81,31 @@ def _as_datetime(obj: Any): return None +_scipp_dtype = { + np.dtype('int8'): sc.DType.int32, + np.dtype('int16'): sc.DType.int32, + np.dtype('uint8'): sc.DType.int32, + np.dtype('uint16'): sc.DType.int32, + np.dtype('uint32'): sc.DType.int32, + np.dtype('uint64'): sc.DType.int64, + np.dtype('int32'): sc.DType.int32, + np.dtype('int64'): sc.DType.int64, + np.dtype('float32'): sc.DType.float32, + np.dtype('float64'): sc.DType.float64, + np.dtype('bool'): sc.DType.bool, +} + + +def _dtype_fromdataset(dataset: H5Dataset) -> sc.DType: + return _scipp_dtype.get(dataset.dtype, sc.DType.string) + + +def _squeezed_field_sizes(dataset: H5Dataset) -> dict[str, int]: + if (shape := dataset.shape) == (1,): + return {} + return {f'dim_{i}': size for i, size in enumerate(shape)} + + @dataclass class Field: """NeXus field. @@ -93,6 +118,12 @@ class Field: dtype: sc.DType | None = None errors: H5Dataset | None = None + def __post_init__(self) -> None: + if self.sizes is None: + self.sizes = _squeezed_field_sizes(self.dataset) + if self.dtype is None: + self.dtype = _dtype_fromdataset(self.dataset) + @cached_property def attrs(self) -> dict[str, Any]: """The attributes of the dataset. diff --git a/src/scippnexus/nxdata.py b/src/scippnexus/nxdata.py index 37352483..5d97e621 100644 --- a/src/scippnexus/nxdata.py +++ b/src/scippnexus/nxdata.py @@ -481,7 +481,6 @@ def __init__(self, attrs: dict[str, Any], children: dict[str, Field | Group]): for k in list(children): if k.startswith(name): field = children.pop(k) - self._init_field(field) field.sizes = { 'time' if i == 0 else f'dim_{i}': size for i, size in enumerate(field.dataset.shape) diff --git a/src/scippnexus/nxtransformations.py b/src/scippnexus/nxtransformations.py index 4e221f6b..5782b031 100644 --- a/src/scippnexus/nxtransformations.py +++ b/src/scippnexus/nxtransformations.py @@ -314,13 +314,9 @@ def _locate_depends_on_target( target = file[target_path.as_posix()] if is_dataset(target): - from .base import _dtype_fromdataset, _squeezed_field_sizes - res = Field( target, parent=Group(target.parent, definitions=definitions), - sizes=_squeezed_field_sizes(target), - dtype=_dtype_fromdataset(target), ) else: res = Group(target, definitions=definitions) From 213888740bab90a31c35f8d3d5af4ea586fd8637 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 25 Nov 2024 09:36:29 +0100 Subject: [PATCH 4/4] Use DependsOn directly --- src/scippnexus/nxtransformations.py | 51 ++++++++--------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/src/scippnexus/nxtransformations.py b/src/scippnexus/nxtransformations.py index 5782b031..ff0cc6a4 100644 --- a/src/scippnexus/nxtransformations.py +++ b/src/scippnexus/nxtransformations.py @@ -35,10 +35,10 @@ from __future__ import annotations +import posixpath import warnings from collections.abc import Mapping from dataclasses import dataclass, field, replace -from pathlib import PurePosixPath from typing import Literal import h5py @@ -275,43 +275,20 @@ def compute(self) -> sc.Variable | sc.DataArray: return transform -def _resolve_path(cwd: PurePosixPath, path: PurePosixPath) -> PurePosixPath: - """Resolve a path as if in a working directory, based only on strings. - - ``base`` must be absolute. - ``path`` is resolved as if ``base`` is the current working directory. - Returns an absolute path. - """ - if path.is_absolute(): - return path - - base_parts = list(cwd.parts) - path_parts = [segment for segment in path.parts if segment != '.'] - while path_parts[0] == '..': - if not path_parts: - raise ValueError(f"Relative path beyond root: '{path}'") - base_parts.pop(-1) - path_parts.pop(0) - return PurePosixPath(*base_parts, *path_parts) - - def _locate_depends_on_target( file: h5py.File, - base_path: PurePosixPath, - depends_on: PurePosixPath, + depends_on: DependsOn, definitions: Mapping[str, type] | None, -) -> tuple[Field | Group, PurePosixPath]: +) -> tuple[Field | Group, str]: """Find the target of a depends_on link. The returned object is equivalent to calling ``parent[depends_on]`` in the context of transformations. - This function does not work in general because it does not handle NXdata attributes. - - The method used here uses the underlying h5py groups to find the target group or - field to avoid constructing expensive intermediate snx.Group objects. + This function does not work in general because it does not process any attributes + of parents which is required to fully load some groups. """ - target_path = _resolve_path(base_path, depends_on) - target = file[target_path.as_posix()] + target_path = depends_on.absolute_path() + target = file[target_path] if is_dataset(target): res = Field( @@ -320,7 +297,7 @@ def _locate_depends_on_target( ) else: res = Group(target, definitions=definitions) - return res, target_path.parent + return res, posixpath.dirname(target_path) def parse_depends_on_chain( @@ -328,15 +305,15 @@ def parse_depends_on_chain( ) -> TransformationChain | None: """Follow a depends_on chain and return the transformations.""" chain = TransformationChain(depends_on.parent, depends_on.value) - depends_on = depends_on.value + # Use raw h5py objects to follow the chain because that avoids constructing + # expensive intermediate snx.Group objects. file = parent.underlying.file - base_path = PurePosixPath(parent.underlying.name) try: - while depends_on != '.': - transform, base_path = _locate_depends_on_target( - file, base_path, PurePosixPath(depends_on), parent.definitions + while depends_on.value != '.': + transform, base = _locate_depends_on_target( + file, depends_on, parent.definitions ) - depends_on = transform.attrs['depends_on'] + depends_on = DependsOn(parent=base, value=transform.attrs['depends_on']) chain.transformations[transform.name] = transform[()] except KeyError as e: warnings.warn(