diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 21f6a45..6fd15f4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ on: # yamllint disable-line rule:truthy workflow_dispatch: env: - POETRY_VERSION: 1.8.3 + POETRY_VERSION: 1.8.4 REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92b7e4e..f2d9399 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ --- repos: - repo: https://github.com/python-poetry/poetry - rev: 1.8.3 + rev: 1.8.4 hooks: - id: poetry-check - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-docstring-first - id: check-json @@ -16,13 +16,18 @@ repos: - id: name-tests-test - id: pretty-format-json args: [--autofix, --no-ensure-ascii] + exclude: \.ipynb$ - id: trailing-whitespace + - repo: https://github.com/srstevenson/nb-clean + rev: 4.0.1 + hooks: + - id: nb-clean - repo: https://github.com/facebook/usort rev: v1.0.8 hooks: - id: usort - repo: https://github.com/psf/black-pre-commit-mirror - rev: 24.8.0 + rev: 24.10.0 hooks: - id: black args: [--preview] @@ -33,7 +38,7 @@ repos: args: [--exit-zero] verbose: true additional_dependencies: - - flake8-bugbear == 24.4.26 + - flake8-bugbear == 24.10.31 - repo: https://github.com/adrienverge/yamllint rev: v1.35.1 hooks: diff --git a/examples/demo.ipynb b/examples/demo.ipynb index 033e692..9d49da6 100644 --- a/examples/demo.ipynb +++ b/examples/demo.ipynb @@ -1,98 +1,134 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "from pyambit.datamodel import Substances, Study \n", - "import nexusformat.nexus.tree as nx\n", - "import os.path\n", - "import tempfile\n", - "# to_nexus is not added without this import\n", - "from pyambit import nexus_writer\n", - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def query(url = \"https://apps.ideaconsult.net/gracious/substance/\" ,params = {\"max\" : 1}):\n", - " substances = None\n", - " headers = {'Accept': 'application/json'}\n", - " result = requests.get(url,params=params,headers=headers)\n", - " if result.status_code==200:\n", - " response = result.json()\n", - " substances = Substances.model_construct(**response)\n", - " for substance in substances.substance:\n", - " url_study = \"{}/study\".format(substance.URI)\n", - " study = requests.get(url_study,headers=headers)\n", - " if study.status_code==200:\n", - " response_study = study.json()\n", - " substance.study = Study.model_construct(**response_study).study\n", - "\n", - " return substances\n", - "\n", - "def write_studies_nexus(substances):\n", - " for substance in substances.substance:\n", - " for study in substance.study:\n", - " file = os.path.join(tempfile.gettempdir(), \"study_{}.nxs\".format(study.uuid))\n", - " nxroot = nx.NXroot()\n", - " try:\n", - " study.to_nexus(nxroot)\n", - " nxroot.save(file, mode=\"w\")\n", - " except Exception as err:\n", - " #print(\"error\",file,str(err))\n", - " print(file)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "try:\n", - " substances = query(params = {\"max\" : 10}) \n", - " _json = substances.model_dump(exclude_none=True)\n", - " new_substances = Substances.model_construct(**_json)\n", - " #test roundtrip\n", - " assert substances 
== new_substances\n", - "\n", - " file = os.path.join(tempfile.gettempdir(), \"remote.json\")\n", - " print(file)\n", - " with open(file, 'w', encoding='utf-8') as file:\n", - " file.write(substances.model_dump_json(exclude_none=True))\n", - " write_studies_nexus(substances)\n", - "except Exception as x:\n", - " print(x)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.12.5" - } + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from pyambit.datamodel import Substances, Study \n", + "import nexusformat.nexus.tree as nx\n", + "import os.path\n", + "import tempfile\n", + "# to_nexus is not added without this import\n", + "from pyambit import nexus_writer\n", + "import json\n", + "from IPython.display import display, HTML" + ] }, - "nbformat": 4, - "nbformat_minor": 2 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def query(url = \"https://apps.ideaconsult.net/gracious/substance/\" ,params = {\"max\" : 1}):\n", + " substances = None\n", + " headers = {'Accept': 'application/json'}\n", + " result = requests.get(url,params=params,headers=headers)\n", + " if result.status_code==200:\n", + " response = result.json()\n", + " substances = Substances.model_construct(**response)\n", + " for substance in substances.substance:\n", + " url_study = \"{}/study?max=10000\".format(substance.URI)\n", + " study = requests.get(url_study,headers=headers)\n", + " if study.status_code==200:\n", + " response_study = study.json()\n", + " substance.study = Study.model_construct(**response_study).study\n", + " #break\n", + "\n", + " return substances\n", + "\n", + "def write_studies_nexus(substances, single_file=True):\n", + " if single_file:\n", + " nxroot = nx.NXroot()\n", + " substances.to_nexus(nxroot)\n", + " file = os.path.join(tempfile.gettempdir(), \"remote.nxs\")\n", + " print(file)\n", + " nxroot.save(file, mode=\"w\")\n", + " else: \n", + " for substance in substances.substance:\n", + " for study in substance.study:\n", + " file = os.path.join(tempfile.gettempdir(), \"study_{}.nxs\".format(study.uuid))\n", + " print(file)\n", + " nxroot = nx.NXroot()\n", + " try:\n", + " study.to_nexus(nxroot)\n", + " nxroot.save(file, mode=\"w\")\n", + " except Exception as err:\n", + " #print(\"error\",file,str(err))\n", + " print(file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import traceback" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://apps.ideaconsult.net/gracious/substance/\"\n", + "#url = \"http://localhost:9090/ambit2/substance/\"\n", + "#url = \"http://localhost:9090/ambit2/substance/POLY-e02442cc-8f7c-3a71-82cf-7df5888a4bfa\"\n", + "#url = \"http://localhost:9090/ambit2/substance/POLY-25d13fa6-c18b-35c8-b0f6-7325f5f3e505\"\n", + "try:\n", + " substances = query(url=url,params = {\"max\" : 1}) \n", + " _json = substances.model_dump(exclude_none=True)\n", + " new_substances = Substances.model_construct(**_json)\n", + " #test roundtrip\n", + " assert substances == new_substances\n", + "\n", + " file = os.path.join(tempfile.gettempdir(), \"remote.json\")\n", + " 
print(file)\n", + " with open(file, 'w', encoding='utf-8') as file:\n", + " file.write(substances.model_dump_json(exclude_none=True))\n", + " \n", + " for s in substances.substance:\n", + " for pa in s.study:\n", + " effectarrays_only, df = pa.convert_effectrecords2array()\n", + " display(df.dropna(axis=1,how=\"all\"))\n", + " print(effectarrays_only)\n", + " #break\n", + " #write_studies_nexus(substances, single_file=False)\n", + "except Exception as x:\n", + " traceback.print_exc()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/examples/test.py b/examples/test.py deleted file mode 100644 index cc7945c..0000000 --- a/examples/test.py +++ /dev/null @@ -1,3 +0,0 @@ -import pyambit - -print("test") diff --git a/src/pyambit/datamodel.py b/src/pyambit/datamodel.py index d96d62d..b5ab3ae 100644 --- a/src/pyambit/datamodel.py +++ b/src/pyambit/datamodel.py @@ -20,7 +20,7 @@ model_validator, ) -from pyambit.ambit_deco import add_ambitmodel_method +from pyambit.ambit_deco import add_ambitmodel_method # noqa: F401 class AmbitModel(BaseModel): @@ -83,7 +83,14 @@ def model_construct(cls, **data: Any) -> "Protocol": return super().model_construct(**data) def __repr__(self): - return f"Protocol(topcategory={self.topcategory!r}, category={self.category!r}, endpoint={self.endpoint!r}, guideline={self.guideline!r})" + return ( + "Protocol(" + f"topcategory={self.topcategory!r}, " + f"category={self.category!r}, " + f"endpoint={self.endpoint!r}, " + f"guideline={self.guideline!r}" + ")" + ) def __eq__(self, other): if not isinstance(other, Protocol): @@ -139,15 +146,15 @@ def __repr__(self): EffectResult = create_model("EffectResult", __base__=EffectResult) -class ValueArray(AmbitModel): +class BaseValueArray(AmbitModel): unit: Optional[str] = None # the arrays can in fact contain strings, we don't need textValue! 
values: Union[npt.NDArray, None] = None errQualifier: Optional[str] = None errorValue: Optional[Union[npt.NDArray, None]] = None # but loValue - upValue need some support - # also loValue + textValue as used in composition data - auxiliary: Optional[Dict[str, npt.NDArray]] = None + # also loValue + textValue as used in composition / analytics data + # See ValueArray model_config = ConfigDict(arbitrary_types_allowed=True) @@ -158,14 +165,25 @@ def create( unit: str = None, errorValue: npt.NDArray = None, errQualifier: str = None, - auxiliary: Dict[str, npt.NDArray] = None, ): return cls( - values=values, - unit=unit, - errorValue=errorValue, - errQualifier=errQualifier, - auxiliary=auxiliary, + values=values, unit=unit, errorValue=errorValue, errQualifier=errQualifier + ) + + @classmethod + def model_construct(cls, **data): + def deserialize(value): + if isinstance(value, list): + return np.array(value) # Convert lists back to numpy arrays + return value + + values = deserialize(data.get("values")) + unit = data.get("unit") + errQualifier = data.get("errQualifier") + errorValue = deserialize(data.get("errorValue")) + + return cls( + values=values, unit=unit, errQualifier=errQualifier, errorValue=errorValue ) def model_dump_json(self, **kwargs) -> str: @@ -179,27 +197,154 @@ def serialize(obj): return json.dumps(model_dict, default=serialize, **kwargs) def __eq__(self, other): - def compare_auxiliary(aux1, aux2): - if aux1 is aux2: - return True - if aux1 is None or aux2 is None: - return False - if aux1.keys() != aux2.keys(): - return False - return all(np.array_equal(aux1[k], aux2[k]) for k in aux1) - - if not isinstance(other, ValueArray): + if not isinstance(other, BaseValueArray): return False return ( self.unit == other.unit and self.errQualifier == other.errQualifier and np.array_equal(self.values, other.values) - and compare_auxiliary(self.auxiliary, other.auxiliary) and np.array_equal(self.errorValue, other.errorValue) ) +class MetaValueArray(BaseValueArray): + conditions: Optional[Dict[str, str]] = None + model_config = ConfigDict(arbitrary_types_allowed=True) + + @classmethod + def create( + cls, + values: npt.NDArray = None, + unit: str = None, + errorValue: npt.NDArray = None, + errQualifier: str = None, + conditions: Optional[Dict[str, str]] = None, + ): + return cls( + values=values, + unit=unit, + errorValue=errorValue, + errQualifier=errQualifier, + conditions=conditions, + ) + + @classmethod + def model_construct(cls, **data): + base_instance = super().model_construct(**data) + conditions = data.get("conditions", None) + return cls( + values=base_instance.values, + unit=base_instance.unit, + errorValue=base_instance.errorValue, + errQualifier=base_instance.errQualifier, + conditions=conditions, + ) + + def model_dump_json(self, **kwargs) -> str: + def serialize(obj): + if isinstance(obj, np.ndarray): + return obj.tolist() # Convert NumPy arrays to lists + raise TypeError(f"Type {type(obj).__name__} not serializable") + + model_dict = self.model_dump() + return json.dumps(model_dict, default=serialize, **kwargs) + + def __eq__(self, other): + if not isinstance(other, MetaValueArray): + return False + return super().__eq__(other) and self.conditions == other.conditions + + +class ValueArray(MetaValueArray): + auxiliary: Optional[Dict[str, Union[npt.NDArray, "MetaValueArray"]]] = None + model_config = ConfigDict(arbitrary_types_allowed=True) + + @classmethod + def create( + cls, + values: npt.NDArray = None, + unit: str = None, + errorValue: npt.NDArray = None, + errQualifier: 
str = None, + conditions: Optional[Dict[str, str]] = None, + auxiliary: Dict[str, Union[npt.NDArray, "MetaValueArray"]] = None, + ): + return cls( + values=values, + unit=unit, + errorValue=errorValue, + errQualifier=errQualifier, + conditions=conditions, + auxiliary=auxiliary, + ) + + @classmethod + def model_construct(cls, **data): + def deserialize(value): + if isinstance(value, list): + return np.array(value) # Convert lists back to numpy arrays + return value + + base_data = {k: deserialize(v) for k, v in data.items() if k != "auxiliary"} + base_instance = MetaValueArray.model_construct(**base_data) + auxiliary_data = data.get("auxiliary", {}) + + if auxiliary_data is not None: + auxiliary = {} + for key, value in auxiliary_data.items(): + if isinstance( + value, dict + ): # Check if it's a dictionary representing a MetaValueArray + auxiliary[key] = MetaValueArray.model_construct(**value) + else: + auxiliary[key] = deserialize(value) + else: + auxiliary = None + + return cls( + values=base_instance.values, + unit=base_instance.unit, + errQualifier=base_instance.errQualifier, + errorValue=base_instance.errorValue, + conditions=base_instance.conditions, + auxiliary=auxiliary, + ) + + def model_dump(self): + base_dict = super().model_dump() + return {**base_dict, "auxiliary": self.auxiliary} + + def __eq__(self, other): + if not isinstance(other, ValueArray): + return False + return super().__eq__(other) and self.compare_auxiliary( + self.auxiliary, other.auxiliary + ) + + @staticmethod + def compare_auxiliary(aux1, aux2): + if aux1 is aux2: + return True + if aux1 is None or aux2 is None: + return False + if aux1.keys() != aux2.keys(): + return False + return all(np.array_equal(aux1[k], aux2[k]) for k in aux1) + + def model_dump_json(self, **kwargs) -> str: + def serialize(obj): + if isinstance(obj, np.ndarray): + return obj.tolist() # Convert NumPy arrays to lists + if isinstance(obj, MetaValueArray): + return obj.model_dump() # Serialize BaseValueArray to a dictionary + raise TypeError(f"Type {type(obj).__name__} not serializable") + + model_dict = self.model_dump() + return json.dumps(model_dict, default=serialize, **kwargs) + + class EffectRecord(AmbitModel): + nx_name: Optional[str] = None endpoint: str endpointtype: Optional[str] = None result: EffectResult = None @@ -329,10 +474,16 @@ def __eq__(self, other): def __repr__(self): return ( - f"EffectRecord(endpoint={self.endpoint!r}, endpointtype={self.endpointtype!r}, " - f"result={self.result!r}, conditions={self.conditions!r}, " - f"idresult={self.idresult!r}, endpointGroup={self.endpointGroup!r}, " - f"endpointSynonyms={self.endpointSynonyms!r}, sampleID={self.sampleID!r})" + "EffectRecord(" + f"endpoint={self.endpoint!r}, " + f"endpointtype={self.endpointtype!r}, " + f"result={self.result!r}, " + f"conditions={self.conditions!r}, " + f"idresult={self.idresult!r}, " + f"endpointGroup={self.endpointGroup!r}, " + f"endpointSynonyms={self.endpointSynonyms!r}, " + f"sampleID={self.sampleID!r}" + ")" ) @@ -388,20 +539,23 @@ def model_construct(cls, **data: Any) -> "EffectArray": isinstance(a, str) for a in alternatives ): raise ValueError( - f"Alternative axes for '{primary_axis}' should be a list of strings." + f"Alternative axes for '{primary_axis}' should be a list of " + "strings." ) # Ensure all alternative axes are present in 'axes' if primary_axis not in data["axes"]: raise ValueError( - f"Primary axis '{primary_axis}' in axis_groups must be a key in axes." 
+ f"Primary axis '{primary_axis}' in axis_groups must be a key " + "in axes." ) # Validate that each alternative axis exists in 'axes' for alt_axis in alternatives: if alt_axis not in data["axes"]: raise ValueError( - f"Alternative axis '{alt_axis}' in axis_groups must be a key in axes." + f"Alternative axis '{alt_axis}' in axis_groups must be a " + "key in axes." ) new_axis_groups[primary_axis] = alternatives @@ -421,12 +575,19 @@ def __eq__(self, other): ) def __repr__(self): + repr_endpointtype = repr(self.endpointtype) if self.endpointtype else "" repr_signal = repr(self.signal) if self.signal else "None" repr_axes = repr(self.axes) if self.axes else "None" repr_axis_groups = repr(self.axis_groups) if self.axis_groups else "None" return ( - f"EffectArray(signal={repr_signal}, axes={repr_axes}, " - f"axis_groups={repr_axis_groups}, {super().__repr__()})" + "EffectArray(" + f"endpoint={self.endpoint}, " + f"endpointtype={repr_endpointtype}, " + f"signal={repr_signal}, " + f"axes={repr_axes}, " + f"axis_groups={repr_axis_groups}, " + f"{super().__repr__()}" + ")" ) @@ -463,9 +624,13 @@ def __eq__(self, other): def __repr__(self): return ( - f"ProtocolEffectRecord(protocol={self.protocol}, documentUUID={self.documentUUID}, " - f"studyResultType={self.studyResultType}, interpretationResult={self.interpretationResult}, " - f"{super().__repr__()})" + "ProtocolEffectRecord(" + f"protocol={self.protocol}, " + f"documentUUID={self.documentUUID}, " + f"studyResultType={self.studyResultType}, " + f"interpretationResult={self.interpretationResult}, " + f"{super().__repr__()}" + ")" ) @@ -505,12 +670,14 @@ def __eq__(self, other): def __repr__(self): return ( - f"ReliabilityParams(r_isRobustStudy={self.r_isRobustStudy}, " + "ReliabilityParams(" + f"r_isRobustStudy={self.r_isRobustStudy}, " f"r_isUsedforClassification={self.r_isUsedforClassification}, " f"r_isUsedforMSDS={self.r_isUsedforMSDS}, " f"r_purposeFlag={self.r_purposeFlag}, " f"r_studyResultType={self.r_studyResultType}, " - f"r_value={self.r_value})" + f"r_value={self.r_value}" + ")" ) @@ -533,7 +700,13 @@ def __eq__(self, other): ) def __repr__(self): - return f"Citation(year={self.year}, title={self.title}, " f"owner={self.owner})" + return ( + "Citation(" + f"year={self.year}, " + f"title={self.title}, " + f"owner={self.owner}" + ")" + ) Citation = create_model("Citation", __base__=Citation) @@ -625,6 +798,7 @@ class ProtocolApplication(AmbitModel): """ uuid: Optional[str] = None + nx_name: Optional[str] = None # reliability: Optional[ReliabilityParams] interpretationResult: Optional[str] = None interpretationCriteria: Optional[str] = None @@ -725,13 +899,19 @@ def __eq__(self, other): def __repr__(self): return ( - f"ProtocolApplication(uuid={self.uuid!r}, " + "ProtocolApplication(" + f"uuid={self.uuid!r}, " f"interpretationResult={self.interpretationResult!r}, " f"interpretationCriteria={self.interpretationCriteria!r}, " - f"parameters={self.parameters!r}, citation={self.citation!r}, " - f"effects={self.effects!r}, owner={self.owner!r}, " - f"protocol={self.protocol!r}, investigation_uuid={self.investigation_uuid!r}, " - f"assay_uuid={self.assay_uuid!r}, updated={self.updated!r})" + f"parameters={self.parameters!r}, " + f"citation={self.citation!r}, " + f"effects={self.effects!r}, " + f"owner={self.owner!r}, " + f"protocol={self.protocol!r}, " + f"investigation_uuid={self.investigation_uuid!r}, " + f"assay_uuid={self.assay_uuid!r}, " + f"updated={self.updated!r}" + ")" ) def create_multidimensional_matrix( @@ -779,13 +959,22 @@ def 
create_multidimensional_matrix( # Determine the shape of the multidimensional matrix shape = tuple(len(values) for values in axis_values) # Initialize the multidimensional matrix with NaNs - matrix = np.full(shape, "" if signal_col == "textValue" else np.nan) + if signal_col == "textValue": + matrix = np.full(shape, "") + else: + matrix = np.full(shape, np.nan) matrix_errors = None if errors_col is None else np.full(shape, np.nan) auxsignals = {} if auxsignal_cols: for a in auxsignal_cols: - auxsignals[a] = np.full(shape, "" if a == "textValue" else np.nan) + if a == "textValue": + _arr = np.empty(shape, dtype=object) + if len(shape) > 0: + _arr[:] = "" + auxsignals[a] = _arr + else: + auxsignals[a] = np.full(shape, np.nan) # Populate the matrix with signal values for _, row in df.iterrows(): @@ -804,10 +993,14 @@ def create_multidimensional_matrix( if auxsignal_cols: for a in auxsignal_cols: if not pd.isna(row[a]): - auxsignals[a][indices] = row[a] - except Exception as x: - print("matrix", self.uuid) - print(row) + if isinstance(row[a], bytes): + auxsignals[a][indices] = row[a].decode("utf-8") + else: + auxsignals[a][indices] = row[a] + except: # noqa: B001,E722 FIXME + # print("matrix", self.uuid) + # print(row) + print(axis_indices) print(primary_axis_cols) print(traceback.format_exc()) @@ -815,9 +1008,10 @@ def create_multidimensional_matrix( unique_values = sorted(df[axis].unique()) axes[axis].values = unique_values - # Collect alternative axis values - tbd - sorting may change order of alternative axes! + # Collect alternative axis values - tbd - sorting may change order of + # alternative axes! if alt_axes is not None: - for primary_axis, alt_cols in alt_axes.items(): + for _primary_axis, alt_cols in alt_axes.items(): for alt_col in alt_cols: if alt_col in df.columns: _tmp = sorted(df[alt_col].unique()) @@ -843,8 +1037,10 @@ def convert_effectrecords2array(self): if len(_nonnumcols) > 0: df_set = split_df_by_columns(_df, _nonnumcols) # debug + # here the null columns (e.g. 
replicates) are lost + # print(df_set) - for key, df in df_set.items(): + for _key, df in df_set.items(): # df.to_excel("{}_{}.xlsx".format(self.uuid,key),index=False) for endpointtype in df["endpointtype"].unique(): @@ -901,7 +1097,9 @@ def convert_effectrecords2array(self): try: _f["loValue"] = _f["loValue"].fillna(_tmp[_col]) except Exception as x: - # print(_f['loValue'].apply(type).value_counts()) + # print( + # _f['loValue'].apply(type).value_counts() + # ) print(x) print(_col, _f["loValue"], self.uuid) @@ -922,20 +1120,20 @@ def convert_effectrecords2array(self): if _tmp["loValue"].dropna().empty else transform_array(_tmp["loValue"].values) ) - loQualifier = ( - None - if _tmp["loQualifier"].dropna().empty - else transform_array(_tmp["loQualifier"].values) - ) - upQualifier = ( - None - if _tmp["upQualifier"].dropna().empty - else transform_array(_tmp["upQualifier"].values) - ) - - errqualifier = _tmp["errQualifier"].unique()[ - 0 - ] # if _tmp["errQualifier"].nunique() == 1 else _tmp["errQualifier"] + # _loQualifier = ( + # None + # if _tmp["loQualifier"].dropna().empty + # else transform_array(_tmp["loQualifier"].values) + # ) + # _upQualifier = ( + # None + # if _tmp["upQualifier"].dropna().empty + # else transform_array(_tmp["upQualifier"].values) + # ) + + errqualifier = _tmp["errQualifier"].unique()[0] + # if _tmp["errQualifier"].nunique() == 1 + # else _tmp["errQualifier"] # df_axes["loValue"] = loValues auxsignal_cols = [] @@ -953,40 +1151,70 @@ def convert_effectrecords2array(self): auxsignal_cols.append(tag) df_axes[tag] = _values - if _tmp["errorValue"].dropna().empty: - error_col = None + if df_axes.isna().any().any(): + # for some reason there are still nan values + axes_all = [] + nan_columns = df_axes.columns[df_axes.isna().any()].tolist() + df_axes_nan = df_axes[ + df_axes[nan_columns].isna().any(axis=1) + ] + df_axes_nan = df_axes_nan.dropna(axis=1, how="all") + df_axes_not_nan = df_axes[ + df_axes[nan_columns].notna().all(axis=1) + ] + if not df_axes_not_nan.empty: + axes_all.append(df_axes_not_nan) + # print(print(df_axes_not_nan)) + if not df_axes_nan.empty: + # ignore for now + # axes_all.append(df_axes_nan) + print(df_axes_nan) else: - error_col = "errorValue" - df_axes[error_col] = _tmp[error_col] - - matrix, axes, matrix_errors, auxsignals = ( - self.create_multidimensional_matrix( - df_axes, - signal_col, - axes, - alt_axes, - error_col, - auxsignal_cols, + axes_all = [df_axes] + + for df_axes in axes_all: + if _tmp["errorValue"].dropna().empty: + error_col = None + else: + error_col = "errorValue" + df_axes[error_col] = _tmp[error_col] + + matrix, axes, matrix_errors, auxsignals = ( + self.create_multidimensional_matrix( + df_axes, + signal_col, + axes, + alt_axes, + error_col, + auxsignal_cols, + ) ) - ) - - earray = EffectArray( - endpoint=endpoint, - endpointtype=endpointtype, - conditions=new_conditions, - signal=ValueArray( - unit=unit, - # values=textValue if loValues is None else loValues, - values=matrix, - errQualifier=errqualifier, - errorValue=matrix_errors, - auxiliary=auxsignals, - ), - axes=axes, - axis_groups=alt_axes, - ) - arrays.append(earray) - # print(earray) + # Remove items where the value is None or NaN + new_conditions = { + k: v + for k, v in new_conditions.items() + if v is not None + and not (isinstance(v, float) and np.isnan(v)) + } + + earray = EffectArray( + endpoint=endpoint, + endpointtype=endpointtype, + conditions=new_conditions, + signal=ValueArray( + unit=unit, + # values=textValue if loValues is None + # else loValues, + 
values=matrix, + errQualifier=errqualifier, + errorValue=matrix_errors, + auxiliary=auxsignals, + ), + axes=axes, + axis_groups=alt_axes, + ) + arrays.append(earray) + # print(earray) return arrays, _df @@ -1008,7 +1236,7 @@ class Study(AmbitModel): papps = Study(**parsed_json) for papp in papps: print(papp) - """ + """ # noqa: B950 study: List[ProtocolApplication] @@ -1123,9 +1351,17 @@ def __eq__(self, other: Any) -> bool: def __repr__(self) -> str: return ( - f"Compound(URI={self.URI}, structype={self.structype}, metric={self.metric}, " - f"name={self.name}, cas={self.cas}, einecs={self.einecs}, " - f"inchikey={self.inchikey}, inchi={self.inchi}, formula={self.formula})" + "Compound(" + f"URI={self.URI}, " + f"structype={self.structype}, " + f"metric={self.metric}, " + f"name={self.name}, " + f"cas={self.cas}, " + f"einecs={self.einecs}, " + f"inchikey={self.inchikey}, " + f"inchi={self.inchi}, " + f"formula={self.formula}" + ")" ) @@ -1316,10 +1552,19 @@ def __eq__(self, other): def __repr__(self): return ( - f"SubstanceRecord(URI={self.URI}, ownerUUID={self.ownerUUID}, ownerName={self.ownerName}, " - f"i5uuid={self.i5uuid}, name={self.name}, publicname={self.publicname}, format={self.format}, " - f"substanceType={self.substanceType}, referenceSubstance={self.referenceSubstance}, " - f"study={self.study}, composition={self.composition})" + "SubstanceRecord(" + f"URI={self.URI}, " + f"ownerUUID={self.ownerUUID}, " + f"ownerName={self.ownerName}, " + f"i5uuid={self.i5uuid}, " + f"name={self.name}, " + f"publicname={self.publicname}, " + f"format={self.format}, " + f"substanceType={self.substanceType}, " + f"referenceSubstance={self.referenceSubstance}, " + f"study={self.study}, " + f"composition={self.composition}" + ")" ) @@ -1410,7 +1655,7 @@ def transform_array(arr): if any_strings: try: return pd.to_numeric(arr, errors="raise") - except Exception as e: + except Exception: _converted = np.array( [ ( @@ -1487,18 +1732,19 @@ def is_string_only(series): # Check if all values in the series are either strings or NaN return series.apply(lambda x: isinstance(x, str) or pd.isna(x)).all() - # Use list comprehension to check if each column is string only and cannot be converted to numeric + # Use list comprehension to check if each column is string only and cannot be + # converted to numeric. 
string_only_cols = [ col for col in object_cols if is_string_only(df[col]) and pd.to_numeric(df[col], errors="coerce").isna().all() ] - + # print(string_only_cols) return string_only_cols -def split_df_by_columns(df, columns): +def split_df_by_columns_bad_with_nans(df, columns): # Create a dictionary to hold the split DataFrames split_dfs = {} @@ -1517,3 +1763,29 @@ def split_df_by_columns(df, columns): split_dfs[key] = split_df return split_dfs + + +def split_df_by_columns(df, columns): + # Create a dictionary to hold the split DataFrames + split_dfs = {} + + # Identify unique combinations of values for the specified columns + unique_combinations = df[columns].drop_duplicates() + + for _, row in unique_combinations.iterrows(): + # Create a filter condition that treats NaN as equal + filter_condition = pd.DataFrame( + { + col: (df[col] == row[col]) | (pd.isna(df[col]) & pd.isna(row[col])) + for col in columns + } + ).all(axis=1) + + # Create a new DataFrame for this combination + split_df = df[filter_condition] + + # Use a tuple of the unique values as the key, treating NaN gracefully + key = tuple(row) + split_dfs[key] = split_df + + return split_dfs diff --git a/src/pyambit/nexus_parser.py b/src/pyambit/nexus_parser.py index 1af541e..2a006a0 100644 --- a/src/pyambit/nexus_parser.py +++ b/src/pyambit/nexus_parser.py @@ -1,113 +1,211 @@ -import h5py -import ramanchada2 as rc2 - - -class NexusParser: - def __init__(self): - self.parsed_objects = {} - - def parse_data(self, entry, default=False, nxprocess=False): - for attr in entry.attrs: - print(attr, entry.attrs.get(attr)) - for _, item in entry.items(): - nx_class = item.attrs.get("NX_class", None) - print("PROCESSED " if nxprocess else "", "DATA ", item.name, " ", nx_class) - - def parse_entry(self, entry, nxprocess=False, dataparser=None): - print(dataparser) - nx_class = entry.attrs.get("NX_class", None) - default = entry.attrs.get("default", None) - # print(entry.name, ' ', nx_class, default) - for _, item in entry.items(): - nx_class = item.attrs.get("NX_class", None) - if nx_class == "NXdata": - if dataparser is None: - self.parse_data(item, entry.name == default, nxprocess) - else: - print("dataparsre", dataparser) - dataparser(item, entry.name == default, nxprocess) - - elif nx_class == "NXenvironment": - pass - elif nx_class == "NXinstrument": - pass - elif nx_class == "NXcite": - pass - elif nx_class == "NXcollection": - pass - elif nx_class == "NXnote": - pass - elif nx_class == "NXsample": - self.parse_sample(item) - else: - print("ENTRY ", item.name, " ", nx_class) +import traceback +from typing import Dict - def parse_sample(self, group): - nx_class = group.attrs.get("NX_class", None) - if nx_class == "NXsample_component": - pass - else: - print(group.name, " ", nx_class) +import nexusformat.nexus as nx - def parse(self, file_path: str, dataparser=None): - with h5py.File(file_path, "r") as file: - self.parse_h5(file, dataparser) +from pyambit.datamodel import ( + Citation, + EffectRecord, + EffectResult, + EndpointCategory, + Protocol, + ProtocolApplication, + SampleLink, + SubstanceRecord, + Substances, + Value, +) - def parse_h5(self, h5_file, dataparser=None): - try: - def iterate_groups(group, indent="", nxprocess=False): - nx_class = group.attrs.get("NX_class", None) - if nx_class == "NXentry" or nx_class == "NXsubentry": - self.parse_entry(group, nxprocess, dataparser) - elif nx_class == "NXsample": - self.parse_sample(group) - - else: - for name, item in group.items(): - nx_class = item.attrs.get("NX_class", None) - 
if isinstance(item, h5py.Group):
-                    # print(indent + 'Group:', name, ' ', nx_class)
-                    # Recursively call the function for nested groups
-                    iterate_groups(
-                        item,
-                        indent + "  ",
-                        nxprocess or nx_class == "NX_process",
-                    )
-                else:
-                    print(indent + "Dataset:", name, " ", nx_class)
-
-            # Start the iteration from the root of the file
-            iterate_groups(h5_file)
-        except Exception as err:
-            print(err)
+class Nexus2Ambit:
+
+    def __init__(self, domain: str, index_only: bool):
+        self.substances: Dict[str, SubstanceRecord] = {}
+        self.domain = domain
+        self.index_only = index_only
+
+    def __enter__(self):
+        self.clear()
+        return self

-class SpectrumParser(NexusParser):
-    def __init__(self):
-        super().__init__()
-        # Replace the parent class field with the spectrum-specific field
-        self.parsed_objects = {}
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Any cleanup code, if needed
+        pass
+
+    def clear(self):
+        self.substances = {}
+
+    def substance_from_nexus(self, nxentry: nx.NXentry) -> SubstanceRecord:
+        try:
+            record = SubstanceRecord(
+                URI=None,
+                ownerUUID=nxentry.attrs["owner-uuid"],
+                ownerName=nxentry.attrs["ownerName"],
+                i5uuid=nxentry.attrs["uuid"],
+                name=nxentry["name"].nxdata,
+                publicname=nxentry.attrs["publicname"],
+                format="NeXus",
+                substanceType="CHEBI_59999",
+                referenceSubstance=None,
+                study=[],
+                composition=None,
+            )
+            return record
+        except Exception as err:
+            print(traceback.format_exc())
+            raise err
+
+    def parse_substances(self, nxentry: nx.NXentry):
+        for _entry_name, entry in nxentry.items():
+            if isinstance(entry, nx.NXsample):
+                record: SubstanceRecord = self.substance_from_nexus(entry)
+                if record.i5uuid not in self.substances:
+                    self.substances[record.i5uuid] = record
+
+    def parse_studies(self, nxroot: nx.NXroot, relative_path: str):
+        for entry_name, entry in nxroot.items():
+            if entry_name != "substance":
+                papp: ProtocolApplication = self.parse_entry(entry, relative_path)
+                if papp.owner.substance.uuid in self.substances:
+                    self.substances[papp.owner.substance.uuid].study.append(papp)
+
+    def parse(self, nxroot: nx.NXroot, relative_path: str):
+        for entry_name, entry in nxroot.items():
+            if entry_name == "substance":
+                self.parse_substances(entry)
+        self.parse_studies(nxroot, relative_path)
+
+    def get_substances(self):
+        return Substances(substance=self.substances.values())
+
+    def parse_entry(
+        self, nxentry: nx.NXentry, relative_path: str
+    ) -> ProtocolApplication:
+        dox = nxentry.get("experiment_documentation", None)
+        protocol = None
+        parameters = {}
+        if dox is not None:
+            _protocol = dox.get("protocol", None)
+            if _protocol is None:
+                pass
+            else:
+                protocol = Protocol(
+                    topcategory=_protocol.attrs["topcategory"],
+                    category=EndpointCategory(code=_protocol.attrs["code"]),
+                    endpoint=(
+                        _protocol.attrs["endpoint"]
+                        if "endpoint" in _protocol.attrs
+                        else None
+                    ),
+                    guideline=[_protocol.attrs["guideline"]],
+                )
+        if protocol is None:
+            if nxentry["definition"].nxvalue == "NXraman":
+                protocol = Protocol(
+                    "P-CHEM", "ANALYTICAL_METHODS_SECTION", "", ["Raman spectroscopy"]
+                )
+                parameters["E.method"] = nxentry["definition"].nxvalue
+            else:
+                protocol = Protocol("P-CHEM", "UNKNOWN", "", ["UNKNOWN"])

-    def parse_data(self, entry, default=False, nxprocess=False):
+        _reference = nxentry.get("reference")
+        citation = Citation(
+            year=_reference["year"].nxdata,
+            title=_reference["title"].nxdata,
+            owner=_reference["owner"].nxdata,
+        )

-        signal = entry.attrs.get("signal", None)
-        # interpretation = 
entry.attrs.get("interpretation", None) - axes = entry.attrs.get("axes", None) - # print(default,signal,interpretation,axes,isinstance(entry[signal], h5py.Dataset)) - y = entry[signal][:] - for axis in axes: - x = entry[axis][:] - break - spe = rc2.spectrum.Spectrum(x=x, y=y) - self.parsed_objects[str(entry)] = spe + try: + wl = nxentry["instrument/beam_incident/wavelength"].nxdata + wl_unit = nxentry["instrument/beam_incident/wavelength"].attrs["unit"] + parameters["wavelength"] = Value(loValue=wl, unit=wl_unit) + except: # noqa: B001,E722 FIXME + parameters["wavelength"] = None + try: + instrument_model = nxentry["instrument/device_information/model"].nxvalue + instrument_vendor = nxentry["instrument/device_information/vendor"].nxvalue + parameters["instrument"] = "{} {}".format( + instrument_vendor, instrument_model + ) + except: # noqa: B001,E722 FIXME + pass -# spectrum_parser = SpectrumParser() -# spectrum_parser.parse(file_path) + try: + parameters["E.method"] = nxentry[ + "experiment_documentation/E.method" + ].nxvalue + except Exception: + parameters["E.method"] = nxentry["definition"].nxvalue -# Access the spectrum data -# for key in spectrum_parser.parsed_objects: -# spe = spectrum_parser.parsed_objects[key] -# print("Spectrum data", key, spe) -# spe.plot() + # the sample + try: + _owner = SampleLink.create( + sample_uuid=nxentry["sample/substance"].attrs["uuid"], + sample_provider=nxentry["sample/provider"].nxdata, + ) + except Exception as err: + raise ValueError(err) + + papp: ProtocolApplication = ProtocolApplication( + uuid=nxentry.get("entry_identifier_uuid").nxvalue, + interpretationResult=None, + interpretationCriteria=None, + parameters=parameters, + citation=citation, + effects=[], + owner=_owner, + protocol=protocol, + investigation_uuid=nxentry.get("collection_identifier").nxvalue, + assay_uuid=nxentry.get("experiment_identifier").nxvalue, + updated=None, + ) + for endpointtype_name, enddpointtype_group in nxentry.items(): + + if isinstance(enddpointtype_group, nx.NXsample): + continue + elif isinstance(enddpointtype_group, nx.NXcite): + continue + elif isinstance(enddpointtype_group, nx.NXinstrument): + continue + elif isinstance(enddpointtype_group, nx.NXcollection): + continue + elif isinstance(enddpointtype_group, nx.NXenvironment): + continue + elif isinstance(enddpointtype_group, nx.NXnote): + continue + elif isinstance(enddpointtype_group, nx.NXgroup): + pass + elif isinstance(enddpointtype_group, nx.NXprocess): + pass + else: + continue + for _name_data, data in enddpointtype_group.items(): + if isinstance(data, nx.NXdata): + if self.index_only: + papp.effects.append( + self.parse_effect(endpointtype_name, data, relative_path) + ) + else: + raise NotImplementedError("Not implemented") + + return papp + + def parse_effect( + self, endpointtype_name, data: nx.NXentry, relative_path: str + ) -> EffectRecord: + if self.index_only: + return EffectRecord( + endpoint=data.attrs["signal"], + endpointtype=endpointtype_name, + result=EffectResult( + textValue="{}/{}#{}".format(self.domain, relative_path, data.nxpath) + ), + conditions={}, + idresult=None, + endpointGroup=None, + endpointSynonyms=[], + sampleID=None, + ) + else: + raise NotImplementedError("Not implemented") diff --git a/src/pyambit/nexus_spectra.py b/src/pyambit/nexus_spectra.py index 7ec70d7..1315ea2 100644 --- a/src/pyambit/nexus_spectra.py +++ b/src/pyambit/nexus_spectra.py @@ -1,13 +1,14 @@ import uuid from datetime import datetime -from typing import Dict, List, Union +from typing import Dict 
import nexusformat.nexus.tree as nx
 import numpy as np
 import numpy.typing as npt

 import pyambit.datamodel as mx
-from pyambit.nexus_writer import to_nexus
+
+from pyambit.nexus_writer import to_nexus  # noqa: F401


 def spe2effect(
@@ -17,9 +18,17 @@ def spe2effect(
     endpointtype="RAW_DATA",
     meta: Dict = None,
 ):
-    data_dict: Dict[str, mx.ValueArray] = {"x": mx.ValueArray(values=x, unit=unit)}
+    try:
+        signal = meta["@signal"]
+    except (KeyError, TypeError):  # TypeError covers the meta=None default
+        signal = "y"
+    try:
+        axes = meta["@axes"]
+    except (KeyError, TypeError):
+        axes = ["x"]  # default axis name for the x data
+    data_dict: Dict[str, mx.ValueArray] = {axes[0]: mx.ValueArray(values=x, unit=unit)}
     return mx.EffectArray(
-        endpoint="y",
+        endpoint=signal,
         endpointtype=endpointtype,
         signal=mx.ValueArray(values=y, unit="count"),
         axes=data_dict,
@@ -28,15 +37,15 @@ def configure_papp(
     papp: mx.ProtocolApplication = None,
-    instrument=None,
+    instrument=("vendor", "model"),
     wavelength=None,
-    provider="FNMT",
+    provider="ABCD",
     sample="PST",
-    sample_provider="CHARISMA",
-    investigation="Round Robin 1",
+    sample_provider="TEST",
+    investigation="My investigation",
     citation: mx.Citation = None,
-    prefix="CRMA",
-    meta=None,
+    prefix="TEST",
+    meta: Dict = None,
 ):
     if papp is None:
         papp = mx.ProtocolApplication(
@@ -57,10 +66,16 @@
         uuid.uuid5(uuid.NAMESPACE_OID, "{} {}".format(investigation, provider))
     )
     papp.parameters = {
-        "E.method": "Raman spectrometry",
-        "wavelength": wavelength,
-        "T.instrument_model": instrument,
+        "/experiment_documentation/E.method": "Raman spectroscopy",
+        "/experiment_type": "Raman spectroscopy",
+        "instrument/beam_incident/wavelength": mx.Value(loValue=wavelength, unit="nm"),
+        "instrument/device_information/vendor": instrument[0],
+        "instrument/device_information/model": instrument[1],
+        "/definition": "NXraman",
     }
+    for key in list((meta or {}).keys()):  # tolerate the meta=None default
+        if not key.startswith("@"):
+            papp.parameters["/parameters/{}".format(key)] = meta[key]
     papp.uuid = "{}-{}".format(
         prefix,
diff --git a/src/pyambit/nexus_writer.py b/src/pyambit/nexus_writer.py
index 88f7da3..78428c4 100644
--- a/src/pyambit/nexus_writer.py
+++ b/src/pyambit/nexus_writer.py
@@ -1,28 +1,78 @@
 import math
-import numbers
 import re
 import traceback
 from typing import Dict, List

 import nexusformat.nexus as nx
 import numpy as np
-import pandas as pd

-from pyambit.ambit_deco import add_ambitmodel_method
+from h5py import string_dtype

-# from pydantic import validate_arguments
+from pyambit.ambit_deco import add_ambitmodel_method
 from pyambit.datamodel import (
     Composition,
     EffectArray,
-    effects2df,
+    MetaValueArray,
     ProtocolApplication,
     Study,
     SubstanceRecord,
     Substances,
     Value,
+    ValueArray,
 )

+# tbd parameterize
+
+
+def param_lookup(prm, value):
+    target = ["environment"]
+    _prmlo = prm.lower()
+    if "instrument" in _prmlo:
+        target = ["instrument"]
+    elif "technique" in _prmlo:
+        target = ["instrument"]
+    elif "wavelength" in _prmlo:
+        target = ["instrument", "beam_incident"]
+    elif "sample" in _prmlo:
+        target = ["sample"]
+    elif "material" in _prmlo:
+        target = ["sample"]
+    elif "dispers" in _prmlo:
+        target = ["sample"]
+    elif "vortex" in _prmlo:
+        target = ["sample"]
+    elif "stirr" in _prmlo:
+        target = ["sample"]
+    elif ("ASSAY" == prm.upper()) or ("E.METHOD" == prm.upper()):
+        target = ["experiment_documentation"]
+    elif "E.SOP_REFERENCE" == prm:
+        target = ["experiment_documentation"]
+    elif "OPERATOR" == prm:
+        target = ["experiment_documentation"]
+    elif prm.startswith("T."):
+        target = ["instrument"]
+    elif prm.startswith("E."):
+        target = 
["environment"] + elif "medium" in _prmlo: + target = ["environment"] + elif "cell" in _prmlo: + target = ["environment"] + elif "well" in _prmlo: + target = ["environment"] + elif "animal" in _prmlo: + target = ["environment"] + elif "EXPERIMENT_END_DATE" == prm: + target = ["end_time"] + elif "EXPERIMENT_START_DATE" == prm: + target = ["start_time"] + elif "__input_file" == prm: + target = ["experiment_documentation"] + else: + target = ["parameters"] + target.append(prm) + return target + @add_ambitmodel_method(ProtocolApplication) def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, hierarchy=False): @@ -56,9 +106,9 @@ def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, hierarchy=Fal try: _categories_collection = "" if hierarchy: - if not papp.protocol.topcategory in nx_root: + if papp.protocol.topcategory not in nx_root: nx_root[papp.protocol.topcategory] = nx.NXgroup() - if not papp.protocol.category.code in nx_root[papp.protocol.topcategory]: + if papp.protocol.category.code not in nx_root[papp.protocol.topcategory]: nx_root[papp.protocol.topcategory][ papp.protocol.category.code ] = nx.NXgroup() @@ -71,12 +121,21 @@ def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, hierarchy=Fal if papp.citation.owner is None else papp.citation.owner.replace("/", "_").upper() ) - except BaseException: + except BaseException: # noqa: B036 FIXME provider = "@" - entry_id = "{}/entry_{}_{}".format(_categories_collection, provider, papp.uuid) - except Exception as err: + if papp.nx_name is None: + entry_id = "{}/{}_{}".format(_categories_collection, provider, papp.uuid) + else: + entry_id = "{}/{}_{}".format( + _categories_collection, + "entry" if papp.nx_name is None else papp.nx_name, + papp.uuid, + ) + except Exception: # print(err) - entry_id = "/entry_{}".format(papp.uuid) + entry_id = "/{}_{}".format( + "entry" if papp.nx_name is None else papp.nx_name, papp.uuid + ) _categories_collection = "{}{}".format(_categories_collection, entry_id) if entry_id not in nx_root: @@ -119,12 +178,25 @@ def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, hierarchy=Fal experiment_documentation["date"] = papp.updated # category = nx.NXgroup() # experiment_documentation["category"] = category - experiment_documentation.attrs["topcategory"] = papp.protocol.topcategory - experiment_documentation.attrs["code"] = papp.protocol.category.code - experiment_documentation.attrs["term"] = papp.protocol.category.term - experiment_documentation.attrs["title"] = papp.protocol.category.title - experiment_documentation.attrs["endpoint"] = papp.protocol.endpoint - experiment_documentation.attrs["guideline"] = papp.protocol.guideline + experiment_documentation["protocol"] = nx.NXcollection() + experiment_documentation["protocol"].attrs[ + "topcategory" + ] = papp.protocol.topcategory + experiment_documentation["protocol"].attrs[ + "code" + ] = papp.protocol.category.code + experiment_documentation["protocol"].attrs[ + "term" + ] = papp.protocol.category.term + experiment_documentation["protocol"].attrs[ + "title" + ] = papp.protocol.category.title + experiment_documentation["protocol"].attrs[ + "endpoint" + ] = papp.protocol.endpoint + experiment_documentation["protocol"].attrs[ + "guideline" + ] = papp.protocol.guideline # definition is usually reference to the Nexus XML definition # ambit category codes and method serve similar role nx_root["{}/definition".format(entry_id)] = ( @@ -134,6 +206,7 @@ def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, 
hierarchy=Fal papp.protocol.guideline, ) ) + if papp.parameters is not None: for tag in ["E.method", "ASSAY"]: if tag in papp.parameters: @@ -156,13 +229,15 @@ def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, hierarchy=Fal nxmap.attrs["PROTOCOL_APPLICATION_UUID"] = "{}/entry_identifier_uuid".format( entry_id ) - nxmap.attrs["INVESTIGATION_UUID"] = "{}/collection_identifier".format(entry_id) - nxmap.attrs["ASSAY_UUID"] = "{}/experiment_identifier".format(entry_id) - nxmap.attrs["Protocol"] = "{}/experiment_documentation".format(entry_id) - nxmap.attrs["Citation"] = "{}/reference".format(entry_id) - nxmap.attrs["Substance"] = "{}/sample".format(entry_id) - nxmap.attrs["Parameters"] = ["instrument", "environment", "parameters"] - nxmap.attrs["EffectRecords"] = "datasets" + + # no need to repeat these, rather make a xml definition and refer to it + # nxmap.attrs["INVESTIGATION_UUID"] = "{}/collection_identifier".format(entry_id) + # nxmap.attrs["ASSAY_UUID"] = "{}/experiment_identifier".format(entry_id) + # nxmap.attrs["Protocol"] = "{}/experiment_documentation".format(entry_id) + # nxmap.attrs["Citation"] = "{}/reference".format(entry_id) + # nxmap.attrs["Substance"] = "{}/sample".format(entry_id) + # nxmap.attrs["Parameters"] = ["instrument", "environment", "parameters"] + # nxmap.attrs["EffectRecords"] = "datasets" try: citation_id = "{}/reference".format(entry_id) @@ -201,61 +276,42 @@ def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, hierarchy=Fal nx_root[substance_id].attrs["uuid"] = papp.owner.substance.uuid nx_root["{}/sample/substance".format(entry_id)] = nx.NXlink(substance_id) - # parameters - if not ("{}/instrument".format(entry_id) in nx_root): - nx_root["{}/instrument".format(entry_id)] = nx.NXinstrument() - instrument = nx_root["{}/instrument".format(entry_id)] - - if not ("{}/parameters".format(entry_id) in nx_root): - nx_root["{}/parameters".format(entry_id)] = nx.NXcollection() - parameters = nx_root["{}/parameters".format(entry_id)] - - if not ("{}/environment".format(entry_id) in nx_root): - nx_root["{}/environment".format(entry_id)] = nx.NXenvironment() - environment = nx_root["{}/environment".format(entry_id)] - - if not (papp.parameters is None): - for prm in papp.parameters: + if papp.parameters is not None: + for prm_path in papp.parameters: try: - value = papp.parameters[prm] - # Invalid path if the key contains / - # prm = prm.replace("/","_") - target = environment - if "instrument" in prm.lower(): - target = instrument - if "technique" in prm.lower(): - target = instrument - if "wavelength" in prm.lower(): - target = instrument - elif "sample" in prm.lower(): - target = sample - elif "material" in prm.lower(): - target = sample - elif ("ASSAY" == prm.upper()) or ("E.METHOD" == prm.upper()): - target = nx_root[entry_id]["experiment_documentation"] - # continue - elif "E.SOP_REFERENCE" == prm: - # target = instrument - target = nx_root[entry_id]["experiment_documentation"] - elif "OPERATOR" == prm: - # target = instrument - target = nx_root[entry_id]["experiment_documentation"] - elif prm.startswith("T."): - target = instrument - - if "EXPERIMENT_END_DATE" == prm: - nx_root[entry_id]["end_time"] = value - elif "EXPERIMENT_START_DATE" == prm: - nx_root[entry_id]["start_time"] = value - elif "__input_file" == prm: - nx_root[entry_id]["experiment_documentation"][prm] = value - elif isinstance(value, str): - target[prm] = nx.NXfield(str(value)) + value = papp.parameters[prm_path] + prms = prm_path.split("/") + if len(prms) == 1: + prms 
= param_lookup(prm_path, value) + # print(prms,prms[:-1]) + _entry = nx_root[entry_id] + for _group in prms[:-1]: + if _group not in _entry: + if _group == "instrument": + _entry[_group] = nx.NXinstrument() + elif _group == "environment": + _entry[_group] = nx.NXenvironment() + elif _group == "parameters": + _entry[_group] = nx.NXcollection() + elif _group == "experiment_documentation": + _entry[_group] = nx.NXnote() + else: + _entry[_group] = nx.NXgroup() + _entry = _entry[_group] + target = _entry + prm = prms[-1] + + if isinstance(value, str): + target[prm] = nx.NXfield(value) + elif isinstance(value, int): + target[prm] = nx.NXfield(value) + elif isinstance(value, float): + target[prm] = nx.NXfield(value) elif isinstance(value, Value): # tbd ranges? target[prm] = nx.NXfield(value.loValue, unit=value.unit) else: - target = parameters + target[prm] = nx.NXfield(str(value)) except Exception as err: raise Exception( "ProtocolApplication: parameters parsing error {} {}".format( @@ -263,9 +319,9 @@ def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, hierarchy=Fal ) ) from err - if not (papp.owner is None): + if papp.owner is not None: try: - sample["uuid"] = papp.owner.substance.uuid + sample.attrs["uuid"] = papp.owner.substance.uuid sample["provider"] = papp.owner.company.name except Exception as err: raise Exception( @@ -284,7 +340,8 @@ def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, hierarchy=Fal # nx_root["/group_byexperiment"] = nx.NXgroup() # print(nx_root[entry_id].attrs) - # nx_root["/group_byexperiment{}".format(entry_id)] = nx.NXlink("{}/RAW_DATA".format(entry_id),abspath=True,soft=True) + # nx_root["/group_byexperiment{}".format(entry_id)] = nx.NXlink( + # "{}/RAW_DATA".format(entry_id),abspath=True,soft=True) # nx_root["/group_byexperiment/{}".format("xyz")] = nx.NXlink(substance_id) # nx.NXlink(nx_root[entry_id]) # nx_root[_categories_collection] = nx.NXlink(entry_id) @@ -292,7 +349,7 @@ def to_nexus(papp: ProtocolApplication, nx_root: nx.NXroot = None, hierarchy=Fal @add_ambitmodel_method(Study) -def to_nexus(study: Study, nx_root: nx.NXroot = None, hierarchy=False): +def to_nexus(study: Study, nx_root: nx.NXroot = None, hierarchy=False): # noqa: F811 if nx_root is None: nx_root = nx.NXroot() for papp in study.study: @@ -302,7 +359,9 @@ def to_nexus(study: Study, nx_root: nx.NXroot = None, hierarchy=False): @add_ambitmodel_method(SubstanceRecord) -def to_nexus(substance: SubstanceRecord, nx_root: nx.NXroot = None, hierarchy=False): +def to_nexus( # noqa: F811 + substance: SubstanceRecord, nx_root: nx.NXroot = None, hierarchy=False +): """ SubstanceRecord to nexus entry (NXentry) @@ -342,7 +401,7 @@ def to_nexus(substance: SubstanceRecord, nx_root: nx.NXroot = None, hierarchy=Fa print(substance.URI) print(err) nxroot.save("example.nxs",mode="w") - """ + """ # noqa: B950 if nx_root is None: nx_root = nx.NXroot() @@ -361,7 +420,8 @@ def to_nexus(substance: SubstanceRecord, nx_root: nx.NXroot = None, hierarchy=Fa if substance.composition is not None: for index, ce in enumerate(substance.composition): component = nx.NXsample_component() - # name='' cas='' einecs='' inchikey='YVZATJAPAZIWIL-UHFFFAOYSA-M' inchi='InChI=1S/H2O.Zn/h1H2;/q;+1/p-1' formula='HOZn' + # name='' cas='' einecs='' inchikey='YVZATJAPAZIWIL-UHFFFAOYSA-M' + # inchi='InChI=1S/H2O.Zn/h1H2;/q;+1/p-1' formula='HOZn' component.name = ce.component.compound.name component.einecs = ce.component.compound.einecs component.cas = ce.component.compound.cas @@ -384,7 +444,9 @@ def 
to_nexus(substance: SubstanceRecord, nx_root: nx.NXroot = None, hierarchy=Fa

 @add_ambitmodel_method(Substances)
-def to_nexus(substances: Substances, nx_root: nx.NXroot = None, hierarchy=False):
+def to_nexus(  # noqa: F811
+    substances: Substances, nx_root: nx.NXroot = None, hierarchy=False
+):
     if nx_root is None:
         nx_root = nx.NXroot()
     for substance in substances.substance:
@@ -393,7 +455,7 @@ def to_nexus(substances: Substances, nx_root: nx.NXroot = None, hierarchy=False)

 @add_ambitmodel_method(Composition)
-def to_nexus(composition: Composition, nx_root: nx.NXroot = None):
+def to_nexus(composition: Composition, nx_root: nx.NXroot = None):  # noqa: F811
     if nx_root is None:
         nx_root = nx.NXroot()

@@ -413,7 +475,8 @@ def is_alternate_axis(key: str, alt_axes: Dict[str, List[str]]) -> bool:

     Parameters:
     - key: The axis name to check.
-    - alt_axes: Dictionary where keys are primary axis names and values are lists of alternative axis names.
+    - alt_axes: Dictionary where keys are primary axis names and values are lists of
+      alternative axis names.

     Returns:
     - True if the key is an alternate axis, False otherwise.
@@ -442,39 +505,66 @@ def is_alternate_axis(key: str, alt_axes: Dict[str, List[str]]) -> bool:

         signal = nx.tree.NXfield(
             effect.signal.values,
-            name="value",
+            name=effect.endpoint,
             units=effect.signal.unit,
             long_name="{} {}".format(
                 effect.endpoint, "" if effect.signal.unit is None else effect.signal.unit
             ).strip(),
         )
+        if effect.signal.conditions is not None:
+            for key in effect.signal.conditions:
+                signal.attrs[key] = effect.signal.conditions[key]
+
+        nxdata = nx.tree.NXdata(
+            signal=signal,
+            axes=None if len(axes) == 0 else axes,
+            errors=effect.signal.errorValue,
+            # auxiliary_signals=None if len(aux_signals) < 1 else aux_signals,
+        )
         aux_signals = []
+
         if effect.signal.auxiliary:
             for a in effect.signal.auxiliary:
-                _tmp = effect.signal.auxiliary[a]
+                item = effect.signal.auxiliary[a]
+                if isinstance(item, (MetaValueArray, ValueArray)):
+                    _tmp = item.values
+                    _tmp_unit = item.unit
+                    _tmp_meta = item.conditions
+
+                elif isinstance(item, np.ndarray):
+                    _tmp = item
+                    _tmp_unit = effect.signal.unit
+                    _tmp_meta = None
+                else:
+                    continue
+
                 if _tmp.size > 0:
-                    aux_signals.append(
-                        nx.tree.NXfield(
+                    _auxname = a.replace("/", "_")
+                    long_name = "{} ({}) {}".format(
+                        effect.endpoint,
+                        a,
+                        "" if effect.signal.unit is None else effect.signal.unit,
+                    ).strip()
+                    if _auxname == "textValue":
+                        nxdata[_auxname] = nx.tree.NXfield(
                             _tmp,
-                            name=a.replace("/", "_"),
-                            units=effect.signal.unit,
-                            long_name="{} ({}) {}".format(
-                                effect.endpoint,
-                                a,
-                                "" if effect.signal.unit is None else effect.signal.unit,
-                            ).strip(),
+                            name=_auxname,
+                            units=_tmp_unit,
+                            long_name=long_name,
+                            dtype=string_dtype(encoding="utf-8"),
                         )
-                    )
-        # print(a,aux_signal)
-        # print(effect.endpoint,aux_signals,len(aux_signals))
-        # print(">>>",effect.endpoint,effect.signal.values)
-        # aux_signals = []
-        nxdata = nx.tree.NXdata(
-            signal=signal,
-            axes=None if len(axes) == 0 else axes,
-            errors=effect.signal.errorValue,
-            auxiliary_signals=None if len(aux_signals) < 1 else aux_signals,
-        )
+                    else:
+                        nxdata[_auxname] = nx.tree.NXfield(
+                            _tmp, name=_auxname, units=_tmp_unit, long_name=long_name
+                        )
+
+                    if _tmp_meta is not None:
+                        for key in _tmp_meta:
+                            nxdata[_auxname].attrs[key] = _tmp_meta[key]
+                    aux_signals.append(_auxname)
+
+        if len(aux_signals) > 0:
+            nxdata.attrs["auxiliary_signals"] = aux_signals
         if effect.conditions:
             for key in effect.conditions:
                 nxdata.attrs[key] = effect.conditions[key]
@@ 
-498,6 +588,7 @@ def is_alternate_axis(key: str, alt_axes: Dict[str, List[str]]) -> bool: nxdata.attrs["interpretation"] = ( "scalar" if index == 0 else ("spectrum" if index == 1 else "image") ) + nxdata.title = effect.nx_name return nxdata @@ -511,7 +602,7 @@ def process_pa(pa: ProtocolApplication, entry=None, nx_root: nx.NXroot = None): _path = "/substance/{}".format(pa.owner.substance.uuid) # print(_path, nx_root[_path].name) substance_name = nx_root[_path].name - except BaseException: + except BaseException: # noqa: B036 FIXME substance_name = "" effectarrays_only, df = pa.convert_effectrecords2array() @@ -533,7 +624,14 @@ def process_pa(pa: ProtocolApplication, entry=None, nx_root: nx.NXroot = None): entry[_group_key]["description"] = effect.endpointtype # entry[_group_key] = _endpointtype_groups[_group_key] - entryid = "{}_{}".format(effect.endpoint, index) + entryid = "{}_{}".format( + ( + effect.endpoint + if effect.nx_name is None + else effect.nx_name.replace("/", "_") + ), + index, + ) if entryid in entry[_group_key]: del entry[_group_key][entryid] print("replacing {}/{}".format(_group_key, entryid)) @@ -543,9 +641,15 @@ def process_pa(pa: ProtocolApplication, entry=None, nx_root: nx.NXroot = None): entry[_group_key][entryid] = nxdata if _default is None: entry.attrs["default"] = _group_key - nxdata.title = "{} (by {}) {}".format( - effect.endpoint, pa.citation.owner, substance_name - ) + + if nxdata.title is None: + nxdata.title = ( + "{} (by {}) {}".format( + effect.endpoint, pa.citation.owner, substance_name + ) + if pa.nx_name is None + else pa.nx_name + ) return entry diff --git a/src/pyambit/solr_writer.py b/src/pyambit/solr_writer.py new file mode 100644 index 0000000..2bfc993 --- /dev/null +++ b/src/pyambit/solr_writer.py @@ -0,0 +1,176 @@ +import json +from typing import Dict, Union + +from pyambit.datamodel import ( + EffectArray, + EffectRecord, + EffectResult, + ProtocolApplication, + SubstanceRecord, + Substances, + Value, +) + + +class Ambit2Solr: + + def __init__(self, prefix: str): + self.prefix = prefix + + def __enter__(self): + self._solr = [] + return self + + def __exit__(self, exc_type, exc_value, traceback): + # Any cleanup code, if needed + pass + + def prm2solr(self, params: Dict, key: str, value: Union[str, Value, None]): + if isinstance(value, str): + params["{}_s".format(key)] = value + elif isinstance(value, int): + params["{}_d".format(key)] = value + elif isinstance(value, float): + params["{}_d".format(key)] = value + elif isinstance(value, Value): + if value.loValue is not None: + params["{}_d".format(key)] = value.loValue + if value.unit is not None: + params["{}_UNIT_s".format(key)] = value.unit + + def effectresult2solr(self, effect_result: EffectResult, solr_index=None): + if solr_index is None: + solr_index = {} + if effect_result.loValue is not None: + solr_index["loValue_d"] = effect_result.loValue + if effect_result.loQualifier is not None: + solr_index["loQualifier_s"] = effect_result.loQualifier + if effect_result.upQualifier is not None: + solr_index["upQualifier_s"] = effect_result.upQualifier + if effect_result.upValue is not None: + solr_index["upValue_d"] = effect_result.upValue + if effect_result.unit is not None: + solr_index["unit_s"] = effect_result.unit + if effect_result.textValue is not None: + solr_index["textValue_s"] = effect_result.textValue + + def effectrecord2solr(self, effect: EffectRecord, solr_index=None): + if solr_index is None: + solr_index = {} + if isinstance(effect, EffectArray): + # tbd - this is new in 
diff --git a/src/pyambit/solr_writer.py b/src/pyambit/solr_writer.py
new file mode 100644
index 0000000..2bfc993
--- /dev/null
+++ b/src/pyambit/solr_writer.py
@@ -0,0 +1,176 @@
+import json
+from typing import Dict, Union
+
+from pyambit.datamodel import (
+    EffectArray,
+    EffectRecord,
+    EffectResult,
+    ProtocolApplication,
+    SubstanceRecord,
+    Substances,
+    Value,
+)
+
+
+class Ambit2Solr:
+
+    def __init__(self, prefix: str):
+        self.prefix = prefix
+
+    def __enter__(self):
+        self._solr = []
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Any cleanup code, if needed
+        pass
+
+    def prm2solr(self, params: Dict, key: str, value: Union[str, int, float, Value, None]):
+        if isinstance(value, str):
+            params["{}_s".format(key)] = value
+        elif isinstance(value, int):
+            params["{}_d".format(key)] = value
+        elif isinstance(value, float):
+            params["{}_d".format(key)] = value
+        elif isinstance(value, Value):
+            if value.loValue is not None:
+                params["{}_d".format(key)] = value.loValue
+            if value.unit is not None:
+                params["{}_UNIT_s".format(key)] = value.unit
+
+    def effectresult2solr(self, effect_result: EffectResult, solr_index=None):
+        if solr_index is None:
+            solr_index = {}
+        if effect_result.loValue is not None:
+            solr_index["loValue_d"] = effect_result.loValue
+        if effect_result.loQualifier is not None:
+            solr_index["loQualifier_s"] = effect_result.loQualifier
+        if effect_result.upQualifier is not None:
+            solr_index["upQualifier_s"] = effect_result.upQualifier
+        if effect_result.upValue is not None:
+            solr_index["upValue_d"] = effect_result.upValue
+        if effect_result.unit is not None:
+            solr_index["unit_s"] = effect_result.unit
+        if effect_result.textValue is not None:
+            solr_index["textValue_s"] = effect_result.textValue
+
+    def effectrecord2solr(self, effect: EffectRecord, solr_index=None):
+        if solr_index is None:
+            solr_index = {}
+        if isinstance(effect, EffectArray):
+            # tbd - this is new in pyambit; array results were not implemented before
+            if effect.result is not None:  # EffectResult
+                self.effectresult2solr(effect.result, solr_index)
+            # e.g. vector search
+            if effect.endpointtype == "embeddings":
+                solr_index[effect.endpoint] = effect.signal.values.tolist()
+        elif isinstance(effect, EffectRecord):
+            # conditions
+            if effect.result is not None:  # EffectResult
+                self.effectresult2solr(effect.result, solr_index)
+
+    def entry2solr(self, papp: ProtocolApplication):
+        papp_solr = []
+        for _id, effect in enumerate(papp.effects, start=1):
+            _solr = {}
+            _solr["id"] = "{}/{}".format(papp.uuid, _id)
+            _solr["investigation_uuid_s"] = papp.investigation_uuid
+            _solr["assay_uuid_s"] = papp.assay_uuid
+            _solr["type_s"] = "study"
+            _solr["document_uuid_s"] = papp.uuid
+
+            _solr["topcategory_s"] = papp.protocol.topcategory
+            _solr["endpointcategory_s"] = (
+                "UNKNOWN"
+                if papp.protocol.category is None
+                else papp.protocol.category.code
+            )
+            _solr["guidance_s"] = papp.protocol.guideline
+            # _solr["guidance_synonym_ss"] = ["FIX_0000058"]
+            # _solr["E.method_synonym_ss"] = ["FIX_0000058"]
+            _solr["endpoint_s"] = papp.protocol.endpoint
+            _solr["effectendpoint_s"] = effect.endpoint
+            _solr["effectendpoint_type_s"] = effect.endpointtype
+            # _solr["effectendpoint_synonym_ss"] = ["CHMO_0000823"]
+            _solr["reference_owner_s"] = papp.citation.owner
+            _solr["reference_year_s"] = papp.citation.year
+            _solr["reference_s"] = papp.citation.title
+            _solr["updated_s"] = papp.updated
+            if "E.method_s" in papp.parameters:
+                _solr["E.method_s"] = papp.parameters["E.method_s"]
+            self.effectrecord2solr(effect, _solr)
+
+            _conditions = {"type_s": "conditions"}
+            _conditions["topcategory_s"] = papp.protocol.topcategory
+            _conditions["endpointcategory_s"] = (
+                "UNKNOWN"
+                if papp.protocol.category is None
+                else papp.protocol.category.code
+            )
+            _conditions["document_uuid_s"] = papp.uuid
+            _conditions["id"] = "{}/cn".format(_solr["id"])
+            for prm in effect.conditions:
+                self.prm2solr(_conditions, prm, effect.conditions[prm])
+            _solr["_childDocuments_"] = [_conditions]
+
+            _params = {}
+            for prm in papp.parameters:
+                self.prm2solr(_params, prm, papp.parameters[prm])
+            _params["document_uuid_s"] = papp.uuid
+            _params["id"] = "{}/prm".format(papp.uuid)
+            _params["topcategory_s"] = papp.protocol.topcategory
+            _params["endpointcategory_s"] = (
+                "UNKNOWN"
+                if papp.protocol.category is None
+                else papp.protocol.category.code
+            )
+            if "E.method_s" in papp.parameters:
+                _params["E.method_s"] = papp.parameters["E.method_s"]
+            _params["type_s"] = "params"
+            _solr["_childDocuments_"].append(_params)
+            papp_solr.append(_solr)
+        return papp_solr
+
+    def substancerecord2solr(self, substance: SubstanceRecord):
+        _solr = {}
+        _solr["content_hss"] = []
+        _solr["dbtag_hss"] = self.prefix
+        _solr["name_hs"] = substance.name
+        _solr["publicname_hs"] = substance.publicname
+        _solr["owner_name_hs"] = substance.ownerName
+        _solr["substanceType_hs"] = substance.substanceType
+        _solr["type_s"] = "substance"
+        _solr["s_uuid_hs"] = substance.i5uuid
+        _solr["id"] = substance.i5uuid
+        _studies = []
+        _solr["SUMMARY.RESULTS_hss"] = []
+        for _papp in substance.study:
+            _study_solr = self.entry2solr(_papp)
+            for _study in _study_solr:
+                _study["s_uuid_s"] = substance.i5uuid
+                _study["type_s"] = "study"
+                _study["name_s"] = substance.name
+                _study["publicname_s"] = substance.publicname
+                _study["substanceType_s"] = substance.substanceType
+                _study["owner_name_s"] = substance.ownerName
+            _studies.extend(_study_solr)
+        _solr["_childDocuments_"] = _studies
+        _solr["SUMMARY.REFS_hss"] = []
+        _solr["SUMMARY.REFOWNERS_hss"] = []
+
+        return _solr
+
+    def substances2solr(self, substances: Substances, buffer=None):
+        if buffer is None:
+            buffer = []
+        for substance in substances.substance:
+            buffer.append(self.substancerecord2solr(substance))
+        return buffer
+
+    def to_json(self, substances: Substances):
+        return self.substances2solr(substances)
+
+    def write(self, substances: Substances, file_path):
+        _json = self.to_json(substances)
+        with open(file_path, "w") as file:
+            json.dump(_json, file)
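For orientation, a minimal usage sketch for the new writer; the prefix and output path are illustrative, and `substances` is assumed to be a populated Substances instance (e.g. built from AMBIT JSON, as in the tests below):

    from pyambit.solr_writer import Ambit2Solr

    # substances: Substances = ...  # e.g. Substances(**json_dict)
    writer = Ambit2Solr(prefix="TEST")
    docs = writer.to_json(substances)  # list of Solr docs with nested _childDocuments_
    writer.write(substances, "substances.json")  # same payload, dumped to a file

The class also defines __enter__/__exit__, so it can be used as a context manager, although nothing in the current implementation requires it.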
diff --git a/tests/pyambit/datamodel/datamodel_test.py b/tests/pyambit/datamodel/datamodel_test.py
index 570bc78..2cf5569 100644
--- a/tests/pyambit/datamodel/datamodel_test.py
+++ b/tests/pyambit/datamodel/datamodel_test.py
@@ -5,7 +5,7 @@
 import numpy as np
 import numpy.typing as npt
 import pyambit.datamodel as mb
-from pydantic_core import from_json
+
 
 TEST_DIR = Path(__file__).parent.parent / "resources"
 
@@ -24,14 +24,57 @@ def test_substances_load():
     assert substances == new_val
 
 
+def test_basevaluearray_roundtrip():
+    """
+    Test the roundtrip serialization and deserialization of the BaseValueArray model.
+    """
+    a1: npt.NDArray[np.float64] = np.ones(5)
+    a0: npt.NDArray[np.float64] = np.zeros(5)
+    val = mb.BaseValueArray(values=a1, unit="unit", errQualifier="SD", errorValue=a0)
+
+    data = json.loads(val.model_dump_json())
+    # print(data)
+    new_val = mb.BaseValueArray.model_construct(**data)
+
+    assert val == new_val
+
+
+def test_metavaluearray_roundtrip():
+    """
+    Test the roundtrip serialization and deserialization of the MetaValueArray model.
+    """
+    a1: npt.NDArray[np.float64] = np.ones(5)
+    a0: npt.NDArray[np.float64] = np.zeros(5)
+    val = mb.MetaValueArray(
+        values=a1,
+        unit="unit",
+        errQualifier="SD",
+        errorValue=a0,
+        conditions={"test": "test"},
+    )
+
+    data = json.loads(val.model_dump_json())
+    # print(data)
+    new_val = mb.MetaValueArray.model_construct(**data)
+
+    assert val == new_val
+
+
 def test_valuearray_roundtrip():
     """
     Test the roundtrip serialization and deserialization of the ValueArray model.
     """
     a1: npt.NDArray[np.float64] = np.ones(5)
     a0: npt.NDArray[np.float64] = np.zeros(5)
-    val = mb.ValueArray(values=a1, unit="unit", errQualifier="SD", errorValue=a0)
+    val = mb.ValueArray(
+        values=a1,
+        unit="unit",
+        errQualifier="SD",
+        errorValue=a0,
+        conditions={"test": "test"},
+    )
+    assert val.conditions is not None
 
     data = json.loads(val.model_dump_json())
     new_val = mb.ValueArray.model_construct(**data)
@@ -42,7 +85,7 @@ def test_valuearrayaux_roundtrip():
     """
     Test the roundtrip serialization and deserialization of the ValueArray model.
     """
-    shape = tuple((10, 2, 1))
+    shape = tuple((3, 2, 1))
     matrix_vals = np.random.random(shape) * 3
     matrix_errs = np.random.random(shape)
     matrix_upValue = np.random.random(shape) * 5
@@ -58,7 +101,10 @@
 
     data = json.loads(val.model_dump_json())
     new_val = mb.ValueArray.model_construct(**data)
-
+    for key in val.auxiliary:
+        print("old", key, type(val.auxiliary[key]))
+    for key in new_val.auxiliary:
+        print("new", key, type(new_val.auxiliary[key]))
     assert val == new_val
 
@@ -81,6 +127,29 @@ def test_valuearray_roundtrip_withaux():
     assert val == new_val
 
 
+def test_valuearray_roundtrip_with_arrayaux():
+    """
+    Test the roundtrip serialization and deserialization of the ValueArray model
+    with an array-valued auxiliary.
+    """
+
+    b1: npt.NDArray[np.float64] = np.ones(10)
+    aux = mb.MetaValueArray(values=b1, unit="bunit")
+
+    a1: npt.NDArray[np.float64] = np.ones(5)
+    a0: npt.NDArray[np.float64] = np.zeros(5)
+    val = mb.ValueArray(
+        values=a1,
+        unit="unit",
+        errQualifier="SD",
+        errorValue=a0,
+        auxiliary={"upValue": a1, "array": aux},
+    )
+
+    data = json.loads(val.model_dump_json())
+    new_val = mb.ValueArray.model_construct(**data)
+    assert val == new_val
+
+
 def test_value_roundtrip():
     """
     Test the roundtrip serialization and deserialization of the Value model.
@@ -216,7 +285,8 @@ def test_effect_array_roundtrip():
 
 def test_protocol_effect_record_roundtrip():
     """
-    Test the roundtrip serialization and deserialization of the ProtocolEffectRecord model.
+    Test the roundtrip serialization and deserialization of the ProtocolEffectRecord
+    model.
     """
     protocol = mb.Protocol(
         topcategory="TOX",
@@ -385,7 +455,8 @@ def create_effectrecord():
 
 def test_protocol_application_roundtrip():
     """
-    Test the roundtrip serialization and deserialization of the ProtocolApplication model.
+    Test the roundtrip serialization and deserialization of the ProtocolApplication
+    model.
     """
 
     original = create_protocolapp4test()
@@ -425,7 +496,8 @@ def test_study_roundtrip():
 
 def test_component_proportion_roundtrip():
     """
-    Test the roundtrip serialization and deserialization of the ComponentProportion model.
+    Test the roundtrip serialization and deserialization of the ComponentProportion
+    model.
     """
 
     typical = mb.TypicalProportion(precision="<", value=5.0, unit="g")
@@ -527,7 +599,7 @@ def test_composition_roundtrip():
     see how features are expected
     https://apps.ideaconsult.net/gracious/compound/3?media=application/json&feature_uris=https://apps.ideaconsult.net/gracious/compound/3/feature
-    """
+    """  # noqa: B950
     # Create sample data for Composition
     original = mb.Composition(
         composition=[
diff --git a/tests/pyambit/datamodel/nexus_writer_test.py b/tests/pyambit/datamodel/nexus_writer_test.py
index 7e67b93..419d866 100644
--- a/tests/pyambit/datamodel/nexus_writer_test.py
+++ b/tests/pyambit/datamodel/nexus_writer_test.py
@@ -7,7 +7,7 @@
 import pytest
 
 # to_nexus is not added without this import
-from pyambit import nexus_writer
+from pyambit import nexus_writer  # noqa: F401
 
 from pyambit.datamodel import Study, Substances
 
@@ -31,6 +31,20 @@ def substances():
     return substances
 
 
+def inspect_nexus_tree(node, path="root"):
+    if isinstance(node, dict):  # If the node is a group/dictionary
+        for key, child in node.items():
+            inspect_nexus_tree(child, path + f"/{key}")
+    elif hasattr(node, "dtype"):
+        # Check if dtype is Unicode
+        if node.dtype.char == "U":
+            print(
+                f"*****Problematic Unicode data found at {path} with dtype {node.dtype}"
+            )
+        # else:
+        #     print(f"Skipping non-data node at {path}")
+
+
 def test_substances(substances):
     # nxroot = nx.NXroot()
 
@@ -38,20 +52,31 @@ def test_substances(substances):
     substances.to_nexus(nxroot, hierarchy=True)
     file = os.path.join(tempfile.gettempdir(), "substances.nxs")
     print(file)
+    inspect_nexus_tree(nxroot)
     nxroot.save(file, mode="w")
 
 
 def test_study(substances):
     for substance in substances.substance:
         for study in substance.study:
+
+            study.nx_name = "test"
             file = os.path.join(
                 tempfile.gettempdir(), "study_{}.nxs".format(study.uuid)
             )
-            print(file)
             nxroot = nx.NXroot()
             try:
-                study.to_nexus(nxroot)
+                study.to_nexus(nxroot, hierarchy=True)
+                inspect_nexus_tree(nxroot)
                 nxroot.save(file, mode="w")
             except Exception as err:
-                print(study)
+                # inspect_nexus_tree(nxroot)
+                # print(study.model_dump_json(exclude_none=True))
+                effectarrays_only, df = study.convert_effectrecords2array()
+                df.dropna(how="all").to_excel("bad.xlsx")
+                for effect in effectarrays_only:
+                    for key in effect.signal.auxiliary:
+                        for element in effect.signal.auxiliary[key].flat:
+                            print(element, end=".")
+                # print(nxroot.tree)
                 raise err
diff --git a/tests/pyambit/datamodel/solr_writer_test.py b/tests/pyambit/datamodel/solr_writer_test.py
new file mode 100644
index 0000000..e5ec276
--- /dev/null
+++ b/tests/pyambit/datamodel/solr_writer_test.py
@@ -0,0 +1,40 @@
+import json
+import os.path
+import tempfile
+from pathlib import Path
+
+import pytest
+from pyambit.datamodel import Study, Substances
+
+from pyambit.solr_writer import Ambit2Solr
+
+
+TEST_DIR = Path(__file__).parent.parent / "resources"
+
+
+@pytest.fixture(scope="module")
+def substances():
+    """
+    Fixture to load and return the Substances object.
+    """
+
+    with open(os.path.join(TEST_DIR, "substance.json"), "r", encoding="utf-8") as file:
+        json_substance = json.load(file)
+        substances = Substances(**json_substance)
+
+    with open(os.path.join(TEST_DIR, "study.json"), "r", encoding="utf-8") as file:
+        json_study = json.load(file)
+        study = Study(**json_study)
+        substances.substance[0].study = study.study
+    return substances
+
+
+def test_substances(substances):
+
+    _substances = Substances(substance=[substances.substance[0]])
+    writer: Ambit2Solr = Ambit2Solr(prefix="TEST")
+    _json = writer.to_json(_substances)
+    _file = os.path.join(tempfile.gettempdir(), "substances.json")
+    print(_file)
+    with open(_file, "w") as file:
+        json.dump(_json, file)
diff --git a/tests/pyambit/datamodel/spectra_writer_test.py b/tests/pyambit/datamodel/spectra_writer_test.py
index 8e5106c..38c7f93 100644
--- a/tests/pyambit/datamodel/spectra_writer_test.py
+++ b/tests/pyambit/datamodel/spectra_writer_test.py
@@ -4,7 +4,7 @@
 import nexusformat.nexus.tree as nx
 import numpy as np
 
-from pyambit.ambit_deco import add_ambitmodel_method
+from pyambit.ambit_deco import add_ambitmodel_method  # noqa: F401
 from pyambit.datamodel import SubstanceRecord, Substances
 from pyambit.nexus_spectra import spe2ambit
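A note on the `# noqa: F401` imports above: they are kept for their side effects. Importing `pyambit.nexus_writer` (or `pyambit.ambit_deco`) executes the `@add_ambitmodel_method(...)` decorators, which is what attaches `to_nexus` to the datamodel classes; hence the "to_nexus is not added without this import" comments in the tests. A simplified sketch of that registration pattern (an assumption about `ambit_deco`'s internals, not its actual source):

    def add_ambitmodel_method(cls):
        # Attach the decorated function to `cls` under the function's own name,
        # so instances of the model class gain it as a bound method.
        def decorator(func):
            setattr(cls, func.__name__, func)
            return func
        return decorator

This registration-at-import-time pattern also explains the `# noqa: F811` markers in nexus_writer.py: each `to_nexus` definition deliberately reuses the same name, because the decorator immediately attaches it to a different class.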