From 69075736fca3e1d9a4e6cf42af0ce3eb2b66686a Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Thu, 14 Apr 2022 12:51:20 +0100
Subject: [PATCH] add export to hdf5

---
 openff/qcsubmit/results/results.py | 85 ++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/openff/qcsubmit/results/results.py b/openff/qcsubmit/results/results.py
index 0e9435dd..3bfd7abe 100644
--- a/openff/qcsubmit/results/results.py
+++ b/openff/qcsubmit/results/results.py
@@ -407,6 +407,91 @@ def to_records(self) -> List[Tuple[ResultRecord, Molecule]]:
 
         return records_and_molecules
 
+    def to_hdf5(self, filename: str):
+        """
+        Write the dataset to a custom HDF5 file format suitable for ML purposes, the format is compact where
+        possible to avoid repeated redundant information.
+        """
+        import h5py
+        import numpy as np
+        import tqdm
+        from openmm import unit
+
+        all_records_and_molecules = {
+            record.id: [record, molecule] for record, molecule in self.to_records()
+        }
+        entries_by_inchikey = defaultdict(list)
+
+        for entries in self.entries.values():
+            for entry in entries:
+                entries_by_inchikey[entry.inchi_key].append(entry)
+
+        f = h5py.File(filename, "w")
+        for inchikey, entries in tqdm.tqdm(
+            entries_by_inchikey.items(),
+            ncols=80,
+            desc="Creating HDF5 Dataset",
+            total=len(entries_by_inchikey),
+        ):
+            group = f.create_group(name=inchikey)
+            energies = []
+            gradients = []
+            conformations = []
+            for entry in entries:
+                record, molecule = all_records_and_molecules[entry.record_id]
+                energies.append(
+                    record.return_result
+                    if record.driver == "energy"
+                    else record.properties.return_energy
+                )
+                gradients.append(
+                    record.return_result
+                    if record.driver == "gradient"
+                    else record.properties.return_gradient
+                )
+                conformations.extend(
+                    [c.value_in_unit(unit=unit.bohr) for c in molecule.conformers]
+                )
+            ds_energy = group.create_dataset(
+                "energies", data=np.array(energies), dtype=np.float64, chunks=True
+            )
+            ds_energy.attrs["units"] = "hartree"
+            ds_gradient = group.create_dataset(
+                "gradients", data=gradients, dtype=np.float64, chunks=True
+            )
+            ds_gradient.attrs["units"] = "hartree / bohr"
+            ds_conformers = group.create_dataset(
+                "conformations",
+                data=np.array(conformations),
+                dtype=np.float64,
+                chunks=True,
+            )
+            ds_conformers.attrs["units"] = "bohr"
+            # now add the molecule information
+            record, molecule = all_records_and_molecules[entries[0].record_id]
+            # mapped smiles to remake the molecule in the correct order
+            group.create_dataset(
+                "smiles",
+                data=[molecule.to_smiles(mapped=True)],
+                dtype=h5py.string_dtype(encoding="utf-8"),
+            )
+            group.create_dataset(
+                "atomic_numbers",
+                data=[atom.atomic_number for atom in molecule.atoms],
+                dtype=np.int16,
+            )
+            group.create_dataset(
+                "charge",
+                data=[molecule.total_charge.value_in_unit(unit.elementary_charge)],
+                dtype=np.int16,
+            )
+            group.create_dataset(
+                "specification",
+                data=[f"{record.method}:{str(record.basis)}"],
+                dtype=h5py.string_dtype(encoding="utf-8"),
+            )
+        f.close()
+
 
 class OptimizationResult(_BaseResult):
     """A class which stores a reference to, and allows the retrieval of, data from