Merge pull request #79 from SWIFTSIM/log_file_reader

Log file reader
SWIFTSIM · Oct 7, 2020 · 7dd59d5 · 7dd59d5
2 parents 23efffb + b086caa
commit 7dd59d5
Show file tree

Hide file tree

Showing 8 changed files with 267 additions and 3 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,6 +1,14 @@
 SWIFTsimIO Changelog
 ====================
 
+v 4.2.0
+-------
+
+Added a log file reader.
+
++ Added the `swiftsimio.statistics.SWIFTStatisticsFile` functionality
+  to read statistics log files.
+
 v 4.1.0
 -------
 

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -21,6 +21,7 @@ snapshots to enable partial reading.
    visualisation/index
    velociraptor/index
    creating_initial_conditions/index
+   statistics/index
    command_line/index
 
    modules/index

diff --git a/docs/source/statistics/index.rst b/docs/source/statistics/index.rst
@@ -0,0 +1,32 @@
+Statistics Files
+================
+
+:mod:`swiftsimio` includes routines to load statistics log files, such as
+``SFR.txt`` and ``energy.txt``. These are available through the
+:obj:`swiftsimio.statistics.SWIFTStatisticsFile` object, or through
+the top-level ``load_statistics`` function.
+
+Example
+-------
+
+.. code-block:: python
+
+   from swiftsimio import load_statistics
+
+   data = load_statistics("energy.txt")
+
+   print(data)
+
+   print(data.total_mass.name)
+
+
+Will output:
+
+.. code-block:: bash
+
+   Statistics file: energy.txt, containing fields: #, step, time, a, z, total_mass,
+   gas_mass, dm_mass, sink_mass, star_mass, bh_mass, gas_z_mass, star_z_mass,
+   bh_z_mass, kin_energy, int_energy, pot_energy, rad_energy, gas_entropy, com_x,
+   com_y, com_z, mom_x, mom_y, mom_z, ang_mom_x, ang_mom_y, ang_mom_z
+
+   'Total mass in the simulation'
diff --git a/swiftsimio/__init__.py b/swiftsimio/__init__.py
@@ -1,6 +1,7 @@
 from .reader import *
 from .writer import SWIFTWriterDataset
 from .masks import SWIFTMask
+from .statistics import SWIFTStatisticsFile
 from .__version__ import __version__
 from .__cite__ import __cite__
 
@@ -10,6 +11,7 @@
 import swiftsimio.visualisation as visualisation
 import swiftsimio.units as units
 import swiftsimio.subset_writer as subset_writer
+import swiftsimio.statistics as statistics
 
 name = "swiftsimio"
 
@@ -85,13 +87,26 @@ def load(filename, mask=None) -> SWIFTDataset:
     Parameters
     ----------
     filename : str
-        file to containing SWIFT dataset to read
+        SWIFT snapshot file to read
     mask : SWIFTMask, optional
         mask to apply when reading dataset
     """
 
     return SWIFTDataset(filename, mask=mask)
 
+def load_statistics(filename) -> SWIFTStatisticsFile:
+    """
+    Reads a SWIFT statistics log (for example ``SFR.txt`` or ``energy.txt``).
+
+    Parameters
+    ----------
+
+    filename : str
+        path to the SWIFT statistics file
+
+    Returns
+    -------
+
+    SWIFTStatisticsFile
+        reader object constructed from ``filename``
+    """
+
+    statistics_file = SWIFTStatisticsFile(filename=filename)
+
+    return statistics_file
+
 
 # Rename this object to something simpler.
 Writer = SWIFTWriterDataset
diff --git a/swiftsimio/__version__.py b/swiftsimio/__version__.py
@@ -1 +1 @@
-__version__ = "4.1.0"
+__version__ = "4.2.0"
diff --git a/swiftsimio/accelerated.py b/swiftsimio/accelerated.py
@@ -6,7 +6,7 @@
 
 from h5py._hl.dataset import Dataset
 
-from typing import Tuple
+from typing import Tuple, Union, List
 
 try:
     from numba import jit, prange
@@ -495,3 +495,51 @@ def read_ranges_from_file(
     )
 
     return read_ranges(handle, ranges, output_shape, output_type, columns)
+
+
+def list_of_strings_to_arrays(lines: List[str]) -> List[np.array]:
+    """
+    Converts a list of space-delimited values to arrays.
+
+    Parameters
+    ----------
+
+    lines: List[str]
+        List of strings containing numbers separated by whitespace.
+        Every string is one row of the table.
+
+
+    Returns
+    -------
+
+    arrays: List[np.array]
+        List of numpy arrays, one per column. An empty input gives an
+        empty list.
+
+
+    Notes
+    -----
+
+    The dtype of each column is chosen from the first row only: values
+    that parse as integers give ``np.int64`` columns, everything else
+    (``1.0``, ``1e-3``, ``1E5``, ``nan``, ``inf``) gives ``np.float64``.
+
+    Currently not suitable for ``numba`` acceleration due to mixed datatype usage.
+    """
+
+    if not lines:
+        # No rows means there is no first row to infer the columns from.
+        return []
+
+    dtypes = []
+
+    for item in lines[0].split():
+        # Anything that is not a plain integer literal is stored as a float;
+        # this also covers upper-case exponents and nan/inf.
+        try:
+            int(item)
+        except ValueError:
+            dtypes.append(np.float64)
+        else:
+            dtypes.append(np.int64)
+
+    number_of_lines = len(lines)
+    arrays = [np.zeros(number_of_lines, dtype=dtype) for dtype in dtypes]
+
+    for index, line in enumerate(lines):
+        # zip stops at the shortest input: surplus values on a row are
+        # ignored and missing values leave the pre-filled zero in place.
+        for array, dtype, value in zip(arrays, dtypes, line.split()):
+            array[index] = dtype(value)
+
+    return arrays
+
diff --git a/swiftsimio/statistics.py b/swiftsimio/statistics.py
@@ -0,0 +1,133 @@
+"""
+Reader for the statistics file.
+"""
+
+import unyt
+import regex as re
+
+from typing import List, Dict
+
+from swiftsimio.accelerated import list_of_strings_to_arrays
+
+
+class SWIFTStatisticsFile(object):
+    """
+    SWIFT statistics files (e.g. SFR.txt, energy.txt) reader.
+
+    The leading lines starting with ``#`` are parsed for field names and
+    units. The remaining lines are converted to one ``unyt.unyt_array``
+    per column, stored on the instance under the snake_case column name.
+    """
+
+    # Names from the header.
+    header_names: List[str]
+    # Units (unyt-based) from the header
+    header_units: Dict[str, unyt.unyt_quantity]
+    # snake_case names from the header
+    header_snake_case_names: List[str]
+    # Raw (non-blank) data lines as strings, read from the file.
+    raw_lines: List[str]
+
+    def __init__(self, filename: str):
+        """
+        Parameters
+        ----------
+
+        filename: str
+            File name for the statistics file.
+        """
+
+        self.filename = filename
+
+        self._read_file()
+        self._process_raw_lines()
+
+    def _read_file(self):
+        """
+        Reads the file, parses the header (field names and units) and
+        stores the remaining non-blank lines in ``raw_lines``.
+        """
+
+        with open(self.filename, "r") as handle:
+            lines = handle.readlines()
+
+        current_line = 0
+
+        header_names = []
+        header_units = {}
+        current_name = None
+
+        # Matches lines shaped like "# (12) Some name. ..."; group 2 is the
+        # name, which ends at the first period or newline.
+        regex_name = re.compile(r"# \(([0-9]*)\) +([^\.\n]*)")
+        # Matches lines shaped like "# Unit = <value> <unit string>".
+        regex_unit = re.compile(r"# *Unit = ([^\s]+) ?(.*)")
+
+        # The bounds check keeps the scan from running off the end of files
+        # that are empty or consist of header lines only.
+        while current_line < len(lines) and lines[current_line].startswith("#"):
+            current_string = lines[current_line]
+            current_line += 1
+
+            name_match = regex_name.match(current_string)
+
+            if name_match:
+                current_name = name_match.group(2)
+                # Dimensionless until a following "Unit =" line says otherwise.
+                header_units[current_name] = unyt.dimensionless
+                header_names.append(current_name)
+
+                continue
+
+            unit_match = regex_unit.match(current_string)
+
+            # A unit line can only be attached once a field name has been seen.
+            if unit_match and current_name is not None:
+                if unit_match.group(1) != "dimensionless":
+                    header_units[current_name] = unyt.unyt_quantity(
+                        float(unit_match.group(1)), unit_match.group(2)
+                    )
+                else:
+                    header_units[current_name] = unyt.dimensionless
+
+        # The last header line holds the column names, separated by two or
+        # more whitespace characters. Without any header line there is
+        # nothing to extract (and index -1 would wrap to the last data line).
+        if current_line > 0:
+            header_snake_case_names = [
+                x.replace(".", "").replace(" ", "_").replace("\n", "").lower()
+                for x in re.split(r"\s{2,}", lines[current_line - 1][1:])
+                if x != ""
+            ]
+        else:
+            header_snake_case_names = []
+
+        self.header_names = header_names
+        self.header_units = header_units
+        self.header_snake_case_names = header_snake_case_names
+
+        # Blank lines carry no values; keeping them would create all-zero rows.
+        self.raw_lines = [line for line in lines[current_line:] if line.strip()]
+
+    def _process_raw_lines(self):
+        """
+        Converts the raw data lines to arrays and attaches each column to
+        the instance as a ``unyt.unyt_array`` named after its snake_case
+        header name.
+        """
+
+        if self.raw_lines:
+            arrays = list_of_strings_to_arrays(lines=self.raw_lines)
+        else:
+            # No data rows: give every known field an empty column.
+            arrays = [[] for _ in self.header_names]
+
+        for array, header_name, header_snake_case_name in zip(
+            arrays, self.header_names, self.header_snake_case_names
+        ):
+            setattr(
+                self,
+                header_snake_case_name,
+                unyt.unyt_array(
+                    array, units=self.header_units[header_name], name=header_name
+                ),
+            )
+
+    def __str__(self):
+        return (
+            f"Statistics file: {self.filename}, containing fields: "
+            f"{', '.join(self.header_snake_case_names)}"
+        )
+
+    def __repr__(self):
+        return str(self)
diff --git a/tests/test_accelerated.py b/tests/test_accelerated.py
@@ -6,6 +6,7 @@
     ranges_from_array,
     read_ranges_from_file,
     index_dataset,
+    list_of_strings_to_arrays,
 )
 
 import numpy as np
@@ -93,3 +94,29 @@ def test_index_dataset_h5py():
     dataset = file.create_dataset("Test", data=data)
 
     assert (index_dataset(dataset, mask) == data[mask]).all()
+
+
+def test_list_of_strings_to_arrays():
+    """
+    Tests that list_of_strings_to_arrays returns one array per column, with
+    the expected dtype and values.
+    """
+
+    lines = [
+        "    0     0.0000    1.0e-3    14.0",
+        "    7     3.0000    1.0e-3    14.0",
+    ]
+
+    expected_output = [
+        np.array([0, 7], dtype=np.int64),
+        np.array([0, 3], dtype=np.float64),
+        np.array([1e-3, 1e-3], dtype=np.float64),
+        np.array([14, 14], dtype=np.float64),
+    ]
+
+    output = list_of_strings_to_arrays(lines)
+
+    # zip below stops at the shorter list, so check the column count first;
+    # otherwise a missing or extra column would go unnoticed.
+    assert len(output) == len(expected_output)
+
+    for expected, real in zip(expected_output, output):
+        assert expected.dtype == real.dtype
+        assert (expected == real).all()