dev

NCAS-CMS · davidhassell · Jul 15, 2024 · Jul 16, 2024 · Jul 16, 2024 · Jul 15, 2024
commit f8c757534df19d33b8c72f83331ba0124f787a12
diff --git a/Changelog.rst b/Changelog.rst
@@ -3,6 +3,8 @@ version NEXTVERSION
 
 **2024-??-??**
 
+* New keyword parameter to `cf.histogram`: ``density``
+  (https://github.com/NCAS-CMS/cf-python/issues/794)
 * New method: `cf.Field.is_discrete_axis`
   (https://github.com/NCAS-CMS/cf-python/issues/784)
 * Include the UM version as a field property when reading UM files

diff --git a/cf/maths.py b/cf/maths.py
@@ -92,7 +92,7 @@ def relative_vorticity(
     )  # pragman: no cover
 
 
-def histogram(*digitized):
+def histogram(*digitized, density=False):
     """Return the distribution of a set of variables in the form of an
     N-dimensional histogram.
 
@@ -151,6 +151,17 @@ def histogram(*digitized):
             manipulating the dimensions of the digitized field
             construct's data to ensure that broadcasting can occur.
 
+        density: `bool`, optional
+            If False, the default, the result will contain the number
+            of samples in each bin. If True, the result is the value
+            of the probability density function at the bin, normalized
+            such that the integral over the range is 1. Note that the
+            sum of the histogram values will not be equal to 1 unless
+            bins of unity size are chosen; it is not a probability
+            mass function.
+
+            .. versionadded:: NEXTVERSION
+
     :Returns:
 
         `Field`
@@ -197,7 +208,7 @@ def histogram(*digitized):
      [0.1174 0.1317]
      [0.1317 0.146 ]]
     >>> h = cf.histogram(indices)
-    >>> rint(h)
+    >>> print(h)
     Field: number_of_observations
     -----------------------------
     Data            : number_of_observations(specific_humidity(10)) 1
@@ -216,7 +227,21 @@ def histogram(*digitized):
      [0.1031 0.1174]
      [0.1174 0.1317]
      [0.1317 0.146 ]]
+    >>> p = cf.histogram(indices, density=True)
+    >>> print(p)
+    Field: long_name=probability density function
+    ---------------------------------------------
+    Data            : long_name=probability density function(specific_humidity(10))
+    Cell methods    : latitude: longitude: point
+    Dimension coords: specific_humidity(10) = [0.01015, ..., 0.13885] 1
+    >>> print(p.data.round(2).array))
+    [15.73 12.24 15.73 6.99 8.74 1.75 1.75 1.75 3.5 1.75]
+
+    The sum of the density values multiplied by the bin sizes is
+    unity:
 
+    >>> print((p * p.dimension_coordinate().cellsize).sum().array)
+    1.0
 
     Create a two-dimensional histogram based on specific humidity and
     temperature bins. The temperature bins in this example are derived
@@ -225,8 +250,8 @@ def histogram(*digitized):
 
     >>> g = f.copy()
     >>> g.standard_name = 'air_temperature'
-    >>> import numpy
-    >>> g[...] = numpy.random.normal(loc=290, scale=10, size=40).reshape(5, 8)
+    >>> import numpy as np
+    >>> g[...] = np.arange(40).reshape(5, 8) + 253.15
     >>> g.override_units('K', inplace=True)
     >>> print(g)
     Field: air_temperature (ncvar%q)
@@ -247,13 +272,28 @@ def histogram(*digitized):
     Dimension coords: air_temperature(5) = [281.1054839143287, ..., 313.9741786365939] K
                     : specific_humidity(10) = [0.01015, ..., 0.13885] 1
     >>> print(h.array)
-    [[2  1  5  3  2 -- -- -- -- --]
-     [1  1  2 --  1 --  1  1 -- --]
-     [4  4  2  1  1  1 -- --  1  1]
-     [1  1 -- --  1 -- -- --  1 --]
-     [1 -- -- -- -- -- -- -- -- --]]
+    [[3  3  2 -- -- -- -- -- -- --]
+     [1  1  2  1  3 -- -- -- -- --]
+     [1 -- --  1 --  1  1  1  2  1]
+     [2  1  1  2  2 -- -- -- -- --]
+     [2  2  4 -- -- -- -- -- -- --]]
     >>> h.sum()
     <CF Data(): 40 1>
+    >>> p = cf.histogram(indices, indices_t, density=True)
+    >>> print(p)
+    Field: long_name=probability density function
+    ---------------------------------------------
+    Data            : long_name=probability density function(air_temperature(5), specific_humidity(10))
+    Cell methods    : latitude: longitude: point
+    Dimension coords: air_temperature(5) = [257.05, ..., 288.25] K
+                    : specific_humidity(10) = [0.01015, ..., 0.13885] 1
+    >>> print(p.array)
+    >>> print(p.data.round(2).array))
+    [[0.67 0.67 0.45   --   --   --   --   --   --   --]
+     [0.22 0.22 0.45 0.22 0.67   --  --    --   --   --]
+     [0.22   --   -- 0.22   -- 0.22 0.22 0.22 0.45 0.22]
+     [0.45 0.22 0.22 0.45 0.45   --   --   --   --   --]
+     [0.45 0.45 0.9    --   --   --   --   --   --   --]]
 
     """
     if not digitized:
@@ -264,7 +304,26 @@ def histogram(*digitized):
     f = digitized[0].copy()
     f.clear_properties()
 
-    return f.bin("sample_size", digitized=digitized)
+    out = f.bin("sample_size", digitized=digitized)
+
+    if density:
+        # Get the measure of each bin. This is the outer product of
+        # the cell sizes of each dimension coordinate construct.
+        data_axes = out.get_data_axes()
+        dim = out.dimension_coordinate(filter_by_axis=(data_axes[0],))
+        bin_measures = dim.cellsize
+        for axis in data_axes[1:]:
+            dim = out.dimension_coordinate(filter_by_axis=(axis,))
+            bin_measures.outerproduct(dim.cellsize, inplace=True)
+
+        # Convert counts to densities
+        out /= out.data.sum() * bin_measures
+
+        out.override_units(Units(), inplace=True)
+        out.del_property("standard_name", None)
+        out.set_property("long_name", "probability density function")
+
+    return out
 
 
 def curl_xy(fx, fy, x_wrap=None, one_sided_at_boundary=False, radius=None):

diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py
@@ -4,6 +4,8 @@
 
 faulthandler.enable()  # to debug seg faults and timeouts
 
+import numpy as np
+
 import cf
 
 
@@ -207,6 +209,125 @@ def test_differential_operators(self):
         zeros[...] = 0
         self.assertTrue(cg.data.equals(zeros.data, rtol=0, atol=1e-15))
 
+    def test_histogram(self):
+        f = cf.example_field(0)
+        g = f.copy()
+        g.standard_name = "air_temperature"
+        g[...] = np.arange(40).reshape(5, 8) + 253.15
+        g.override_units("K", inplace=True)
+
+        # 1-d histogram
+        indices = f.digitize(10)
+        h = cf.histogram(indices)
+        self.assertTrue((h.array == [9, 7, 9, 4, 5, 1, 1, 1, 2, 1]).all)
+        h = cf.histogram(indices, density=True)
+        self.assertTrue(
+            (
+                h.array
+                == [
+                    15.734265734265733,
+                    12.237762237762242,
+                    15.734265734265733,
+                    6.9930069930069925,
+                    8.74125874125874,
+                    1.748251748251749,
+                    1.7482517482517475,
+                    1.748251748251749,
+                    3.496503496503498,
+                    1.748251748251744,
+                ]
+            ).all
+        )
+        integral = (h * h.dimension_coordinate().cellsize).sum()
+        self.assertEqual(integral.array, 1)
+
+        # 2-d histogram
+        indices_t = g.digitize(5)
+        h = cf.histogram(indices, indices_t)
+        self.assertEqual(h.Units, cf.Units())
+        self.assertTrue(
+            (
+                h.array
+                == [
+                    [3, 3, 2, -1, -1, -1, -1, -1, -1, -1],
+                    [1, 1, 2, 1, 3, -1, -1, -1, -1, -1],
+                    [1, -1, -1, 1, -1, 1, 1, 1, 2, 1],
+                    [2, 1, 1, 2, 2, -1, -1, -1, -1, -1],
+                    [2, 2, 4, -1, -1, -1, -1, -1, -1, -1],
+                ]
+            ).all()
+        )
+
+        h = cf.histogram(indices, indices_t, density=True)
+        self.assertEqual(h.Units, cf.Units())
+        self.assertTrue(
+            (
+                h.array
+                == [
+                    [
+                        0.6724045185583661,
+                        0.6724045185583664,
+                        0.44826967903891074,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                    ],
+                    [
+                        0.22413483951945457,
+                        0.2241348395194546,
+                        0.44826967903890913,
+                        0.22413483951945457,
+                        0.6724045185583637,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                    ],
+                    [
+                        0.22413483951945457,
+                        -1,
+                        -1,
+                        0.22413483951945457,
+                        -1,
+                        0.2241348395194547,
+                        0.22413483951945448,
+                        0.2241348395194547,
+                        0.4482696790389094,
+                        0.224134839519454,
+                    ],
+                    [
+                        0.4482696790389124,
+                        0.22413483951945626,
+                        0.2241348395194562,
+                        0.4482696790389124,
+                        0.4482696790389124,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                    ],
+                    [
+                        0.4482696790389059,
+                        0.44826967903890597,
+                        0.8965393580778118,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                        -1,
+                    ],
+                ]
+            ).all()
+        )
+
 
 if __name__ == "__main__":
     print("Run date:", datetime.datetime.now())