Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New keyword parameter to cf.histogram: density #795

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
dev
davidhassell committed Jul 15, 2024
commit f8c757534df19d33b8c72f83331ba0124f787a12
2 changes: 2 additions & 0 deletions Changelog.rst
Original file line number Diff line number Diff line change
@@ -3,6 +3,8 @@ version NEXTVERSION

**2024-??-??**

* New keyword parameter to `cf.histogram`: ``density``
(https://github.com/NCAS-CMS/cf-python/issues/794)
* New method: `cf.Field.is_discrete_axis`
(https://github.com/NCAS-CMS/cf-python/issues/784)
* Include the UM version as a field property when reading UM files
79 changes: 69 additions & 10 deletions cf/maths.py
Original file line number Diff line number Diff line change
@@ -92,7 +92,7 @@ def relative_vorticity(
) # pragman: no cover


def histogram(*digitized):
def histogram(*digitized, density=False):
"""Return the distribution of a set of variables in the form of an
N-dimensional histogram.

@@ -151,6 +151,17 @@ def histogram(*digitized):
manipulating the dimensions of the digitized field
construct's data to ensure that broadcasting can occur.

density: `bool`, optional
If False, the default, the result will contain the number
of samples in each bin. If True, the result is the value
of the probability density function at the bin, normalized
such that the integral over the range is 1. Note that the
sum of the histogram values will not be equal to 1 unless
bins of unity size are chosen; it is not a probability
mass function.

.. versionadded:: NEXTVERSION

:Returns:

`Field`
@@ -197,7 +208,7 @@ def histogram(*digitized):
[0.1174 0.1317]
[0.1317 0.146 ]]
>>> h = cf.histogram(indices)
>>> rint(h)
>>> print(h)
Field: number_of_observations
-----------------------------
Data : number_of_observations(specific_humidity(10)) 1
@@ -216,7 +227,21 @@ def histogram(*digitized):
[0.1031 0.1174]
[0.1174 0.1317]
[0.1317 0.146 ]]
>>> p = cf.histogram(indices, density=True)
>>> print(p)
Field: long_name=probability density function
---------------------------------------------
Data : long_name=probability density function(specific_humidity(10))
Cell methods : latitude: longitude: point
Dimension coords: specific_humidity(10) = [0.01015, ..., 0.13885] 1
>>> print(p.data.round(2).array))
[15.73 12.24 15.73 6.99 8.74 1.75 1.75 1.75 3.5 1.75]

The sum of the density values multiplied by the bin sizes is
unity:

>>> print((p * p.dimension_coordinate().cellsize).sum().array)
1.0

Create a two-dimensional histogram based on specific humidity and
temperature bins. The temperature bins in this example are derived
@@ -225,8 +250,8 @@ def histogram(*digitized):

>>> g = f.copy()
>>> g.standard_name = 'air_temperature'
>>> import numpy
>>> g[...] = numpy.random.normal(loc=290, scale=10, size=40).reshape(5, 8)
>>> import numpy as np
>>> g[...] = np.arange(40).reshape(5, 8) + 253.15
>>> g.override_units('K', inplace=True)
>>> print(g)
Field: air_temperature (ncvar%q)
@@ -247,13 +272,28 @@ def histogram(*digitized):
Dimension coords: air_temperature(5) = [281.1054839143287, ..., 313.9741786365939] K
: specific_humidity(10) = [0.01015, ..., 0.13885] 1
>>> print(h.array)
[[2 1 5 3 2 -- -- -- -- --]
[1 1 2 -- 1 -- 1 1 -- --]
[4 4 2 1 1 1 -- -- 1 1]
[1 1 -- -- 1 -- -- -- 1 --]
[1 -- -- -- -- -- -- -- -- --]]
[[3 3 2 -- -- -- -- -- -- --]
[1 1 2 1 3 -- -- -- -- --]
[1 -- -- 1 -- 1 1 1 2 1]
[2 1 1 2 2 -- -- -- -- --]
[2 2 4 -- -- -- -- -- -- --]]
>>> h.sum()
<CF Data(): 40 1>
>>> p = cf.histogram(indices, indices_t, density=True)
>>> print(p)
Field: long_name=probability density function
---------------------------------------------
Data : long_name=probability density function(air_temperature(5), specific_humidity(10))
Cell methods : latitude: longitude: point
Dimension coords: air_temperature(5) = [257.05, ..., 288.25] K
: specific_humidity(10) = [0.01015, ..., 0.13885] 1
>>> print(p.array)
>>> print(p.data.round(2).array))
[[0.67 0.67 0.45 -- -- -- -- -- -- --]
[0.22 0.22 0.45 0.22 0.67 -- -- -- -- --]
[0.22 -- -- 0.22 -- 0.22 0.22 0.22 0.45 0.22]
[0.45 0.22 0.22 0.45 0.45 -- -- -- -- --]
[0.45 0.45 0.9 -- -- -- -- -- -- --]]

"""
if not digitized:
@@ -264,7 +304,26 @@ def histogram(*digitized):
f = digitized[0].copy()
f.clear_properties()

return f.bin("sample_size", digitized=digitized)
out = f.bin("sample_size", digitized=digitized)

if density:
# Get the measure of each bin. This is the outer product of
# the cell sizes of each dimension coordinate construct.
data_axes = out.get_data_axes()
dim = out.dimension_coordinate(filter_by_axis=(data_axes[0],))
bin_measures = dim.cellsize
for axis in data_axes[1:]:
dim = out.dimension_coordinate(filter_by_axis=(axis,))
bin_measures.outerproduct(dim.cellsize, inplace=True)

# Convert counts to densities
out /= out.data.sum() * bin_measures

out.override_units(Units(), inplace=True)
out.del_property("standard_name", None)
out.set_property("long_name", "probability density function")

return out


def curl_xy(fx, fy, x_wrap=None, one_sided_at_boundary=False, radius=None):
121 changes: 121 additions & 0 deletions cf/test/test_Maths.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,8 @@

faulthandler.enable() # to debug seg faults and timeouts

import numpy as np

import cf


@@ -207,6 +209,125 @@ def test_differential_operators(self):
zeros[...] = 0
self.assertTrue(cg.data.equals(zeros.data, rtol=0, atol=1e-15))

def test_histogram(self):
f = cf.example_field(0)
g = f.copy()
g.standard_name = "air_temperature"
g[...] = np.arange(40).reshape(5, 8) + 253.15
g.override_units("K", inplace=True)

# 1-d histogram
indices = f.digitize(10)
h = cf.histogram(indices)
self.assertTrue((h.array == [9, 7, 9, 4, 5, 1, 1, 1, 2, 1]).all)
h = cf.histogram(indices, density=True)
self.assertTrue(
(
h.array
== [
15.734265734265733,
12.237762237762242,
15.734265734265733,
6.9930069930069925,
8.74125874125874,
1.748251748251749,
1.7482517482517475,
1.748251748251749,
3.496503496503498,
1.748251748251744,
]
).all
)
integral = (h * h.dimension_coordinate().cellsize).sum()
self.assertEqual(integral.array, 1)

# 2-d histogram
indices_t = g.digitize(5)
h = cf.histogram(indices, indices_t)
self.assertEqual(h.Units, cf.Units())
self.assertTrue(
(
h.array
== [
[3, 3, 2, -1, -1, -1, -1, -1, -1, -1],
[1, 1, 2, 1, 3, -1, -1, -1, -1, -1],
[1, -1, -1, 1, -1, 1, 1, 1, 2, 1],
[2, 1, 1, 2, 2, -1, -1, -1, -1, -1],
[2, 2, 4, -1, -1, -1, -1, -1, -1, -1],
]
).all()
)

h = cf.histogram(indices, indices_t, density=True)
self.assertEqual(h.Units, cf.Units())
self.assertTrue(
(
h.array
== [
[
0.6724045185583661,
0.6724045185583664,
0.44826967903891074,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
],
[
0.22413483951945457,
0.2241348395194546,
0.44826967903890913,
0.22413483951945457,
0.6724045185583637,
-1,
-1,
-1,
-1,
-1,
],
[
0.22413483951945457,
-1,
-1,
0.22413483951945457,
-1,
0.2241348395194547,
0.22413483951945448,
0.2241348395194547,
0.4482696790389094,
0.224134839519454,
],
[
0.4482696790389124,
0.22413483951945626,
0.2241348395194562,
0.4482696790389124,
0.4482696790389124,
-1,
-1,
-1,
-1,
-1,
],
[
0.4482696790389059,
0.44826967903890597,
0.8965393580778118,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
],
]
).all()
)


if __name__ == "__main__":
print("Run date:", datetime.datetime.now())