From 19e33865551a7ad5f08df6362ebeeb34cd2cfe87 Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 2 Aug 2024 08:59:35 -0400 Subject: [PATCH 01/11] allow float in interpolate_by by column --- .../ops/interpolation/interpolate_by.rs | 20 ++++++++++++---- .../unit/operations/test_interpolate_by.py | 23 +++++++++++++++---- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs b/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs index 674cbab514e9..c77d2ad6f157 100644 --- a/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs +++ b/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs @@ -87,7 +87,7 @@ fn interpolate_impl_by_sorted( ) -> PolarsResult> where T: PolarsNumericType, - F: PolarsIntegerType, + F: PolarsNumericType, I: Fn(T::Native, T::Native, &[F::Native], &mut Vec), { // This implementation differs from pandas as that boundary None's are not removed. @@ -169,7 +169,7 @@ fn interpolate_impl_by( ) -> PolarsResult> where T: PolarsNumericType, - F: PolarsIntegerType, + F: PolarsNumericType, I: Fn(T::Native, T::Native, &[F::Native], &mut [T::Native], &[IdxSize]), { // This implementation differs from pandas as that boundary None's are not removed. 
@@ -273,7 +273,7 @@ pub fn interpolate_by(s: &Series, by: &Series, by_is_sorted: bool) -> PolarsResu ) -> PolarsResult where T: PolarsNumericType, - F: PolarsIntegerType, + F: PolarsNumericType, ChunkedArray: IntoSeries, { if is_sorted { @@ -290,6 +290,18 @@ pub fn interpolate_by(s: &Series, by: &Series, by_is_sorted: bool) -> PolarsResu } match (s.dtype(), by.dtype()) { + (DataType::Float64, DataType::Float64) => { + func(s.f64().unwrap(), by.f64().unwrap(), by_is_sorted) + }, + (DataType::Float64, DataType::Float32) => { + func(s.f64().unwrap(), by.f32().unwrap(), by_is_sorted) + }, + (DataType::Float32, DataType::Float64) => { + func(s.f32().unwrap(), by.f64().unwrap(), by_is_sorted) + }, + (DataType::Float32, DataType::Float32) => { + func(s.f32().unwrap(), by.f32().unwrap(), by_is_sorted) + }, (DataType::Float64, DataType::Int64) => { func(s.f64().unwrap(), by.i64().unwrap(), by_is_sorted) }, @@ -326,7 +338,7 @@ pub fn interpolate_by(s: &Series, by: &Series, by_is_sorted: bool) -> PolarsResu _ => { polars_bail!(InvalidOperation: "expected series to be Float64, Float32, \ Int64, Int32, UInt64, UInt32, and `by` to be Date, Datetime, Int64, Int32, \ - UInt64, or UInt32") + UInt64, UInt32, Float32 or Float64") }, } } diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 423992abeadd..93acff0b6366 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -28,6 +28,8 @@ pl.Int32, pl.UInt64, pl.UInt32, + pl.Float32, + pl.Float64 ], ) @pytest.mark.parametrize( @@ -143,14 +145,16 @@ def test_interpolate_by_trailing_nulls() -> None: @given(data=st.data()) -def test_interpolate_vs_numpy(data: st.DataObject) -> None: +@pytest.mark.parametrize("x_dtype", [pl.Date, pl.Float64]) +def test_interpolate_vs_numpy(data: st.DataObject, x_dtype) -> None: + dataframe = ( data.draw( dataframes( [ column( "ts", - dtype=pl.Date, + 
dtype=x_dtype, allow_null=False, ), column( @@ -166,13 +170,24 @@ def test_interpolate_vs_numpy(data: st.DataObject) -> None: .fill_nan(None) .unique("ts") ) + + if x_dtype == pl.Float64: + assume(not dataframe['ts'].is_nan().any()) + assume(not dataframe['ts'].is_null().any()) + assume(not dataframe["ts"].is_in([float("-inf"), float("inf")]).any()) + assume(not dataframe["value"].is_null().all()) assume(not dataframe["value"].is_in([float("-inf"), float("inf")]).any()) + + dataframe = dataframe.sort('ts') + result = dataframe.select(pl.col("value").interpolate_by("ts"))["value"] mask = dataframe["value"].is_not_null() - x = dataframe["ts"].to_numpy().astype("int64") - xp = dataframe["ts"].filter(mask).to_numpy().astype("int64") + + np_dtype = "int64" if x_dtype == pl.Date else 'float64' + x = dataframe["ts"].to_numpy().astype(np_dtype) + xp = dataframe["ts"].filter(mask).to_numpy().astype(np_dtype) yp = dataframe["value"].filter(mask).to_numpy().astype("float64") interp = np.interp(x, xp, yp) # Polars preserves nulls on boundaries, but NumPy doesn't. 
From f13f211c11442fd1f748fa73519db5752f5d57cd Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 2 Aug 2024 15:57:29 -0400 Subject: [PATCH 02/11] Fix formatting problems --- py-polars/tests/unit/operations/test_interpolate_by.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 93acff0b6366..63efa88aeef1 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -147,7 +147,7 @@ def test_interpolate_by_trailing_nulls() -> None: @given(data=st.data()) @pytest.mark.parametrize("x_dtype", [pl.Date, pl.Float64]) def test_interpolate_vs_numpy(data: st.DataObject, x_dtype) -> None: - + dataframe = ( data.draw( dataframes( @@ -170,7 +170,7 @@ def test_interpolate_vs_numpy(data: st.DataObject, x_dtype) -> None: .fill_nan(None) .unique("ts") ) - + if x_dtype == pl.Float64: assume(not dataframe['ts'].is_nan().any()) assume(not dataframe['ts'].is_null().any()) @@ -184,7 +184,7 @@ def test_interpolate_vs_numpy(data: st.DataObject, x_dtype) -> None: result = dataframe.select(pl.col("value").interpolate_by("ts"))["value"] mask = dataframe["value"].is_not_null() - + np_dtype = "int64" if x_dtype == pl.Date else 'float64' x = dataframe["ts"].to_numpy().astype(np_dtype) xp = dataframe["ts"].filter(mask).to_numpy().astype(np_dtype) From f226b98376819c5a962d6516de71b9aa06cf37d3 Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 2 Aug 2024 15:59:43 -0400 Subject: [PATCH 03/11] type hint --- py-polars/tests/unit/operations/test_interpolate_by.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 63efa88aeef1..95f2eae1e81f 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py 
@@ -146,7 +146,7 @@ def test_interpolate_by_trailing_nulls() -> None: @given(data=st.data()) @pytest.mark.parametrize("x_dtype", [pl.Date, pl.Float64]) -def test_interpolate_vs_numpy(data: st.DataObject, x_dtype) -> None: +def test_interpolate_vs_numpy(data: st.DataObject, x_dtype: pl.DataType) -> None: dataframe = ( data.draw( From 200d09846da7214e491ae2d9bc523eca3044aa62 Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 2 Aug 2024 23:29:59 -0400 Subject: [PATCH 04/11] more formatting --- .../tests/unit/operations/test_interpolate_by.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 95f2eae1e81f..f073e5366617 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -29,7 +29,7 @@ pl.UInt64, pl.UInt32, pl.Float32, - pl.Float64 + pl.Float64, ], ) @pytest.mark.parametrize( @@ -147,7 +147,6 @@ def test_interpolate_by_trailing_nulls() -> None: @given(data=st.data()) @pytest.mark.parametrize("x_dtype", [pl.Date, pl.Float64]) def test_interpolate_vs_numpy(data: st.DataObject, x_dtype: pl.DataType) -> None: - dataframe = ( data.draw( dataframes( @@ -172,20 +171,20 @@ def test_interpolate_vs_numpy(data: st.DataObject, x_dtype: pl.DataType) -> None ) if x_dtype == pl.Float64: - assume(not dataframe['ts'].is_nan().any()) - assume(not dataframe['ts'].is_null().any()) + assume(not dataframe["ts"].is_nan().any()) + assume(not dataframe["ts"].is_null().any()) assume(not dataframe["ts"].is_in([float("-inf"), float("inf")]).any()) assume(not dataframe["value"].is_null().all()) assume(not dataframe["value"].is_in([float("-inf"), float("inf")]).any()) - dataframe = dataframe.sort('ts') + dataframe = dataframe.sort("ts") result = dataframe.select(pl.col("value").interpolate_by("ts"))["value"] mask = dataframe["value"].is_not_null() - np_dtype = "int64" if x_dtype 
== pl.Date else 'float64' + np_dtype = "int64" if x_dtype == pl.Date else "float64" x = dataframe["ts"].to_numpy().astype(np_dtype) xp = dataframe["ts"].filter(mask).to_numpy().astype(np_dtype) yp = dataframe["value"].filter(mask).to_numpy().astype("float64") From 0e3b77d793c0b6178d6f3ad2ef0e33ad7dcc29fb Mon Sep 17 00:00:00 2001 From: andrew Date: Fri, 2 Aug 2024 23:53:57 -0400 Subject: [PATCH 05/11] less wasteful hypothesis testing parameters --- py-polars/tests/unit/operations/test_interpolate_by.py | 1 + 1 file changed, 1 insertion(+) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index f073e5366617..095c00343356 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -155,6 +155,7 @@ def test_interpolate_vs_numpy(data: st.DataObject, x_dtype: pl.DataType) -> None "ts", dtype=x_dtype, allow_null=False, + strategy=st.floats(allow_nan=False, allow_infinity=False, allow_subnormal=False) if x_dtype == pl.Float64 else None, ), column( "value", From 04c7c2b4579764222acb282d7d62f04216e95466 Mon Sep 17 00:00:00 2001 From: andrew Date: Sat, 3 Aug 2024 00:07:09 -0400 Subject: [PATCH 06/11] formatting --- py-polars/tests/unit/operations/test_interpolate_by.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 095c00343356..5c5f89d331a2 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -147,6 +147,11 @@ def test_interpolate_by_trailing_nulls() -> None: @given(data=st.data()) @pytest.mark.parametrize("x_dtype", [pl.Date, pl.Float64]) def test_interpolate_vs_numpy(data: st.DataObject, x_dtype: pl.DataType) -> None: + if x_dtype == pl.Float64: + by_strategy = st.floats(allow_nan=False, allow_infinity=False, 
allow_subnormal=False) + else: + by_strategy = None + dataframe = ( data.draw( dataframes( @@ -155,7 +160,7 @@ def test_interpolate_vs_numpy(data: st.DataObject, x_dtype: pl.DataType) -> None "ts", dtype=x_dtype, allow_null=False, - strategy=st.floats(allow_nan=False, allow_infinity=False, allow_subnormal=False) if x_dtype == pl.Float64 else None, + strategy=by_strategy, ), column( "value", From 5b25b45c42198379e7bc51485327109a5fc22fae Mon Sep 17 00:00:00 2001 From: andrew Date: Sat, 3 Aug 2024 00:08:58 -0400 Subject: [PATCH 07/11] really? --- py-polars/tests/unit/operations/test_interpolate_by.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 5c5f89d331a2..4161a2f1c04c 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -148,7 +148,9 @@ def test_interpolate_by_trailing_nulls() -> None: @pytest.mark.parametrize("x_dtype", [pl.Date, pl.Float64]) def test_interpolate_vs_numpy(data: st.DataObject, x_dtype: pl.DataType) -> None: if x_dtype == pl.Float64: - by_strategy = st.floats(allow_nan=False, allow_infinity=False, allow_subnormal=False) + by_strategy = st.floats( + allow_nan=False, allow_infinity=False, allow_subnormal=False + ) else: by_strategy = None From 9210a080633c255a6b24054afe8feb576024b5a7 Mon Sep 17 00:00:00 2001 From: andrew Date: Sat, 3 Aug 2024 08:33:43 -0400 Subject: [PATCH 08/11] Add float version of test_interpolate_by_trailing_nulls --- .../unit/operations/test_interpolate_by.py | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 4161a2f1c04c..7e672890cbc4 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py 
@@ -118,8 +118,10 @@ def test_interpolate_by_leading_nulls() -> None: assert_frame_equal(result, expected) -def test_interpolate_by_trailing_nulls() -> None: - df = pl.DataFrame( +@pytest.mark.parametrize("dataset", ["floats", "dates"]) +def test_interpolate_by_trailing_nulls(dataset: str) -> None: + input_data = { + "dates": pl.DataFrame( { "times": [ date(2020, 1, 1), @@ -130,10 +132,25 @@ def test_interpolate_by_trailing_nulls() -> None: date(2020, 1, 13), ], "values": [1, None, None, 5, None, None], + }), + "floats": pl.DataFrame( + { + "times": [0.2, 0.4, 0.5, 0.6, 0.9, 1.1], + "values": [1, None, None, 5, None, None], } - ) + ) + } + + expected_data = { + "dates": pl.DataFrame({"values": [1.0, 1.7999999999999998, 4.6, 5.0, None, None]}), + "floats": pl.DataFrame({"values": [1.0, 3.0, 4.0, 5.0, None, None]}) + } + + df = input_data[dataset] + expected = expected_data[dataset] + result = df.select(pl.col("values").interpolate_by("times")) - expected = pl.DataFrame({"values": [1.0, 1.7999999999999998, 4.6, 5.0, None, None]}) + assert_frame_equal(result, expected) result = ( df.sort("times", descending=True) From bf3af7abaa70cd43e9c7ad731505e25ce79d1e6e Mon Sep 17 00:00:00 2001 From: andrew Date: Sat, 3 Aug 2024 10:02:09 -0400 Subject: [PATCH 09/11] more formatting --- .../unit/operations/test_interpolate_by.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 7e672890cbc4..39c6aa64d741 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -122,28 +122,31 @@ def test_interpolate_by_leading_nulls() -> None: def test_interpolate_by_trailing_nulls(dataset: str) -> None: input_data = { "dates": pl.DataFrame( - { - "times": [ - date(2020, 1, 1), - date(2020, 1, 3), - date(2020, 1, 10), - date(2020, 1, 11), - date(2020, 1, 12), - 
date(2020, 1, 13), - ], - "values": [1, None, None, 5, None, None], - }), + { + "times": [ + date(2020, 1, 1), + date(2020, 1, 3), + date(2020, 1, 10), + date(2020, 1, 11), + date(2020, 1, 12), + date(2020, 1, 13), + ], + "values": [1, None, None, 5, None, None], + } + ), "floats": pl.DataFrame( - { - "times": [0.2, 0.4, 0.5, 0.6, 0.9, 1.1], - "values": [1, None, None, 5, None, None], - } - ) + { + "times": [0.2, 0.4, 0.5, 0.6, 0.9, 1.1], + "values": [1, None, None, 5, None, None], + } + ), } expected_data = { - "dates": pl.DataFrame({"values": [1.0, 1.7999999999999998, 4.6, 5.0, None, None]}), - "floats": pl.DataFrame({"values": [1.0, 3.0, 4.0, 5.0, None, None]}) + "dates": pl.DataFrame( + {"values": [1.0, 1.7999999999999998, 4.6, 5.0, None, None]} + ), + "floats": pl.DataFrame({"values": [1.0, 3.0, 4.0, 5.0, None, None]}), } df = input_data[dataset] From c7c9ca328626242da8999c8f4c3d76ceed0971cb Mon Sep 17 00:00:00 2001 From: andrew Date: Mon, 5 Aug 2024 18:51:38 -0400 Subject: [PATCH 10/11] use @given instead of parametrize --- py-polars/tests/unit/operations/test_interpolate_by.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 39c6aa64d741..0e981ec09e1f 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -164,8 +164,7 @@ def test_interpolate_by_trailing_nulls(dataset: str) -> None: assert_frame_equal(result, expected) -@given(data=st.data()) -@pytest.mark.parametrize("x_dtype", [pl.Date, pl.Float64]) +@given(data=st.data(), x_dtype=st.sampled_from([pl.Date, pl.Float64])) def test_interpolate_vs_numpy(data: st.DataObject, x_dtype: pl.DataType) -> None: if x_dtype == pl.Float64: by_strategy = st.floats( From c9ff1a858565e9b311592f19fbfdf18dcfe3fa1c Mon Sep 17 00:00:00 2001 From: andrew Date: Thu, 8 Aug 2024 21:41:08 -0400 Subject: [PATCH 11/11] try 
float bounds on hypothesis test --- py-polars/tests/unit/operations/test_interpolate_by.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 0e981ec09e1f..98ee656fdaed 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -168,7 +168,11 @@ def test_interpolate_by_trailing_nulls(dataset: str) -> None: def test_interpolate_vs_numpy(data: st.DataObject, x_dtype: pl.DataType) -> None: if x_dtype == pl.Float64: by_strategy = st.floats( - allow_nan=False, allow_infinity=False, allow_subnormal=False + min_value=-1e150, + max_value=1e150, + allow_nan=False, + allow_infinity=False, + allow_subnormal=False, ) else: by_strategy = None