From e1dd3c9f4f14bce514542f8985eab296c9a4e7fb Mon Sep 17 00:00:00 2001
From: Simon Lin
Date: Wed, 31 Jul 2024 08:36:37 +1000
Subject: [PATCH] fix: Incorrect filter on categorical columns from parquet files

---
Reviewer notes (not part of the commit message):

* `use_min_max` used to test `dtype.to_physical().is_numeric()`; it now tests
  the logical dtype directly (`dtype.is_numeric() || dtype.is_temporal()`),
  in addition to the existing String / Binary / Boolean match.
* NOTE(review): presumably Categorical columns passed the old check because
  their physical representation is an integer type, so min/max statistics
  were applied to them -- confirm against `DataType::to_physical`.
* The new test writes a lexical Categorical column to parquet, then checks
  that `== "USD"` and `is_in(["USD"])` filters on `scan_parquet` each return
  the first row. The `17744` suffix in the test name looks like the issue
  number -- confirm.

 crates/polars-io/src/predicates.rs           |  3 ++-
 py-polars/tests/unit/io/test_lazy_parquet.py | 25 ++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/crates/polars-io/src/predicates.rs b/crates/polars-io/src/predicates.rs
index ad20aa102a9f..be3bf0ebff03 100644
--- a/crates/polars-io/src/predicates.rs
+++ b/crates/polars-io/src/predicates.rs
@@ -190,7 +190,8 @@ impl ColumnStats {
 
     /// Returns whether the [`DataType`] supports minimum/maximum operations.
     fn use_min_max(dtype: &DataType) -> bool {
-        dtype.to_physical().is_numeric()
+        dtype.is_numeric()
+            || dtype.is_temporal()
             || matches!(
                 dtype,
                 DataType::String | DataType::Binary | DataType::Boolean
diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py
index 6c4b2842ee13..7b7350bdf496 100644
--- a/py-polars/tests/unit/io/test_lazy_parquet.py
+++ b/py-polars/tests/unit/io/test_lazy_parquet.py
@@ -450,3 +450,28 @@ def test_parquet_schema_mismatch_panic_17067(tmp_path: Path, streaming: bool) ->
 
     with pytest.raises(pl.exceptions.SchemaError):
         pl.scan_parquet(tmp_path).collect(streaming=streaming)
+
+
+@pytest.mark.write_disk()
+def test_predicate_push_down_categorical_17744(tmp_path: Path) -> None:
+    path = tmp_path / "1"
+
+    df = pl.DataFrame(
+        data={
+            "n": [1, 2, 3],
+            "ccy": ["USD", "JPY", "EUR"],
+        },
+        schema_overrides={"ccy": pl.Categorical("lexical")},
+    )
+    df.write_parquet(path)
+    expect = df.head(1).with_columns(pl.col(pl.Categorical).cast(pl.String))
+
+    lf = pl.scan_parquet(path)
+
+    for predicate in [pl.col("ccy") == "USD", pl.col("ccy").is_in(["USD"])]:
+        assert_frame_equal(
+            lf.filter(predicate)
+            .with_columns(pl.col(pl.Categorical).cast(pl.String))
+            .collect(),
+            expect,
+        )