From 7473b0493b1ee09094578e31a95008ff8bc13724 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Fri, 8 Nov 2024 08:24:12 +0000 Subject: [PATCH 01/20] docs: Update Excel page of user guide to refer to fastexcel as the default engine (#19691) Co-authored-by: Liam Brannigan Co-authored-by: Alexander Beedie --- docs/source/user-guide/io/excel.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/source/user-guide/io/excel.md b/docs/source/user-guide/io/excel.md index 48ad3d995789..f5515d789645 100644 --- a/docs/source/user-guide/io/excel.md +++ b/docs/source/user-guide/io/excel.md @@ -5,25 +5,23 @@ From a performance perspective, we recommend using other formats if possible, su ## Read -Polars does not have a native Excel reader. Instead, it uses external libraries to parse Excel files into objects that Polars can parse. The available engines are: +Polars does not have a native Excel reader. Instead, it uses an external library called an "engine" to parse Excel files into a form that Polars can process. The available engines are: -- xlsx2csv: This is the current default. +- fastexcel: This engine is based on the Rust [calamine](https://github.com/tafia/calamine) crate and is (by far) the fastest reader. +- xlsx2csv: This reader parses the .xlsx file to an in-memory CSV that Polars then reads with its own CSV reader. - openpyxl: Typically slower than xlsx2csv, but can provide more flexibility for files that are difficult to parse. -- fastexcel: This reader is based on [calamine](https://github.com/tafia/calamine) and is typically the fastest reader but has fewer features than xls2csv. -Although fastexcel is not the default at this point, we recommend trying fastexcel first and using xlsx2csv or openpyxl if you encounter issues. +We recommend working with the default fastexcel engine. The xlsx2csv and openpyxl engines are slower but may have more features for parsing tricky data. These engines may be helpful if the fastexcel reader does not work for a specific Excel file. To use one of these engines, the appropriate Python package must be installed as an additional dependency. === ":fontawesome-brands-python: Python" ```shell - $ pip install xlsx2csv openpyxl fastexcel + $ pip install fastexcel xlsx2csv openpyxl ``` -The default Excel reader is xlsx2csv. -It is a Python library which parses the Excel file into a CSV file which Polars then reads with the native CSV reader. -We read an Excel file with `read_excel`: +The default engine for reading .xlsx files is fastexcel. This engine uses the Rust calamine crate to read .xlsx files into an Apache Arrow in-memory representation that Polars can read without needing to copy the data.
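As a minimal sketch of what this looks like in practice (the file name `my_data.xlsx` is hypothetical, and fastexcel must be installed for the default engine to be available):

```python
import polars as pl

# With fastexcel installed, `read_excel` uses the calamine-based engine by
# default, so no extra arguments are needed.
df = pl.read_excel("my_data.xlsx")

# The engine can also be requested explicitly; the fastexcel-backed engine
# is selected with the "calamine" engine name.
df = pl.read_excel("my_data.xlsx", engine="calamine")
```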
{{code_block('user-guide/io/excel','read',['read_excel'])}} From 772d023b948e519f68c55422ecac63de9d30febb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Fri, 8 Nov 2024 09:15:58 +0000 Subject: [PATCH 02/20] docs: Revise and rework user-guide/expressions (#19360) --- crates/polars-plan/src/dsl/mod.rs | 2 +- docs/source/_build/API_REFERENCE_LINKS.yml | 682 ++++++++++-------- docs/source/_build/css/extra.css | 81 ++- docs/source/_build/js/mathjax.js | 19 + .../python/user-guide/concepts/contexts.py | 55 -- .../concepts/data-types-and-structures.py | 24 + .../concepts/data-types/categoricals.py | 107 --- .../user-guide/expressions/aggregation.py | 55 +- .../python/user-guide/expressions/casting.py | 120 +-- .../user-guide/expressions/categoricals.py | 208 ++++++ .../expressions/column-selections.py | 2 - .../expressions/expression-expansion.py | 198 +++++ .../python/user-guide/expressions/folds.py | 67 +- .../user-guide/expressions/functions.py | 60 -- .../python/user-guide/expressions/lists.py | 186 +++-- .../user-guide/expressions/missing-data.py | 49 +- .../user-guide/expressions/operations.py | 132 ++++ .../user-guide/expressions/operators.py | 44 -- .../python/user-guide/expressions/strings.py | 113 ++- .../python/user-guide/expressions/structs.py | 93 ++- .../python/user-guide/expressions/window.py | 131 +++- docs/source/src/rust/Cargo.toml | 76 +- .../src/rust/user-guide/concepts/contexts.rs | 70 -- .../user-guide/expressions/aggregation.rs | 32 +- .../rust/user-guide/expressions/casting.rs | 199 ++--- .../expressions/column-selections.rs | 2 +- .../expressions/expression-expansion.rs | 215 ++++++ .../src/rust/user-guide/expressions/folds.rs | 49 +- .../rust/user-guide/expressions/functions.rs | 79 -- .../src/rust/user-guide/expressions/lists.rs | 186 +---- .../user-guide/expressions/missing-data.rs | 59 +- .../rust/user-guide/expressions/operations.rs | 138 ++++ .../rust/user-guide/expressions/operators.rs | 54 -- .../rust/user-guide/expressions/strings.rs | 92 +-- .../rust/user-guide/expressions/structs.rs | 80 +- .../src/rust/user-guide/expressions/window.rs | 71 +- .../concepts/data-types-and-structures.md | 31 +- .../concepts/expressions-and-contexts.md | 2 +- .../user-guide/expressions/aggregation.md | 131 ++-- .../expressions/athletes_over_country.svg | 84 +++ .../athletes_over_country_explode.svg | 85 +++ .../expressions/basic-operations.md | 123 ++++ docs/source/user-guide/expressions/casting.md | 81 ++- .../expressions/categorical-data-and-enums.md | 396 +++++++--- .../expressions/column-selections.md | 134 ---- .../expressions/expression-expansion.md | 363 ++++++++++ docs/source/user-guide/expressions/folds.md | 62 +- .../user-guide/expressions/functions.md | 65 -- docs/source/user-guide/expressions/index.md | 34 +- .../expressions/lists-and-arrays.md | 184 +++++ docs/source/user-guide/expressions/lists.md | 119 --- .../user-guide/expressions/missing-data.md | 118 +-- .../user-guide/expressions/numpy-functions.md | 24 + docs/source/user-guide/expressions/numpy.md | 22 - .../user-guide/expressions/operators.md | 30 - .../expressions/speed_rank_by_type.svg | 102 +++ docs/source/user-guide/expressions/strings.md | 117 ++- docs/source/user-guide/expressions/structs.md | 113 ++- ...ns.md => user-defined-python-functions.md} | 12 +- .../expressions/window-functions.md | 147 ++++ docs/source/user-guide/expressions/window.md | 91 --- docs/source/user-guide/getting-started.md | 2 +- 
docs/source/user-guide/installation.md | 6 +- .../your-first-polars-plugin.md} | 4 +- .../user-guide/transformations/index.md | 2 + mkdocs.yml | 25 +- py-polars/docs/source/_static/css/custom.css | 2 +- 67 files changed, 4083 insertions(+), 2458 deletions(-) create mode 100644 docs/source/_build/js/mathjax.js delete mode 100644 docs/source/src/python/user-guide/concepts/contexts.py delete mode 100644 docs/source/src/python/user-guide/concepts/data-types/categoricals.py create mode 100644 docs/source/src/python/user-guide/expressions/categoricals.py create mode 100644 docs/source/src/python/user-guide/expressions/expression-expansion.py delete mode 100644 docs/source/src/python/user-guide/expressions/functions.py create mode 100644 docs/source/src/python/user-guide/expressions/operations.py delete mode 100644 docs/source/src/python/user-guide/expressions/operators.py delete mode 100644 docs/source/src/rust/user-guide/concepts/contexts.rs create mode 100644 docs/source/src/rust/user-guide/expressions/expression-expansion.rs delete mode 100644 docs/source/src/rust/user-guide/expressions/functions.rs create mode 100644 docs/source/src/rust/user-guide/expressions/operations.rs delete mode 100644 docs/source/src/rust/user-guide/expressions/operators.rs create mode 100644 docs/source/user-guide/expressions/athletes_over_country.svg create mode 100644 docs/source/user-guide/expressions/athletes_over_country_explode.svg create mode 100644 docs/source/user-guide/expressions/basic-operations.md delete mode 100644 docs/source/user-guide/expressions/column-selections.md create mode 100644 docs/source/user-guide/expressions/expression-expansion.md delete mode 100644 docs/source/user-guide/expressions/functions.md create mode 100644 docs/source/user-guide/expressions/lists-and-arrays.md delete mode 100644 docs/source/user-guide/expressions/lists.md create mode 100644 docs/source/user-guide/expressions/numpy-functions.md delete mode 100644 docs/source/user-guide/expressions/numpy.md delete mode 100644 docs/source/user-guide/expressions/operators.md create mode 100644 docs/source/user-guide/expressions/speed_rank_by_type.svg rename docs/source/user-guide/expressions/{user-defined-functions.md => user-defined-python-functions.md} (95%) create mode 100644 docs/source/user-guide/expressions/window-functions.md delete mode 100644 docs/source/user-guide/expressions/window.md rename docs/source/user-guide/{expressions/plugins.md => plugins/your-first-polars-plugin.md} (98%) diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 3d6b92aeba67..e8bc56d9ead9 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -1276,7 +1276,7 @@ impl Expr { /// Exclude a column from a wildcard/regex selection. /// - /// You may also use regexes in the exclude as long as they start with `^` and end with `$`/ + /// You may also use regexes in the exclude as long as they start with `^` and end with `$`. 
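///
/// For example (a sketch; the `temp_` column names are hypothetical):
///
/// ```ignore
/// // Keep every column except those whose names match the regex.
/// let expr = all().exclude(["^temp_.*$"]);
/// ```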
pub fn exclude(self, columns: impl IntoVec) -> Expr { let v = columns.into_vec().into_iter().map(Excluded::Name).collect(); Expr::Exclude(Arc::new(self), v) diff --git a/docs/source/_build/API_REFERENCE_LINKS.yml b/docs/source/_build/API_REFERENCE_LINKS.yml index 1e301f592cb1..6a0ce39d1044 100644 --- a/docs/source/_build/API_REFERENCE_LINKS.yml +++ b/docs/source/_build/API_REFERENCE_LINKS.yml @@ -1,388 +1,401 @@ python: - DataFrame: https://docs.pola.rs/api/python/stable/reference/dataframe/index.html - LazyFrame: https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html - Series: https://docs.pola.rs/api/python/stable/reference/series/index.html - Categorical: https://docs.pola.rs/api/python/stable/reference/api/polars.datatypes.Categorical.html - Config: https://docs.pola.rs/api/python/stable/reference/config.html - select: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.select.html - filter: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.filter.html - with_columns: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.with_columns.html - group_by: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.group_by.html agg: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.dataframe.group_by.GroupBy.agg.html - join: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html - vstack: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.vstack.html - read_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html - write_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_csv.html - read_excel: https://docs.pola.rs/api/python/stable/reference/api/polars.read_excel.html - write_excel: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_excel.html - read_json: https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html - write_json: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_json.html - read_ipc: https://docs.pola.rs/api/python/stable/reference/api/polars.read_ipc.html - min: https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.min.html - max: https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.max.html - value_counts: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.value_counts.html - unnest: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unnest.html - struct: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.struct.html - is_duplicated: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.is_duplicated.html - sample: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.sample.html - head: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.head.html - glimpse: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.glimpse.html - tail: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.tail.html - describe: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.describe.html + alias: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.alias.html + all: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.all.html + approx_n_unique: 
https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.approx_n_unique.html + Array: https://docs.pola.rs/api/python/stable/reference/api/polars.datatypes.Array.html + cast: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.cast.html + Categorical: https://docs.pola.rs/api/python/stable/reference/api/polars.datatypes.Categorical.html col: https://docs.pola.rs/api/python/stable/reference/expressions/col.html - sort: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.sort.html - scan_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_csv.html collect: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.collect.html - fold: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.fold.html + concat: https://docs.pola.rs/api/python/stable/reference/api/polars.concat.html + concat_list: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.concat_list.html concat_str: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.concat_str.html - str.split: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.split.html + Config: https://docs.pola.rs/api/python/stable/reference/config.html + cs.by_name: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.by_name + cs.contains: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.contains + cs.first: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.first + cs.matches: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.matches + cs.numeric: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.numeric + cs.temporal: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.temporal + DataFrame: https://docs.pola.rs/api/python/stable/reference/dataframe/index.html + DataFrame.explode: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.explode.html + date_range: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.date_range.html + datetime_range: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.datetime_range.html + describe: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.describe.html + dt.convert_time_zone: + name: dt.convert_time_zone + link: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.convert_time_zone.html + feature_flags: [timezone] + dt.replace_time_zone: + name: dt.replace_time_zone + link: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.replace_time_zone.html + feature_flags: [timezone] + dt.to_string: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.to_string.html + dt.year: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.year.html + element: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.element.html + Enum: https://docs.pola.rs/api/python/stable/reference/api/polars.datatypes.Enum.html + estimated_size: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.estimated_size.html + exclude: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.exclude.html + expand_selector: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.expand_selector + explain: 
https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.explain.html + explode: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.explode.html + Expr.arr: + name: arr namespace + link: https://docs.pola.rs/api/python/stable/reference/expressions/array.html + Expr.dt: + name: dt namespace + link: https://docs.pola.rs/api/python/stable/reference/expressions/temporal.html Expr.list: - name: "list namespace" + name: list namespace link: https://docs.pola.rs/api/python/stable/reference/expressions/list.html + Expr.name: + name: name namespace + link: https://docs.pola.rs/api/python/stable/reference/expressions/name.html Expr.str: - name: "str namespace" + name: str namespace link: https://docs.pola.rs/api/python/stable/reference/expressions/string.html - element: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.element.html - all: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.all.html - exclude: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.exclude.html - alias: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.alias.html - prefix: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.prefix.html - suffix: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.suffix.html - n_unique: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.n_unique.html - approx_n_unique: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.approx_n_unique.html - when: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.when.html - concat_list: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.concat_list.html - list.eval: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.list.eval.html - null_count: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.null_count.html - is_null: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.is_null.html + fetch: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.fetch.html + fill_nan: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.fill_nan.html fill_null: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.fill_null.html + filter: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.filter.html + fold: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.fold.html + from_arrow: + name: from_arrow + link: https://docs.pola.rs/api/python/stable/reference/api/polars.from_arrow.html + feature_flags: [fsspec, pyarrow] + glimpse: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.glimpse.html + group_by: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.group_by.html + group_by_dynamic: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.group_by_dynamic.html + head: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.head.html + implode: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.implode.html interpolate: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.interpolate.html - fill_nan: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.fill_nan.html + is_between: 
https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.is_between.html + is_duplicated: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.is_duplicated.html + is_null: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.is_null.html + is_selector: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.is_selector + join: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html + join_asof: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html + join_where: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html + lazy: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.lazy.html + LazyFrame: https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html + List: https://docs.pola.rs/api/python/stable/reference/api/polars.datatypes.List.html + list.eval: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.list.eval.html + map: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.map.html + map_elements: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_elements.html + max: https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.max.html + min: https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.min.html + n_unique: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.n_unique.html + np.log: + name: log + link: https://numpy.org/doc/stable/reference/generated/numpy.log.html + feature_flags: [numpy] + null_count: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.null_count.html operators: https://docs.pola.rs/api/python/stable/reference/expressions/operators.html over: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.over.html - implode: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.implode.html - DataFrame.explode: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.explode.html - read_database_connectorx: + pivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.pivot.html + prefix: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.prefix.html + rank: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.rank.html + read_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html + read_database: name: read_database link: https://docs.pola.rs/api/python/stable/reference/api/polars.read_database.html - feature_flags: ['connectorx'] - read_database: + read_database_connectorx: name: read_database link: https://docs.pola.rs/api/python/stable/reference/api/polars.read_database.html - write_database: - name: write_database - link: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_database.html + feature_flags: [connectorx] read_database_uri: https://docs.pola.rs/api/python/stable/reference/api/polars.read_database_uri.html - read_parquet: https://docs.pola.rs/api/python/stable/reference/api/polars.read_parquet.html - write_parquet: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_parquet.html - scan_parquet: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_parquet.html - scan_ipc: 
https://docs.pola.rs/api/python/stable/reference/api/polars.scan_ipc.html + read_excel: https://docs.pola.rs/api/python/stable/reference/api/polars.read_excel.html + read_ipc: https://docs.pola.rs/api/python/stable/reference/api/polars.read_ipc.html read_json: https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html read_ndjson: https://docs.pola.rs/api/python/stable/reference/api/polars.read_ndjson.html - write_ndjson: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_ndjson.html - write_json: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_json.html + read_parquet: https://docs.pola.rs/api/python/stable/reference/api/polars.read_parquet.html + round: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.round.html#polars.Expr.round + sample: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.sample.html + scan_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_csv.html + scan_ipc: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_ipc.html scan_ndjson: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_ndjson.html + scan_parquet: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_parquet.html scan_pyarrow_dataset: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_pyarrow_dataset.html - from_arrow: - name: from_arrow - link: https://docs.pola.rs/api/python/stable/reference/api/polars.from_arrow.html - feature_flags: ['fsspec','pyarrow'] + select: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.select.html + selectors: https://docs.pola.rs/api/python/stable/reference/selectors.html + Series: https://docs.pola.rs/api/python/stable/reference/series/index.html + Series.arr: https://docs.pola.rs/api/python/stable/reference/series/array.html + Series.dt.day: https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.dt.day.html show_graph: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.show_graph.html - lazy: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.lazy.html - explain: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.explain.html - fetch: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.fetch.html + sort: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.sort.html SQLContext: https://docs.pola.rs/api/python/stable/reference/sql/python_api.html#polars.SQLContext + SQLexecute: + name: execute + link: https://docs.pola.rs/api/python/stable/reference/sql/api/polars.SQLContext.execute.html SQLregister: name: register link: https://docs.pola.rs/api/python/stable/reference/sql/api/polars.SQLContext.register.html#polars.SQLContext.register SQLregister_many: name: register_many link: https://docs.pola.rs/api/python/stable/reference/sql/api/polars.SQLContext.register_many.html - SQLexecute: - name: execute - link: https://docs.pola.rs/api/python/stable/reference/sql/api/polars.SQLContext.execute.html - join_asof: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html - join_where: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html - concat: https://docs.pola.rs/api/python/stable/reference/api/polars.concat.html - pivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.pivot.html - unpivot: 
https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unpivot.html - is_between: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.is_between.html - - date_range: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.date_range.html - upsample: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.upsample.html - group_by_dynamic: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.group_by_dynamic.html - cast: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.cast.html - np.log: - name: log - link: https://numpy.org/doc/stable/reference/generated/numpy.log.html - feature_flags: ['numpy'] - Array: https://docs.pola.rs/api/python/stable/reference/api/polars.datatypes.Array.html - Series.arr: https://docs.pola.rs/api/python/stable/reference/series/array.html - Series.dt.day: https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.dt.day.html - - selectors: https://docs.pola.rs/api/python/stable/reference/selectors.html - cs.numeric: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.numeric - cs.by_name: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.by_name - cs.first: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.first - cs.temporal: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.temporal - cs.contains: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.contains - cs.matches: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.matches - is_selector: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.is_selector - expand_selector: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.expand_selector - - Expr.dt: - name: "dt namespace" - link: https://docs.pola.rs/api/python/stable/reference/expressions/temporal.html - dt.convert_time_zone: - name: dt.convert_time_zone - link: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.convert_time_zone.html - feature_flags: ['timezone'] - dt.replace_time_zone: - name: dt.replace_time_zone - link: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.replace_time_zone.html - feature_flags: ['timezone'] - dt.to_string: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.to_string.html - dt.year: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.year.html - - str.starts_with: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.starts_with.html + str.contains: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.contains.html str.ends_with: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.ends_with.html str.extract: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.extract.html str.extract_all: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.extract_all.html - str.contains: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.contains.html + str.head: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.head.html + str.len_bytes: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.len_bytes.html + 
str.len_chars: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.len_chars.html str.replace: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.replace.html str.replace_all: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.replace_all.html - str.to_datetime: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_datetime.html + str.slice: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.slice.html + str.split: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.split.html + str.starts_with: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.starts_with.html + str.strip_chars: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_chars.html + str.strip_chars_end: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_chars_end.html + str.strip_chars_start: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_chars_start.html + str.strip_prefix: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_prefix.html + str.strip_suffix: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_suffix.html + str.tail: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.tail.html str.to_date: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_date.html - str.len_chars: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.len_chars.html - str.len_bytes: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.len_bytes.html - + str.to_datetime: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_datetime.html + str.to_lowercase: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_lowercase.html + str.to_titlecase: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_titlecase.html + str.to_uppercase: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_uppercase.html + StringCache: https://docs.pola.rs/api/python/stable/reference/api/polars.StringCache.html + struct: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.struct.html struct.field: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.struct.field.html struct.rename_fields: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.struct.rename_fields.html - Expr.name: - name: "name namespace" - link: https://docs.pola.rs/api/python/stable/reference/expressions/name.html - round: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.round.html#polars.Expr.round + suffix: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.suffix.html + tail: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.tail.html + unique: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.unique.html + unique_counts: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.unique_counts.html + unnest: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unnest.html + unpivot: 
https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unpivot.html + upsample: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.upsample.html + value_counts: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.value_counts.html + vstack: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.vstack.html + when: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.when.html + with_columns: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.with_columns.html + write_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_csv.html + write_database: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_database.html + write_excel: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_excel.html + write_json: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_json.html + write_ndjson: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_ndjson.html + write_parquet: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_parquet.html rust: - DataFrame: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html - LazyFrame: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html - Series: https://docs.pola.rs/api/rust/dev/polars/series/struct.Series.html + agg: https://docs.rs/polars/latest/polars/prelude/struct.LazyGroupBy.html#method.agg + alias: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.alias + all: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.all.html + approx_n_unique: + name: approx_n_unique + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.approx_n_unique + feature_flags: [approx_unique] + arr.eval: + name: arr + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.arr + feature_flags: [list_eval, rank] + Array: + name: Array + link: https://docs.pola.rs/api/rust/dev/polars/datatypes/enum.DataType.html#variant.Array + feature_flags: [dtype-array] + cast: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.cast Categorical: name: Categorical link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.DataType.html#variant.Categorical - feature_flags: ['dtype-categorical'] - select: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.select + feature_flags: [dtype-categorical] + col: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.col.html + collect: + name: collect + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html#method.collect + feature_flags: [streaming] + concat: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.concat.html + concat_list: + name: concat_lst + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.concat_list.html + concat_str: + name: concat_str + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.concat_str.html + feature_flags: [concat_str] + cross_join: + name: cross_join + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html#method.cross_join + feature_flags: [cross_join] + cs.by_name: https://github.com/pola-rs/polars/issues/10594 + cs.contains: https://github.com/pola-rs/polars/issues/10594 + cs.first: https://github.com/pola-rs/polars/issues/10594 + cs.matches: https://github.com/pola-rs/polars/issues/10594 + cs.numeric: 
https://github.com/pola-rs/polars/issues/10594 + cs.temporal: https://github.com/pola-rs/polars/issues/10594 + DataFrame: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html + DataFrame.explode: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.explode + date_range: + name: date_range + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.date_range.html + feature_flags: [range, dtype-date] + datetime_range: + name: datetime_range + link: https://docs.rs/polars/latest/polars/prelude/fn.datetime_range.html + feature_flags: [lazy, dtype-datetime] + describe: + name: describe + link: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.describe + feature_flags: [describe] + dt.convert_time_zone: + name: dt.convert_time_zone + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html#method.convert_time_zone + feature_flags: [timezones] + dt.replace_time_zone: + name: dt.replace_time_zone + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html#method.replace_time_zone + feature_flags: [timezones] + dt.to_string: + name: dt.to_string + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html#method.to_string + feature_flags: [temporal] + dt.year: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html#method.year + dtype_col: https://docs.rs/polars/latest/polars/prelude/fn.dtype_col.html + dtype_cols: https://docs.rs/polars/latest/polars/prelude/fn.dtype_cols.html + element: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.col.html + estimated_size: https://docs.rs/polars/latest/polars/frame/struct.DataFrame.html#method.estimated_size + exclude: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.exclude + expand_selector: https://github.com/pola-rs/polars/issues/10594 + explain: https://docs.rs/polars/latest/polars/prelude/struct.LazyFrame.html#method.explain + explode: https://docs.rs/polars/latest/polars/frame/struct.DataFrame.html#method.explode + Expr.arr: + name: '`arr` namespace' + link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.arr + feature_flags: [dtype-array] + Expr.dt: + name: dt namespace + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html + feature_flags: [temporal] + Expr.list: + name: list namespace + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ListNameSpace.html + Expr.name: + name: name namespace + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ExprNameNameSpace.html + feature_flags: [lazy] + Expr.str: + name: str namespace + link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.StringNameSpaceImpl.html + feature_flags: [strings] + fill_nan: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.fill_nan + fill_null: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.fill_null filter: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.filter - with_columns: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.with_columns + fold: + name: fold_exprs + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.fold_exprs.html group_by: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by - agg: https://docs.rs/polars/latest/polars/prelude/struct.LazyGroupBy.html#method.agg group_by_dynamic: name: group_by_dynamic 
link: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by_dynamic feature_flags: [dynamic_group_by] + head: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.head + implode: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.implode + interpolate: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.interpolate + is_between: + name: is_between + link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_between + feature_flags: [is_between] + is_duplicated: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.is_duplicated + is_null: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_null + is_selector: https://github.com/pola-rs/polars/issues/10594 join: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join join-semi_anti_join_flag: name: join link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join - feature_flags: ["semi_anti_join"] - - vstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.vstack - concat: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.concat.html - - explain: https://docs.rs/polars/latest/polars/prelude/struct.LazyFrame.html#method.explain - - operators: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Operator.html - - Array: https://docs.pola.rs/api/rust/dev/polars/datatypes/enum.DataType.html#variant.Array - - DataFrame.explode: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.explode - pivot: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/pivot/fn.pivot.html - unpivot: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unpivot - upsample: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.upsample + feature_flags: [semi_anti_join] join_asof_by: name: join_asof_by link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoinBy.html#method.join_asof_by - feature_flags: ['asof_join'] + feature_flags: [asof_join] join_where: name: join_where link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.JoinBuilder.html#method.join_where - feature_flags: ["iejoin"] - cross_join: - name: cross_join - link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html#method.cross_join - feature_flags: [cross_join] - unnest: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unnest - + feature_flags: [iejoin] + LazyFrame: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html + List: https://docs.pola.rs/api/rust/dev/polars/datatypes/enum.DataType.html#variant.List + list.eval: + name: list.eval + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/trait.ListNameSpaceExtension.html#method.eval + feature_flags: [list_eval] + map: https://docs.rs/polars/latest/polars/prelude/struct.ExprNameNameSpace.html#method.map + max: https://docs.pola.rs/api/rust/dev/polars/series/struct.Series.html#method.max + min: https://docs.pola.rs/api/rust/dev/polars/series/struct.Series.html#method.min + n_unique: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.n_unique + null_count: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.null_count + operators: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Operator.html + over: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.over + pivot: 
https://docs.pola.rs/api/rust/dev/polars_lazy/frame/pivot/fn.pivot.html + prefix: https://docs.rs/polars/latest/polars/prelude/struct.ExprNameNameSpace.html#method.prefix + rank: https://docs.rs/polars/latest/polars/prelude/enum.Expr.html#method.rank read_csv: name: CsvReader link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.CsvReader.html - feature_flags: ['csv'] - scan_csv: - name: LazyCsvReader - link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyCsvReader.html - feature_flags: ['csv'] - write_csv: - name: CsvWriter - link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.CsvWriter.html - feature_flags: ['csv'] + feature_flags: [csv] + read_ipc: + name: IpcReader + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.IpcReader.html + feature_flags: [ipc] read_json: name: JsonReader link: https://docs.pola.rs/api/rust/dev/polars_io/json/struct.JsonReader.html - feature_flags: ['json'] + feature_flags: [json] read_ndjson: name: JsonLineReader link: https://docs.pola.rs/api/rust/dev/polars_io/ndjson/core/struct.JsonLineReader.html - feature_flags: ['json'] - write_json: - name: JsonWriter - link: https://docs.pola.rs/api/rust/dev/polars_io/json/struct.JsonWriter.html - feature_flags: ['json'] - write_ndjson: - name: JsonWriter - link: https://docs.pola.rs/api/rust/dev/polars_io/json/struct.JsonWriter.html - feature_flags: ['json'] - scan_ndjson: - name: LazyJsonLineReader - link: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyJsonLineReader.html - feature_flags: ['json'] + feature_flags: [json] read_parquet: name: ParquetReader link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.ParquetReader.html - feature_flags: ['parquet'] - write_parquet: - name: ParquetWriter - link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.ParquetWriter.html - feature_flags: ['parquet'] + feature_flags: [parquet] + round: + name: round + link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.round + feature_flags: [round_series] + sample: + name: sample_n + link: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.sample_n + scan_csv: + name: LazyCsvReader + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyCsvReader.html + feature_flags: [csv] + scan_ndjson: + name: LazyJsonLineReader + link: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyJsonLineReader.html + feature_flags: [json] scan_parquet: name: scan_parquet link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html#method.scan_parquet - feature_flags: ['parquet'] - read_ipc: - name: IpcReader - link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.IpcReader.html - feature_flags: ['ipc'] + feature_flags: [parquet] scan_pyarrow_dataset: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_pyarrow_dataset.html - - min: https://docs.pola.rs/api/rust/dev/polars/series/struct.Series.html#method.min - max: https://docs.pola.rs/api/rust/dev/polars/series/struct.Series.html#method.max - struct: - name: Struct - link: https://docs.pola.rs/api/rust/dev/polars/datatypes/enum.DataType.html#variant.Struct - feature_flags: ['dtype-struct'] - implode: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.implode - sample: - name: sample_n - link: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.sample_n - head: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.head - tail: 
https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.tail - describe: - name: describe - link: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.describe - feature_flags: ['describe'] - collect: - name: collect - link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html#method.collect - feature_flags: ['streaming'] - - col: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.col.html - element: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.col.html - all: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.all.html - when: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.when.html - - sort: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.sort - arr.eval: - name: arr - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.arr - feature_flags: ['list_eval','rank'] - fold: - name: fold_exprs - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.fold_exprs.html - concat_str: - name: concat_str - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.concat_str.html - feature_flags: ['concat_str'] - concat_list: - name: concat_lst - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.concat_list.html - over: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.over - - alias: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.alias - approx_n_unique: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.approx_n_unique - cast: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.cast - exclude: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.exclude - fill_nan: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.fill_nan - fill_null: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.fill_null - n_unique: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.n_unique - null_count: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.null_count - interpolate: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.interpolate - is_between: - name: is_between - link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_between - feature_flags: [is_between] - is_duplicated: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.is_duplicated - is_null: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_null - value_counts: - name: value_counts - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.value_counts - feature_flags: [dtype-struct] - - Expr.list: - name: "list namespace" - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ListNameSpace.html - Expr.str: - name: "str namespace" - link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.StringNameSpaceImpl.html - feature_flags: [strings] - Series.arr: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ArrayNameSpace.html - - date_range: - name: date_range - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.date_range.html - feature_flags: [range, dtype-date] - + select: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.select selectors: https://github.com/pola-rs/polars/issues/10594 - cs.numeric: https://github.com/pola-rs/polars/issues/10594 - cs.by_name: https://github.com/pola-rs/polars/issues/10594 - cs.first: 
https://github.com/pola-rs/polars/issues/10594 - cs.temporal: https://github.com/pola-rs/polars/issues/10594 - cs.contains: https://github.com/pola-rs/polars/issues/10594 - cs.matches: https://github.com/pola-rs/polars/issues/10594 - is_selector: https://github.com/pola-rs/polars/issues/10594 - expand_selector: https://github.com/pola-rs/polars/issues/10594 - - dt.convert_time_zone: - name: dt.convert_time_zone - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html#method.convert_time_zone - feature_flags: [timezones] - dt.replace_time_zone: - name: dt.replace_time_zone - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html#method.replace_time_zone - feature_flags: [timezones] - dt.to_string: - name: dt.to_string - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html#method.to_string - feature_flags: [temporal] - dt.year: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html#method.year + Series: https://docs.pola.rs/api/rust/dev/polars/series/struct.Series.html + Series.arr: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ArrayNameSpace.html Series.dt.day: name: dt.day link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html#method.day feature_flags: [temporal] - - list.eval: - name: list.eval - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/trait.ListNameSpaceExtension.html#method.eval - feature_flags: [list_eval] - + sort: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.sort str.contains: name: str.contains link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.contains feature_flags: [regex] + str.ends_with: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.ends_with str.extract: name: str.extract link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.extract str.extract_all: name: str.extract_all link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.extract_all + str.head: + name: str.head + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.head + str.len_bytes: + name: str.len_bytes + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_bytes + str.len_chars: + name: str.len_chars + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_chars str.replace: name: str.replace link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.replace @@ -391,11 +404,21 @@ rust: name: str.replace_all link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.replace_all feature_flags: [regex] - str.starts_with: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.starts_with - str.ends_with: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.ends_with str.split: name: str.split link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.split + str.starts_with: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.starts_with + str.str_head: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.str_head + str.str_slice: 
https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.str_slice + str.str_tail: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.str_tail + str.strip_chars: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.strip_chars + str.strip_chars_end: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.strip_chars_end + str.strip_chars_start: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.strip_chars_start + str.strip_prefix: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.strip_prefix + str.strip_suffix: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.strip_suffix + str.tail: + name: str.tail + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.tail str.to_date: name: str.to_date link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.to_date @@ -404,27 +427,50 @@ rust: name: str.to_datetime link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.to_datetime feature_flags: [dtype-datetime] - str.len_chars: - name: str.len_chars - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_chars - str.len_bytes: - name: str.len_bytes - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_bytes - - struct.rename_fields: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.StructNameSpace.html#method.rename_fields + str.to_lowercase: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.to_lowercase + str.to_titlecase: + name: str.to_titlecase + link: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.to_titlecase + feature_flags: [nightly] + str.to_uppercase: https://docs.rs/polars/latest/polars/prelude/trait.StringNameSpaceImpl.html#method.to_uppercase + struct: + name: Struct + link: https://docs.pola.rs/api/rust/dev/polars/datatypes/enum.DataType.html#variant.Struct + feature_flags: [dtype-struct] struct.field: name: struct.field_by_name link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.StructNameSpace.html#method.field_by_name - - Expr.name: - name: "name namespace" - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ExprNameNameSpace.html - feature_flags: [lazy] - Expr.dt: - name: "dt namespace" - link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html - feature_flags: [temporal] - round: - name: "round" - link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.round - feature_flags: [round_series] + struct.rename_fields: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.StructNameSpace.html#method.rename_fields + suffix: https://docs.rs/polars/latest/polars/prelude/struct.ExprNameNameSpace.html#method.suffix + tail: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.tail + unique: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.unique + unique_counts: + name: unique_counts + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.unique_counts + feature_flags: [unique_counts] + unnest: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unnest + unpivot: 
https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unpivot + upsample: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.upsample + value_counts: + name: value_counts + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.value_counts + feature_flags: [dtype-struct] + vstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.vstack + when: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/fn.when.html + with_columns: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.with_columns + write_csv: + name: CsvWriter + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.CsvWriter.html + feature_flags: [csv] + write_json: + name: JsonWriter + link: https://docs.pola.rs/api/rust/dev/polars_io/json/struct.JsonWriter.html + feature_flags: [json] + write_ndjson: + name: JsonWriter + link: https://docs.pola.rs/api/rust/dev/polars_io/json/struct.JsonWriter.html + feature_flags: [json] + write_parquet: + name: ParquetWriter + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.ParquetWriter.html + feature_flags: [parquet] diff --git a/docs/source/_build/css/extra.css b/docs/source/_build/css/extra.css index 420db3966780..4f9cd5638a55 100644 --- a/docs/source/_build/css/extra.css +++ b/docs/source/_build/css/extra.css @@ -1,18 +1,20 @@ :root { - --md-primary-fg-color: #0B7189 ; - --md-primary-fg-color--light: #C2CCD6; - --md-primary-fg-color--dark: #103547; - --md-text-font: 'Proxima Nova', sans-serif; + --md-primary-fg-color: #0B7189; + --md-primary-fg-color--light: #C2CCD6; + --md-primary-fg-color--dark: #103547; + --md-text-font: 'Proxima Nova', sans-serif; } -span .md-typeset .emojione, .md-typeset .gemoji, .md-typeset .twemoji { - vertical-align: text-bottom; +span .md-typeset .emojione, +.md-typeset .gemoji, +.md-typeset .twemoji { + vertical-align: text-bottom; } @font-face { - font-family: 'Proxima Nova', sans-serif; - src: 'https://fonts.cdnfonts.com/css/proxima-nova-2' + font-family: 'Proxima Nova', sans-serif; + src: 'https://fonts.cdnfonts.com/css/proxima-nova-2' } :root { @@ -20,14 +22,14 @@ span .md-typeset .emojione, .md-typeset .gemoji, .md-typeset .twemoji { } .contributor_icon { - height:40px; - width:40px; - border-radius: 20px; - margin: 0 5px; + height: 40px; + width: 40px; + border-radius: 20px; + margin: 0 5px; } -.feature-flag{ - background-color: rgba(255, 245, 214,.5); +.feature-flag { + background-color: rgba(255, 245, 214, .5); border: none; padding: 0px 5px; text-align: center; @@ -38,27 +40,38 @@ span .md-typeset .emojione, .md-typeset .gemoji, .md-typeset .twemoji { font-size: .85em; } -[data-md-color-scheme=slate] .feature-flag{ - background-color:var(--md-code-bg-color); +[data-md-color-scheme=slate] .feature-flag { + background-color: var(--md-code-bg-color); } -.md-typeset ol li, .md-typeset ul li{ - margin-bottom: 0em !important; + +.md-typeset ol li, +.md-typeset ul li { + margin-bottom: 0em !important; } :root { - --md-admonition-icon--rust: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'%3E%3C!--! 
Font Awesome Free 6.4.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--%3E%3Cpath d='m508.52 249.75-21.82-13.51c-.17-2-.34-3.93-.55-5.88l18.72-17.5a7.35 7.35 0 0 0-2.44-12.25l-24-9c-.54-1.88-1.08-3.78-1.67-5.64l15-20.83a7.35 7.35 0 0 0-4.79-11.54l-25.42-4.15c-.9-1.73-1.79-3.45-2.73-5.15l10.68-23.42a7.35 7.35 0 0 0-6.95-10.39l-25.82.91q-1.79-2.22-3.61-4.4L439 81.84a7.36 7.36 0 0 0-8.84-8.84L405 78.93q-2.17-1.83-4.4-3.61l.91-25.82a7.35 7.35 0 0 0-10.39-7L367.7 53.23c-1.7-.94-3.43-1.84-5.15-2.73l-4.15-25.42a7.35 7.35 0 0 0-11.54-4.79L326 35.26c-1.86-.59-3.75-1.13-5.64-1.67l-9-24a7.35 7.35 0 0 0-12.25-2.44l-17.5 18.72c-1.95-.21-3.91-.38-5.88-.55L262.25 3.48a7.35 7.35 0 0 0-12.5 0L236.24 25.3c-2 .17-3.93.34-5.88.55l-17.5-18.72a7.35 7.35 0 0 0-12.25 2.44l-9 24c-1.89.55-3.79 1.08-5.66 1.68l-20.82-15a7.35 7.35 0 0 0-11.54 4.79l-4.15 25.41c-1.73.9-3.45 1.79-5.16 2.73l-23.4-10.63a7.35 7.35 0 0 0-10.39 7l.92 25.81c-1.49 1.19-3 2.39-4.42 3.61L81.84 73A7.36 7.36 0 0 0 73 81.84L78.93 107c-1.23 1.45-2.43 2.93-3.62 4.41l-25.81-.91a7.42 7.42 0 0 0-6.37 3.26 7.35 7.35 0 0 0-.57 7.13l10.66 23.41c-.94 1.7-1.83 3.43-2.73 5.16l-25.41 4.14a7.35 7.35 0 0 0-4.79 11.54l15 20.82c-.59 1.87-1.13 3.77-1.68 5.66l-24 9a7.35 7.35 0 0 0-2.44 12.25l18.72 17.5c-.21 1.95-.38 3.91-.55 5.88l-21.86 13.5a7.35 7.35 0 0 0 0 12.5l21.82 13.51c.17 2 .34 3.92.55 5.87l-18.72 17.5a7.35 7.35 0 0 0 2.44 12.25l24 9c.55 1.89 1.08 3.78 1.68 5.65l-15 20.83a7.35 7.35 0 0 0 4.79 11.54l25.42 4.15c.9 1.72 1.79 3.45 2.73 5.14l-10.63 23.43a7.35 7.35 0 0 0 .57 7.13 7.13 7.13 0 0 0 6.37 3.26l25.83-.91q1.77 2.22 3.6 4.4L73 430.16a7.36 7.36 0 0 0 8.84 8.84l25.16-5.93q2.18 1.83 4.41 3.61l-.92 25.82a7.35 7.35 0 0 0 10.39 6.95l23.43-10.68c1.69.94 3.42 1.83 5.14 2.73l4.15 25.42a7.34 7.34 0 0 0 11.54 4.78l20.83-15c1.86.6 3.76 1.13 5.65 1.68l9 24a7.36 7.36 0 0 0 12.25 2.44l17.5-18.72c1.95.21 3.92.38 5.88.55l13.51 21.82a7.35 7.35 0 0 0 12.5 0l13.51-21.82c2-.17 3.93-.34 5.88-.56l17.5 18.73a7.36 7.36 0 0 0 12.25-2.44l9-24c1.89-.55 3.78-1.08 5.65-1.68l20.82 15a7.34 7.34 0 0 0 11.54-4.78l4.15-25.42c1.72-.9 3.45-1.79 5.15-2.73l23.42 10.68a7.35 7.35 0 0 0 10.39-6.95l-.91-25.82q2.22-1.79 4.4-3.61l25.15 5.93a7.36 7.36 0 0 0 8.84-8.84L433.07 405q1.83-2.17 3.61-4.4l25.82.91a7.23 7.23 0 0 0 6.37-3.26 7.35 7.35 0 0 0 .58-7.13l-10.68-23.42c.94-1.7 1.83-3.43 2.73-5.15l25.42-4.15a7.35 7.35 0 0 0 4.79-11.54l-15-20.83c.59-1.87 1.13-3.76 1.67-5.65l24-9a7.35 7.35 0 0 0 2.44-12.25l-18.72-17.5c.21-1.95.38-3.91.55-5.87l21.82-13.51a7.35 7.35 0 0 0 0-12.5Zm-151 129.08A13.91 13.91 0 0 0 341 389.51l-7.64 35.67a187.51 187.51 0 0 1-156.36-.74l-7.64-35.66a13.87 13.87 0 0 0-16.46-10.68l-31.51 6.76a187.38 187.38 0 0 1-16.26-19.21H258.3c1.72 0 2.89-.29 2.89-1.91v-54.19c0-1.57-1.17-1.91-2.89-1.91h-44.83l.05-34.35H262c4.41 0 23.66 1.28 29.79 25.87 1.91 7.55 6.17 32.14 9.06 40 2.89 8.82 14.6 26.46 27.1 26.46H407a187.3 187.3 0 0 1-17.34 20.09Zm25.77 34.49A15.24 15.24 0 1 1 368 398.08h.44a15.23 15.23 0 0 1 14.8 15.24Zm-225.62-.68a15.24 15.24 0 1 1-15.25-15.25h.45a15.25 15.25 0 0 1 14.75 15.25Zm-88.1-178.49 32.83-14.6a13.88 13.88 0 0 0 7.06-18.33L102.69 186h26.56v119.73h-53.6a187.65 187.65 0 0 1-6.08-71.58Zm-11.26-36.06a15.24 15.24 0 0 1 15.23-15.25H74a15.24 15.24 0 1 1-15.67 15.24Zm155.16 24.49.05-35.32h63.26c3.28 0 23.07 3.77 23.07 18.62 0 12.29-15.19 16.7-27.68 16.7ZM399 306.71c-9.8 1.13-20.63-4.12-22-10.09-5.78-32.49-15.39-39.4-30.57-51.4 18.86-11.95 38.46-29.64 
38.46-53.26 0-25.52-17.49-41.59-29.4-49.48-16.76-11-35.28-13.23-40.27-13.23h-198.9a187.49 187.49 0 0 1 104.89-59.19l23.47 24.6a13.82 13.82 0 0 0 19.6.44l26.26-25a187.51 187.51 0 0 1 128.37 91.43l-18 40.57a14 14 0 0 0 7.09 18.33l34.59 15.33a187.12 187.12 0 0 1 .4 32.54h-19.28c-1.91 0-2.69 1.27-2.69 3.13v8.82C421 301 409.31 305.58 399 306.71ZM240 60.21A15.24 15.24 0 0 1 255.21 45h.45A15.24 15.24 0 1 1 240 60.21ZM436.84 214a15.24 15.24 0 1 1 0-30.48h.44a15.24 15.24 0 0 1-.44 30.48Z'/%3E%3C/svg%3E"); - } - .md-typeset .admonition.rust, - .md-typeset details.rust { - border-color: rgb(205, 121, 44); - } - .md-typeset .rust > .admonition-title, - .md-typeset .rust > summary { - background-color: rgb(205, 121, 44,.1); - } - .md-typeset .rust > .admonition-title::before, - .md-typeset .rust > summary::before { - background-color:rgb(205, 121, 44); - -webkit-mask-image: var(--md-admonition-icon--rust); - mask-image: var(--md-admonition-icon--rust); - } \ No newline at end of file + --md-admonition-icon--rust: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'%3E%3C!--! Font Awesome Free 6.4.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--%3E%3Cpath d='m508.52 249.75-21.82-13.51c-.17-2-.34-3.93-.55-5.88l18.72-17.5a7.35 7.35 0 0 0-2.44-12.25l-24-9c-.54-1.88-1.08-3.78-1.67-5.64l15-20.83a7.35 7.35 0 0 0-4.79-11.54l-25.42-4.15c-.9-1.73-1.79-3.45-2.73-5.15l10.68-23.42a7.35 7.35 0 0 0-6.95-10.39l-25.82.91q-1.79-2.22-3.61-4.4L439 81.84a7.36 7.36 0 0 0-8.84-8.84L405 78.93q-2.17-1.83-4.4-3.61l.91-25.82a7.35 7.35 0 0 0-10.39-7L367.7 53.23c-1.7-.94-3.43-1.84-5.15-2.73l-4.15-25.42a7.35 7.35 0 0 0-11.54-4.79L326 35.26c-1.86-.59-3.75-1.13-5.64-1.67l-9-24a7.35 7.35 0 0 0-12.25-2.44l-17.5 18.72c-1.95-.21-3.91-.38-5.88-.55L262.25 3.48a7.35 7.35 0 0 0-12.5 0L236.24 25.3c-2 .17-3.93.34-5.88.55l-17.5-18.72a7.35 7.35 0 0 0-12.25 2.44l-9 24c-1.89.55-3.79 1.08-5.66 1.68l-20.82-15a7.35 7.35 0 0 0-11.54 4.79l-4.15 25.41c-1.73.9-3.45 1.79-5.16 2.73l-23.4-10.63a7.35 7.35 0 0 0-10.39 7l.92 25.81c-1.49 1.19-3 2.39-4.42 3.61L81.84 73A7.36 7.36 0 0 0 73 81.84L78.93 107c-1.23 1.45-2.43 2.93-3.62 4.41l-25.81-.91a7.42 7.42 0 0 0-6.37 3.26 7.35 7.35 0 0 0-.57 7.13l10.66 23.41c-.94 1.7-1.83 3.43-2.73 5.16l-25.41 4.14a7.35 7.35 0 0 0-4.79 11.54l15 20.82c-.59 1.87-1.13 3.77-1.68 5.66l-24 9a7.35 7.35 0 0 0-2.44 12.25l18.72 17.5c-.21 1.95-.38 3.91-.55 5.88l-21.86 13.5a7.35 7.35 0 0 0 0 12.5l21.82 13.51c.17 2 .34 3.92.55 5.87l-18.72 17.5a7.35 7.35 0 0 0 2.44 12.25l24 9c.55 1.89 1.08 3.78 1.68 5.65l-15 20.83a7.35 7.35 0 0 0 4.79 11.54l25.42 4.15c.9 1.72 1.79 3.45 2.73 5.14l-10.63 23.43a7.35 7.35 0 0 0 .57 7.13 7.13 7.13 0 0 0 6.37 3.26l25.83-.91q1.77 2.22 3.6 4.4L73 430.16a7.36 7.36 0 0 0 8.84 8.84l25.16-5.93q2.18 1.83 4.41 3.61l-.92 25.82a7.35 7.35 0 0 0 10.39 6.95l23.43-10.68c1.69.94 3.42 1.83 5.14 2.73l4.15 25.42a7.34 7.34 0 0 0 11.54 4.78l20.83-15c1.86.6 3.76 1.13 5.65 1.68l9 24a7.36 7.36 0 0 0 12.25 2.44l17.5-18.72c1.95.21 3.92.38 5.88.55l13.51 21.82a7.35 7.35 0 0 0 12.5 0l13.51-21.82c2-.17 3.93-.34 5.88-.56l17.5 18.73a7.36 7.36 0 0 0 12.25-2.44l9-24c1.89-.55 3.78-1.08 5.65-1.68l20.82 15a7.34 7.34 0 0 0 11.54-4.78l4.15-25.42c1.72-.9 3.45-1.79 5.15-2.73l23.42 10.68a7.35 7.35 0 0 0 10.39-6.95l-.91-25.82q2.22-1.79 4.4-3.61l25.15 5.93a7.36 7.36 0 0 0 8.84-8.84L433.07 405q1.83-2.17 3.61-4.4l25.82.91a7.23 7.23 0 0 0 6.37-3.26 7.35 7.35 0 0 0 .58-7.13l-10.68-23.42c.94-1.7 
1.83-3.43 2.73-5.15l25.42-4.15a7.35 7.35 0 0 0 4.79-11.54l-15-20.83c.59-1.87 1.13-3.76 1.67-5.65l24-9a7.35 7.35 0 0 0 2.44-12.25l-18.72-17.5c.21-1.95.38-3.91.55-5.87l21.82-13.51a7.35 7.35 0 0 0 0-12.5Zm-151 129.08A13.91 13.91 0 0 0 341 389.51l-7.64 35.67a187.51 187.51 0 0 1-156.36-.74l-7.64-35.66a13.87 13.87 0 0 0-16.46-10.68l-31.51 6.76a187.38 187.38 0 0 1-16.26-19.21H258.3c1.72 0 2.89-.29 2.89-1.91v-54.19c0-1.57-1.17-1.91-2.89-1.91h-44.83l.05-34.35H262c4.41 0 23.66 1.28 29.79 25.87 1.91 7.55 6.17 32.14 9.06 40 2.89 8.82 14.6 26.46 27.1 26.46H407a187.3 187.3 0 0 1-17.34 20.09Zm25.77 34.49A15.24 15.24 0 1 1 368 398.08h.44a15.23 15.23 0 0 1 14.8 15.24Zm-225.62-.68a15.24 15.24 0 1 1-15.25-15.25h.45a15.25 15.25 0 0 1 14.75 15.25Zm-88.1-178.49 32.83-14.6a13.88 13.88 0 0 0 7.06-18.33L102.69 186h26.56v119.73h-53.6a187.65 187.65 0 0 1-6.08-71.58Zm-11.26-36.06a15.24 15.24 0 0 1 15.23-15.25H74a15.24 15.24 0 1 1-15.67 15.24Zm155.16 24.49.05-35.32h63.26c3.28 0 23.07 3.77 23.07 18.62 0 12.29-15.19 16.7-27.68 16.7ZM399 306.71c-9.8 1.13-20.63-4.12-22-10.09-5.78-32.49-15.39-39.4-30.57-51.4 18.86-11.95 38.46-29.64 38.46-53.26 0-25.52-17.49-41.59-29.4-49.48-16.76-11-35.28-13.23-40.27-13.23h-198.9a187.49 187.49 0 0 1 104.89-59.19l23.47 24.6a13.82 13.82 0 0 0 19.6.44l26.26-25a187.51 187.51 0 0 1 128.37 91.43l-18 40.57a14 14 0 0 0 7.09 18.33l34.59 15.33a187.12 187.12 0 0 1 .4 32.54h-19.28c-1.91 0-2.69 1.27-2.69 3.13v8.82C421 301 409.31 305.58 399 306.71ZM240 60.21A15.24 15.24 0 0 1 255.21 45h.45A15.24 15.24 0 1 1 240 60.21ZM436.84 214a15.24 15.24 0 1 1 0-30.48h.44a15.24 15.24 0 0 1-.44 30.48Z'/%3E%3C/svg%3E"); +} + +.md-typeset .admonition.rust, +.md-typeset details.rust { + border-color: rgb(205, 121, 44); +} + +.md-typeset .rust>.admonition-title, +.md-typeset .rust>summary { + background-color: rgb(205, 121, 44, .1); +} + +.md-typeset .rust>.admonition-title::before, +.md-typeset .rust>summary::before { + background-color: rgb(205, 121, 44); + -webkit-mask-image: var(--md-admonition-icon--rust); + mask-image: var(--md-admonition-icon--rust); +} + +/* Adapt Excalidraw diagrams to dark mode. 
*/ +body[data-md-color-scheme="slate"] .excalidraw svg { + will-change: filter; + filter: invert(100%) hue-rotate(180deg); +} diff --git a/docs/source/_build/js/mathjax.js b/docs/source/_build/js/mathjax.js new file mode 100644 index 000000000000..5b34852b5eee --- /dev/null +++ b/docs/source/_build/js/mathjax.js @@ -0,0 +1,19 @@ +window.MathJax = { + tex: { + inlineMath: [["\\(", "\\)"]], + displayMath: [["\\[", "\\]"]], + processEscapes: true, + processEnvironments: true + }, + options: { + ignoreHtmlClass: ".*|", + processHtmlClass: "arithmatex" + } +}; + +document$.subscribe(() => { + MathJax.startup.output.clearCache() + MathJax.typesetClear() + MathJax.texReset() + MathJax.typesetPromise() +}) diff --git a/docs/source/src/python/user-guide/concepts/contexts.py b/docs/source/src/python/user-guide/concepts/contexts.py deleted file mode 100644 index 7c6c2b4999fb..000000000000 --- a/docs/source/src/python/user-guide/concepts/contexts.py +++ /dev/null @@ -1,55 +0,0 @@ -# --8<-- [start:setup] -import polars as pl -import numpy as np - -np.random.seed(12) -# --8<-- [end:setup] - -# --8<-- [start:dataframe] -df = pl.DataFrame( - { - "nrs": [1, 2, 3, None, 5], - "names": ["foo", "ham", "spam", "egg", None], - "random": np.random.rand(5), - "groups": ["A", "A", "B", "C", "B"], - } -) -print(df) -# --8<-- [end:dataframe] - -# --8<-- [start:select] - -out = df.select( - pl.sum("nrs"), - pl.col("names").sort(), - pl.col("names").first().alias("first name"), - (pl.mean("nrs") * 10).alias("10xnrs"), -) -print(out) -# --8<-- [end:select] - -# --8<-- [start:filter] -out = df.filter(pl.col("nrs") > 2) -print(out) -# --8<-- [end:filter] - -# --8<-- [start:with_columns] - -df = df.with_columns( - pl.sum("nrs").alias("nrs_sum"), - pl.col("random").count().alias("count"), -) -print(df) -# --8<-- [end:with_columns] - - -# --8<-- [start:group_by] -out = df.group_by("groups").agg( - pl.sum("nrs"), # sum nrs by groups - pl.col("random").count().alias("count"), # count group members - # sum random where name != null - pl.col("random").filter(pl.col("names").is_not_null()).sum().name.suffix("_sum"), - pl.col("names").reverse().alias("reversed names"), -) -print(out) -# --8<-- [end:group_by] diff --git a/docs/source/src/python/user-guide/concepts/data-types-and-structures.py b/docs/source/src/python/user-guide/concepts/data-types-and-structures.py index 3d08edcbcec9..20067c0a4d19 100644 --- a/docs/source/src/python/user-guide/concepts/data-types-and-structures.py +++ b/docs/source/src/python/user-guide/concepts/data-types-and-structures.py @@ -58,3 +58,27 @@ # --8<-- [start:describe] print(df.describe()) # --8<-- [end:describe] + +# --8<-- [start:schema-def] +df = pl.DataFrame( + { + "name": ["Alice", "Ben", "Chloe", "Daniel"], + "age": [27, 39, 41, 43], + }, + schema={"name": None, "age": pl.UInt8}, +) + +print(df) +# --8<-- [end:schema-def] + +# --8<-- [start:schema_overrides] +df = pl.DataFrame( + { + "name": ["Alice", "Ben", "Chloe", "Daniel"], + "age": [27, 39, 41, 43], + }, + schema_overrides={"age": pl.UInt8}, +) + +print(df) +# --8<-- [end:schema_overrides] diff --git a/docs/source/src/python/user-guide/concepts/data-types/categoricals.py b/docs/source/src/python/user-guide/concepts/data-types/categoricals.py deleted file mode 100644 index c37a70be3e9d..000000000000 --- a/docs/source/src/python/user-guide/concepts/data-types/categoricals.py +++ /dev/null @@ -1,107 +0,0 @@ -# --8<-- [start:setup] -import polars as pl - -# --8<-- [end:setup] - -# --8<-- [start:example] -enum_dtype = pl.Enum(["Polar", 
"Panda", "Brown"]) -enum_series = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=enum_dtype) -cat_series = pl.Series( - ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical -) -# --8<-- [end:example] - - -# --8<-- [start:append] -cat_series = pl.Series( - ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical -) -cat2_series = pl.Series( - ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical -) -# Triggers a CategoricalRemappingWarning: Local categoricals have different encodings, expensive re-encoding is done -print(cat_series.append(cat2_series)) -# --8<-- [end:append] - - -# --8<-- [start:global_append] -with pl.StringCache(): - cat_series = pl.Series( - ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical - ) - cat2_series = pl.Series( - ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical - ) - print(cat_series.append(cat2_series)) -# --8<-- [end:global_append] - - -# --8<-- [start:enum_append] -dtype = pl.Enum(["Polar", "Panda", "Brown"]) -cat_series = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=dtype) -cat2_series = pl.Series(["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=dtype) -print(cat_series.append(cat2_series)) -# --8<-- [end:enum_append] - -# --8<-- [start:enum_error] -dtype = pl.Enum(["Polar", "Panda", "Brown"]) -try: - cat_series = pl.Series(["Polar", "Panda", "Brown", "Black"], dtype=dtype) -except Exception as e: - print(e) -# --8<-- [end:enum_error] - -# --8<-- [start:equality] -dtype = pl.Enum(["Polar", "Panda", "Brown"]) -cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=dtype) -cat_series2 = pl.Series(["Polar", "Panda", "Brown"], dtype=dtype) -print(cat_series == cat_series2) -# --8<-- [end:equality] - -# --8<-- [start:global_equality] -with pl.StringCache(): - cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) - cat_series2 = pl.Series(["Polar", "Panda", "Black"], dtype=pl.Categorical) - print(cat_series == cat_series2) -# --8<-- [end:global_equality] - -# --8<-- [start:equality] -dtype = pl.Enum(["Polar", "Panda", "Brown"]) -cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=dtype) -cat_series2 = pl.Series(["Polar", "Panda", "Brown"], dtype=dtype) -print(cat_series == cat_series2) -# --8<-- [end:equality] - -# --8<-- [start:str_compare_single] -cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) -print(cat_series <= "Cat") -# --8<-- [end:str_compare_single] - -# --8<-- [start:str_compare] -cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) -cat_series_utf = pl.Series(["Panda", "Panda", "Polar"]) -print(cat_series <= cat_series_utf) -# --8<-- [end:str_compare] - -# --8<-- [start:str_enum_compare_error] -try: - cat_series = pl.Series( - ["Low", "Medium", "High"], dtype=pl.Enum(["Low", "Medium", "High"]) - ) - cat_series <= "Excellent" -except Exception as e: - print(e) -# --8<-- [end:str_enum_compare_error] - -# --8<-- [start:str_enum_compare_single] -dtype = pl.Enum(["Low", "Medium", "High"]) -cat_series = pl.Series(["Low", "Medium", "High"], dtype=dtype) -print(cat_series <= "Medium") -# --8<-- [end:str_enum_compare_single] - -# --8<-- [start:str_enum_compare] -dtype = pl.Enum(["Low", "Medium", "High"]) -cat_series = pl.Series(["Low", "Medium", "High"], dtype=dtype) -cat_series2 = pl.Series(["High", "High", "Low"], dtype=dtype) -print(cat_series <= cat_series2) -# --8<-- [end:str_enum_compare] diff --git a/docs/source/src/python/user-guide/expressions/aggregation.py 
b/docs/source/src/python/user-guide/expressions/aggregation.py index f67226fdc3d7..e38466fd4ab7 100644 --- a/docs/source/src/python/user-guide/expressions/aggregation.py +++ b/docs/source/src/python/user-guide/expressions/aggregation.py @@ -1,9 +1,6 @@ -# --8<-- [start:setup] +# --8<-- [start:dataframe] import polars as pl -# --8<-- [end:setup] - -# --8<-- [start:dataframe] url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv" schema_overrides = { @@ -26,7 +23,7 @@ .agg( pl.len(), pl.col("gender"), - pl.first("last_name"), + pl.first("last_name"), # Short for `pl.col("last_name").first()` ) .sort("len", descending=True) .limit(5) @@ -56,7 +53,7 @@ q = ( dataset.lazy() .group_by("state", "party") - .agg(pl.count("party").alias("count")) + .agg(pl.len().alias("count")) .filter( (pl.col("party") == "Anti-Administration") | (pl.col("party") == "Pro-Administration") @@ -104,8 +101,26 @@ def avg_birthday(gender: str) -> pl.Expr: # --8<-- [end:filter] +# --8<-- [start:filter-nested] +q = ( + dataset.lazy() + .group_by("state", "gender") + .agg( + # The function `avg_birthday` is not needed: + compute_age().mean().alias("avg birthday"), + pl.len().alias("#"), + ) + .sort("#", descending=True) + .limit(5) +) + +df = q.collect() +print(df) +# --8<-- [end:filter-nested] + + # --8<-- [start:sort] -def get_person() -> pl.Expr: +def get_name() -> pl.Expr: return pl.col("first_name") + pl.lit(" ") + pl.col("last_name") @@ -114,8 +129,8 @@ def get_person() -> pl.Expr: .sort("birthday", descending=True) .group_by("state") .agg( - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), + get_name().first().alias("youngest"), + get_name().last().alias("oldest"), ) .limit(5) ) @@ -126,18 +141,14 @@ def get_person() -> pl.Expr: # --8<-- [start:sort2] -def get_person() -> pl.Expr: - return pl.col("first_name") + pl.lit(" ") + pl.col("last_name") - - q = ( dataset.lazy() .sort("birthday", descending=True) .group_by("state") .agg( - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), - get_person().sort().first().alias("alphabetical_first"), + get_name().first().alias("youngest"), + get_name().last().alias("oldest"), + get_name().sort().first().alias("alphabetical_first"), ) .limit(5) ) @@ -148,19 +159,15 @@ def get_person() -> pl.Expr: # --8<-- [start:sort3] -def get_person() -> pl.Expr: - return pl.col("first_name") + pl.lit(" ") + pl.col("last_name") - - q = ( dataset.lazy() .sort("birthday", descending=True) .group_by("state") .agg( - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), - get_person().sort().first().alias("alphabetical_first"), - pl.col("gender").sort_by(get_person()).first(), + get_name().first().alias("youngest"), + get_name().last().alias("oldest"), + get_name().sort().first().alias("alphabetical_first"), + pl.col("gender").sort_by(get_name()).first(), ) .sort("state") .limit(5) diff --git a/docs/source/src/python/user-guide/expressions/casting.py b/docs/source/src/python/user-guide/expressions/casting.py index bd06f4038843..e320ffa4a0c6 100644 --- a/docs/source/src/python/user-guide/expressions/casting.py +++ b/docs/source/src/python/user-guide/expressions/casting.py @@ -1,16 +1,11 @@ -# --8<-- [start:setup] - +# --8<-- [start:dfnum] import polars as pl -# --8<-- [end:setup] - -# --8<-- [start:dfnum] df = pl.DataFrame( { - "integers": [1, 2, 3, 4, 5], - "big_integers": [1, 10000002, 3, 10000004, 10000005], - "floats": [4.0, 5.0, 6.0, 7.0, 8.0], - "floats_with_decimal": [4.532, 5.5, 
6.5, 7.5, 8.5], + "integers": [1, 2, 3], + "big_integers": [10000002, 2, 30000003], + "floats": [4.0, 5.8, -6.3], } ) @@ -18,64 +13,67 @@ # --8<-- [end:dfnum] # --8<-- [start:castnum] -out = df.select( +result = df.select( pl.col("integers").cast(pl.Float32).alias("integers_as_floats"), pl.col("floats").cast(pl.Int32).alias("floats_as_integers"), - pl.col("floats_with_decimal") - .cast(pl.Int32) - .alias("floats_with_decimal_as_integers"), ) -print(out) +print(result) # --8<-- [end:castnum] # --8<-- [start:downcast] -out = df.select( - pl.col("integers").cast(pl.Int16).alias("integers_smallfootprint"), - pl.col("floats").cast(pl.Float32).alias("floats_smallfootprint"), +print(f"Before downcasting: {df.estimated_size()} bytes") +result = df.with_columns( + pl.col("integers").cast(pl.Int16), + pl.col("floats").cast(pl.Float32), ) -print(out) +print(f"After downcasting: {result.estimated_size()} bytes") # --8<-- [end:downcast] # --8<-- [start:overflow] +from polars.exceptions import InvalidOperationError + try: - out = df.select(pl.col("big_integers").cast(pl.Int8)) - print(out) -except Exception as e: - print(e) + result = df.select(pl.col("big_integers").cast(pl.Int8)) + print(result) +except InvalidOperationError as err: + print(err) # --8<-- [end:overflow] # --8<-- [start:overflow2] -out = df.select(pl.col("big_integers").cast(pl.Int8, strict=False)) -print(out) +result = df.select(pl.col("big_integers").cast(pl.Int8, strict=False)) +print(result) # --8<-- [end:overflow2] # --8<-- [start:strings] df = pl.DataFrame( { - "integers": [1, 2, 3, 4, 5], - "float": [4.0, 5.03, 6.0, 7.0, 8.0], - "floats_as_string": ["4.0", "5.0", "6.0", "7.0", "8.0"], + "integers_as_strings": ["1", "2", "3"], + "floats_as_strings": ["4.0", "5.8", "-6.3"], + "floats": [4.0, 5.8, -6.3], } ) -out = df.select( - pl.col("integers").cast(pl.String), - pl.col("float").cast(pl.String), - pl.col("floats_as_string").cast(pl.Float64), +result = df.select( + pl.col("integers_as_strings").cast(pl.Int32), + pl.col("floats_as_strings").cast(pl.Float64), + pl.col("floats").cast(pl.String), ) -print(out) +print(result) # --8<-- [end:strings] # --8<-- [start:strings2] -df = pl.DataFrame({"strings_not_float": ["4.0", "not_a_number", "6.0", "7.0", "8.0"]}) +df = pl.DataFrame( + { + "floats": ["4.0", "5.8", "- 6 . 
3"], + } +) try: - out = df.select(pl.col("strings_not_float").cast(pl.Float64)) - print(out) -except Exception as e: - print(e) + result = df.select(pl.col("floats").cast(pl.Float64)) +except InvalidOperationError as err: + print(err) # --8<-- [end:strings2] # --8<-- [start:bool] @@ -87,43 +85,53 @@ } ) -out = df.select(pl.col("integers").cast(pl.Boolean), pl.col("floats").cast(pl.Boolean)) -print(out) +result = df.select( + pl.col("integers").cast(pl.Boolean), + pl.col("floats").cast(pl.Boolean), + pl.col("bools").cast(pl.Int8), +) +print(result) # --8<-- [end:bool] # --8<-- [start:dates] -from datetime import date, datetime +from datetime import date, datetime, time df = pl.DataFrame( { - "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True), - "datetime": pl.datetime_range( - datetime(2022, 1, 1), datetime(2022, 1, 5), eager=True - ), + "date": [ + date(1970, 1, 1), # epoch + date(1970, 1, 10), # 9 days later + ], + "datetime": [ + datetime(1970, 1, 1, 0, 0, 0), # epoch + datetime(1970, 1, 1, 0, 1, 0), # 1 minute later + ], + "time": [ + time(0, 0, 0), # reference time + time(0, 0, 1), # 1 second later + ], } ) -out = df.select(pl.col("date").cast(pl.Int64), pl.col("datetime").cast(pl.Int64)) -print(out) +result = df.select( + pl.col("date").cast(pl.Int64).alias("days_since_epoch"), + pl.col("datetime").cast(pl.Int64).alias("us_since_epoch"), + pl.col("time").cast(pl.Int64).alias("ns_since_midnight"), +) +print(result) # --8<-- [end:dates] # --8<-- [start:dates2] df = pl.DataFrame( { - "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True), - "string": [ - "2022-01-01", - "2022-01-02", - "2022-01-03", - "2022-01-04", - "2022-01-05", - ], + "date": [date(2022, 1, 1), date(2022, 1, 2)], + "string": ["2022-01-01", "2022-01-02"], } ) -out = df.select( +result = df.select( pl.col("date").dt.to_string("%Y-%m-%d"), pl.col("string").str.to_datetime("%Y-%m-%d"), ) -print(out) +print(result) # --8<-- [end:dates2] diff --git a/docs/source/src/python/user-guide/expressions/categoricals.py b/docs/source/src/python/user-guide/expressions/categoricals.py new file mode 100644 index 000000000000..5e3efefab8f1 --- /dev/null +++ b/docs/source/src/python/user-guide/expressions/categoricals.py @@ -0,0 +1,208 @@ +# --8<-- [start:enum-example] +import polars as pl + +bears_enum = pl.Enum(["Polar", "Panda", "Brown"]) +bears = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=bears_enum) +print(bears) +# --8<-- [end:enum-example] + +# --8<-- [start:enum-wrong-value] +from polars.exceptions import InvalidOperationError + +try: + bears_kind_of = pl.Series( + ["Polar", "Panda", "Brown", "Polar", "Shark"], + dtype=bears_enum, + ) +except InvalidOperationError as exc: + print("InvalidOperationError:", exc) +# --8<-- [end:enum-wrong-value] + +# --8<-- [start:log-levels] +log_levels = pl.Enum(["debug", "info", "warning", "error"]) + +logs = pl.DataFrame( + { + "level": ["debug", "info", "debug", "error"], + "message": [ + "process id: 525", + "Service started correctly", + "startup time: 67ms", + "Cannot connect to DB!", + ], + }, + schema_overrides={ + "level": log_levels, + }, +) + +non_debug_logs = logs.filter( + pl.col("level") > "debug", +) +print(non_debug_logs) +# --8<-- [end:log-levels] + +# --8<-- [start:categorical-example] +bears_cat = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical +) +print(bears_cat) +# --8<-- [end:categorical-example] + +# --8<-- [start:categorical-comparison-string] +print(bears_cat < "Cat") +# --8<-- 
[end:categorical-comparison-string] + +# --8<-- [start:categorical-comparison-string-column] +bears_str = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], +) +print(bears_cat == bears_str) +# --8<-- [end:categorical-comparison-string-column] + +# --8<-- [start:categorical-comparison-categorical-column] +from polars.exceptions import StringCacheMismatchError + +bears_cat2 = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], + dtype=pl.Categorical, +) + +try: + print(bears_cat == bears_cat2) +except StringCacheMismatchError as exc: + exc_str = str(exc).splitlines()[0] + print("StringCacheMismatchError:", exc_str) +# --8<-- [end:categorical-comparison-categorical-column] + +# --8<-- [start:stringcache-categorical-equality] +with pl.StringCache(): + bears_cat = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical + ) + bears_cat2 = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical + ) + +print(bears_cat == bears_cat2) +# --8<-- [end:stringcache-categorical-equality] + +# --8<-- [start:stringcache-categorical-comparison-lexical] +with pl.StringCache(): + bears_cat = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], + dtype=pl.Categorical(ordering="lexical"), + ) + bears_cat2 = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical + ) + +print(bears_cat > bears_cat2) +# --8<-- [end:stringcache-categorical-comparison-lexical] + +# --8<-- [start:stringcache-categorical-comparison-physical] +with pl.StringCache(): + bears_cat = pl.Series( + # Polar < Panda < Brown + ["Polar", "Panda", "Brown", "Brown", "Polar"], + dtype=pl.Categorical, + ) + bears_cat2 = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical + ) + +print(bears_cat > bears_cat2) +# --8<-- [end:stringcache-categorical-comparison-physical] + +# --8<-- [start:concatenating-categoricals] +male_bears = pl.DataFrame( + { + "species": ["Polar", "Brown", "Panda"], + "weight": [450, 500, 110], # kg + }, + schema_overrides={"species": pl.Categorical}, +) +female_bears = pl.DataFrame( + { + "species": ["Brown", "Polar", "Panda"], + "weight": [340, 200, 90], # kg + }, + schema_overrides={"species": pl.Categorical}, +) + +bears = pl.concat([male_bears, female_bears], how="vertical") +print(bears) +# --8<-- [end:concatenating-categoricals] + + +# --8<-- [start:example] +import polars as pl + +bears_enum = pl.Enum(["Polar", "Panda", "Brown"]) +bears = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=bears_enum) +print(bears) + +cat_bears = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical +) +# --8<-- [end:example] + + +# --8<-- [start:append] +cat_bears = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical +) +cat2_series = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical +) + +# Triggers a CategoricalRemappingWarning. 
+print(cat_bears.append(cat2_series)) +# --8<-- [end:append] + +# --8<-- [start:enum_append] +dtype = pl.Enum(["Polar", "Panda", "Brown"]) +cat_bears = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=dtype) +cat2_series = pl.Series(["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=dtype) +print(cat_bears.append(cat2_series)) +# --8<-- [end:enum_append] + +# --8<-- [start:enum_error] +dtype = pl.Enum(["Polar", "Panda", "Brown"]) +try: + cat_bears = pl.Series(["Polar", "Panda", "Brown", "Black"], dtype=dtype) +except Exception as e: + print(e) +# --8<-- [end:enum_error] + +# --8<-- [start:equality] +dtype = pl.Enum(["Polar", "Panda", "Brown"]) +cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=dtype) +cat_series2 = pl.Series(["Polar", "Panda", "Brown"], dtype=dtype) +print(cat_bears == cat_series2) +# --8<-- [end:equality] + +# --8<-- [start:global_equality] +with pl.StringCache(): + cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) + cat_series2 = pl.Series(["Polar", "Panda", "Black"], dtype=pl.Categorical) + print(cat_bears == cat_series2) +# --8<-- [end:global_equality] + +# --8<-- [start:equality] +dtype = pl.Enum(["Polar", "Panda", "Brown"]) +cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=dtype) +cat_series2 = pl.Series(["Polar", "Panda", "Brown"], dtype=dtype) +print(cat_bears == cat_series2) +# --8<-- [end:equality] + +# --8<-- [start:str_compare_single] +cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) +print(cat_bears <= "Cat") +# --8<-- [end:str_compare_single] + +# --8<-- [start:str_compare] +cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) +cat_series_utf = pl.Series(["Panda", "Panda", "Polar"]) +print(cat_bears <= cat_series_utf) +# --8<-- [end:str_compare] diff --git a/docs/source/src/python/user-guide/expressions/column-selections.py b/docs/source/src/python/user-guide/expressions/column-selections.py index 4454a1a3d970..61f3fdb44a09 100644 --- a/docs/source/src/python/user-guide/expressions/column-selections.py +++ b/docs/source/src/python/user-guide/expressions/column-selections.py @@ -1,5 +1,3 @@ -# --8<-- [start:setup] -# --8<-- [end:setup] # --8<-- [start:selectors_df] from datetime import date, datetime diff --git a/docs/source/src/python/user-guide/expressions/expression-expansion.py b/docs/source/src/python/user-guide/expressions/expression-expansion.py new file mode 100644 index 000000000000..b34169e49736 --- /dev/null +++ b/docs/source/src/python/user-guide/expressions/expression-expansion.py @@ -0,0 +1,198 @@ +# --8<-- [start:df] +import polars as pl + +df = pl.DataFrame( + { # As of 14th October 2024, ~3pm UTC + "ticker": ["AAPL", "NVDA", "MSFT", "GOOG", "AMZN"], + "company_name": ["Apple", "NVIDIA", "Microsoft", "Alphabet (Google)", "Amazon"], + "price": [229.9, 138.93, 420.56, 166.41, 188.4], + "day_high": [231.31, 139.6, 424.04, 167.62, 189.83], + "day_low": [228.6, 136.3, 417.52, 164.78, 188.44], + "year_high": [237.23, 140.76, 468.35, 193.31, 201.2], + "year_low": [164.08, 39.23, 324.39, 121.46, 118.35], + } +) + +print(df) +# --8<-- [end:df] + +# --8<-- [start:col-with-names] +eur_usd_rate = 1.09 # As of 14th October 2024 + +result = df.with_columns( + ( + pl.col( + "price", + "day_high", + "day_low", + "year_high", + "year_low", + ) + / eur_usd_rate + ).round(2) +) +print(result) +# --8<-- [end:col-with-names] + +# --8<-- [start:expression-list] +exprs = [ + (pl.col("price") / eur_usd_rate).round(2), + (pl.col("day_high") / eur_usd_rate).round(2), + 
(pl.col("day_low") / eur_usd_rate).round(2), + (pl.col("year_high") / eur_usd_rate).round(2), + (pl.col("year_low") / eur_usd_rate).round(2), +] + +result2 = df.with_columns(exprs) +print(result.equals(result2)) +# --8<-- [end:expression-list] + +# --8<-- [start:col-with-dtype] +result = df.with_columns((pl.col(pl.Float64) / eur_usd_rate).round(2)) +print(result) +# --8<-- [end:col-with-dtype] + +# --8<-- [start:col-with-dtypes] +result2 = df.with_columns( + ( + pl.col( + pl.Float32, + pl.Float64, + ) + / eur_usd_rate + ).round(2) +) +print(result.equals(result2)) +# --8<-- [end:col-with-dtypes] + +# --8<-- [start:col-with-regex] +result = df.select(pl.col("ticker", "^.*_high$", "^.*_low$")) +print(result) +# --8<-- [end:col-with-regex] + +# --8<-- [start:col-error] +try: + df.select(pl.col("ticker", pl.Float64)) +except TypeError as err: + print("TypeError:", err) +# --8<-- [end:col-error] + +# --8<-- [start:all] +result = df.select(pl.all()) +print(result.equals(df)) +# --8<-- [end:all] + +# --8<-- [start:all-exclude] +result = df.select(pl.all().exclude("^day_.*$")) +print(result) +# --8<-- [end:all-exclude] + +# --8<-- [start:col-exclude] +result = df.select(pl.col(pl.Float64).exclude("^day_.*$")) +print(result) +# --8<-- [end:col-exclude] + +# --8<-- [start:duplicate-error] +from polars.exceptions import DuplicateError + +gbp_usd_rate = 1.31 # As of 14th October 2024 + +try: + df.select( + pl.col("price") / gbp_usd_rate, # This would be named "price"... + pl.col("price") / eur_usd_rate, # And so would this. + ) +except DuplicateError as err: + print("DuplicateError:", err) +# --8<-- [end:duplicate-error] + +# --8<-- [start:alias] +result = df.select( + (pl.col("price") / gbp_usd_rate).alias("price (GBP)"), + (pl.col("price") / eur_usd_rate).alias("price (EUR)"), +) +# --8<-- [end:alias] + +# --8<-- [start:prefix-suffix] +result = df.select( + (pl.col("^year_.*$") / eur_usd_rate).name.prefix("in_eur_"), + (pl.col("day_high", "day_low") / gbp_usd_rate).name.suffix("_gbp"), +) +print(result) +# --8<-- [end:prefix-suffix] + +# --8<-- [start:name-map] +# There is also `.name.to_uppercase`, so this usage of `.map` is moot. 
+result = df.select(pl.all().name.map(str.upper)) +print(result) +# --8<-- [end:name-map] + +# --8<-- [start:for-with_columns] +result = df +for tp in ["day", "year"]: + result = result.with_columns( + (pl.col(f"{tp}_high") - pl.col(f"{tp}_low")).alias(f"{tp}_amplitude") + ) +print(result) +# --8<-- [end:for-with_columns] + + +# --8<-- [start:yield-expressions] +def amplitude_expressions(time_periods): + for tp in time_periods: + yield (pl.col(f"{tp}_high") - pl.col(f"{tp}_low")).alias(f"{tp}_amplitude") + + +result = df.with_columns(amplitude_expressions(["day", "year"])) +print(result) +# --8<-- [end:yield-expressions] + +# --8<-- [start:selectors] +import polars.selectors as cs + +result = df.select(cs.string() | cs.ends_with("_high")) +print(result) +# --8<-- [end:selectors] + +# --8<-- [start:selectors-set-operations] +result = df.select(cs.contains("_") - cs.string()) +print(result) +# --8<-- [end:selectors-set-operations] + +# --8<-- [start:selectors-expressions] +result = df.select((cs.contains("_") - cs.string()) / eur_usd_rate) +print(result) +# --8<-- [end:selectors-expressions] + +# --8<-- [start:selector-ambiguity] +people = pl.DataFrame( + { + "name": ["Anna", "Bob"], + "has_partner": [True, False], + "has_kids": [False, False], + "has_tattoos": [True, False], + "is_alive": [True, True], + } +) + +wrong_result = people.select((~cs.starts_with("has_")).name.prefix("not_")) +print(wrong_result) +# --8<-- [end:selector-ambiguity] + +# --8<-- [start:as_expr] +result = people.select((~cs.starts_with("has_").as_expr()).name.prefix("not_")) +print(result) +# --8<-- [end:as_expr] + +# --8<-- [start:is_selector] +print(cs.is_selector(~cs.starts_with("has_").as_expr())) +# --8<-- [end:is_selector] + +# --8<-- [start:expand_selector] +print( + cs.expand_selector( + people, + cs.starts_with("has_"), + ) +) +# --8<-- [end:expand_selector] diff --git a/docs/source/src/python/user-guide/expressions/folds.py b/docs/source/src/python/user-guide/expressions/folds.py index 803591b5b581..f0be44b29cb5 100644 --- a/docs/source/src/python/user-guide/expressions/folds.py +++ b/docs/source/src/python/user-guide/expressions/folds.py @@ -1,24 +1,63 @@ -# --8<-- [start:setup] +# --8<-- [start:mansum] +import operator import polars as pl -# --8<-- [end:setup] - -# --8<-- [start:mansum] df = pl.DataFrame( { + "label": ["foo", "bar", "spam"], "a": [1, 2, 3], "b": [10, 20, 30], } ) -out = df.select( - pl.fold(acc=pl.lit(0), function=lambda acc, x: acc + x, exprs=pl.all()).alias( - "sum" - ), +result = df.select( + pl.fold( + acc=pl.lit(0), + function=operator.add, + exprs=pl.col("a", "b"), + ).alias("sum_fold"), + pl.sum_horizontal(pl.col("a", "b")).alias("sum_horz"), ) -print(out) + +print(result) # --8<-- [end:mansum] +# --8<-- [start:mansum-explicit] +acc = pl.lit(0) +f = operator.add + +result = df.select( + f(f(acc, pl.col("a")), pl.col("b")), + pl.fold(acc=acc, function=f, exprs=pl.col("a", "b")).alias("sum_fold"), +) + +print(result) +# --8<-- [end:mansum-explicit] + +# --8<-- [start:manprod] +result = df.select( + pl.fold( + acc=pl.lit(0), + function=operator.mul, + exprs=pl.col("a", "b"), + ).alias("prod"), +) + +print(result) +# --8<-- [end:manprod] + +# --8<-- [start:manprod-fixed] +result = df.select( + pl.fold( + acc=pl.lit(1), + function=operator.mul, + exprs=pl.col("a", "b"), + ).alias("prod"), +) + +print(result) +# --8<-- [end:manprod-fixed] + # --8<-- [start:conditional] df = pl.DataFrame( { @@ -27,14 +66,14 @@ } ) -out = df.filter( +result = df.filter( pl.fold( acc=pl.lit(True), 
function=lambda acc, x: acc & x, - exprs=pl.col("*") > 1, + exprs=pl.all() > 1, ) ) -print(out) +print(result) # --8<-- [end:conditional] # --8<-- [start:string] @@ -45,6 +84,6 @@ } ) -out = df.select(pl.concat_str(["a", "b"])) -print(out) +result = df.select(pl.concat_str(["a", "b"])) +print(result) # --8<-- [end:string] diff --git a/docs/source/src/python/user-guide/expressions/functions.py b/docs/source/src/python/user-guide/expressions/functions.py deleted file mode 100644 index 5f9bbd5bb1da..000000000000 --- a/docs/source/src/python/user-guide/expressions/functions.py +++ /dev/null @@ -1,60 +0,0 @@ -# --8<-- [start:setup] - -import polars as pl -import numpy as np - -np.random.seed(12) -# --8<-- [end:setup] - -# --8<-- [start:dataframe] -df = pl.DataFrame( - { - "nrs": [1, 2, 3, None, 5], - "names": ["foo", "ham", "spam", "egg", "spam"], - "random": np.random.rand(5), - "groups": ["A", "A", "B", "C", "B"], - } -) -print(df) -# --8<-- [end:dataframe] - -# --8<-- [start:samename] -df_samename = df.select(pl.col("nrs") + 5) -print(df_samename) -# --8<-- [end:samename] - - -# --8<-- [start:samenametwice] -try: - df_samename2 = df.select(pl.col("nrs") + 5, pl.col("nrs") - 5) - print(df_samename2) -except Exception as e: - print(e) -# --8<-- [end:samenametwice] - -# --8<-- [start:samenamealias] -df_alias = df.select( - (pl.col("nrs") + 5).alias("nrs + 5"), - (pl.col("nrs") - 5).alias("nrs - 5"), -) -print(df_alias) -# --8<-- [end:samenamealias] - -# --8<-- [start:countunique] -df_alias = df.select( - pl.col("names").n_unique().alias("unique"), - pl.approx_n_unique("names").alias("unique_approx"), -) -print(df_alias) -# --8<-- [end:countunique] - -# --8<-- [start:conditional] -df_conditional = df.select( - pl.col("nrs"), - pl.when(pl.col("nrs") > 2) - .then(pl.lit(True)) - .otherwise(pl.lit(False)) - .alias("conditional"), -) -print(df_conditional) -# --8<-- [end:conditional] diff --git a/docs/source/src/python/user-guide/expressions/lists.py b/docs/source/src/python/user-guide/expressions/lists.py index edd0092330d5..6cffd6520317 100644 --- a/docs/source/src/python/user-guide/expressions/lists.py +++ b/docs/source/src/python/user-guide/expressions/lists.py @@ -1,12 +1,74 @@ -# --8<-- [start:setup] +# --8<-- [start:list-example] +from datetime import datetime import polars as pl -# --8<-- [end:setup] +df = pl.DataFrame( + { + "names": [ + ["Anne", "Averill", "Adams"], + ["Brandon", "Brooke", "Borden", "Branson"], + ["Camila", "Campbell"], + ["Dennis", "Doyle"], + ], + "children_ages": [ + [5, 7], + [], + [], + [8, 11, 18], + ], + "medical_appointments": [ + [], + [], + [], + [datetime(2022, 5, 22, 16, 30)], + ], + } +) + +print(df) +# --8<-- [end:list-example] + +# --8<-- [start:array-example] +df = pl.DataFrame( + { + "bit_flags": [ + [True, True, True, True, False], + [False, True, True, True, True], + ], + "tic_tac_toe": [ + [ + [" ", "x", "o"], + [" ", "x", " "], + ["o", "x", " "], + ], + [ + ["o", "x", "x"], + [" ", "o", "x"], + [" ", " ", "o"], + ], + ], + }, + schema={ + "bit_flags": pl.Array(pl.Boolean, 5), + "tic_tac_toe": pl.Array(pl.String, (3, 3)), + }, +) -# --8<-- [start:weather_df] +print(df) +# --8<-- [end:array-example] + +# --8<-- [start:numpy-array-inference] +import numpy as np + +array = np.arange(0, 120).reshape((5, 2, 3, 4)) # 4D array + +print(pl.Series(array).dtype) # Column with the 3D subarrays +# --8<-- [end:numpy-array-inference] + +# --8<-- [start:weather] weather = pl.DataFrame( { - "station": ["Station " + str(x) for x in range(1, 6)], + "station": [f"Station 
{idx}" for idx in range(1, 6)], "temperatures": [ "20 5 5 E1 7 13 19 9 6 20", "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40", @@ -16,57 +78,55 @@ ], } ) -print(weather) -# --8<-- [end:weather_df] -# --8<-- [start:string_to_list] -out = weather.with_columns(pl.col("temperatures").str.split(" ")) -print(out) -# --8<-- [end:string_to_list] +print(weather) +# --8<-- [end:weather] -# --8<-- [start:explode_to_atomic] -out = weather.with_columns(pl.col("temperatures").str.split(" ")).explode( - "temperatures" -) -print(out) -# --8<-- [end:explode_to_atomic] - -# --8<-- [start:list_ops] -out = weather.with_columns(pl.col("temperatures").str.split(" ")).with_columns( - pl.col("temperatures").list.head(3).alias("top3"), - pl.col("temperatures").list.slice(-3, 3).alias("bottom_3"), - pl.col("temperatures").list.len().alias("obs"), +# --8<-- [start:split] +weather = weather.with_columns( + pl.col("temperatures").str.split(" "), ) -print(out) -# --8<-- [end:list_ops] +print(weather) +# --8<-- [end:split] + +# --8<-- [start:explode] +result = weather.explode("temperatures") +print(result) +# --8<-- [end:explode] +# --8<-- [start:list-slicing] +result = weather.with_columns( + pl.col("temperatures").list.head(3).alias("head"), + pl.col("temperatures").list.tail(3).alias("tail"), + pl.col("temperatures").list.slice(-3, 2).alias("two_next_to_last"), +) +print(result) +# --8<-- [end:list-slicing] -# --8<-- [start:count_errors] -out = weather.with_columns( +# --8<-- [start:element-wise-casting] +result = weather.with_columns( pl.col("temperatures") - .str.split(" ") .list.eval(pl.element().cast(pl.Int64, strict=False).is_null()) .list.sum() - .alias("errors") + .alias("errors"), ) -print(out) -# --8<-- [end:count_errors] +print(result) +# --8<-- [end:element-wise-casting] -# --8<-- [start:count_errors_regex] -out = weather.with_columns( +# --8<-- [start:element-wise-regex] +result2 = weather.with_columns( pl.col("temperatures") - .str.split(" ") .list.eval(pl.element().str.contains("(?i)[a-z]")) .list.sum() - .alias("errors") + .alias("errors"), ) -print(out) -# --8<-- [end:count_errors_regex] +print(result.equals(result2)) +# --8<-- [end:element-wise-regex] # --8<-- [start:weather_by_day] weather_by_day = pl.DataFrame( { - "station": ["Station " + str(x) for x in range(1, 11)], + "station": [f"Station {idx}" for idx in range(1, 11)], "day_1": [17, 11, 8, 22, 9, 21, 20, 8, 8, 17], "day_2": [15, 11, 10, 8, 7, 14, 18, 21, 15, 13], "day_3": [16, 15, 24, 24, 8, 23, 19, 23, 16, 10], @@ -75,10 +135,10 @@ print(weather_by_day) # --8<-- [end:weather_by_day] -# --8<-- [start:weather_by_day_rank] -rank_pct = (pl.element().rank(descending=True) / pl.col("*").count()).round(2) +# --8<-- [start:rank_pct] +rank_pct = (pl.element().rank(descending=True) / pl.all().count()).round(2) -out = weather_by_day.with_columns( +result = weather_by_day.with_columns( # create the list of homogeneous data pl.concat_list(pl.all().exclude("station")).alias("all_temps") ).select( @@ -88,27 +148,37 @@ pl.col("all_temps").list.eval(rank_pct, parallel=True).alias("temps_rank"), ) -print(out) -# --8<-- [end:weather_by_day_rank] +print(result) +# --8<-- [end:rank_pct] -# --8<-- [start:array_df] -array_df = pl.DataFrame( - [ - pl.Series("Array_1", [[1, 3], [2, 5]]), - pl.Series("Array_2", [[1, 7, 3], [8, 1, 0]]), - ], +# --8<-- [start:array-overview] +df = pl.DataFrame( + { + "first_last": [ + ["Anne", "Adams"], + ["Brandon", "Branson"], + ["Camila", "Campbell"], + ["Dennis", "Doyle"], + ], + "fav_numbers": [ + [42, 0, 1], + [2, 3, 5], + [13, 21, 34], 
+            [73, 3, 7],
+        ],
+    },
     schema={
-        "Array_1": pl.Array(pl.Int64, 2),
-        "Array_2": pl.Array(pl.Int64, 3),
+        "first_last": pl.Array(pl.String, 2),
+        "fav_numbers": pl.Array(pl.Int32, 3),
     },
 )
-print(array_df)
-# --8<-- [end:array_df]
-# --8<-- [start:array_ops]
-out = array_df.select(
-    pl.col("Array_1").arr.min().name.suffix("_min"),
-    pl.col("Array_2").arr.sum().name.suffix("_sum"),
+
+result = df.select(
+    pl.col("first_last").arr.join(" ").alias("name"),
+    pl.col("fav_numbers").arr.sort(),
+    pl.col("fav_numbers").arr.max().alias("largest_fav"),
+    pl.col("fav_numbers").arr.sum().alias("summed"),
+    pl.col("fav_numbers").arr.contains(3).alias("likes_3"),
 )
-print(out)
-# --8<-- [end:array_ops]
+print(result)
+# --8<-- [end:array-overview]
diff --git a/docs/source/src/python/user-guide/expressions/missing-data.py b/docs/source/src/python/user-guide/expressions/missing-data.py
index f8944de3c9ee..f078f5a34aa7 100644
--- a/docs/source/src/python/user-guide/expressions/missing-data.py
+++ b/docs/source/src/python/user-guide/expressions/missing-data.py
@@ -1,10 +1,6 @@
-# --8<-- [start:setup]
-import numpy as np
+# --8<-- [start:dataframe]
 import polars as pl
-# --8<-- [end:setup]
-
-# --8<-- [start:dataframe]
 df = pl.DataFrame(
     {
         "value": [1, None],
@@ -31,8 +27,8 @@
 # --8<-- [start:dataframe2]
 df = pl.DataFrame(
     {
-        "col1": [1, 2, 3],
-        "col2": [1, None, 3],
+        "col1": [0.5, 1, 1.5, 2, 2.5],
+        "col2": [1, None, 3, None, 5],
     },
 )
 print(df)
@@ -41,25 +37,26 @@
 # --8<-- [start:fill]
 fill_literal_df = df.with_columns(
-    pl.col("col2").fill_null(pl.lit(2)),
+    pl.col("col2").fill_null(3),
 )
 print(fill_literal_df)
 # --8<-- [end:fill]
 
-# --8<-- [start:fillstrategy]
-fill_forward_df = df.with_columns(
-    pl.col("col2").fill_null(strategy="forward"),
-)
-print(fill_forward_df)
-# --8<-- [end:fillstrategy]
-
 # --8<-- [start:fillexpr]
-fill_median_df = df.with_columns(
-    pl.col("col2").fill_null(pl.median("col2")),
+fill_expression_df = df.with_columns(
+    pl.col("col2").fill_null((2 * pl.col("col1")).cast(pl.Int64)),
 )
-print(fill_median_df)
+print(fill_expression_df)
 # --8<-- [end:fillexpr]
 
+# --8<-- [start:fillstrategy]
+fill_strategies_df = df.with_columns(
+    pl.col("col2").fill_null(strategy="forward").alias("forward"),
+    pl.col("col2").fill_null(strategy="backward").alias("backward"),
+)
+print(fill_strategies_df)
+# --8<-- [end:fillstrategy]
+
 # --8<-- [start:fillinterpolate]
 fill_interpolation_df = df.with_columns(
     pl.col("col2").interpolate(),
@@ -68,6 +65,8 @@
 # --8<-- [end:fillinterpolate]
 
 # --8<-- [start:nan]
+import numpy as np
+
 nan_df = pl.DataFrame(
     {
         "value": [1.0, np.nan, float("nan"), 3.0],
@@ -76,9 +75,23 @@
 print(nan_df)
 # --8<-- [end:nan]
 
+# --8<-- [start:nan-computed]
+df = pl.DataFrame(
+    {
+        "dividend": [1, 0, -1],
+        "divisor": [1, 0, -1],
+    }
+)
+result = df.select(pl.col("dividend") / pl.col("divisor"))
+print(result)
+# --8<-- [end:nan-computed]
+
 # --8<-- [start:nanfill]
 mean_nan_df = nan_df.with_columns(
-    pl.col("value").fill_nan(None).alias("value"),
-).mean()
+    pl.col("value").fill_nan(None).alias("replaced"),
+).select(
+    pl.all().mean().name.suffix("_mean"),
+    pl.all().sum().name.suffix("_sum"),
+)
 print(mean_nan_df)
 # --8<-- [end:nanfill]
diff --git a/docs/source/src/python/user-guide/expressions/operations.py b/docs/source/src/python/user-guide/expressions/operations.py
new file mode 100644
index 000000000000..556ed512c757
--- /dev/null
+++ b/docs/source/src/python/user-guide/expressions/operations.py
@@ -0,0 +1,132 @@
+# --8<-- [start:dataframe]
+import polars as pl
+import numpy as np
+
+np.random.seed(42)  # For reproducibility.
+ +df = pl.DataFrame( + { + "nrs": [1, 2, 3, None, 5], + "names": ["foo", "ham", "spam", "egg", "spam"], + "random": np.random.rand(5), + "groups": ["A", "A", "B", "A", "B"], + } +) +print(df) +# --8<-- [end:dataframe] + +# --8<-- [start:arithmetic] +result = df.select( + (pl.col("nrs") + 5).alias("nrs + 5"), + (pl.col("nrs") - 5).alias("nrs - 5"), + (pl.col("nrs") * pl.col("random")).alias("nrs * random"), + (pl.col("nrs") / pl.col("random")).alias("nrs / random"), + (pl.col("nrs") ** 2).alias("nrs ** 2"), + (pl.col("nrs") % 3).alias("nrs % 3"), +) + +print(result) +# --8<-- [end:arithmetic] + +# --8<-- [start:operator-overloading] +# Python only: +result_named_operators = df.select( + (pl.col("nrs").add(5)).alias("nrs + 5"), + (pl.col("nrs").sub(5)).alias("nrs - 5"), + (pl.col("nrs").mul(pl.col("random"))).alias("nrs * random"), + (pl.col("nrs").truediv(pl.col("random"))).alias("nrs / random"), + (pl.col("nrs").pow(2)).alias("nrs ** 2"), + (pl.col("nrs").mod(3)).alias("nrs % 3"), +) + +print(result.equals(result_named_operators)) +# --8<-- [end:operator-overloading] + +# --8<-- [start:comparison] +result = df.select( + (pl.col("nrs") > 1).alias("nrs > 1"), # .gt + (pl.col("nrs") >= 3).alias("nrs >= 3"), # ge + (pl.col("random") < 0.2).alias("random < .2"), # .lt + (pl.col("random") <= 0.5).alias("random <= .5"), # .le + (pl.col("nrs") != 1).alias("nrs != 1"), # .ne + (pl.col("nrs") == 1).alias("nrs == 1"), # .eq +) +print(result) +# --8<-- [end:comparison] + +# --8<-- [start:boolean] +# Boolean operators & | ~ +result = df.select( + ((~pl.col("nrs").is_null()) & (pl.col("groups") == "A")).alias( + "number not null and group A" + ), + ((pl.col("random") < 0.5) | (pl.col("groups") == "B")).alias( + "random < 0.5 or group B" + ), +) + +print(result) + +# Corresponding named functions `and_`, `or_`, and `not_`. +result2 = df.select( + (pl.col("nrs").is_null().not_().and_(pl.col("groups") == "A")).alias( + "number not null and group A" + ), + ((pl.col("random") < 0.5).or_(pl.col("groups") == "B")).alias( + "random < 0.5 or group B" + ), +) +print(result.equals(result2)) +# --8<-- [end:boolean] + +# --8<-- [start:bitwise] +result = df.select( + pl.col("nrs"), + (pl.col("nrs") & 6).alias("nrs & 6"), + (pl.col("nrs") | 6).alias("nrs | 6"), + (~pl.col("nrs")).alias("not nrs"), + (pl.col("nrs") ^ 6).alias("nrs ^ 6"), +) + +print(result) +# --8<-- [end:bitwise] + +# --8<-- [start:count] +long_df = pl.DataFrame({"numbers": np.random.randint(0, 100_000, 100_000)}) + +result = long_df.select( + pl.col("numbers").n_unique().alias("n_unique"), + pl.col("numbers").approx_n_unique().alias("approx_n_unique"), +) + +print(result) +# --8<-- [end:count] + +# --8<-- [start:value_counts] +result = df.select( + pl.col("names").value_counts().alias("value_counts"), +) + +print(result) +# --8<-- [end:value_counts] + +# --8<-- [start:unique_counts] +result = df.select( + pl.col("names").unique(maintain_order=True).alias("unique"), + pl.col("names").unique_counts().alias("unique_counts"), +) + +print(result) +# --8<-- [end:unique_counts] + +# --8<-- [start:collatz] +result = df.select( + pl.col("nrs"), + pl.when(pl.col("nrs") % 2 == 1) # Is the number odd? + .then(3 * pl.col("nrs") + 1) # If so, multiply by 3 and add 1. + .otherwise(pl.col("nrs") // 2) # If not, divide by 2. 
+    .alias("Collatz"),
+)
+
+print(result)
+# --8<-- [end:collatz]
diff --git a/docs/source/src/python/user-guide/expressions/operators.py b/docs/source/src/python/user-guide/expressions/operators.py
deleted file mode 100644
index 92bf57952332..000000000000
--- a/docs/source/src/python/user-guide/expressions/operators.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# --8<-- [start:setup]
-
-import polars as pl
-import numpy as np
-
-np.random.seed(12)
-# --8<-- [end:setup]
-
-
-# --8<-- [start:dataframe]
-df = pl.DataFrame(
-    {
-        "nrs": [1, 2, 3, None, 5],
-        "names": ["foo", "ham", "spam", "egg", None],
-        "random": np.random.rand(5),
-        "groups": ["A", "A", "B", "C", "B"],
-    }
-)
-print(df)
-# --8<-- [end:dataframe]
-
-# --8<-- [start:numerical]
-
-df_numerical = df.select(
-    (pl.col("nrs") + 5).alias("nrs + 5"),
-    (pl.col("nrs") - 5).alias("nrs - 5"),
-    (pl.col("nrs") * pl.col("random")).alias("nrs * random"),
-    (pl.col("nrs") / pl.col("random")).alias("nrs / random"),
-)
-print(df_numerical)
-
-# --8<-- [end:numerical]
-
-# --8<-- [start:logical]
-df_logical = df.select(
-    (pl.col("nrs") > 1).alias("nrs > 1"),
-    (pl.col("random") <= 0.5).alias("random <= .5"),
-    (pl.col("nrs") != 1).alias("nrs != 1"),
-    (pl.col("nrs") == 1).alias("nrs == 1"),
-    ((pl.col("random") <= 0.5) & (pl.col("nrs") > 1)).alias("and_expr"),  # and
-    ((pl.col("random") <= 0.5) | (pl.col("nrs") > 1)).alias("or_expr"),  # or
-)
-print(df_logical)
-# --8<-- [end:logical]
diff --git a/docs/source/src/python/user-guide/expressions/strings.py b/docs/source/src/python/user-guide/expressions/strings.py
index 379c20358feb..a68d032d702f 100644
--- a/docs/source/src/python/user-guide/expressions/strings.py
+++ b/docs/source/src/python/user-guide/expressions/strings.py
@@ -1,61 +1,112 @@
-# --8<-- [start:setup]
+# --8<-- [start:df]
 import polars as pl
-# --8<-- [end:setup]
-
-
-# --8<-- [start:df]
-df = pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]})
+df = pl.DataFrame(
+    {
+        "language": ["English", "Dutch", "Portuguese", "Finnish"],
+        "fruit": ["pear", "peer", "pêra", "päärynä"],
+    }
+)
 
-out = df.select(
-    pl.col("animal").str.len_bytes().alias("byte_count"),
-    pl.col("animal").str.len_chars().alias("letter_count"),
+result = df.with_columns(
+    pl.col("fruit").str.len_bytes().alias("byte_count"),
+    pl.col("fruit").str.len_chars().alias("letter_count"),
 )
-print(out)
+print(result)
 # --8<-- [end:df]
 
 # --8<-- [start:existence]
-out = df.select(
-    pl.col("animal"),
-    pl.col("animal").str.contains("cat|bit").alias("regex"),
-    pl.col("animal").str.contains("rab$", literal=True).alias("literal"),
-    pl.col("animal").str.starts_with("rab").alias("starts_with"),
-    pl.col("animal").str.ends_with("dog").alias("ends_with"),
-)
-print(out)
+result = df.select(
+    pl.col("fruit"),
+    pl.col("fruit").str.starts_with("p").alias("starts_with_p"),
+    pl.col("fruit").str.contains("p..r").alias("p..r"),
+    pl.col("fruit").str.contains("e+").alias("e+"),
+    pl.col("fruit").str.ends_with("r").alias("ends_with_r"),
+)
+print(result)
 # --8<-- [end:existence]
 
 # --8<-- [start:extract]
 df = pl.DataFrame(
     {
-        "a": [
+        "urls": [
             "http://vote.com/ballon_dor?candidate=messi&ref=polars",
             "http://vote.com/ballon_dor?candidat=jorginho&ref=polars",
             "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
         ]
     }
 )
-out = df.select(
-    pl.col("a").str.extract(r"candidate=(\w+)", group_index=1),
+result = df.select(
+    pl.col("urls").str.extract(r"candidate=(\w+)", group_index=1),
 )
-print(out)
+print(result)
 # --8<-- [end:extract]
 
 # --8<-- [start:extract_all]
-df = 
pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]}) -out = df.select( - pl.col("foo").str.extract_all(r"(\d+)").alias("extracted_nrs"), +df = pl.DataFrame({"text": ["123 bla 45 asd", "xyz 678 910t"]}) +result = df.select( + pl.col("text").str.extract_all(r"(\d+)").alias("extracted_nrs"), ) -print(out) +print(result) # --8<-- [end:extract_all] # --8<-- [start:replace] -df = pl.DataFrame({"id": [1, 2], "text": ["123abc", "abc456"]}) -out = df.with_columns( - pl.col("text").str.replace(r"abc\b", "ABC"), - pl.col("text").str.replace_all("a", "-", literal=True).alias("text_replace_all"), +df = pl.DataFrame({"text": ["123abc", "abc456"]}) +result = df.with_columns( + pl.col("text").str.replace(r"\d", "-"), + pl.col("text").str.replace_all(r"\d", "-").alias("text_replace_all"), ) -print(out) +print(result) # --8<-- [end:replace] + +# --8<-- [start:casing] +addresses = pl.DataFrame( + { + "addresses": [ + "128 PERF st", + "Rust blVD, 158", + "PoLaRs Av, 12", + "1042 Query sq", + ] + } +) + +addresses = addresses.select( + pl.col("addresses").alias("originals"), + pl.col("addresses").str.to_titlecase(), + pl.col("addresses").str.to_lowercase().alias("lower"), + pl.col("addresses").str.to_uppercase().alias("upper"), +) +print(addresses) +# --8<-- [end:casing] + +# --8<-- [start:strip] +addr = pl.col("addresses") +chars = ", 0123456789" +result = addresses.select( + addr.str.strip_chars(chars).alias("strip"), + addr.str.strip_chars_end(chars).alias("end"), + addr.str.strip_chars_start(chars).alias("start"), + addr.str.strip_prefix("128 ").alias("prefix"), + addr.str.strip_suffix(", 158").alias("suffix"), +) +print(result) +# --8<-- [end:strip] + +# --8<-- [start:slice] +df = pl.DataFrame( + { + "fruits": ["pear", "mango", "dragonfruit", "passionfruit"], + "n": [1, -1, 4, -4], + } +) + +result = df.with_columns( + pl.col("fruits").str.slice(pl.col("n")).alias("slice"), + pl.col("fruits").str.head(pl.col("n")).alias("head"), + pl.col("fruits").str.tail(pl.col("n")).alias("tail"), +) +print(result) +# --8<-- [end:slice] diff --git a/docs/source/src/python/user-guide/expressions/structs.py b/docs/source/src/python/user-guide/expressions/structs.py index 232ccea9b8c4..f500343b428d 100644 --- a/docs/source/src/python/user-guide/expressions/structs.py +++ b/docs/source/src/python/user-guide/expressions/structs.py @@ -1,28 +1,25 @@ -# --8<-- [start:setup] +# --8<-- [start:ratings_df] import polars as pl -# --8<-- [end:setup] - -# --8<-- [start:ratings_df] ratings = pl.DataFrame( { - "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"], - "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"], - "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6], - "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 26], + "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "Cars"], + "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "NE"], + "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.5, 4.9, 4.7, 4.6], + "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 28], } ) print(ratings) # --8<-- [end:ratings_df] # --8<-- [start:state_value_counts] -out = ratings.select(pl.col("Theatre").value_counts(sort=True)) -print(out) +result = ratings.select(pl.col("Theatre").value_counts(sort=True)) +print(result) # --8<-- [end:state_value_counts] # --8<-- [start:struct_unnest] -out = ratings.select(pl.col("Theatre").value_counts(sort=True)).unnest("Theatre") -print(out) +result = ratings.select(pl.col("Theatre").value_counts(sort=True)).unnest("Theatre") 
+print(result) # --8<-- [end:struct_unnest] # --8<-- [start:series_struct] @@ -36,43 +33,87 @@ print(rating_series) # --8<-- [end:series_struct] +# --8<-- [start:series_struct_error] +null_rating_series = pl.Series( + "ratings", + [ + {"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5}, + {"Mov": "Toy Story", "Theatre": "ME", "Avg_Rating": 4.9}, + {"Movie": "Snow White", "Theatre": "IL", "Avg_Rating": "4.7"}, + ], + strict=False, # To show the final structs with `null` values. +) +print(null_rating_series) +# --8<-- [end:series_struct_error] + # --8<-- [start:series_struct_extract] -out = rating_series.struct.field("Movie") -print(out) +result = rating_series.struct.field("Movie") +print(result) # --8<-- [end:series_struct_extract] # --8<-- [start:series_struct_rename] -out = ( - rating_series.to_frame() - .select(pl.col("ratings").struct.rename_fields(["Film", "State", "Value"])) - .unnest("ratings") -) -print(out) +result = rating_series.struct.rename_fields(["Film", "State", "Value"]) +print(result) # --8<-- [end:series_struct_rename] +# --8<-- [start:struct-rename-check] +print( + result.to_frame().unnest("ratings"), +) +# --8<-- [end:struct-rename-check] + # --8<-- [start:struct_duplicates] -out = ratings.filter(pl.struct("Movie", "Theatre").is_duplicated()) -print(out) +result = ratings.filter(pl.struct("Movie", "Theatre").is_duplicated()) +print(result) # --8<-- [end:struct_duplicates] # --8<-- [start:struct_ranking] -out = ratings.with_columns( +result = ratings.with_columns( pl.struct("Count", "Avg_Rating") .rank("dense", descending=True) .over("Movie", "Theatre") .alias("Rank") ).filter(pl.struct("Movie", "Theatre").is_duplicated()) -print(out) + +print(result) # --8<-- [end:struct_ranking] # --8<-- [start:multi_column_apply] df = pl.DataFrame({"keys": ["a", "a", "b"], "values": [10, 7, 1]}) -out = df.select( +result = df.select( pl.struct(["keys", "values"]) .map_elements(lambda x: len(x["keys"]) + x["values"], return_dtype=pl.Int64) .alias("solution_map_elements"), (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"), ) -print(out) +print(result) # --8<-- [end:multi_column_apply] + + +# --8<-- [start:ack] +def ack(m, n): + if not m: + return n + 1 + if not n: + return ack(m - 1, 1) + return ack(m - 1, ack(m, n - 1)) + + +# --8<-- [end:ack] + +# --8<-- [start:struct-ack] +values = pl.DataFrame( + { + "m": [0, 0, 0, 1, 1, 1, 2], + "n": [2, 3, 4, 1, 2, 3, 1], + } +) +result = values.with_columns( + pl.struct(["m", "n"]) + .map_elements(lambda s: ack(s["m"], s["n"]), return_dtype=pl.Int64) + .alias("ack") +) + +print(result) +# --8<-- [end:struct-ack] diff --git a/docs/source/src/python/user-guide/expressions/window.py b/docs/source/src/python/user-guide/expressions/window.py index 9ed9ce5d4f88..f82da48d75f1 100644 --- a/docs/source/src/python/user-guide/expressions/window.py +++ b/docs/source/src/python/user-guide/expressions/window.py @@ -1,16 +1,103 @@ # --8<-- [start:pokemon] import polars as pl -# then let's load some csv data with information about pokemon -df = pl.read_csv( - "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv" +types = ( + "Grass Water Fire Normal Ground Electric Psychic Fighting Bug Steel " + "Flying Dragon Dark Ghost Poison Rock Ice Fairy".split() ) -print(df.head()) +type_enum = pl.Enum(types) +# then let's load some csv data with information about pokemon +pokemon = pl.read_csv( + 
"https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv", +).cast({"Type 1": type_enum, "Type 2": type_enum}) +print(pokemon.head()) # --8<-- [end:pokemon] +# --8<-- [start:rank] +result = pokemon.select( + pl.col("Name", "Type 1"), + pl.col("Speed").rank("dense", descending=True).over("Type 1").alias("Speed rank"), +) + +print(result) +# --8<-- [end:rank] + +# --8<-- [start:rank-multiple] +result = pokemon.select( + pl.col("Name", "Type 1", "Type 2"), + pl.col("Speed") + .rank("dense", descending=True) + .over("Type 1", "Type 2") + .alias("Speed rank"), +) + +print(result) +# --8<-- [end:rank-multiple] + +# --8<-- [start:rank-explode] +result = ( + pokemon.group_by("Type 1") + .agg( + pl.col("Name"), + pl.col("Speed").rank("dense", descending=True).alias("Speed rank"), + ) + .select(pl.col("Name"), pl.col("Type 1"), pl.col("Speed rank")) + .explode("Name", "Speed rank") +) + +print(result) +# --8<-- [end:rank-explode] + +# --8<-- [start:athletes] +athletes = pl.DataFrame( + { + "athlete": list("ABCDEF"), + "country": ["PT", "NL", "NL", "PT", "PT", "NL"], + "rank": [6, 1, 5, 4, 2, 3], + } +) +print(athletes) +# --8<-- [end:athletes] + +# --8<-- [start:athletes-sort-over-country] +result = athletes.select( + pl.col("athlete", "rank").sort_by(pl.col("rank")).over(pl.col("country")), + pl.col("country"), +) + +print(result) +# --8<-- [end:athletes-sort-over-country] + +# --8<-- [start:athletes-explode] +result = athletes.select( + pl.all() + .sort_by(pl.col("rank")) + .over(pl.col("country"), mapping_strategy="explode"), +) + +print(result) +# --8<-- [end:athletes-explode] + +# --8<-- [start:athletes-join] +result = athletes.with_columns( + pl.col("rank").sort().over(pl.col("country"), mapping_strategy="join"), +) + +print(result) +# --8<-- [end:athletes-join] + +# --8<-- [start:pokemon-mean] +result = pokemon.select( + pl.col("Name", "Type 1", "Speed"), + pl.col("Speed").mean().over(pl.col("Type 1")).alias("Mean speed in group"), +) + +print(result) +# --8<-- [end:pokemon-mean] + # --8<-- [start:group_by] -out = df.select( +result = pokemon.select( "Type 1", "Type 2", pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"), @@ -20,11 +107,11 @@ .alias("avg_defense_by_type_combination"), pl.col("Attack").mean().alias("avg_attack"), ) -print(out) +print(result) # --8<-- [end:group_by] # --8<-- [start:operations] -filtered = df.filter(pl.col("Type 2") == "Psychic").select( +filtered = pokemon.filter(pl.col("Type 2") == "Psychic").select( "Name", "Type 1", "Speed", @@ -33,36 +120,14 @@ # --8<-- [end:operations] # --8<-- [start:sort] -out = filtered.with_columns( +result = filtered.with_columns( pl.col("Name", "Speed").sort_by("Speed", descending=True).over("Type 1"), ) -print(out) +print(result) # --8<-- [end:sort] -# --8<-- [start:rules] -# aggregate and broadcast within a group -# output type: -> Int32 -pl.sum("foo").over("groups") - -# sum within a group and multiply with group elements -# output type: -> Int32 -(pl.col("x").sum() * pl.col("y")).over("groups") - -# sum within a group and multiply with group elements -# and aggregate the group to a list -# output type: -> List(Int32) -(pl.col("x").sum() * pl.col("y")).over("groups", mapping_strategy="join") - -# sum within a group and multiply with group elements -# and aggregate the group to a list -# then explode the list to multiple rows - -# This is the fastest method to do things over groups when the groups are sorted -(pl.col("x").sum() * 
pl.col("y")).over("groups", mapping_strategy="explode") -# --8<-- [end:rules] - # --8<-- [start:examples] -out = df.sort("Type 1").select( +result = pokemon.sort("Type 1").select( pl.col("Type 1").head(3).over("Type 1", mapping_strategy="explode"), pl.col("Name") .sort_by(pl.col("Speed"), descending=True) @@ -80,5 +145,5 @@ .over("Type 1", mapping_strategy="explode") .alias("sorted_by_alphabet"), ) -print(out) +print(result) # --8<-- [end:examples] diff --git a/docs/source/src/rust/Cargo.toml b/docs/source/src/rust/Cargo.toml index 8a6607d4aa84..7f1620d2696c 100644 --- a/docs/source/src/rust/Cargo.toml +++ b/docs/source/src/rust/Cargo.toml @@ -26,131 +26,123 @@ path = "home/example.rs" required-features = ["polars/lazy", "polars/csv"] [[bin]] -name = "user-guide-getting-started" +name = "getting-started" path = "user-guide/getting-started.rs" required-features = ["polars/lazy", "polars/temporal", "polars/round_series", "polars/strings"] [[bin]] -name = "user-guide-concepts-data-types-and-structures" +name = "concepts-data-types-and-structures" path = "user-guide/concepts/data-types-and-structures.rs" [[bin]] -name = "user-guide-concepts-contexts" -path = "user-guide/concepts/contexts.rs" -required-features = ["polars/lazy"] -[[bin]] -name = "user-guide-concepts-expressions" +name = "concepts-expressions" path = "user-guide/concepts/expressions.rs" required-features = ["polars/lazy", "polars/temporal", "polars/is_between"] [[bin]] -name = "user-guide-concepts-lazy-vs-eager" +name = "concepts-lazy-vs-eager" path = "user-guide/concepts/lazy-vs-eager.rs" required-features = ["polars/lazy", "polars/csv"] [[bin]] -name = "user-guide-concepts-streaming" +name = "concepts-streaming" path = "user-guide/concepts/streaming.rs" required-features = ["polars/lazy", "polars/csv"] [[bin]] -name = "user-guide-expressions-aggregation" +name = "expressions-aggregation" path = "user-guide/expressions/aggregation.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-casting" +name = "expressions-casting" path = "user-guide/expressions/casting.rs" -required-features = ["polars/lazy"] +required-features = ["polars/lazy", "polars/temporal", "polars/strings", "polars/dtype-u8"] [[bin]] -name = "user-guide-expressions-column-selections" +name = "expressions-column-selections" path = "user-guide/expressions/column-selections.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-folds" +name = "expressions-folds" path = "user-guide/expressions/folds.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-functions" -path = "user-guide/expressions/functions.rs" -required-features = ["polars/lazy"] +name = "expressions-expression-expansion" +path = "user-guide/expressions/expression-expansion.rs" +required-features = ["polars/lazy", "polars/round_series", "polars/regex"] [[bin]] -name = "user-guide-expressions-lists" +name = "expressions-lists" path = "user-guide/expressions/lists.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-missing-data" +name = "expressions-missing-data" path = "user-guide/expressions/missing-data.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-operators" -path = "user-guide/expressions/operators.rs" -required-features = ["polars/lazy"] +name = "expressions-operations" +path = "user-guide/expressions/operations.rs" +required-features = ["polars/lazy", "polars/approx_unique", "polars/dtype-struct", "polars/unique_counts"] [[bin]] -name = 
"user-guide-expressions-strings" +name = "expressions-strings" path = "user-guide/expressions/strings.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-structs" +name = "expressions-structs" path = "user-guide/expressions/structs.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-user-defined-functions" -path = "user-guide/expressions/user-defined-functions.rs" -required-features = ["polars/lazy"] -[[bin]] -name = "user-guide-expressions-window" +name = "expressions-window" path = "user-guide/expressions/window.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-io-cloud-storage" +name = "io-cloud-storage" path = "user-guide/io/cloud-storage.rs" required-features = ["polars/csv"] [[bin]] -name = "user-guide-io-csv" +name = "io-csv" path = "user-guide/io/csv.rs" required-features = ["polars/csv"] [[bin]] -name = "user-guide-io-json" +name = "io-json" path = "user-guide/io/json.rs" required-features = ["polars/json"] [[bin]] -name = "user-guide-io-parquet" +name = "io-parquet" path = "user-guide/io/parquet.rs" required-features = ["polars/parquet"] [[bin]] -name = "user-guide-transformations-concatenation" +name = "transformations-concatenation" path = "user-guide/transformations/concatenation.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-transformations-joins" +name = "transformations-joins" path = "user-guide/transformations/joins.rs" required-features = ["polars/lazy", "polars/strings", "polars/semi_anti_join", "polars/iejoin", "polars/cross_join"] [[bin]] -name = "user-guide-transformations-unpivot" +name = "transformations-unpivot" path = "user-guide/transformations/unpivot.rs" required-features = ["polars/pivot"] [[bin]] -name = "user-guide-transformations-pivot" +name = "transformations-pivot" path = "user-guide/transformations/pivot.rs" required-features = ["polars/lazy", "polars/pivot"] [[bin]] -name = "user-guide-transformations-time-series-filter" +name = "transformations-time-series-filter" path = "user-guide/transformations/time-series/filter.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-transformations-time-series-parsing" +name = "transformations-time-series-parsing" path = "user-guide/transformations/time-series/parsing.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-transformations-time-series-resampling" +name = "transformations-time-series-resampling" path = "user-guide/transformations/time-series/resampling.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-transformations-time-series-rolling" +name = "transformations-time-series-rolling" path = "user-guide/transformations/time-series/rolling.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-transformations-time-series-timezones" +name = "transformations-time-series-timezones" path = "user-guide/transformations/time-series/timezones.rs" required-features = ["polars/lazy"] diff --git a/docs/source/src/rust/user-guide/concepts/contexts.rs b/docs/source/src/rust/user-guide/concepts/contexts.rs deleted file mode 100644 index 1ff1114d4bf0..000000000000 --- a/docs/source/src/rust/user-guide/concepts/contexts.rs +++ /dev/null @@ -1,70 +0,0 @@ -use polars::prelude::*; - -fn main() -> Result<(), Box> { - // --8<-- [start:dataframe] - use rand::{thread_rng, Rng}; - - let mut arr = [0f64; 5]; - thread_rng().fill(&mut arr); - - let df = df! 
(
-        "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)],
-        "names" => &[Some("foo"), Some("ham"), Some("spam"), Some("eggs"), None],
-        "random" => &arr,
-        "groups" => &["A", "A", "B", "C", "B"],
-    )?;
-
-    println!("{}", &df);
-    // --8<-- [end:dataframe]
-
-    // --8<-- [start:select]
-    let out = df
-        .clone()
-        .lazy()
-        .select([
-            sum("nrs"),
-            col("names").sort(Default::default()),
-            col("names").first().alias("first name"),
-            (mean("nrs") * lit(10)).alias("10xnrs"),
-        ])
-        .collect()?;
-    println!("{}", out);
-    // --8<-- [end:select]
-
-    // --8<-- [start:filter]
-    let out = df.clone().lazy().filter(col("nrs").gt(lit(2))).collect()?;
-    println!("{}", out);
-    // --8<-- [end:filter]
-
-    // --8<-- [start:with_columns]
-    let out = df
-        .clone()
-        .lazy()
-        .with_columns([
-            sum("nrs").alias("nrs_sum"),
-            col("random").count().alias("count"),
-        ])
-        .collect()?;
-    println!("{}", out);
-    // --8<-- [end:with_columns]
-
-    // --8<-- [start:group_by]
-    let out = df
-        .lazy()
-        .group_by([col("groups")])
-        .agg([
-            sum("nrs"), // sum nrs by groups
-            col("random").count().alias("count"), // count group members
-            // sum random where name != null
-            col("random")
-                .filter(col("names").is_not_null())
-                .sum()
-                .name()
-                .suffix("_sum"),
-            col("names").reverse().alias("reversed names"),
-        ])
-        .collect()?;
-    println!("{}", out);
-    // --8<-- [end:group_by]
-    Ok(())
-}
diff --git a/docs/source/src/rust/user-guide/expressions/aggregation.rs b/docs/source/src/rust/user-guide/expressions/aggregation.rs
index 9436565330bf..76b6ce22272d 100644
--- a/docs/source/src/rust/user-guide/expressions/aggregation.rs
+++ b/docs/source/src/rust/user-guide/expressions/aggregation.rs
@@ -1,9 +1,8 @@
-use polars::prelude::*;
-
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     // --8<-- [start:dataframe]
     use std::io::Cursor;
 
+    use polars::prelude::*;
     use reqwest::blocking::Client;
 
     let url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv";
@@ -89,7 +88,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         .clone()
         .lazy()
         .group_by(["state", "party"])
-        .agg([col("party").count().alias("count")])
+        .agg([len().alias("count")])
         .filter(
             col("party")
                 .eq(lit("Anti-Administration"))
@@ -109,7 +108,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     // --8<-- [start:filter]
     fn compute_age() -> Expr {
-        lit(2022) - col("birthday").dt().year()
+        lit(2024) - col("birthday").dt().year()
     }
 
     fn avg_birthday(gender: &str) -> Expr {
@@ -135,8 +134,12 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     println!("{}", df);
    // --8<-- [end:filter]
 
+    // --8<-- [start:filter-nested]
+    // Contribute the Rust translation of the Python example by opening a PR.
+ // --8<-- [end:filter-nested] + // --8<-- [start:sort] - fn get_person() -> Expr { + fn get_name() -> Expr { col("first_name") + lit(" ") + col("last_name") } @@ -151,8 +154,8 @@ fn main() -> Result<(), Box> { ) .group_by(["state"]) .agg([ - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), + get_name().first().alias("youngest"), + get_name().last().alias("oldest"), ]) .limit(5) .collect()?; @@ -172,9 +175,9 @@ fn main() -> Result<(), Box> { ) .group_by(["state"]) .agg([ - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), - get_person() + get_name().first().alias("youngest"), + get_name().last().alias("oldest"), + get_name() .sort(Default::default()) .first() .alias("alphabetical_first"), @@ -197,16 +200,15 @@ fn main() -> Result<(), Box> { ) .group_by(["state"]) .agg([ - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), - get_person() + get_name().first().alias("youngest"), + get_name().last().alias("oldest"), + get_name() .sort(Default::default()) .first() .alias("alphabetical_first"), col("gender") .sort_by(["first_name"], SortMultipleOptions::default()) - .first() - .alias("gender"), + .first(), ]) .sort(["state"], SortMultipleOptions::default()) .limit(5) diff --git a/docs/source/src/rust/user-guide/expressions/casting.rs b/docs/source/src/rust/user-guide/expressions/casting.rs index 85824afc3198..6e0cb2b9576a 100644 --- a/docs/source/src/rust/user-guide/expressions/casting.rs +++ b/docs/source/src/rust/user-guide/expressions/casting.rs @@ -1,21 +1,18 @@ -// --8<-- [start:setup] -use polars::prelude::*; -// --8<-- [end:setup] - fn main() -> Result<(), Box> { // --8<-- [start:dfnum] + use polars::prelude::*; + let df = df! ( - "integers"=> &[1, 2, 3, 4, 5], - "big_integers"=> &[1, 10000002, 3, 10000004, 10000005], - "floats"=> &[4.0, 5.0, 6.0, 7.0, 8.0], - "floats_with_decimal"=> &[4.532, 5.5, 6.5, 7.5, 8.5], + "integers"=> [1, 2, 3], + "big_integers"=> [10000002, 2, 30000003], + "floats"=> [4.0, 5.8, -6.3], )?; - println!("{}", &df); + println!("{}", df); // --8<-- [end:dfnum] // --8<-- [start:castnum] - let out = df + let result = df .clone() .lazy() .select([ @@ -25,193 +22,145 @@ fn main() -> Result<(), Box> { col("floats") .cast(DataType::Int32) .alias("floats_as_integers"), - col("floats_with_decimal") - .cast(DataType::Int32) - .alias("floats_with_decimal_as_integers"), ]) .collect()?; - println!("{}", &out); + println!("{}", result); // --8<-- [end:castnum] // --8<-- [start:downcast] - let out = df + println!("Before downcasting: {} bytes", df.estimated_size()); + let result = df .clone() .lazy() - .select([ - col("integers") - .cast(DataType::Int16) - .alias("integers_smallfootprint"), - col("floats") - .cast(DataType::Float32) - .alias("floats_smallfootprint"), + .with_columns([ + col("integers").cast(DataType::Int16), + col("floats").cast(DataType::Float32), ]) - .collect(); - match out { - Ok(out) => println!("{}", &out), - Err(e) => println!("{:?}", e), - }; + .collect()?; + println!("After downcasting: {} bytes", result.estimated_size()); // --8<-- [end:downcast] // --8<-- [start:overflow] - - let out = df + let result = df .clone() .lazy() .select([col("big_integers").strict_cast(DataType::Int8)]) .collect(); - match out { - Ok(out) => println!("{}", &out), - Err(e) => println!("{:?}", e), + if let Err(e) = result { + println!("{}", e) }; // --8<-- [end:overflow] // --8<-- [start:overflow2] - let out = df + let result = df .clone() .lazy() 
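+        // With the default non-strict cast, values that do not fit in the
+        // target type become null instead of raising an error.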
.select([col("big_integers").cast(DataType::Int8)]) - .collect(); - match out { - Ok(out) => println!("{}", &out), - Err(e) => println!("{:?}", e), - }; + .collect()?; + println!("{}", result); // --8<-- [end:overflow2] // --8<-- [start:strings] - let df = df! ( - "integers" => &[1, 2, 3, 4, 5], - "float" => &[4.0, 5.03, 6.0, 7.0, 8.0], - "floats_as_string" => &["4.0", "5.0", "6.0", "7.0", "8.0"], + "integers_as_strings" => ["1", "2", "3"], + "floats_as_strings" => ["4.0", "5.8", "-6.3"], + "floats" => [4.0, 5.8, -6.3], )?; - let out = df + let result = df .clone() .lazy() .select([ - col("integers").cast(DataType::String), - col("float").cast(DataType::String), - col("floats_as_string").cast(DataType::Float64), + col("integers_as_strings").cast(DataType::Int32), + col("floats_as_strings").cast(DataType::Float64), + col("floats").cast(DataType::String), ]) .collect()?; - println!("{}", &out); + println!("{}", result); // --8<-- [end:strings] // --8<-- [start:strings2] + let df = df! ("floats" => ["4.0", "5.8", "- 6 . 3"])?; - let df = df! ("strings_not_float"=> ["4.0", "not_a_number", "6.0", "7.0", "8.0"])?; - - let out = df + let result = df .clone() .lazy() - .select([col("strings_not_float").cast(DataType::Float64)]) + .select([col("floats").strict_cast(DataType::Float64)]) .collect(); - match out { - Ok(out) => println!("{}", &out), - Err(e) => println!("{:?}", e), + if let Err(e) = result { + println!("{}", e) }; // --8<-- [end:strings2] // --8<-- [start:bool] - let df = df! ( - "integers"=> &[-1, 0, 2, 3, 4], - "floats"=> &[0.0, 1.0, 2.0, 3.0, 4.0], - "bools"=> &[true, false, true, false, true], + "integers"=> [-1, 0, 2, 3, 4], + "floats"=> [0.0, 1.0, 2.0, 3.0, 4.0], + "bools"=> [true, false, true, false, true], )?; - let out = df + let result = df .clone() .lazy() .select([ col("integers").cast(DataType::Boolean), col("floats").cast(DataType::Boolean), + col("bools").cast(DataType::UInt8), ]) .collect()?; - println!("{}", &out); + println!("{}", result); // --8<-- [end:bool] // --8<-- [start:dates] use chrono::prelude::*; - let date = polars::time::date_range( - "date".into(), - NaiveDate::from_ymd_opt(2022, 1, 1) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(), - NaiveDate::from_ymd_opt(2022, 1, 5) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(), - Duration::parse("1d"), - ClosedWindow::Both, - TimeUnit::Milliseconds, - None, - )? - .cast(&DataType::Date)?; - - let datetime = polars::time::date_range( - "datetime".into(), - NaiveDate::from_ymd_opt(2022, 1, 1) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(), - NaiveDate::from_ymd_opt(2022, 1, 5) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(), - Duration::parse("1d"), - ClosedWindow::Both, - TimeUnit::Milliseconds, - None, - )?; - - let df = df! ( - "date" => date, - "datetime" => datetime, - )?; - - let out = df + let df = df!( + "date" => [ + NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(), // epoch + NaiveDate::from_ymd_opt(1970, 1, 10).unwrap(), // 9 days later + ], + "datetime" => [ + NaiveDate::from_ymd_opt(1970, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), // epoch + NaiveDate::from_ymd_opt(1970, 1, 1).unwrap().and_hms_opt(0, 1, 0).unwrap(), // 1 minute later + ], + "time" => [ + NaiveTime::from_hms_opt(0, 0, 0).unwrap(), // reference time + NaiveTime::from_hms_opt(0, 0, 1).unwrap(), // 1 second later + ] + ) + .unwrap() + .lazy() + // Make the time unit match that of Python's for the same results. 
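+    // (Python Polars builds datetimes with a microsecond time unit by default.)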
+ .with_column(col("datetime").cast(DataType::Datetime(TimeUnit::Microseconds, None))) + .collect()?; + + let result = df .clone() .lazy() .select([ - col("date").cast(DataType::Int64), - col("datetime").cast(DataType::Int64), + col("date").cast(DataType::Int64).alias("days_since_epoch"), + col("datetime") + .cast(DataType::Int64) + .alias("us_since_epoch"), + col("time").cast(DataType::Int64).alias("ns_since_midnight"), ]) .collect()?; - println!("{}", &out); + println!("{}", result); // --8<-- [end:dates] // --8<-- [start:dates2] - let date = polars::time::date_range( - "date".into(), - NaiveDate::from_ymd_opt(2022, 1, 1) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(), - NaiveDate::from_ymd_opt(2022, 1, 5) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(), - Duration::parse("1d"), - ClosedWindow::Both, - TimeUnit::Milliseconds, - None, - )?; - let df = df! ( - "date" => date, - "string" => &[ + "date" => [ + NaiveDate::from_ymd_opt(2022, 1, 1).unwrap(), + NaiveDate::from_ymd_opt(2022, 1, 2).unwrap(), + ], + "string" => [ "2022-01-01", "2022-01-02", - "2022-01-03", - "2022-01-04", - "2022-01-05", ], )?; - let out = df + let result = df .clone() .lazy() .select([ @@ -224,7 +173,7 @@ fn main() -> Result<(), Box> { ), ]) .collect()?; - println!("{}", &out); + println!("{}", result); // --8<-- [end:dates2] Ok(()) diff --git a/docs/source/src/rust/user-guide/expressions/column-selections.rs b/docs/source/src/rust/user-guide/expressions/column-selections.rs index c0f3f35ac3b0..0dff5ab38c62 100644 --- a/docs/source/src/rust/user-guide/expressions/column-selections.rs +++ b/docs/source/src/rust/user-guide/expressions/column-selections.rs @@ -79,7 +79,7 @@ fn main() -> Result<(), Box> { // --8<-- [start:selectors_by_name] // Not available in Rust, refer the following link - // https://github.com/pola-rs/polars/issues/1059 + // https://github.com/pola-rs/polars/issues/10594 // --8<-- [end:selectors_by_name] // --8<-- [start:selectors_to_expr] diff --git a/docs/source/src/rust/user-guide/expressions/expression-expansion.rs b/docs/source/src/rust/user-guide/expressions/expression-expansion.rs new file mode 100644 index 000000000000..a1752a7d0334 --- /dev/null +++ b/docs/source/src/rust/user-guide/expressions/expression-expansion.rs @@ -0,0 +1,215 @@ +fn main() -> Result<(), Box> { + // --8<-- [start:df] + use polars::prelude::*; + + // Data as of 14th October 2024, ~3pm UTC + let df = df!( + "ticker" => ["AAPL", "NVDA", "MSFT", "GOOG", "AMZN"], + "company_name" => ["Apple", "NVIDIA", "Microsoft", "Alphabet (Google)", "Amazon"], + "price" => [229.9, 138.93, 420.56, 166.41, 188.4], + "day_high" => [231.31, 139.6, 424.04, 167.62, 189.83], + "day_low" => [228.6, 136.3, 417.52, 164.78, 188.44], + "year_high" => [237.23, 140.76, 468.35, 193.31, 201.2], + "year_low" => [164.08, 39.23, 324.39, 121.46, 118.35], + )?; + + println!("{}", df); + // --8<-- [end:df] + + // --8<-- [start:col-with-names] + let eur_usd_rate = 1.09; // As of 14th October 2024 + + let result = df + .clone() + .lazy() + .with_column( + (cols(["price", "day_high", "day_low", "year_high", "year_low"]) / lit(eur_usd_rate)) + .round(2), + ) + .collect()?; + println!("{}", result); + // --8<-- [end:col-with-names] + + // --8<-- [start:expression-list] + let exprs = [ + (col("price") / lit(eur_usd_rate)).round(2), + (col("day_high") / lit(eur_usd_rate)).round(2), + (col("day_low") / lit(eur_usd_rate)).round(2), + (col("year_high") / lit(eur_usd_rate)).round(2), + (col("year_low") / lit(eur_usd_rate)).round(2), + ]; + + let result2 = 
df.clone().lazy().with_columns(exprs).collect()?; + println!("{}", result.equals(&result2)); + // --8<-- [end:expression-list] + + // --8<-- [start:col-with-dtype] + let result = df + .clone() + .lazy() + .with_column((dtype_col(&DataType::Float64) / lit(eur_usd_rate)).round(2)) + .collect()?; + println!("{}", result); + // --8<-- [end:col-with-dtype] + + // --8<-- [start:col-with-dtypes] + let result2 = df + .clone() + .lazy() + .with_column( + (dtype_cols([DataType::Float32, DataType::Float64]) / lit(eur_usd_rate)).round(2), + ) + .collect()?; + println!("{}", result.equals(&result2)); + // --8<-- [end:col-with-dtypes] + + // --8<-- [start:col-with-regex] + // NOTE: Using regex inside `col`/`cols` requires the feature flag `regex`. + let result = df + .clone() + .lazy() + .select([cols(["ticker", "^.*_high$", "^.*_low$"])]) + .collect()?; + println!("{}", result); + // --8<-- [end:col-with-regex] + + // --8<-- [start:all] + let result = df.clone().lazy().select([all()]).collect()?; + println!("{}", result.equals(&df)); + // --8<-- [end:all] + + // --8<-- [start:all-exclude] + let result = df + .clone() + .lazy() + .select([all().exclude(["^day_.*$"])]) + .collect()?; + println!("{}", result); + // --8<-- [end:all-exclude] + + // --8<-- [start:col-exclude] + let result = df + .clone() + .lazy() + .select([dtype_col(&DataType::Float64).exclude(["^day_.*$"])]) + .collect()?; + println!("{}", result); + // --8<-- [end:col-exclude] + + // --8<-- [start:duplicate-error] + let gbp_usd_rate = 1.31; // As of 14th October 2024 + + let result = df + .clone() + .lazy() + .select([ + col("price") / lit(gbp_usd_rate), + col("price") / lit(eur_usd_rate), + ]) + .collect(); + match result { + Ok(df) => println!("{}", df), + Err(e) => println!("{}", e), + }; + // --8<-- [end:duplicate-error] + + // --8<-- [start:alias] + let _result = df + .clone() + .lazy() + .select([ + (col("price") / lit(gbp_usd_rate)).alias("price (GBP)"), + (col("price") / lit(eur_usd_rate)).alias("price (EUR)"), + ]) + .collect()?; + // --8<-- [end:alias] + + // --8<-- [start:prefix-suffix] + let result = df + .clone() + .lazy() + .select([ + (col("^year_.*$") / lit(eur_usd_rate)) + .name() + .prefix("in_eur_"), + (cols(["day_high", "day_low"]) / lit(gbp_usd_rate)) + .name() + .suffix("_gbp"), + ]) + .collect()?; + println!("{}", result); + // --8<-- [end:prefix-suffix] + + // --8<-- [start:name-map] + // There is also `name().to_uppercase()`, so this usage of `map` is moot. + let result = df + .clone() + .lazy() + .select([all() + .name() + .map(|name| Ok(PlSmallStr::from_string(name.to_ascii_uppercase())))]) + .collect()?; + println!("{}", result); + // --8<-- [end:name-map] + + // --8<-- [start:for-with_columns] + let mut result = df.clone().lazy(); + for tp in ["day", "year"] { + let high = format!("{}_high", tp); + let low = format!("{}_low", tp); + let aliased = format!("{}_amplitude", tp); + result = result.with_column((col(high) - col(low)).alias(aliased)) + } + let result = result.collect()?; + println!("{}", result); + // --8<-- [end:for-with_columns] + + // --8<-- [start:yield-expressions] + let mut exprs: Vec = vec![]; + for tp in ["day", "year"] { + let high = format!("{}_high", tp); + let low = format!("{}_low", tp); + let aliased = format!("{}_amplitude", tp); + exprs.push((col(high) - col(low)).alias(aliased)) + } + let result = df.clone().lazy().with_columns(exprs).collect()?; + println!("{}", result); + // --8<-- [end:yield-expressions] + + // --8<-- [start:selectors] + // Selectors are not available in Rust yet. 
+ // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors] + + // --8<-- [start:selectors-set-operations] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors-set-operations] + + // --8<-- [start:selectors-expressions] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors-expressions] + + // --8<-- [start:selector-ambiguity] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selector-ambiguity] + + // --8<-- [start:as_expr] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:as_expr] + + // --8<-- [start:is_selector] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:is_selector] + + // --8<-- [start:expand_selector] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:expand_selector] + + Ok(()) +} diff --git a/docs/source/src/rust/user-guide/expressions/folds.rs b/docs/source/src/rust/user-guide/expressions/folds.rs index e7df220e0644..5f986233ecda 100644 --- a/docs/source/src/rust/user-guide/expressions/folds.rs +++ b/docs/source/src/rust/user-guide/expressions/folds.rs @@ -1,47 +1,36 @@ -use polars::prelude::*; - fn main() -> Result<(), Box> { // --8<-- [start:mansum] - let df = df!( - "a" => &[1, 2, 3], - "b" => &[10, 20, 30], - )?; - - let out = df - .lazy() - .select([fold_exprs(lit(0), |acc, x| (acc + x).map(Some), [col("*")]).alias("sum")]) - .collect()?; - println!("{}", out); + use polars::prelude::*; + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:mansum] - // --8<-- [start:conditional] - let df = df!( - "a" => &[1, 2, 3], - "b" => &[0, 1, 2], - )?; + // --8<-- [start:mansum-explicit] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:mansum-explicit] - let out = df - .lazy() - .filter(fold_exprs( - lit(true), - |acc, x| acc.bitand(&x).map(Some), - [col("*").gt(1)], - )) - .collect()?; - println!("{}", out); + // --8<-- [start:manprod] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:manprod] + + // --8<-- [start:manprod-fixed] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:manprod-fixed] + + // --8<-- [start:conditional] + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:conditional] // --8<-- [start:string] let df = df!( - "a" => &["a", "b", "c"], - "b" => &[1, 2, 3], + "a" => ["a", "b", "c"], + "b" => [1, 2, 3], )?; - let out = df + let result = df .lazy() .select([concat_str([col("a"), col("b")], "", false)]) .collect()?; - println!("{:?}", out); + println!("{:?}", result); // --8<-- [end:string] Ok(()) diff --git a/docs/source/src/rust/user-guide/expressions/functions.rs b/docs/source/src/rust/user-guide/expressions/functions.rs deleted file mode 100644 index 490809b75557..000000000000 --- a/docs/source/src/rust/user-guide/expressions/functions.rs +++ /dev/null @@ -1,79 +0,0 @@ -use polars::prelude::*; - -fn main() -> Result<(), Box> { - // --8<-- [start:dataframe] - use rand::{thread_rng, Rng}; - - let mut arr = [0f64; 5]; - thread_rng().fill(&mut arr); - - let df = df! 
( - "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)], - "names" => &["foo", "ham", "spam", "egg", "spam"], - "random" => &arr, - "groups" => &["A", "A", "B", "C", "B"], - )?; - - println!("{}", &df); - // --8<-- [end:dataframe] - - // --8<-- [start:samename] - let df_samename = df.clone().lazy().select([col("nrs") + lit(5)]).collect()?; - println!("{}", &df_samename); - // --8<-- [end:samename] - - // --8<-- [start:samenametwice] - let df_samename2 = df - .clone() - .lazy() - .select([col("nrs") + lit(5), col("nrs") - lit(5)]) - .collect(); - match df_samename2 { - Ok(df) => println!("{}", &df), - Err(e) => println!("{:?}", &e), - }; - // --8<-- [end:samenametwice] - - // --8<-- [start:samenamealias] - let df_alias = df - .clone() - .lazy() - .select([ - (col("nrs") + lit(5)).alias("nrs + 5"), - (col("nrs") - lit(5)).alias("nrs - 5"), - ]) - .collect()?; - println!("{}", &df_alias); - // --8<-- [end:samenamealias] - - // --8<-- [start:countunique] - let df_alias = df - .clone() - .lazy() - .select([ - col("names").n_unique().alias("unique"), - // Following query shows there isn't anything in Rust API - // https://docs.rs/polars/latest/polars/?search=approx_n_unique - // col("names").approx_n_unique().alias("unique_approx"), - ]) - .collect()?; - println!("{}", &df_alias); - // --8<-- [end:countunique] - - // --8<-- [start:conditional] - let df_conditional = df - .clone() - .lazy() - .select([ - col("nrs"), - when(col("nrs").gt(2)) - .then(lit(true)) - .otherwise(lit(false)) - .alias("conditional"), - ]) - .collect()?; - println!("{}", &df_conditional); - // --8<-- [end:conditional] - - Ok(()) -} diff --git a/docs/source/src/rust/user-guide/expressions/lists.rs b/docs/source/src/rust/user-guide/expressions/lists.rs index fd097d98df7e..ee8bf9597ce7 100644 --- a/docs/source/src/rust/user-guide/expressions/lists.rs +++ b/docs/source/src/rust/user-guide/expressions/lists.rs @@ -1,165 +1,51 @@ -// --8<-- [start:setup] -use polars::prelude::*; -// --8<-- [end:setup] fn main() -> Result<(), Box> { - // --8<-- [start:weather_df] - let stns: Vec = (1..6).map(|i| format!("Station {i}")).collect(); - let weather = df!( - "station"=> &stns, - "temperatures"=> &[ - "20 5 5 E1 7 13 19 9 6 20", - "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40", - "19 24 E9 16 6 12 10 22", - "E2 E0 15 7 8 10 E1 24 17 13 6", - "14 8 E0 16 22 24 E1", - ], - )?; - println!("{}", &weather); - // --8<-- [end:weather_df] + // --8<-- [start:list-example] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:list-example] - // --8<-- [start:string_to_list] - let out = weather - .clone() - .lazy() - .with_columns([col("temperatures").str().split(lit(" "))]) - .collect()?; - println!("{}", &out); - // --8<-- [end:string_to_list] + // --8<-- [start:array-example] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:array-example] - // --8<-- [start:explode_to_atomic] - let out = weather - .clone() - .lazy() - .with_columns([col("temperatures").str().split(lit(" "))]) - .explode(["temperatures"]) - .collect()?; - println!("{}", &out); - // --8<-- [end:explode_to_atomic] + // --8<-- [start:numpy-array-inference] + // Contribute the Rust translation of the Python example by opening a PR. 
+ // --8<-- [end:numpy-array-inference] - // --8<-- [start:list_ops] - let out = weather - .clone() - .lazy() - .with_columns([col("temperatures").str().split(lit(" "))]) - .with_columns([ - col("temperatures").list().head(lit(3)).alias("top3"), - col("temperatures") - .list() - .slice(lit(-3), lit(3)) - .alias("bottom_3"), - col("temperatures").list().len().alias("obs"), - ]) - .collect()?; - println!("{}", &out); - // --8<-- [end:list_ops] + // --8<-- [start:weather] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:weather] - // --8<-- [start:count_errors] - let out = weather - .clone() - .lazy() - .with_columns([col("temperatures") - .str() - .split(lit(" ")) - .list() - .eval(col("").cast(DataType::Int64).is_null(), false) - .list() - .sum() - .alias("errors")]) - .collect()?; - println!("{}", &out); - // --8<-- [end:count_errors] + // --8<-- [start:split] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:split] - // --8<-- [start:count_errors_regex] - let out = weather - .clone() - .lazy() - .with_columns([col("temperatures") - .str() - .split(lit(" ")) - .list() - .eval(col("").str().contains(lit("(?i)[a-z]"), false), false) - .list() - .sum() - .alias("errors")]) - .collect()?; - println!("{}", &out); - // --8<-- [end:count_errors_regex] + // --8<-- [start:explode] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:explode] - // --8<-- [start:weather_by_day] - let stns: Vec = (1..11).map(|i| format!("Station {i}")).collect(); - let weather_by_day = df!( - "station" => &stns, - "day_1" => &[17, 11, 8, 22, 9, 21, 20, 8, 8, 17], - "day_2" => &[15, 11, 10, 8, 7, 14, 18, 21, 15, 13], - "day_3" => &[16, 15, 24, 24, 8, 23, 19, 23, 16, 10], - )?; - println!("{}", &weather_by_day); - // --8<-- [end:weather_by_day] - - // --8<-- [start:weather_by_day_rank] - let rank_pct = (col("") - .rank( - RankOptions { - method: RankMethod::Average, - descending: true, - }, - None, - ) - .cast(DataType::Float32) - / col("*").count().cast(DataType::Float32)) - .round(2); + // --8<-- [start:list-slicing] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:list-slicing] - let out = weather_by_day - .clone() - .lazy() - .with_columns( - // create the list of homogeneous data - [concat_list([all().exclude(["station"])])?.alias("all_temps")], - ) - .select( - // select all columns except the intermediate list - [ - all().exclude(["all_temps"]), - // compute the rank by calling `list.eval` - col("all_temps") - .list() - .eval(rank_pct, true) - .alias("temps_rank"), - ], - ) - .collect()?; + // --8<-- [start:element-wise-casting] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:element-wise-casting] - println!("{}", &out); - // --8<-- [end:weather_by_day_rank] + // --8<-- [start:element-wise-regex] + // Contribute the Rust translation of the Python example by opening a PR. 
+ // --8<-- [end:element-wise-regex] - // --8<-- [start:array_df] - let mut col1: ListPrimitiveChunkedBuilder = - ListPrimitiveChunkedBuilder::new("Array_1".into(), 8, 8, DataType::Int32); - col1.append_slice(&[1, 3]); - col1.append_slice(&[2, 5]); - let mut col2: ListPrimitiveChunkedBuilder = - ListPrimitiveChunkedBuilder::new("Array_2".into(), 8, 8, DataType::Int32); - col2.append_slice(&[1, 7, 3]); - col2.append_slice(&[8, 1, 0]); - let array_df = DataFrame::new(vec![ - col1.finish().into_column(), - col2.finish().into_column(), - ])?; + // --8<-- [start:weather_by_day] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:weather_by_day] - println!("{}", &array_df); - // --8<-- [end:array_df] + // --8<-- [start:rank_pct] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:rank_pct] - // --8<-- [start:array_ops] - let out = array_df - .clone() - .lazy() - .select([ - col("Array_1").list().min().name().suffix("_min"), - col("Array_2").list().sum().name().suffix("_sum"), - ]) - .collect()?; - println!("{}", &out); - // --8<-- [end:array_ops] + // --8<-- [start:array-overview] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:array-overview] Ok(()) } diff --git a/docs/source/src/rust/user-guide/expressions/missing-data.rs b/docs/source/src/rust/user-guide/expressions/missing-data.rs index 8d78310cb0a9..437379dda37e 100644 --- a/docs/source/src/rust/user-guide/expressions/missing-data.rs +++ b/docs/source/src/rust/user-guide/expressions/missing-data.rs @@ -1,18 +1,16 @@ -use polars::prelude::*; - fn main() -> Result<(), Box> { // --8<-- [start:dataframe] - + use polars::prelude::*; let df = df! ( "value" => &[Some(1), None], )?; - println!("{}", &df); + println!("{}", df); // --8<-- [end:dataframe] // --8<-- [start:count] let null_count_df = df.null_count(); - println!("{}", &null_count_df); + println!("{}", null_count_df); // --8<-- [end:count] // --8<-- [start:isnull] @@ -21,69 +19,42 @@ fn main() -> Result<(), Box> { .lazy() .select([col("value").is_null()]) .collect()?; - println!("{}", &is_null_series); + println!("{}", is_null_series); // --8<-- [end:isnull] // --8<-- [start:dataframe2] - let df = df!( - "col1" => &[Some(1), Some(2), Some(3)], - "col2" => &[Some(1), None, Some(3)], - - )?; - println!("{}", &df); + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:dataframe2] // --8<-- [start:fill] - let fill_literal_df = df - .clone() - .lazy() - .with_columns([col("col2").fill_null(lit(2))]) - .collect()?; - println!("{}", &fill_literal_df); + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:fill] // --8<-- [start:fillstrategy] - let fill_forward_df = df - .clone() - .lazy() - .with_columns([col("col2").forward_fill(None)]) - .collect()?; - println!("{}", &fill_forward_df); + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:fillstrategy] // --8<-- [start:fillexpr] - let fill_median_df = df - .clone() - .lazy() - .with_columns([col("col2").fill_null(median("col2"))]) - .collect()?; - println!("{}", &fill_median_df); + // Contribute the Rust translation of the Python example by opening a PR. 
     // --8<-- [end:fillexpr]
 
     // --8<-- [start:fillinterpolate]
-    let fill_interpolation_df = df
-        .clone()
-        .lazy()
-        .with_columns([col("col2").interpolate(InterpolationMethod::Linear)])
-        .collect()?;
-    println!("{}", &fill_interpolation_df);
+    // Contribute the Rust translation of the Python example by opening a PR.
     // --8<-- [end:fillinterpolate]
 
     // --8<-- [start:nan]
     let nan_df = df!(
         "value" => [1.0, f64::NAN, f64::NAN, 3.0],
     )?;
-    println!("{}", &nan_df);
+    println!("{}", nan_df);
     // --8<-- [end:nan]
 
+    // --8<-- [start:nan-computed]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:nan-computed]
+
     // --8<-- [start:nanfill]
-    let mean_nan_df = nan_df
-        .clone()
-        .lazy()
-        .with_columns([col("value").fill_nan(lit(NULL)).alias("value")])
-        .mean()
-        .collect()?;
-    println!("{}", &mean_nan_df);
+    // Contribute the Rust translation of the Python example by opening a PR.
     // --8<-- [end:nanfill]
     Ok(())
 }
diff --git a/docs/source/src/rust/user-guide/expressions/operations.rs b/docs/source/src/rust/user-guide/expressions/operations.rs
new file mode 100644
index 000000000000..55fbf9412f0e
--- /dev/null
+++ b/docs/source/src/rust/user-guide/expressions/operations.rs
@@ -0,0 +1,138 @@
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // --8<-- [start:dataframe]
+    use polars::prelude::*;
+
+    let df = df! (
+        "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)],
+        "names" => &["foo", "ham", "spam", "egg", "spam"],
+        "random" => &[0.37454, 0.950714, 0.731994, 0.598658, 0.156019],
+        "groups" => &["A", "A", "B", "A", "B"],
+    )?;
+
+    println!("{}", &df);
+    // --8<-- [end:dataframe]
+
+    // --8<-- [start:arithmetic]
+    let result = df
+        .clone()
+        .lazy()
+        .select([
+            (col("nrs") + lit(5)).alias("nrs + 5"),
+            (col("nrs") - lit(5)).alias("nrs - 5"),
+            (col("nrs") * col("random")).alias("nrs * random"),
+            (col("nrs") / col("random")).alias("nrs / random"),
+            (col("nrs").pow(lit(2))).alias("nrs ** 2"),
+            (col("nrs") % lit(3)).alias("nrs % 3"),
+        ])
+        .collect()?;
+    println!("{}", result);
+    // --8<-- [end:arithmetic]
+
+    // --8<-- [start:comparison]
+    let result = df
+        .clone()
+        .lazy()
+        .select([
+            col("nrs").gt(1).alias("nrs > 1"),
+            col("nrs").gt_eq(3).alias("nrs >= 3"),
+            col("random").lt(0.2).alias("random < .2"),
+            col("random").lt_eq(0.5).alias("random <= .5"),
+            col("nrs").neq(1).alias("nrs != 1"),
+            col("nrs").eq(1).alias("nrs == 1"),
+        ])
+        .collect()?;
+    println!("{}", result);
+    // --8<-- [end:comparison]
+
+    // --8<-- [start:boolean]
+    let result = df
+        .clone()
+        .lazy()
+        .select([
+            ((col("nrs").is_null()).not().and(col("groups").eq(lit("A"))))
+                .alias("number not null and group A"),
+            (col("random").lt(lit(0.5)).or(col("groups").eq(lit("B"))))
+                .alias("random < 0.5 or group B"),
+        ])
+        .collect()?;
+    println!("{}", result);
+    // --8<-- [end:boolean]
+
+    // --8<-- [start:bitwise]
+    let result = df
+        .clone()
+        .lazy()
+        .select([
+            col("nrs"),
+            col("nrs").and(lit(6)).alias("nrs & 6"),
+            col("nrs").or(lit(6)).alias("nrs | 6"),
+            col("nrs").not().alias("not nrs"),
+            col("nrs").xor(lit(6)).alias("nrs ^ 6"),
+        ])
+        .collect()?;
+    println!("{}", result);
+    // --8<-- [end:bitwise]
+
+    // --8<-- [start:count]
+    use rand::distributions::{Distribution, Uniform};
+    use rand::thread_rng;
+
+    let mut rng = thread_rng();
+    let between = Uniform::new_inclusive(0, 100_000);
+    let arr: Vec<u32> = between.sample_iter(&mut rng).take(100_000).collect();
+
+    let long_df = df!(
+        "numbers" => &arr
+    )?;
+
+    let result = long_df
+        .clone()
+        .lazy()
+        .select([
+            
col("numbers").n_unique().alias("n_unique"), + col("numbers").approx_n_unique().alias("approx_n_unique"), + ]) + .collect()?; + println!("{}", result); + // --8<-- [end:count] + + // --8<-- [start:value_counts] + let result = df + .clone() + .lazy() + .select([col("names") + .value_counts(false, false, "count", false) + .alias("value_counts")]) + .collect()?; + println!("{}", result); + // --8<-- [end:value_counts] + + // --8<-- [start:unique_counts] + let result = df + .clone() + .lazy() + .select([ + col("names").unique_stable().alias("unique"), + col("names").unique_counts().alias("unique_counts"), + ]) + .collect()?; + println!("{}", result); + // --8<-- [end:unique_counts] + + // --8<-- [start:collatz] + let result = df + .clone() + .lazy() + .select([ + col("nrs"), + when((col("nrs") % lit(2)).eq(lit(1))) + .then(lit(3) * col("nrs") + lit(1)) + .otherwise(col("nrs") / lit(2)) + .alias("Collatz"), + ]) + .collect()?; + println!("{}", result); + // --8<-- [end:collatz] + + Ok(()) +} diff --git a/docs/source/src/rust/user-guide/expressions/operators.rs b/docs/source/src/rust/user-guide/expressions/operators.rs deleted file mode 100644 index 868d301c2182..000000000000 --- a/docs/source/src/rust/user-guide/expressions/operators.rs +++ /dev/null @@ -1,54 +0,0 @@ -use polars::prelude::*; - -fn main() -> Result<(), Box> { - // --8<-- [start:dataframe] - use rand::{thread_rng, Rng}; - - let mut arr = [0f64; 5]; - thread_rng().fill(&mut arr); - - let df = df! ( - "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)], - "names" => &[Some("foo"), Some("ham"), Some("spam"), Some("eggs"), None], - "random" => &arr, - "groups" => &["A", "A", "B", "C", "B"], - )?; - - println!("{}", &df); - // --8<-- [end:dataframe] - - // --8<-- [start:numerical] - let df_numerical = df - .clone() - .lazy() - .select([ - (col("nrs") + lit(5)).alias("nrs + 5"), - (col("nrs") - lit(5)).alias("nrs - 5"), - (col("nrs") * col("random")).alias("nrs * random"), - (col("nrs") / col("random")).alias("nrs / random"), - ]) - .collect()?; - println!("{}", &df_numerical); - // --8<-- [end:numerical] - - // --8<-- [start:logical] - let df_logical = df - .clone() - .lazy() - .select([ - col("nrs").gt(1).alias("nrs > 1"), - col("random").lt_eq(0.5).alias("random < .5"), - col("nrs").neq(1).alias("nrs != 1"), - col("nrs").eq(1).alias("nrs == 1"), - (col("random").lt_eq(0.5)) - .and(col("nrs").gt(1)) - .alias("and_expr"), // and - (col("random").lt_eq(0.5)) - .or(col("nrs").gt(1)) - .alias("or_expr"), // or - ]) - .collect()?; - println!("{}", &df_logical); - // --8<-- [end:logical] - Ok(()) -} diff --git a/docs/source/src/rust/user-guide/expressions/strings.rs b/docs/source/src/rust/user-guide/expressions/strings.rs index 8ebcfa5d6f22..60903fa827f5 100644 --- a/docs/source/src/rust/user-guide/expressions/strings.rs +++ b/docs/source/src/rust/user-guide/expressions/strings.rs @@ -1,93 +1,35 @@ -// --8<-- [start:setup] -use polars::prelude::*; -// --8<-- [end:setup] - fn main() -> Result<(), Box> { // --8<-- [start:df] - let df = df! ( - "animal" => &[Some("Crab"), Some("cat and dog"), Some("rab$bit"), None], - )?; - - let out = df - .clone() - .lazy() - .select([ - col("animal").str().len_bytes().alias("byte_count"), - col("animal").str().len_chars().alias("letter_count"), - ]) - .collect()?; - - println!("{}", &out); + // Contribute the Rust translation of the Python example by opening a PR. 
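+    //
+    // A possible starting point, adapted from the previous Rust version of this
+    // example and the new Python data, assuming the usual
+    // `use polars::prelude::*;` import; left commented as an untested sketch:
+    //
+    // let df = df!(
+    //     "language" => ["English", "Dutch", "Portuguese", "Finnish"],
+    //     "fruit" => ["pear", "peer", "pêra", "päärynä"],
+    // )?;
+    //
+    // let result = df
+    //     .lazy()
+    //     .with_columns([
+    //         col("fruit").str().len_bytes().alias("byte_count"),
+    //         col("fruit").str().len_chars().alias("letter_count"),
+    //     ])
+    //     .collect()?;
+    // println!("{}", result);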
// --8<-- [end:df] // --8<-- [start:existence] - let out = df - .clone() - .lazy() - .select([ - col("animal"), - col("animal") - .str() - .contains(lit("cat|bit"), false) - .alias("regex"), - col("animal") - .str() - .contains_literal(lit("rab$")) - .alias("literal"), - col("animal") - .str() - .starts_with(lit("rab")) - .alias("starts_with"), - col("animal").str().ends_with(lit("dog")).alias("ends_with"), - ]) - .collect()?; - println!("{}", &out); + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:existence] // --8<-- [start:extract] - let df = df!( - "a" => &[ - "http://vote.com/ballon_dor?candidate=messi&ref=polars", - "http://vote.com/ballon_dor?candidat=jorginho&ref=polars", - "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars", - ] - )?; - let out = df - .clone() - .lazy() - .select([col("a").str().extract(lit(r"candidate=(\w+)"), 1)]) - .collect()?; - println!("{}", &out); + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:extract] // --8<-- [start:extract_all] - let df = df!("foo"=> &["123 bla 45 asd", "xyz 678 910t"])?; - let out = df - .clone() - .lazy() - .select([col("foo") - .str() - .extract_all(lit(r"(\d+)")) - .alias("extracted_nrs")]) - .collect()?; - println!("{}", &out); + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:extract_all] // --8<-- [start:replace] - let df = df!("id"=> &[1, 2], "text"=> &["123abc", "abc456"])?; - let out = df - .clone() - .lazy() - .with_columns([ - col("text").str().replace(lit(r"abc\b"), lit("ABC"), false), - col("text") - .str() - .replace_all(lit("a"), lit("-"), false) - .alias("text_replace_all"), - ]) - .collect()?; - println!("{}", &out); + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:replace] + // --8<-- [start:casing] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:casing] + + // --8<-- [start:strip] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:strip] + + // --8<-- [start:slice] + // Contribute the Rust translation of the Python example by opening a PR. 
+ // --8<-- [end:slice] + Ok(()) } diff --git a/docs/source/src/rust/user-guide/expressions/structs.rs b/docs/source/src/rust/user-guide/expressions/structs.rs index cc6fff831d06..d62717f30628 100644 --- a/docs/source/src/rust/user-guide/expressions/structs.rs +++ b/docs/source/src/rust/user-guide/expressions/structs.rs @@ -1,35 +1,33 @@ -// --8<-- [start:setup] -use polars::prelude::*; -// --8<-- [end:setup] fn main() -> Result<(), Box> { // --8<-- [start:ratings_df] + use polars::prelude::*; let ratings = df!( - "Movie"=> &["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"], - "Theatre"=> &["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"], - "Avg_Rating"=> &[4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6], - "Count"=> &[30, 27, 26, 29, 31, 28, 28, 26, 33, 26], + "Movie"=> ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "Cars"], + "Theatre"=> ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "NE"], + "Avg_Rating"=> [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.5, 4.9, 4.7, 4.6], + "Count"=> [30, 27, 26, 29, 31, 28, 28, 26, 33, 28], )?; println!("{}", &ratings); // --8<-- [end:ratings_df] // --8<-- [start:state_value_counts] - let out = ratings + let result = ratings .clone() .lazy() .select([col("Theatre").value_counts(true, true, "count", false)]) .collect()?; - println!("{}", &out); + println!("{}", result); // --8<-- [end:state_value_counts] // --8<-- [start:struct_unnest] - let out = ratings + let result = ratings .clone() .lazy() .select([col("Theatre").value_counts(true, true, "count", false)]) .unnest(["Theatre"]) .collect()?; - println!("{}", &out); + println!("{}", result); // --8<-- [end:struct_unnest] // --8<-- [start:series_struct] @@ -44,44 +42,36 @@ fn main() -> Result<(), Box> { println!("{}", &rating_series); // // --8<-- [end:series_struct] + // --8<-- [start:series_struct_error] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:series_struct_error] + // --8<-- [start:series_struct_extract] - let out = rating_series.struct_()?.field_by_name("Movie")?; - println!("{}", &out); + let result = rating_series.struct_()?.field_by_name("Movie")?; + println!("{}", result); // --8<-- [end:series_struct_extract] // --8<-- [start:series_struct_rename] - let out = DataFrame::new([rating_series.into_column()].into())? - .lazy() - .select([col("ratings") - .struct_() - .rename_fields(["Film", "State", "Value"].to_vec())]) - .unnest(["ratings"]) - .collect()?; - - println!("{}", &out); + // Contribute the Rust translation of the Python example by opening a PR. // --8<-- [end:series_struct_rename] + // --8<-- [start:struct-rename-check] + // Contribute the Rust translation of the Python example by opening a PR. + // --8<-- [end:struct-rename-check] + // --8<-- [start:struct_duplicates] - let out = ratings - .clone() - .lazy() - // .filter(as_struct(&[col("Movie"), col("Theatre")]).is_duplicated()) - // Error: .is_duplicated() not available if you try that - // https://github.com/pola-rs/polars/issues/3803 - .filter(len().over([col("Movie"), col("Theatre")]).gt(lit(1))) - .collect()?; - println!("{}", &out); + // Contribute the Rust translation of the Python example by opening a PR. 
     // --8<-- [end:struct_duplicates]

     // --8<-- [start:struct_ranking]
-    let out = ratings
+    let result = ratings
         .clone()
         .lazy()
         .with_columns([as_struct(vec![col("Count"), col("Avg_Rating")])
             .rank(
                 RankOptions {
                     method: RankMethod::Dense,
-                    descending: false,
+                    descending: true,
                 },
                 None,
             )
@@ -92,16 +82,16 @@ fn main() -> Result<(), Box<dyn Error>> {
         // https://github.com/pola-rs/polars/issues/3803
         .filter(len().over([col("Movie"), col("Theatre")]).gt(lit(1)))
         .collect()?;
-    println!("{}", &out);
+    println!("{}", result);
     // --8<-- [end:struct_ranking]

     // --8<-- [start:multi_column_apply]
     let df = df!(
-        "keys" => &["a", "a", "b"],
-        "values" => &[10, 7, 1],
+        "keys" => ["a", "a", "b"],
+        "values" => [10, 7, 1],
     )?;

-    let out = df
+    let result = df
         .lazy()
         .select([
             // pack to struct to get access to multiple fields in a custom `apply/map`
@@ -121,7 +111,7 @@ fn main() -> Result<(), Box<dyn Error>> {
                     let ca_b = s_b.i32()?;

                     // iterate both `ChunkedArrays`
-                    let out: Int32Chunked = ca_a
+                    let result: Int32Chunked = ca_a
                         .into_iter()
                         .zip(ca_b)
                         .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
@@ -130,7 +120,7 @@ fn main() -> Result<(), Box<dyn Error>> {
                         })
                         .collect();

-                    Ok(Some(out.into_column()))
+                    Ok(Some(result.into_column()))
                 },
                 GetOutput::from_type(DataType::Int32),
             )
@@ -141,8 +131,16 @@ fn main() -> Result<(), Box<dyn Error>> {
         .alias("solution_expr"),
         ])
         .collect()?;
-    println!("{}", out);
-
+    println!("{}", result);
     // --8<-- [end:multi_column_apply]
+
+    // --8<-- [start:ack]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:ack]
+
+    // --8<-- [start:struct-ack]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:struct-ack]
+
     Ok(())
 }
diff --git a/docs/source/src/rust/user-guide/expressions/window.rs b/docs/source/src/rust/user-guide/expressions/window.rs
index 6414bc984c09..891f99793d29 100644
--- a/docs/source/src/rust/user-guide/expressions/window.rs
+++ b/docs/source/src/rust/user-guide/expressions/window.rs
@@ -19,8 +19,40 @@ fn main() -> Result<(), Box<dyn Error>> {
     println!("{}", df);
     // --8<-- [end:pokemon]

+    // --8<-- [start:rank]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:rank]
+
+    // --8<-- [start:rank-multiple]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:rank-multiple]
+
+    // --8<-- [start:rank-explode]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:rank-explode]
+
+    // --8<-- [start:athletes]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:athletes]
+
+    // --8<-- [start:athletes-sort-over-country]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:athletes-sort-over-country]
+
+    // --8<-- [start:athletes-explode]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:athletes-explode]
+
+    // --8<-- [start:athletes-join]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:athletes-join]
+
+    // --8<-- [start:pokemon-mean]
+    // Contribute the Rust translation of the Python example by opening a PR.
+    // --8<-- [end:pokemon-mean]
+
     // --8<-- [start:group_by]
-    let out = df
+    let result = df
         .clone()
         .lazy()
         .select([
@@ -38,7 +70,7 @@ fn main() -> Result<(), Box<dyn Error>> {
         ])
         .collect()?;

-    println!("{}", out);
+    println!("{}", result);
     // --8<-- [end:group_by]

     // --8<-- [start:operations]
@@ -53,7 +85,7 @@ fn main() -> Result<(), Box<dyn Error>> {
     // --8<-- [end:operations]

     // --8<-- [start:sort]
-    let out = filtered
+    let result = filtered
         .lazy()
         .with_columns([cols(["Name", "Speed"])
             .sort_by(
@@ -62,38 +94,11 @@ fn main() -> Result<(), Box<dyn Error>> {
             )
             .over(["Type 1"])])
         .collect()?;
-    println!("{}", out);
+    println!("{}", result);
     // --8<-- [end:sort]

-    // --8<-- [start:rules]
-    // aggregate and broadcast within a group
-    // output type: -> i32
-    let _ = sum("foo").over([col("groups")]);
-    // sum within a group and multiply with group elements
-    // output type: -> i32
-    let _ = (col("x").sum() * col("y"))
-        .over([col("groups")])
-        .alias("x1");
-    // sum within a group and multiply with group elements
-    // and aggregate the group to a list
-    // output type: -> ChunkedArray
-    let _ = (col("x").sum() * col("y"))
-        .over([col("groups")])
-        .alias("x2");
-    // note that it will require an explicit `list()` call
-    // sum within a group and multiply with group elements
-    // and aggregate the group to a list
-    // the flatten call explodes that list
-
-    // This is the fastest method to do things over groups when the groups are sorted
-    let _ = (col("x").sum() * col("y"))
-        .over([col("groups")])
-        .flatten()
-        .alias("x3");
-    // --8<-- [end:rules]
-
     // --8<-- [start:examples]
-    let out = df
+    let result = df
         .clone()
         .lazy()
         .select([
@@ -124,7 +129,7 @@ fn main() -> Result<(), Box<dyn Error>> {
             .alias("sorted_by_alphabet"),
         ])
         .collect()?;
-    println!("{:?}", out);
+    println!("{:?}", result);
     // --8<-- [end:examples]

     Ok(())
diff --git a/docs/source/user-guide/concepts/data-types-and-structures.md b/docs/source/user-guide/concepts/data-types-and-structures.md
index 2de8120f05a3..896fc84a0ec9 100644
--- a/docs/source/user-guide/concepts/data-types-and-structures.md
+++ b/docs/source/user-guide/concepts/data-types-and-structures.md
@@ -83,7 +83,8 @@ Here, each line of the output corresponds to a single column, making it easier t
 ```

 !!! info
-`glimpse` is only available for Python users.
+
+    `glimpse` is only available for Python users.

 #### Tail
@@ -122,7 +123,6 @@ You can also use `describe` to compute summary statistics for all columns of you

 When talking about data (in a dataframe or otherwise) we can refer to its schema.
 The schema is a mapping of column or series names to the data types of those same columns or series.
-Much like with series, Polars will infer the schema of a dataframe when you create it but you can override the inference system if needed.

 You can check the schema of a dataframe with `schema`:

 {{code_block('user-guide/concepts/data-types-and-structures','schema',[])}}
@@ -131,6 +131,29 @@ You can check the schema of a dataframe with `schema`:
 --8<-- "python/user-guide/concepts/data-types-and-structures.py:schema"
 ```

+Much like with series, Polars will infer the schema of a dataframe when you create it but you can override the inference system if needed.
+
+In Python, you can specify an explicit schema by using a dictionary to map column names to data types.
+You can use the value `None` if you do not wish to override inference for a given column: + +```python +--8<-- "python/user-guide/concepts/data-types-and-structures.py:schema-def" +``` + +```python exec="on" result="text" session="user-guide/data-types-and-structures" +--8<-- "python/user-guide/concepts/data-types-and-structures.py:schema-def" +``` + +If you only need to override the inference of some columns, the parameter `schema_overrides` tends to be more convenient because it lets you omit columns for which you do not want to override the inference: + +```python +--8<-- "python/user-guide/concepts/data-types-and-structures.py:schema_overrides" +``` + +```python exec="on" result="text" session="user-guide/data-types-and-structures" +--8<-- "python/user-guide/concepts/data-types-and-structures.py:schema_overrides" +``` + ## Data types internals Polars utilizes the [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) for its data orientation. @@ -167,8 +190,8 @@ much larger internal representations than 64-bit floats), and thus some error is | `Time` | Represents a time of day. | | `Datetime` | Represents a calendar date and time of day. | | `Duration` | Represents a time duration. | -| `Array` | Arrays with a known, fixed shape per series; akin to numpy arrays. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists.md). | -| `List` | Homogeneous 1D container with variable length. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists.md). | +| `Array` | Arrays with a known, fixed shape per series; akin to numpy arrays. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists-and-arrays.md). | +| `List` | Homogeneous 1D container with variable length. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists-and-arrays.md). | | `Object` | Wraps arbitrary Python objects. | | `Categorical` | Efficient encoding of string data where the categories are inferred at runtime. [Learn more about how categoricals and enums differ and how to work with both](../expressions/categorical-data-and-enums.md). | | `Enum` | Efficient ordered encoding of a set of predetermined string categories. [Learn more about how categoricals and enums differ and how to work with both](../expressions/categorical-data-and-enums.md). | diff --git a/docs/source/user-guide/concepts/expressions-and-contexts.md b/docs/source/user-guide/concepts/expressions-and-contexts.md index 4ec537b71fb9..bee5cd130b45 100644 --- a/docs/source/user-guide/concepts/expressions-and-contexts.md +++ b/docs/source/user-guide/concepts/expressions-and-contexts.md @@ -143,7 +143,7 @@ The last example contained two grouping expressions and three aggregating expres If we look closely, the last aggregating expression mentioned two different columns: “weight” and “height”. Polars expressions support a feature called _expression expansion_. -Expression expansion is like a shorthand notation for when you want to apply the same transform to multiple columns. +Expression expansion is like a shorthand notation for when you want to apply the same transformation to multiple columns. 
As we have seen, the expression

 ```python
diff --git a/docs/source/user-guide/expressions/aggregation.md b/docs/source/user-guide/expressions/aggregation.md
index f4d963606ffb..65ebfc776c00 100644
--- a/docs/source/user-guide/expressions/aggregation.md
+++ b/docs/source/user-guide/expressions/aggregation.md
@@ -1,47 +1,42 @@
 # Aggregation

-Polars implements a powerful syntax defined not only in its lazy API, but also in its eager API. Let's take a look at what that means.
+The Polars [context](../concepts/expressions-and-contexts.md#contexts) `group_by` lets you apply expressions on subsets of columns, as defined by the unique values of the column over which the data is grouped.
+This is a very powerful capability that we explore in this section of the user guide.

-We can start with the simple [US congress `dataset`](https://github.com/unitedstates/congress-legislators).
+We start by reading in a [US congress `dataset`](https://github.com/unitedstates/congress-legislators):

 {{code_block('user-guide/expressions/aggregation','dataframe',['DataFrame','Categorical'])}}

-#### Basic aggregations
-
-You can easily combine different aggregations by adding multiple expressions in a
-`list`. There is no upper bound on the number of aggregations you can do, and you can
-make any combination you want. In the snippet below we do the following aggregations:
-
-Per GROUP `"first_name"` we
+```python exec="on" result="text" session="user-guide/expressions"
+--8<-- "python/user-guide/expressions/aggregation.py:dataframe"
+```

-
+## Basic aggregations

-- count the number of rows in the group:
-  - full form: `pl.len()`
-- combine the values of gender into a list by omitting an aggregate function:
-  - full form: `pl.col("gender")`
-- get the first value of column `"last_name"` in the group:
-  - short form: `pl.first("last_name")` (not available in Rust)
-  - full form: `pl.col("last_name").first()`
+You can easily apply multiple expressions to your aggregated values.
+Simply list all of the expressions you want inside the function `agg`.
+There is no upper bound on the number of aggregations you can do and you can make any combination you want.
+In the snippet below we will group the data based on the column “first_name” and then we will apply the following aggregations:

-
+- count the number of rows in the group (which means we count how many people in the data set have each unique first name);
+- combine the values of the column “gender” into a list by referring to the column but omitting an aggregate function; and
+- get the first value of the column “last_name” within the group.

-Besides the aggregation, we immediately sort the result and limit to the top `5` so that
-we have a nice summary overview.
+After computing the aggregations, we immediately sort the result and limit it to the top five rows so that we have a nice summary overview:

 {{code_block('user-guide/expressions/aggregation','basic',['group_by'])}}

 ```python exec="on" result="text" session="user-guide/expressions"
---8<-- "python/user-guide/expressions/aggregation.py:setup"
---8<-- "python/user-guide/expressions/aggregation.py:dataframe"
 --8<-- "python/user-guide/expressions/aggregation.py:basic"
 ```

-#### Conditionals
+It's that easy!
+Let's turn it up a notch.

-It's that easy! Let's turn it up a notch. Let's say we want to know how
-many delegates of a "state" are "Pro" or "Anti" administration. We could directly query
-that in the aggregation without the need of a `lambda` or grooming the `DataFrame`.
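+For reference, here is a minimal sketch of what such a query can look like when spelled out inline (it assumes the dataframe `dataset` read above; the rendered page uses the included example file):
+
+```python
+import polars as pl
+
+q = (
+    dataset.lazy()
+    .group_by("first_name")
+    .agg(
+        pl.len(),  # number of rows in the group
+        pl.col("gender"),  # no aggregate function: collects the values into a list
+        pl.first("last_name"),  # shorthand for pl.col("last_name").first()
+    )
+    .sort("len", descending=True)
+    .limit(5)
+)
+df = q.collect()
+```
+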
+## Conditionals
+
+Let's say we want to know how many delegates of a state are “Pro” or “Anti” administration.
+We can query that directly in the aggregation without the need for a `lambda` or grooming the dataframe:

 {{code_block('user-guide/expressions/aggregation','conditional',['group_by'])}}

@@ -49,35 +44,57 @@ that in the aggregation without the need of a `lambda` or grooming the `DataFram
 --8<-- "python/user-guide/expressions/aggregation.py:conditional"
 ```

-Similarly, this could also be done with a nested GROUP BY, but that doesn't help show off some of these nice features. 😉
+## Filtering

-{{code_block('user-guide/expressions/aggregation','nested',['group_by'])}}
+We can also filter the groups.
+Let's say we want to compute a mean per group, but we don't want to include all values from that group, and we also don't want to actually filter the rows from the dataframe because we need those rows for another aggregation.
+
+In the example below we show how this can be done.
+
+!!! note
+
+    Note that we can define Python functions for clarity.
+    These functions don't cost us anything because they only return Polars expressions; we don't apply a custom function over a series during the runtime of the query.
+    Of course, you can write functions that return expressions in Rust, too.
+
+{{code_block('user-guide/expressions/aggregation','filter',['group_by'])}}

 ```python exec="on" result="text" session="user-guide/expressions"
---8<-- "python/user-guide/expressions/aggregation.py:nested"
+--8<-- "python/user-guide/expressions/aggregation.py:filter"
 ```

-#### Filtering
+Do the average age values look nonsensical?
+That's because we are working with historical data that dates back to the 1800s and we are doing our computations assuming everyone represented in the dataset is still alive and kicking.

-We can also filter the groups. Let's say we want to compute a mean per group, but we
-don't want to include all values from that group, and we also don't want to filter the
-rows from the `DataFrame` (because we need those rows for another aggregation).
+## Nested grouping

-In the example below we show how this can be done.
+The two previous queries could have been done with a nested `group_by`, but that wouldn't have let us show off some of these features. 😉
+To do a nested `group_by`, simply list the columns that will be used for grouping.

-!!! note
+First, we use a nested `group_by` to figure out how many delegates of a state are “Pro” or “Anti” administration:

-    Note that we can make Python functions for clarity. These functions don't cost us anything. That is because we only create Polars expressions, we don't apply a custom function over a `Series` during runtime of the query. Of course, you can make functions that return expressions in Rust, too.
+{{code_block('user-guide/expressions/aggregation','nested',['group_by'])}}

-{{code_block('user-guide/expressions/aggregation','filter',['group_by'])}}
+```python exec="on" result="text" session="user-guide/expressions"
+--8<-- "python/user-guide/expressions/aggregation.py:nested"
+```
+
+Next, we use a nested `group_by` to compute the average age of delegates per state and per gender:
+
+{{code_block('user-guide/expressions/aggregation','filter-nested',['group_by'])}}

 ```python exec="on" result="text" session="user-guide/expressions"
---8<-- "python/user-guide/expressions/aggregation.py:filter"
+--8<-- "python/user-guide/expressions/aggregation.py:filter-nested"
+```
+
+Note that we get the same results but the format of the data is different.
+Depending on the situation, one format may be more suitable than the other.
+
+## Sorting

-It's common to see a `DataFrame` being sorted for the sole purpose of managing the ordering during a GROUP BY operation. Let's say that we want to get the names of the oldest and youngest politicians per state. We could SORT and GROUP BY.
+It is common to see a dataframe being sorted for the sole purpose of managing the ordering during a grouping operation.
+Let's say that we want to get the names of the oldest and youngest politicians per state.
+We could start by sorting and then grouping:

 {{code_block('user-guide/expressions/aggregation','sort',['group_by'])}}

@@ -85,7 +102,8 @@ It's common to see a `DataFrame` being sorted for the sole purpose of managing t
 --8<-- "python/user-guide/expressions/aggregation.py:sort"
 ```

-However, **if** we also want to sort the names alphabetically, this breaks. Luckily we can sort in a `group_by` context separate from the `DataFrame`.
+However, if we also want to sort the names alphabetically, we need to perform an extra sort operation.
+Luckily, we can sort in a `group_by` context without changing the sorting of the underlying dataframe:

 {{code_block('user-guide/expressions/aggregation','sort2',['group_by'])}}

@@ -93,7 +111,8 @@ However, **if** we also want to sort the names alphabetically, this breaks. Luck
 --8<-- "python/user-guide/expressions/aggregation.py:sort2"
 ```

-We can even sort by another column in the `group_by` context. If we want to know if the alphabetically sorted name is male or female we could add: `pl.col("gender").sort_by(get_person()).first()`
+We can even sort a column with the order induced by another column, and this also works inside the context `group_by`.
+This modification to the previous query lets us check whether the delegate whose name comes first alphabetically is male or female:

 {{code_block('user-guide/expressions/aggregation','sort3',['group_by'])}}

@@ -101,25 +120,17 @@ We can even sort by another column in the `group_by` context. If we want to know
 --8<-- "python/user-guide/expressions/aggregation.py:sort3"
 ```

-### Do not kill parallelization
-
-!!! warning "Python Users Only"
-
-    The following section is specific to Python, and doesn't apply to Rust. Within Rust, blocks and closures (lambdas) can, and will, be executed concurrently.
-
-We have all heard that Python is slow, and does "not scale." Besides the overhead of
-running "slow" bytecode, Python has to remain within the constraints of the Global
-Interpreter Lock (GIL). This means that if you were to use a `lambda` or a custom Python
-function to apply during a parallelized phase, Polars speed is capped running Python
-code preventing any multiple threads from executing the function.
+## Do not kill parallelization

-This all feels terribly limiting, especially because we often need those `lambda` functions in a
-`.group_by()` step, for example. This approach is still supported by Polars, but
-keeping in mind bytecode **and** the GIL costs have to be paid. It is recommended to try to solve your queries using the expression syntax before moving to `lambdas`. If you want to learn more about using `lambdas`, go to the [user defined functions section](./user-defined-functions.md).
+!!! warning "Python users only"

-### Conclusion
+    The following section is specific to Python, and doesn't apply to Rust.
+    Within Rust, blocks and closures (lambdas) can, and will, be executed concurrently.

-In the examples above we've seen that we can do a lot by combining expressions.
By doing so we delay the use of custom Python functions that slow down the queries (by the slow nature of Python AND the GIL).
+Python is generally slower than Rust.
+Besides the overhead of running “slow” bytecode, Python has to remain within the constraints of the Global Interpreter Lock (GIL).
+This means that if you were to use a `lambda` or a custom Python function to apply during a parallelized phase, Polars' speed is capped while running Python code, preventing multiple threads from executing the function.

-If we are missing a type expression let us know by opening a
-[feature request](https://github.com/pola-rs/polars/issues/new/choose)!
+Polars will try to parallelize the computation of the aggregating functions over the groups, so it is recommended that you avoid using `lambda`s and custom Python functions as much as possible.
+Instead, try to stay within the realm of the Polars expression API.
+This is not always possible, though, so if you want to learn more about using `lambda`s you can go to [the user guide section on using user-defined functions](user-defined-python-functions.md).
diff --git a/docs/source/user-guide/expressions/athletes_over_country.svg b/docs/source/user-guide/expressions/athletes_over_country.svg
new file mode 100644
index 000000000000..3ab07cc73ef7
--- /dev/null
+++ b/docs/source/user-guide/expressions/athletes_over_country.svg
@@ -0,0 +1,84 @@
+[SVG markup: diagram of six athletes (A-F) with countries (PT/NL) and ranks (1-6), shown before and after sorting the rank over each country group.]
diff --git a/docs/source/user-guide/expressions/athletes_over_country_explode.svg b/docs/source/user-guide/expressions/athletes_over_country_explode.svg
new file mode 100644
index 000000000000..d49db911465c
--- /dev/null
+++ b/docs/source/user-guide/expressions/athletes_over_country_explode.svg
@@ -0,0 +1,85 @@
+[SVG markup: variant of the diagram above, highlighting the exploded window result for the NL group.]
diff --git a/docs/source/user-guide/expressions/basic-operations.md b/docs/source/user-guide/expressions/basic-operations.md
new file mode 100644
index 000000000000..8cfce9e5392e
--- /dev/null
+++ b/docs/source/user-guide/expressions/basic-operations.md
@@ -0,0 +1,123 @@
+# Basic operations
+
+This section shows how to do basic operations on dataframe columns, such as basic arithmetic calculations, comparisons, and other general-purpose operations.
+We will use the following dataframe for the examples that follow:
+
+{{code_block('user-guide/expressions/operations', 'dataframe', ['DataFrame'])}}
+
+```python exec="on" result="text" session="expressions/operations"
+--8<-- "python/user-guide/expressions/operations.py:dataframe"
+```
+
+## Basic arithmetic
+
+Polars supports basic arithmetic between series of the same length, or between series and literals.
+When literals are mixed with series, the literals are broadcast to match the length of the series they are being used with.
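+As a minimal, self-contained sketch of this broadcasting behaviour (the column name “nrs” mirrors the example dataframe):
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"nrs": [1, 2, 3, None, 5]})
+# the literal 5 is broadcast to every element of "nrs"; null propagates
+print(df.select((pl.col("nrs") + 5).alias("nrs_plus_5")))
+```
+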
+ +{{code_block('user-guide/expressions/operations', 'arithmetic', ['operators'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:arithmetic" +``` + +The example above shows that when an arithmetic operation takes `null` as one of its operands, the result is `null`. + +Polars uses operator overloading to allow you to use your language's native arithmetic operators within your expressions. +If you prefer, in Python you can use the corresponding named functions, as the snippet below demonstrates: + +```python +--8<-- "python/user-guide/expressions/operations.py:operator-overloading" +``` + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:operator-overloading" +``` + +## Comparisons + +Like with arithmetic operations, Polars supports comparisons via the overloaded operators or named functions: + +{{code_block('user-guide/expressions/operations','comparison',['operators'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:comparison" +``` + +## Boolean and bitwise operations + +Depending on the language, you may use the operators `&`, `|`, and `~`, for the Boolean operations “and”, “or”, and “not”, respectively, or the functions of the same name: + +{{code_block('user-guide/expressions/operations', 'boolean', ['operators'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:boolean" +``` + +??? info "Python trivia" + + The Python functions are called `and_`, `or_`, and `not_`, because the words `and`, `or`, and `not` are reserved keywords in Python. + Similarly, we cannot use the keywords `and`, `or`, and `not`, as the Boolean operators because these Python keywords will interpret their operands in the context of Truthy and Falsy through the dunder method `__bool__`. + Thus, we overload the bitwise operators `&`, `|`, and `~`, as the Boolean operators because they are the second best choice. + +These operators/functions can also be used for the respective bitwise operations, alongside the bitwise operator `^` / function `xor`: + +{{code_block('user-guide/expressions/operations', 'bitwise', [])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:bitwise" +``` + +## Counting (unique) values + +Polars has two functions to count the number of unique values in a series. +The function `n_unique` can be used to count the exact number of unique values in a series. +However, for very large data sets, this operation can be quite slow. +In those cases, if an approximation is good enough, you can use the function `approx_n_unique` that uses the algorithm [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) to estimate the result. 
+The example below shows a series where the `approx_n_unique` estimate is off by 0.9%:
+
+{{code_block('user-guide/expressions/operations', 'count', ['n_unique', 'approx_n_unique'])}}
+
+```python exec="on" result="text" session="expressions/operations"
+--8<-- "python/user-guide/expressions/operations.py:count"
+```
+
+You can get more information about the unique values and their counts with the function `value_counts`, which Polars also provides:
+
+{{code_block('user-guide/expressions/operations', 'value_counts', ['value_counts'])}}
+
+```python exec="on" result="text" session="expressions/operations"
+--8<-- "python/user-guide/expressions/operations.py:value_counts"
+```
+
+The function `value_counts` returns the results in [structs, a data type that we will explore in a later section](structs.md).
+
+Alternatively, if you only need a series with the unique values or a series with the unique counts, they are one function away:
+
+{{code_block('user-guide/expressions/operations', 'unique_counts', ['unique', 'unique_counts'])}}
+
+```python exec="on" result="text" session="expressions/operations"
+--8<-- "python/user-guide/expressions/operations.py:unique_counts"
+```
+
+Note that we need to specify `maintain_order=True` in the function `unique` so that the order of the results is consistent with the order of `unique_counts`.
+See the API reference for more information.
+
+## Conditionals
+
+Polars supports something akin to a ternary operator through the function `when`, which is followed by one function `then` and an optional function `otherwise`.
+
+The function `when` accepts a predicate expression.
+The values that evaluate to `True` are replaced by the corresponding values of the expression inside the function `then`.
+The values that evaluate to `False` are replaced by the corresponding values of the expression inside the function `otherwise` or `null`, if `otherwise` is not provided.
+
+The example below applies one step of the [Collatz conjecture](https://en.wikipedia.org/wiki/Collatz_conjecture) to the numbers in the column “nrs”:
+
+{{code_block('user-guide/expressions/operations', 'collatz', ['when'])}}
+
+```python exec="on" result="text" session="expressions/operations"
+--8<-- "python/user-guide/expressions/operations.py:collatz"
+```
+
+You can also emulate a chain of conditionals, akin to Python's `elif` statement, by chaining an arbitrary number of consecutive blocks of `.when(...).then(...)`.
+In those cases, and for each given value, Polars will only consider a replacement expression that is deeper within the chain if the previous predicates all failed for that value.
diff --git a/docs/source/user-guide/expressions/casting.md b/docs/source/user-guide/expressions/casting.md
index f0c625d19f28..daa11a94a31e 100644
--- a/docs/source/user-guide/expressions/casting.md
+++ b/docs/source/user-guide/expressions/casting.md
@@ -1,101 +1,116 @@
 # Casting

-Casting converts the [underlying `DataType` of a column](../concepts/data-types-and-structures.md) to a new one.
-Casting is available with the `cast()` method.
+Casting converts the [underlying data type of a column](../concepts/data-types-and-structures.md) to a new one.
+Casting is available through the function `cast`.

-The `cast` method includes a `strict` parameter that determines how Polars behaves when it encounters a value that can't be converted from the source `DataType` to the target `DataType`.
By default, `strict=True`, which means that Polars will throw an error to notify the user of the failed conversion and provide details on the values that couldn't be cast. On the other hand, if `strict=False`, any values that can't be converted to the target `DataType` will be quietly converted to `null`.
+The function `cast` includes a parameter `strict` that determines how Polars behaves when it encounters a value that cannot be converted from the source data type to the target data type.
+The default behaviour is `strict=True`, which means that Polars will throw an error to notify the user of the failed conversion while also providing details on the values that couldn't be cast.
+On the other hand, if `strict=False`, any values that cannot be converted to the target data type will be quietly converted to `null`.

-## Numerics
+## Basic example

-Let's take a look at the following `DataFrame` which contains both integers and floating point numbers.
+Let's take a look at the following dataframe which contains both integers and floating point numbers:

-{{code_block('user-guide/expressions/casting','dfnum',['DataFrame'])}}
+{{code_block('user-guide/expressions/casting', 'dfnum', [])}}

-```python exec="on" result="text" session="user-guide/cast"
---8<-- "python/user-guide/expressions/casting.py:setup"
+```python exec="on" result="text" session="user-guide/casting"
 --8<-- "python/user-guide/expressions/casting.py:dfnum"
 ```

-To perform casting operations between floats and integers, or vice versa, we can invoke the `cast()` function.
+To perform casting operations between floats and integers, or vice versa, we use the function `cast`:

 {{code_block('user-guide/expressions/casting','castnum',['cast'])}}

-```python exec="on" result="text" session="user-guide/cast"
+```python exec="on" result="text" session="user-guide/casting"
 --8<-- "python/user-guide/expressions/casting.py:castnum"
 ```

-Note that in the case of decimal values these are rounded downwards when casting to an integer.
+Note that floating point numbers are truncated when casting to an integer data type.

-##### Downcast
+## Downcasting numerical data types

-Reducing the memory footprint is also achievable by modifying the number of bits allocated to an element. As an illustration, the code below demonstrates how casting from `Int64` to `Int16` and from `Float64` to `Float32` can be used to lower memory usage.
+You can reduce the memory footprint of a column by changing the precision associated with its numeric data type.
+As an illustration, the code below demonstrates how casting from `Int64` to `Int16` and from `Float64` to `Float32` can be used to lower memory usage:

-{{code_block('user-guide/expressions/casting','downcast',['cast'])}}
+{{code_block('user-guide/expressions/casting','downcast',['cast', 'estimated_size'])}}

-```python exec="on" result="text" session="user-guide/cast"
+```python exec="on" result="text" session="user-guide/casting"
 --8<-- "python/user-guide/expressions/casting.py:downcast"
 ```

-#### Overflow
-
-When performing downcasting, it is crucial to ensure that the chosen number of bits (such as 64, 32, or 16) is sufficient to accommodate the largest and smallest numbers in the column. For example, using a 32-bit signed integer (`Int32`) allows handling integers within the range of -2147483648 to +2147483647, while using `Int8` covers integers between -128 to 127. Attempting to cast to a `DataType` that is too small will result in a `ComputeError` thrown by Polars, as the operation is not supported.
+When performing downcasting it is crucial to ensure that the chosen number of bits (such as 64, 32, or 16) is sufficient to accommodate the largest and smallest numbers in the column. +For example, a 32-bit signed integer (`Int32`) represents integers between -2147483648 and 2147483647, inclusive, while an 8-bit signed integer only represents integers between -128 and 127, inclusive. +Attempting to downcast to a data type with insufficient precision results in an error thrown by Polars: {{code_block('user-guide/expressions/casting','overflow',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:overflow" ``` -You can set the `strict` parameter to `False`, this converts values that are overflowing to null values. +If you set the parameter `strict` to `False` the overflowing/underflowing values are converted to `null`: {{code_block('user-guide/expressions/casting','overflow2',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:overflow2" ``` -## Strings +## Converting strings to numeric data types -Strings can be casted to numerical data types and vice versa: +Strings that represent numbers can be converted to the appropriate data types via casting. +The opposite conversion is also possible: {{code_block('user-guide/expressions/casting','strings',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:strings" ``` -In case the column contains a non-numerical value, Polars will throw a `ComputeError` detailing the conversion error. Setting `strict=False` will convert the non float value to `null`. +In case the column contains a non-numerical value, or a poorly formatted one, Polars will throw an error with details on the conversion error. +You can set `strict=False` to circumvent the error and get a `null` value instead. {{code_block('user-guide/expressions/casting','strings2',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:strings2" ``` ## Booleans -Booleans can be expressed as either 1 (`True`) or 0 (`False`). It's possible to perform casting operations between a numerical `DataType` and a boolean, and vice versa. However, keep in mind that casting from a string (`String`) to a boolean is not permitted. +Booleans can be expressed as either 1 (`True`) or 0 (`False`). +It's possible to perform casting operations between a numerical data type and a Boolean, and vice versa. + +When converting numbers to Booleans, the number 0 is converted to `False` and all other numbers are converted to `True`, in alignment with Python's Truthy and Falsy values for numbers: {{code_block('user-guide/expressions/casting','bool',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:bool" ``` -## Dates +## Parsing / formatting temporal data types -Temporal data types such as `Date` or `Datetime` are represented as the number of days (`Date`) and microseconds (`Datetime`) since epoch. 
Therefore, casting between the numerical types and the temporal data types is allowed. +All temporal data types are represented internally as the number of time units elapsed since a reference moment, usually referred to as the epoch. +For example, values of the data type `Date` are stored as the number of days since the epoch. +For the data type `Datetime` the time unit is the microsecond (us) and for `Time` the time unit is the nanosecond (ns). + +Casting between numerical types and temporal data types is allowed and exposes this relationship: {{code_block('user-guide/expressions/casting','dates',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:dates" ``` -To convert between strings and `Dates`/`Datetimes`, `dt.to_string` and `str.to_datetime` are utilized. Polars adopts the [chrono format syntax](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) for formatting. It's worth noting that `str.to_datetime` features additional options that support timezone functionality. Refer to the API documentation for further information. +To format temporal data types as strings we can use the function `dt.to_string` and to parse temporal data types from strings we can use the function `str.to_datetime`. +Both functions adopt the [chrono format syntax](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) for formatting. {{code_block('user-guide/expressions/casting','dates2',['dt.to_string','str.to_date'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:dates2" ``` + +It's worth noting that `str.to_datetime` features additional options that support timezone functionality. +Refer to the API documentation for further information. diff --git a/docs/source/user-guide/expressions/categorical-data-and-enums.md b/docs/source/user-guide/expressions/categorical-data-and-enums.md index 9d1e5eee8905..4c9ce9734d3a 100644 --- a/docs/source/user-guide/expressions/categorical-data-and-enums.md +++ b/docs/source/user-guide/expressions/categorical-data-and-enums.md @@ -1,11 +1,176 @@ # Categorical data and enums -Categorical data represents string data where the values in the column have a finite set of values (usually way smaller than the length of the column). You can think about columns on gender, countries, currency pairings, etc. Storing these values as plain strings is a waste of memory and performance as we will be repeating the same string over and over again. Additionally, in the case of joins we are stuck with expensive string comparisons. +A column that holds string values that can only take on one of a limited number of possible values is a column that holds [categorical data](https://en.wikipedia.org/wiki/Categorical_variable). +Usually, the number of possible values is much smaller than the length of the column. +Some typical examples include your nationality, the operating system of your computer, or the license that your favorite open source project uses. -That is why Polars supports encoding string values in dictionary format. Working with categorical data in Polars can be done with two different DataTypes: `Enum`,`Categorical`. Both have their own use cases which we will explain further on this page. -First we will look at what a categorical is in Polars. 
+When working with categorical data you can use Polars' dedicated types, `Categorical` and `Enum`, to make your queries more performant.
+Now, we will see what the differences between the two data types `Categorical` and `Enum` are, and when you should use one or the other.
+We also include some notes on [why the data types `Categorical` and `Enum` are more efficient than using the plain string values](#performance-considerations-on-categorical-data-types) at the end of this user guide section.

-In Polars a categorical is defined as a string column which is encoded by a dictionary. A string column would be split into two elements: encodings and the actual string values.
+## `Enum` vs `Categorical`
+
+In short, you should prefer `Enum` over `Categorical` whenever possible.
+When the categories are fixed and known up front, use `Enum`.
+When you don't know the categories or they are not fixed, then you must use `Categorical`.
+In case your requirements change along the way you can always cast from one to the other.
+
+## Data type `Enum`
+
+### Creating an `Enum`
+
+The data type `Enum` is an ordered categorical data type.
+To use the data type `Enum` you have to specify the categories in advance to create a new data type that is a variant of an `Enum`.
+Then, when creating a new series, a new dataframe, or when casting a string column, you can use that `Enum` variant.
+
+{{code_block('user-guide/expressions/categoricals', 'enum-example', ['Enum'])}}
+
+```python exec="on" result="text" session="expressions/categoricals"
+--8<-- "python/user-guide/expressions/categoricals.py:enum-example"
+```
+
+### Invalid values
+
+Polars will raise an error if you try to specify a data type `Enum` whose categories do not include all the values present:
+
+{{code_block('user-guide/expressions/categoricals', 'enum-wrong-value', ['Enum'])}}
+
+```python exec="on" result="text" session="expressions/categoricals"
+--8<-- "python/user-guide/expressions/categoricals.py:enum-wrong-value"
+```
+
+If you are in a position where you cannot know all of the possible values in advance and erroring on unknown values is semantically wrong, you may need to [use the data type `Categorical`](#data-type-categorical).
+
+### Category ordering and comparison
+
+The data type `Enum` is ordered and the order is induced by the order in which you specify the categories.
+The example below uses log levels as an example of where an ordered `Enum` is useful:
+
+{{code_block('user-guide/expressions/categoricals', 'log-levels', ['Enum'])}}
+
+```python exec="on" result="text" session="expressions/categoricals"
+--8<-- "python/user-guide/expressions/categoricals.py:log-levels"
+```
+
+This example shows that we can compare `Enum` values with a string, but this only works if the string matches one of the `Enum` values.
+If we compared the column “level” with any string other than `"debug"`, `"info"`, `"warning"`, or `"error"`, Polars would raise an exception.
+
+Columns with the data type `Enum` can also be compared with other columns that have the same data type `Enum` or columns that hold strings, but only if all the strings are valid `Enum` values.
+
+## Data type `Categorical`
+
+The data type `Categorical` can be seen as a more flexible version of `Enum`.
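+As a minimal sketch of this flexibility (the series and categories here are illustrative), an existing `Categorical` column can later be cast to an `Enum` once its categories are known:
+
+```python
+import polars as pl
+
+cats = pl.Series("bears", ["Polar", "Panda", "Brown", "Brown"], dtype=pl.Categorical)
+# once the set of categories is fixed, casting to an Enum gives you
+# stable encodings and validation of the values
+bears = cats.cast(pl.Enum(["Polar", "Panda", "Brown"]))
+print(bears.dtype)
+```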
+ +### Creating a `Categorical` series + +To use the data type `Categorical`, you can cast a column of strings or specify `Categorical` as the data type of a series or dataframe column: + +{{code_block('user-guide/expressions/categoricals', 'categorical-example', ['Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:categorical-example" +``` + +Having Polars infer the categories for you may sound strictly better than listing the categories beforehand, but this inference comes with a performance cost. +That is why, whenever possible, you should use `Enum`. +You can learn more by [reading the subsection about the data type `Categorical` and its encodings](#data-type-categorical-and-encodings). + +### Lexical comparison with strings + +When comparing a `Categorical` column with a string, Polars will perform a lexical comparison: + +{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-string', ['Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:categorical-comparison-string" +``` + +You can also compare a column of strings with your `Categorical` column, and the comparison will also be lexical: + +{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-string-column', ['Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:categorical-comparison-string-column" +``` + +Although it is possible to compare a string column with a categorical column, it is typically more efficient to compare two categorical columns. +We will see how to do that next. + +### Comparing `Categorical` columns and the string cache + +You are told that comparing columns with the data type `Categorical` is more efficient than if one of them is a string column. +So, you change your code so that the second column is also a categorical column and then you perform your comparison... +But Polars raises an exception: + +{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-categorical-column', ['Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:categorical-comparison-categorical-column" +``` + +By default, the values in columns with the data type `Categorical` are [encoded in the order they are seen in the column](#encodings), and independently from other columns, which means that Polars cannot compare efficiently two categorical columns that were created independently. + +Enabling the Polars string cache and creating the columns with the cache enabled fixes this issue: + +{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-equality', ['StringCache', 'Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:stringcache-categorical-equality" +``` + +Note that using [the string cache comes at a performance cost](#using-the-global-string-cache). + +### Combining `Categorical` columns + +The string cache is also useful in any operation that combines or mixes two columns with the data type `Categorical` in any way. 
+An example of this is when [concatenating two dataframes vertically](../getting-started.md#concatenating-dataframes):
+
+{{code_block('user-guide/expressions/categoricals', 'concatenating-categoricals', ['StringCache', 'Categorical'])}}
+
+```python exec="on" result="text" session="expressions/categoricals"
+--8<-- "python/user-guide/expressions/categoricals.py:concatenating-categoricals"
+```
+
+In this case, Polars issues a warning complaining about an expensive reencoding that implies a performance hit.
+Polars then suggests using the data type `Enum` if possible, or using the string cache.
+To understand the issue with this operation and why Polars issues this warning, please read the final section about [the performance considerations of using categorical data types](#performance-considerations-on-categorical-data-types).
+
+### Comparison between `Categorical` columns is not lexical
+
+When comparing two columns with data type `Categorical`, Polars does not perform lexical comparison between the values by default.
+If you want lexical ordering, you need to specify so when creating the column:
+
+{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-comparison-lexical', ['StringCache', 'Categorical'])}}
+
+```python exec="on" result="text" session="expressions/categoricals"
+--8<-- "python/user-guide/expressions/categoricals.py:stringcache-categorical-comparison-lexical"
+```
+
+Otherwise, the order is inferred together with the values:
+
+{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-comparison-physical', ['StringCache', 'Categorical'])}}
+
+```python exec="on" result="text" session="expressions/categoricals"
+--8<-- "python/user-guide/expressions/categoricals.py:stringcache-categorical-comparison-physical"
+```
+
+## Performance considerations on categorical data types
+
+This part of the user guide explains
+
+- why categorical data types are more performant than plain string values; and
+- why Polars needs a string cache when doing some operations with the data type `Categorical`.
+
+### Encodings
+
+Categorical data represents string data where the values in the column have a finite set of values (usually way smaller than the length of the column).
+Storing these values as plain strings is a waste of memory and performance as we will be repeating the same string over and over again.
+Additionally, in operations like joins we have to perform expensive string comparisons.
+
+Categorical data types like `Enum` and `Categorical` let you encode the string values in a cheaper way, establishing a relationship between a cheap encoding value and the original string literal.
+
+As an example of a sensible encoding, Polars could choose to represent the finite set of categories as non-negative integers.
+With that in mind, the diagram below shows a regular string column and a possible representation of a Polars column with the categorical data type:

@@ -18,25 +18,25 @@ In Polars a categorical is defined as a string column which is encoded by a dict
       String Column
       Categorical Column
-      Polar Bear
+      Polar
-      Panda Bear
+      Panda
-      Brown Bear
+      Brown
-      Panda Bear
+      Panda
-      Brown Bear
+      Brown
-      Brown Bear
+      Brown
-      Polar Bear
+      Polar
@@ -87,13 +252,13 @@ In Polars a categorical is defined as a string column which is encoded by a dict - Polar Bear + Polar - Panda Bear + Panda - Brown Bear + Brown @@ -104,25 +269,26 @@ In Polars a categorical is defined as a string column which is encoded by a dict -The physical `0` in this case encodes (or maps) to the value 'Polar Bear', the value `1` encodes to 'Panda Bear' and the value `2` to 'Brown Bear'. This encoding has the benefit of only storing the string values once. Additionally, when we perform operations (e.g. sorting, counting) we can work directly on the physical representation which is much faster than the working with string data. - -## `Enum` vs `Categorical` +The physical `0` in this case encodes (or maps) to the value 'Polar', the value `1` encodes to 'Panda', and the value `2` to 'Brown'. +This encoding has the benefit of only storing the string values once. +Additionally, when we perform operations (e.g. sorting, counting) we can work directly on the physical representation which is much faster than the working with string data. -Polars supports two different DataTypes for working with categorical data: `Enum` and `Categorical`. When the categories are known up front use `Enum`. When you don't know the categories or they are not fixed then you use `Categorical`. In case your requirements change along the way you can always cast from one to the other. +### Encodings for the data type `Enum` are global -{{code_block('user-guide/concepts/data-types/categoricals','example',[])}} +When working with the data type `Enum` we specify the categories in advance. +This way, Polars can ensure different columns and even different datasets have the same encoding and there is no need for expensive re-encoding or cache lookups. -From the code block above you can see that the `Enum` data type requires the upfront while the categorical data type infers the categories. +### Data type `Categorical` and encodings -### `Categorical` data type +The fact that the categories for the data type `Categorical` are inferred come at a cost. +The main cost here is that we have no control over our encodings. -The `Categorical` data type is a flexible one. Polars will add categories on the fly if it sees them. This sounds like a strictly better version compared to the `Enum` data type as we can simply infer the categories, however inferring comes at a cost. The main cost here is we have no control over our encodings. - -Consider the following scenario where we append the following two categorical `Series` +Consider the following scenario where we append the following two categorical series: {{code_block('user-guide/concepts/data-types/categoricals','append',[])}} -Polars encodes the string values in order as they appear. So the series would look like this: +Polars encodes the string values in the order they appear. +So, the series would look like this: @@ -238,94 +404,102 @@ Polars encodes the string values in order as they appear. So the series would lo
       cat_series
       cat2_series
-Combining the `Series` becomes a non-trivial task which is expensive as the physical value of `0` represents something different in both `Series`. Polars does support these types of operations for convenience, however in general these should be avoided due to its slower performance as it requires making both encodings compatible first before doing any merge operations.
-
-#### Using the global string cache
-
-One way to handle this problem is to enable a `StringCache`. When you enable the `StringCache` strings are no longer encoded in the order they appear on a per-column basis. Instead, the string cache ensures a single encoding for each string. The string `Polar` will always map the same physical for all categorical columns made under the string cache.
-Merge operations (e.g. appends, joins) are cheap as there is no need to make the encodings compatible first, solving the problem we had above.
-
-{{code_block('user-guide/concepts/data-types/categoricals','global_append',[])}}
-
-However, the string cache does come at a small performance hit during construction of the `Series` as we need to look up / insert the string value in the cache. Therefore, it is preferred to use the `Enum` Data Type if you know your categories in advance.
-
-### `Enum data type`
-
-In the `Enum` data type we specify the categories in advance. This way we ensure categoricals from different columns or different datasets have the same encoding and there is no need for expensive re-encoding or cache lookups.
-
-{{code_block('user-guide/concepts/data-types/categoricals','enum_append',[])}}
-
-
-
-Polars will raise an `OutOfBounds` error when a value is encountered which is not specified in the `Enum`.
-
-{{code_block('user-guide/concepts/data-types/categoricals','enum_error',[])}}
-
-```python exec="on" result="text" session="user-guide/datatypes/categoricals"
---8<-- "python/user-guide/concepts/data-types/categoricals.py:setup"
---8<-- "python/user-guide/concepts/data-types/categoricals.py:enum_error"
-```
-
-## Comparisons
-
-
-
-The following types of comparisons operators are allowed for categorical data:
-
-- Categorical vs Categorical
-- Categorical vs String
-
-### `Categorical` Type
+Combining the series becomes a non-trivial task which is expensive as the physical value of `0` represents something different in both series.
+Polars does support these types of operations for convenience, however these should be avoided due to their slower performance, as they require making both encodings compatible first before doing any merge operations.

-For the `Categorical` type comparisons are valid if they have the same global cache set or if they have the same underlying categories in the same order.
+### Using the global string cache

-{{code_block('user-guide/concepts/data-types/categoricals','global_equality',[])}}
+One way to handle this reencoding problem is to enable the string cache.
+Under the string cache, the diagram would instead look like this:
-
-```python exec="on" result="text" session="user-guide/datatypes/categoricals"
---8<-- "python/user-guide/concepts/data-types/categoricals.py:setup"
---8<-- "python/user-guide/concepts/data-types/categoricals.py:global_equality"
-```
-
-For `Categorical` vs `String` comparisons Polars uses lexical ordering to determine the result:
-
-{{code_block('user-guide/concepts/data-types/categoricals','str_compare_single',[])}}
-
-```python exec="on" result="text" session="user-guide/datatypes/categoricals"
---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_compare_single"
-```
-
-{{code_block('user-guide/concepts/data-types/categoricals','str_compare',[])}}
-
-```python exec="on" result="text" session="user-guide/datatypes/categoricals"
---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_compare"
-```
-
-### `Enum` Type
-
-For `Enum` type comparisons are valid if they have the same categories.
-
-{{code_block('user-guide/concepts/data-types/categoricals','equality',[])}}
-
-```python exec="on" result="text" session="user-guide/datatypes/categoricals"
---8<-- "python/user-guide/concepts/data-types/categoricals.py:equality"
-```
-
-For `Enum` vs `String` comparisons the order within the categories is used instead of lexical ordering. In order for a comparison to be valid all values in the `String` column should be present in the `Enum` categories list.
-
-{{code_block('user-guide/concepts/data-types/categoricals','str_enum_compare_error',[])}}
-
-```python exec="on" result="text" session="user-guide/datatypes/categoricals"
---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_enum_compare_error"
-```
-
-{{code_block('user-guide/concepts/data-types/categoricals','str_enum_compare_single',[])}}
-
-```python exec="on" result="text" session="user-guide/datatypes/categoricals"
---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_enum_compare_single"
-```
+
+<table>
+<tr><th>Series</th><th>String cache</th></tr>
+<tr>
+<td>
+<table>
+<tr><th>cat_series</th><th>cat2_series</th></tr>
+<tr>
+<td>
+<table>
+<tr><th>Physical</th></tr>
+<tr><td>0</td></tr>
+<tr><td>1</td></tr>
+<tr><td>2</td></tr>
+<tr><td>2</td></tr>
+<tr><td>0</td></tr>
+</table>
+</td>
+<td>
+<table>
+<tr><th>Physical</th></tr>
+<tr><td>1</td></tr>
+<tr><td>2</td></tr>
+<tr><td>2</td></tr>
+<tr><td>0</td></tr>
+<tr><td>0</td></tr>
+</table>
+</td>
+</tr>
+</table>
+</td>
+<td>
+<table>
+<tr><th>Categories</th></tr>
+<tr><td>Polar</td></tr>
+<tr><td>Panda</td></tr>
+<tr><td>Brown</td></tr>
+</table>
+</td>
+</tr>
+</table>
-{{code_block('user-guide/concepts/data-types/categoricals','str_enum_compare',[])}} +When you enable the string cache, strings are no longer encoded in the order they appear on a per-column basis. +Instead, the encoding is shared across columns. +The value 'Polar' will always be encoded by the same value for all categorical columns created under the string cache. +Merge operations (e.g. appends, joins) become cheap again as there is no need to make the encodings compatible first, solving the problem we had above. -```python exec="on" result="text" session="user-guide/datatypes/categoricals" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_enum_compare" -``` +However, the string cache does come at a small performance hit during construction of the series as we need to look up or insert the string values in the cache. +Therefore, it is preferred to use the data type `Enum` if you know your categories in advance. diff --git a/docs/source/user-guide/expressions/column-selections.md b/docs/source/user-guide/expressions/column-selections.md deleted file mode 100644 index 92a87dc2b760..000000000000 --- a/docs/source/user-guide/expressions/column-selections.md +++ /dev/null @@ -1,134 +0,0 @@ -# Column selections - -Let's create a dataset to use in this section: - -{{code_block('user-guide/expressions/column-selections','selectors_df',['DataFrame'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:setup" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_df" -``` - -## Expression expansion - -As we've seen in the previous section, we can select specific columns using the `pl.col` method. It can also select multiple columns - both as a means of convenience, and to _expand_ the expression. - -This kind of convenience feature isn't just decorative or syntactic sugar. It allows for a very powerful application of [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: a single expression that specifies multiple columns expands into a list of expressions (depending on the DataFrame schema), resulting in being able to select multiple columns + run computation on them! - -### Select all, or all but some - -We can select all columns in the `DataFrame` object by providing the argument `*`: - -{{code_block('user-guide/expressions/column-selections', 'all',['all'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:all" -``` - -Often, we don't just want to include all columns, but include all _while_ excluding a few. 
This can be done easily as well: - -{{code_block('user-guide/expressions/column-selections','exclude',['exclude'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:exclude" -``` - -### By multiple strings - -Specifying multiple strings allows expressions to _expand_ to all matching columns: - -{{code_block('user-guide/expressions/column-selections','expansion_by_names',['dt.to_string'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:expansion_by_names" -``` - -### By regular expressions - -Multiple column selection is possible by regular expressions also, by making sure to wrap the regex by `^` and `$` to let `pl.col` know that a regex selection is expected: - -{{code_block('user-guide/expressions/column-selections','expansion_by_regex',[])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:expansion_by_regex" -``` - -### By data type - -`pl.col` can select multiple columns using Polars data types: - -{{code_block('user-guide/expressions/column-selections','expansion_by_dtype',['n_unique'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:expansion_by_dtype" -``` - -## Using `selectors` - -Polars also allows for the use of intuitive selections for columns based on their name, `dtype` or other properties; and this is built on top of existing functionality outlined in `col` used above. It is recommended to use them by importing and aliasing `polars.selectors` as `cs`. - -### By `dtype` - -To select just the integer and string columns, we can do: - -{{code_block('user-guide/expressions/column-selections','selectors_intro',['selectors'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_intro" -``` - -### Applying set operations - -These _selectors_ also allow for set based selection operations. For instance, to select the **numeric** columns **except** the **first** column that indicates row numbers: - -{{code_block('user-guide/expressions/column-selections','selectors_diff',['cs.first', 'cs.numeric'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_diff" -``` - -We can also select the row number by name **and** any **non**-numeric columns: - -{{code_block('user-guide/expressions/column-selections','selectors_union',['cs.by_name', 'cs.numeric'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_union" -``` - -### By patterns and substrings - -_Selectors_ can also be matched by substring and regex patterns: - -{{code_block('user-guide/expressions/column-selections','selectors_by_name',['cs.contains', 'cs.matches'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_by_name" -``` - -### Converting to expressions - -What if we want to apply a specific operation on the selected columns (i.e. get back to representing them as **expressions** to operate upon)? 
We can simply convert them using `as_expr` and then proceed as normal: - -{{code_block('user-guide/expressions/column-selections','selectors_to_expr',['cs.temporal'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_to_expr" -``` - -### Debugging `selectors` - -Polars also provides two helpful utility functions to aid with using selectors: `is_selector` and `expand_selector`: - -{{code_block('user-guide/expressions/column-selections','selectors_is_selector_utility',['is_selector'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_is_selector_utility" -``` - -To predetermine the column names that are selected, which is especially useful for a LazyFrame object: - -{{code_block('user-guide/expressions/column-selections','selectors_colnames_utility',['expand_selector'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_colnames_utility" -``` diff --git a/docs/source/user-guide/expressions/expression-expansion.md b/docs/source/user-guide/expressions/expression-expansion.md new file mode 100644 index 000000000000..c40c1ddeeafd --- /dev/null +++ b/docs/source/user-guide/expressions/expression-expansion.md @@ -0,0 +1,363 @@ +# Expression expansion + +As you've seen in [the section about expressions and contexts](../concepts/expressions-and-contexts.md), expression expansion is a feature that enables you to write a single expression that can expand to multiple different expressions, possibly depending on the schema of the context in which the expression is used. + +This feature isn't just decorative or syntactic sugar. +It allows for a very powerful application of [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: +a single expression that specifies multiple columns expands into a list of expressions, which means you can write one single expression and reuse the computation that it represents. + +In this section we will show several forms of expression expansion and we will be using the dataframe that you can see below for that effect: + +{{code_block('user-guide/expressions/expression-expansion', 'df', [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:df" +``` + +## Function `col` + +The function `col` is the most common way of making use of expression expansion features in Polars. +Typically used to refer to one column of a dataframe, in this section we explore other ways in which you can use `col` (or its variants, when in Rust). + +### Explicit expansion by column name + +The simplest form of expression expansion happens when you provide multiple column names to the function `col`. + +The example below uses a single function `col` with multiple column names to convert the values in USD to EUR: + +{{code_block('user-guide/expressions/expression-expansion', 'col-with-names', ['col'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:col-with-names" +``` + +When you list the column names you want the expression to expand to, you can predict what the expression will expand to. 
+In this case, the expression that does the currency conversion is expanded to a list of five expressions:
+
+{{code_block('user-guide/expressions/expression-expansion', 'expression-list', ['col'])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:expression-list"
+```
+
+### Expansion by data type
+
+We had to type five column names in the previous example but the function `col` can also conveniently accept one or more data types.
+If you provide data types instead of column names, the expression is expanded to all columns that match one of the data types provided.
+
+The example below performs the exact same computation as before:
+
+{{code_block('user-guide/expressions/expression-expansion', 'col-with-dtype', [], ['col'], ['dtype_col'])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:col-with-dtype"
+```
+
+When we use a data type with expression expansion, we cannot know beforehand how many columns a single expression will expand to.
+We need the schema of the input dataframe to determine the final list of expressions to be applied.
+
+If we weren't sure whether the price columns were of the type `Float64` or `Float32`, we could specify both data types:
+
+{{code_block('user-guide/expressions/expression-expansion', 'col-with-dtypes', [], ['col'], ['dtype_cols'])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:col-with-dtypes"
+```
+
+### Expansion by pattern matching
+
+You can also use regular expressions to specify patterns that are used to match the column names.
+To distinguish between a regular column name and expansion by pattern matching, regular expressions start and end with `^` and `$`, respectively.
+This also means that the pattern must match against the whole column name string.
+
+Regular expressions can be mixed with regular column names:
+
+{{code_block('user-guide/expressions/expression-expansion', 'col-with-regex', ['col'])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:col-with-regex"
+```
+
+### Arguments cannot be of mixed types
+
+In Python, the function `col` accepts an arbitrary number of strings (as [column names](#explicit-expansion-by-column-name) or as [regular expressions](#expansion-by-pattern-matching)) or an arbitrary number of data types, but you cannot mix both in the same function call:
+
+```python
+--8<-- "python/user-guide/expressions/expression-expansion.py:col-error"
+```
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:col-error"
+```
+
+## Selecting all columns
+
+Polars provides the function `all` as shorthand notation to refer to all columns of a dataframe:
+
+{{code_block('user-guide/expressions/expression-expansion', 'all', ['all'])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:all"
+```
+
+!!! note
+
+    The function `all` is syntactic sugar for `col("*")`, but since the argument `"*"` is a special case and `all` reads more like English, the usage of `all` is preferred.
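+
+For instance, here is a minimal sketch of this equivalence (the tiny dataframe is hypothetical, not part of the running example):
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
+# `pl.all()` expands to the same list of expressions as `pl.col("*")`.
+print(df.select(pl.all()).equals(df.select(pl.col("*"))))  # True
+```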
+ +## Excluding columns + +Polars also provides a mechanism to exclude certain columns from expression expansion. +For that, you use the function `exclude`, which accepts exactly the same types of arguments as `col`: + +{{code_block('user-guide/expressions/expression-expansion', 'all-exclude', ['exclude'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:all-exclude" +``` + +Naturally, the function `exclude` can also be used after the function `col`: + +{{code_block('user-guide/expressions/expression-expansion', 'col-exclude', ['exclude'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:col-exclude" +``` + +## Column renaming + +By default, when you apply an expression to a column, the result keeps the same name as the original column. + +Preserving the column name can be semantically wrong and in certain cases Polars may even raise an error if duplicate names occur: + +{{code_block('user-guide/expressions/expression-expansion', 'duplicate-error', [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:duplicate-error" +``` + +To prevent errors like this, and to allow users to rename their columns when appropriate, Polars provides a series of functions that let you change the name of a column or a group of columns. + +### Renaming a single column with `alias` + +The function `alias` has been used thoroughly in the documentation already and it lets you rename a single column: + +{{code_block('user-guide/expressions/expression-expansion', 'alias', ['alias'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:alias" +``` + +### Prefixing and suffixing column names + +When using expression expansion you cannot use the function `alias` because the function `alias` is designed specifically to rename a single column. + +When it suffices to add a static prefix or a static suffix to the existing names, we can use the functions `prefix` and `suffix` from the namespace `name`: + +{{code_block('user-guide/expressions/expression-expansion', 'prefix-suffix', ['Expr.name', 'prefix', 'suffix'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:prefix-suffix" +``` + +### Dynamic name replacement + +If a static prefix/suffix is not enough, the namespace `name` also provides the function `map` that accepts a callable that accepts the old column names and produces the new ones: + +{{code_block('user-guide/expressions/expression-expansion', 'name-map', ['Expr.name', 'map'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:name-map" +``` + +See the API reference for the full contents of the namespace `name`. + +## Programmatically generating expressions + +Expression expansion is a very useful feature but it does not solve all of your problems. +For example, if we want to compute the day and year amplitude of the prices of the stocks in our dataframe, expression expansion won't help us. 
+ +At first, you may think about using a `for` loop: + +{{code_block('user-guide/expressions/expression-expansion', 'for-with_columns', [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:for-with_columns" +``` + +Do not do this. +Instead, generate all of the expressions you want to compute programmatically and use them only once in a context. +Loosely speaking, you want to swap the `for` loop with the context `with_columns`. +In practice, you could do something like the following: + +{{code_block('user-guide/expressions/expression-expansion', 'yield-expressions', [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:yield-expressions" +``` + +This produces the same final result and by specifying all of the expressions in one go we give Polars the opportunity to: + +1. do a better job at optimising the query; and +2. parallelise the execution of the actual computations. + +## More flexible column selections + +Polars comes with the submodule `selectors` that provides a number of functions that allow you to write more flexible column selections for expression expansion. + +!!! warning + + This functionality is not available in Rust yet. Refer to [Polars issue #10594](https://github.com/pola-rs/polars/issues/10594). + +As a first example, here is how we can use the functions `string` and `ends_with`, and the set operations that the functions from `selectors` support, to select all string columns and the columns whose names end with `"_high"`: + +{{code_block('user-guide/expressions/expression-expansion', 'selectors', [], ['selectors'], [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:selectors" +``` + +The submodule `selectors` provides [a number of selectors that match based on the data type of the columns](#selectors-for-data-types), of which the most useful are the functions that match a whole category of types, like `cs.numeric` for all numeric data types or `cs.temporal` for all temporal data types. + +The submodule `selectors` also provides [a number of selectors that match based on patterns in the column names](#selectors-for-column-name-patterns) which make it more convenient to specify common patterns you may want to check for, like the function `cs.ends_with` that was shown above. 
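+
+For instance, here is a minimal sketch of matching a whole category of types with `cs.numeric` (the small dataframe is hypothetical):
+
+```python
+import polars as pl
+import polars.selectors as cs
+
+df = pl.DataFrame({"ticker": ["A", "B"], "price": [1.0, 2.5], "shares": [10, 20]})
+# Selects "price" (Float64) and "shares" (Int64), but not "ticker" (String).
+print(df.select(cs.numeric()))
+```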
+
+### Combining selectors with set operations
+
+We can combine multiple selectors using set operations and the usual Python operators:
+
+| Operator | Operation            |
+| -------- | -------------------- |
+| `A \| B` | Union                |
+| `A & B`  | Intersection         |
+| `A - B`  | Difference           |
+| `A ^ B`  | Symmetric difference |
+| `~A`     | Complement           |
+
+The next example matches all non-string columns that contain an underscore in the name:
+
+{{code_block('user-guide/expressions/expression-expansion', 'selectors-set-operations', [], ['selectors'], [])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:selectors-set-operations"
+```
+
+### Resolving operator ambiguity
+
+Expression functions can be chained on top of selectors:
+
+{{code_block('user-guide/expressions/expression-expansion', 'selectors-expressions', [], ['selectors'], [])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:selectors-expressions"
+```
+
+However, some operators have been overloaded to operate both on Polars selectors and on expressions.
+For example, the operator `~` on a selector represents [the set operation “complement”](#combining-selectors-with-set-operations) and on an expression represents the Boolean operation of negation.
+
+When you use a selector and then want to use, in the context of an expression, one of the [operators that act as set operators for selectors](#combining-selectors-with-set-operations), you can use the function `as_expr`.
+
+Below, we want to negate the Boolean values in the columns “has_partner”, “has_kids”, and “has_tattoos”.
+If we are not careful, the combination of the operator `~` and the selector `cs.starts_with("has_")` will actually select the columns that we do not care about:
+
+{{code_block('user-guide/expressions/expression-expansion', 'selector-ambiguity', [], [], [])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:selector-ambiguity"
+```
+
+The correct solution uses `as_expr`:
+
+{{code_block('user-guide/expressions/expression-expansion', 'as_expr', [])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:as_expr"
+```
+
+### Debugging selectors
+
+When you are not sure whether you have a Polars selector at hand or not, you can use the function `cs.is_selector` to check:
+
+{{code_block('user-guide/expressions/expression-expansion', 'is_selector', [], ['is_selector'], [])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:is_selector"
+```
+
+This should help you avoid any ambiguous situations where you think you are operating with expressions but are in fact operating with selectors.
+
+Another helpful debugging utility is the function `expand_selector`.
+Given a target frame or schema, you can check what columns a given selector will expand to: + +{{code_block('user-guide/expressions/expression-expansion', 'expand_selector', [], ['expand_selector'], [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:expand_selector" +``` + +### Complete reference + +The tables below group the functions available in the submodule `selectors` by their type of behaviour. + +#### Selectors for data types + +Selectors that match based on the data type of the column: + +| Selector function | Data type(s) matched | +| ------------------ | ------------------------------------------------------------------ | +| `binary` | `Binary` | +| `boolean` | `Boolean` | +| `by_dtype` | Data types specified as arguments | +| `categorical` | `Categorical` | +| `date` | `Date` | +| `datetime` | `Datetime`, optionally filtering by time unit/zone | +| `decimal` | `Decimal` | +| `duration` | `Duration`, optionally filtering by time unit | +| `float` | All float types, regardless of precision | +| `integer` | All integer types, signed and unsigned, regardless of precision | +| `numeric` | All numeric types, namely integers, floats, and `Decimal` | +| `signed_integer` | All signed integer types, regardless of precision | +| `string` | `String` | +| `temporal` | All temporal data types, namely `Date`, `Datetime`, and `Duration` | +| `time` | `Time` | +| `unsigned_integer` | All unsigned integer types, regardless of precision | + +#### Selectors for column name patterns + +Selectors that match based on column name patterns: + +| Selector function | Columns selected | +| ----------------- | ------------------------------------------------------------ | +| `alpha` | Columns with alphabetical names | +| `alphanumeric` | Columns with alphanumeric names (letters and the digits 0-9) | +| `by_name` | Columns with the names specified as arguments | +| `contains` | Columns whose names contain the substring specified | +| `digit` | Columns with numeric names (only the digits 0-9) | +| `ends_with` | Columns whose names end with the given substring | +| `matches` | Columns whose names match the given regex pattern | +| `starts_with` | Columns whose names start with the given substring | + +#### Positional selectors + +Selectors that match based on the position of the columns: + +| Selector function | Columns selected | +| ----------------- | ------------------------------------ | +| `all` | All columns | +| `by_index` | The columns at the specified indices | +| `first` | The first column in the context | +| `last` | The last column in the context | + +#### Miscellaneous functions + +The submodule `selectors` also provides the following functions: + +| Function | Behaviour | +| ----------------- | ------------------------------------------------------------------------------------- | +| `as_expr`* | Convert a selector to an expression | +| `exclude` | Selects all columns except those matching the given names, data types, or selectors | +| `expand_selector` | Expand selector to matching columns with respect to a specific frame or target schema | +| `is_selector` | Check whether the given object/expression is a selector | + +*`as_expr` isn't a function defined on the submodule `selectors`, but rather a method defined on selectors. 
diff --git a/docs/source/user-guide/expressions/folds.md b/docs/source/user-guide/expressions/folds.md index 7990aae7eca8..6fb8d56072c4 100644 --- a/docs/source/user-guide/expressions/folds.md +++ b/docs/source/user-guide/expressions/folds.md @@ -1,26 +1,61 @@ # Folds -Polars provides expressions/methods for horizontal aggregations like `sum`,`min`, `mean`, -etc. However, when you need a more complex aggregation the default methods Polars supplies may not be sufficient. That's when `folds` come in handy. +Polars provides many expressions to perform computations across columns, like `sum_horizontal`, `mean_horizontal`, and `min_horizontal`. +However, these are just special cases of a general algorithm called a fold, and Polars provides a general mechanism for you to compute custom folds for when the specialised versions of Polars are not enough. -The `fold` expression operates on columns for maximum speed. It utilizes the data layout very efficiently and often has vectorized execution. +Folds computed with the function `fold` operate on the full columns for maximum speed. +They utilize the data layout very efficiently and often have vectorized execution. -### Manual sum +## Basic example -Let's start with an example by implementing the `sum` operation ourselves, with a `fold`. +As a first example, we will reimplement `sum_horizontal` with the function `fold`: {{code_block('user-guide/expressions/folds','mansum',['fold'])}} ```python exec="on" result="text" session="user-guide/folds" ---8<-- "python/user-guide/expressions/folds.py:setup" --8<-- "python/user-guide/expressions/folds.py:mansum" ``` -The snippet above recursively applies the function `f(acc, x) -> acc` to an accumulator `acc` and a new column `x`. The function operates on columns individually and can take advantage of cache efficiency and vectorization. +The function `fold` expects a function `f` as the parameter `function` and `f` should accept two arguments. +The first argument is the accumulated result, which we initialise as zero, and the second argument takes the successive values of the expressions listed in the parameter `exprs`. +In our case, they're the two columns “a” and “b”. -### Conditional +The snippet below includes a third explicit expression that represents what the function `fold` is doing above: -In the case where you'd want to apply a condition/predicate on all columns in a `DataFrame` a `fold` operation can be a very concise way to express this. +{{code_block('user-guide/expressions/folds','mansum-explicit',['fold'])}} + +```python exec="on" result="text" session="user-guide/folds" +--8<-- "python/user-guide/expressions/folds.py:mansum-explicit" +``` + +??? tip "`fold` in Python" + + Most programming languages include a higher-order function that implements the algorithm that the function `fold` in Polars implements. + The Polars `fold` is very similar to Python's `functools.reduce`. + You can [learn more about the power of `functools.reduce` in this article](http://mathspp.com/blog/pydonts/the-power-of-reduce). + +## The initial value `acc` + +The initial value chosen for the accumulator `acc` is typically, but not always, the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation you want to apply. 
+For example, if we wanted to multiply across the columns, we would not get the correct result if our accumulator was set to zero: + +{{code_block('user-guide/expressions/folds','manprod',['fold'])}} + +```python exec="on" result="text" session="user-guide/folds" +--8<-- "python/user-guide/expressions/folds.py:manprod" +``` + +To fix this, the accumulator `acc` should be set to `1`: + +{{code_block('user-guide/expressions/folds','manprod-fixed',['fold'])}} + +```python exec="on" result="text" session="user-guide/folds" +--8<-- "python/user-guide/expressions/folds.py:manprod-fixed" +``` + +## Conditional + +In the case where you'd want to apply a condition/predicate across all columns in a dataframe, a fold can be a very concise way to express this. {{code_block('user-guide/expressions/folds','conditional',['fold'])}} @@ -28,13 +63,14 @@ In the case where you'd want to apply a condition/predicate on all columns in a --8<-- "python/user-guide/expressions/folds.py:conditional" ``` -In the snippet we filter all rows where **each** column value is `> 1`. +The snippet above filters all rows where all columns are greater than 1. -### Folds and string data +## Folds and string data -Folds could be used to concatenate string data. However, due to the materialization of intermediate columns, this operation will have squared complexity. +Folds could be used to concatenate string data. +However, due to the materialization of intermediate columns, this operation will have squared complexity. -Therefore, we recommend using the `concat_str` expression for this. +Therefore, we recommend using the function `concat_str` for this: {{code_block('user-guide/expressions/folds','string',['concat_str'])}} diff --git a/docs/source/user-guide/expressions/functions.md b/docs/source/user-guide/expressions/functions.md deleted file mode 100644 index 21c17ea4758b..000000000000 --- a/docs/source/user-guide/expressions/functions.md +++ /dev/null @@ -1,65 +0,0 @@ -# Functions - -Polars expressions have a large number of built in functions. These allow you to create complex queries without the need for [user defined functions](user-defined-functions.md). There are too many to go through here, but we will cover some of the more popular use cases. If you want to view all the functions go to the API Reference for your programming language. - -In the examples below we will use the following `DataFrame`: - -{{code_block('user-guide/expressions/functions','dataframe',['DataFrame'])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:setup" ---8<-- "python/user-guide/expressions/functions.py:dataframe" -``` - -## Column naming - -By default if you perform an expression it will keep the same name as the original column. In the example below we perform an expression on the `nrs` column. Note that the output `DataFrame` still has the same name. - -{{code_block('user-guide/expressions/functions','samename',[])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:samename" -``` - -This might get problematic in the case you use the same column multiple times in your expression as the output columns will get duplicated. For example, the following query will fail. 
- -{{code_block('user-guide/expressions/functions','samenametwice',[])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:samenametwice" -``` - -You can change the output name of an expression by using the `alias` function - -{{code_block('user-guide/expressions/functions','samenamealias',['alias'])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:samenamealias" -``` - -In case of multiple columns for example when using `all()` or `col(*)` you can apply a mapping function `name.map` to change the original column name into something else. In case you want to add a suffix (`name.suffix()`) or prefix (`name.prefix()`) these are also built in. - -=== ":fontawesome-brands-python: Python" -[:material-api: `name.prefix`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.prefix.html) -[:material-api: `name.suffix`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.suffix.html) -[:material-api: `name.map`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.map.html) - -## Count unique values - -There are two ways to count unique values in Polars: an exact methodology and an approximation. The approximation uses the [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) algorithm to approximate the cardinality and is especially useful for very large datasets where an approximation is good enough. - -{{code_block('user-guide/expressions/functions','countunique',['n_unique','approx_n_unique'])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:countunique" -``` - -## Conditionals - -Polars supports if-else like conditions in expressions with the `when`, `then`, `otherwise` syntax. The predicate is placed in the `when` clause and when this evaluates to `true` the `then` expression is applied otherwise the `otherwise` expression is applied (row-wise). - -{{code_block('user-guide/expressions/functions','conditional',['when'])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:conditional" -``` diff --git a/docs/source/user-guide/expressions/index.md b/docs/source/user-guide/expressions/index.md index 32550974782e..7e4b6f0a8b1a 100644 --- a/docs/source/user-guide/expressions/index.md +++ b/docs/source/user-guide/expressions/index.md @@ -1,18 +1,22 @@ # Expressions -In the `Contexts` sections we outlined what `Expressions` are and how they are invaluable. In this section we will focus on the `Expressions` themselves. Each section gives an overview of what they do and provide additional examples. +We [introduced the concept of “expressions” in a previous section](../concepts/expressions-and-contexts.md#expressions). +In this section we will focus on exploring the types of expressions that Polars offers. +Each section gives an overview of what they do and provides additional examples. 
-- [Operators](operators.md)
-- [Column selections](column-selections.md)
-- [Functions](functions.md)
-- [Casting](casting.md)
-- [Strings](strings.md)
-- [Aggregation](aggregation.md)
-- [Missing data](missing-data.md)
-- [Window](window.md)
-- [Folds](folds.md)
-- [Lists](lists.md)
-- [Plugins](plugins.md)
-- [User-defined functions](user-defined-functions.md)
-- [Structs](structs.md)
-- [Numpy](numpy.md)
+- Essentials:
+  - [Basic operations](basic-operations.md) – how to do basic operations on dataframe columns, like arithmetic calculations, comparisons, and other common, general-purpose operations
+  - [Expression expansion](expression-expansion.md) – what is expression expansion and how to use it
+  - [Casting](casting.md) – how to convert / cast values to different data types
+- How to work with specific types of data or data type namespaces:
+  - [Strings](strings.md) – how to work with strings and the namespace `str`
+  - [Lists and arrays](lists-and-arrays.md) – the differences between the data types `List` and `Array`, when to use them, and how to use them
+  - [Categorical data and enums](categorical-data-and-enums.md) – the differences between the data types `Categorical` and `Enum`, when to use them, and how to use them
+  - [Structs](structs.md) – when to use the data type `Struct` and how to use it
+  - [Missing data](missing-data.md) – how to work with missing data and how to fill missing data
+- Types of operations:
+  - [Aggregation](aggregation.md) – how to work with aggregating contexts like `group_by`
+  - [Window functions](window-functions.md) – how to apply window functions over columns in a dataframe
+  - [Folds](folds.md) – how to perform arbitrary computations horizontally across columns
+- [User-defined Python functions](user-defined-python-functions.md) – how to apply user-defined Python functions to dataframe columns or to column values
+- [Numpy functions](numpy-functions.md) – how to use NumPy native functions on Polars dataframes and series
diff --git a/docs/source/user-guide/expressions/lists-and-arrays.md b/docs/source/user-guide/expressions/lists-and-arrays.md
new file mode 100644
index 000000000000..6ba5fd319421
--- /dev/null
+++ b/docs/source/user-guide/expressions/lists-and-arrays.md
@@ -0,0 +1,184 @@
+# Lists and arrays
+
+Polars has first-class support for two homogeneous container data types: `List` and `Array`.
+Polars supports many operations with the two data types and their APIs overlap, so this section of the user guide has the objective of clarifying when one data type should be chosen in favour of the other.
+
+## Lists vs arrays
+
+### The data type `List`
+
+The data type `List` is suitable for columns whose values are homogeneous 1D containers of varying lengths.
+
+The dataframe below contains three examples of columns with the data type `List`:
+
+{{code_block('user-guide/expressions/lists', 'list-example', ['List'])}}
+
+```python exec="on" result="text" session="expressions/lists"
+--8<-- "python/user-guide/expressions/lists.py:list-example"
+```
+
+Note that the data type `List` is different from Python's type `list`, where elements can be of any type.
+If you want to store true Python lists in a column, you can do so with the data type `Object` and your column will not have the list manipulation features that we're about to discuss.
+
+### The data type `Array`
+
+The data type `Array` is suitable for columns whose values are homogeneous containers of an arbitrary dimension with a known and fixed shape.
+ +The dataframe below contains two examples of columns with the data type `Array`. + +{{code_block('user-guide/expressions/lists', 'array-example', ['Array'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:array-example" +``` + +The example above shows how to specify that the columns “bit_flags” and “tic_tac_toe” have the data type `Array`, parametrised by the data type of the elements contained within and by the shape of each array. + +In general, Polars does not infer that a column has the data type `Array` for performance reasons, and defaults to the appropriate variant of the data type `List`. +In Python, an exception to this rule is when you provide a NumPy array to build a column. +In that case, Polars has the guarantee from NumPy that all subarrays have the same shape, so an array of $n + 1$ dimensions will generate a column of $n$ dimensional arrays: + +{{code_block('user-guide/expressions/lists', 'numpy-array-inference', ['Array'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:numpy-array-inference" +``` + +### When to use each + +In short, prefer the data type `Array` over `List` because it is more memory efficient and more performant. +If you cannot use `Array`, then use `List`: + +- when the values within a column do not have a fixed shape; or +- when you need functions that are only available in the list API. + +## Working with lists + +### The namespace `list` + +Polars provides many functions to work with values of the data type `List` and these are grouped inside the namespace `list`. +We will explore this namespace a bit now. + +!!! warning "`arr` then, `list` now" + + In previous versions of Polars, the namespace for list operations used to be `arr`. + `arr` is now the namespace for the data type `Array`. + If you find references to the namespace `arr` on StackOverflow or other sources, note that those sources _may_ be outdated. + +The dataframe `weather` defined below contains data from different weather stations across a region. +When the weather station is unable to get a result, an error code is recorded instead of the actual temperature at that time. + +{{code_block('user-guide/expressions/lists', 'weather', [])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:weather" +``` + +### Programmatically creating lists + +Given the dataframe `weather` defined previously, it is very likely we need to run some analysis on the temperatures that are captured by each station. +To make this happen, we need to first be able to get individual temperature measurements. +We [can use the namespace `str`](strings.md#the-string-namespace) for this: + +{{code_block('user-guide/expressions/lists', 'split', ['str.split'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:split" +``` + +A natural follow-up would be to explode the list of temperatures so that each measurement is in its own row: + +{{code_block('user-guide/expressions/lists', 'explode', ['explode'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:explode" +``` + +However, in Polars we often do not need to do this to operate on the list elements. + +### Operating on lists + +Polars provides several standard operations on columns with the `List` data type. 
+[Similar to what you can do with strings](strings.md#slicing), lists can be sliced with the functions `head`, `tail`, and `slice`:
+
+{{code_block('user-guide/expressions/lists', 'list-slicing', ['Expr.list'])}}
+
+```python exec="on" result="text" session="expressions/lists"
+--8<-- "python/user-guide/expressions/lists.py:list-slicing"
+```
+
+### Element-wise computation within lists
+
+If we need to identify the stations that are reporting the highest number of errors, we need to
+
+1. try to convert the measurements into numbers;
+2. count the number of non-numeric values (i.e., `null` values) in the list, by row; and
+3. rename this output column as “errors” so that we can easily identify the stations.
+
+To perform these steps, we need to perform a casting operation on each measurement within the list values.
+The function `eval` is used as the entry point to perform operations on the elements of the list.
+Within it, you can use the context `element` to refer to each single element of the list individually, and then you can use any Polars expression on the element:
+
+{{code_block('user-guide/expressions/lists', 'element-wise-casting', ['element'])}}
+
+```python exec="on" result="text" session="expressions/lists"
+--8<-- "python/user-guide/expressions/lists.py:element-wise-casting"
+```
+
+Another alternative would be to use a regular expression to check if a measurement starts with a letter:
+
+{{code_block('user-guide/expressions/lists', 'element-wise-regex', ['element'])}}
+
+```python exec="on" result="text" session="expressions/lists"
+--8<-- "python/user-guide/expressions/lists.py:element-wise-regex"
+```
+
+If you are unfamiliar with the namespace `str` or the notation `(?i)` in the regex, now is a good time to [look at how to work with strings and regular expressions in Polars](strings.md#check-for-the-existence-of-a-pattern).
+
+### Row-wise computations
+
+The function `eval` gives us access to the list elements and `pl.element` refers to each individual element, but we can also use `pl.all()` to refer to all of the elements of the list.
+
+To show this in action, we will start by creating another dataframe with some more weather data:
+
+{{code_block('user-guide/expressions/lists', 'weather_by_day', [])}}
+
+```python exec="on" result="text" session="expressions/lists"
+--8<-- "python/user-guide/expressions/lists.py:weather_by_day"
+```
+
+Now, we will calculate the percentage rank of the temperatures by day, measured across stations.
+Polars does not provide a function to do this directly, but because expressions are so versatile we can create our own percentage rank expression for highest temperature.
+Let's try that:
+
+{{code_block('user-guide/expressions/lists', 'rank_pct', ['element', 'rank'])}}
+
+```python exec="on" result="text" session="expressions/lists"
+--8<-- "python/user-guide/expressions/lists.py:rank_pct"
+```
+
+## Working with arrays
+
+### Creating an array column
+
+As [we have seen above](#the-data-type-array), Polars usually does not infer the data type `Array` automatically.
+You have to specify the data type `Array` when creating a series/dataframe or [cast a column](casting.md) explicitly unless you create the column out of a NumPy array.
+
+### The namespace `arr`
+
+The data type `Array` was recently introduced and is still pretty nascent in features that it offers.
+Even so, the namespace `arr` aggregates several functions that you can use to work with arrays.
+
+!!!
warning "`arr` then, `list` now" + + In previous versions of Polars, the namespace for list operations used to be `arr`. + `arr` is now the namespace for the data type `Array`. + If you find references to the namespace `arr` on StackOverflow or other sources, note that those sources _may_ be outdated. + +The API documentation should give you a good overview of the functions in the namespace `arr`, of which we present a couple: + +{{code_block('user-guide/expressions/lists', 'array-overview', ['Expr.arr'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:array-overview" +``` diff --git a/docs/source/user-guide/expressions/lists.md b/docs/source/user-guide/expressions/lists.md deleted file mode 100644 index dea95ffc2c1c..000000000000 --- a/docs/source/user-guide/expressions/lists.md +++ /dev/null @@ -1,119 +0,0 @@ -# Lists and Arrays - -Polars has first-class support for `List` columns: that is, columns where each row is a list of homogeneous elements, of varying lengths. Polars also has an `Array` datatype, which is analogous to NumPy's `ndarray` objects, where the length is identical across rows. - -Note: this is different from Python's `list` object, where the elements can be of any type. Polars can store these within columns, but as a generic `Object` datatype that doesn't have the special list manipulation features that we're about to discuss. - -## Powerful `List` manipulation - -Let's say we had the following data from different weather stations across a state. When the weather station is unable to get a result, an error code is recorded instead of the actual temperature at that time. - -{{code_block('user-guide/expressions/lists','weather_df',['DataFrame'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:setup" ---8<-- "python/user-guide/expressions/lists.py:weather_df" -``` - -### Creating a `List` column - -For the `weather` `DataFrame` created above, it's very likely we need to run some analysis on the temperatures that are captured by each station. To make this happen, we need to first be able to get individual temperature measurements. This is done by: - -{{code_block('user-guide/expressions/lists','string_to_list',['str.split'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:string_to_list" -``` - -One way we could go post this would be to convert each temperature measurement into its own row: - -{{code_block('user-guide/expressions/lists','explode_to_atomic',['DataFrame.explode'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:explode_to_atomic" -``` - -However, in Polars, we often do not need to do this to operate on the `List` elements. - -### Operating on `List` columns - -Polars provides several standard operations on `List` columns. If we want the first three measurements, we can do a `head(3)`. The last three can be obtained via a `tail(3)`, or alternately, via `slice` (negative indexing is supported). We can also identify the number of observations via `lengths`. Let's see them in action: - -{{code_block('user-guide/expressions/lists','list_ops',['Expr.list'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:list_ops" -``` - -!!! 
warning "`arr` then, `list` now" - - If you find references to the `arr` API on Stackoverflow or other sources, just replace `arr` with `list`, this was the old accessor for the `List` datatype. `arr` now refers to the newly introduced `Array` datatype (see below). - -### Element-wise computation within `List`s - -If we need to identify the stations that are giving the most number of errors from the starting `DataFrame`, we need to: - -1. Parse the string input as a `List` of string values (already done). -2. Identify those strings that can be converted to numbers. -3. Identify the number of non-numeric values (i.e. `null` values) in the list, by row. -4. Rename this output as `errors` so that we can easily identify the stations. - -The third step requires a casting (or alternately, a regex pattern search) operation to be perform on each element of the list. We can do this using by applying the operation on each element by first referencing them in the `pl.element()` context, and then calling a suitable Polars expression on them. Let's see how: - -{{code_block('user-guide/expressions/lists','count_errors',['Expr.list', 'element'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:count_errors" -``` - -What if we chose the regex route (i.e. recognizing the presence of _any_ alphabetical character?) - -{{code_block('user-guide/expressions/lists','count_errors_regex',['str.contains'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:count_errors_regex" -``` - -If you're unfamiliar with the `(?i)`, it's a good time to look at the documentation for the `str.contains` function in Polars! The Rust regex crate provides a lot of additional regex flags that might come in handy. - -## Row-wise computations - -This context is ideal for computing in row orientation. - -We can apply **any** Polars operations on the elements of the list with the `list.eval` (`list().eval` in Rust) expression! These expressions run entirely on Polars' query engine and can run in parallel, so will be well optimized. Let's say we have another set of weather data across three days, for different stations: - -{{code_block('user-guide/expressions/lists','weather_by_day',['DataFrame'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:weather_by_day" -``` - -Let's do something interesting, where we calculate the percentage rank of the temperatures by day, measured across stations. Pandas allows you to compute the percentages of the `rank` values. Polars doesn't provide a special function to do this directly, but because expressions are so versatile we can create our own percentage rank expression for highest temperature. Let's try that! - -{{code_block('user-guide/expressions/lists','weather_by_day_rank',['list.eval'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:weather_by_day_rank" -``` - -## Polars `Array`s - -`Array`s are a new data type that was recently introduced, and are still pretty nascent in features that it offers. The major difference between a `List` and an `Array` is that the latter is limited to having the same number of elements per row, while a `List` can have a variable number of elements. Both still require that each element's data type is the same. 
- -We can define `Array` columns in this manner: - -{{code_block('user-guide/expressions/lists','array_df',['Array'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:array_df" -``` - -Basic operations are available on it: - -{{code_block('user-guide/expressions/lists','array_ops',['Series.arr'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:array_ops" -``` - -Polars `Array`s are still being actively developed, so this section will likely change in the future. diff --git a/docs/source/user-guide/expressions/missing-data.md b/docs/source/user-guide/expressions/missing-data.md index ce2fd0216c5f..f1697cced489 100644 --- a/docs/source/user-guide/expressions/missing-data.md +++ b/docs/source/user-guide/expressions/missing-data.md @@ -1,31 +1,35 @@ # Missing data -This page sets out how missing data is represented in Polars and how missing data can be filled. +This section of the user guide teaches how to work with missing data in Polars. ## `null` and `NaN` values -Each column in a `DataFrame` (or equivalently a `Series`) is an Arrow array or a collection of Arrow arrays [based on the Apache Arrow spec](https://arrow.apache.org/docs/format/Columnar.html#null-count). Missing data is represented in Arrow and Polars with a `null` value. This `null` missing value applies for all data types including numerical values. +In Polars, missing data is represented by the value `null`. +This missing value `null` is used for all data types, including numerical types. -Polars also allows `NotaNumber` or `NaN` values for float columns. These `NaN` values are considered to be a type of floating point data rather than missing data. We discuss `NaN` values separately below. +Polars also supports the value `NaN` (“Not a Number”) for columns with floating point numbers. +The value `NaN` is considered to be a valid floating point value, which is different from missing data. +[We discuss the value `NaN` separately below](#not-a-number-or-nan-values). -You can manually define a missing value with the python `None` value: +When creating a series or a dataframe, you can set a value to `null` by using the appropriate construct for your language: {{code_block('user-guide/expressions/missing-data','dataframe',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/missing-data" ---8<-- "python/user-guide/expressions/missing-data.py:setup" --8<-- "python/user-guide/expressions/missing-data.py:dataframe" ``` -!!! info +!!! info "Difference from pandas" - In pandas the value for missing data depends on the dtype of the column. In Polars missing data is always represented as a `null` value. + In pandas, the value used to represent missing data depends on the data type of the column. + In Polars, missing data is always represented by the value `null`. ## Missing data metadata -Each Arrow array used by Polars stores two kinds of metadata related to missing data. This metadata allows Polars to quickly show how many missing values there are and which values are missing. +Polars keeps track of some metadata regarding the missing data of each series. +This metadata allows Polars to answer some basic queries about missing values in a very efficient way, namely how many values are missing and which ones are missing. 
-The first piece of metadata is the `null_count` - this is the number of rows with `null` values in the column: +To determine how many values are missing from a column you can use the function `null_count`: {{code_block('user-guide/expressions/missing-data','count',['null_count'])}} @@ -33,12 +37,13 @@ The first piece of metadata is the `null_count` - this is the number of rows wit --8<-- "python/user-guide/expressions/missing-data.py:count" ``` -The `null_count` method can be called on a `DataFrame`, a column from a `DataFrame` or a `Series`. The `null_count` method is a cheap operation as `null_count` is already calculated for the underlying Arrow array. - -The second piece of metadata is an array called a _validity bitmap_ that indicates whether each data value is valid or missing. -The validity bitmap is memory efficient as it is bit encoded - each value is either a 0 or a 1. This bit encoding means the memory overhead per array is only (array length / 8) bytes. The validity bitmap is used by the `is_null` method in Polars. +The function `null_count` can be called on a dataframe, a column from a dataframe, or on a series directly. +The function `null_count` is a cheap operation because the result is already known. -You can return a `Series` based on the validity bitmap for a column in a `DataFrame` or a `Series` with the `is_null` method: +Polars uses something called a “validity bitmap” to know which values are missing in a series. +The validity bitmap is memory efficient as it is bit encoded. +If a series has length $n$, then its validity bitmap will cost $n / 8$ bytes. +The function `is_null` uses the validity bitmap to efficiently report which values are `null` and which are not: {{code_block('user-guide/expressions/missing-data','isnull',['is_null'])}} @@ -46,18 +51,27 @@ You can return a `Series` based on the validity bitmap for a column in a `DataFr --8<-- "python/user-guide/expressions/missing-data.py:isnull" ``` -The `is_null` method is a cheap operation that does not require scanning the full column for `null` values. This is because the validity bitmap already exists and can be returned as a Boolean array. +The function `is_null` can be used on a column of a dataframe or on a series directly. +Again, this is a cheap operation because the result is already known by Polars. + +??? info "Why does Polars waste memory on a validity bitmap?" + + It all comes down to a tradeoff. + By using a bit more memory per column, Polars can be much more efficient when performing most operations on your columns. + If the validity bitmap wasn't known, every time you wanted to compute something you would have to check each position of the series to see if a legal value was present or not. + With the validity bitmap, Polars knows automatically the positions where your operations can be applied. ## Filling missing data -Missing data in a `Series` can be filled with the `fill_null` method. You have to specify how you want the `fill_null` method to fill the missing data. The main ways to do this are filling with: +Missing data in a series can be filled with the function `fill_null`. 

## Filling missing data

-Missing data in a `Series` can be filled with the `fill_null` method. You have to specify how you want the `fill_null` method to fill the missing data. The main ways to do this are filling with:
+Missing data in a series can be filled with the function `fill_null`.
+You can specify how missing data is filled in a few different ways:

-- a literal such as 0 or "0"
-- a strategy such as filling forwards
-- an expression such as replacing with values from another column
-- interpolation
+- a literal of the correct data type;
+- a Polars expression, such as replacing with values computed from another column;
+- a strategy based on neighbouring values, such as filling forwards or backwards; and
+- interpolation.

-We illustrate each way to fill nulls by defining a simple `DataFrame` with a missing value in `col2`:
+To illustrate how each of these methods works, we start by defining a simple dataframe with two missing values in the second column:

{{code_block('user-guide/expressions/missing-data','dataframe2',['DataFrame'])}}

```python exec="on" result="text" session="user-guide/missing-data"
--8<-- "python/user-guide/expressions/missing-data.py:dataframe2"
```

-### Fill with specified literal value
+### Fill with a specified literal value

-We can fill the missing data with a specified literal value with `pl.lit`:
+You can fill the missing data with a specified literal value.
+This literal value will replace all of the occurrences of the value `null`:

{{code_block('user-guide/expressions/missing-data','fill',['fill_null'])}}

```python exec="on" result="text" session="user-guide/missing-data"
--8<-- "python/user-guide/expressions/missing-data.py:fill"
```

-### Fill with a strategy
+However, this is actually just a special case of the general mechanism, where [the function `fill_null` replaces missing values with the corresponding values from the result of a Polars expression](#fill-with-an-expression), as seen next.

-We can fill the missing data with a strategy such as filling forward:
+### Fill with an expression

-{{code_block('user-guide/expressions/missing-data','fillstrategy',['fill_null'])}}
+In the general case, the missing data can be filled by extracting the corresponding values from the result of a general Polars expression.
+For example, we can fill the second column with values taken from the double of the first column:
+
+{{code_block('user-guide/expressions/missing-data','fillexpr',['fill_null'])}}

```python exec="on" result="text" session="user-guide/missing-data"
---8<-- "python/user-guide/expressions/missing-data.py:fillstrategy"
+--8<-- "python/user-guide/expressions/missing-data.py:fillexpr"
```

-You can find other fill strategies in the API docs.
+### Fill with a strategy based on neighbouring values

-### Fill with an expression
+You can also fill the missing data by following a fill strategy based on the neighbouring values.
+The two simplest strategies look for the first non-`null` value that comes immediately before or immediately after the value `null` that is being filled:

-For more flexibility we can fill the missing data with an expression. For example,
-to fill nulls with the median value from that column:
-
-{{code_block('user-guide/expressions/missing-data','fillexpr',['fill_null'])}}
+{{code_block('user-guide/expressions/missing-data','fillstrategy',['fill_null'])}}

```python exec="on" result="text" session="user-guide/missing-data"
---8<-- "python/user-guide/expressions/missing-data.py:fillexpr"
+--8<-- "python/user-guide/expressions/missing-data.py:fillstrategy"
```

-In this case the column is cast from integer to float because the median is a float statistic.
+You can find other fill strategies in the API docs.
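+
+As a combined sketch of the filling approaches above (assuming a toy dataframe with two missing values in column `b`), the different flavours of `fill_null` might look like this:
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"a": [1, 2, 3], "b": [1, None, None]})
+print(
+    df.with_columns(
+        filled_literal=pl.col("b").fill_null(0),                   # literal
+        filled_expression=pl.col("b").fill_null(2 * pl.col("a")),  # expression
+        filled_forward=pl.col("b").fill_null(strategy="forward"),  # neighbouring values
+    )
+)
+```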
### Fill with interpolation

-In addition, we can fill nulls with interpolation (without using the `fill_null` function):
+Additionally, you can fill missing data with interpolation by using the function `interpolate` instead of the function `fill_null`:

{{code_block('user-guide/expressions/missing-data','fillinterpolate',['interpolate'])}}

```python exec="on" result="text" session="user-guide/missing-data"
--8<-- "python/user-guide/expressions/missing-data.py:fillinterpolate"
```

-## `NotaNumber` or `NaN` values
+## Not a Number, or `NaN` values

-Missing data in a `Series` has a `null` value. However, you can use `NotaNumber` or `NaN` values in columns with float datatypes. These `NaN` values can be created from Numpy's `np.nan` or the native python `float('nan')`:
+Missing data in a series is represented by the value `null`, regardless of the data type of the series.
+However, in columns that have a floating point data type, the value `NaN` can be used.
+These values can be created directly:

{{code_block('user-guide/expressions/missing-data','nan',['DataFrame'])}}

```python exec="on" result="text" session="user-guide/missing-data"
--8<-- "python/user-guide/expressions/missing-data.py:nan"
```

+The special value `NaN` might also arise as the result of a computation:
+
+{{code_block('user-guide/expressions/missing-data','nan-computed',[])}}
+
+```python exec="on" result="text" session="user-guide/missing-data"
+--8<-- "python/user-guide/expressions/missing-data.py:nan-computed"
+```
+
 !!! info

-    In pandas by default a `NaN` value in an integer column causes the column to be cast to float. This does not happen in Polars - instead an exception is raised.
+    By default, a `NaN` value in an integer column causes the column to be cast to a float data type in pandas.
+    This does not happen in Polars; instead, an exception is raised.

-`NaN` values are considered to be a type of floating point data and are **not considered to be missing data** in Polars. This means:
+`NaN` values are considered to be a type of floating point data and are **not considered to be missing data** in Polars.
+This means:

-- `NaN` values are **not** counted with the `null_count` method
-- `NaN` values are filled when you use `fill_nan` method but are **not** filled with the `fill_null` method
+- `NaN` values are **not** counted with the function `null_count`; and
+- `NaN` values are filled when you use the specialised function `fill_nan` but are **not** filled with the function `fill_null`.

-Polars has `is_nan` and `fill_nan` methods which work in a similar way to the `is_null` and `fill_null` methods. The underlying Arrow arrays do not have a pre-computed validity bitmask for `NaN` values so this has to be computed for the `is_nan` method.
+Polars has the functions `is_nan` and `fill_nan`, which work in a similar way to the functions `is_null` and `fill_null`.
+Unlike with missing data, Polars does not hold any metadata regarding the `NaN` values, so the function `is_nan` entails actual computation.
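+
+A minimal sketch (assuming a small float series) of how the value `NaN` is treated differently from the value `null` by these functions:
+
+```python
+import polars as pl
+
+s = pl.Series("a", [1.0, float("nan"), None])
+print(s.null_count())             # 1: only the null counts as missing data
+print(s.is_nan())                 # [false, true, null]: computed, not read from metadata
+print(s.fill_nan(0.0).to_list())  # [1.0, 0.0, None]: fill_nan leaves the null untouched
+```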
-One further difference between `null` and `NaN` values is that taking the `mean` of a column with `null` values excludes the `null` values from the calculation but with `NaN` values taking the mean results in a `NaN`. This behaviour can be avoided by replacing the `NaN` values with `null` values;
+One further difference between the values `null` and `NaN` is that numerical aggregating functions, like `mean` and `sum`, skip the missing values when computing the result, whereas the value `NaN` is considered for the computation and typically propagates into the result.
+If desirable, this behaviour can be avoided by replacing the occurrences of the value `NaN` with the value `null`:

{{code_block('user-guide/expressions/missing-data','nanfill',['fill_nan'])}}

diff --git a/docs/source/user-guide/expressions/numpy-functions.md b/docs/source/user-guide/expressions/numpy-functions.md
new file mode 100644
index 000000000000..b140d5ff458e
--- /dev/null
+++ b/docs/source/user-guide/expressions/numpy-functions.md
@@ -0,0 +1,24 @@
+# Numpy functions
+
+Polars expressions support NumPy [ufuncs](https://numpy.org/doc/stable/reference/ufuncs.html).
+See [the NumPy documentation for a list of all supported NumPy functions](https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs).
+
+This means that if a function is not provided by Polars, we can use NumPy and we still have fast columnar operations through the NumPy API.
+
+## Example
+
+{{code_block('user-guide/expressions/numpy-example',api_functions=['DataFrame','np.log'])}}
+
+```python exec="on" result="text" session="user-guide/numpy"
+--8<-- "python/user-guide/expressions/numpy-example.py"
+```
+
+## Interoperability
+
+Polars' series have support for NumPy universal functions (ufuncs) and generalized ufuncs.
+Element-wise functions such as `np.exp`, `np.cos`, and `np.divide` all work with almost zero overhead.
+
+However, bear in mind that [Polars keeps track of missing values with a separate bitmask](missing-data.md) and NumPy does not receive this information.
+This can lead to a window function or a `np.convolve` giving flawed or incomplete results, so an error will be raised if you pass a series with missing data to a generalized ufunc.
+Convert a Polars series to a NumPy array with the function `to_numpy`.
+Missing values will be replaced by `np.nan` during the conversion.
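+
+As a minimal sketch of this interoperability (assuming a small series with a missing value):
+
+```python
+import numpy as np
+import polars as pl
+
+s = pl.Series("a", [1.0, None, 3.0])
+print(s.to_numpy())  # [ 1. nan  3.]: the null becomes np.nan
+
+# Element-wise ufuncs can be applied directly to a series without missing data:
+print(np.log(pl.Series("b", [1.0, 2.0, 4.0])))
+```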
diff --git a/docs/source/user-guide/expressions/numpy.md b/docs/source/user-guide/expressions/numpy.md
deleted file mode 100644
index 4a5a46978b57..000000000000
--- a/docs/source/user-guide/expressions/numpy.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# Numpy
-
-Polars expressions support NumPy [ufuncs](https://numpy.org/doc/stable/reference/ufuncs.html). See [here](https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs)
-for a list on all supported numpy functions.
-
-This means that if a function is not provided by Polars, we can use NumPy and we still have fast columnar operation through the NumPy API.
-
-### Example
-
-{{code_block('user-guide/expressions/numpy-example',api_functions=['DataFrame','np.log'])}}
-
-```python exec="on" result="text" session="user-guide/numpy"
---8<-- "python/user-guide/expressions/numpy-example.py"
-```
-
-### Interoperability
-
-Polars `Series` have support for NumPy universal functions (ufuncs) and generalized ufuncs. Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead.
-
-However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results, so an error will be raised if you pass a `Series` with missing data to a generalized ufunc.
-
-Convert a Polars `Series` to a NumPy array with the `.to_numpy()` method. Missing values will be replaced by `np.nan` during the conversion.
diff --git a/docs/source/user-guide/expressions/operators.md b/docs/source/user-guide/expressions/operators.md
deleted file mode 100644
index 24cb4e6834b8..000000000000
--- a/docs/source/user-guide/expressions/operators.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# Basic operators
-
-This section describes how to use basic operators (e.g. addition, subtraction) in conjunction with Expressions. We will provide various examples using different themes in the context of the following dataframe.
-
-!!! note Operator Overloading
-
-    In Rust and Python it is possible to use the operators directly (as in `+ - * / < > `) as the language allows operator overloading. For instance, the operator `+` translates to the `.add()` method. You can choose the one you prefer.
-
-{{code_block('user-guide/expressions/operators','dataframe',['DataFrame'])}}
-
-```python exec="on" result="text" session="user-guide/operators"
---8<-- "python/user-guide/expressions/operators.py:setup"
---8<-- "python/user-guide/expressions/operators.py:dataframe"
-```
-
-### Numerical
-
-{{code_block('user-guide/expressions/operators','numerical',['operators'])}}
-
-```python exec="on" result="text" session="user-guide/operators"
---8<-- "python/user-guide/expressions/operators.py:numerical"
-```
-
-### Logical
-
-{{code_block('user-guide/expressions/operators','logical',['operators'])}}
-
-```python exec="on" result="text" session="user-guide/operators"
---8<-- "python/user-guide/expressions/operators.py:logical"
-```
diff --git a/docs/source/user-guide/expressions/speed_rank_by_type.svg b/docs/source/user-guide/expressions/speed_rank_by_type.svg
new file mode 100644
index 000000000000..4324508c228b
--- /dev/null
+++ b/docs/source/user-guide/expressions/speed_rank_by_type.svg
@@ -0,0 +1,102 @@
+<!-- SVG markup lost in extraction. Recoverable content: a table diagram with columns “Name”, “Type 1”, “Speed”, and “Speed rank”, illustrating per-group ranking (e.g. Venusaur is rank 1 within Grass while the faster Golbat is rank 1 within Poison). -->
diff --git a/docs/source/user-guide/expressions/strings.md b/docs/source/user-guide/expressions/strings.md
index b7aefcf0ba75..facdb5b80c54 100644
--- a/docs/source/user-guide/expressions/strings.md
+++ b/docs/source/user-guide/expressions/strings.md
@@ -1,62 +1,135 @@
 # Strings
 
-The following section discusses operations performed on `String` data, which is a frequently used `DataType` when working with `DataFrames`. However, processing strings can often be inefficient due to their unpredictable memory size, causing the CPU to access many random memory locations. To address this issue, Polars utilizes Arrow as its backend, which stores all strings in a contiguous block of memory. As a result, string traversal is cache-optimal and predictable for the CPU.
+The following section discusses operations performed on string data, which is a frequently used data type when working with dataframes.
+String processing functions are available in the namespace `str`.
 
-String processing functions are available in the `str` namespace.
+Working with strings in other dataframe libraries can be highly inefficient because strings have unpredictable lengths.
+Polars mitigates these inefficiencies by [following the Arrow Columnar Format specification](../concepts/data-types-and-structures.md#data-types-internals), so you can write performant data queries on string data too.

-##### Accessing the string namespace
+## The string namespace

-The `str` namespace can be accessed through the `.str` attribute of a column with `String` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `len_bytes` is recommended since it is faster.
+When working with string data you will likely need to access the namespace `str`, which aggregates 40+ functions that let you work with strings.
+As an example of how to access functions from within that namespace, the snippet below shows how to compute the length of the strings in a column in terms of the number of bytes and the number of characters:

{{code_block('user-guide/expressions/strings','df',['str.len_bytes','str.len_chars'])}}

-```python exec="on" result="text" session="user-guide/strings"
---8<-- "python/user-guide/expressions/strings.py:setup"
+```python exec="on" result="text" session="expressions/strings"
--8<-- "python/user-guide/expressions/strings.py:df"
```

-#### String parsing
+!!! note

-Polars offers multiple methods for checking and parsing elements of a string. Firstly, we can use the `contains` method to check whether a given pattern exists within a substring. Subsequently, we can extract these patterns and replace them using other methods, which will be demonstrated in upcoming examples.
+    If you are working exclusively with ASCII text, then the results of the two computations will be the same and using `len_bytes` is recommended since it is faster.

-##### Check for existence of a pattern
+## Parsing strings

-To check for the presence of a pattern within a string, we can use the contains method. The `contains` method accepts either a regular substring or a regex pattern, depending on the value of the `literal` parameter. If the pattern we're searching for is a simple substring located either at the beginning or end of the string, we can alternatively use the `starts_with` and `ends_with` functions.
+Polars offers multiple methods for checking and parsing elements of a string column, namely checking for the existence of given substrings or patterns, and counting, extracting, or replacing them.
+We will demonstrate some of these operations in the upcoming examples.
+
+### Check for the existence of a pattern
+
+We can use the function `contains` to check for the presence of a pattern within a string.
+By default, the argument to the function `contains` is interpreted as a regular expression.
+If you want to specify a literal substring, set the parameter `literal` to `True`.
+
+For the special cases where you want to check if the strings start or end with a fixed substring, you can use the functions `starts_with` or `ends_with`, respectively.
{{code_block('user-guide/expressions/strings','existence',['str.contains', 'str.starts_with','str.ends_with'])}} -```python exec="on" result="text" session="user-guide/strings" +```python exec="on" result="text" session="expressions/strings" --8<-- "python/user-guide/expressions/strings.py:existence" ``` -##### Extract a pattern +### Regex specification + +Polars relies on the Rust crate `regex` to work with regular expressions, so you may need to [refer to the syntax documentation](https://docs.rs/regex/latest/regex/#syntax) to see what features and flags are supported. +In particular, note that the flavor of regex supported by Polars is different from Python's module `re`. -The `extract` method allows us to extract a pattern from a specified string. This method takes a regex pattern containing one or more capture groups, which are defined by parentheses `()` in the pattern. The group index indicates which capture group to output. +### Extract a pattern + +The function `extract` allows us to extract patterns from the string values in a column. +The function `extract` accepts a regex pattern with one or more capture groups and extracts the capture group specified as the second argument. {{code_block('user-guide/expressions/strings','extract',['str.extract'])}} -```python exec="on" result="text" session="user-guide/strings" +```python exec="on" result="text" session="expressions/strings" --8<-- "python/user-guide/expressions/strings.py:extract" ``` -To extract all occurrences of a pattern within a string, we can use the `extract_all` method. In the example below, we extract all numbers from a string using the regex pattern `(\d+)`, which matches one or more digits. The resulting output of the `extract_all` method is a list containing all instances of the matched pattern within the string. +To extract all occurrences of a pattern within a string, we can use the function `extract_all`. +In the example below, we extract all numbers from a string using the regex pattern `(\d+)`, which matches one or more digits. +The resulting output of the function `extract_all` is a list containing all instances of the matched pattern within the string. {{code_block('user-guide/expressions/strings','extract_all',['str.extract_all'])}} -```python exec="on" result="text" session="user-guide/strings" +```python exec="on" result="text" session="expressions/strings" --8<-- "python/user-guide/expressions/strings.py:extract_all" ``` -##### Replace a pattern +### Replace a pattern -We have discussed two methods for pattern matching and extraction thus far, and now we will explore how to replace a pattern within a string. Similar to `extract` and `extract_all`, Polars provides the `replace` and `replace_all` methods for this purpose. In the example below we replace one match of `abc` at the end of a word (`\b`) by `ABC` and we replace all occurrence of `a` with `-`. +Akin to the functions `extract` and `extract_all`, Polars provides the functions `replace` and `replace_all`. +These accept a regex pattern or a literal substring (if the parameter `literal` is set to `True`) and perform the replacements specified. +The function `replace` will make at most one replacement whereas the function `replace_all` will make all the non-overlapping replacements it finds. 

-{{code_block('user-guide/expressions/strings','replace',['str.replace','str.replace_all'])}}
+{{code_block('user-guide/expressions/strings','replace',['str.replace', 'str.replace_all'])}}

-```python exec="on" result="text" session="user-guide/strings"
+```python exec="on" result="text" session="expressions/strings"
 --8<-- "python/user-guide/expressions/strings.py:replace"
 ```

-#### API documentation
+## Modifying strings
+
+### Case conversion
+
+Converting the casing of a string is a common operation and Polars supports it out of the box with the functions `to_lowercase`, `to_titlecase`, and `to_uppercase`:
+
+{{code_block('user-guide/expressions/strings','casing', ['str.to_lowercase', 'str.to_titlecase', 'str.to_uppercase'])}}
+
+```python exec="on" result="text" session="expressions/strings"
+--8<-- "python/user-guide/expressions/strings.py:casing"
+```
+
+### Stripping characters from the ends
+
+Polars provides five functions in the namespace `str` that let you strip characters from the ends of the string:
+
+| Function            | Behaviour                                                              |
+| ------------------- | ---------------------------------------------------------------------- |
+| `strip_chars`       | Removes leading and trailing occurrences of the characters specified.  |
+| `strip_chars_end`   | Removes trailing occurrences of the characters specified.              |
+| `strip_chars_start` | Removes leading occurrences of the characters specified.               |
+| `strip_prefix`      | Removes an exact substring prefix if present.                          |
+| `strip_suffix`      | Removes an exact substring suffix if present.                          |
+
+??? info "Similarity to Python string methods"
+
+    `strip_chars` is similar to Python's string method `strip`, and `strip_prefix` and `strip_suffix` are similar to Python's string methods `removeprefix` and `removesuffix`, respectively.
+
+It is important to understand that the first three functions interpret their string argument as a set of characters, whereas the functions `strip_prefix` and `strip_suffix` interpret their string argument as a literal string.
+
+{{code_block('user-guide/expressions/strings', 'strip', ['str.strip_chars', 'str.strip_chars_end', 'str.strip_chars_start', 'str.strip_prefix', 'str.strip_suffix'])}}
+
+```python exec="on" result="text" session="expressions/strings"
+--8<-- "python/user-guide/expressions/strings.py:strip"
+```
+
+If no argument is provided, the three functions `strip_chars`, `strip_chars_end`, and `strip_chars_start` remove whitespace by default.
+
+### Slicing
+
+Besides [extracting substrings as specified by patterns](#extract-a-pattern), you can also slice strings at specified offsets to produce substrings.
+The general-purpose function for slicing is `slice` and it takes the starting offset and the optional _length_ of the slice.
+If the length of the slice is not specified or if it's past the end of the string, Polars slices the string all the way to the end.
+
+The functions `head` and `tail` are specialised versions used for slicing the beginning and end of a string, respectively.
+
+{{code_block('user-guide/expressions/strings', 'slice', [], ['str.slice', 'str.head', 'str.tail'], ['str.str_slice', 'str.str_head', 'str.str_tail'])}}
+
+```python exec="on" result="text" session="expressions/strings"
+--8<-- "python/user-guide/expressions/strings.py:slice"
+```
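+
+As a combined sketch of the modification functions above (assuming a tiny single-column dataframe of strings):
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"s": ["  polars  ", "expressions"]})
+print(
+    df.with_columns(
+        upper=pl.col("s").str.to_uppercase(),
+        stripped=pl.col("s").str.strip_chars(),  # no argument: strips whitespace
+        first_three=pl.col("s").str.head(3),
+    )
+)
+```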
+## API documentation

-In addition to the examples covered above, Polars offers various other string manipulation methods for tasks such as formatting, stripping, splitting, and more. To explore these additional methods, you can go to the API documentation of your chosen programming language for Polars.
+In addition to the examples covered above, Polars offers various other string manipulation functions.
+To explore these additional methods, you can go to the API documentation of your chosen programming language for Polars.
diff --git a/docs/source/user-guide/expressions/structs.md b/docs/source/user-guide/expressions/structs.md
index d692c05ad0a1..7643c2a70c01 100644
--- a/docs/source/user-guide/expressions/structs.md
+++ b/docs/source/user-guide/expressions/structs.md
@@ -1,82 +1,112 @@
-# The Struct datatype
+# Structs

-Polars `Struct`s are the idiomatic way of working with multiple columns. It is also a free operation i.e. moving columns into `Struct`s does not copy any data!
+The data type `Struct` is a composite data type that can store multiple fields in a single column.

-For this section, let's start with a `DataFrame` that captures the average rating of a few movies across some states in the U.S.:
+??? tip "Python analogy"
+
+    For Python users, the data type `Struct` is kind of like a Python dictionary.
+    Even better, if you are familiar with Python typing, you can think of the data type `Struct` as `typing.TypedDict`.
+
+In this page of the user guide we will see situations in which the data type `Struct` arises, we will understand why it arises, and we will see how to work with `Struct` values.
+
+Let's start with a dataframe that captures the average rating of a few movies across some states in the US:

{{code_block('user-guide/expressions/structs','ratings_df',['DataFrame'])}}

-```python exec="on" result="text" session="user-guide/structs"
---8<-- "python/user-guide/expressions/structs.py:setup"
+```python exec="on" result="text" session="expressions/structs"
--8<-- "python/user-guide/expressions/structs.py:ratings_df"
```

-## Encountering the `Struct` type
+## Encountering the data type `Struct`

-A common operation that will lead to a `Struct` column is the ever so popular `value_counts` function that is commonly used in exploratory data analysis. Checking the number of times a state appears the data will be done as so:
+A common operation that will lead to a `Struct` column is the ever so popular `value_counts` function that is commonly used in exploratory data analysis.
+Checking the number of times a state appears in the data is done like so:

{{code_block('user-guide/expressions/structs','state_value_counts',['value_counts'])}}

```python exec="on" result="text" session="expressions/structs"
--8<-- "python/user-guide/expressions/structs.py:state_value_counts"
```

-Quite unexpected an output, especially if coming from tools that do not have such a data type. We're not in peril though, to get back to a more familiar output, all we need to do is `unnest` the `Struct` column into its constituent columns:
+Quite an unexpected output, especially if coming from tools that do not have such a data type.
+We're not in peril, though.
+To get back to a more familiar output, all we need to do is use the function `unnest` on the `Struct` column:

{{code_block('user-guide/expressions/structs','struct_unnest',['unnest'])}}

```python exec="on" result="text" session="expressions/structs"
--8<-- "python/user-guide/expressions/structs.py:struct_unnest"
```

+The function `unnest` will turn each field of the `Struct` into its own column.
+
 !!! note "Why `value_counts` returns a `Struct`"

-    Polars expressions always have a `Fn(Series) -> Series` signature and `Struct` is thus the data type that allows us to provide multiple columns as input/output of an expression. In other words, all expressions have to return a `Series` object, and `Struct` allows us to stay consistent with that requirement.
+    Polars expressions always operate on a single series and return another series.
+    `Struct` is the data type that allows us to provide multiple columns as input to an expression, or to output multiple columns from an expression.
+    Thus, we can use the data type `Struct` to specify each value and its count when we use `value_counts`.

-## Structs as `dict`s
+## Inferring the data type `Struct` from dictionaries

-Polars will interpret a `dict` sent to the `Series` constructor as a `Struct`:
+When building series or dataframes, Polars will convert dictionaries to the data type `Struct`:

{{code_block('user-guide/expressions/structs','series_struct',['Series'])}}

```python exec="on" result="text" session="expressions/structs"
--8<-- "python/user-guide/expressions/structs.py:series_struct"
```

-!!! note "Constructing `Series` objects"
+The number of fields, their names, and their types are inferred from the first dictionary seen.
+Subsequent incongruences can result in `null` values or in errors:

-    Note that `Series` here was constructed with the `name` of the series in the beginning, followed by the `values`. Providing the latter first
-    is considered an anti-pattern in Polars, and must be avoided.
+{{code_block('user-guide/expressions/structs','series_struct_error',['Series'])}}
+
+```python exec="on" result="text" session="expressions/structs"
+--8<-- "python/user-guide/expressions/structs.py:series_struct_error"
+```

-### Extracting individual values of a `Struct`
+## Extracting individual values of a `Struct`

-Let's say that we needed to obtain just the `movie` value in the `Series` that we created above. We can use the `field` method to do so:
+Let's say that we needed to obtain just the field `"Movie"` from the `Struct` in the series that we created above.
+We can use the function `field` to do so:

{{code_block('user-guide/expressions/structs','series_struct_extract',['struct.field'])}}

```python exec="on" result="text" session="expressions/structs"
--8<-- "python/user-guide/expressions/structs.py:series_struct_extract"
```

-### Renaming individual keys of a `Struct`
+## Renaming individual fields of a `Struct`

-What if we need to rename individual `field`s of a `Struct` column? We first convert the `rating_series` object to a `DataFrame` so that we can view the changes easily, and then use the `rename_fields` method:
+What if we need to rename individual fields of a `Struct` column?
+We use the function `rename_fields`:

{{code_block('user-guide/expressions/structs','series_struct_rename',['struct.rename_fields'])}}

```python exec="on" result="text" session="expressions/structs"
--8<-- "python/user-guide/expressions/structs.py:series_struct_rename"
```

+To actually see that the field names were changed, we will create a dataframe whose only column is the result and then use the function `unnest` so that each field becomes its own column.
+The column names will reflect the renaming operation we just did: + +{{code_block('user-guide/expressions/structs','struct-rename-check',['struct.rename_fields'])}} + +```python exec="on" result="text" session="expressions/structs" +--8<-- "python/user-guide/expressions/structs.py:struct-rename-check" +``` + ## Practical use-cases of `Struct` columns ### Identifying duplicate rows -Let's get back to the `ratings` data. We want to identify cases where there are duplicates at a `Movie` and `Theatre` level. This is where the `Struct` datatype shines: +Let's get back to the `ratings` data. +We want to identify cases where there are duplicates at a “Movie” and “Theatre” level. + +This is where the data type `Struct` shines: {{code_block('user-guide/expressions/structs','struct_duplicates',['is_duplicated', 'struct'])}} -```python exec="on" result="text" session="user-guide/structs" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:struct_duplicates" ``` @@ -84,23 +114,40 @@ We can identify the unique cases at this level also with `is_unique`! ### Multi-column ranking -Suppose, given that we know there are duplicates, we want to choose which rank gets a higher priority. We define `Count` of ratings to be more important than the actual `Avg_Rating` themselves, and only use it to break a tie. We can then do: +Suppose, given that we know there are duplicates, we want to choose which rating gets a higher priority. +We can say that the column “Count” is the most important, and if there is a tie in the column “Count” then we consider the column “Avg_Rating”. + +We can then do: {{code_block('user-guide/expressions/structs','struct_ranking',['is_duplicated', 'struct'])}} -```python exec="on" result="text" session="user-guide/structs" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:struct_ranking" ``` That's a pretty complex set of requirements done very elegantly in Polars! +To learn more about the function `over`, used above, [see the user guide section on window functions](window-functions.md). + +### Using multiple columns in a single expression + +As mentioned earlier, the data type `Struct` is also useful if you need to pass multiple columns as input to an expression. +As an example, suppose we want to compute [the Ackermann function](https://en.wikipedia.org/wiki/Ackermann_function) on two columns of a dataframe. +There is no way of composing Polars expressions to compute the Ackermann function[^1], so we define a custom function: -### Using multi-column apply +{{code_block('user-guide/expressions/structs', 'ack', [])}} -This was discussed in the previous section on _User Defined Functions_ for the Python case. 
-Here's an example of doing so with both Python and Rust: +```python exec="on" result="text" session="expressions/structs" +--8<-- "python/user-guide/expressions/structs.py:ack" +``` + +Now, to compute the values of the Ackermann function on those arguments, we start by creating a `Struct` with fields `m` and `n` and then use the function `map_elements` to apply the function `ack` to each value: -{{code_block('user-guide/expressions/structs','multi_column_apply',[])}} +{{code_block('user-guide/expressions/structs','struct-ack',[], ['map_elements'], ['apply'])}} -```python exec="on" result="text" session="user-guide/structs" ---8<-- "python/user-guide/expressions/structs.py:multi_column_apply" +```python exec="on" result="text" session="expressions/structs" +--8<-- "python/user-guide/expressions/structs.py:struct-ack" ``` + +Refer to [this section of the user guide to learn more about applying user-defined Python functions to your data](user-defined-python-functions.md). + +[^1]: To say that something cannot be done is quite a bold claim. If you prove us wrong, please let us know! diff --git a/docs/source/user-guide/expressions/user-defined-functions.md b/docs/source/user-guide/expressions/user-defined-python-functions.md similarity index 95% rename from docs/source/user-guide/expressions/user-defined-functions.md rename to docs/source/user-guide/expressions/user-defined-python-functions.md index dc994148c63b..b99a413e8bdd 100644 --- a/docs/source/user-guide/expressions/user-defined-functions.md +++ b/docs/source/user-guide/expressions/user-defined-python-functions.md @@ -1,4 +1,6 @@ -# User-defined functions (Python) +# User-defined Python functions + + Polars expressions are quite powerful and flexible, so there is much less need for custom Python functions compared to other libraries. Still, you may need to pass an expression's state to a third party library or apply your black box function to data in Polars. @@ -119,10 +121,10 @@ Passing the full `Series` to the user-defined function has a cost: it may use a You can use the `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values at once. !!! note -The `is_elementwise` argument can lead to incorrect results if set incorrectly. -If you set `is_elementwise=True`, make sure that your function actually operates -element-by-element (e.g. "calculate the logarithm of each value") - our example function `diff_from_mean()`, -for instance, does not. + + The `is_elementwise` argument can lead to incorrect results if set incorrectly. + If you set `is_elementwise=True`, make sure that your function actually operates + element-by-element (e.g. "calculate the logarithm of each value") - our example function `diff_from_mean()`, for instance, does not. ## Return types diff --git a/docs/source/user-guide/expressions/window-functions.md b/docs/source/user-guide/expressions/window-functions.md new file mode 100644 index 000000000000..5cdcb13b60c6 --- /dev/null +++ b/docs/source/user-guide/expressions/window-functions.md @@ -0,0 +1,147 @@ +# Window functions + +Window functions are expressions with superpowers. +They allow you to perform aggregations on groups within the context `select`. +Let's get a feel for what that means. 
+ +First, we load a Pokémon dataset: + +{{code_block('user-guide/expressions/window','pokemon',['read_csv'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:pokemon" +``` + +## Operations per group + +Window functions are ideal when we want to perform an operation within a group. +For instance, suppose we want to rank our Pokémon by the column “Speed”. +However, instead of a global ranking, we want to rank the speed within each group defined by the column “Type 1”. +We write the expression to rank the data by the column “Speed” and then we add the function `over` to specify that this should happen over the unique values of the column “Type 1”: + +{{code_block('user-guide/expressions/window','rank',['over'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:rank" +``` + +To help visualise this operation, you may imagine that Polars selects the subsets of the data that share the same value for the column “Type 1” and then computes the ranking expression only for those values. +Then, the results for that specific group are projected back to the original rows and Polars does this for all of the existing groups. +The diagram below highlights the ranking computation for the Pokémon with “Type 1” equal to “Grass”. + +
+--8<-- "docs/source/user-guide/expressions/speed_rank_by_type.svg" +
+
+
+Note how the row for the Pokémon “Golbat” has a “Speed” value of `90`, which is greater than the value `80` of the Pokémon “Venusaur”, and yet the latter was ranked 1 because “Golbat” and “Venusaur” do not share the same value for the column “Type 1”.
+
+The function `over` accepts an arbitrary number of expressions to specify the groups over which to perform the computations.
+We can repeat the ranking above, but over the combination of the columns “Type 1” and “Type 2” for a more fine-grained ranking:
+
+{{code_block('user-guide/expressions/window','rank-multiple',['over'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:rank-multiple"
+```
+
+In general, the results you get with the function `over` can also be achieved with [an aggregation](aggregation.md) followed by a call to the function `explode`, although the rows would be in a different order:
+
+{{code_block('user-guide/expressions/window','rank-explode',['explode'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:rank-explode"
+```
+
+This shows that, usually, `group_by` and `over` produce results of different shapes:
+
+- `group_by` usually produces a resulting dataframe with as many rows as groups used for aggregating; and
+- `over` usually produces a dataframe with the same number of rows as the original.
+
+The function `over` does not always produce results with the same number of rows as the original dataframe, and that is what we explore next.
+
+## Mapping results to dataframe rows
+
+The function `over` accepts a parameter `mapping_strategy` that determines how the results of the expression over the group are mapped back to the rows of the dataframe.
+
+### `group_to_rows`
+
+The default behaviour is `"group_to_rows"`:
+the result of the expression over the group should be the same length as the group and the results are mapped back to the rows of that group.
+
+If the order of the rows is not relevant, the option `"explode"` is more performant.
+Instead of mapping the resulting values to the original rows, Polars creates a new dataframe where values from the same group are next to each other.
+To help understand the distinction, consider the following dataframe:
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:athletes"
+```
+
+We can sort the athletes by rank within their own countries.
+If we do so, the Dutch athletes, who were in the second, third, and sixth rows, will remain in those rows.
+What will change is the order of the names of the athletes, which goes from “B”, “C”, and “F”, to “B”, “F”, and “C”:
+
+{{code_block('user-guide/expressions/window','athletes-sort-over-country',['over'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:athletes-sort-over-country"
+```
+
+The diagram below represents this transformation:
+
+
+--8<-- "docs/source/user-guide/expressions/athletes_over_country.svg" +
+ +### `explode` + +If we set the parameter `mapping_strategy` to `"explode"`, then athletes of the same country are grouped together, but the final order of the rows – with respect to the countries – will not be the same, as the diagram shows: + +
+--8<-- "docs/source/user-guide/expressions/athletes_over_country_explode.svg" +
+
+
+Because Polars does not need to keep track of the positions of the rows of each group, using `"explode"` is typically faster than `"group_to_rows"`.
+However, using `"explode"` also requires more care because it implies reordering the other columns that we wish to keep.
+The code that produces this result follows:
+
+{{code_block('user-guide/expressions/window','athletes-explode',['over'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:athletes-explode"
+```
+
+### `join`
+
+Another possible value for the parameter `mapping_strategy` is `"join"`, which aggregates the resulting values in a list and repeats the list over all rows of the same group:
+
+{{code_block('user-guide/expressions/window','athletes-join',['over'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:athletes-join"
+```
+
+## Windowed aggregation expressions
+
+In case the expression applied to the values of a group produces a scalar value, the scalar is broadcast across the rows of the group:
+
+{{code_block('user-guide/expressions/window','pokemon-mean',['over'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:pokemon-mean"
+```
+
+## More examples
+
+For more exercises, below are some window functions for us to compute:
+
+- sort all Pokémon by type;
+- select the first `3` Pokémon per type as `"Type 1"`;
+- sort the Pokémon within a type by speed in descending order and select the first `3` as `"fastest/group"`;
+- sort the Pokémon within a type by attack in descending order and select the first `3` as `"strongest/group"`; and
+- sort the Pokémon within a type by name and select the first `3` as `"sorted_by_alphabet"`.
+
+{{code_block('user-guide/expressions/window','examples',['over'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:examples"
+```
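+
+As a final, self-contained sketch (assuming a toy dataframe), the mapping strategies can be contrasted side by side:
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"group": ["a", "b", "a", "b"], "value": [1, 2, 3, 4]})
+print(
+    df.with_columns(
+        # default "group_to_rows": results go back to their original rows
+        sorted_in_group=pl.col("value").sort(descending=True).over("group"),
+        # "join": the whole group result is repeated on every row as a list
+        as_list=pl.col("value").sort(descending=True).over("group", mapping_strategy="join"),
+    )
+)
+# mapping_strategy="explode" reorders the rows instead, so it is usually paired
+# with explicit handling of the other columns, as shown earlier in this page.
+```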
diff --git a/docs/source/user-guide/expressions/window.md b/docs/source/user-guide/expressions/window.md
deleted file mode 100644
index 261dac180c4d..000000000000
--- a/docs/source/user-guide/expressions/window.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Window functions
-
-Window functions are expressions with superpowers. They allow you to perform aggregations on groups in the
-`select` context. Let's get a feel for what that means. First we create a dataset. The dataset loaded in the
-snippet below contains information about pokemon:
-
-{{code_block('user-guide/expressions/window','pokemon',['read_csv'])}}
-
-```python exec="on" result="text" session="user-guide/window"
---8<-- "python/user-guide/expressions/window.py:pokemon"
-```
-
-## Group by aggregations in selection
-
-Below we show how to use window functions to group over different columns and perform an aggregation on them.
-Doing so allows us to use multiple group by operations in parallel, using a single query. The results of the aggregation
-are projected back to the original rows. Therefore, a window function will almost always lead to a `DataFrame` with the same size as the original.
-
-We will discuss later the cases where a window function can change the numbers of rows in a `DataFrame`.
-
-Note how we call `.over("Type 1")` and `.over(["Type 1", "Type 2"])`. Using window functions we can aggregate over different groups in a single `select` call! Note that, in Rust, the type of the argument to `over()` must be a collection, so even when you're only using one column, you must provide it in an array.
-
-The best part is, this won't cost you anything. The computed groups are cached and shared between different `window` expressions.
-
-{{code_block('user-guide/expressions/window','group_by',['over'])}}
-
-```python exec="on" result="text" session="user-guide/window"
---8<-- "python/user-guide/expressions/window.py:group_by"
-```
-
-## Operations per group
-
-Window functions can do more than aggregation. They can also be viewed as an operation within a group. If, for instance, you
-want to `sort` the values within a `group`, you can write `col("value").sort().over("group")` and voilà! We sorted by group!
-
-Let's filter out some rows to make this more clear.
-
-{{code_block('user-guide/expressions/window','operations',['filter'])}}
-
-```python exec="on" result="text" session="user-guide/window"
---8<-- "python/user-guide/expressions/window.py:operations"
-```
-
-Observe that the group `Water` of column `Type 1` is not contiguous. There are two rows of `Grass` in between. Also note
-that each pokemon within a group are sorted by `Speed` in `ascending` order. Unfortunately, for this example we want them sorted in
-`descending` speed order. Luckily with window functions this is easy to accomplish.
-
-{{code_block('user-guide/expressions/window','sort',['over'])}}
-
-```python exec="on" result="text" session="user-guide/window"
---8<-- "python/user-guide/expressions/window.py:sort"
-```
-
-Polars keeps track of each group's location and maps the expressions to the proper row locations. This will also work over different groups in a single `select`.
-
-The power of window expressions is that you often don't need a `group_by -> explode` combination, but you can put the logic in a single expression. It also makes the API cleaner. If properly used a:
-
-- `group_by` -> marks that groups are aggregated and we expect a `DataFrame` of size `n_groups`
-- `over` -> marks that we want to compute something within a group, and doesn't modify the original size of the `DataFrame` except in specific cases
-
-## Map the expression result to the DataFrame rows
-
-In cases where the expression results in multiple values per group, the Window function has 3 strategies for linking the values back to the `DataFrame` rows:
-
-- `mapping_strategy = 'group_to_rows'` -> each value is assigned back to one row. The number of values returned should match the number of rows.
-
-- `mapping_strategy = 'join'` -> the values are imploded in a list, and the list is repeated on all rows. This can be memory intensive.
-
-- `mapping_strategy = 'explode'` -> the values are exploded to new rows. This operation changes the number of rows.
- -## Window expression rules - -The evaluations of window expressions are as follows (assuming we apply it to a `pl.Int32` column): - -{{code_block('user-guide/expressions/window','rules',['over'])}} - -## More examples - -For more exercise, below are some window functions for us to compute: - -- sort all pokemon by type -- select the first `3` pokemon per type as `"Type 1"` -- sort the pokemon within a type by speed in descending order and select the first `3` as `"fastest/group"` -- sort the pokemon within a type by attack in descending order and select the first `3` as `"strongest/group"` -- sort the pokemon within a type by name and select the first `3` as `"sorted_by_alphabet"` - -{{code_block('user-guide/expressions/window','examples',['over'])}} - -```python exec="on" result="text" session="user-guide/window" ---8<-- "python/user-guide/expressions/window.py:examples" -``` diff --git a/docs/source/user-guide/getting-started.md b/docs/source/user-guide/getting-started.md index e571ea71cca1..b0c18b2562b1 100644 --- a/docs/source/user-guide/getting-started.md +++ b/docs/source/user-guide/getting-started.md @@ -83,7 +83,7 @@ When using expression expansion you can use `.name.suffix` to add a suffix to th --8<-- "python/user-guide/getting-started.py:expression-expansion" ``` -You can check other sections of the user guide to learn more about [basic operations](expressions/operators.md) or [column selections](expressions/column-selections.md). +You can check other sections of the user guide to learn more about [basic operations](expressions/basic-operations.md) or [column selections in expression expansion](expressions/expression-expansion.md). ### `with_columns` diff --git a/docs/source/user-guide/installation.md b/docs/source/user-guide/installation.md index fdfe83d49dee..0cecd7cd5f4b 100644 --- a/docs/source/user-guide/installation.md +++ b/docs/source/user-guide/installation.md @@ -23,8 +23,8 @@ Polars is a library and installation is as simple as invoking the package manage ## Big Index -By default, Polars dataframes are limited to 232 rows (~4.3 billion). -Increase this limit to 264 (~18 quintillion) by enabling the big index extension: +By default, Polars dataframes are limited to $2^{32}$ rows (~4.3 billion). +Increase this limit to $2^{64}$ (~18 quintillion) by enabling the big index extension: === ":fontawesome-brands-python: Python" @@ -196,7 +196,7 @@ The opt-in features are: - Performance related: - `nightly` - Several nightly only features such as SIMD and specialization. - `performant` - more fast paths, slower compile times. - - `bigidx` - Activate this feature if you expect >> 232 rows. + - `bigidx` - Activate this feature if you expect >> $2^{32}$ rows. This allows polars to scale up way beyond that by using `u64` as an index. Polars will be a bit slower with this feature activated as many data structures are less cache efficient. diff --git a/docs/source/user-guide/expressions/plugins.md b/docs/source/user-guide/plugins/your-first-polars-plugin.md similarity index 98% rename from docs/source/user-guide/expressions/plugins.md rename to docs/source/user-guide/plugins/your-first-polars-plugin.md index 9ef5633cfcd0..eb95ed7115f7 100644 --- a/docs/source/user-guide/expressions/plugins.md +++ b/docs/source/user-guide/plugins/your-first-polars-plugin.md @@ -1,4 +1,6 @@ -# Expression plugins +# Your first Polars plugin + + Expression plugins are the preferred way to create user defined functions. 
They allow you to compile a Rust function and register that as an expression into the Polars library. The Polars engine will dynamically link your function at runtime diff --git a/docs/source/user-guide/transformations/index.md b/docs/source/user-guide/transformations/index.md index 3092c5be3c37..fa86181eb58d 100644 --- a/docs/source/user-guide/transformations/index.md +++ b/docs/source/user-guide/transformations/index.md @@ -2,6 +2,8 @@ The focus of this section is to describe different types of data transformations and provide some examples on how to use them. + + - [Joins](joins.md) - [Concatenation](concatenation.md) - [Pivot](pivot.md) diff --git a/mkdocs.yml b/mkdocs.yml index c180bbfc6b8e..5d4ca83b9191 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -21,21 +21,19 @@ nav: - user-guide/concepts/lazy-api.md - Expressions: - user-guide/expressions/index.md - - user-guide/expressions/operators.md - - user-guide/expressions/column-selections.md - - user-guide/expressions/functions.md + - user-guide/expressions/basic-operations.md + - user-guide/expressions/expression-expansion.md - user-guide/expressions/casting.md - user-guide/expressions/strings.md + - user-guide/expressions/lists-and-arrays.md - user-guide/expressions/categorical-data-and-enums.md - - user-guide/expressions/aggregation.md + - user-guide/expressions/structs.md - user-guide/expressions/missing-data.md - - user-guide/expressions/window.md + - user-guide/expressions/aggregation.md + - user-guide/expressions/window-functions.md - user-guide/expressions/folds.md - - user-guide/expressions/lists.md - - user-guide/expressions/plugins.md - - user-guide/expressions/user-defined-functions.md - - user-guide/expressions/structs.md - - user-guide/expressions/numpy.md + - user-guide/expressions/user-defined-python-functions.md + - user-guide/expressions/numpy-functions.md - Transformations: - user-guide/transformations/index.md - user-guide/transformations/joins.md @@ -69,6 +67,8 @@ nav: - user-guide/io/cloud-storage.md - user-guide/io/bigquery.md - user-guide/io/hugging-face.md + - Plugins: + - user-guide/plugins/your-first-polars-plugin.md - SQL: - user-guide/sql/intro.md - user-guide/sql/show.md @@ -144,6 +144,9 @@ theme: icon: repo: fontawesome/brands/github +extra_javascript: + - _build/js/mathjax.js + - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js extra_css: - _build/css/extra.css extra: @@ -170,6 +173,8 @@ markdown_extensions: check_paths: true dedent_subsections: true - footnotes + - pymdownx.arithmatex: + generic: true hooks: - docs/source/_build/scripts/people.py diff --git a/py-polars/docs/source/_static/css/custom.css b/py-polars/docs/source/_static/css/custom.css index 7797c1fa0e15..966f1a86d21e 100644 --- a/py-polars/docs/source/_static/css/custom.css +++ b/py-polars/docs/source/_static/css/custom.css @@ -1,5 +1,5 @@ /* To have blue background of width of the block (instead of width of content) */ -dl.class > dt:first-of-type { +dl.class>dt:first-of-type { display: block !important; } From 235f240a6b22d0d622abe2cc8566f47f95dc6b04 Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Fri, 8 Nov 2024 22:34:14 +1100 Subject: [PATCH 03/20] perf: Improve hive partition pruning with datetime predicates from SQL (#19680) --- crates/polars-expr/src/expressions/alias.rs | 7 ++ crates/polars-expr/src/expressions/apply.rs | 63 ++++++++-------- crates/polars-expr/src/expressions/binary.rs | 38 +++++----- crates/polars-expr/src/expressions/cast.rs | 15 ++++ crates/polars-expr/src/expressions/literal.rs | 24 +++++-- 
crates/polars-expr/src/expressions/mod.rs | 14 ++++
 crates/polars-expr/src/planner.rs | 1 +
 crates/polars-io/src/predicates.rs | 2 +
 py-polars/tests/unit/io/test_scan.py | 72 +++++++++++++++++++
 9 files changed, 185 insertions(+), 51 deletions(-)

diff --git a/crates/polars-expr/src/expressions/alias.rs b/crates/polars-expr/src/expressions/alias.rs
index 6144a1418de2..f2065289e1ae 100644
--- a/crates/polars-expr/src/expressions/alias.rs
+++ b/crates/polars-expr/src/expressions/alias.rs
@@ -33,6 +33,13 @@ impl PhysicalExpr for AliasExpr {
         Ok(self.finish(series))
     }
 
+    fn evaluate_inline_impl(&self, depth_limit: u8) -> Option<Column> {
+        let depth_limit = depth_limit.checked_sub(1)?;
+        self.physical_expr
+            .evaluate_inline_impl(depth_limit)
+            .map(|s| self.finish(s))
+    }
+
     #[allow(clippy::ptr_arg)]
     fn evaluate_on_groups<'a>(
         &self,
diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs
index ddb4c37fac5d..f8e2619c4153 100644
--- a/crates/polars-expr/src/expressions/apply.rs
+++ b/crates/polars-expr/src/expressions/apply.rs
@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::sync::OnceLock;
 
 use polars_core::chunked_array::builder::get_list_builder;
 use polars_core::prelude::*;
@@ -28,6 +29,7 @@ pub struct ApplyExpr {
     check_lengths: bool,
     allow_group_aware: bool,
     output_field: Field,
+    inlined_eval: OnceLock<Option<Column>>,
 }
 
 impl ApplyExpr {
@@ -63,6 +65,7 @@ impl ApplyExpr {
             check_lengths: options.check_lengths(),
             allow_group_aware: options.flags.contains(FunctionFlags::ALLOW_GROUP_AWARE),
             output_field,
+            inlined_eval: Default::default(),
         }
     }
 
@@ -347,6 +350,24 @@ impl PhysicalExpr for ApplyExpr {
         }
     }
 
+    fn evaluate_inline_impl(&self, depth_limit: u8) -> Option<Column> {
+        // For predicate evaluation at I/O of:
+        // `lit("2024-01-01").str.strptime()`
+
+        self.inlined_eval
+            .get_or_init(|| {
+                let depth_limit = depth_limit.checked_sub(1)?;
+                let mut inputs = self
+                    .inputs
+                    .iter()
+                    .map(|x| x.evaluate_inline_impl(depth_limit).filter(|s| s.len() == 1))
+                    .collect::<Option<Vec<_>>>()?;
+
+                self.eval_and_flatten(&mut inputs).ok()
+            })
+            .clone()
+    }
+
     #[allow(clippy::ptr_arg)]
     fn evaluate_on_groups<'a>(
         &self,
@@ -576,11 +597,10 @@ impl ApplyExpr {
             FunctionExpr::Boolean(BooleanFunction::IsIn) => {
                 let should_read = || -> Option<bool> {
                     let root = expr_to_leaf_column_name(&input[0]).ok()?;
-                    let Expr::Literal(LiteralValue::Series(input)) = &input[1] else {
-                        return None;
-                    };
-                    #[allow(clippy::explicit_auto_deref)]
-                    let input: &Series = &**input;
+
+                    let input = self.inputs[1].evaluate_inline()?;
+                    let input = input.as_materialized_series();
+
                     let st = stats.get_stats(&root).ok()?;
                     let min = st.to_min()?;
                     let max = st.to_max()?;
@@ -603,35 +623,20 @@ impl ApplyExpr {
             FunctionExpr::Boolean(BooleanFunction::IsBetween { closed }) => {
                 let should_read = || -> Option<bool> {
                     let root: PlSmallStr = expr_to_leaf_column_name(&input[0]).ok()?;
-                    let Expr::Literal(left) = &input[1] else {
-                        return None;
-                    };
-                    let Expr::Literal(right) = &input[2] else {
-                        return None;
-                    };
+
+                    let left = self.inputs[1]
+                        .evaluate_inline()?
+ .as_materialized_series() + .clone(); let st = stats.get_stats(&root).ok()?; let min = st.to_min()?; let max = st.to_max()?; - let (left, left_dtype) = (left.to_any_value()?, left.get_datatype()); - let (right, right_dtype) = (right.to_any_value()?, right.get_datatype()); - - let left = Series::from_any_values_and_dtype( - PlSmallStr::EMPTY, - &[left], - &left_dtype, - false, - ) - .ok()?; - let right = Series::from_any_values_and_dtype( - PlSmallStr::EMPTY, - &[right], - &right_dtype, - false, - ) - .ok()?; - // don't read the row_group anyways as // the condition will evaluate to false. // e.g. in_between(10, 5) diff --git a/crates/polars-expr/src/expressions/binary.rs b/crates/polars-expr/src/expressions/binary.rs index 7754c2b6633e..10f217844ab1 100644 --- a/crates/polars-expr/src/expressions/binary.rs +++ b/crates/polars-expr/src/expressions/binary.rs @@ -392,17 +392,26 @@ mod stats { impl BinaryExpr { fn impl_should_read(&self, stats: &BatchStats) -> PolarsResult { // See: #5864 for the rationale behind this. - use Expr::*; - use Operator::*; - if !self.expr.into_iter().all(|e| match e { - BinaryExpr { op, .. } => { - !matches!(op, Multiply | Divide | TrueDivide | FloorDivide | Modulus) - }, - Column(_) | Literal(_) | Alias(_, _) => true, - _ => false, - }) { - return Ok(true); + { + use Operator::*; + + match self.op { + // These don't result in a boolean output + Multiply | Divide | TrueDivide | FloorDivide | Modulus => return Ok(true), + _ => {}, + } + + let Expr::BinaryExpr { left, right, .. } = &self.expr else { + unreachable!() + }; + + match (left.as_ref(), right.as_ref()) { + // The logic below assumes one side is a column + (Expr::Column(_), _) | (_, Expr::Column(_)) => {}, + _ => return Ok(true), + } } + let schema = stats.schema(); let Some(fld_l) = self.left.to_field(schema).ok() else { return Ok(true); @@ -423,18 +432,16 @@ mod stats { } } - let dummy = DataFrame::empty(); let state = ExecutionState::new(); - let out = match (self.left.is_literal(), self.right.is_literal()) { - (false, true) => { + let out = match (self.left.evaluate_inline(), self.right.evaluate_inline()) { + (None, Some(lit_s)) => { let l = stats.get_stats(fld_l.name())?; match l.to_min_max() { None => Ok(true), Some(min_max_s) => { // will be incorrect if not debug_assert_eq!(min_max_s.null_count(), 0); - let lit_s = self.right.evaluate(&dummy, &state).unwrap(); Ok(apply_operator_stats_rhs_lit( &min_max_s.into_column(), &lit_s, @@ -443,14 +450,13 @@ mod stats { }, } }, - (true, false) => { + (Some(lit_s), None) => { let r = stats.get_stats(fld_r.name())?; match r.to_min_max() { None => Ok(true), Some(min_max_s) => { // will be incorrect if not debug_assert_eq!(min_max_s.null_count(), 0); - let lit_s = self.left.evaluate(&dummy, &state).unwrap(); Ok(apply_operator_stats_lhs_lit( &lit_s, &min_max_s.into_column(), diff --git a/crates/polars-expr/src/expressions/cast.rs b/crates/polars-expr/src/expressions/cast.rs index dcbd67d36a7e..95f0c9eebee5 100644 --- a/crates/polars-expr/src/expressions/cast.rs +++ b/crates/polars-expr/src/expressions/cast.rs @@ -1,3 +1,5 @@ +use std::sync::OnceLock; + use polars_core::chunked_array::cast::CastOptions; use polars_core::prelude::*; @@ -9,6 +11,7 @@ pub struct CastExpr { pub(crate) dtype: DataType, pub(crate) expr: Expr, pub(crate) options: CastOptions, + pub(crate) inlined_eval: OnceLock>, } impl CastExpr { @@ -27,6 +30,18 @@ impl PhysicalExpr for CastExpr { self.finish(&column) } + fn evaluate_inline_impl(&self, depth_limit: u8) -> Option { + self.inlined_eval + 
.get_or_init(|| { + let depth_limit = depth_limit.checked_sub(1)?; + self.input + .evaluate_inline_impl(depth_limit) + .filter(|x| x.len() == 1) + .and_then(|x| self.finish(&x).ok()) + }) + .clone() + } + #[allow(clippy::ptr_arg)] fn evaluate_on_groups<'a>( &self, diff --git a/crates/polars-expr/src/expressions/literal.rs b/crates/polars-expr/src/expressions/literal.rs index 0c6900d4356b..0ab9ad9872b3 100644 --- a/crates/polars-expr/src/expressions/literal.rs +++ b/crates/polars-expr/src/expressions/literal.rs @@ -15,13 +15,8 @@ impl LiteralExpr { pub fn new(value: LiteralValue, expr: Expr) -> Self { Self(value, expr) } -} -impl PhysicalExpr for LiteralExpr { - fn as_expression(&self) -> Option<&Expr> { - Some(&self.1) - } - fn evaluate(&self, _df: &DataFrame, _state: &ExecutionState) -> PolarsResult { + fn as_column(&self) -> PolarsResult { use LiteralValue::*; let s = match &self.0 { #[cfg(feature = "dtype-i8")] @@ -118,6 +113,23 @@ impl PhysicalExpr for LiteralExpr { }; Ok(s) } +} + +impl PhysicalExpr for LiteralExpr { + fn as_expression(&self) -> Option<&Expr> { + Some(&self.1) + } + fn evaluate(&self, _df: &DataFrame, _state: &ExecutionState) -> PolarsResult { + self.as_column() + } + + fn evaluate_inline_impl(&self, _depth_limit: u8) -> Option { + use LiteralValue::*; + match &self.0 { + Range { .. } => None, + _ => self.as_column().ok(), + } + } #[allow(clippy::ptr_arg)] fn evaluate_on_groups<'a>( diff --git a/crates/polars-expr/src/expressions/mod.rs b/crates/polars-expr/src/expressions/mod.rs index 8ccc5349b733..277afddb41f2 100644 --- a/crates/polars-expr/src/expressions/mod.rs +++ b/crates/polars-expr/src/expressions/mod.rs @@ -538,6 +538,20 @@ pub trait PhysicalExpr: Send + Sync { /// Take a DataFrame and evaluate the expression. fn evaluate(&self, df: &DataFrame, _state: &ExecutionState) -> PolarsResult; + /// Attempt to cheaply evaluate this expression in-line without a DataFrame context. + /// This is used by StatsEvaluator when skipping files / row groups using a predicate. + /// TODO: Maybe in the future we can do this evaluation in-line at the optimizer stage? + /// + /// Do not implement this directly - instead implement `evaluate_inline_impl` + fn evaluate_inline(&self) -> Option { + self.evaluate_inline_impl(4) + } + + /// Implementation of `evaluate_inline` + fn evaluate_inline_impl(&self, _depth_limit: u8) -> Option { + None + } + /// Some expression that are not aggregations can be done per group /// Think of sort, slice, filter, shift, etc. /// defaults to ignoring the group diff --git a/crates/polars-expr/src/planner.rs b/crates/polars-expr/src/planner.rs index beaa6b6cdae0..9b5bc0a6a5b3 100644 --- a/crates/polars-expr/src/planner.rs +++ b/crates/polars-expr/src/planner.rs @@ -438,6 +438,7 @@ fn create_physical_expr_inner( dtype: dtype.clone(), expr: node_to_expr(expression, expr_arena), options: *options, + inlined_eval: Default::default(), })) }, Ternary { diff --git a/crates/polars-io/src/predicates.rs b/crates/polars-io/src/predicates.rs index 317bd420d99a..77872e708e40 100644 --- a/crates/polars-io/src/predicates.rs +++ b/crates/polars-io/src/predicates.rs @@ -159,6 +159,7 @@ impl ColumnStats { /// /// Returns `None` if no maximum value is available. pub fn to_min(&self) -> Option<&Series> { + // @scalar-opt let min_val = self.min_value.as_ref()?; let dtype = min_val.dtype(); @@ -177,6 +178,7 @@ impl ColumnStats { /// /// Returns `None` if no maximum value is available. 
     pub fn to_max(&self) -> Option<&Series> {
+        // @scalar-opt
         let max_val = self.max_value.as_ref()?;
         let dtype = max_val.dtype();
diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py
index 30af7b830ff8..cf78190f2380 100644
--- a/py-polars/tests/unit/io/test_scan.py
+++ b/py-polars/tests/unit/io/test_scan.py
@@ -2,6 +2,7 @@
 
 import io
 from dataclasses import dataclass
+from datetime import datetime
 from functools import partial
 from math import ceil
 from pathlib import Path
@@ -835,3 +836,74 @@ def test_streaming_scan_csv_with_row_index_19172(io_files_path: Path) -> None:
             schema={"calories": pl.String, "index": pl.UInt32},
         ),
     )
+
+
+@pytest.mark.write_disk
+def test_predicate_hive_pruning_with_cast(tmp_path: Path) -> None:
+    tmp_path.mkdir(exist_ok=True)
+
+    df = pl.DataFrame({"x": 1})
+
+    (p := (tmp_path / "date=2024-01-01")).mkdir()
+
+    df.write_parquet(p / "1")
+
+    (p := (tmp_path / "date=2024-01-02")).mkdir()
+
+    # Write an invalid parquet file that will cause errors if polars attempts to
+    # read it.
+    # This works because `scan_parquet()` only looks at the first file during
+    # schema inference.
+    (p / "1").write_text("not a parquet file")
+
+    expect = pl.DataFrame({"x": 1, "date": datetime(2024, 1, 1).date()})
+
+    lf = pl.scan_parquet(tmp_path)
+
+    q = lf.filter(pl.col("date") < datetime(2024, 1, 2).date())
+
+    assert_frame_equal(q.collect(), expect)
+
+    # This filter expr with strptime is effectively what LazyFrame.sql()
+    # generates
+    q = lf.filter(
+        pl.col("date")
+        < pl.lit("2024-01-02").str.strptime(
+            dtype=pl.Date, format="%Y-%m-%d", ambiguous="latest"
+        )
+    )
+
+    assert_frame_equal(q.collect(), expect)
+
+    q = lf.sql("select * from self where date < '2024-01-02'")
+    assert_frame_equal(q.collect(), expect)
+
+
+def test_predicate_stats_eval_nested_binary() -> None:
+    bufs: list[bytes] = []
+
+    for i in range(10):
+        b = io.BytesIO()
+        pl.DataFrame({"x": i}).write_parquet(b)
+        b.seek(0)
+        bufs.append(b.read())
+
+    assert_frame_equal(
+        (
+            pl.scan_parquet(bufs)
+            .filter(pl.col("x") % 2 == 0)
+            .collect(no_optimization=True)
+        ),
+        pl.DataFrame({"x": [0, 2, 4, 6, 8]}),
+    )
+
+    assert_frame_equal(
+        (
+            pl.scan_parquet(bufs)
+            # The literal eval depth limit is 4:
+            # * crates/polars-expr/src/expressions/mod.rs::PhysicalExpr::evaluate_inline
+            .filter(pl.col("x") == pl.lit("222").str.slice(0, 1).cast(pl.Int64))
+            .collect()
+        ),
+        pl.DataFrame({"x": [2]}),
+    )

From 8c41ae4f6bcded591abb0e278503b0264787fb33 Mon Sep 17 00:00:00 2001
From: Gijs Burghoorn
Date: Fri, 8 Nov 2024 14:59:17 +0100
Subject: [PATCH 04/20] fix: Only allow `list.to_struct` to be elementwise when width is fixed (#19688)

---
 crates/polars-plan/src/dsl/list.rs | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/crates/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs
index 83b30b021f25..e3e3a40806e2 100644
--- a/crates/polars-plan/src/dsl/list.rs
+++ b/crates/polars-plan/src/dsl/list.rs
@@ -279,8 +279,22 @@ impl ListNameSpace {
     /// If this is incorrect, downstream operations may fail. For instance an `all().sum()` expression
     /// will look in the current schema to determine which columns to select.
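     ///
     /// A sketch of why an inferred width cannot stay elementwise: with
     /// `ListToStructArgs::InferWidth`, a group containing `[[1, 2]]` infers two
     /// struct fields while one containing `[[1, 2, 3]]` infers three, so
     /// separate groups could disagree on the output dtype; `FixedWidth` always
     /// yields the same fields and can remain elementwise.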
pub fn to_struct(self, args: ListToStructArgs) -> Expr { - self.0 - .map_private(FunctionExpr::ListExpr(ListFunction::ToStruct(args))) + let collect_groups = match &args { + ListToStructArgs::FixedWidth(_) => ApplyOptions::ElementWise, + + // If we have to infer the dtype it is not elementwise anymore, since different parts + // could infer to different widths. + ListToStructArgs::InferWidth { .. } => ApplyOptions::GroupWise, + }; + + Expr::Function { + input: vec![self.0], + function: FunctionExpr::ListExpr(ListFunction::ToStruct(args)), + options: FunctionOptions { + collect_groups, + ..Default::default() + }, + } } #[cfg(feature = "is_in")] From 601fcb7c2f2e902058d71c8968f969918f23e212 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 8 Nov 2024 16:44:41 -0500 Subject: [PATCH 05/20] perf: Fix quadratic 'with_columns' behavior (#19701) --- crates/polars-core/src/frame/mod.rs | 19 +++++++++++++++-- .../polars-mem-engine/src/executors/stack.rs | 3 ++- crates/polars-mem-engine/src/planner/lp.rs | 3 ++- .../tests/benchmark/test_with_columns.py | 21 +++++++++++++++++++ 4 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 py-polars/tests/benchmark/test_with_columns.py diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index aa434fb07df7..e3b969a81756 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -1382,12 +1382,24 @@ impl DataFrame { self } + // Note: Schema can be both input or output_schema fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> { let name = c.name(); if let Some((idx, _, _)) = schema.get_full(name.as_str()) { - // schema is incorrect fallback to search if self.columns.get(idx).map(|s| s.name()) != Some(name) { - self.add_column_by_search(c)?; + // Given schema is output_schema and we can push. + if idx == self.columns.len() { + if self.width() == 0 { + self.height = c.len(); + } + + self.columns.push(c); + } + // Schema is incorrect fallback to search + else { + debug_assert!(false); + self.add_column_by_search(c)?; + } } else { self.replace_column(idx, c)?; } @@ -1401,6 +1413,7 @@ impl DataFrame { Ok(()) } + // Note: Schema can be both input or output_schema pub fn _add_series(&mut self, series: Vec, schema: &Schema) -> PolarsResult<()> { for (i, s) in series.into_iter().enumerate() { // we need to branch here @@ -1430,6 +1443,8 @@ impl DataFrame { /// Add a new column to this [`DataFrame`] or replace an existing one. /// Uses an existing schema to amortize lookups. /// If the schema is incorrect, we will fallback to linear search. + /// + /// Note: Schema can be both input or output_schema pub fn with_column_and_schema( &mut self, column: C, diff --git a/crates/polars-mem-engine/src/executors/stack.rs b/crates/polars-mem-engine/src/executors/stack.rs index ba6fa8111402..a93d4fc72d89 100644 --- a/crates/polars-mem-engine/src/executors/stack.rs +++ b/crates/polars-mem-engine/src/executors/stack.rs @@ -8,6 +8,7 @@ pub struct StackExec { pub(crate) has_windows: bool, pub(crate) exprs: Vec>, pub(crate) input_schema: SchemaRef, + pub(crate) output_schema: SchemaRef, pub(crate) options: ProjectionOptions, // Can run all operations elementwise pub(crate) streamable: bool, @@ -19,7 +20,7 @@ impl StackExec { state: &ExecutionState, mut df: DataFrame, ) -> PolarsResult { - let schema = &*self.input_schema; + let schema = &*self.output_schema; // Vertical and horizontal parallelism. 
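         // `output_schema` already lists the columns added by this node at
         // their final indices, so `add_column_by_schema` can push each new
         // column directly instead of falling back to the linear name search
         // that made wide `with_columns` calls quadratic.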
let df = if self.streamable diff --git a/crates/polars-mem-engine/src/planner/lp.rs b/crates/polars-mem-engine/src/planner/lp.rs index 3a5e525867fb..0d438b5f5bd1 100644 --- a/crates/polars-mem-engine/src/planner/lp.rs +++ b/crates/polars-mem-engine/src/planner/lp.rs @@ -629,7 +629,7 @@ fn create_physical_plan_impl( HStack { input, exprs, - schema: _schema, + schema: output_schema, options, } => { let input_schema = lp_arena.get(input).schema(lp_arena).into_owned(); @@ -659,6 +659,7 @@ fn create_physical_plan_impl( has_windows: state.has_windows, exprs: phys_exprs, input_schema, + output_schema, options, streamable, })) diff --git a/py-polars/tests/benchmark/test_with_columns.py b/py-polars/tests/benchmark/test_with_columns.py new file mode 100644 index 000000000000..8ea3402ac696 --- /dev/null +++ b/py-polars/tests/benchmark/test_with_columns.py @@ -0,0 +1,21 @@ +import time + +import pytest + +import polars as pl + + +# TODO: this is slow in streaming +@pytest.mark.may_fail_auto_streaming +def test_with_columns_quadratic_19503() -> None: + num_columns = 2000 + data1 = {f"col_{i}": [0] for i in range(num_columns)} + df1 = pl.DataFrame(data1) + + data2 = {f"feature_{i}": [0] for i in range(num_columns)} + df2 = pl.DataFrame(data2) + + t0 = time.time() + df1.with_columns(df2) + t1 = time.time() + assert t1 - t0 < 0.2 From e276eb80ee53f6f4a67e584f59c9aab9499ca5fd Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 9 Nov 2024 00:10:57 +0000 Subject: [PATCH 06/20] fix: In group_by_dynamic, period and every were getting applied in reverse order for the window upper boundary (#19706) --- crates/polars-time/src/windows/window.rs | 6 ++-- .../unit/operations/test_group_by_dynamic.py | 33 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/crates/polars-time/src/windows/window.rs b/crates/polars-time/src/windows/window.rs index c7a29b846c58..9609f2abc514 100644 --- a/crates/polars-time/src/windows/window.rs +++ b/crates/polars-time/src/windows/window.rs @@ -327,15 +327,15 @@ impl Iterator for BoundsIter<'_> { // Issue is that `next` needs to return `Option`. 
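                // Advance `start` by `every`, then derive `stop` as
                // `start + period`; advancing `stop` by `every` (the old code)
                // applied `period` and `every` in the wrong order, e.g. with
                // every="1mo", period="45d" from 2020-01-01 the upper
                // boundaries must be 2020-02-15, 2020-03-17, ... as the test
                // below checks.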
TimeUnit::Nanoseconds => { self.bi.start = self.window.every.add_ns(self.bi.start, self.tz).unwrap(); - self.bi.stop = self.window.every.add_ns(self.bi.stop, self.tz).unwrap(); + self.bi.stop = self.window.period.add_ns(self.bi.start, self.tz).unwrap(); }, TimeUnit::Microseconds => { self.bi.start = self.window.every.add_us(self.bi.start, self.tz).unwrap(); - self.bi.stop = self.window.every.add_us(self.bi.stop, self.tz).unwrap(); + self.bi.stop = self.window.period.add_us(self.bi.start, self.tz).unwrap(); }, TimeUnit::Milliseconds => { self.bi.start = self.window.every.add_ms(self.bi.start, self.tz).unwrap(); - self.bi.stop = self.window.every.add_ms(self.bi.stop, self.tz).unwrap(); + self.bi.stop = self.window.period.add_ms(self.bi.start, self.tz).unwrap(); }, } Some(out) diff --git a/py-polars/tests/unit/operations/test_group_by_dynamic.py b/py-polars/tests/unit/operations/test_group_by_dynamic.py index 0c083e204725..0b4cc1ec2531 100644 --- a/py-polars/tests/unit/operations/test_group_by_dynamic.py +++ b/py-polars/tests/unit/operations/test_group_by_dynamic.py @@ -1043,3 +1043,36 @@ def test_group_by_dynamic_exclude_index_from_expansion_17075() -> None: "n": [0, 2, 4, 6], "m": [0, 2, 4, 6], } + + +def test_group_by_dynamic_overlapping_19704() -> None: + df = pl.DataFrame( + { + "a": [datetime(2020, 1, 1), datetime(2020, 2, 1), datetime(2020, 3, 1)], + "b": [1, 2, 3], + } + ) + result = df.group_by_dynamic( + "a", every="1mo", period="45d", include_boundaries=True + ).agg(pl.col("b").sum()) + expected = pl.DataFrame( + { + "_lower_boundary": [ + datetime(2020, 1, 1, 0, 0), + datetime(2020, 2, 1, 0, 0), + datetime(2020, 3, 1, 0, 0), + ], + "_upper_boundary": [ + datetime(2020, 2, 15, 0, 0), + datetime(2020, 3, 17, 0, 0), + datetime(2020, 4, 15, 0, 0), + ], + "a": [ + datetime(2020, 1, 1, 0, 0), + datetime(2020, 2, 1, 0, 0), + datetime(2020, 3, 1, 0, 0), + ], + "b": [3, 5, 3], + } + ) + assert_frame_equal(result, expected) From efde5e52157fb2cde140a4d14f440b541ce17d88 Mon Sep 17 00:00:00 2001 From: Max Muoto Date: Sat, 9 Nov 2024 23:24:03 -0600 Subject: [PATCH 07/20] refactor: Remove Dead Excel Code (#19710) --- py-polars/polars/io/spreadsheet/functions.py | 42 -------------------- py-polars/tests/unit/io/test_spreadsheet.py | 41 +------------------ 2 files changed, 1 insertion(+), 82 deletions(-) diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 1d4bb5fe90b6..91910e8996fa 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -495,48 +495,6 @@ def read_ods( ) -def _identify_from_magic_bytes(data: IO[bytes] | bytes) -> str | None: - if isinstance(data, bytes): - data = BytesIO(data) - - xls_bytes = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" # excel 97-2004 - xlsx_bytes = b"PK\x03\x04" # xlsx/openoffice (zipped xml) - - initial_position = data.tell() - try: - magic_bytes = data.read(8) - if magic_bytes == xls_bytes: - return "xls" - elif magic_bytes[:4] == xlsx_bytes: - return "xlsx" - except UnicodeDecodeError: - pass - finally: - data.seek(initial_position) - return None - - -def _identify_workbook(wb: str | Path | IO[bytes] | bytes) -> str | None: - """Use file extension (and magic bytes) to identify Workbook type.""" - if not isinstance(wb, (str, Path)): - # raw binary data (bytesio, etc) - return _identify_from_magic_bytes(wb) - else: - p = Path(wb) - ext = p.suffix[1:].lower() - - # unambiguous file extensions - if ext in ("xlsx", "xlsm", "xlsb"): - return ext - elif ext[:2] 
== "od": - return "ods" - - # check magic bytes to resolve ambiguity (eg: xls/xlsx, or no extension) - with p.open("rb") as f: - magic_bytes = BytesIO(f.read(8)) - return _identify_from_magic_bytes(magic_bytes) - - def _read_spreadsheet( sheet_id: int | Sequence[int] | None, sheet_name: str | list[str] | tuple[str] | None, diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index b7b03a0bd02e..acef1ef7f1a5 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -4,7 +4,6 @@ from collections import OrderedDict from datetime import date, datetime from io import BytesIO -from pathlib import Path from typing import TYPE_CHECKING, Any, Callable import pytest @@ -12,12 +11,12 @@ import polars as pl import polars.selectors as cs from polars.exceptions import NoDataError, ParameterCollisionError -from polars.io.spreadsheet.functions import _identify_workbook from polars.testing import assert_frame_equal, assert_series_equal from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES if TYPE_CHECKING: from collections.abc import Sequence + from pathlib import Path from polars._typing import ExcelSpreadsheetEngine, SelectorType @@ -1028,44 +1027,6 @@ def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None assert_frame_equal(df.select(reversed_cols), read_df) -@pytest.mark.parametrize( - ("path", "file_type"), - [ - ("path_xls", "xls"), - ("path_xlsx", "xlsx"), - ("path_xlsb", "xlsb"), - ], -) -def test_identify_workbook( - path: str, file_type: str, request: pytest.FixtureRequest -) -> None: - # identify from file path - spreadsheet_path = request.getfixturevalue(path) - assert _identify_workbook(spreadsheet_path) == file_type - - # note that we can't distinguish between xlsx and xlsb - # from the magic bytes block alone (so we default to xlsx) - if file_type == "xlsb": - file_type = "xlsx" - - # identify from IO[bytes] - with Path.open(spreadsheet_path, "rb") as f: - assert _identify_workbook(f) == file_type - assert isinstance(pl.read_excel(f, engine="calamine"), pl.DataFrame) - - # identify from bytes - with Path.open(spreadsheet_path, "rb") as f: - raw_data = f.read() - assert _identify_workbook(raw_data) == file_type - assert isinstance(pl.read_excel(raw_data, engine="calamine"), pl.DataFrame) - - # identify from BytesIO - with Path.open(spreadsheet_path, "rb") as f: - bytesio_data = BytesIO(f.read()) - assert _identify_workbook(bytesio_data) == file_type - assert isinstance(pl.read_excel(bytesio_data, engine="calamine"), pl.DataFrame) - - def test_drop_empty_rows(path_empty_rows_excel: Path) -> None: df1 = pl.read_excel(source=path_empty_rows_excel, engine="xlsx2csv") assert df1.shape == (8, 4) From 2c6bae14146d1d5d5856e71ca16ff43589be3f6d Mon Sep 17 00:00:00 2001 From: Max Muoto Date: Sat, 9 Nov 2024 23:44:22 -0600 Subject: [PATCH 08/20] fix(python): Ensure `NoDataError` raised consistently between engines for Excel reads (#19712) --- py-polars/polars/io/spreadsheet/functions.py | 10 +++++----- py-polars/tests/unit/io/test_spreadsheet.py | 21 +++++++++++++------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 91910e8996fa..9b22df604242 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -776,7 +776,7 @@ def _drop_null_data( If `drop_empty_rows` is set to `False`, empty rows are not dropped. 
""" - null_cols = [] + null_cols: list[str] = [] for col_name in df.columns: # note that if multiple unnamed columns are found then all but the first one # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine) @@ -955,15 +955,15 @@ def _read_spreadsheet_calamine( ): df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)] + df = _drop_null_data( + df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows + ) + # note: even if we applied parser dtypes we still re-apply schema_overrides # natively as we can refine integer/float types, temporal precision, etc. if schema_overrides: df = df.cast(dtypes=schema_overrides) - df = _drop_null_data( - df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows - ) - # standardise on string dtype for null columns in empty frame if df.is_empty(): df = df.cast({Null: String}) diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index acef1ef7f1a5..b413242835c3 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -18,7 +18,7 @@ from collections.abc import Sequence from pathlib import Path - from polars._typing import ExcelSpreadsheetEngine, SelectorType + from polars._typing import ExcelSpreadsheetEngine, SchemaDict, SelectorType # pytestmark = pytest.mark.slow() @@ -918,24 +918,31 @@ def test_excel_freeze_panes() -> None: @pytest.mark.parametrize( - ("read_spreadsheet", "source"), + ("read_spreadsheet", "source", "schema_overrides"), [ - (pl.read_excel, "path_xlsx_empty"), - (pl.read_excel, "path_xlsb_empty"), - (pl.read_excel, "path_xls_empty"), - (pl.read_ods, "path_ods_empty"), + (pl.read_excel, "path_xlsx_empty", None), + (pl.read_excel, "path_xlsb_empty", None), + (pl.read_excel, "path_xls_empty", None), + (pl.read_ods, "path_ods_empty", None), + # Test with schema overrides, to ensure they don't interfere with + # raising NoDataErrors. + (pl.read_excel, "path_xlsx_empty", {"a": pl.Int64}), + (pl.read_excel, "path_xlsb_empty", {"a": pl.Int64}), + (pl.read_excel, "path_xls_empty", {"a": pl.Int64}), + (pl.read_ods, "path_ods_empty", {"a": pl.Int64}), ], ) def test_excel_empty_sheet( read_spreadsheet: Callable[..., pl.DataFrame], source: str, request: pytest.FixtureRequest, + schema_overrides: SchemaDict | None, ) -> None: ods = (empty_spreadsheet_path := request.getfixturevalue(source)).suffix == ".ods" read_spreadsheet = pl.read_ods if ods else pl.read_excel # type: ignore[assignment] with pytest.raises(NoDataError, match="empty Excel sheet"): - read_spreadsheet(empty_spreadsheet_path) + read_spreadsheet(empty_spreadsheet_path, schema_overrides=schema_overrides) engine_params = [{}] if ods else [{"engine": "calamine"}] for params in engine_params: From e596fa7120c6ec30d5b43132f17de92bec47d58e Mon Sep 17 00:00:00 2001 From: Max Muoto Date: Mon, 11 Nov 2024 01:52:23 -0600 Subject: [PATCH 09/20] feat(python): Improve `n_chunks` typing (#19727) --- py-polars/polars/dataframe/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 4ff2752fdfb5..49c2e2470534 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -9236,7 +9236,7 @@ def n_chunks(self, strategy: Literal["first"] = ...) -> int: ... @overload def n_chunks(self, strategy: Literal["all"]) -> list[int]: ... 
- def n_chunks(self, strategy: str = "first") -> int | list[int]: + def n_chunks(self, strategy: Literal["first", "all"] = "first") -> int | list[int]: """ Get number of chunks used by the ChunkedArrays of this DataFrame. From 441c18e6aa9cc805b767922a2b68a79121544d03 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Mon, 11 Nov 2024 11:54:07 +0400 Subject: [PATCH 10/20] fix: SQL `ELSE` clause should be implicitly `NULL` when omitted (#19714) --- crates/polars-sql/src/sql_expr.rs | 2 +- py-polars/tests/unit/sql/test_conditional.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/crates/polars-sql/src/sql_expr.rs b/crates/polars-sql/src/sql_expr.rs index 9e068efb6064..a2ada46e1c68 100644 --- a/crates/polars-sql/src/sql_expr.rs +++ b/crates/polars-sql/src/sql_expr.rs @@ -919,7 +919,7 @@ impl SQLExprVisitor<'_> { } let else_res = match else_result { Some(else_res) => self.visit_expr(else_res)?, - None => polars_bail!(SQLSyntax: "ELSE expression is required"), + None => lit(Null), // ELSE clause is optional; when omitted, it is implicitly NULL }; if let Some(operand_expr) = operand { let first_operand_expr = self.visit_expr(operand_expr)?; diff --git a/py-polars/tests/unit/sql/test_conditional.py b/py-polars/tests/unit/sql/test_conditional.py index b2000ebe37b1..3a80c1234aff 100644 --- a/py-polars/tests/unit/sql/test_conditional.py +++ b/py-polars/tests/unit/sql/test_conditional.py @@ -36,6 +36,24 @@ def test_case_when() -> None: } +@pytest.mark.parametrize("else_clause", ["ELSE NULL ", ""]) +def test_case_when_optional_else(else_clause: str) -> None: + df = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [7, 6, 5, 4, 3, 2, 1], + "c": [3, 4, 0, 3, 4, 1, 1], + } + ) + query = f""" + SELECT + AVG(CASE WHEN a <= b THEN c {else_clause}END) AS conditional_mean + FROM self + """ + res = df.sql(query) + assert res.to_dict(as_series=False) == {"conditional_mean": [2.5]} + + def test_control_flow(foods_ipc_path: Path) -> None: nums = pl.LazyFrame( { From 0b34f5f75dac77dd5701dae096885bab6d8607d2 Mon Sep 17 00:00:00 2001 From: barak1412 Date: Mon, 11 Nov 2024 10:27:59 +0200 Subject: [PATCH 11/20] fix: Fix validation for inner and left join when join_nulls unflaged (#19698) Co-authored-by: Ritchie Vink --- crates/polars-ops/src/frame/join/args.rs | 9 ++- .../join/hash_join/single_keys_dispatch.rs | 75 +++++++++++++++---- .../frame/join/hash_join/single_keys_inner.rs | 7 +- .../frame/join/hash_join/single_keys_left.rs | 7 +- py-polars/tests/unit/sql/test_joins.py | 23 ++++++ 5 files changed, 104 insertions(+), 17 deletions(-) diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index d34c37e7ff67..def36b76a677 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -237,6 +237,7 @@ impl JoinValidation { s_left: &Series, s_right: &Series, build_shortest_table: bool, + join_nulls: bool, ) -> PolarsResult<()> { // In default, probe is the left series. // @@ -253,7 +254,13 @@ impl JoinValidation { // Only check the `build` side. // The other side use `validate_build` to check ManyToMany | ManyToOne => true, - OneToMany | OneToOne => probe.n_unique()? == probe.len(), + OneToMany | OneToOne => { + if !join_nulls && probe.null_count() > 0 { + probe.n_unique()? - 1 == probe.len() - probe.null_count() + } else { + probe.n_unique()? 
== probe.len() + } + }, }; polars_ensure!(valid, ComputeError: "join keys did not fulfill {} validation", self); Ok(()) diff --git a/crates/polars-ops/src/frame/join/hash_join/single_keys_dispatch.rs b/crates/polars-ops/src/frame/join/hash_join/single_keys_dispatch.rs index f79e8759d9e8..7c365210b208 100644 --- a/crates/polars-ops/src/frame/join/hash_join/single_keys_dispatch.rs +++ b/crates/polars-ops/src/frame/join/hash_join/single_keys_dispatch.rs @@ -20,7 +20,7 @@ pub trait SeriesJoin: SeriesSealed + Sized { ) -> PolarsResult { let s_self = self.as_series(); let (lhs, rhs) = (s_self.to_physical_repr(), other.to_physical_repr()); - validate.validate_probe(&lhs, &rhs, false)?; + validate.validate_probe(&lhs, &rhs, false, join_nulls)?; let lhs_dtype = lhs.dtype(); let rhs_dtype = rhs.dtype(); @@ -35,7 +35,8 @@ pub trait SeriesJoin: SeriesSealed + Sized { let (lhs, rhs, _, _) = prepare_binary::(lhs, rhs, false); let lhs = lhs.iter().map(|v| v.as_slice()).collect::>(); let rhs = rhs.iter().map(|v| v.as_slice()).collect::>(); - hash_join_tuples_left(lhs, rhs, None, None, validate, join_nulls) + let build_null_count = other.null_count(); + hash_join_tuples_left(lhs, rhs, None, None, validate, join_nulls, build_null_count) }, T::BinaryOffset => { let lhs = lhs.binary_offset().unwrap(); @@ -44,7 +45,8 @@ pub trait SeriesJoin: SeriesSealed + Sized { // Take slices so that vecs are not copied let lhs = lhs.iter().map(|k| k.as_slice()).collect::>(); let rhs = rhs.iter().map(|k| k.as_slice()).collect::>(); - hash_join_tuples_left(lhs, rhs, None, None, validate, join_nulls) + let build_null_count = other.null_count(); + hash_join_tuples_left(lhs, rhs, None, None, validate, join_nulls, build_null_count) }, x if x.is_float() => { with_match_physical_float_polars_type!(lhs.dtype(), |$T| { @@ -168,7 +170,7 @@ pub trait SeriesJoin: SeriesSealed + Sized { ) -> PolarsResult<(InnerJoinIds, bool)> { let s_self = self.as_series(); let (lhs, rhs) = (s_self.to_physical_repr(), other.to_physical_repr()); - validate.validate_probe(&lhs, &rhs, true)?; + validate.validate_probe(&lhs, &rhs, true, join_nulls)?; let lhs_dtype = lhs.dtype(); let rhs_dtype = rhs.dtype(); @@ -184,8 +186,20 @@ pub trait SeriesJoin: SeriesSealed + Sized { // Take slices so that vecs are not copied let lhs = lhs.iter().map(|k| k.as_slice()).collect::>(); let rhs = rhs.iter().map(|k| k.as_slice()).collect::>(); + let build_null_count = if swapped { + s_self.null_count() + } else { + other.null_count() + }; Ok(( - hash_join_tuples_inner(lhs, rhs, swapped, validate, join_nulls)?, + hash_join_tuples_inner( + lhs, + rhs, + swapped, + validate, + join_nulls, + build_null_count, + )?, !swapped, )) }, @@ -196,8 +210,20 @@ pub trait SeriesJoin: SeriesSealed + Sized { // Take slices so that vecs are not copied let lhs = lhs.iter().map(|k| k.as_slice()).collect::>(); let rhs = rhs.iter().map(|k| k.as_slice()).collect::>(); + let build_null_count = if swapped { + s_self.null_count() + } else { + other.null_count() + }; Ok(( - hash_join_tuples_inner(lhs, rhs, swapped, validate, join_nulls)?, + hash_join_tuples_inner( + lhs, + rhs, + swapped, + validate, + join_nulls, + build_null_count, + )?, !swapped, )) }, @@ -244,7 +270,7 @@ pub trait SeriesJoin: SeriesSealed + Sized { ) -> PolarsResult<(PrimitiveArray, PrimitiveArray)> { let s_self = self.as_series(); let (lhs, rhs) = (s_self.to_physical_repr(), other.to_physical_repr()); - validate.validate_probe(&lhs, &rhs, true)?; + validate.validate_probe(&lhs, &rhs, true, join_nulls)?; let lhs_dtype = 
lhs.dtype(); let rhs_dtype = rhs.dtype(); @@ -352,20 +378,38 @@ where .map(|arr| arr.as_slice().unwrap()) .collect::>(); Ok(( - hash_join_tuples_inner(splitted_a, splitted_b, swapped, validate, join_nulls)?, + hash_join_tuples_inner( + splitted_a, splitted_b, swapped, validate, join_nulls, 0, + )?, !swapped, )) } else { Ok(( - hash_join_tuples_inner(splitted_a, splitted_b, swapped, validate, join_nulls)?, + hash_join_tuples_inner( + splitted_a, splitted_b, swapped, validate, join_nulls, 0, + )?, !swapped, )) } }, - _ => Ok(( - hash_join_tuples_inner(splitted_a, splitted_b, swapped, validate, join_nulls)?, - !swapped, - )), + _ => { + let build_null_count = if swapped { + left.null_count() + } else { + right.null_count() + }; + Ok(( + hash_join_tuples_inner( + splitted_a, + splitted_b, + swapped, + validate, + join_nulls, + build_null_count, + )?, + !swapped, + )) + }, } } @@ -430,7 +474,7 @@ where (0, 0, 1, 1) => { let keys_a = chunks_as_slices(&splitted_a); let keys_b = chunks_as_slices(&splitted_b); - hash_join_tuples_left(keys_a, keys_b, None, None, validate, join_nulls) + hash_join_tuples_left(keys_a, keys_b, None, None, validate, join_nulls, 0) }, (0, 0, _, _) => { let keys_a = chunks_as_slices(&splitted_a); @@ -445,6 +489,7 @@ where mapping_right.as_deref(), validate, join_nulls, + 0, ) }, _ => { @@ -452,6 +497,7 @@ where let keys_b = get_arrays(&splitted_b); let (mapping_left, mapping_right) = create_mappings(left.chunks(), right.chunks(), left.len(), right.len()); + let build_null_count = right.null_count(); hash_join_tuples_left( keys_a, keys_b, @@ -459,6 +505,7 @@ where mapping_right.as_deref(), validate, join_nulls, + build_null_count, ) }, } diff --git a/crates/polars-ops/src/frame/join/hash_join/single_keys_inner.rs b/crates/polars-ops/src/frame/join/hash_join/single_keys_inner.rs index f01c99529aea..aeca8bb32546 100644 --- a/crates/polars-ops/src/frame/join/hash_join/single_keys_inner.rs +++ b/crates/polars-ops/src/frame/join/hash_join/single_keys_inner.rs @@ -44,6 +44,8 @@ pub(super) fn hash_join_tuples_inner( swapped: bool, validate: JoinValidation, join_nulls: bool, + // Null count is required for join validation + build_null_count: usize, ) -> PolarsResult<(Vec, Vec)> where I: IntoIterator + Send + Sync + Clone, @@ -53,10 +55,13 @@ where // NOTE: see the left join for more elaborate comments // first we hash one relation let hash_tbls = if validate.needs_checks() { - let expected_size = build + let mut expected_size = build .iter() .map(|v| v.clone().into_iter().size_hint().1.unwrap()) .sum(); + if !join_nulls { + expected_size -= build_null_count; + } let hash_tbls = build_tables(build, join_nulls); let build_size = hash_tbls.iter().map(|m| m.len()).sum(); validate.validate_build(build_size, expected_size, swapped)?; diff --git a/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs b/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs index 91c4f0cd1008..b23d9de1776f 100644 --- a/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs +++ b/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs @@ -112,6 +112,8 @@ pub(super) fn hash_join_tuples_left( chunk_mapping_right: Option<&[ChunkId]>, validate: JoinValidation, join_nulls: bool, + // We should know the number of nulls to avoid extra calculation + build_null_count: usize, ) -> PolarsResult where I: IntoIterator, @@ -123,7 +125,10 @@ where let build = build.into_iter().map(|i| i.into_iter()).collect::>(); // first we hash one relation let hash_tbls = if validate.needs_checks() { - 
let expected_size = build.iter().map(|v| v.size_hint().1.unwrap()).sum(); + let mut expected_size = build.iter().map(|v| v.size_hint().1.unwrap()).sum(); + if !join_nulls { + expected_size -= build_null_count; + } let hash_tbls = build_tables(build, join_nulls); let build_size = hash_tbls.iter().map(|m| m.len()).sum(); validate.validate_build(build_size, expected_size, false)?; diff --git a/py-polars/tests/unit/sql/test_joins.py b/py-polars/tests/unit/sql/test_joins.py index d25610eb6763..c423fc4c45f4 100644 --- a/py-polars/tests/unit/sql/test_joins.py +++ b/py-polars/tests/unit/sql/test_joins.py @@ -663,3 +663,26 @@ def test_nested_join(join_clause: str) -> None: "Species": "Human", }, ] + + +def test_join_nulls_19624() -> None: + df1 = pl.DataFrame({"a": [1, 2, None, None]}) + df2 = pl.DataFrame({"a": [1, 1, 2, 2, None], "b": [0, 1, 2, 3, 4]}) + + # left join + result_df = df1.join(df2, how="left", on="a", join_nulls=False, validate="1:m") + expected_df = pl.DataFrame( + {"a": [1, 1, 2, 2, None, None], "b": [0, 1, 2, 3, None, None]} + ) + assert_frame_equal(result_df, expected_df) + result_df = df2.join(df1, how="left", on="a", join_nulls=False, validate="m:1") + expected_df = pl.DataFrame({"a": [1, 1, 2, 2, None], "b": [0, 1, 2, 3, 4]}) + assert_frame_equal(result_df, expected_df) + + # inner join + result_df = df1.join(df2, how="inner", on="a", join_nulls=False, validate="1:m") + expected_df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3]}) + assert_frame_equal(result_df, expected_df) + result_df = df2.join(df1, how="inner", on="a", join_nulls=False, validate="m:1") + expected_df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3]}) + assert_frame_equal(result_df, expected_df) From c4f0cc2774dc8dad238fc3a0835d7680e2a68887 Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Mon, 11 Nov 2024 20:54:32 +1100 Subject: [PATCH 12/20] perf: Improve cloud scan performance (#19728) --- .../src/cloud/polars_object_store.rs | 452 ++++++++++++++++-- .../polars-io/src/file_cache/file_fetcher.rs | 9 +- .../polars-io/src/parquet/read/async_impl.rs | 63 +-- crates/polars-io/src/pl_async.rs | 25 +- crates/polars-io/src/utils/byte_source.rs | 29 +- .../parquet_source/row_group_data_fetch.rs | 51 +- crates/polars-utils/src/mmap.rs | 6 + 7 files changed, 498 insertions(+), 137 deletions(-) diff --git a/crates/polars-io/src/cloud/polars_object_store.rs b/crates/polars-io/src/cloud/polars_object_store.rs index 9738e0cbdbe4..cbb804198c12 100644 --- a/crates/polars-io/src/cloud/polars_object_store.rs +++ b/crates/polars-io/src/cloud/polars_object_store.rs @@ -2,14 +2,16 @@ use std::ops::Range; use std::sync::Arc; use bytes::Bytes; -use futures::StreamExt; +use futures::{StreamExt, TryStreamExt}; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; -use polars_error::{to_compute_err, PolarsResult}; +use polars_core::prelude::{InitHashMaps, PlHashMap}; +use polars_error::{to_compute_err, PolarsError, PolarsResult}; use tokio::io::AsyncWriteExt; use crate::pl_async::{ - self, tune_with_concurrency_budget, with_concurrency_budget, MAX_BUDGET_PER_REQUEST, + self, get_concurrency_limit, get_download_chunk_size, tune_with_concurrency_budget, + with_concurrency_budget, MAX_BUDGET_PER_REQUEST, }; /// Polars specific wrapper for `Arc` that limits the number of @@ -23,63 +25,184 @@ impl PolarsObjectStore { Self(store) } - pub async fn get(&self, path: &Path) -> PolarsResult { - tune_with_concurrency_budget(1, || async { - self.0 - .get(path) - .await - .map_err(to_compute_err)? 
- .bytes() - .await - .map_err(to_compute_err) - }) - .await + /// Returns a buffered stream that downloads concurrently up to the concurrency limit. + fn get_buffered_ranges_stream<'a, T: Iterator>>( + &'a self, + path: &'a Path, + ranges: T, + ) -> impl StreamExt> + + TryStreamExt> + + use<'a, T> { + futures::stream::iter( + ranges + .map(|range| async { self.0.get_range(path, range).await.map_err(to_compute_err) }), + ) + // Add a limit locally as this gets run inside a single `tune_with_concurrency_budget`. + .buffered(get_concurrency_limit() as usize) } pub async fn get_range(&self, path: &Path, range: Range) -> PolarsResult { - tune_with_concurrency_budget(1, || self.0.get_range(path, range)) - .await - .map_err(to_compute_err) + let parts = split_range(range.clone()); + + if parts.len() == 1 { + tune_with_concurrency_budget(1, || self.0.get_range(path, range)) + .await + .map_err(to_compute_err) + } else { + let parts = tune_with_concurrency_budget( + parts.len().clamp(0, MAX_BUDGET_PER_REQUEST) as u32, + || { + self.get_buffered_ranges_stream(path, parts) + .try_collect::>() + }, + ) + .await?; + + let mut combined = Vec::with_capacity(range.len()); + + for part in parts { + combined.extend_from_slice(&part) + } + + assert_eq!(combined.len(), range.len()); + + PolarsResult::Ok(Bytes::from(combined)) + } } - pub async fn get_ranges( + /// Fetch byte ranges into a HashMap keyed by the range start. This will mutably sort the + /// `ranges` slice for coalescing. + /// + /// # Panics + /// Panics if the same range start is used by more than 1 range. + pub async fn get_ranges_sort< + K: TryFrom + std::hash::Hash + Eq, + T: From, + >( &self, path: &Path, - ranges: &[Range], - ) -> PolarsResult> { + ranges: &mut [Range], + ) -> PolarsResult> { + if ranges.is_empty() { + return Ok(Default::default()); + } + + let mut out = PlHashMap::with_capacity(ranges.len()); + + ranges.sort_unstable_by_key(|x| x.start); + + let (merged_ranges, merged_ends): (Vec<_>, Vec<_>) = merge_ranges(ranges).unzip(); + + let mut stream = self.get_buffered_ranges_stream(path, merged_ranges.iter().cloned()); + tune_with_concurrency_budget( - (ranges.len() as u32).clamp(0, MAX_BUDGET_PER_REQUEST as u32), - || self.0.get_ranges(path, ranges), + merged_ranges.len().clamp(0, MAX_BUDGET_PER_REQUEST) as u32, + || async { + let mut len = 0; + let mut current_offset = 0; + let mut ends_iter = merged_ends.iter(); + + let mut splitted_parts = vec![]; + + while let Some(bytes) = stream.try_next().await? { + len += bytes.len(); + let end = *ends_iter.next().unwrap(); + + if end == 0 { + splitted_parts.push(bytes); + continue; + } + + let full_range = ranges[current_offset..end] + .iter() + .cloned() + .reduce(|l, r| l.start.min(r.start)..l.end.max(r.end)) + .unwrap(); + + let bytes = if splitted_parts.is_empty() { + bytes + } else { + let mut out = Vec::with_capacity(full_range.len()); + + for x in splitted_parts.drain(..) 
{ + out.extend_from_slice(&x); + } + + out.extend_from_slice(&bytes); + Bytes::from(out) + }; + + assert_eq!(bytes.len(), full_range.len()); + + for range in &ranges[current_offset..end] { + let v = out.insert( + K::try_from(range.start).unwrap(), + T::from(bytes.slice( + range.start - full_range.start..range.end - full_range.start, + )), + ); + + assert!(v.is_none()); // duplicate range start + } + + current_offset = end; + } + + assert!(splitted_parts.is_empty()); + + PolarsResult::Ok(pl_async::Size::from(len as u64)) + }, ) - .await - .map_err(to_compute_err) + .await?; + + Ok(out) } - pub async fn download( - &self, - path: &Path, - file: &mut F, - ) -> PolarsResult<()> { - tune_with_concurrency_budget(1, || async { - let mut stream = self - .0 - .get(path) - .await - .map_err(to_compute_err)? - .into_stream(); - - let mut len = 0; - while let Some(bytes) = stream.next().await { - let bytes = bytes.map_err(to_compute_err)?; - len += bytes.len(); - file.write_all(bytes.as_ref()) + pub async fn download(&self, path: &Path, file: &mut tokio::fs::File) -> PolarsResult<()> { + let opt_size = self.head(path).await.ok().map(|x| x.size); + let parts = opt_size.map(|x| split_range(0..x)).filter(|x| x.len() > 1); + + if let Some(parts) = parts { + tune_with_concurrency_budget( + parts.len().clamp(0, MAX_BUDGET_PER_REQUEST) as u32, + || async { + let mut stream = self.get_buffered_ranges_stream(path, parts); + let mut len = 0; + while let Some(bytes) = stream.try_next().await? { + len += bytes.len(); + file.write_all(&bytes).await.map_err(to_compute_err)?; + } + + assert_eq!(len, opt_size.unwrap()); + + PolarsResult::Ok(pl_async::Size::from(len as u64)) + }, + ) + .await? + } else { + tune_with_concurrency_budget(1, || async { + let mut stream = self + .0 + .get(path) .await - .map_err(to_compute_err)?; - } + .map_err(to_compute_err)? + .into_stream(); + + let mut len = 0; + while let Some(bytes) = stream.try_next().await? { + len += bytes.len(); + file.write_all(&bytes).await.map_err(to_compute_err)?; + } + + PolarsResult::Ok(pl_async::Size::from(len as u64)) + }) + .await? + }; + + // Dropping is delayed for tokio async files so we need to explicitly + // flush here (https://github.com/tokio-rs/tokio/issues/2307#issuecomment-596336451). + file.sync_all().await.map_err(PolarsError::from)?; - PolarsResult::Ok(pl_async::Size::from(len as u64)) - }) - .await?; Ok(()) } @@ -113,3 +236,238 @@ impl PolarsObjectStore { .map_err(to_compute_err) } } + +/// Splits a single range into multiple smaller ranges, which can be downloaded concurrently for +/// much higher throughput. +fn split_range(range: Range) -> impl ExactSizeIterator> { + let chunk_size = get_download_chunk_size(); + + // Calculate n_parts such that we are as close as possible to the `chunk_size`. + let n_parts = [ + (range.len().div_ceil(chunk_size)).max(1), + (range.len() / chunk_size).max(1), + ] + .into_iter() + .min_by_key(|x| (range.len() / *x).abs_diff(chunk_size)) + .unwrap(); + + let chunk_size = (range.len() / n_parts).max(1); + + assert_eq!(n_parts, (range.len() / chunk_size).max(1)); + let bytes_rem = range.len() % chunk_size; + + (0..n_parts).map(move |part_no| { + let (start, end) = if part_no == 0 { + // Download remainder length in the first chunk since it starts downloading first. 
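+            // Worked example: range 0..10 with chunk_size == 3 gives
+            // n_parts == 3 and bytes_rem == 1, so the parts come out as
+            // 0..4, 4..7 and 7..10 (the remainder lands in part 0).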
+ let end = range.start + chunk_size + bytes_rem; + let end = if end > range.end { range.end } else { end }; + (range.start, end) + } else { + let start = bytes_rem + range.start + part_no * chunk_size; + (start, start + chunk_size) + }; + + start..end + }) +} + +/// Note: For optimal performance, `ranges` should be sorted. More generally, +/// ranges placed next to each other should also be close in range value. +/// +/// # Returns +/// `[(range1, end1), (range2, end2)]`, where: +/// * `range1` contains bytes for the ranges from `ranges[0..end1]` +/// * `range2` contains bytes for the ranges from `ranges[end1..end2]` +/// * etc.. +/// +/// Note that if an end value is 0, it means the range is a splitted part and should be combined. +fn merge_ranges(ranges: &[Range]) -> impl Iterator, usize)> + '_ { + let chunk_size = get_download_chunk_size(); + + let mut current_merged_range = ranges.first().map_or(0..0, Clone::clone); + // Number of fetched bytes excluding excess. + let mut current_n_bytes = 0; + + (0..ranges.len()) + .filter_map(move |current_idx| { + let current_idx = 1 + current_idx; + + if current_idx == ranges.len() { + // No more items - flush current state. + Some((current_merged_range.clone(), current_idx)) + } else { + let range = ranges[current_idx].clone(); + + let new_merged = current_merged_range.start.min(range.start) + ..current_merged_range.end.max(range.end); + + // E.g.: + // |--------| + // oo // range1 + // oo // range2 + // ^^^ // distance = 3, is_overlapping = false + // E.g.: + // |--------| + // ooooo // range1 + // ooooo // range2 + // ^^ // distance = 2, is_overlapping = true + let (distance, is_overlapping) = { + let l = current_merged_range.end.min(range.end); + let r = current_merged_range.start.max(range.start); + + (r.abs_diff(l), r < l) + }; + + #[rustfmt::skip] + let should_merge = + is_overlapping // Always merge if overlapping + || ( + // Don't merge if the result size is not closer to the `chunk_size` + new_merged.len().abs_diff(chunk_size) < current_merged_range.len().abs_diff(chunk_size) + && ( + // Either the gap is less than 1MiB.. + distance <= 1024 * 1024 + || ( + // ..or, the gap is less than 12.5% of the largest between `current_n_bytes` + // and the new `range`, capped at 8MiB. + distance <= current_n_bytes.max(range.len()) / 8 + && distance <= 8 * 1024 * 1024 + ) + ) + ); + + if should_merge { + // Merge to existing range + current_merged_range = new_merged; + current_n_bytes += if is_overlapping { + range.len() - distance + } else { + range.len() + }; + None + } else { + let v = current_merged_range.clone(); + current_merged_range = range; + current_n_bytes = 0; + Some((v, current_idx)) + } + } + }) + .flat_map(|x| { + // Split large individual ranges within the list of ranges. + let (range, end) = x; + let split = split_range(range.clone()); + let len = split.len(); + + split + .enumerate() + .map(move |(i, range)| (range, if 1 + i == len { end } else { 0 })) + }) +} + +#[cfg(test)] +mod tests { + + #[test] + fn test_split_range() { + use super::{get_download_chunk_size, split_range}; + + let chunk_size = get_download_chunk_size(); + + assert_eq!(chunk_size, 64 * 1024 * 1024); + + #[allow(clippy::single_range_in_vec_init)] + { + // Round-trip empty ranges. 
+ assert_eq!(split_range(0..0).collect::>(), [0..0]); + assert_eq!(split_range(3..3).collect::>(), [3..3]); + } + + // Threshold to start splitting to 2 ranges + // + // n - chunk_size == chunk_size - n / 2 + // n + n / 2 == 2 * chunk_size + // 3 * n == 4 * chunk_size + // n = 4 * chunk_size / 3 + let n = 4 * chunk_size / 3; + + #[allow(clippy::single_range_in_vec_init)] + { + assert_eq!(split_range(0..n).collect::>(), [0..89478485]); + } + + assert_eq!( + split_range(0..n + 1).collect::>(), + [0..44739243, 44739243..89478486] + ); + + // Threshold to start splitting to 3 ranges + // + // n / 2 - chunk_size == chunk_size - n / 3 + // n / 2 + n / 3 == 2 * chunk_size + // 5 * n == 12 * chunk_size + // n == 12 * chunk_size / 5 + let n = 12 * chunk_size / 5; + + assert_eq!( + split_range(0..n).collect::>(), + [0..80530637, 80530637..161061273] + ); + + assert_eq!( + split_range(0..n + 1).collect::>(), + [0..53687092, 53687092..107374183, 107374183..161061274] + ); + } + + #[test] + fn test_merge_ranges() { + use super::{get_download_chunk_size, merge_ranges}; + + let chunk_size = get_download_chunk_size(); + + assert_eq!(chunk_size, 64 * 1024 * 1024); + + // Round-trip empty slice + assert_eq!(merge_ranges(&[]).collect::>(), []); + + // We have 1 tiny request followed by 1 huge request. They are combined as it reduces the + // `abs_diff()` to the `chunk_size`, but afterwards they are split to 2 evenly sized + // requests. + assert_eq!( + merge_ranges(&[0..1, 1..127 * 1024 * 1024]).collect::>(), + [(0..66584576, 0), (66584576..133169152, 2)] + ); + + // <= 1MiB gap, merge + assert_eq!( + merge_ranges(&[0..1, 1024 * 1024 + 1..1024 * 1024 + 2]).collect::>(), + [(0..1048578, 2)] + ); + + // > 1MiB gap, do not merge + assert_eq!( + merge_ranges(&[0..1, 1024 * 1024 + 2..1024 * 1024 + 3]).collect::>(), + [(0..1, 1), (1048578..1048579, 2)] + ); + + // <= 12.5% gap, merge + assert_eq!( + merge_ranges(&[0..8, 10..11]).collect::>(), + [(0..11, 2)] + ); + + // <= 12.5% gap relative to RHS, merge + assert_eq!( + merge_ranges(&[0..1, 3..11]).collect::>(), + [(0..11, 2)] + ); + + // Overlapping range, merge + assert_eq!( + merge_ranges(&[0..80 * 1024 * 1024, 10 * 1024 * 1024..70 * 1024 * 1024]) + .collect::>(), + [(0..80 * 1024 * 1024, 2)] + ); + } +} diff --git a/crates/polars-io/src/file_cache/file_fetcher.rs b/crates/polars-io/src/file_cache/file_fetcher.rs index bd16dff7fda4..3d712ba955fc 100644 --- a/crates/polars-io/src/file_cache/file_fetcher.rs +++ b/crates/polars-io/src/file_cache/file_fetcher.rs @@ -116,12 +116,7 @@ impl FileFetcher for CloudFileFetcher { .await .map_err(PolarsError::from)?; - self.object_store.download(&self.cloud_path, file).await?; - // Dropping is delayed for tokio async files so we need to explicitly - // flush here (https://github.com/tokio-rs/tokio/issues/2307#issuecomment-596336451). 
- file.sync_all().await.map_err(PolarsError::from)?; - PolarsResult::Ok(()) - })?; - Ok(()) + self.object_store.download(&self.cloud_path, file).await + }) } } diff --git a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs index da50364855da..053aad67464a 100644 --- a/crates/polars-io/src/parquet/read/async_impl.rs +++ b/crates/polars-io/src/parquet/read/async_impl.rs @@ -21,7 +21,7 @@ use crate::parquet::metadata::FileMetadataRef; use crate::pl_async::get_runtime; use crate::predicates::PhysicalIoExpr; -type DownloadedRowGroup = Vec<(u64, Bytes)>; +type DownloadedRowGroup = PlHashMap; type QueuePayload = (usize, DownloadedRowGroup); type QueueSend = Arc>>; @@ -49,14 +49,8 @@ impl ParquetObjectStore { }) } - async fn get_range(&self, start: usize, length: usize) -> PolarsResult { - self.store - .get_range(&self.path, start..start + length) - .await - } - - async fn get_ranges(&self, ranges: &[Range]) -> PolarsResult> { - self.store.get_ranges(&self.path, ranges).await + async fn get_ranges(&self, ranges: &mut [Range]) -> PolarsResult> { + self.store.get_ranges_sort(&self.path, ranges).await } /// Initialize the length property of the object, unless it has already been fetched. @@ -194,16 +188,10 @@ async fn download_projection( } }); - let result = async_reader.get_ranges(&ranges).await.map(|bytes| { - ( - rg_index, - bytes - .into_iter() - .zip(offsets) - .map(|(bytes, offset)| (offset, bytes)) - .collect::>(), - ) - }); + let result = async_reader + .get_ranges(&mut ranges) + .await + .map(|bytes_map| (rg_index, bytes_map)); sender.send(result).await.is_ok() } @@ -217,33 +205,20 @@ async fn download_row_group( return true; } - let full_byte_range = rg.full_byte_range(); - let full_byte_range = full_byte_range.start as usize..full_byte_range.end as usize; - - let result = async_reader - .get_range( - full_byte_range.start, - full_byte_range.end - full_byte_range.start, + let mut ranges = rg + .byte_ranges_iter() + .map(|x| x.start as usize..x.end as usize) + .collect::>(); + + sender + .send( + async_reader + .get_ranges(&mut ranges) + .await + .map(|bytes_map| (rg_index, bytes_map)), ) .await - .map(|bytes| { - ( - rg_index, - rg.byte_ranges_iter() - .map(|range| { - ( - range.start, - bytes.slice( - range.start as usize - full_byte_range.start - ..range.end as usize - full_byte_range.start, - ), - ) - }) - .collect::(), - ) - }); - - sender.send(result).await.is_ok() + .is_ok() } pub struct FetchRowGroupsFromObjectStore { diff --git a/crates/polars-io/src/pl_async.rs b/crates/polars-io/src/pl_async.rs index cc43a908cda3..4c95c96f7733 100644 --- a/crates/polars-io/src/pl_async.rs +++ b/crates/polars-io/src/pl_async.rs @@ -4,7 +4,7 @@ use std::ops::Deref; use std::sync::atomic::{AtomicBool, AtomicU64, AtomicU8, Ordering}; use once_cell::sync::Lazy; -use polars_core::config::verbose; +use polars_core::config::{self, verbose}; use polars_core::POOL; use tokio::runtime::{Builder, Runtime}; use tokio::sync::Semaphore; @@ -12,6 +12,25 @@ use tokio::sync::Semaphore; static CONCURRENCY_BUDGET: std::sync::OnceLock<(Semaphore, u32)> = std::sync::OnceLock::new(); pub(super) const MAX_BUDGET_PER_REQUEST: usize = 10; +/// Used to determine chunks when splitting large ranges, or combining small +/// ranges. 
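+///
+/// Read from the `POLARS_DOWNLOAD_CHUNK_SIZE` environment variable (a byte
+/// count), defaulting to 64 MiB.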
+pub(super) static DOWNLOAD_CHUNK_SIZE: Lazy = Lazy::new(|| { + let v: usize = std::env::var("POLARS_DOWNLOAD_CHUNK_SIZE") + .as_deref() + .map(|x| x.parse().expect("integer")) + .unwrap_or(64 * 1024 * 1024); + + if config::verbose() { + eprintln!("async download_chunk_size: {}", v) + } + + v +}); + +pub(super) fn get_download_chunk_size() -> usize { + *DOWNLOAD_CHUNK_SIZE +} + pub trait GetSize { fn size(&self) -> u64; } @@ -158,6 +177,10 @@ fn get_semaphore() -> &'static (Semaphore, u32) { }) } +pub(crate) fn get_concurrency_limit() -> u32 { + get_semaphore().1 +} + pub async fn tune_with_concurrency_budget(requested_budget: u32, callable: F) -> Fut::Output where F: FnOnce() -> Fut, diff --git a/crates/polars-io/src/utils/byte_source.rs b/crates/polars-io/src/utils/byte_source.rs index e2dd3e876c2a..af37d32b36da 100644 --- a/crates/polars-io/src/utils/byte_source.rs +++ b/crates/polars-io/src/utils/byte_source.rs @@ -1,6 +1,7 @@ use std::ops::Range; use std::sync::Arc; +use polars_core::prelude::PlHashMap; use polars_error::PolarsResult; use polars_utils::_limit_path_len_io_err; use polars_utils::mmap::MemSlice; @@ -16,7 +17,11 @@ pub trait ByteSource: Send + Sync { /// # Panics /// Panics if `range` is not in bounds. async fn get_range(&self, range: Range) -> PolarsResult; - async fn get_ranges(&self, ranges: &[Range]) -> PolarsResult>; + /// Note: This will mutably sort ranges for coalescing. + async fn get_ranges( + &self, + ranges: &mut [Range], + ) -> PolarsResult>; } /// Byte source backed by a `MemSlice`, which can potentially be memory-mapped. @@ -49,11 +54,14 @@ impl ByteSource for MemSliceByteSource { Ok(out) } - async fn get_ranges(&self, ranges: &[Range]) -> PolarsResult> { + async fn get_ranges( + &self, + ranges: &mut [Range], + ) -> PolarsResult> { Ok(ranges .iter() - .map(|x| self.0.slice(x.clone())) - .collect::>()) + .map(|x| (x.start, self.0.slice(x.clone()))) + .collect()) } } @@ -88,9 +96,11 @@ impl ByteSource for ObjectStoreByteSource { Ok(mem_slice) } - async fn get_ranges(&self, ranges: &[Range]) -> PolarsResult> { - let ranges = self.store.get_ranges(&self.path, ranges).await?; - Ok(ranges.into_iter().map(MemSlice::from_bytes).collect()) + async fn get_ranges( + &self, + ranges: &mut [Range], + ) -> PolarsResult> { + self.store.get_ranges_sort(&self.path, ranges).await } } @@ -130,7 +140,10 @@ impl ByteSource for DynByteSource { } } - async fn get_ranges(&self, ranges: &[Range]) -> PolarsResult> { + async fn get_ranges( + &self, + ranges: &mut [Range], + ) -> PolarsResult> { match self { Self::MemSlice(v) => v.get_ranges(ranges).await, Self::Cloud(v) => v.get_ranges(ranges).await, diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs index 9a87f0f91b7c..bf2e7e60ea6e 100644 --- a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs +++ b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs @@ -1,7 +1,7 @@ use std::future::Future; use std::sync::Arc; -use polars_core::prelude::{ArrowSchema, InitHashMaps, PlHashMap}; +use polars_core::prelude::{ArrowSchema, PlHashMap}; use polars_core::series::IsSorted; use polars_core::utils::operation_exceeded_idxsize_msg; use polars_error::{polars_err, PolarsResult}; @@ -197,46 +197,37 @@ impl RowGroupDataFetcher { mem_slice, } } else if let Some(columns) = projection.as_ref() { - let ranges = get_row_group_byte_ranges_for_projection( + let mut ranges = get_row_group_byte_ranges_for_projection( 
&row_group_metadata, columns.as_ref(), ) .collect::>(); - let bytes = current_byte_source.get_ranges(ranges.as_ref()).await?; + let n_ranges = ranges.len(); - assert_eq!(bytes.len(), ranges.len()); + let bytes_map = current_byte_source.get_ranges(&mut ranges).await?; - let mut bytes_map = PlHashMap::with_capacity(ranges.len()); - - for (range, bytes) in ranges.iter().zip(bytes) { - memory_prefetch_func(bytes.as_ref()); - let v = bytes_map.insert(range.start, bytes); - debug_assert!(v.is_none(), "duplicate range start {}", range.start); - } + assert_eq!(bytes_map.len(), n_ranges); FetchedBytes::BytesMap(bytes_map) } else { - // We have a dedicated code-path for a full projection that performs a - // single range request for the entire row group. During testing this - // provided much higher throughput from cloud than making multiple range - // request with `get_ranges()`. - let full_range = row_group_metadata.full_byte_range(); - let full_range = full_range.start as usize..full_range.end as usize; - - let mem_slice = { - let full_range_2 = full_range.clone(); - task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { - current_byte_source.get_range(full_range_2).await - })) - .await - .unwrap()? - }; + // We still prefer `get_ranges()` over a single `get_range()` for downloading + // the entire row group, as it can have less memory-copying. A single `get_range()` + // would naively concatenate the memory blocks of the entire row group, while + // `get_ranges()` can skip concatenation since the downloaded blocks are + // aligned to the columns. + let mut ranges = row_group_metadata + .byte_ranges_iter() + .map(|x| x.start as usize..x.end as usize) + .collect::>(); - FetchedBytes::MemSlice { - offset: full_range.start, - mem_slice, - } + let n_ranges = ranges.len(); + + let bytes_map = current_byte_source.get_ranges(&mut ranges).await?; + + assert_eq!(bytes_map.len(), n_ranges); + + FetchedBytes::BytesMap(bytes_map) }; PolarsResult::Ok(RowGroupData { diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs index 0ac1a643d93d..ef07714d591f 100644 --- a/crates/polars-utils/src/mmap.rs +++ b/crates/polars-utils/src/mmap.rs @@ -130,6 +130,12 @@ mod private { out } } + + impl From for MemSlice { + fn from(value: bytes::Bytes) -> Self { + Self::from_bytes(value) + } + } } use memmap::MmapOptions; From 62ef918f7fddcfb084536f2a6bc91e15276ca707 Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Mon, 11 Nov 2024 23:28:32 +1100 Subject: [PATCH 13/20] perf: Adjust coalesce for `[, ]` (#19730) --- .../src/cloud/polars_object_store.rs | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/crates/polars-io/src/cloud/polars_object_store.rs b/crates/polars-io/src/cloud/polars_object_store.rs index cbb804198c12..eb65604bd8e4 100644 --- a/crates/polars-io/src/cloud/polars_object_store.rs +++ b/crates/polars-io/src/cloud/polars_object_store.rs @@ -322,8 +322,13 @@ fn merge_ranges(ranges: &[Range]) -> impl Iterator, let should_merge = is_overlapping // Always merge if overlapping || ( - // Don't merge if the result size is not closer to the `chunk_size` - new_merged.len().abs_diff(chunk_size) < current_merged_range.len().abs_diff(chunk_size) + ( + // Either one range is extremely small compared to the other, with a limit of 8MiB.. 
+ range.len().min(current_merged_range.len()) + < (range.len().max(current_merged_range.len()) / 128).min(8 * 1024 * 1024) + // ..or the new size is closer to the chunk_size + || new_merged.len().abs_diff(chunk_size) < current_merged_range.len().abs_diff(chunk_size) + ) && ( // Either the gap is less than 1MiB.. distance <= 1024 * 1024 @@ -439,6 +444,27 @@ mod tests { [(0..66584576, 0), (66584576..133169152, 2)] ); + assert_eq!( + merge_ranges(&[ + 0..1, + 1..128 * 1024 * 1024, + 1 + 128 * 1024 * 1024..2 + 128 * 1024 * 1024, + 2 + 128 * 1024 * 1024..256 * 1024 * 1024 + ]) + .collect::>(), + [ + (0..67108865, 0), + (67108865..134217730, 3), + (134217730..201326593, 0), + (201326593..268435456, 4) + ] + ); + + assert_eq!( + merge_ranges(&[0..1, 1..128 * 1024 * 1024]).collect::>(), + [(0..67108864, 0), (67108864..134217728, 2)] + ); + // <= 1MiB gap, merge assert_eq!( merge_ranges(&[0..1, 1024 * 1024 + 1..1024 * 1024 + 2]).collect::>(), From 4c90458812665189775ae308ac8b8da61b330fee Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 11 Nov 2024 17:56:00 +0100 Subject: [PATCH 14/20] perf: Improve `DataFrame.sort().limit/top_k` performance (#19731) --- .../chunked_array/ops/sort/arg_bottom_k.rs | 5 ++ .../src/chunked_array/ops/sort/arg_sort.rs | 64 +++++++++++++++++-- .../src/chunked_array/ops/sort/categorical.rs | 1 + .../src/chunked_array/ops/sort/mod.rs | 10 +++ .../src/chunked_array/ops/sort/options.rs | 12 ++++ crates/polars-core/src/frame/mod.rs | 7 ++ crates/polars-expr/src/expressions/sortby.rs | 2 + crates/polars-lazy/src/tests/aggregations.rs | 5 ++ crates/polars-lazy/src/tests/queries.rs | 1 + crates/polars-ops/src/chunked_array/top_k.rs | 1 + .../src/frame/join/hash_join/sort_merge.rs | 3 + .../src/executors/sinks/sort/source.rs | 2 + crates/polars-python/src/expr/general.rs | 3 + crates/polars-python/src/functions/lazy.rs | 1 + crates/polars-python/src/lazyframe/general.rs | 2 + crates/polars-python/src/series/general.rs | 1 + py-polars/tests/unit/operations/test_top_k.py | 34 ++++++++++ 17 files changed, 148 insertions(+), 6 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs index 7f257f23f59e..7787ef28076f 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs @@ -39,6 +39,11 @@ pub fn _arg_bottom_k( _broadcast_bools(by_column.len(), &mut sort_options.descending); _broadcast_bools(by_column.len(), &mut sort_options.nulls_last); + // Don't go into row encoding. + if by_column.len() == 1 && sort_options.limit.is_some() && !sort_options.maintain_order { + return Ok(NoNull::new(by_column[0].arg_sort((&*sort_options).into()))); + } + let encoded = _get_rows_encoded( by_column, &sort_options.descending, diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs index ca34d37318a7..4f9a1ff9e9b3 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs @@ -18,7 +18,7 @@ pub(super) fn arg_sort( iters: I, options: SortOptions, null_count: usize, - len: usize, + mut len: usize, ) -> IdxCa where I: IntoIterator, @@ -49,14 +49,46 @@ where vals.extend(iter); } - sort_impl(vals.as_mut_slice(), options); + let vals = if let Some((limit, desc)) = options.limit { + let limit = limit as usize; + // Overwrite output len. 
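+        // Editorial note, not part of the patch: the branch below avoids fully
+        // sorting all values when a limit is known. `select_nth_unstable_by`
+        // partitions the smallest (or largest, for descending) `limit`
+        // elements to the front in average O(n); only that prefix is then
+        // sorted, using `tot_cmp`, polars' total ordering, so floats with
+        // NaNs still compare deterministically.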
+ len = limit; + let out = if limit >= vals.len() { + vals.as_mut_slice() + } else if desc { + let (lower, _el, _upper) = vals + .as_mut_slice() + .select_nth_unstable_by(limit, |a, b| b.1.tot_cmp(&a.1)); + lower + } else { + let (lower, _el, _upper) = vals + .as_mut_slice() + .select_nth_unstable_by(limit, |a, b| a.1.tot_cmp(&b.1)); + lower + }; + + sort_impl(out, options); + out + } else { + sort_impl(vals.as_mut_slice(), options); + vals.as_slice() + }; - let iter = vals.into_iter().map(|(idx, _v)| idx); + let iter = vals.iter().map(|(idx, _v)| idx).copied(); let idx = if nulls_last { let mut idx = Vec::with_capacity(len); idx.extend(iter); - idx.extend(nulls_idx); + + let nulls_idx = if options.limit.is_some() { + &nulls_idx[..len - idx.len()] + } else { + &nulls_idx + }; + idx.extend_from_slice(nulls_idx); idx + } else if options.limit.is_some() { + nulls_idx.extend(iter.take(len - nulls_idx.len())); + nulls_idx } else { let ptr = nulls_idx.as_ptr() as usize; nulls_idx.extend(iter); @@ -90,9 +122,29 @@ where })); } - sort_impl(vals.as_mut_slice(), options); + let vals = if let Some((limit, desc)) = options.limit { + let limit = limit as usize; + let out = if limit >= vals.len() { + vals.as_mut_slice() + } else if desc { + let (lower, _el, _upper) = vals + .as_mut_slice() + .select_nth_unstable_by(limit, |a, b| b.1.tot_cmp(&a.1)); + lower + } else { + let (lower, _el, _upper) = vals + .as_mut_slice() + .select_nth_unstable_by(limit, |a, b| a.1.tot_cmp(&b.1)); + lower + }; + sort_impl(out, options); + out + } else { + sort_impl(vals.as_mut_slice(), options); + vals.as_slice() + }; - let iter = vals.into_iter().map(|(idx, _v)| idx); + let iter = vals.iter().map(|(idx, _v)| idx).copied(); let idx: Vec<_> = iter.collect_trusted(); ChunkedArray::with_chunk(name, IdxArr::from_data_default(Buffer::from(idx), None)) diff --git a/crates/polars-core/src/chunked_array/ops/sort/categorical.rs b/crates/polars-core/src/chunked_array/ops/sort/categorical.rs index 5dd71a7b1eb8..c89e0790f251 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/categorical.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/categorical.rs @@ -53,6 +53,7 @@ impl CategoricalChunked { descending, multithreaded: true, maintain_order: false, + limit: None, }) } diff --git a/crates/polars-core/src/chunked_array/ops/sort/mod.rs b/crates/polars-core/src/chunked_array/ops/sort/mod.rs index 727f2ace15a8..add7e8b696a4 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/mod.rs @@ -335,6 +335,7 @@ impl ChunkSort for StringChunked { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) } @@ -406,6 +407,7 @@ impl ChunkSort for BinaryChunked { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) } @@ -536,6 +538,7 @@ impl ChunkSort for BinaryOffsetChunked { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) } @@ -672,6 +675,7 @@ impl ChunkSort for BooleanChunked { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) } @@ -797,6 +801,7 @@ mod test { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); assert_eq!( Vec::from(&out), @@ -816,6 +821,7 @@ mod test { nulls_last: true, multithreaded: true, maintain_order: false, + limit: None, }); assert_eq!( Vec::from(&out), @@ -925,6 +931,7 @@ mod test { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); let expected = &[None, None, Some("a"), 
Some("b"), Some("c")]; assert_eq!(Vec::from(&out), expected); @@ -934,6 +941,7 @@ mod test { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); let expected = &[None, None, Some("c"), Some("b"), Some("a")]; @@ -944,6 +952,7 @@ mod test { nulls_last: true, multithreaded: true, maintain_order: false, + limit: None, }); let expected = &[Some("a"), Some("b"), Some("c"), None, None]; assert_eq!(Vec::from(&out), expected); @@ -953,6 +962,7 @@ mod test { nulls_last: true, multithreaded: true, maintain_order: false, + limit: None, }); let expected = &[Some("c"), Some("b"), Some("a"), None, None]; assert_eq!(Vec::from(&out), expected); diff --git a/crates/polars-core/src/chunked_array/ops/sort/options.rs b/crates/polars-core/src/chunked_array/ops/sort/options.rs index 046d0b251b04..95bff0b1b47a 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/options.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/options.rs @@ -41,6 +41,10 @@ pub struct SortOptions { /// If true maintain the order of equal elements. /// Default `false`. pub maintain_order: bool, + /// Limit a sort output, this is for optimization purposes and might be ignored. + /// - Len + /// - Descending + pub limit: Option<(IdxSize, bool)>, } /// Sort options for multi-series sorting. @@ -96,6 +100,10 @@ pub struct SortMultipleOptions { pub multithreaded: bool, /// Whether maintain the order of equal elements. Default `false`. pub maintain_order: bool, + /// Limit a sort output, this is for optimization purposes and might be ignored. + /// - Len + /// - Descending + pub limit: Option<(IdxSize, bool)>, } impl Default for SortOptions { @@ -105,6 +113,7 @@ impl Default for SortOptions { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, } } } @@ -116,6 +125,7 @@ impl Default for SortMultipleOptions { nulls_last: vec![false], multithreaded: true, maintain_order: false, + limit: None, } } } @@ -224,6 +234,7 @@ impl From<&SortOptions> for SortMultipleOptions { nulls_last: vec![value.nulls_last], multithreaded: value.multithreaded, maintain_order: value.maintain_order, + limit: value.limit, } } } @@ -235,6 +246,7 @@ impl From<&SortMultipleOptions> for SortOptions { nulls_last: value.nulls_last.first().copied().unwrap_or(false), multithreaded: value.multithreaded, maintain_order: value.maintain_order, + limit: value.limit, } } } diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index e3b969a81756..7e2d7b050dcf 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -1989,6 +1989,12 @@ impl DataFrame { return Ok(out); } if let Some((0, k)) = slice { + let desc = if sort_options.descending.len() == 1 { + sort_options.descending[0] + } else { + false + }; + sort_options.limit = Some((k as IdxSize, desc)); return self.bottom_k_impl(k, by_column, sort_options); } @@ -2012,6 +2018,7 @@ impl DataFrame { nulls_last: sort_options.nulls_last[0], multithreaded: sort_options.multithreaded, maintain_order: sort_options.maintain_order, + limit: sort_options.limit, }; // fast path for a frame with a single series // no need to compute the sort indices and then take by these indices diff --git a/crates/polars-expr/src/expressions/sortby.rs b/crates/polars-expr/src/expressions/sortby.rs index 1624d7c9bcd6..fad081cb49ed 100644 --- a/crates/polars-expr/src/expressions/sortby.rs +++ b/crates/polars-expr/src/expressions/sortby.rs @@ -160,6 +160,7 @@ fn sort_by_groups_multiple_by( nulls_last: nulls_last.to_owned(), 
multithreaded, maintain_order, + limit: None, }; let sorted_idx = groups[0] @@ -180,6 +181,7 @@ fn sort_by_groups_multiple_by( nulls_last: nulls_last.to_owned(), multithreaded, maintain_order, + limit: None, }; let sorted_idx = groups[0] .as_materialized_series() diff --git a/crates/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs index 6b2d8cb05da0..2ab337ef51e9 100644 --- a/crates/polars-lazy/src/tests/aggregations.rs +++ b/crates/polars-lazy/src/tests/aggregations.rs @@ -450,6 +450,7 @@ fn take_aggregations() -> PolarsResult<()> { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) .head(Some(2)), ) @@ -489,6 +490,7 @@ fn test_take_consistency() -> PolarsResult<()> { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) .get(lit(0))]) .collect()?; @@ -507,6 +509,7 @@ fn test_take_consistency() -> PolarsResult<()> { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) .get(lit(0))]) .collect()?; @@ -526,6 +529,7 @@ fn test_take_consistency() -> PolarsResult<()> { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) .get(lit(0)) .alias("1"), @@ -537,6 +541,7 @@ fn test_take_consistency() -> PolarsResult<()> { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) .get(lit(0)), ) diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index 95cbf586be67..0c4e518b5042 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -1666,6 +1666,7 @@ fn test_single_group_result() -> PolarsResult<()> { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) .over([col("a")])]) .collect()?; diff --git a/crates/polars-ops/src/chunked_array/top_k.rs b/crates/polars-ops/src/chunked_array/top_k.rs index 9caf861b6cd9..ef37e267c10f 100644 --- a/crates/polars-ops/src/chunked_array/top_k.rs +++ b/crates/polars-ops/src/chunked_array/top_k.rs @@ -285,6 +285,7 @@ fn top_k_by_impl( nulls_last: vec![true; by.len()], multithreaded, maintain_order: false, + limit: None, }; let idx = _arg_bottom_k(k, by, &mut sort_options)?; diff --git a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs index fce2f2bf6cf0..95cde8387733 100644 --- a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs +++ b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs @@ -225,6 +225,7 @@ pub(crate) fn _sort_or_hash_inner( nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); let s_right = unsafe { s_right.take_unchecked(&sort_idx) }; let ids = par_sorted_merge_inner_no_nulls(s_left, &s_right); @@ -252,6 +253,7 @@ pub(crate) fn _sort_or_hash_inner( nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); let s_left = unsafe { s_left.take_unchecked(&sort_idx) }; let ids = par_sorted_merge_inner_no_nulls(&s_left, s_right); @@ -323,6 +325,7 @@ pub(crate) fn sort_or_hash_left( nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); let s_right = unsafe { s_right.take_unchecked(&sort_idx) }; diff --git a/crates/polars-pipe/src/executors/sinks/sort/source.rs b/crates/polars-pipe/src/executors/sinks/sort/source.rs index 1c1fa2984a0e..6f544e8e6ef1 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/source.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/source.rs @@ -101,6 +101,7 @@ impl SortSource { nulls_last: 
self.nulls_last, multithreaded: true, maintain_order: false, + limit: None, }, ), Some((offset, len)) => { @@ -119,6 +120,7 @@ impl SortSource { nulls_last: self.nulls_last, multithreaded: true, maintain_order: false, + limit: None, }, ); *len = len.saturating_sub(df_len); diff --git a/crates/polars-python/src/expr/general.rs b/crates/polars-python/src/expr/general.rs index 7125388e88cd..fe5fdafdbbb8 100644 --- a/crates/polars-python/src/expr/general.rs +++ b/crates/polars-python/src/expr/general.rs @@ -260,6 +260,7 @@ impl PyExpr { nulls_last, multithreaded: true, maintain_order: false, + limit: None, }) .into() } @@ -272,6 +273,7 @@ impl PyExpr { nulls_last, multithreaded: true, maintain_order: false, + limit: None, }) .into() } @@ -349,6 +351,7 @@ impl PyExpr { nulls_last, multithreaded, maintain_order, + limit: None, }, ) .into() diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index d3ebb376d10f..1c4e738ea69c 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -75,6 +75,7 @@ pub fn arg_sort_by( nulls_last, multithreaded, maintain_order, + limit: None, }, ) .into() diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index fd89884ece82..f9fb740d4cae 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -539,6 +539,7 @@ impl PyLazyFrame { nulls_last: vec![nulls_last], multithreaded, maintain_order, + limit: None, }, ) .into() @@ -561,6 +562,7 @@ impl PyLazyFrame { nulls_last, maintain_order, multithreaded, + limit: None, }, ) .into() diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index b14285e77aa0..7312995d7606 100644 --- a/crates/polars-python/src/series/general.rs +++ b/crates/polars-python/src/series/general.rs @@ -457,6 +457,7 @@ impl PySeries { nulls_last, multithreaded: true, maintain_order: false, + limit: None, }; Ok(self.series.is_sorted(options).map_err(PyPolarsErr::from)?) 
} diff --git a/py-polars/tests/unit/operations/test_top_k.py b/py-polars/tests/unit/operations/test_top_k.py index debb3e729274..866ef88e6e10 100644 --- a/py-polars/tests/unit/operations/test_top_k.py +++ b/py-polars/tests/unit/operations/test_top_k.py @@ -397,3 +397,37 @@ def test_bottom_k_nulls(s: pl.Series, should_sort: bool) -> None: def test_top_k_descending_deprecated() -> None: with pytest.deprecated_call(): pl.col("a").top_k_by("b", descending=True) # type: ignore[call-arg] + + +def test_top_k_df() -> None: + df = pl.LazyFrame({"a": [3, 4, 1, 2, 5]}) + expected = [5, 4, 3] + assert df.sort("a", descending=True).limit(3).collect()["a"].to_list() == expected + assert df.top_k(3, by="a").collect()["a"].to_list() == expected + expected = [1, 2, 3] + assert df.sort("a", descending=False).limit(3).collect()["a"].to_list() == expected + assert df.bottom_k(3, by="a").collect()["a"].to_list() == expected + + df = pl.LazyFrame({"a": [1, None, None, 4, 5]}) + expected2 = [5, 4, 1, None] + assert ( + df.sort("a", descending=True, nulls_last=True).limit(4).collect()["a"].to_list() + == expected2 + ) + assert df.top_k(4, by="a").collect()["a"].to_list() == expected2 + expected2 = [1, 4, 5, None] + assert ( + df.sort("a", descending=False, nulls_last=True) + .limit(4) + .collect()["a"] + .to_list() + == expected2 + ) + assert df.bottom_k(4, by="a").collect()["a"].to_list() == expected2 + + assert df.sort("a", descending=False, nulls_last=False).limit(4).collect()[ + "a" + ].to_list() == [None, None, 1, 4] + assert df.sort("a", descending=True, nulls_last=False).limit(4).collect()[ + "a" + ].to_list() == [None, None, 5, 4] From 4fb7cd1f8903fee336af9a654ade9e2cbcebb6d1 Mon Sep 17 00:00:00 2001 From: Max Muoto Date: Mon, 11 Nov 2024 12:58:31 -0600 Subject: [PATCH 15/20] fix(python): Use `cls` for `to_python` (#19726) --- py-polars/polars/datatypes/classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/datatypes/classes.py b/py-polars/polars/datatypes/classes.py index bb538b4f01e8..64eaf13ea7b4 100644 --- a/py-polars/polars/datatypes/classes.py +++ b/py-polars/polars/datatypes/classes.py @@ -91,7 +91,7 @@ def from_python(cls, py_type: PythonDataType) -> PolarsDataType: # noqa: D102 ... @classmethod - def to_python(self) -> PythonDataType: # noqa: D102 + def to_python(cls) -> PythonDataType: # noqa: D102 ... From 36e5913cac35837067c434f9db5519fa9d2562b9 Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Tue, 12 Nov 2024 19:04:26 +1100 Subject: [PATCH 16/20] perf: Fix cloud download speed regression (#19734) --- .../src/cloud/polars_object_store.rs | 59 ++++--------------- 1 file changed, 12 insertions(+), 47 deletions(-) diff --git a/crates/polars-io/src/cloud/polars_object_store.rs b/crates/polars-io/src/cloud/polars_object_store.rs index eb65604bd8e4..084408e8bc41 100644 --- a/crates/polars-io/src/cloud/polars_object_store.rs +++ b/crates/polars-io/src/cloud/polars_object_store.rs @@ -286,7 +286,7 @@ fn merge_ranges(ranges: &[Range]) -> impl Iterator, let mut current_merged_range = ranges.first().map_or(0..0, Clone::clone); // Number of fetched bytes excluding excess. 
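     // Editorial note, not part of the patch: this commit simplifies the merge
     // criteria introduced in the coalesce patch above. After the rewrite, two
     // ranges coalesce when they overlap, or when (a) merging keeps the total
     // size at least as close to `chunk_size` as before and (b) the gap is at
     // most max(current_n_bytes, next.len()) / 8, clamped to 1MiB..=8MiB. For
     // example, a 3MiB gap after 40MiB of fetched bytes is within tolerance,
     // since 40MiB / 8 = 5MiB.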
- let mut current_n_bytes = 0; + let mut current_n_bytes = current_merged_range.len(); (0..ranges.len()) .filter_map(move |current_idx| { @@ -318,28 +318,14 @@ fn merge_ranges(ranges: &[Range]) -> impl Iterator, (r.abs_diff(l), r < l) }; - #[rustfmt::skip] - let should_merge = - is_overlapping // Always merge if overlapping - || ( - ( - // Either one range is extremely small compared to the other, with a limit of 8MiB.. - range.len().min(current_merged_range.len()) - < (range.len().max(current_merged_range.len()) / 128).min(8 * 1024 * 1024) - // ..or the new size is closer to the chunk_size - || new_merged.len().abs_diff(chunk_size) < current_merged_range.len().abs_diff(chunk_size) - ) - && ( - // Either the gap is less than 1MiB.. - distance <= 1024 * 1024 - || ( - // ..or, the gap is less than 12.5% of the largest between `current_n_bytes` - // and the new `range`, capped at 8MiB. - distance <= current_n_bytes.max(range.len()) / 8 - && distance <= 8 * 1024 * 1024 - ) - ) - ); + let should_merge = is_overlapping || { + let leq_current_len_dist_to_chunk_size = new_merged.len().abs_diff(chunk_size) + <= current_merged_range.len().abs_diff(chunk_size); + let gap_tolerance = + (current_n_bytes.max(range.len()) / 8).clamp(1024 * 1024, 8 * 1024 * 1024); + + leq_current_len_dist_to_chunk_size && distance <= gap_tolerance + }; if should_merge { // Merge to existing range @@ -351,10 +337,10 @@ fn merge_ranges(ranges: &[Range]) -> impl Iterator, }; None } else { - let v = current_merged_range.clone(); + let out = (current_merged_range.clone(), current_idx); current_merged_range = range; - current_n_bytes = 0; - Some((v, current_idx)) + current_n_bytes = current_merged_range.len(); + Some(out) } } }) @@ -444,27 +430,6 @@ mod tests { [(0..66584576, 0), (66584576..133169152, 2)] ); - assert_eq!( - merge_ranges(&[ - 0..1, - 1..128 * 1024 * 1024, - 1 + 128 * 1024 * 1024..2 + 128 * 1024 * 1024, - 2 + 128 * 1024 * 1024..256 * 1024 * 1024 - ]) - .collect::>(), - [ - (0..67108865, 0), - (67108865..134217730, 3), - (134217730..201326593, 0), - (201326593..268435456, 4) - ] - ); - - assert_eq!( - merge_ranges(&[0..1, 1..128 * 1024 * 1024]).collect::>(), - [(0..67108864, 0), (67108864..134217728, 2)] - ); - // <= 1MiB gap, merge assert_eq!( merge_ranges(&[0..1, 1024 * 1024 + 1..1024 * 1024 + 2]).collect::>(), From 017508be195e028e5cde2a83d4de2fb547acee0f Mon Sep 17 00:00:00 2001 From: eitsupi <50911393+eitsupi@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:36:17 +0900 Subject: [PATCH 17/20] feat(rust,python): Implement max/min methods for dtypes (#19494) --- crates/polars-core/src/datatypes/dtype.rs | 46 +++++++++++++++++++ .../src/dsl/function_expr/bounds.rs | 46 ++----------------- crates/polars-python/src/datatypes.rs | 16 ++++++- py-polars/polars/datatypes/classes.py | 39 ++++++++++++++++ py-polars/src/lib.rs | 8 +++- py-polars/tests/unit/test_datatypes.py | 25 ++++++++++ 6 files changed, 136 insertions(+), 44 deletions(-) diff --git a/crates/polars-core/src/datatypes/dtype.rs b/crates/polars-core/src/datatypes/dtype.rs index 30d96649762f..f4c87ce0ad22 100644 --- a/crates/polars-core/src/datatypes/dtype.rs +++ b/crates/polars-core/src/datatypes/dtype.rs @@ -572,6 +572,52 @@ impl DataType { } } + /// Try to get the maximum value for this datatype. 
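+    ///
+    /// Editorial sketch, not part of the patch; a hedged doc example mirroring
+    /// the match arms below (numeric dtypes yield a `Scalar` bound, other
+    /// dtypes error), assuming `Scalar` implements `PartialEq`:
+    ///
+    /// ```ignore
+    /// assert_eq!(DataType::Int32.max().unwrap(), Scalar::from(i32::MAX));
+    /// assert!(DataType::String.max().is_err());
+    /// ```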
+ pub fn max(&self) -> PolarsResult { + use DataType::*; + let v = match self { + #[cfg(feature = "dtype-i8")] + Int8 => Scalar::from(i8::MAX), + #[cfg(feature = "dtype-i16")] + Int16 => Scalar::from(i16::MAX), + Int32 => Scalar::from(i32::MAX), + Int64 => Scalar::from(i64::MAX), + #[cfg(feature = "dtype-u8")] + UInt8 => Scalar::from(u8::MAX), + #[cfg(feature = "dtype-u16")] + UInt16 => Scalar::from(u16::MAX), + UInt32 => Scalar::from(u32::MAX), + UInt64 => Scalar::from(u64::MAX), + Float32 => Scalar::from(f32::INFINITY), + Float64 => Scalar::from(f64::INFINITY), + dt => polars_bail!(ComputeError: "cannot determine upper bound for dtype `{}`", dt), + }; + Ok(v) + } + + /// Try to get the minimum value for this datatype. + pub fn min(&self) -> PolarsResult { + use DataType::*; + let v = match self { + #[cfg(feature = "dtype-i8")] + Int8 => Scalar::from(i8::MIN), + #[cfg(feature = "dtype-i16")] + Int16 => Scalar::from(i16::MIN), + Int32 => Scalar::from(i32::MIN), + Int64 => Scalar::from(i64::MIN), + #[cfg(feature = "dtype-u8")] + UInt8 => Scalar::from(u8::MIN), + #[cfg(feature = "dtype-u16")] + UInt16 => Scalar::from(u16::MIN), + UInt32 => Scalar::from(u32::MIN), + UInt64 => Scalar::from(u64::MIN), + Float32 => Scalar::from(f32::NEG_INFINITY), + Float64 => Scalar::from(f64::NEG_INFINITY), + dt => polars_bail!(ComputeError: "cannot determine lower bound for dtype `{}`", dt), + }; + Ok(v) + } + /// Convert to an Arrow data type. #[inline] pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType { diff --git a/crates/polars-plan/src/dsl/function_expr/bounds.rs b/crates/polars-plan/src/dsl/function_expr/bounds.rs index 77c8a6f3ef5f..ae0f36a0956e 100644 --- a/crates/polars-plan/src/dsl/function_expr/bounds.rs +++ b/crates/polars-plan/src/dsl/function_expr/bounds.rs @@ -2,50 +2,12 @@ use super::*; pub(super) fn upper_bound(s: &Column) -> PolarsResult { let name = s.name().clone(); - use DataType::*; - let s = match s.dtype().to_physical() { - #[cfg(feature = "dtype-i8")] - Int8 => Column::new_scalar(name, Scalar::from(i8::MAX), 1), - #[cfg(feature = "dtype-i16")] - Int16 => Column::new_scalar(name, Scalar::from(i16::MAX), 1), - Int32 => Column::new_scalar(name, Scalar::from(i32::MAX), 1), - Int64 => Column::new_scalar(name, Scalar::from(i64::MAX), 1), - #[cfg(feature = "dtype-u8")] - UInt8 => Column::new_scalar(name, Scalar::from(u8::MAX), 1), - #[cfg(feature = "dtype-u16")] - UInt16 => Column::new_scalar(name, Scalar::from(u16::MAX), 1), - UInt32 => Column::new_scalar(name, Scalar::from(u32::MAX), 1), - UInt64 => Column::new_scalar(name, Scalar::from(u64::MAX), 1), - Float32 => Column::new_scalar(name, Scalar::from(f32::INFINITY), 1), - Float64 => Column::new_scalar(name, Scalar::from(f64::INFINITY), 1), - dt => polars_bail!( - ComputeError: "cannot determine upper bound for dtype `{}`", dt, - ), - }; - Ok(s) + let scalar = s.dtype().to_physical().max()?; + Ok(Column::new_scalar(name, scalar, 1)) } pub(super) fn lower_bound(s: &Column) -> PolarsResult { let name = s.name().clone(); - use DataType::*; - let s = match s.dtype().to_physical() { - #[cfg(feature = "dtype-i8")] - Int8 => Column::new_scalar(name, Scalar::from(i8::MIN), 1), - #[cfg(feature = "dtype-i16")] - Int16 => Column::new_scalar(name, Scalar::from(i16::MIN), 1), - Int32 => Column::new_scalar(name, Scalar::from(i32::MIN), 1), - Int64 => Column::new_scalar(name, Scalar::from(i64::MIN), 1), - #[cfg(feature = "dtype-u8")] - UInt8 => Column::new_scalar(name, Scalar::from(u8::MIN), 1), - #[cfg(feature = "dtype-u16")] - UInt16 
=> Column::new_scalar(name, Scalar::from(u16::MIN), 1), - UInt32 => Column::new_scalar(name, Scalar::from(u32::MIN), 1), - UInt64 => Column::new_scalar(name, Scalar::from(u64::MIN), 1), - Float32 => Column::new_scalar(name, Scalar::from(f32::NEG_INFINITY), 1), - Float64 => Column::new_scalar(name, Scalar::from(f64::NEG_INFINITY), 1), - dt => polars_bail!( - ComputeError: "cannot determine lower bound for dtype `{}`", dt, - ), - }; - Ok(s) + let scalar = s.dtype().to_physical().min()?; + Ok(Column::new_scalar(name, scalar, 1)) } diff --git a/crates/polars-python/src/datatypes.rs b/crates/polars-python/src/datatypes.rs index a31a2301f866..ea7686a29ec6 100644 --- a/crates/polars-python/src/datatypes.rs +++ b/crates/polars-python/src/datatypes.rs @@ -1,10 +1,12 @@ use polars::prelude::*; use polars_core::utils::arrow::array::Utf8ViewArray; +use polars_lazy::dsl; use pyo3::prelude::*; +use crate::error::PyPolarsErr; #[cfg(feature = "object")] use crate::object::OBJECT_NAME; -use crate::Wrap; +use crate::{PyExpr, Wrap}; // Don't change the order of these! #[repr(u8)] @@ -117,3 +119,15 @@ impl<'py> FromPyObject<'py> for PyDataType { Ok(dt.0.into()) } } + +#[pyfunction] +pub fn _get_dtype_max(dt: Wrap) -> PyResult { + let v = dt.0.max().map_err(PyPolarsErr::from)?; + Ok(dsl::lit(v).into()) +} + +#[pyfunction] +pub fn _get_dtype_min(dt: Wrap) -> PyResult { + let v = dt.0.min().map_err(PyPolarsErr::from)?; + Ok(dsl::lit(v).into()) +} diff --git a/py-polars/polars/datatypes/classes.py b/py-polars/polars/datatypes/classes.py index 64eaf13ea7b4..5543f629a620 100644 --- a/py-polars/polars/datatypes/classes.py +++ b/py-polars/polars/datatypes/classes.py @@ -12,6 +12,7 @@ import polars.functions as F with contextlib.suppress(ImportError): # Module not available when building docs + import polars.polars as plr from polars.polars import dtype_str_repr as _dtype_str_repr if TYPE_CHECKING: @@ -238,6 +239,44 @@ def to_python(self) -> PythonDataType: class NumericType(DataType): """Base class for numeric data types.""" + @classmethod + def max(cls) -> pl.Expr: + """ + Return a literal expression representing the maximum value of this data type. + + Examples + -------- + >>> pl.select(pl.Int8.max() == 127) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ bool │ + ╞═════════╡ + │ true │ + └─────────┘ + """ + return pl.Expr._from_pyexpr(plr._get_dtype_max(cls)) + + @classmethod + def min(cls) -> pl.Expr: + """ + Return a literal expression representing the minimum value of this data type. 
+ + Examples + -------- + >>> pl.select(pl.Int8.min() == -128) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ bool │ + ╞═════════╡ + │ true │ + └─────────┘ + """ + return pl.Expr._from_pyexpr(plr._get_dtype_min(cls)) + class IntegerType(NumericType): """Base class for integer data types.""" diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs index 859609828d19..f73577319545 100644 --- a/py-polars/src/lib.rs +++ b/py-polars/src/lib.rs @@ -20,7 +20,7 @@ use polars_python::lazygroupby::PyLazyGroupBy; use polars_python::series::PySeries; #[cfg(feature = "sql")] use polars_python::sql::PySQLContext; -use polars_python::{exceptions, functions}; +use polars_python::{datatypes, exceptions, functions}; use pyo3::prelude::*; use pyo3::{wrap_pyfunction, wrap_pymodule}; @@ -279,6 +279,12 @@ fn polars(py: Python, m: &Bound) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(functions::escape_regex)) .unwrap(); + // Dtype helpers + m.add_wrapped(wrap_pyfunction!(datatypes::_get_dtype_max)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(datatypes::_get_dtype_min)) + .unwrap(); + // Exceptions - Errors m.add( "PolarsError", diff --git a/py-polars/tests/unit/test_datatypes.py b/py-polars/tests/unit/test_datatypes.py index 4d604f2964e9..ed4b8cd1dd61 100644 --- a/py-polars/tests/unit/test_datatypes.py +++ b/py-polars/tests/unit/test_datatypes.py @@ -202,3 +202,28 @@ def test_struct_field_iter() -> None: def test_raise_invalid_namespace() -> None: with pytest.raises(pl.exceptions.InvalidOperationError): pl.select(pl.lit(1.5).str.replace("1", "2")) + + +@pytest.mark.parametrize( + ("dtype", "lower", "upper"), + [ + (pl.Int8, -128, 127), + (pl.UInt8, 0, 255), + (pl.Int16, -32768, 32767), + (pl.UInt16, 0, 65535), + (pl.Int32, -2147483648, 2147483647), + (pl.UInt32, 0, 4294967295), + (pl.Int64, -9223372036854775808, 9223372036854775807), + (pl.UInt64, 0, 18446744073709551615), + (pl.Float32, float("-inf"), float("inf")), + (pl.Float64, float("-inf"), float("inf")), + ], +) +def test_max_min( + dtype: datatypes.IntegerType | datatypes.Float32 | datatypes.Float64, + upper: int | float, + lower: int | float, +) -> None: + df = pl.select(min=dtype.min(), max=dtype.max()) + assert df.to_series(0).item() == lower + assert df.to_series(1).item() == upper From 260e8e2f506fb85dbac4e0a4b043a9dd4f2974cf Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 12 Nov 2024 13:36:50 +0400 Subject: [PATCH 18/20] feat(python): Try to support native SAP HANA driver via `read_database` (#19733) --- py-polars/polars/io/database/_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/io/database/_executor.py b/py-polars/polars/io/database/_executor.py index 1cbaf4679db9..85401d582fe7 100644 --- a/py-polars/polars/io/database/_executor.py +++ b/py-polars/polars/io/database/_executor.py @@ -511,7 +511,7 @@ def execute( result = cursor_execute(query, *positional_options) # note: some cursors execute in-place, some access results via a property - result = self.cursor if result is None else result + result = self.cursor if (result is None or result is True) else result if self.driver_name == "duckdb": result = result.cursor From 7f0b3e00845c013222ac8b1094d693af582258c2 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 12 Nov 2024 11:20:57 +0100 Subject: [PATCH 19/20] Python Polars 1.13.0 (#19737) --- Cargo.lock | 2 +- py-polars/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5176bd831139..edd21ab88098 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -3393,7 +3393,7 @@ dependencies = [ [[package]] name = "py-polars" -version = "1.12.0" +version = "1.13.0" dependencies = [ "jemallocator", "libc", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index fc3e520e5ecc..d17218a3b6cd 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "1.12.0" +version = "1.13.0" edition = "2021" [lib] From 6cbc7c35a4a06bcce2bd46d288b39a86e7c7cf0f Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Tue, 12 Nov 2024 16:58:46 +0100 Subject: [PATCH 20/20] refactor(rust): Add InMemoryJoin to new-streaming engine (#19741) --- Cargo.lock | 1 + crates/polars-stream/Cargo.toml | 1 + .../src/nodes/joins/in_memory.rs | 119 ++++++++++++++++++ crates/polars-stream/src/nodes/joins/mod.rs | 1 + crates/polars-stream/src/nodes/mod.rs | 1 + crates/polars-stream/src/physical_plan/fmt.rs | 36 ++++-- .../src/physical_plan/lower_ir.rs | 24 +++- crates/polars-stream/src/physical_plan/mod.rs | 25 ++++ .../src/physical_plan/to_graph.rs | 56 +++++++++ .../tests/unit/operations/test_is_sorted.py | 10 +- 10 files changed, 261 insertions(+), 13 deletions(-) create mode 100644 crates/polars-stream/src/nodes/joins/in_memory.rs create mode 100644 crates/polars-stream/src/nodes/joins/mod.rs diff --git a/Cargo.lock b/Cargo.lock index edd21ab88098..f79742adafb7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3257,6 +3257,7 @@ dependencies = [ "polars-expr", "polars-io", "polars-mem-engine", + "polars-ops", "polars-parquet", "polars-plan", "polars-utils", diff --git a/crates/polars-stream/Cargo.toml b/crates/polars-stream/Cargo.toml index c40f477ff741..f0b3b1c30e35 100644 --- a/crates/polars-stream/Cargo.toml +++ b/crates/polars-stream/Cargo.toml @@ -28,6 +28,7 @@ polars-core = { workspace = true } polars-error = { workspace = true } polars-expr = { workspace = true } polars-mem-engine = { workspace = true } +polars-ops = { workspace = true } polars-parquet = { workspace = true } polars-plan = { workspace = true } diff --git a/crates/polars-stream/src/nodes/joins/in_memory.rs b/crates/polars-stream/src/nodes/joins/in_memory.rs new file mode 100644 index 000000000000..a98c23a435b0 --- /dev/null +++ b/crates/polars-stream/src/nodes/joins/in_memory.rs @@ -0,0 +1,119 @@ +use std::sync::Arc; + +use polars_core::schema::Schema; + +use crate::nodes::compute_node_prelude::*; +use crate::nodes::in_memory_sink::InMemorySinkNode; +use crate::nodes::in_memory_source::InMemorySourceNode; + +enum InMemoryJoinState { + Sink { + left: InMemorySinkNode, + right: InMemorySinkNode, + }, + Source(InMemorySourceNode), + Done, +} + +pub struct InMemoryJoinNode { + state: InMemoryJoinState, + num_pipelines: usize, + joiner: Arc PolarsResult + Send + Sync>, +} + +impl InMemoryJoinNode { + pub fn new( + left_input_schema: Arc, + right_input_schema: Arc, + joiner: Arc PolarsResult + Send + Sync>, + ) -> Self { + Self { + state: InMemoryJoinState::Sink { + left: InMemorySinkNode::new(left_input_schema), + right: InMemorySinkNode::new(right_input_schema), + }, + num_pipelines: 0, + joiner, + } + } +} + +impl ComputeNode for InMemoryJoinNode { + fn name(&self) -> &str { + "in_memory_join" + } + + fn initialize(&mut self, num_pipelines: usize) { + self.num_pipelines = num_pipelines; + } + + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { + assert!(recv.len() == 2 && send.len() == 1); + + // If the output doesn't want any more data, transition to being done. 
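+        // Editorial note, not part of the patch: this method drives a three-
+        // state machine. The node starts as Sink { left, right }, buffering
+        // both inputs in `InMemorySinkNode`s; once both receive ports report
+        // Done, the buffered frames are joined via the `joiner` closure and
+        // the node becomes a Source that streams the result out; Done ends it.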
+ if send[0] == PortState::Done && !matches!(self.state, InMemoryJoinState::Done) { + self.state = InMemoryJoinState::Done; + } + + // If the input is done, transition to being a source. + if let InMemoryJoinState::Sink { left, right } = &mut self.state { + if recv[0] == PortState::Done && recv[1] == PortState::Done { + let left_df = left.get_output()?.unwrap(); + let right_df = right.get_output()?.unwrap(); + let mut source_node = + InMemorySourceNode::new(Arc::new((self.joiner)(left_df, right_df)?)); + source_node.initialize(self.num_pipelines); + self.state = InMemoryJoinState::Source(source_node); + } + } + + match &mut self.state { + InMemoryJoinState::Sink { left, right, .. } => { + left.update_state(&mut recv[0..1], &mut [])?; + right.update_state(&mut recv[1..2], &mut [])?; + send[0] = PortState::Blocked; + }, + InMemoryJoinState::Source(source_node) => { + recv[0] = PortState::Done; + recv[1] = PortState::Done; + source_node.update_state(&mut [], send)?; + }, + InMemoryJoinState::Done => { + recv[0] = PortState::Done; + recv[1] = PortState::Done; + send[0] = PortState::Done; + }, + } + Ok(()) + } + + fn is_memory_intensive_pipeline_blocker(&self) -> bool { + matches!(self.state, InMemoryJoinState::Sink { .. }) + } + + fn spawn<'env, 's>( + &'env mut self, + scope: &'s TaskScope<'s, 'env>, + recv_ports: &mut [Option>], + send_ports: &mut [Option>], + state: &'s ExecutionState, + join_handles: &mut Vec>>, + ) { + assert!(recv_ports.len() == 2); + assert!(send_ports.len() == 1); + match &mut self.state { + InMemoryJoinState::Sink { left, right, .. } => { + if recv_ports[0].is_some() { + left.spawn(scope, &mut recv_ports[0..1], &mut [], state, join_handles); + } + if recv_ports[1].is_some() { + right.spawn(scope, &mut recv_ports[1..2], &mut [], state, join_handles); + } + }, + InMemoryJoinState::Source(source) => { + source.spawn(scope, &mut [], send_ports, state, join_handles) + }, + InMemoryJoinState::Done => unreachable!(), + } + } +} diff --git a/crates/polars-stream/src/nodes/joins/mod.rs b/crates/polars-stream/src/nodes/joins/mod.rs new file mode 100644 index 000000000000..fa2e12699f5e --- /dev/null +++ b/crates/polars-stream/src/nodes/joins/mod.rs @@ -0,0 +1 @@ +pub mod in_memory; diff --git a/crates/polars-stream/src/nodes/mod.rs b/crates/polars-stream/src/nodes/mod.rs index 4fb42daddd6b..936c0ceb3ada 100644 --- a/crates/polars-stream/src/nodes/mod.rs +++ b/crates/polars-stream/src/nodes/mod.rs @@ -5,6 +5,7 @@ pub mod in_memory_sink; pub mod in_memory_source; pub mod input_independent_select; pub mod io_sinks; +pub mod joins; pub mod map; pub mod multiplexer; pub mod ordered_union; diff --git a/crates/polars-stream/src/physical_plan/fmt.rs b/crates/polars-stream/src/physical_plan/fmt.rs index e0735144da79..7ef74d5b0ad9 100644 --- a/crates/polars-stream/src/physical_plan/fmt.rs +++ b/crates/polars-stream/src/physical_plan/fmt.rs @@ -200,16 +200,34 @@ fn visualize_plan_rec( (out, &[][..]) }, - PhysNodeKind::GroupBy { input, key, aggs } => { - let label = "group-by"; - ( - format!( - "{label}\\nkey:\\n{}\\naggs:\\n{}", - fmt_exprs(key, expr_arena), - fmt_exprs(aggs, expr_arena) - ), - from_ref(input), + PhysNodeKind::GroupBy { input, key, aggs } => ( + format!( + "group-by\\nkey:\\n{}\\naggs:\\n{}", + fmt_exprs(key, expr_arena), + fmt_exprs(aggs, expr_arena) + ), + from_ref(input), + ), + PhysNodeKind::InMemoryJoin { + input_left, + input_right, + left_on, + right_on, + args, + } => { + let mut label = "in-memory-join".to_string(); + write!(label, r"\nleft_on:\n{}", 
fmt_exprs(left_on, expr_arena)).unwrap(); + write!(label, r"\nright_on:\n{}", fmt_exprs(right_on, expr_arena)).unwrap(); + write!( + label, + r"\nhow: {}", + escape_graphviz(&format!("{:?}", args.how)) ) + .unwrap(); + if args.join_nulls { + write!(label, r"\njoin-nulls").unwrap(); + } + (label, &[*input_left, *input_right][..]) }, }; diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index d57a8667c479..95e3ae72224d 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -415,7 +415,29 @@ pub fn lower_ir( } return Ok(node); }, - IR::Join { .. } => todo!(), + IR::Join { + input_left, + input_right, + schema: _, + left_on, + right_on, + options, + } => { + let input_left = *input_left; + let input_right = *input_right; + let left_on = left_on.clone(); + let right_on = right_on.clone(); + let args = options.args.clone(); + let phys_left = lower_ir!(input_left)?; + let phys_right = lower_ir!(input_right)?; + PhysNodeKind::InMemoryJoin { + input_left: phys_left, + input_right: phys_right, + left_on, + right_on, + args, + } + }, IR::Distinct { .. } => todo!(), IR::ExtContext { .. } => todo!(), IR::Invalid => unreachable!(), diff --git a/crates/polars-stream/src/physical_plan/mod.rs b/crates/polars-stream/src/physical_plan/mod.rs index 3b4643100249..707c2a53dec2 100644 --- a/crates/polars-stream/src/physical_plan/mod.rs +++ b/crates/polars-stream/src/physical_plan/mod.rs @@ -5,6 +5,7 @@ use polars_core::frame::DataFrame; use polars_core::prelude::{IdxSize, InitHashMaps, PlHashMap, SortMultipleOptions}; use polars_core::schema::{Schema, SchemaRef}; use polars_error::PolarsResult; +use polars_ops::frame::JoinArgs; use polars_plan::plans::hive::HivePartitions; use polars_plan::plans::{AExpr, DataFrameUdf, FileInfo, FileScan, ScanSources, IR}; use polars_plan::prelude::expr_ir::ExprIR; @@ -100,6 +101,9 @@ pub enum PhysNodeKind { input: PhysNodeKey, }, + /// Generic fallback for (as-of-yet) unsupported streaming mappings. + /// Fully sinks all data to an in-memory data frame and uses the in-memory + /// engine to perform the map. InMemoryMap { input: PhysNodeKey, map: Arc, @@ -149,6 +153,17 @@ pub enum PhysNodeKind { key: Vec, aggs: Vec, }, + + /// Generic fallback for (as-of-yet) unsupported streaming joins. + /// Fully sinks all data to in-memory data frames and uses the in-memory + /// engine to perform the join. + InMemoryJoin { + input_left: PhysNodeKey, + input_right: PhysNodeKey, + left_on: Vec, + right_on: Vec, + args: JoinArgs, + }, } #[recursive::recursive] @@ -198,6 +213,16 @@ fn insert_multiplexers( insert_multiplexers(*input, phys_sm, referenced); }, + PhysNodeKind::InMemoryJoin { + input_left, + input_right, + .. + } => { + let input_right = *input_right; + insert_multiplexers(*input_left, phys_sm, referenced); + insert_multiplexers(input_right, phys_sm, referenced); + }, + PhysNodeKind::OrderedUnion { inputs } | PhysNodeKind::Zip { inputs, .. 
} => { for input in inputs.clone() { insert_multiplexers(input, phys_sm, referenced); diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index 472cf982a253..befa9c3a93b9 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -8,6 +8,7 @@ use polars_expr::planner::{create_physical_expr, get_expr_depth_limit, Expressio use polars_expr::reduce::into_reduction; use polars_expr::state::ExecutionState; use polars_mem_engine::create_physical_plan; +use polars_plan::dsl::JoinOptions; use polars_plan::global::_set_n_rows_for_scan; use polars_plan::plans::expr_ir::ExprIR; use polars_plan::plans::{AExpr, ArenaExprIter, Context, IR}; @@ -410,6 +411,61 @@ fn to_graph_rec<'a>( [input_key], ) }, + + InMemoryJoin { + input_left, + input_right, + left_on, + right_on, + args, + } => { + let left_input_key = to_graph_rec(*input_left, ctx)?; + let right_input_key = to_graph_rec(*input_right, ctx)?; + let left_input_schema = ctx.phys_sm[*input_left].output_schema.clone(); + let right_input_schema = ctx.phys_sm[*input_right].output_schema.clone(); + + let mut lp_arena = Arena::default(); + let left_lmdf = Arc::new(LateMaterializedDataFrame::default()); + let right_lmdf = Arc::new(LateMaterializedDataFrame::default()); + + let left_node = lp_arena.add(left_lmdf.clone().as_ir_node(left_input_schema.clone())); + let right_node = + lp_arena.add(right_lmdf.clone().as_ir_node(right_input_schema.clone())); + let join_node = lp_arena.add(IR::Join { + input_left: left_node, + input_right: right_node, + schema: node.output_schema.clone(), + left_on: left_on.clone(), + right_on: right_on.clone(), + options: Arc::new(JoinOptions { + allow_parallel: true, + force_parallel: false, + args: args.clone(), + rows_left: (None, 0), + rows_right: (None, 0), + }), + }); + + let executor = Mutex::new(create_physical_plan( + join_node, + &mut lp_arena, + ctx.expr_arena, + )?); + + ctx.graph.add_node( + nodes::joins::in_memory::InMemoryJoinNode::new( + left_input_schema, + right_input_schema, + Arc::new(move |left, right| { + left_lmdf.set_materialized_dataframe(left); + right_lmdf.set_materialized_dataframe(right); + let mut state = ExecutionState::new(); + executor.lock().execute(&mut state) + }), + ), + [left_input_key, right_input_key], + ) + }, }; ctx.phys_to_graph.insert(phys_node_key, graph_key); diff --git a/py-polars/tests/unit/operations/test_is_sorted.py b/py-polars/tests/unit/operations/test_is_sorted.py index f81076ced502..093dae47bfbf 100644 --- a/py-polars/tests/unit/operations/test_is_sorted.py +++ b/py-polars/tests/unit/operations/test_is_sorted.py @@ -384,12 +384,16 @@ def test_with_pd( test_with_pd(dfbpd, dfapd, "b", "left", joined) joined = dfb.join(dfa, on="b", how="inner") - assert not joined["a"].flags["SORTED_ASC"] + if (joined["a"] != sorted(joined["a"])).any(): + assert not joined["a"].flags["SORTED_ASC"] joined = dfb.join(dfa, on="b", how="semi") - assert not joined["a"].flags["SORTED_ASC"] + if (joined["a"] != sorted(joined["a"])).any(): + assert not joined["a"].flags["SORTED_ASC"] + joined = dfb.join(dfa, on="b", how="anti") - assert not joined["a"].flags["SORTED_ASC"] + if (joined["a"] != sorted(joined["a"])).any(): + assert not joined["a"].flags["SORTED_ASC"] def test_sorted_flag_group_by_dynamic() -> None:
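Editorial footnote on the InMemoryJoin fallback above: once both sides are
materialized, the `joiner` closure effectively delegates to the existing
in-memory join. A minimal hedged sketch of that idea against the public polars
API follows; the column name `key`, the join type, and the exact `join`
signature are assumptions, not taken from this patch.

```rust
use polars::prelude::*;

// Hedged sketch: conceptually what the fallback computes after both input
// pipelines have been sunk to DataFrames. The real wiring goes through the
// IR plan and `create_physical_plan`, as shown in `to_graph.rs` above.
fn fallback_join(left: DataFrame, right: DataFrame) -> PolarsResult<DataFrame> {
    left.join(&right, ["key"], ["key"], JoinArgs::new(JoinType::Inner))
}
```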