Commit

docs(python): More accurate and helpful docs for user defined functions (pola-rs#15194)

Co-authored-by: Itamar Turner-Trauring <[email protected]>
Co-authored-by: Marco Gorelli <[email protected]>
3 people authored and alexander-beedie committed Jun 26, 2024
1 parent 94ec792 commit 7e93d79
Showing 10 changed files with 243 additions and 218 deletions.
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -5,6 +5,7 @@ matplotlib
 seaborn
 plotly
 altair
+numba
 # Unpin NumPy when support is implemented in numpy crate:
 # https://github.com/pola-rs/polars/issues/16998
 numpy<2
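Note: `numba` is presumably added here because the new `diff_from_mean_numba` example in `docs/src/python/user-guide/expressions/user-defined-functions.py` (below) imports it, and the documentation build executes these Python snippets (see the `exec="on"` fence added in `structs.md` further down).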
12 changes: 12 additions & 0 deletions docs/src/python/user-guide/expressions/structs.py
@@ -64,3 +64,15 @@
 ).filter(pl.struct("Movie", "Theatre").is_duplicated())
 print(out)
 # --8<-- [end:struct_ranking]
+
+# --8<-- [start:multi_column_apply]
+df = pl.DataFrame({"keys": ["a", "a", "b"], "values": [10, 7, 1]})
+
+out = df.select(
+    pl.struct(["keys", "values"])
+    .map_elements(lambda x: len(x["keys"]) + x["values"])
+    .alias("solution_map_elements"),
+    (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
+)
+print(out)
+# --8<-- [end:multi_column_apply]
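As a quick illustration of what this new snippet computes (not part of the diff — a minimal sketch, with the expected values worked out by hand from the inputs above):

```python
import polars as pl

df = pl.DataFrame({"keys": ["a", "a", "b"], "values": [10, 7, 1]})

# Each struct row reaches the UDF as a Python dict, e.g. {"keys": "a", "values": 10},
# so len(x["keys"]) + x["values"] yields 11, 8 and 2 for the three rows.
out = df.select(
    pl.struct(["keys", "values"])
    .map_elements(lambda x: len(x["keys"]) + x["values"], return_dtype=pl.Int64)
    .alias("solution_map_elements")
)
print(out)  # expected values: 11, 8, 2
```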
117 changes: 81 additions & 36 deletions docs/src/python/user-guide/expressions/user-defined-functions.py
@@ -7,59 +7,104 @@
 # --8<-- [start:dataframe]
 df = pl.DataFrame(
     {
-        "keys": ["a", "a", "b"],
-        "values": [10, 7, 1],
+        "keys": ["a", "a", "b", "b"],
+        "values": [10, 7, 1, 23],
     }
 )
 print(df)
 # --8<-- [end:dataframe]
 
-# --8<-- [start:shift_map_batches]
-out = df.group_by("keys", maintain_order=True).agg(
-    pl.col("values")
-    .map_batches(lambda s: s.shift(), is_elementwise=True)
-    .alias("shift_map_batches"),
-    pl.col("values").shift().alias("shift_expression"),
-)
+# --8<-- [start:individual_log]
+import math
+
+
+def my_log(value):
+    return math.log(value)
+
+
+out = df.select(pl.col("values").map_elements(my_log, return_dtype=pl.Float64))
 print(out)
-# --8<-- [end:shift_map_batches]
+# --8<-- [end:individual_log]
 
 
-# --8<-- [start:map_elements]
-out = df.group_by("keys", maintain_order=True).agg(
-    pl.col("values")
-    .map_elements(lambda s: s.shift(), return_dtype=pl.List(int))
-    .alias("shift_map_elements"),
-    pl.col("values").shift().alias("shift_expression"),
-)
+# --8<-- [start:diff_from_mean]
+def diff_from_mean(series):
+    # This will be very slow for non-trivial Series, since it's all Python
+    # code:
+    total = 0
+    for value in series:
+        total += value
+    mean = total / len(series)
+    return pl.Series([value - mean for value in series])
+
+
+# Apply our custom function to a full Series with map_batches():
+out = df.select(pl.col("values").map_batches(diff_from_mean))
+print("== select() with UDF ==")
+print(out)
+
+# Apply our custom function per group:
+print("== group_by() with UDF ==")
+out = df.group_by("keys").agg(pl.col("values").map_batches(diff_from_mean))
 print(out)
-# --8<-- [end:map_elements]
+# --8<-- [end:diff_from_mean]
 
-# --8<-- [start:counter]
-counter = 0
+# --8<-- [start:np_log]
+import numpy as np
 
+out = df.select(pl.col("values").map_batches(np.log))
+print(out)
+# --8<-- [end:np_log]
 
-def add_counter(val: int) -> int:
-    global counter
-    counter += 1
-    return counter + val
+# --8<-- [start:diff_from_mean_numba]
+from numba import guvectorize, int64, float64
 
 
-out = df.select(
-    pl.col("values")
-    .map_elements(add_counter, return_dtype=pl.Int64)
-    .alias("solution_map_elements"),
-    (pl.col("values") + pl.int_range(1, pl.len() + 1)).alias("solution_expr"),
-)
+# This will be compiled to machine code, so it will be fast. The Series is
+# converted to a NumPy array before being passed to the function. See the
+# Numba documentation for more details:
+# https://numba.readthedocs.io/en/stable/user/vectorize.html
+@guvectorize([(int64[:], float64[:])], "(n)->(n)")
+def diff_from_mean_numba(arr, result):
+    total = 0
+    for value in arr:
+        total += value
+    mean = total / len(arr)
+    for i, value in enumerate(arr):
+        result[i] = value - mean
+
+
+out = df.select(pl.col("values").map_batches(diff_from_mean_numba))
+print("== select() with UDF ==")
+print(out)
+
+out = df.group_by("keys").agg(pl.col("values").map_batches(diff_from_mean_numba))
+print("== group_by() with UDF ==")
 print(out)
-# --8<-- [end:counter]
+# --8<-- [end:diff_from_mean_numba]
 
 
 # --8<-- [start:combine]
-out = df.select(
-    pl.struct("keys", "values")
-    .map_elements(lambda x: len(x["keys"]) + x["values"], return_dtype=pl.Int64)
-    .alias("solution_map_elements"),
-    (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
+# Add two arrays together:
+@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
+def add(arr, arr2, result):
+    for i in range(len(arr)):
+        result[i] = arr[i] + arr2[i]
+
+
+df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})
+
+out = df3.select(
+    # Create a struct that has two columns in it:
+    pl.struct(["values1", "values2"])
+    # Pass the struct to a lambda that then passes the individual columns to
+    # the add() function:
+    .map_batches(
+        lambda combined: add(
+            combined.struct.field("values1"), combined.struct.field("values2")
+        )
+    )
+    .alias("add_columns")
 )
 print(out)
 # --8<-- [end:combine]
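An aside on the `combine` example (not part of the diff): for an operation as simple as `add`, the same result is available from native Polars expressions, with no struct packing or Numba required — a minimal sketch:

```python
import polars as pl

df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})

# Native expression arithmetic runs in compiled Rust code and handles
# missing values for free; UDFs are only needed when no built-in exists.
out = df3.select((pl.col("values1") + pl.col("values2")).alias("add_columns"))
print(out)  # expected values: 11, 22, 33
```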
49 changes: 49 additions & 0 deletions docs/src/rust/user-guide/expressions/structs.rs
@@ -95,5 +95,54 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     println!("{}", &out);
     // --8<-- [end:struct_ranking]
 
+    // --8<-- [start:multi_column_apply]
+    let df = df!(
+        "keys" => &["a", "a", "b"],
+        "values" => &[10, 7, 1],
+    )?;
+
+    let out = df
+        .lazy()
+        .select([
+            // pack to struct to get access to multiple fields in a custom `apply/map`
+            as_struct(vec![col("keys"), col("values")])
+                // we will compute the len(a) + b
+                .apply(
+                    |s| {
+                        // downcast to struct
+                        let ca = s.struct_()?;
+
+                        // get the fields as Series
+                        let s_a = &ca.fields()[0];
+                        let s_b = &ca.fields()[1];
+
+                        // downcast the `Series` to their known type
+                        let ca_a = s_a.str()?;
+                        let ca_b = s_b.i32()?;
+
+                        // iterate both `ChunkedArrays`
+                        let out: Int32Chunked = ca_a
+                            .into_iter()
+                            .zip(ca_b)
+                            .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
+                                (Some(a), Some(b)) => Some(a.len() as i32 + b),
+                                _ => None,
+                            })
+                            .collect();
+
+                        Ok(Some(out.into_series()))
+                    },
+                    GetOutput::from_type(DataType::Int32),
+                )
+                // note: the `'solution_map_elements'` alias is just there to show how you
+                // get the same output as in the Python API example.
+                .alias("solution_map_elements"),
+            (col("keys").str().count_matches(lit("."), true) + col("values"))
+                .alias("solution_expr"),
+        ])
+        .collect()?;
+    println!("{}", out);
+
+    // --8<-- [end:multi_column_apply]
     Ok(())
 }
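Note: this `multi_column_apply` block appears to be the same struct-based example that is deleted from `user-defined-functions.rs` in the next file; it has simply moved so the _Structs_ page can reference it directly.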
88 changes: 10 additions & 78 deletions docs/src/rust/user-guide/expressions/user-defined-functions.rs
@@ -3,93 +3,25 @@ use polars::prelude::*;
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     // --8<-- [start:dataframe]
     let df = df!(
-        "keys" => &["a", "a", "b"],
-        "values" => &[10, 7, 1],
+        "keys" => &["a", "a", "b", "b"],
+        "values" => &[10, 7, 1, 23],
     )?;
     println!("{}", df);
     // --8<-- [end:dataframe]
 
-    // --8<-- [start:shift_map_batches]
-    let out = df
-        .clone()
-        .lazy()
-        .group_by(["keys"])
-        .agg([
-            col("values")
-                .map(|s| Ok(Some(s.shift(1))), GetOutput::default())
-                // note: the `'shift_map_batches'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("shift_map_batches"),
-            col("values").shift(lit(1)).alias("shift_expression"),
-        ])
-        .collect()?;
+    // --8<-- [start:individual_log]
+    // --8<-- [end:individual_log]
 
-    println!("{}", out);
-    // --8<-- [end:shift_map_batches]
+    // --8<-- [start:diff_from_mean]
+    // --8<-- [end:diff_from_mean]
 
-    // --8<-- [start:map_elements]
-    let out = df
-        .clone()
-        .lazy()
-        .group_by([col("keys")])
-        .agg([
-            col("values")
-                .apply(|s| Ok(Some(s.shift(1))), GetOutput::default())
-                // note: the `'shift_map_elements'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("shift_map_elements"),
-            col("values").shift(lit(1)).alias("shift_expression"),
-        ])
-        .collect()?;
-    println!("{}", out);
-    // --8<-- [end:map_elements]
+    // --8<-- [start:np_log]
+    // --8<-- [end:np_log]
 
-    // --8<-- [start:counter]
-
-    // --8<-- [end:counter]
+    // --8<-- [start:diff_from_mean_numba]
+    // --8<-- [end:diff_from_mean_numba]
 
-    // --8<-- [start:combine]
-    let out = df
-        .lazy()
-        .select([
-            // pack to struct to get access to multiple fields in a custom `apply/map`
-            as_struct(vec![col("keys"), col("values")])
-                // we will compute the len(a) + b
-                .apply(
-                    |s| {
-                        // downcast to struct
-                        let ca = s.struct_()?;
-
-                        // get the fields as Series
-                        let s_a = &ca.fields()[0];
-                        let s_b = &ca.fields()[1];
-
-                        // downcast the `Series` to their known type
-                        let ca_a = s_a.str()?;
-                        let ca_b = s_b.i32()?;
-
-                        // iterate both `ChunkedArrays`
-                        let out: Int32Chunked = ca_a
-                            .into_iter()
-                            .zip(ca_b)
-                            .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
-                                (Some(a), Some(b)) => Some(a.len() as i32 + b),
-                                _ => None,
-                            })
-                            .collect();
-
-                        Ok(Some(out.into_series()))
-                    },
-                    GetOutput::from_type(DataType::Int32),
-                )
-                // note: the `'solution_map_elements'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("solution_map_elements"),
-            (col("keys").str().count_matches(lit("."), true) + col("values"))
-                .alias("solution_expr"),
-        ])
-        .collect()?;
-    println!("{}", out);
-    // --8<-- [end:combine]
     Ok(())
 }
4 changes: 2 additions & 2 deletions docs/user-guide/expressions/numpy.md
@@ -15,8 +15,8 @@ This means that if a function is not provided by Polars, we can use NumPy and we
 
 ### Interoperability
 
-Polars `Series` have support for NumPy universal functions (ufuncs). Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead.
+Polars `Series` have support for NumPy universal functions (ufuncs) and generalized ufuncs. Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead.
 
-However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results.
+However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results, so an error will be raised if you pass a `Series` with missing data to a generalized ufunc.
 
 Convert a Polars `Series` to a NumPy array with the `.to_numpy()` method. Missing values will be replaced by `np.nan` during the conversion.
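To make the ufunc and missing-value behavior described above concrete, a minimal sketch (not part of the diff):

```python
import numpy as np
import polars as pl

s = pl.Series("values", [1.0, 2.0, 3.0])
print(np.log(s))  # ufunc applied element-wise; the result is again a Polars Series

s_with_null = pl.Series("values", [1.0, None, 3.0])
print(s_with_null.to_numpy())  # the missing value becomes np.nan: [ 1. nan  3.]
```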
9 changes: 8 additions & 1 deletion docs/user-guide/expressions/structs.md
@@ -96,4 +96,11 @@ That's a pretty complex set of requirements done very elegantly in Polars!
 
 ### Using multi-column apply
 
-This was discussed in the previous section on _User Defined Functions_.
+This was discussed in the previous section on _User Defined Functions_ for the Python case.
+Here's an example of doing so with both Python and Rust:
+
+{{code_block('user-guide/expressions/structs','multi_column_apply',[])}}
+
+```python exec="on" result="text" session="user-guide/structs"
+--8<-- "python/user-guide/expressions/structs.py:multi_column_apply"
+```