Commit

docs(python): More accurate and helpful docs for user defined functions (pola-rs#15194)

Co-authored-by: Itamar Turner-Trauring <[email protected]>
Co-authored-by: Marco Gorelli <[email protected]>
3 people authored and alexander-beedie committed Jun 26, 2024
1 parent 94ec792 commit 7e93d79
Showing 10 changed files with 243 additions and 218 deletions.
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -5,6 +5,7 @@ matplotlib
 seaborn
 plotly
 altair
+numba
 # Unpin NumPy when support is implemented in numpy crate:
 # https://github.com/pola-rs/polars/issues/16998
 numpy<2
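Note: `numba` is presumably added here because the new `diff_from_mean_numba` example in `docs/src/python/user-guide/expressions/user-defined-functions.py` (below) imports it, and the documentation build executes these Python snippets (see the `exec="on"` fence added in `structs.md` further down).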
12 changes: 12 additions & 0 deletions docs/src/python/user-guide/expressions/structs.py
@@ -64,3 +64,15 @@
 ).filter(pl.struct("Movie", "Theatre").is_duplicated())
 print(out)
 # --8<-- [end:struct_ranking]
+
+# --8<-- [start:multi_column_apply]
+df = pl.DataFrame({"keys": ["a", "a", "b"], "values": [10, 7, 1]})
+
+out = df.select(
+    pl.struct(["keys", "values"])
+    .map_elements(lambda x: len(x["keys"]) + x["values"])
+    .alias("solution_map_elements"),
+    (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
+)
+print(out)
+# --8<-- [end:multi_column_apply]
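As a quick illustration of what this new snippet computes (not part of the diff — a minimal sketch, with the expected values worked out by hand from the inputs above):

```python
import polars as pl

df = pl.DataFrame({"keys": ["a", "a", "b"], "values": [10, 7, 1]})

# Each struct row reaches the UDF as a Python dict, e.g. {"keys": "a", "values": 10},
# so len(x["keys"]) + x["values"] yields 11, 8 and 2 for the three rows.
out = df.select(
    pl.struct(["keys", "values"])
    .map_elements(lambda x: len(x["keys"]) + x["values"], return_dtype=pl.Int64)
    .alias("solution_map_elements")
)
print(out)  # expected values: 11, 8, 2
```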
117 changes: 81 additions & 36 deletions docs/src/python/user-guide/expressions/user-defined-functions.py
@@ -7,59 +7,104 @@
 # --8<-- [start:dataframe]
 df = pl.DataFrame(
     {
-        "keys": ["a", "a", "b"],
-        "values": [10, 7, 1],
+        "keys": ["a", "a", "b", "b"],
+        "values": [10, 7, 1, 23],
     }
 )
 print(df)
 # --8<-- [end:dataframe]
 
-# --8<-- [start:shift_map_batches]
-out = df.group_by("keys", maintain_order=True).agg(
-    pl.col("values")
-    .map_batches(lambda s: s.shift(), is_elementwise=True)
-    .alias("shift_map_batches"),
-    pl.col("values").shift().alias("shift_expression"),
-)
+# --8<-- [start:individual_log]
+import math
+
+
+def my_log(value):
+    return math.log(value)
+
+
+out = df.select(pl.col("values").map_elements(my_log, return_dtype=pl.Float64))
 print(out)
-# --8<-- [end:shift_map_batches]
+# --8<-- [end:individual_log]
 
 
-# --8<-- [start:map_elements]
-out = df.group_by("keys", maintain_order=True).agg(
-    pl.col("values")
-    .map_elements(lambda s: s.shift(), return_dtype=pl.List(int))
-    .alias("shift_map_elements"),
-    pl.col("values").shift().alias("shift_expression"),
-)
+# --8<-- [start:diff_from_mean]
+def diff_from_mean(series):
+    # This will be very slow for non-trivial Series, since it's all Python
+    # code:
+    total = 0
+    for value in series:
+        total += value
+    mean = total / len(series)
+    return pl.Series([value - mean for value in series])
+
+
+# Apply our custom function to a full Series with map_batches():
+out = df.select(pl.col("values").map_batches(diff_from_mean))
+print("== select() with UDF ==")
+print(out)
+
+# Apply our custom function per group:
+print("== group_by() with UDF ==")
+out = df.group_by("keys").agg(pl.col("values").map_batches(diff_from_mean))
 print(out)
-# --8<-- [end:map_elements]
+# --8<-- [end:diff_from_mean]
 
-# --8<-- [start:counter]
-counter = 0
+# --8<-- [start:np_log]
+import numpy as np
 
+out = df.select(pl.col("values").map_batches(np.log))
+print(out)
+# --8<-- [end:np_log]
 
-def add_counter(val: int) -> int:
-    global counter
-    counter += 1
-    return counter + val
+# --8<-- [start:diff_from_mean_numba]
+from numba import guvectorize, int64, float64
 
 
-out = df.select(
-    pl.col("values")
-    .map_elements(add_counter, return_dtype=pl.Int64)
-    .alias("solution_map_elements"),
-    (pl.col("values") + pl.int_range(1, pl.len() + 1)).alias("solution_expr"),
-)
+# This will be compiled to machine code, so it will be fast. The Series is
+# converted to a NumPy array before being passed to the function. See the
+# Numba documentation for more details:
+# https://numba.readthedocs.io/en/stable/user/vectorize.html
+@guvectorize([(int64[:], float64[:])], "(n)->(n)")
+def diff_from_mean_numba(arr, result):
+    total = 0
+    for value in arr:
+        total += value
+    mean = total / len(arr)
+    for i, value in enumerate(arr):
+        result[i] = value - mean
+
+
+out = df.select(pl.col("values").map_batches(diff_from_mean_numba))
+print("== select() with UDF ==")
+print(out)
+
+out = df.group_by("keys").agg(pl.col("values").map_batches(diff_from_mean_numba))
+print("== group_by() with UDF ==")
 print(out)
-# --8<-- [end:counter]
+# --8<-- [end:diff_from_mean_numba]
 
 
 # --8<-- [start:combine]
-out = df.select(
-    pl.struct("keys", "values")
-    .map_elements(lambda x: len(x["keys"]) + x["values"], return_dtype=pl.Int64)
-    .alias("solution_map_elements"),
-    (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
+# Add two arrays together:
+@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
+def add(arr, arr2, result):
+    for i in range(len(arr)):
+        result[i] = arr[i] + arr2[i]
+
+
+df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})
+
+out = df3.select(
+    # Create a struct that has two columns in it:
+    pl.struct(["values1", "values2"])
+    # Pass the struct to a lambda that then passes the individual columns to
+    # the add() function:
+    .map_batches(
+        lambda combined: add(
+            combined.struct.field("values1"), combined.struct.field("values2")
+        )
+    )
+    .alias("add_columns")
 )
 print(out)
 # --8<-- [end:combine]
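An aside on the `combine` example (not part of the diff): for an operation as simple as `add`, the same result is available from native Polars expressions, with no struct packing or Numba required — a minimal sketch:

```python
import polars as pl

df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})

# Native expression arithmetic runs in compiled Rust code and handles
# missing values for free; UDFs are only needed when no built-in exists.
out = df3.select((pl.col("values1") + pl.col("values2")).alias("add_columns"))
print(out)  # expected values: 11, 22, 33
```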
49 changes: 49 additions & 0 deletions docs/src/rust/user-guide/expressions/structs.rs
@@ -95,5 +95,54 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     println!("{}", &out);
     // --8<-- [end:struct_ranking]
 
+    // --8<-- [start:multi_column_apply]
+    let df = df!(
+        "keys" => &["a", "a", "b"],
+        "values" => &[10, 7, 1],
+    )?;
+
+    let out = df
+        .lazy()
+        .select([
+            // pack to struct to get access to multiple fields in a custom `apply/map`
+            as_struct(vec![col("keys"), col("values")])
+                // we will compute the len(a) + b
+                .apply(
+                    |s| {
+                        // downcast to struct
+                        let ca = s.struct_()?;
+
+                        // get the fields as Series
+                        let s_a = &ca.fields()[0];
+                        let s_b = &ca.fields()[1];
+
+                        // downcast the `Series` to their known type
+                        let ca_a = s_a.str()?;
+                        let ca_b = s_b.i32()?;
+
+                        // iterate both `ChunkedArrays`
+                        let out: Int32Chunked = ca_a
+                            .into_iter()
+                            .zip(ca_b)
+                            .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
+                                (Some(a), Some(b)) => Some(a.len() as i32 + b),
+                                _ => None,
+                            })
+                            .collect();
+
+                        Ok(Some(out.into_series()))
+                    },
+                    GetOutput::from_type(DataType::Int32),
+                )
+                // note: the `'solution_map_elements'` alias is just there to show how you
+                // get the same output as in the Python API example.
+                .alias("solution_map_elements"),
+            (col("keys").str().count_matches(lit("."), true) + col("values"))
+                .alias("solution_expr"),
+        ])
+        .collect()?;
+    println!("{}", out);
+
+    // --8<-- [end:multi_column_apply]
     Ok(())
 }
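Note: this `multi_column_apply` block appears to be the same struct-based example that is deleted from `user-defined-functions.rs` in the next file; it has simply moved so the _Structs_ page can reference it directly.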
88 changes: 10 additions & 78 deletions docs/src/rust/user-guide/expressions/user-defined-functions.rs
@@ -3,93 +3,25 @@ use polars::prelude::*;
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     // --8<-- [start:dataframe]
     let df = df!(
-        "keys" => &["a", "a", "b"],
-        "values" => &[10, 7, 1],
+        "keys" => &["a", "a", "b", "b"],
+        "values" => &[10, 7, 1, 23],
     )?;
     println!("{}", df);
     // --8<-- [end:dataframe]
 
-    // --8<-- [start:shift_map_batches]
-    let out = df
-        .clone()
-        .lazy()
-        .group_by(["keys"])
-        .agg([
-            col("values")
-                .map(|s| Ok(Some(s.shift(1))), GetOutput::default())
-                // note: the `'shift_map_batches'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("shift_map_batches"),
-            col("values").shift(lit(1)).alias("shift_expression"),
-        ])
-        .collect()?;
+    // --8<-- [start:individual_log]
+    // --8<-- [end:individual_log]
 
-    println!("{}", out);
-    // --8<-- [end:shift_map_batches]
+    // --8<-- [start:diff_from_mean]
+    // --8<-- [end:diff_from_mean]
 
-    // --8<-- [start:map_elements]
-    let out = df
-        .clone()
-        .lazy()
-        .group_by([col("keys")])
-        .agg([
-            col("values")
-                .apply(|s| Ok(Some(s.shift(1))), GetOutput::default())
-                // note: the `'shift_map_elements'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("shift_map_elements"),
-            col("values").shift(lit(1)).alias("shift_expression"),
-        ])
-        .collect()?;
-    println!("{}", out);
-    // --8<-- [end:map_elements]
+    // --8<-- [start:np_log]
+    // --8<-- [end:np_log]
 
-    // --8<-- [start:counter]
-
-    // --8<-- [end:counter]
+    // --8<-- [start:diff_from_mean_numba]
+    // --8<-- [end:diff_from_mean_numba]
 
-    // --8<-- [start:combine]
-    let out = df
-        .lazy()
-        .select([
-            // pack to struct to get access to multiple fields in a custom `apply/map`
-            as_struct(vec![col("keys"), col("values")])
-                // we will compute the len(a) + b
-                .apply(
-                    |s| {
-                        // downcast to struct
-                        let ca = s.struct_()?;
-
-                        // get the fields as Series
-                        let s_a = &ca.fields()[0];
-                        let s_b = &ca.fields()[1];
-
-                        // downcast the `Series` to their known type
-                        let ca_a = s_a.str()?;
-                        let ca_b = s_b.i32()?;
-
-                        // iterate both `ChunkedArrays`
-                        let out: Int32Chunked = ca_a
-                            .into_iter()
-                            .zip(ca_b)
-                            .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
-                                (Some(a), Some(b)) => Some(a.len() as i32 + b),
-                                _ => None,
-                            })
-                            .collect();
-
-                        Ok(Some(out.into_series()))
-                    },
-                    GetOutput::from_type(DataType::Int32),
-                )
-                // note: the `'solution_map_elements'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("solution_map_elements"),
-            (col("keys").str().count_matches(lit("."), true) + col("values"))
-                .alias("solution_expr"),
-        ])
-        .collect()?;
-    println!("{}", out);
-    // --8<-- [end:combine]
     Ok(())
 }
4 changes: 2 additions & 2 deletions docs/user-guide/expressions/numpy.md
@@ -15,8 +15,8 @@ This means that if a function is not provided by Polars, we can use NumPy and we
 
 ### Interoperability
 
-Polars `Series` have support for NumPy universal functions (ufuncs). Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead.
+Polars `Series` have support for NumPy universal functions (ufuncs) and generalized ufuncs. Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead.
 
-However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results.
+However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results, so an error will be raised if you pass a `Series` with missing data to a generalized ufunc.
 
 Convert a Polars `Series` to a NumPy array with the `.to_numpy()` method. Missing values will be replaced by `np.nan` during the conversion.
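To make the ufunc and missing-value behavior described above concrete, a minimal sketch (not part of the diff):

```python
import numpy as np
import polars as pl

s = pl.Series("values", [1.0, 2.0, 3.0])
print(np.log(s))  # ufunc applied element-wise; the result is again a Polars Series

s_with_null = pl.Series("values", [1.0, None, 3.0])
print(s_with_null.to_numpy())  # the missing value becomes np.nan: [ 1. nan  3.]
```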
9 changes: 8 additions & 1 deletion docs/user-guide/expressions/structs.md
@@ -96,4 +96,11 @@ That's a pretty complex set of requirements done very elegantly in Polars!
 
 ### Using multi-column apply
 
-This was discussed in the previous section on _User Defined Functions_.
+This was discussed in the previous section on _User Defined Functions_ for the Python case.
+Here's an example of doing so with both Python and Rust:
+
+{{code_block('user-guide/expressions/structs','multi_column_apply',[])}}
+
+```python exec="on" result="text" session="user-guide/structs"
+--8<-- "python/user-guide/expressions/structs.py:multi_column_apply"
+```