From 0bea71ea955b1d8fb8a1904e812b3b0f2fcf3441 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sat, 14 Dec 2024 15:24:25 +0100 Subject: [PATCH 01/23] init --- R/000-wrappers.R | 312 ++++ R/lazyframe-frame.R | 2311 +++++++++++++++++++++++++++- R/utils-various.R | 113 ++ man/dataframe__cast.Rd | 12 +- man/lazyframe__bottom_k.Rd | 39 + man/lazyframe__cast.Rd | 37 + man/lazyframe__clear.Rd | 29 + man/lazyframe__clone.Rd | 42 + man/lazyframe__collect.Rd | 11 + man/lazyframe__collect_schema.Rd | 29 + man/lazyframe__count.Rd | 18 + man/lazyframe__drop.Rd | 34 + man/lazyframe__drop_nulls.Rd | 34 + man/lazyframe__explain.Rd | 78 + man/lazyframe__explode.Rd | 27 + man/lazyframe__fill_nan.Rd | 24 + man/lazyframe__filter.Rd | 43 + man/lazyframe__first.Rd | 18 + man/lazyframe__gather_every.Rd | 25 + man/lazyframe__group_by.Rd | 45 + man/lazyframe__group_by_dynamic.Rd | 178 +++ man/lazyframe__head.Rd | 22 + man/lazyframe__interpolate.Rd | 23 + man/lazyframe__join.Rd | 108 ++ man/lazyframe__join_asof.Rd | 166 ++ man/lazyframe__join_where.Rd | 52 + man/lazyframe__last.Rd | 18 + man/lazyframe__limit.Rd | 22 + man/lazyframe__max.Rd | 18 + man/lazyframe__mean.Rd | 18 + man/lazyframe__median.Rd | 18 + man/lazyframe__merge_sorted.Rd | 34 + man/lazyframe__min.Rd | 18 + man/lazyframe__null_count.Rd | 18 + man/lazyframe__profile.Rd | 161 ++ man/lazyframe__quantile.Rd | 21 + man/lazyframe__rename.Rd | 40 + man/lazyframe__reverse.Rd | 18 + man/lazyframe__rolling.Rd | 88 ++ man/lazyframe__select_seq.Rd | 30 + man/lazyframe__serialize.Rd | 18 + man/lazyframe__set_sorted.Rd | 22 + man/lazyframe__shift.Rd | 35 + man/lazyframe__sink_csv.Rd | 137 ++ man/lazyframe__sink_ipc.Rd | 86 ++ man/lazyframe__sink_ndjson.Rd | 74 + man/lazyframe__sink_parquet.Rd | 122 ++ man/lazyframe__slice.Rd | 24 + man/lazyframe__sort.Rd | 55 + man/lazyframe__std.Rd | 19 + man/lazyframe__sum.Rd | 18 + man/lazyframe__tail.Rd | 34 + man/lazyframe__to_dot.Rd | 71 + man/lazyframe__top_k.Rd | 41 + man/lazyframe__unique.Rd | 
50 + man/lazyframe__unnest.Rd | 34 + man/lazyframe__unpivot.Rd | 45 + man/lazyframe__var.Rd | 19 + man/lazyframe__with_columns_seq.Rd | 66 + man/lazyframe__with_context.Rd | 39 + man/lazyframe__with_row_index.Rd | 32 + man/pl.Rd | 2 +- man/pl__deserialize_lf.Rd | 22 + src/init.c | 258 ++++ src/rust/Cargo.toml | 7 + src/rust/api.h | 43 + src/rust/src/conversion/mod.rs | 227 ++- src/rust/src/lazyframe/general.rs | 722 ++++++++- src/rust/src/lazyframe/mod.rs | 1 + src/rust/src/lazyframe/serde.rs | 12 + 70 files changed, 6678 insertions(+), 9 deletions(-) create mode 100644 man/lazyframe__bottom_k.Rd create mode 100644 man/lazyframe__cast.Rd create mode 100644 man/lazyframe__clear.Rd create mode 100644 man/lazyframe__clone.Rd create mode 100644 man/lazyframe__collect_schema.Rd create mode 100644 man/lazyframe__count.Rd create mode 100644 man/lazyframe__drop.Rd create mode 100644 man/lazyframe__drop_nulls.Rd create mode 100644 man/lazyframe__explain.Rd create mode 100644 man/lazyframe__explode.Rd create mode 100644 man/lazyframe__fill_nan.Rd create mode 100644 man/lazyframe__filter.Rd create mode 100644 man/lazyframe__first.Rd create mode 100644 man/lazyframe__gather_every.Rd create mode 100644 man/lazyframe__group_by.Rd create mode 100644 man/lazyframe__group_by_dynamic.Rd create mode 100644 man/lazyframe__head.Rd create mode 100644 man/lazyframe__interpolate.Rd create mode 100644 man/lazyframe__join.Rd create mode 100644 man/lazyframe__join_asof.Rd create mode 100644 man/lazyframe__join_where.Rd create mode 100644 man/lazyframe__last.Rd create mode 100644 man/lazyframe__limit.Rd create mode 100644 man/lazyframe__max.Rd create mode 100644 man/lazyframe__mean.Rd create mode 100644 man/lazyframe__median.Rd create mode 100644 man/lazyframe__merge_sorted.Rd create mode 100644 man/lazyframe__min.Rd create mode 100644 man/lazyframe__null_count.Rd create mode 100644 man/lazyframe__profile.Rd create mode 100644 man/lazyframe__quantile.Rd create mode 100644 
man/lazyframe__rename.Rd create mode 100644 man/lazyframe__reverse.Rd create mode 100644 man/lazyframe__rolling.Rd create mode 100644 man/lazyframe__select_seq.Rd create mode 100644 man/lazyframe__serialize.Rd create mode 100644 man/lazyframe__set_sorted.Rd create mode 100644 man/lazyframe__shift.Rd create mode 100644 man/lazyframe__sink_csv.Rd create mode 100644 man/lazyframe__sink_ipc.Rd create mode 100644 man/lazyframe__sink_ndjson.Rd create mode 100644 man/lazyframe__sink_parquet.Rd create mode 100644 man/lazyframe__slice.Rd create mode 100644 man/lazyframe__sort.Rd create mode 100644 man/lazyframe__std.Rd create mode 100644 man/lazyframe__sum.Rd create mode 100644 man/lazyframe__tail.Rd create mode 100644 man/lazyframe__to_dot.Rd create mode 100644 man/lazyframe__top_k.Rd create mode 100644 man/lazyframe__unique.Rd create mode 100644 man/lazyframe__unnest.Rd create mode 100644 man/lazyframe__unpivot.Rd create mode 100644 man/lazyframe__var.Rd create mode 100644 man/lazyframe__with_columns_seq.Rd create mode 100644 man/lazyframe__with_context.Rd create mode 100644 man/lazyframe__with_row_index.Rd create mode 100644 man/pl__deserialize_lf.Rd create mode 100644 src/rust/src/lazyframe/serde.rs diff --git a/R/000-wrappers.R b/R/000-wrappers.R index c68795d0..72bfead1 100644 --- a/R/000-wrappers.R +++ b/R/000-wrappers.R @@ -285,6 +285,11 @@ NULL .savvy_wrap_PlRWhen(.Call(savvy_when__impl, `condition`)) } + +`deserialize_lf` <- function(`json`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_deserialize_lf__impl, `json`)) +} + ### wrapper functions for PlRChainedThen `PlRChainedThen_when` <- function(self) { @@ -3361,6 +3366,271 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed") } } +`PlRLazyFrame_to_dot` <- function(self) { + function(`optimized`) { + .savvy_wrap_String(.Call(savvy_PlRLazyFrame_to_dot__impl, `self`, `optimized`)) + } +} + +`PlRLazyFrame_sort` <- function(self) { + function(`by_column`, `descending`, `nulls_last`, `maintain_order`, 
`multithreaded`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_sort__impl, `self`, `by_column`, `descending`, `nulls_last`, `maintain_order`, `multithreaded`)) + } +} + +`PlRLazyFrame_top_k` <- function(self) { + function(`k`, `by`, `reverse`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_top_k__impl, `self`, `k`, `by`, `reverse`)) + } +} + +`PlRLazyFrame_bottom_k` <- function(self) { + function(`k`, `by`, `reverse`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_bottom_k__impl, `self`, `k`, `by`, `reverse`)) + } +} + +`PlRLazyFrame_cache` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_cache__impl, `self`)) + } +} + +`PlRLazyFrame_profile` <- function(self) { + function() { + .Call(savvy_PlRLazyFrame_profile__impl, `self`) + } +} + +`PlRLazyFrame_sink_parquet` <- function(self) { + function(`path`, `compression`, `maintain_order`, `statistics`, `retries`, `compression_level` = NULL, `row_group_size` = NULL, `data_page_size` = NULL, `storage_options` = NULL) { + invisible(.Call(savvy_PlRLazyFrame_sink_parquet__impl, `self`, `path`, `compression`, `maintain_order`, `statistics`, `retries`, `compression_level`, `row_group_size`, `data_page_size`, `storage_options`)) + } +} + +`PlRLazyFrame_sink_ipc` <- function(self) { + function(`path`, `maintain_order`, `retries`, `compression` = NULL, `storage_options` = NULL) { + invisible(.Call(savvy_PlRLazyFrame_sink_ipc__impl, `self`, `path`, `maintain_order`, `retries`, `compression`, `storage_options`)) + } +} + +`PlRLazyFrame_sink_csv` <- function(self) { + function(`path`, `include_bom`, `include_header`, `separator`, `line_terminator`, `quote_char`, `maintain_order`, `batch_size`, `retries`, `datetime_format` = NULL, `date_format` = NULL, `time_format` = NULL, `float_scientific` = NULL, `float_precision` = NULL, `null_value` = NULL, `quote_style` = NULL, `storage_options` = NULL) { + invisible(.Call(savvy_PlRLazyFrame_sink_csv__impl, `self`, `path`, 
`include_bom`, `include_header`, `separator`, `line_terminator`, `quote_char`, `maintain_order`, `batch_size`, `retries`, `datetime_format`, `date_format`, `time_format`, `float_scientific`, `float_precision`, `null_value`, `quote_style`, `storage_options`)) + } +} + +`PlRLazyFrame_sink_json` <- function(self) { + function(`path`, `maintain_order`, `retries`, `storage_options` = NULL) { + invisible(.Call(savvy_PlRLazyFrame_sink_json__impl, `self`, `path`, `maintain_order`, `retries`, `storage_options`)) + } +} + +`PlRLazyFrame_serialize` <- function(self) { + function() { + .Call(savvy_PlRLazyFrame_serialize__impl, `self`) + } +} + +`PlRLazyFrame_select_seq` <- function(self) { + function(`exprs`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_select_seq__impl, `self`, `exprs`)) + } +} + +`PlRLazyFrame_rolling` <- function(self) { + function(`index_column`, `period`, `offset`, `closed`, `by`) { + `index_column` <- .savvy_extract_ptr(`index_column`, "PlRExpr") + .savvy_wrap_PlRLazyGroupBy(.Call(savvy_PlRLazyFrame_rolling__impl, `self`, `index_column`, `period`, `offset`, `closed`, `by`)) + } +} + +`PlRLazyFrame_group_by_dynamic` <- function(self) { + function(`index_column`, `every`, `period`, `offset`, `label`, `include_boundaries`, `closed`, `group_by`, `start_by`) { + `index_column` <- .savvy_extract_ptr(`index_column`, "PlRExpr") + .savvy_wrap_PlRLazyGroupBy(.Call(savvy_PlRLazyFrame_group_by_dynamic__impl, `self`, `index_column`, `every`, `period`, `offset`, `label`, `include_boundaries`, `closed`, `group_by`, `start_by`)) + } +} + +`PlRLazyFrame_with_context` <- function(self) { + function(`contexts`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_with_context__impl, `self`, `contexts`)) + } +} + +`PlRLazyFrame_join_asof` <- function(self) { + function(`other`, `left_on`, `right_on`, `allow_parallel`, `force_parallel`, `suffix`, `coalesce`, `strategy`, `left_by` = NULL, `right_by` = NULL, `tolerance` = NULL, `tolerance_str` = NULL) { + `other` <- 
.savvy_extract_ptr(`other`, "PlRLazyFrame") + `left_on` <- .savvy_extract_ptr(`left_on`, "PlRExpr") + `right_on` <- .savvy_extract_ptr(`right_on`, "PlRExpr") + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_join_asof__impl, `self`, `other`, `left_on`, `right_on`, `allow_parallel`, `force_parallel`, `suffix`, `coalesce`, `strategy`, `left_by`, `right_by`, `tolerance`, `tolerance_str`)) + } +} + +`PlRLazyFrame_join` <- function(self) { + function(`other`, `left_on`, `right_on`, `allow_parallel`, `force_parallel`, `join_nulls`, `how`, `suffix`, `validate`, `coalesce` = NULL) { + `other` <- .savvy_extract_ptr(`other`, "PlRLazyFrame") + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_join__impl, `self`, `other`, `left_on`, `right_on`, `allow_parallel`, `force_parallel`, `join_nulls`, `how`, `suffix`, `validate`, `coalesce`)) + } +} + +`PlRLazyFrame_join_where` <- function(self) { + function(`other`, `predicates`, `suffix`) { + `other` <- .savvy_extract_ptr(`other`, "PlRLazyFrame") + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_join_where__impl, `self`, `other`, `predicates`, `suffix`)) + } +} + +`PlRLazyFrame_with_columns_seq` <- function(self) { + function(`exprs`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_with_columns_seq__impl, `self`, `exprs`)) + } +} + +`PlRLazyFrame_rename` <- function(self) { + function(`existing`, `new`, `strict`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_rename__impl, `self`, `existing`, `new`, `strict`)) + } +} + +`PlRLazyFrame_reverse` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_reverse__impl, `self`)) + } +} + +`PlRLazyFrame_shift` <- function(self) { + function(`n`, `fill_value` = NULL) { + `n` <- .savvy_extract_ptr(`n`, "PlRExpr") + `fill_value` <- .savvy_extract_ptr(`fill_value`, "PlRExpr") + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_shift__impl, `self`, `n`, `fill_value`)) + } +} + +`PlRLazyFrame_fill_nan` <- function(self) { + function(`fill_value`) { + 
`fill_value` <- .savvy_extract_ptr(`fill_value`, "PlRExpr") + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_fill_nan__impl, `self`, `fill_value`)) + } +} + +`PlRLazyFrame_fill_null` <- function(self) { + function(`fill_value`) { + `fill_value` <- .savvy_extract_ptr(`fill_value`, "PlRExpr") + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_fill_null__impl, `self`, `fill_value`)) + } +} + +`PlRLazyFrame_min` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_min__impl, `self`)) + } +} + +`PlRLazyFrame_max` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_max__impl, `self`)) + } +} + +`PlRLazyFrame_sum` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_sum__impl, `self`)) + } +} + +`PlRLazyFrame_mean` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_mean__impl, `self`)) + } +} + +`PlRLazyFrame_std` <- function(self) { + function(`ddof`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_std__impl, `self`, `ddof`)) + } +} + +`PlRLazyFrame_var` <- function(self) { + function(`ddof`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_var__impl, `self`, `ddof`)) + } +} + +`PlRLazyFrame_median` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_median__impl, `self`)) + } +} + +`PlRLazyFrame_quantile` <- function(self) { + function(`quantile`, `interpolation`) { + `quantile` <- .savvy_extract_ptr(`quantile`, "PlRExpr") + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_quantile__impl, `self`, `quantile`, `interpolation`)) + } +} + +`PlRLazyFrame_explode` <- function(self) { + function(`column`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_explode__impl, `self`, `column`)) + } +} + +`PlRLazyFrame_null_count` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_null_count__impl, `self`)) + } +} + +`PlRLazyFrame_unique` <- function(self) { 
+ function(`maintain_order`, `keep`, `subset` = NULL) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_unique__impl, `self`, `maintain_order`, `keep`, `subset`)) + } +} + +`PlRLazyFrame_drop_nulls` <- function(self) { + function(`subset` = NULL) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_drop_nulls__impl, `self`, `subset`)) + } +} + +`PlRLazyFrame_unpivot` <- function(self) { + function(`on`, `index`, `value_name` = NULL, `variable_name` = NULL) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_unpivot__impl, `self`, `on`, `index`, `value_name`, `variable_name`)) + } +} + +`PlRLazyFrame_with_row_index` <- function(self) { + function(`name`, `offset` = NULL) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_with_row_index__impl, `self`, `name`, `offset`)) + } +} + +`PlRLazyFrame_clone` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_clone__impl, `self`)) + } +} + +`PlRLazyFrame_unnest` <- function(self) { + function(`columns`) { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_unnest__impl, `self`, `columns`)) + } +} + +`PlRLazyFrame_count` <- function(self) { + function() { + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_count__impl, `self`)) + } +} + +`PlRLazyFrame_merge_sorted` <- function(self) { + function(`other`, `key`) { + `other` <- .savvy_extract_ptr(`other`, "PlRLazyFrame") + .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_merge_sorted__impl, `self`, `other`, `key`)) + } +} + `.savvy_wrap_PlRLazyFrame` <- function(ptr) { e <- new.env(parent = emptyenv()) e$.ptr <- ptr @@ -3381,6 +3651,48 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed") e$`collect_schema` <- `PlRLazyFrame_collect_schema`(ptr) e$`sort_by_exprs` <- `PlRLazyFrame_sort_by_exprs`(ptr) e$`with_columns` <- `PlRLazyFrame_with_columns`(ptr) + e$`to_dot` <- `PlRLazyFrame_to_dot`(ptr) + e$`sort` <- `PlRLazyFrame_sort`(ptr) + e$`top_k` <- `PlRLazyFrame_top_k`(ptr) + e$`bottom_k` <- 
`PlRLazyFrame_bottom_k`(ptr) + e$`cache` <- `PlRLazyFrame_cache`(ptr) + e$`profile` <- `PlRLazyFrame_profile`(ptr) + e$`sink_parquet` <- `PlRLazyFrame_sink_parquet`(ptr) + e$`sink_ipc` <- `PlRLazyFrame_sink_ipc`(ptr) + e$`sink_csv` <- `PlRLazyFrame_sink_csv`(ptr) + e$`sink_json` <- `PlRLazyFrame_sink_json`(ptr) + e$`serialize` <- `PlRLazyFrame_serialize`(ptr) + e$`select_seq` <- `PlRLazyFrame_select_seq`(ptr) + e$`rolling` <- `PlRLazyFrame_rolling`(ptr) + e$`group_by_dynamic` <- `PlRLazyFrame_group_by_dynamic`(ptr) + e$`with_context` <- `PlRLazyFrame_with_context`(ptr) + e$`join_asof` <- `PlRLazyFrame_join_asof`(ptr) + e$`join` <- `PlRLazyFrame_join`(ptr) + e$`join_where` <- `PlRLazyFrame_join_where`(ptr) + e$`with_columns_seq` <- `PlRLazyFrame_with_columns_seq`(ptr) + e$`rename` <- `PlRLazyFrame_rename`(ptr) + e$`reverse` <- `PlRLazyFrame_reverse`(ptr) + e$`shift` <- `PlRLazyFrame_shift`(ptr) + e$`fill_nan` <- `PlRLazyFrame_fill_nan`(ptr) + e$`fill_null` <- `PlRLazyFrame_fill_null`(ptr) + e$`min` <- `PlRLazyFrame_min`(ptr) + e$`max` <- `PlRLazyFrame_max`(ptr) + e$`sum` <- `PlRLazyFrame_sum`(ptr) + e$`mean` <- `PlRLazyFrame_mean`(ptr) + e$`std` <- `PlRLazyFrame_std`(ptr) + e$`var` <- `PlRLazyFrame_var`(ptr) + e$`median` <- `PlRLazyFrame_median`(ptr) + e$`quantile` <- `PlRLazyFrame_quantile`(ptr) + e$`explode` <- `PlRLazyFrame_explode`(ptr) + e$`null_count` <- `PlRLazyFrame_null_count`(ptr) + e$`unique` <- `PlRLazyFrame_unique`(ptr) + e$`drop_nulls` <- `PlRLazyFrame_drop_nulls`(ptr) + e$`unpivot` <- `PlRLazyFrame_unpivot`(ptr) + e$`with_row_index` <- `PlRLazyFrame_with_row_index`(ptr) + e$`clone` <- `PlRLazyFrame_clone`(ptr) + e$`unnest` <- `PlRLazyFrame_unnest`(ptr) + e$`count` <- `PlRLazyFrame_count`(ptr) + e$`merge_sorted` <- `PlRLazyFrame_merge_sorted`(ptr) class(e) <- c("PlRLazyFrame", "savvy_neopolars__sealed") e diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R index 45e14a6e..0465d4a0 100644 --- a/R/lazyframe-frame.R +++ b/R/lazyframe-frame.R @@ -102,6 
+102,61 @@ lazyframe__select <- function(...) { }) } +#' Select columns from this LazyFrame +#' +#' This will run all expression sequentially instead of in parallel. Use this +#' when the work per expression is cheap. +#' +#' @inherit as_polars_lf return +#' @inheritParams lazyframe__select +#' +#' @examples +#' lf <- pl$LazyFrame( +#' foo = 1:3, +#' bar = 6:8, +#' ham = letters[1:3] +#' ) +#' lf$select_seq("foo")$collect() +lazyframe__select_seq <- function(...) { + wrap({ + structify <- parse_env_auto_structify() + parse_into_list_of_expressions(..., `__structify` = structify) |> + self$`_ldf`$select_seq() + }) +} + +#' Start a group by operation +#' +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Column(s) to group by. +#' Accepts expression input. Strings are parsed as column names. +#' @param .maintain_order Ensure that the order of the groups is consistent with +#' the input data. This is slower than a default group by. Setting this to +#' `TRUE` blocks the possibility to run on the streaming engine. +#' +# TODO: need a proper definition to link to +#' @return A lazy groupby +#' @examples +#' # Group by one column and call agg() to compute the grouped sum of another +#' # column. +#' lf <- pl$LazyFrame( +#' a = c("a", "b", "a", "b", "c"), +#' b = c(1, 2, 1, 3, 3), +#' c = c(5, 4, 3, 2, 1) +#' ) +#' lf$group_by("a")$agg(pl$col("b")$sum())$collect() +#' +#' # Set .maintain_order = TRUE to ensure the order of the groups is consistent +#' # with the input. +#' lf$group_by("a", .maintain_order = TRUE)$agg(pl$col("b")$sum())$collect() +#' +#' # Group by multiple columns by passing a vector of column names. +#' lf$group_by(c("a", "b"))$agg(pl$col("c")$max())$collect() +#' +#' # Or use positional arguments to group by multiple columns in the same way. +#' # Expressions are also accepted. 
+#' lf$ +#' group_by("a", pl$col("b") / 2)$ +#' agg(pl$col("c")$mean())$collect() lazyframe__group_by <- function(..., .maintain_order = FALSE) { wrap({ exprs <- parse_into_list_of_expressions(...) @@ -131,6 +186,18 @@ lazyframe__group_by <- function(..., .maintain_order = FALSE) { #' It may be changed at any point without it being considered a breaking change. #' @param _eager A logical, indicates to turn off multi-node optimizations and the other optimizations. #' This option is intended for internal use only. +#' +#' @inherit as_polars_lf return +#' +#' @seealso +#' - [`$profile()`][lazyframe__profile] - same as `$collect()` but also returns +#' a table with each operation profiled. +#' - [`$collect_in_background()`][lazyframe__collect_in_background] - non-blocking +#' collect returns a future handle. Can also just be used via +#' `$collect(collect_in_background = TRUE)`. +#' - [`$sink_parquet()`][lazyframe__sink_parquet()] streams query to a parquet file. +#' - [`$sink_ipc()`][lazyframe__sink_ipc()] streams query to an arrow file. +#' #' @examples #' lf <- pl$LazyFrame( #' a = c("a", "b", "a", "b", "b", "c"), @@ -185,6 +252,144 @@ lazyframe__collect <- function( }) } +#' Collect and profile a lazy query. +#' +#' This will run the query and return a list containing the materialized +#' DataFrame and a DataFrame that contains profiling information of each node +#' that is executed. +#' +#' @inheritParams rlang::check_dots_empty0 +#' @inheritParams lazyframe__collect +#' @param show_plot Show a Gantt chart of the profiling result +#' @param truncate_nodes Truncate the label lengths in the Gantt chart to this +#' number of characters. If `0` (default), do not truncate. +#' +#' @details +#' The units of the timings are microseconds. +#' +#' @return List of two `DataFrame`s: one with the collected result, the other +#' with the timings of each step. If `show_plot = TRUE`, then the plot is +#' also stored in the list. 
+#' @seealso
+#' - [`$collect()`][lazyframe__collect] - regular collect.
+#' - [`$collect_in_background()`][lazyframe__collect_in_background] - non-blocking
+#'   collect returns a future handle. Can also just be used via
+#'   `$collect(collect_in_background = TRUE)`.
+#' - [`$sink_parquet()`][lazyframe__sink_parquet] streams query to a parquet file.
+#' - [`$sink_ipc()`][lazyframe__sink_ipc] streams query to an arrow file.
+#'
+#' @examples
+#' ## Simplest use case
+#' pl$LazyFrame()$select(pl$lit(2) + 2)$profile()
+#'
+#' ## Use $profile() to compare two queries
+#'
+#' # -1- map each Species-group with native polars
+#' as_polars_lf(iris)$
+#'   sort("Sepal.Length")$
+#'   group_by("Species", .maintain_order = TRUE)$
+#'   agg(pl$col(pl$Float64)$first() + 5)$
+#'   profile()
+#'
+#' # -2- map each Species-group of each numeric column with an R function
+#'
+#' # some R function, prints `.` for each time called by polars
+#' r_func <- \(s) {
+#'   cat(".")
+#'   s$to_r()[1] + 5
+#' }
+#'
+#' as_polars_lf(iris)$
+#'   sort("Sepal.Length")$
+#'   group_by("Species", .maintain_order = TRUE)$
+#'   agg(pl$col(pl$Float64)$map_elements(r_func))$
+#'   profile()
+lazyframe__profile <- function(
+    ...,
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    comm_subplan_elim = TRUE,
+    comm_subexpr_elim = TRUE,
+    cluster_with_columns = TRUE,
+    streaming = FALSE,
+    no_optimization = FALSE,
+    collect_in_background = FALSE,
+    show_plot = FALSE,
+    truncate_nodes = 0) {
+  wrap({
+    check_dots_empty0(...)
+
+    if (isTRUE(no_optimization)) {
+      predicate_pushdown <- FALSE
+      projection_pushdown <- FALSE
+      slice_pushdown <- FALSE
+      comm_subplan_elim <- FALSE
+      comm_subexpr_elim <- FALSE
+      cluster_with_columns <- FALSE
+    }
+
+    if (isTRUE(streaming)) {
+      comm_subplan_elim <- FALSE
+    }
+
+    lf <- self$`_ldf`$optimization_toggle( # toggle flags on the LazyFrame handle, like the other methods
+      type_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = comm_subplan_elim,
+      comm_subexpr_elim = comm_subexpr_elim,
+      cluster_with_columns = cluster_with_columns,
+      streaming = streaming,
+      eager = FALSE # NOTE(review): `$collect()` documents this flag as `_eager`; confirm the argument name
+    )
+
+    out <- lf$profile() # profile the toggled plan so the optimization flags actually apply
+
+    if (isTRUE(show_plot)) {
+      out[["plot"]] <- make_profile_plot(out, truncate_nodes) |>
+        wrap()
+    }
+    out
+  })
+}
+
+#' Create a string representation of the query plan
+#'
+#' The query plan is read from bottom to top. When `optimized = FALSE`, the
+#' query as it was written by the user is shown. This is not what Polars runs.
+#' Instead, it applies optimizations that are displayed by default by `$explain()`.
+#' One classic example is the predicate pushdown, which applies the filter as
+#' early as possible (i.e. at the bottom of the plan).
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @inheritParams lazyframe__collect
+#' @param format The format to use for displaying the logical plan. Must be
+#' either `"plain"` (default) or `"tree"`.
+#' @param optimized Return an optimized query plan. If `TRUE` (default), the
+#' subsequent optimization flags control which optimizations run.
+#'
+#' @return A character value containing the query plan.
+#' @examples +#' lazy_frame <- as_polars_lf(iris) +#' +#' # Prepare your query +#' lazy_query <- lazy_frame$sort("Species")$filter(pl$col("Species") != "setosa") +#' +#' # This is the query that was written by the user, without any optimizations +#' # (use cat() for better printing) +#' lazy_query$explain(optimized = FALSE) |> cat() +#' +#' # This is the query after `polars` optimizes it: instead of sorting first and +#' # then filtering, it is faster to filter first and then sort the rest. +#' lazy_query$explain() |> cat() +#' +#' # Also possible to see this as tree format +#' lazy_query$explain(format = "tree") |> cat() lazyframe__explain <- function( ..., format = c("plain", "tree"), @@ -232,6 +437,26 @@ lazyframe__explain <- function( }) } +#' Resolve the schema of this LazyFrame +#' +#' This resolves the query plan but does not trigger computations. +#' +#' @return A named list with names indicating column names and values indicating +#' column data types. +#' +#' @examples +#' lf <- pl$LazyFrame( +#' foo = 1:3, +#' bar = 6:8, +#' ham = c("a", "b", "c") +#' ) +#' +#' lf$collect_schema() +#' +#' lf$with_columns( +#' baz = (pl$col("foo") + pl$col("bar"))$cast(pl$String), +#' pl$col("bar")$cast(pl$Int64) +#' )$collect_schema() lazyframe__collect_schema <- function() { self$`_ldf`$collect_schema() |> lapply(function(x) { @@ -241,6 +466,32 @@ lazyframe__collect_schema <- function() { wrap() } +#' Cast LazyFrame column(s) to the specified dtype(s) +#' +#' This allows to convert all columns to a datatype or to convert only specific +#' columns. Contrarily to the Python implementation, it is not possible to +#' convert all columns of a specific datatype to another datatype. +#' +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Either a datatype to which +#' all columns will be cast, or a list where the names are column names and the +#' values are the datatypes to convert to. 
+#' @param .strict If `TRUE` (default), throw an error if a cast could not be done +#' (for instance, due to an overflow). Otherwise, return `null`. +#' +#' @return A LazyFrame +#' +#' @examples +#' lf <- pl$LazyFrame( +#' foo = 1:3, +#' bar = c(6, 7, 8), +#' ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")) +#' ) +#' +#' # Cast only some columns +#' lf$cast(foo = pl$Float32, bar = pl$UInt8)$collect() +#' +#' # Cast all columns to the same type +#' lf$cast(pl$String)$collect() lazyframe__cast <- function(..., .strict = TRUE) { wrap({ check_bool(.strict) @@ -254,12 +505,76 @@ lazyframe__cast <- function(..., .strict = TRUE) { }) } +#' Filter the rows in the LazyFrame based on a predicate expression +#' +#' The original order of the remaining rows is preserved. Rows where the filter +#' does not evaluate to `TRUE` are discarded, including nulls. +#' +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Expression that evaluates to +#' a boolean Series. +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame( +#' foo = c(1, 2, 3, NA, 4, NA, 0), +#' bar = c(6, 7, 8, NA, NA, 9, 0), +#' ham = c("a", "b", "c", NA, "d", "e", "f") +#' ) +#' +#' # Filter on one condition +#' lf$filter(pl$col("foo") > 1)$collect() +#' +#' # Filter on multiple conditions +#' lf$filter((pl$col("foo") < 3) & (pl$col("ham") == "a"))$collect() +#' +#' # Filter on an OR condition +#' lf$filter((pl$col("foo") == 1) | (pl$col("ham") == "c"))$collect() +#' +#' # Filter by comparing two columns against each other +#' lf$filter(pl$col("foo") == pl$col("bar"))$collect() +#' lf$filter(pl$col("foo") != pl$col("bar"))$collect() +#' +#' # Notice how the row with null values is filtered out. In order to keep the +#' # rows with nulls, use: +#' lf$filter(pl$col("foo")$ne_missing(pl$col("bar")))$collect() lazyframe__filter <- function(...) { parse_predicates_constraints_into_expression(...) |> self$`_ldf`$filter() |> wrap() } +#' Sort the LazyFrame by the given columns +#' +#' @param ... 
<[`dynamic-dots`][rlang::dyn-dots]> Column(s) to sort by. Can be +#' character values indicating column names or Expr(s). +#' @param descending Sort in descending order. When sorting by multiple +#' columns, this can be specified per column by passing a logical vector. +#' @param nulls_last Place null values last. When sorting by multiple +#' columns, this can be specified per column by passing a logical vector. +#' @param maintain_order Whether the order should be maintained if elements are +#' equal. If `TRUE`, streaming is not possible and performance might be worse +#' since this requires a stable search. +#' @param multithreaded Sort using multiple threads. +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame( +#' a = c(1, 2, NA, 4), +#' b = c(6, 5, 4, 3), +#' c = c("a", "c", "b", "a") +#' ) +#' +#' # Pass a single column name to sort by that column. +#' lf$sort("a")$collect() +#' +#' # Sorting by expressions is also supported +#' lf$sort(pl$col("a") + pl$col("b") * 2, nulls_last = TRUE)$collect() +#' +#' # Sort by multiple columns by passing a vector of columns +#' lf$sort(c("c", "a"), descending = TRUE)$collect() +#' +#' # Or use positional arguments to sort by multiple columns in the same way +#' lf$sort("c", "a", descending = c(FALSE, TRUE))$collect() lazyframe__sort <- function( ..., descending = FALSE, @@ -269,6 +584,9 @@ lazyframe__sort <- function( wrap({ check_dots_unnamed() + if (missing(...)) { + abort("`...` must contain at least one element.") + } by <- parse_into_list_of_expressions(...) descending <- extend_bool(descending, length(by), "descending", "...") nulls_last <- extend_bool(nulls_last, length(by), "nulls_last", "...") @@ -332,14 +650,93 @@ lazyframe__sort <- function( #' }) #' } lazyframe__with_columns <- function(...) 
{ + structify <- parse_env_auto_structify() + + parse_into_list_of_expressions(..., `__structify` = structify) |> + self$`_ldf`$with_columns() |> + wrap() +} + +#' Modify/append column(s) of a LazyFrame +#' +#' @description +#' This will run all expression sequentially instead of in parallel. Use this +#' only when the work per expression is cheap. +#' +#' Add columns or modify existing ones with expressions. This is similar to +#' `dplyr::mutate()` as it keeps unmentioned columns (unlike `$select()`). +#' +#' However, unlike `dplyr::mutate()`, one cannot use new variables in subsequent +#' expressions in the same `$with_columns_seq()`call. For instance, if you create a +#' variable `x`, you will only be able to use it in another `$with_columns_seq()` +#' or `$select()` call. +#' +#' @inherit as_polars_lf return +#' @inheritParams lazyframe__select +#' @examples +#' # Pass an expression to add it as a new column. +#' lf <- pl$LazyFrame( +#' a = 1:4, +#' b = c(0.5, 4, 10, 13), +#' c = c(TRUE, TRUE, FALSE, TRUE), +#' ) +#' lf$with_columns_seq((pl$col("a")^2)$alias("a^2"))$collect() +#' +#' # Added columns will replace existing columns with the same name. 
+#' lf$with_columns_seq(a = pl$col("a")$cast(pl$Float64))$collect() +#' +#' # Multiple columns can be added +#' lf$with_columns_seq( +#' (pl$col("a")^2)$alias("a^2"), +#' (pl$col("b") / 2)$alias("b/2"), +#' (pl$col("c")$not())$alias("not c"), +#' )$collect() +#' +#' # Name expression instead of `$alias()` +#' lf$with_columns_seq( +#' `a^2` = pl$col("a")^2, +#' `b/2` = pl$col("b") / 2, +#' `not c` = pl$col("c")$not(), +#' )$collect() +#' +#' # Expressions with multiple outputs can automatically be instantiated +#' # as Structs by enabling the experimental setting `POLARS_AUTO_STRUCTIFY`: +#' if (requireNamespace("withr", quietly = TRUE)) { +#' withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), { +#' lf$drop("c")$with_columns_seq( +#' diffs = pl$col("a", "b")$diff()$name$suffix("_diff"), +#' )$collect() +#' }) +#' } +lazyframe__with_columns_seq <- function(...) { wrap({ structify <- parse_env_auto_structify() parse_into_list_of_expressions(..., `__structify` = structify) |> - self$`_ldf`$with_columns() + self$`_ldf`$with_columns_seq() }) } +#' Remove columns from the LazyFrame +#' +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Names of the columns that +#' should be removed from the LazyFrame. Accepts column selector input. +#' @param strict Validate that all column names exist in the current schema, +#' and throw an exception if any do not. +#' +#' @inherit as_polars_lf return +#' @examples +#' # Drop columns by passing the name of those columns +#' lf <- pl$LazyFrame( +#' foo = 1:3, +#' bar = c(6, 7, 8), +#' ham = c("a", "b", "c") +#' ) +#' lf$drop("ham")$collect() +#' lf$drop("ham", "bar")$collect() +#' +#' # Drop multiple columns by passing a selector +#' lf$drop(cs$all())$collect() lazyframe__drop <- function(..., strict = TRUE) { wrap({ check_dots_unnamed() @@ -358,12 +755,1924 @@ lazyframe__slice <- function(offset, length = NULL) { }) } +#' Get the first `n` rows +#' +#' @param n Number of rows to return. 
+#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:6, b = 7:12) +#' lf$head()$collect() +#' lf$head(2)$collect() lazyframe__head <- function(n = 5) { self$slice(0, n) |> wrap() } +#' Get the first `n` rows +#' +#' Alias for [`$head()`][lazyframe__head]. +#' +#' @inheritParams lazyframe__head +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:6, b = 7:12) +#' lf$limit()$collect() +#' lf$limit(2)$collect() +lazyframe__limit <- function(n = 5) { + wrap({ + self$head(n) + }) +} + +#' Get the last `n` rows +#' +#' @inheritParams lazyframe__head +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:6, b = 7:12) +#' lf$tail()$collect() +#' lf$tail(2)$collect() lazyframe__tail <- function(n = 5) { self$`_ldf`$tail(n) |> wrap() } + + +#' Get the first row of the LazyFrame +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$first()$collect() +lazyframe__first <- function() { + wrap({ + self$slice(0, 1) + }) +} + +#' Get the last row of the LazyFrame +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$last()$collect() +lazyframe__last <- function() { + wrap({ + self$tail(1) + }) +} + +#' Aggregate the columns in the LazyFrame to their maximum value +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$max()$collect() +lazyframe__max <- function() { + wrap({ + self$`_ldf`$max() + }) +} + +#' Aggregate the columns in the LazyFrame to their mean value +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$mean()$collect() +lazyframe__mean <- function() { + wrap({ + self$`_ldf`$mean() + }) +} + +#' Aggregate the columns in the LazyFrame to their median value +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$median()$collect() 
+lazyframe__median <- function() { + wrap({ + self$`_ldf`$median() + }) +} + +#' Aggregate the columns in the LazyFrame to their minimum value +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$min()$collect() +lazyframe__min <- function() { + wrap({ + self$`_ldf`$min() + }) +} + +#' Aggregate the columns of this LazyFrame to their sum values +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$sum()$collect() +lazyframe__sum <- function() { + wrap({ + self$`_ldf`$sum() + }) +} + +#' Aggregate the columns in the LazyFrame to their variance value +#' +#' @inheritParams DataFrame_var +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$var()$collect() +#' lf$var(ddof = 0)$collect() +lazyframe__var <- function(ddof = 1) { + wrap({ + self$`_ldf`$var(ddof) + }) +} + +#' Aggregate the columns of this LazyFrame to their standard deviation values +#' +#' @inheritParams DataFrame_std +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$std()$collect() +#' lf$std(ddof = 0)$collect() +lazyframe__std <- function(ddof = 1) { + wrap({ + self$`_ldf`$std(ddof) + }) +} + +#' Aggregate the columns in the DataFrame to a unique quantile value +#' +#' @inheritParams DataFrame_quantile +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +#' lf$quantile(0.7)$collect() +lazyframe__quantile <- function( + quantile, + interpolation = c("nearest", "higher", "lower", "midpoint", "linear")) { + wrap({ + interpolation <- arg_match0( + interpolation, + values = c("nearest", "higher", "lower", "midpoint", "linear") + ) + self$`_ldf`$quantile(as_polars_expr(quantile, as_lit = TRUE)$`_rexpr`, interpolation) + }) +} + +#' @inherit expr__fill_nan title params +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame( +#' a = 
c(1.5, 2, NaN, 4), +#' b = c(1.5, NaN, NaN, 4) +#' ) +#' lf$fill_nan(99)$collect() +lazyframe__fill_nan <- function(value) { + wrap({ + self$`_ldf`$fill_nan(as_polars_expr(value)$`_rexpr`) + }) +} + +#' @inherit DataFrame_fill_null title description params +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame( +#' a = c(1.5, 2, NA, 4), +#' b = c(1.5, NA, NA, 4) +#' ) +#' lf$fill_null(99)$collect() +lazyframe__fill_null <- function(fill_value) { + wrap({ + self$`_ldf`$fill_null(as_polars_expr(fill_value)$`_rexpr`) + }) +} + +#' Shift values by the given number of indices +#' +#' @inheritParams rlang::check_dots_empty0 +#' @param n Number of indices to shift forward. If a negative value is passed, +#' values are shifted in the opposite direction instead. +#' @param fill_value Fill the resulting null values with this value. Accepts +#' expression input. Non-expression inputs are parsed as literals. +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = 5:8) +#' +#' # By default, values are shifted forward by one index. +#' lf$shift()$collect() +#' +#' # Pass a negative value to shift in the opposite direction instead. +#' lf$shift(-2)$collect() +#' +#' # Specify fill_value to fill the resulting null values. +#' lf$shift(-2, fill_value = 100)$collect() +lazyframe__shift <- function(n = 1, ..., fill_value = NULL) { + wrap({ + check_dots_empty0(...) + self$`_ldf`$shift(as_polars_expr(n)$`_rexpr`, as_polars_expr(fill_value)$`_rexpr`) + }) +} + +#' Reverse the LazyFrame +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(key = c("a", "b", "c"), val = 1:3) +#' lf$reverse()$collect() +lazyframe__reverse <- function() { + wrap({ + self$`_ldf`$reverse() + }) +} + +#' Get a slice of the LazyFrame. +#' +#' @param offset Start index. Negative indexing is supported. +#' @param length Length of the slice. If `NULL` (default), all rows starting at +#' the offset will be selected. 
+#' +#' @return A [LazyFrame][lazyframe__class] +#' @examples +#' lf <- pl$LazyFrame(x = c("a", "b", "c"), y = 1:3, z = 4:6) +#' lf$slice(1, 2)$collect() +lazyframe__slice <- function(offset, length = NULL) { + wrap({ + self$`_ldf`$slice(offset, length) + }) +} + +#' Get the last `n` rows. +#' +#' @inherit lazyframe__head return params +#' @inheritParams lazyframe__head +#' @seealso [`$head()`][lazyframe__head] +#' @examples +#' lf <- pl$LazyFrame(a = 1:6, b = 7:12) +#' +#' lf$tail()$collect() +#' +#' lf$tail(2)$collect() +lazyframe__tail <- function(n = 5L) { + wrap({ + self$`_ldf`$tail(n) + }) +} + +#' Drop all rows that contain null values +#' +#' The original order of the remaining rows is preserved. +#' +#' @param subset Column name(s) for which null values are considered. If `NULL` +#' (default), use all columns. +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame( +#' foo = 1:3, +#' bar = c(6, NA, 8), +#' ham = c("a", "b", NA) +#' ) +#' +#' # The default behavior of this method is to drop rows where any single value +#' # of the row is null. +#' lf$drop_nulls()$collect() +#' +#' # This behaviour can be constrained to consider only a subset of columns, as +#' # defined by name or with a selector. For example, dropping rows if there is +#' # a null in any of the integer columns: +#' lf$drop_nulls(subset = cs$integer())$collect() +lazyframe__drop_nulls <- function(subset = NULL) { + wrap({ + if (!is.null(subset)) { + subset <- parse_into_list_of_expressions(!!!subset) + } + self$`_ldf`$drop_nulls(subset) + }) +} + +#' Drop duplicate rows from this DataFrame +#' +#' @inheritParams rlang::check_dots_empty0 +#' @param subset Column name(s) or selector(s), to consider when identifying +#' duplicate rows. If `NULL` (default), use all columns. +#' @param keep Which of the duplicate rows to keep. Must be one of: +#' * `"any"`: does not give any guarantee of which row is kept. This allows +#' more optimizations. 
+#' * `"none"`: don’t keep duplicate rows. +#' * `"first"`: keep first unique row. +#' * `"last"`: keep last unique row. +#' @param maintain_order Keep the same order as the original LazyFrame. This is +#' more expensive to compute. Setting this to `TRUE` blocks the possibility to +#' run on the streaming engine. +#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame( +#' foo = c(1, 2, 3, 1), +#' bar = c("a", "a", "a", "a"), +#' ham = c("b", "b", "b", "b"), +#' ) +#' lf$unique(maintain_order = TRUE)$collect() +#' +#' lf$unique(subset = c("bar", "ham"), maintain_order = TRUE)$collect() +#' +#' lf$unique(keep = "last", maintain_order = TRUE)$collect() +lazyframe__unique <- function( + subset = NULL, + ..., + keep = c("any", "none", "first", "last"), + maintain_order = FALSE) { + wrap({ + check_dots_empty0(...) + keep <- arg_match0(keep, values = c("any", "none", "first", "last")) + if (!is.null(subset)) { + subset <- parse_into_list_of_expressions(!!!subset) + } + self$`_ldf`$unique(subset = subset, keep = keep, maintain_order = maintain_order) + }) +} + +#' Join LazyFrames +#' +#' This function can do both mutating joins (adding columns based on matching +#' observations, for example with `how = "left"`) and filtering joins (keeping +#' observations based on matching observations, for example with `how = +#' "inner"`). +#' +#' @inheritParams rlang::check_dots_empty0 +#' @param other LazyFrame to join with. +#' @param on Either a vector of column names or a list of expressions and/or +#' strings. Use `left_on` and `right_on` if the column names to match on are +#' different between the two DataFrames. 
+#' @param how One of the following methods: +#' * "inner": returns rows that have matching values in both tables +#' * "left": returns all rows from the left table, and the matched rows from +#' the right table +#' * "right": returns all rows from the right table, and the matched rows from +#' the left table +#' * "full": returns all rows when there is a match in either left or right +#' table +#' * "cross": returns the Cartesian product of rows from both tables +#' * "semi": returns rows from the left table that have a match in the right +#' table. +#' * "anti": returns rows from the left table that have no match in the right +#' table. +#' @param left_on,right_on Same as `on` but only for the left or the right +#' DataFrame. They must have the same length. +#' @param suffix Suffix to add to duplicated column names. +#' @param validate Checks if join is of specified type: +#' * `"m:m"` (default): many-to-many, doesn't perform any checks; +#' * `"1:1"`: one-to-one, check if join keys are unique in both left and right +#' datasets; +#' * `"1:m"`: one-to-many, check if join keys are unique in left dataset +#' * `"m:1"`: many-to-one, check if join keys are unique in right dataset +#' +#' Note that this is currently not supported by the streaming engine. +#' +#' @param join_nulls Join on null values. By default null values will never +#' produce matches. +#' @param allow_parallel Allow the physical plan to optionally evaluate the +#' computation of both DataFrames up to the join in parallel. +#' @param force_parallel Force the physical plan to evaluate the computation of +#' both DataFrames up to the join in parallel. +#' @param coalesce Coalescing behavior (merging of join columns). +#' - `NULL`: join specific. +#' - `TRUE`: Always coalesce join columns. +#' - `FALSE`: Never coalesce join columns. +#' Note that joining on any other expressions than `col` will turn off +#' coalescing. 
+#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame( +#' foo = 1:3, +#' bar = c(6, 7, 8), +#' ham = c("a", "b", "c") +#' ) +#' other_lf <- pl$LazyFrame( +#' apple = c("x", "y", "z"), +#' ham = c("a", "b", "d") +#' ) +#' lf$join(other_lf, on = "ham")$collect() +#' +#' lf$join(other_lf, on = "ham", how = "full")$collect() +#' +#' lf$join(other_lf, on = "ham", how = "left", coalesce = TRUE)$collect() +#' +#' lf$join(other_lf, on = "ham", how = "semi")$collect() +#' +#' lf$join(other_lf, on = "ham", how = "anti")$collect() +lazyframe__join <- function( + other, + on = NULL, + how = "inner", + ..., + left_on = NULL, + right_on = NULL, + suffix = "_right", + validate = "m:m", + join_nulls = FALSE, + allow_parallel = TRUE, + force_parallel = FALSE, + coalesce = NULL) { + wrap({ + check_dots_empty0(...) + check_polars_lf(other) + how <- arg_match0( + how, + values = c("inner", "full", "left", "right", "semi", "anti", "cross") + ) + validate <- arg_match0(validate, values = c("m:m", "1:m", "m:1", "1:1")) + uses_on <- !is.null(on) + uses_left_on <- !is.null(left_on) + uses_right_on <- !is.null(right_on) + uses_lr_on <- uses_left_on | uses_right_on + if (uses_on && uses_lr_on) { + abort("cannot use 'on' in conjunction with 'left_on' or 'right_on'.") + } + if (uses_left_on && !uses_right_on) { + abort("'left_on' requires corresponding 'right_on'") + } + if (!uses_left_on && uses_right_on) { + abort("'right_on' requires corresponding 'left_on'") + } + if (how == "cross") { + if (uses_on | uses_lr_on) { + abort("cross join should not pass join keys.") + } + return( + self$`_ldf`$join( + other$`_ldf`, list(), list(), + how = how, validate = validate, + join_nulls = join_nulls, suffix = suffix, + allow_parallel = allow_parallel, force_parallel = force_parallel, + coalesce = coalesce + ) + ) + } + + if (uses_on) { + rexprs_right <- rexprs_left <- parse_into_list_of_expressions(!!!on) + } else if (uses_lr_on) { + rexprs_left <- 
parse_into_list_of_expressions(!!!left_on)
+      rexprs_right <- parse_into_list_of_expressions(!!!right_on)
+    } else {
+      abort("must specify either `on`, or `left_on` and `right_on`.")
+    }
+    self$`_ldf`$join(
+      other$`_ldf`, rexprs_left, rexprs_right,
+      how = how, validate = validate,
+      join_nulls = join_nulls, suffix = suffix,
+      allow_parallel = allow_parallel, force_parallel = force_parallel,
+      coalesce = coalesce
+    )
+  })
+}
+
+#' Perform a join based on one or multiple (in)equality predicates
+#'
+#' @description
+#' `r lifecycle::badge("experimental")`
+#'
+#' This performs an inner join, so only rows where all predicates are true are
+#' included in the result, and a row from either LazyFrame may be included
+#' multiple times in the result.
+#'
+#' Note that the row order of the input LazyFrames is not preserved.
+#'
+#' @param other LazyFrame to join with.
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> (In)Equality condition to
+#' join the two tables on. When a column name occurs in both tables, the proper
+#' suffix must be applied in the predicate. For example, if both tables have a
+#' column `"x"` that you want to use in the conditions, you must refer to the
+#' column of the right table as `"x<suffix>"` (e.g. `"x_right"` with the
+#' default `suffix`).
+#' @param suffix Suffix to append to columns with a duplicate name.
+#'
+#' @inherit as_polars_lf return
+#'
+#' @examples
+#' east <- pl$LazyFrame(
+#'   id = c(100, 101, 102),
+#'   dur = c(120, 140, 160),
+#'   rev = c(12, 14, 16),
+#'   cores = c(2, 8, 4)
+#' )
+#'
+#' west <- pl$LazyFrame(
+#'   t_id = c(404, 498, 676, 742),
+#'   time = c(90, 130, 150, 170),
+#'   cost = c(9, 13, 15, 16),
+#'   cores = c(4, 2, 1, 4)
+#' )
+#'
+#' east$join_where(
+#'   west,
+#'   pl$col("dur") < pl$col("time"),
+#'   pl$col("rev") < pl$col("cost")
+#' )$collect()
+lazyframe__join_where <- function(
+    other,
+    ...,
+    suffix = "_right") {
+  wrap({
+    check_polars_lf(other)
+    by <- parse_into_list_of_expressions(...)
+    self$`_ldf`$join_where(other$`_ldf`, by, suffix)
+  })
+}
+
+#' Unpivot a LazyFrame from wide to long format
+#'
+#' This function is useful to massage a LazyFrame into a format where one or
+#' more columns are identifier variables (`index`) while all other columns,
+#' considered measured variables (`on`), are “unpivoted” to the row axis
+#' leaving just two non-identifier columns, "variable" and "value".
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @param on Column(s) or selector(s) to use as measured (value) variables. If
+#' `on` is `NULL` (default), all columns that are not in `index` will be used.
+#' @param index Column(s) or selector(s) to use as identifier variables.
+#' @param variable_name Name to give to the new column containing the names of
+#' the unpivoted columns. Defaults to `"variable"`.
+#' @param value_name Name to give to the new column containing the values of
+#' the unpivoted columns. Defaults to `"value"`.
+#'
+#' @inherit as_polars_lf return
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   a = c("x", "y", "z"),
+#'   b = c(1, 3, 5),
+#'   c = c(2, 4, 6)
+#' )
+#' lf$unpivot(index = "a", on = c("b", "c"))$collect()
+lazyframe__unpivot <- function(
+    on = NULL,
+    ...,
+    index = NULL,
+    variable_name = NULL,
+    value_name = NULL) {
+  wrap({
+    check_dots_empty0(...)
+    if (!is.null(on)) {
+      on <- parse_into_list_of_expressions(!!!on)
+    }
+    if (!is.null(index)) {
+      index <- parse_into_list_of_expressions(!!!index)
+    }
+    self$`_ldf`$unpivot(on, index, value_name, variable_name)
+  })
+}
+
+#' Rename column names
+#'
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Either a function that takes
+#' a character vector as input and returns a character vector as output, or
+#' named values where names are old column names and values are the new ones.
+#' @param .strict Validate that all column names exist in the current schema,
+#' and throw an error if any do not. (Note that this parameter is a no-op when
+#' passing a function to `...`).
+#'
+#' @details
+#' If existing names are swapped (e.g.
'A' points to 'B' and 'B' points to
+#' 'A'), polars will block projection and predicate pushdowns at this node.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = 6:8,
+#'   ham = letters[1:3]
+#' )
+#'
+#' lf$rename(foo = "apple")$collect()
+#'
+#' lf$rename(
+#'   \(column_name) paste0("c", substr(column_name, 2, 100))
+#' )$collect()
+lazyframe__rename <- function(..., .strict = TRUE) {
+  wrap({
+    mapping <- list2(...)
+    # A single unnamed function means "rename every column through this
+    # function"; that path is not available yet, so fail explicitly.
+    if (length(mapping) == 1 && is_function(mapping[[1]]) && !is_named(mapping)) {
+      # TODO: this requires $name$map(); once available, implement as
+      # self$select(pl$all()$name$map(mapping[[1]]))
+      abort("Not implemented yet")
+    }
+    if (!is_list_of_string(mapping)) {
+      abort("`...` only accepts an unnamed function or named single strings.")
+    }
+    # Names of `...` are the existing column names, values are the new names.
+    existing <- names(mapping)
+    new <- unlist(mapping)
+    self$`_ldf`$rename(existing, new, .strict)
+  })
+}
+
+#' Collect and profile a lazy query
+#'
+#' @description
+#' This will run the query and return a list containing the
+#' materialized DataFrame and a DataFrame that contains profiling information
+#' of each node that is executed.
+#'
+#' @inheritParams lazyframe__collect
+#' @param show_plot Show a Gantt chart of the profiling result
+#' @param truncate_nodes Truncate the label lengths in the Gantt chart to this
+#' number of characters. If `0` (default), do not truncate.
+#'
+#' @details The units of the timings are microseconds.
+#'
+#' @return List of two `DataFrame`s: one with the collected result, the other
+#' with the timings of each step. If `show_plot = TRUE`, then the plot is
+#' also stored in the list.
+#' @seealso
+#' - [`$collect()`][lazyframe__collect] - regular collect.
+#' - [`$collect_in_background()`][lazyframe__collect_in_background] - non-blocking
+#' collect returns a future handle. Can also just be used via
+#' `$collect(collect_in_background = TRUE)`.
+#' - [`$sink_parquet()`][lazyframe__sink_parquet] streams query to a parquet file.
+#' - [`$sink_ipc()`][lazyframe__sink_ipc] streams query to an Arrow file.
+#'
+#' @examples
+#' ## Simplest use case
+#' pl$LazyFrame()$select(pl$lit(2) + 2)$profile()
+#'
+#' ## Use $profile() to compare two queries
+#'
+#' # -1- map each Species-group with native polars, takes ~120us only
+#' as_polars_lf(iris)$
+#'   sort("Sepal.Length")$
+#'   group_by("Species", maintain_order = TRUE)$
+#'   agg(pl$col(pl$Float64)$first() + 5)$
+#'   profile()
+#'
+#' # -2- map each Species-group of each numeric column with an R function, takes ~7000us (slow!)
+#'
+#' # some R function, prints `.` for each time called by polars
+#' r_func <- \(s) {
+#'   cat(".")
+#'   s$to_r()[1] + 5
+#' }
+#'
+#' as_polars_lf(iris)$
+#'   sort("Sepal.Length")$
+#'   group_by("Species", maintain_order = TRUE)$
+#'   agg(pl$col(pl$Float64)$map_elements(r_func))$
+#'   profile()
+lazyframe__profile <- function(
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    comm_subplan_elim = TRUE,
+    comm_subexpr_elim = TRUE,
+    cluster_with_columns = TRUE,
+    streaming = FALSE,
+    no_optimization = FALSE,
+    collect_in_background = FALSE,
+    show_plot = FALSE,
+    truncate_nodes = 0) {
+  # NOTE(review): `collect_in_background` is accepted but not used in this body.
+  if (isTRUE(no_optimization)) {
+    predicate_pushdown <- FALSE
+    projection_pushdown <- FALSE
+    slice_pushdown <- FALSE
+    comm_subplan_elim <- FALSE
+    comm_subexpr_elim <- FALSE
+    cluster_with_columns <- FALSE
+  }
+
+  if (isTRUE(streaming)) {
+    comm_subplan_elim <- FALSE
+  }
+
+  lf <- self$`_ldf`$optimization_toggle(
+    type_coercion = type_coercion,
+    predicate_pushdown = predicate_pushdown,
+    projection_pushdown = projection_pushdown,
+    simplify_expression = simplify_expression,
+    slice_pushdown = slice_pushdown,
+    comm_subplan_elim = comm_subplan_elim,
+    comm_subexpr_elim = comm_subexpr_elim,
+    cluster_with_columns = cluster_with_columns,
+    streaming = streaming,
+    `_eager` = FALSE
+  )
+
+  # Profile the frame returned by optimization_toggle() so that the
+  # optimization flags set above actually take effect.
+  out <- lapply(lf$profile(), \(x) {
+    x |>
+      .savvy_wrap_PlRDataFrame() |>
+      wrap()
+  })
+
+  if (isTRUE(show_plot)) {
+    out[["plot"]] <- make_profile_plot(out, truncate_nodes)
+  }
+
+  out
+}
+
+#' Serialize the logical plan of this LazyFrame to a string in JSON format
+#'
+#' @return A character value
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:3)$sum()
+#' lf$serialize()
+lazyframe__serialize <- function() {
+  wrap({
+    self$`_ldf`$serialize()
+  })
+}
+
+#' Read a logical plan from a JSON string to construct a LazyFrame
+#'
+#' @param source String containing the LazyFrame logical plan in JSON format.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:3)$sum()
+#' ser <- lf$serialize()
+#' pl$deserialize_lf(ser)
+pl__deserialize_lf <- function(source) {
+  wrap({
+    deserialize_lf(source)
+  })
+}
+
+#' Explode the LazyFrame to long format by exploding the given columns
+#'
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Column names, expressions, or
+#' a selector defining them. The underlying columns being exploded must be of
+#' the `List` or `Array` data type.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   letters = c("a", "a", "b", "c"),
+#'   numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8))
+#' )
+#'
+#' lf$explode("numbers")$collect()
+lazyframe__explode <- function(...) {
+  wrap({
+    check_dots_unnamed()
+    by <- parse_into_list_of_expressions(...)
+    self$`_ldf`$explode(by)
+  })
+}
+
+#' Clone a LazyFrame
+#'
+#' This makes a very cheap deep copy/clone of an existing
+#' [`LazyFrame`][lazyframe__class]. Rarely useful as `LazyFrame`s are nearly 100%
+#' immutable. Any modification of a `LazyFrame` should lead to a clone anyways,
+#' but this can be useful when dealing with attributes (see examples).
+#'
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' df1 <- as_polars_lf(iris)
+#'
+#' # Make a function to take a LazyFrame, add an attribute, and return a LazyFrame
+#' give_attr <- function(data) {
+#'   attr(data, "created_on") <- "2024-01-29"
+#'   data
+#' }
+#' df2 <- give_attr(df1)
+#'
+#' # Problem: the original LazyFrame also gets the attribute while it shouldn't!
+#' attributes(df1)
+#'
+#' # Use $clone() inside the function to avoid that
+#' give_attr <- function(data) {
+#'   data <- data$clone()
+#'   attr(data, "created_on") <- "2024-01-29"
+#'   data
+#' }
+#' df1 <- as_polars_lf(iris)
+#' df2 <- give_attr(df1)
+#'
+#' # now, the original LazyFrame doesn't get this attribute
+#' attributes(df1)
+lazyframe__clone <- function() {
+  # Pass the cloned internal frame through wrap(), like the other methods, so
+  # the caller gets a LazyFrame back as documented.
+  self$`_ldf`$clone() |>
+    wrap()
+}
+
+
+#' Decompose struct columns into separate columns for each of their fields
+#'
+#' The new columns will be inserted into the LazyFrame at the location of the
+#' struct column.
+#'
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name of the struct column(s)
+#' that should be unnested.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   a = 1:5,
+#'   b = c("one", "two", "three", "four", "five"),
+#'   c = 6:10
+#' )$
+#'   select(
+#'     pl$struct("b"),
+#'     pl$struct(c("a", "c"))$alias("a_and_c")
+#'   )
+#' lf$collect()
+#'
+#' lf$unnest("a_and_c")$collect()
+#' lf$unnest(pl$col("a_and_c"))$collect()
+lazyframe__unnest <- function(...) {
+  wrap({
+    check_dots_unnamed()
+    columns <- parse_into_list_of_expressions(...)
+    self$`_ldf`$unnest(columns)
+  })
+}
+
+#' Add an external context to the computation graph
+#'
+#' This allows expressions to also access columns from DataFrames or LazyFrames
+#' that are not part of this one.
+#'
+#' @param other Data/LazyFrame to have access to. This can be a list of DataFrames
+#' and LazyFrames.
+#' @inherit as_polars_lf return
+#'
+#' @examples
+#' lf <- pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA))
+#' lf_other <- pl$LazyFrame(c = c("foo", "ham"))
+#'
+#' lf$with_context(lf_other)$select(
+#'   pl$col("b") + pl$col("c")$first()
+#' )$collect()
+#'
+#' # Fill nulls with the median from another lazyframe:
+#' train_lf <- pl$LazyFrame(
+#'   feature_0 = c(-1.0, 0, 1), feature_1 = c(-1.0, 0, 1)
+#' )
+#' test_lf <- pl$LazyFrame(
+#'   feature_0 = c(-1.0, NA, 1), feature_1 = c(-1.0, 0, 1)
+#' )
+#'
+#' test_lf$with_context(train_lf$select(pl$all()$name$suffix("_train")))$select(
+#'   pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median())
+#' )$collect()
+lazyframe__with_context <- function(other) {
+  # NOTE(review): unlike the sibling methods, the result is not passed through
+  # `wrap()` and `other` is forwarded to the binding as-is; confirm that this
+  # is intended.
+  self$`_ldf`$with_context(other)
+}
+
+
+#' Create rolling groups based on a date/time or integer column
+#'
+#' @description
+#' Different from `group_by_dynamic`, the windows are now determined by the
+#' individual values and are not of constant intervals. For constant intervals
+#' use [`$group_by_dynamic()`][lazyframe__group_by_dynamic].
+#'
+#' If you have a time series `<t_0, t_1, ..., t_n>`, then by default the
+#' windows created will be:
+#' * `(t_0 - period, t_0]`
+#' * `(t_1 - period, t_1]`
+#' * …
+#' * `(t_n - period, t_n]`
+#'
+#' whereas if you pass a non-default `offset`, then the windows will be:
+#' * `(t_0 + offset, t_0 + offset + period]`
+#' * `(t_1 + offset, t_1 + offset + period]`
+#' * …
+#' * `(t_n + offset, t_n + offset + period]`
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @inheritParams lazyframe__group_by_dynamic
+#' @param period Length of the window - must be non-negative.
+#' @param offset Offset of the window. Default is `-period`.
+#' +#' @inherit expr__rolling_max params details +#' @return A [LazyGroupBy][LazyGroupBy_class] object +#' @seealso +#' - [`$group_by_dynamic()`][lazyframe__group_by_dynamic] +#' @examples +#' dates <- c( +#' "2020-01-01 13:45:48", +#' "2020-01-01 16:42:13", +#' "2020-01-01 16:45:09", +#' "2020-01-02 18:12:48", +#' "2020-01-03 19:45:32", +#' "2020-01-08 23:16:43" +#' ) +#' +#' df <- pl$LazyFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$with_columns( +#' pl$col("dt")$str$strptime(pl$Datetime()) +#' ) +#' +#' df$rolling(index_column = "dt", period = "2d")$agg( +#' sum_a = pl$col("a")$sum(), +#' min_a = pl$col("a")$min(), +#' max_a = pl$col("a")$max() +#' )$collect() +lazyframe__rolling <- function( + index_column, + ..., + period, + offset = NULL, + closed = "right", + group_by = NULL) { + wrap({ + check_dots_empty0(...) + closed <- arg_match0(closed, values = c("both", "left", "right", "none")) + period <- parse_as_duration_string(period) + if (!is.null(offset)) { + offset <- parse_as_duration_string(offset) + } else { + offset <- negate_duration_string(period) + } + if (!is.null(group_by) && !is.list(group_by)) { + group_by <- list(group_by) + } + by <- parse_into_list_of_expressions(!!!group_by) + self$`_ldf`$rolling( + as_polars_expr(index_column)$`_rexpr`, period, offset, closed, by + ) + }) +} + + +#' Group based on a date/time or integer column +#' +#' Time windows are calculated and rows are assigned to windows. Different from +#' a normal group by is that a row can be member of multiple groups. By +#' default, the windows look like: +#' * [start, start + period) +#' * [start + every, start + every + period) +#' * [start + 2*every, start + 2*every + period) +#' * … +#' +#' where `start` is determined by `start_by`, `offset`, `every`, and the +#' earliest datapoint. See the `start_by` argument description for details. +#' +#' @inheritParams rlang::check_dots_empty0 +#' @param index_column Column used to group based on the time window. 
Often of
+#' type Date/Datetime. This column must be sorted in ascending order (or, if
+#' `group_by` is specified, then it must be sorted in ascending order within
+#' each group).
+#' In case of a dynamic group by on indices, the data type needs to be either
+#' Int32 or Int64. Note that Int32 gets temporarily cast to Int64, so if
+#' performance matters, use an Int64 column.
+#' @param every Interval of the window.
+#' @param period Length of the window. If `NULL` (default), it will equal
+#' `every`.
+#' @param offset Offset of the window, does not take effect if
+#' `start_by = "datapoint"`. Defaults to zero.
+#' @param include_boundaries Add two columns, `"_lower_boundary"` and
+#' `"_upper_boundary"`, that show the boundaries of the window. This will
+#' impact performance because it’s harder to parallelize.
+#' @param closed Define which sides of the interval are closed (inclusive).
+#' Default is `"left"`.
+#' @param label Define which label to use for the window:
+#' * `"left"`: lower boundary of the window
+#' * `"right"`: upper boundary of the window
+#' * `"datapoint"`: the first value of the index column in the given window. If
+#' you don’t need the label to be at one of the boundaries, choose this option
+#' for maximum performance.
+#' @param group_by Also group by this column/these columns.
+#' @param start_by The strategy to determine the start of the first window by:
+#' * `"window"`: start by taking the earliest timestamp, truncating it with
+#' `every`, and then adding `offset`. Note that weekly windows start on
+#' Monday.
+#' * `"datapoint"`: start from the first encountered data point.
+#' * a day of the week (only takes effect if `every` contains `"w"`): `"monday"`
+#' starts the window on the Monday before the first data point, etc.
+#' +#' @details +#' The `every`, `period`, and `offset` arguments are created with the following +#' string language: +#' - 1ns # 1 nanosecond +#' - 1us # 1 microsecond +#' - 1ms # 1 millisecond +#' - 1s # 1 second +#' - 1m # 1 minute +#' - 1h # 1 hour +#' - 1d # 1 day +#' - 1w # 1 calendar week +#' - 1mo # 1 calendar month +#' - 1y # 1 calendar year +#' These strings can be combined: +#' - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds +#' +#' In case of a `group_by_dynamic` on an integer column, the windows are +#' defined by: +#' - 1i # length 1 +#' - 10i # length 10 +#' +#' @return A [LazyGroupBy][LazyGroupBy_class] object +#' @seealso +#' - [`$rolling()`][lazyframe__rolling] +#' +#' @examples +#' lf <- pl$select( +#' time = pl$datetime_range( +#' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' interval = "30m" +#' ), +#' n = 0:6 +#' )$lazy() +#' lf$collect() +#' +#' # Group by windows of 1 hour. +#' lf$group_by_dynamic("time", every = "1h", closed = "right")$agg( +#' vals = pl$col("n") +#' )$collect() +#' +#' # The window boundaries can also be added to the aggregation result +#' lf$group_by_dynamic( +#' "time", +#' every = "1h", include_boundaries = TRUE, closed = "right" +#' )$agg( +#' pl$col("n")$mean() +#' )$collect() +#' +#' # When closed = "left", the window excludes the right end of interval: +#' # [lower_bound, upper_bound) +#' lf$group_by_dynamic("time", every = "1h", closed = "left")$agg( +#' pl$col("n") +#' )$collect() +#' +#' # When closed = "both" the time values at the window boundaries belong to 2 +#' # groups. 
+#' lf$group_by_dynamic("time", every = "1h", closed = "both")$agg(
+#'   pl$col("n")
+#' )$collect()
+#'
+#' # Dynamic group bys can also be combined with grouping on normal keys
+#' lf <- lf$with_columns(
+#'   groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a"))
+#' )
+#' lf$collect()
+#'
+#' lf$group_by_dynamic(
+#'   "time",
+#'   every = "1h",
+#'   closed = "both",
+#'   group_by = "groups",
+#'   include_boundaries = TRUE
+#' )$agg(pl$col("n"))$collect()
+#'
+#' # We can also create a dynamic group by based on an index column
+#' lf <- pl$LazyFrame(
+#'   idx = 0:5,
+#'   A = c("A", "A", "B", "B", "B", "C")
+#' )$with_columns(pl$col("idx")$set_sorted())
+#' lf$collect()
+#'
+#' lf$group_by_dynamic(
+#'   "idx",
+#'   every = "2i",
+#'   period = "3i",
+#'   include_boundaries = TRUE,
+#'   closed = "right"
+#' )$agg(A_agg_list = pl$col("A"))$collect()
+lazyframe__group_by_dynamic <- function(
+    index_column,
+    ...,
+    every,
+    period = NULL,
+    offset = NULL,
+    include_boundaries = FALSE,
+    closed = "left",
+    label = "left",
+    group_by = NULL,
+    start_by = "window") {
+  wrap({
+    check_dots_empty0(...)
+
+    # Validate every enum-like argument on the R side so that a typo produces
+    # a clear error listing the accepted values.
+    closed <- arg_match0(closed, values = c("both", "left", "right", "none"))
+    label <- arg_match0(label, values = c("left", "right", "datapoint"))
+    start_by <- arg_match0(
+      start_by,
+      values = c(
+        "window", "datapoint", "monday", "tuesday", "wednesday", "thursday",
+        "friday", "saturday", "sunday"
+      )
+    )
+
+    # `offset` falls back to a zero duration and `period` to `every` when they
+    # are not supplied.
+    every <- parse_as_duration_string(every)
+    offset <- parse_as_duration_string(offset) %||% "0ns"
+    period <- parse_as_duration_string(period) %||% every
+
+    # Splice `group_by` so that each element becomes its own expression.
+    group_by <- parse_into_list_of_expressions(!!!group_by)
+
+    self$`_ldf`$group_by_dynamic(
+      as_polars_expr(index_column)$`_rexpr`, every, period, offset, label,
+      include_boundaries, closed,
+      group_by, start_by
+    )
+  })
+}
+
+#' Plot the query plan
+#'
+#' This only returns the "dot" output that can be passed to other packages, such
+#' as `DiagrammeR::grViz()`.
+#'
+#' @param ... Not used.
+#' @param optimized Optimize the query plan.
+#' @inheritParams lazyframe__explain
+#'
+#' @return A character vector
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   a = c("a", "b", "a", "b", "b", "c"),
+#'   b = 1:6,
+#'   c = 6:1
+#' )
+#'
+#' query <- lf$group_by("a", maintain_order = TRUE)$agg(
+#'   pl$all()$sum()
+#' )$sort(
+#'   "a"
+#' )
+#'
+#' query$to_dot() |> cat()
+#'
+#' # You could print the graph by using DiagrammeR for example, with
+#' # query$to_dot() |> DiagrammeR::grViz().
+lazyframe__to_dot <- function(
+    ...,
+    optimized = TRUE,
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    comm_subplan_elim = TRUE,
+    comm_subexpr_elim = TRUE,
+    cluster_with_columns = TRUE,
+    streaming = FALSE) {
+  wrap({
+    check_dots_empty0(...)
+
+    # Apply the requested optimization flags first. The argument names match
+    # the ones used by the `$sink_*()` methods (`type_coercion`, `_eager`).
+    lf <- self$`_ldf`$optimization_toggle(
+      type_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = comm_subplan_elim,
+      comm_subexpr_elim = comm_subexpr_elim,
+      cluster_with_columns = cluster_with_columns,
+      streaming = streaming,
+      `_eager` = FALSE
+    )
+
+    # Render the plan of the toggled LazyFrame (not of the untouched `self`),
+    # otherwise the optimization flags above would have no effect.
+    lf$to_dot(optimized)
+  })
+}
+
+#' Create an empty or n-row null-filled copy of the LazyFrame
+#'
+#' Returns a n-row null-filled LazyFrame with an identical schema. `n` can be
+#' greater than the current number of rows in the LazyFrame.
+#'
+#' @param n Number of (empty) rows to return in the cleared frame.
+#'
+#' @return A n-row null-filled LazyFrame with an identical schema
+#'
+#' @examples
+#' df <- pl$LazyFrame(
+#'   a = c(NA, 2, 3, 4),
+#'   b = c(0.5, NA, 2.5, 13),
+#'   c = c(TRUE, TRUE, FALSE, NA)
+#' )
+#'
+#' df$clear()
+#'
+#' df$clear(n = 5)
+lazyframe__clear <- function(n = 0) {
+  pl$DataFrame(schema = self$schema)$clear(n)$lazy()
+}
+
+#' Take every nth row in the LazyFrame
+#'
+#' @param n Gather every `n`-th row.
+#' @param offset Starting index.
+#' +#' @inherit as_polars_lf return +#' +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = 5:8) +#' lf$gather_every(2)$collect() +#' +#' lf$gather_every(2, offset = 1)$collect() +lazyframe__gather_every <- function(n, offset = 0) { + self$select(pl$col("*")$gather_every(n, offset)) +} + +#' Return the number of non-null elements for each column +#' +#' @inherit as_polars_lf return +#' +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4)) +#' lf$count()$collect() +lazyframe__count <- function() { + wrap({ + self$`_ldf`$count() + }) +} + +#' Return the number of null elements for each column +#' +#' @inherit as_polars_lf return +#' +#' @examples +#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4)) +#' lf$null_count()$collect() +lazyframe__null_count <- function() { + wrap({ + self$`_ldf`$null_count() + }) +} + +#' Return the `k` smallest rows +#' +#' @description +#' Non-null elements are always preferred over null elements, regardless of the +#' value of `reverse`. The output is not guaranteed to be in any particular +#' order, call `sort()` after this function if you wish the output to be sorted. +#' +#' @inheritParams rlang::check_dots_empty +#' @param k Number of rows to return. +#' @param by Column(s) used to determine the bottom rows. Accepts expression +#' input. Strings are parsed as column names. +#' @param reverse Consider the `k` largest elements of the by column(s) +#' (instead of the k smallest). This can be specified per column by passing a +#' sequence of booleans. +#' +#' @inherit as_polars_lf return +#' +#' @examples +#' lf <- pl$LazyFrame( +#' a = c("a", "b", "a", "b", "b", "c"), +#' b = c(2, 1, 1, 3, 2, 1) +#' ) +#' +#' # Get the rows which contain the 4 smallest values in column b. 
+#' lf$bottom_k(4, by = "b")$collect() +#' +#' # Get the rows which contain the 4 smallest values when sorting on column a +#' # and b. +#' lf$bottom_k(4, by = c("a", "b"))$collect() +lazyframe__bottom_k <- function(k, ..., by, reverse = FALSE) { + wrap({ + check_dots_empty0(...) + # Splice `by` so that each element is parsed as its own expression. + by <- parse_into_list_of_expressions(!!!by) + # extend_bool() receives the number of `by` expressions as the length that + # `reverse` has to match. + reverse <- extend_bool(reverse, length(by), "reverse", "...") + self$`_ldf`$bottom_k(k, by, reverse) + }) +} + +#' Return the `k` largest rows +#' +#' @inherit lazyframe__bottom_k description params +#' @inheritParams rlang::check_dots_empty0 +#' @param reverse Consider the `k` smallest elements of the `by` column(s) +#' (instead of the `k` largest). This can be specified per column by passing a +#' sequence of booleans. +#' +#' @inherit as_polars_lf return +#' +#' @examples +#' lf <- pl$LazyFrame( +#' a = c("a", "b", "a", "b", "b", "c"), +#' b = c(2, 1, 1, 3, 2, 1) +#' ) +#' +#' # Get the rows which contain the 4 largest values in column b. +#' lf$top_k(4, by = "b")$collect() +#' +#' # Get the rows which contain the 4 largest values when sorting on column a +#' # and b. +#' lf$top_k(4, by = c("a", "b"))$collect() +lazyframe__top_k <- function(k, ..., by, reverse = FALSE) { + wrap({ + check_dots_empty0(...) + # Splice `by` so that each element is parsed as its own expression. + by <- parse_into_list_of_expressions(!!!by) + # extend_bool() receives the number of `by` expressions as the length that + # `reverse` has to match. + reverse <- extend_bool(reverse, length(by), "reverse", "...") + self$`_ldf`$top_k(k, by, reverse) + }) +} + +#' Interpolate intermediate values +#' +#' The interpolation method is linear. +#' @inherit as_polars_lf return +#' +#' @examples +#' lf <- pl$LazyFrame( +#' foo = c(1, NA, 9, 10), +#' bar = c(6, 7, 9, NA), +#' ham = c(1, NA, NA, 9) +#' ) +#' +#' lf$interpolate()$collect() +lazyframe__interpolate <- function() { + wrap({ + # Implemented as a projection: interpolate every column (`"*"`). + self$select(pl$col("*")$interpolate()) + }) +} + +#' Take two sorted LazyFrames and merge them by the sorted key +#' +#' The output of this operation will also be sorted.
It is the caller's +#' responsibility that the frames are sorted by that key, otherwise the output +#' will not make sense. The schemas of both LazyFrames must be equal. +#' +#' @param other Other LazyFrame that must be merged. +#' @param key Key that is sorted. +#' +#' @inherit as_polars_lf return +#' +#' @examples +#' lf1 <- pl$LazyFrame( +#' name = c("steve", "elise", "bob"), +#' age = c(42, 44, 18) +#' )$sort("age") +#' +#' lf2 <- pl$LazyFrame( +#' name = c("anna", "megan", "steve", "thomas"), +#' age = c(21, 33, 42, 20) +#' )$sort("age") +#' +#' lf1$merge_sorted(lf2, key = "age")$collect() +lazyframe__merge_sorted <- function(other, key) { + wrap({ + # `other` must expose the same internal `_ldf` handle as `self`. + self$`_ldf`$merge_sorted(other$`_ldf`, key) + }) +} + +#' Indicate that one or multiple columns are sorted +#' +#' This can speed up future operations, but it can lead to incorrect results if +#' the data is **not** sorted! Use with care! +#' +#' @inheritParams rlang::check_dots_empty0 +#' @param column Columns that are sorted. +#' @param descending Whether the columns are sorted in descending order. +#' +#' @inherit as_polars_lf return +lazyframe__set_sorted <- function(column, ..., descending = FALSE) { + wrap({ + check_dots_empty0(...) + # Implemented on top of the expression-level `$set_sorted()`. + self$with_columns(pl$col(column)$set_sorted(descending = descending)) + }) +} + +#' Add a row index as the first column in the LazyFrame +#' +#' @description +#' Using this function can have a negative effect on query performance. This +#' may, for instance, block predicate pushdown optimization. +#' +#' @inheritParams rlang::check_dots_empty0 +#' @param name Name of the index column. +#' @param offset Start the index at this offset. Cannot be negative.
+#' +#' @inherit as_polars_lf return +#' @examples +#' lf <- pl$LazyFrame(x = c(1, 3, 5), y = c(2, 4, 6)) +#' lf$with_row_index()$collect() +#' +#' lf$with_row_index("id", offset = 1000)$collect() +#' +#' # An index column can also be created using the expressions int_range() +#' # and len()$ +#' lf$with_columns( +#' index = pl$int_range(pl$len(), dtype = pl$UInt32) +#' )$collect() +lazyframe__with_row_index <- function(name = "index", offset = 0) { + wrap({ + self$`_ldf`$with_row_index(name, offset) + }) +} + +#' Evaluate the query in streaming mode and write to a Parquet file +#' +#' @description +#' `r lifecycle::badge("experimental")` +#' +#' This allows streaming results that are larger than RAM to be written to disk. +#' +#' @inheritParams rlang::check_dots_empty0 +#' @param path A character. File path to which the file should be written. +#' @param compression The compression method. Must be one of: +#' * `"lz4"`: fast compression/decompression. +#' * `"uncompressed"` +#' * `"snappy"`: this guarantees that the parquet file will be compatible with +#' older parquet readers. +#' * `"gzip"` +#' * `"lzo"` +#' * `"brotli"` +#' * `"zstd"`: good compression performance. +#' @param compression_level `NULL` or integer. The level of compression to use. +#' Only used if method is one of `"gzip"`, `"brotli"`, or `"zstd"`. Higher +#' compression means smaller files on disk: +#' * `"gzip"`: min-level: 0, max-level: 10. +#' * `"brotli"`: min-level: 0, max-level: 11. +#' * `"zstd"`: min-level: 1, max-level: 22. +#' @param statistics Whether statistics should be written to the Parquet +#' headers. Possible values: +#' * `TRUE`: enable default set of statistics (default) +#' * `FALSE`: disable all statistics +#' * `"full"`: calculate and write all available statistics. +#' * A named list where all values must be `TRUE` or `FALSE`, e.g. +#' `list(min = TRUE, max = FALSE)`. Statistics available are `"min"`, `"max"`, +#' `"distinct_count"`, `"null_count"`. 
+#' @param row_group_size Size of the row groups in number of rows. If `NULL` +#' (default), the chunks of the DataFrame are used. Writing in smaller chunks +#' may reduce memory pressure and improve writing speeds. +#' @param data_page_size Size of the data page in bytes. If `NULL` (default), it +#' is set to 1024^2 bytes. +#' @param maintain_order Maintain the order in which data is processed. Setting +#' this to `FALSE` will be slightly faster. +#' @inheritParams lazyframe__collect +#' @inheritParams pl__scan_parquet +#' +#' @return Invisibly returns the input LazyFrame +#' +#' @examples +#' # sink table 'mtcars' from mem to parquet +#' tmpf <- tempfile() +#' as_polars_lf(mtcars)$sink_parquet(tmpf) +#' +#' # stream a query end-to-end +#' tmpf2 <- tempfile() +#' pl$scan_parquet(tmpf)$select(pl$col("cyl") * 2)$sink_parquet(tmpf2) +#' +#' # load parquet directly into a DataFrame / memory +#' pl$scan_parquet(tmpf2)$collect() +lazyframe__sink_parquet <- function( + path, + ..., + compression = "zstd", + compression_level = 3, + statistics = TRUE, + row_group_size = NULL, + data_page_size = NULL, + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + storage_options = NULL, + retries = 2) { + wrap({ + check_dots_empty0(...) 
+ compression <- arg_match0( + compression, + values = c("lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd") + ) + + if (isTRUE(no_optimization)) { + predicate_pushdown <- FALSE + projection_pushdown <- FALSE + slice_pushdown <- FALSE + } + + lf <- self$`_ldf`$optimization_toggle( + type_coercion = type_coercion, + predicate_pushdown = predicate_pushdown, + projection_pushdown = projection_pushdown, + simplify_expression = simplify_expression, + slice_pushdown = slice_pushdown, + comm_subplan_elim = FALSE, + comm_subexpr_elim = FALSE, + cluster_with_columns = FALSE, + streaming = FALSE, + `_eager` = FALSE + ) + + statistics <- translate_statistics(statistics) + + lf$sink_parquet( + path = path, + compression = compression, + compression_level = compression_level, + statistics = statistics, + row_group_size = row_group_size, + data_page_size = data_page_size, + maintain_order = maintain_order, + storage_options = storage_options, + retries = retries + ) + + invisible(self) + }) +} + +#' Evaluate the query in streaming mode and write to an IPC file +#' +#' @inherit lazyframe__sink_parquet description params return +#' @inheritParams rlang::check_dots_empty0 +#' @param compression `NULL` or one of: +#' * `"uncompressed"`: same as `NULL`. +#' * `"lz4"`: fast compression/decompression. +#' * `"zstd"`: good compression performance. 
+#' +#' @examples +#' # sink table 'mtcars' from mem to ipc +#' tmpf <- tempfile() +#' as_polars_lf(mtcars)$sink_ipc(tmpf) +#' +#' # stream a query end-to-end (not supported yet, https://github.com/pola-rs/polars/issues/1040) +#' # tmpf2 <- tempfile() +#' # pl$scan_ipc(tmpf)$select(pl$col("cyl") * 2)$sink_ipc(tmpf2) +#' +#' # load ipc directly into a DataFrame / memory +#' # pl$scan_ipc(tmpf2)$collect() +lazyframe__sink_ipc <- function( + path, + ..., + compression = c("zstd", "lz4", "uncompressed"), + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + storage_options = NULL, + retries = 2) { + wrap({ + check_dots_empty0(...) + # `NULL` is accepted as a synonym for "uncompressed". + compression <- compression %||% "uncompressed" + compression <- arg_match0( + compression, + values = c("lz4", "uncompressed", "zstd") + ) + + # `no_optimization` overrides the three pushdown flags. + if (isTRUE(no_optimization)) { + predicate_pushdown <- FALSE + projection_pushdown <- FALSE + slice_pushdown <- FALSE + } + + # The remaining optimizations are always switched off when sinking. + lf <- self$`_ldf`$optimization_toggle( + type_coercion = type_coercion, + predicate_pushdown = predicate_pushdown, + projection_pushdown = projection_pushdown, + simplify_expression = simplify_expression, + slice_pushdown = slice_pushdown, + comm_subplan_elim = FALSE, + comm_subexpr_elim = FALSE, + cluster_with_columns = FALSE, + streaming = FALSE, + `_eager` = FALSE + ) + + lf$sink_ipc( + path = path, + compression = compression, + maintain_order = maintain_order, + storage_options = storage_options, + retries = retries + ) + + # Hand back the input LazyFrame, invisibly. + invisible(self) + }) +} + +#' Evaluate the query in streaming mode and write to a CSV file +#' +#' @inherit lazyframe__sink_parquet description params return +#' @inheritParams rlang::check_dots_empty0 +#' @param include_bom Logical, whether to include UTF-8 BOM in the CSV output. +#' @param include_header Logical, whether to include header in the CSV output. +#' @param separator Separate CSV fields with this symbol.
+#' @param line_terminator String used to end each row. +#' @param quote_char Byte to use as quoting character. +#' @param batch_size Number of rows that will be processed per thread. +#' @param datetime_format A format string, with the specifiers defined by the +#' [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) +#' Rust crate. If no format is specified, the default fractional-second precision +#' is inferred from the maximum timeunit found in the frame’s Datetime cols (if +#' any). +#' @param date_format A format string, with the specifiers defined by the +#' [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) +#' Rust crate. +#' @param time_format A format string, with the specifiers defined by the +#' [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) +#' Rust crate. +#' @param float_precision Number of decimal places to write, applied to both +#' Float32 and Float64 datatypes. +#' @param null_value A string representing null values (defaulting to the empty +#' string). +#' @param quote_style Determines the quoting strategy used. Must be one of: +#' * `"necessary"` (default): This puts quotes around fields only when +#' necessary. They are necessary when fields contain a quote, delimiter or +#' record terminator. Quotes are also necessary when writing an empty record +#' (which is indistinguishable from a record with one empty field). This is +#' the default. +#' * `"always"`: This puts quotes around every field. Always. +#' * `"never"`: This never puts quotes around fields, even if that results in +#' invalid CSV data (e.g.: by not quoting strings containing the separator). +#' * `"non_numeric"`: This puts quotes around all fields that are non-numeric. +#' Namely, when writing a field that does not parse as a valid float or +#' integer, then quotes will be used even if they aren't strictly necessary.
+#' +#' @examples +#' # sink table 'mtcars' from mem to CSV +#' tmpf <- tempfile() +#' pl$LazyFrame(mtcars)$sink_csv(tmpf) +#' +#' # stream a query end-to-end +#' tmpf2 <- tempfile() +#' pl$scan_csv(tmpf)$select(pl$col("cyl") * 2)$sink_csv(tmpf2) +#' +#' # load parquet directly into a DataFrame / memory +#' pl$scan_csv(tmpf2)$collect() +lazyframe__sink_csv <- function( + path, + ..., + include_bom = FALSE, + include_header = TRUE, + separator = ",", + line_terminator = "\n", + quote_char = '"', + batch_size = 1024, + datetime_format = NULL, + date_format = NULL, + time_format = NULL, + float_precision = NULL, + null_value = "", + quote_style = "necessary", + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + storage_options = NULL, + retries = 2) { + wrap({ + check_dots_empty0(...) + quote_style <- arg_match0( + quote_style, + values = c("necessary", "always", "never", "non_numeric") + ) + + if (isTRUE(no_optimization)) { + predicate_pushdown <- FALSE + projection_pushdown <- FALSE + slice_pushdown <- FALSE + } + + lf <- self$`_ldf`$optimization_toggle( + type_coercion = type_coercion, + predicate_pushdown = predicate_pushdown, + projection_pushdown = projection_pushdown, + simplify_expression = simplify_expression, + slice_pushdown = slice_pushdown, + comm_subplan_elim = FALSE, + comm_subexpr_elim = FALSE, + cluster_with_columns = FALSE, + streaming = FALSE, + `_eager` = FALSE + ) + + lf$sink_csv( + path = path, + include_bom = include_bom, + include_header = include_header, + separator = separator, + line_terminator = line_terminator, + quote_char = quote_char, + batch_size = batch_size, + datetime_format = datetime_format, + date_format = date_format, + time_format = time_format, + float_precision = float_precision, + null_value = null_value, + quote_style = quote_style, + maintain_order = maintain_order, + storage_options = 
storage_options, + retries = retries + ) + + invisible(self) + }) +} + +#' Evaluate the query in streaming mode and write to an NDJSON file +#' +#' @inherit lazyframe__sink_parquet description params return +#' @inheritParams rlang::check_dots_empty0 +#' +#' @examples +#' # sink table 'mtcars' from mem to NDJSON +#' tmpf <- tempfile(fileext = ".ndjson") +#' pl$LazyFrame(mtcars)$sink_ndjson(tmpf) +#' +#' # load parquet directly into a DataFrame / memory +#' pl$scan_ndjson(tmpf)$collect() +lazyframe__sink_ndjson <- function( + path, + ..., + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + storage_options = NULL, + retries = 2) { + wrap({ + check_dots_empty0(...) + if (isTRUE(no_optimization)) { + predicate_pushdown <- FALSE + projection_pushdown <- FALSE + slice_pushdown <- FALSE + } + + lf <- self$`_ldf`$optimization_toggle( + type_coercion = type_coercion, + predicate_pushdown = predicate_pushdown, + projection_pushdown = projection_pushdown, + simplify_expression = simplify_expression, + slice_pushdown = slice_pushdown, + comm_subplan_elim = FALSE, + comm_subexpr_elim = FALSE, + cluster_with_columns = FALSE, + streaming = FALSE, + `_eager` = FALSE + ) + + lf$sink_json( + path = path, + maintain_order = maintain_order, + storage_options = storage_options, + retries = retries + ) + + invisible(self) + }) +} + +#' Perform joins on nearest keys +#' +#' @description +#' This is similar to a left-join except that we match on nearest key rather +#' than equal keys. Both frames must be sorted by the `asof_join` key. +#' +#' @inheritParams rlang::check_dots_empty0 +#' @param other LazyFrame to join with. +#' @inheritParams dataframe__join +#' @param by Join on these columns before performing asof join. Either a vector +#' of column names or a list of expressions and/or strings. 
Use `by_left` and +#' `by_right` if the column names to match on are different between the two +#' tables. +#' @param by_left,by_right Same as `by` but only for the left or the right +#' table. They must have the same length. +#' @param strategy Strategy for where to find a match: +#' * `"backward"` (default): search for the last row in the right table whose +#' `on` key is less than or equal to the left key. +#' * `"forward"`: search for the first row in the right table whose `on` key is +#' greater than or equal to the left key. +#' * `"nearest"`: search for the last row in the right table whose value is +#' nearest to the left key. String keys are not currently supported for a +#' nearest search. +#' @param tolerance Numeric tolerance. By setting this the join will only be +#' done if the near keys are within this distance. If an asof join is done on +#' columns of dtype "Date", "Datetime", "Duration" or "Time", use the Polars +#' duration string language (see the dedicated section below). +#' +#' @param coalesce Coalescing behavior (merging of `on` / `left_on` / +#' `right_on` columns): +#' * `TRUE`: Always coalesce join columns; +#' * `FALSE`: Never coalesce join columns. +#' Note that joining on any other expressions than `col` will turn off +#' coalescing. +#' +#' @inheritSection polars_duration_string Polars duration string language +#' @examples +#' gdp <- pl$LazyFrame( +#' date = as.Date(c("2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1", "2020-1-1")), +#' gdp = c(4164, 4411, 4566, 4696, 4827) +#' ) +#' +#' pop <- pl$LazyFrame( +#' date = as.Date(c("2016-3-1", "2018-8-1", "2019-1-1")), +#' population = c(82.19, 82.66, 83.12) +#' ) +#' +#' # optional: make sure tables are already sorted with "on" join-key +#' gdp <- gdp$sort("date") +#' pop <- pop$sort("date") +#' +#' +#' # Note how the dates don’t quite match.
If we join them using join_asof and +#' # strategy = 'backward', then each date from population which doesn’t have +#' # an exact match is matched with the closest earlier date from gdp: +#' pop$join_asof(gdp, on = "date", strategy = "backward")$collect() +#' +#' # Note how: +#' # - date 2016-03-01 from population is matched with 2016-01-01 from gdp; +#' # - date 2018-08-01 from population is matched with 2018-01-01 from gdp. +#' # You can verify this by passing coalesce = FALSE: +#' pop$join_asof( +#' gdp, +#' on = "date", strategy = "backward", coalesce = FALSE +#' )$collect() +#' +#' # If we instead use strategy = 'forward', then each date from population +#' # which doesn’t have an exact match is matched with the closest later date +#' # from gdp: +#' pop$join_asof(gdp, on = "date", strategy = "forward")$collect() +#' +#' # Note how: +#' # - date 2016-03-01 from population is matched with 2017-05-01 from gdp; +#' # - date 2018-08-01 from population is matched with 2019-01-01 from gdp. +#' +#' # Finally, strategy = 'nearest' gives us a mix of the two results above, as +#' # each date from population which doesn’t have an exact match is matched +#' # with the closest date from gdp, regardless of whether it’s earlier or +#' # later: +#' pop$join_asof(gdp, on = "date", strategy = "nearest")$collect() +#' +#' # Note how: +#' # - date 2016-03-01 from population is matched with 2016-01-01 from gdp; +#' # - date 2018-08-01 from population is matched with 2019-01-01 from gdp. +#' +#' # The `by` argument allows joining on another column first, before the asof +#' # join. In this example we join by country first, then asof join by date, as +#' # above.
+#' gdp2 <- pl$LazyFrame( +#' country = rep(c("Germany", "Netherlands"), each = 5), +#' date = rep( +#' as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1", "2020-1-1")), +#' 2 +#' ), +#' gdp = c(4164, 4411, 4566, 4696, 4827, 784, 833, 914, 910, 909) +#' )$sort("country", "date") +#' gdp2$collect() +#' +#' pop2 <- pl$LazyFrame( +#' country = rep(c("Germany", "Netherlands"), each = 3), +#' date = rep(as.Date(c("2016-3-1", "2018-8-1", "2019-1-1")), 2), +#' population = c(82.19, 82.66, 83.12, 17.11, 17.32, 17.40) +#' )$sort("country", "date") +#' pop2$collect() +#' +#' pop2$join_asof( +#' gdp2, +#' by = "country", on = "date", strategy = "nearest" +#' )$collect() +lazyframe__join_asof <- function( + other, + ..., + left_on = NULL, + right_on = NULL, + on = NULL, + by_left = NULL, + by_right = NULL, + by = NULL, + strategy = c("backward", "forward", "nearest"), + suffix = "_right", + tolerance = NULL, + allow_parallel = TRUE, + force_parallel = FALSE, + coalesce = TRUE) { + wrap({ + check_dots_empty0(...) 
+ strategy <- arg_match0(strategy, values = c("backward", "forward", "nearest")) + if (!is.null(by)) by_left <- by_right <- by + if (!is.null(on)) left_on <- right_on <- on + tolerance_str <- if (is.character(tolerance)) tolerance else NULL + tolerance_num <- if (!is.character(tolerance)) tolerance else NULL + + self$`_ldf`$join_asof( + other = other$`_ldf`, + left_on = as_polars_expr(left_on)$`_rexpr`, + right_on = as_polars_expr(right_on)$`_rexpr`, + left_by = by_left, + right_by = by_right, + allow_parallel = allow_parallel, + force_parallel = force_parallel, + suffix = suffix, + strategy = strategy, + tolerance = tolerance_num, + tolerance_str = tolerance_str, + coalesce = coalesce + ) + }) +} diff --git a/R/utils-various.R b/R/utils-various.R index 5840ed62..c62c2bfe 100644 --- a/R/utils-various.R +++ b/R/utils-various.R @@ -16,3 +16,116 @@ extend_bool <- function(value, n_match, value_name, match_name) { value } } + +#' @noRd +make_profile_plot <- function(data, truncate_nodes) { + check_installed("ggplot2") + timings <- as.data.frame(data[[2]]) + timings$node <- factor(timings$node, levels = unique(timings$node)) + total_timing <- max(timings$end) + if (total_timing > 10000000) { + unit <- "s" + total_timing <- paste0(total_timing / 1000000, "s") + timings$start <- timings$start / 1000000 + timings$end <- timings$end / 1000000 + } else if (total_timing > 10000) { + unit <- "ms" + total_timing <- paste0(total_timing / 1000, "ms") + timings$start <- timings$start / 1000 + timings$end <- timings$end / 1000 + } else { + unit <- "\U00B5s" + total_timing <- paste0(total_timing, "\U00B5s") + } + + # for some reason, there's an error if I use rlang::.data directly in aes() + .data <- rlang::.data + + plot <- ggplot2::ggplot( + timings, + ggplot2::aes( + x = .data[["start"]], xend = .data[["end"]], + y = .data[["node"]], yend = .data[["node"]] + ) + ) + + ggplot2::geom_segment(linewidth = 6) + + ggplot2::xlab( + paste0("Node duration in ", unit, ". 
Total duration: ", total_timing)
+    ) +
+    ggplot2::ylab(NULL) +
+    ggplot2::theme(
+      axis.text = ggplot2::element_text(size = 12)
+    )
+
+  if (truncate_nodes > 0) {
+    plot <- plot +
+      ggplot2::scale_y_discrete(
+        labels = rev(paste0(strtrim(timings$node, truncate_nodes), "...")),
+        limits = rev
+      )
+  } else {
+    plot <- plot +
+      ggplot2::scale_y_discrete(
+        limits = rev
+      )
+  }
+
+  # do not show the plot if we're running testthat
+  if (!identical(Sys.getenv("TESTTHAT"), "true")) {
+    print(plot)
+  }
+  plot
+}
+
+#' Normalise the `statistics` argument into a named list of flags
+#'
+#' @param statistics `TRUE`, `FALSE`, `"full"`, or a named list using the keys
+#' `"min"`, `"max"`, `"distinct_count"` and `"null_count"`. Keys missing from
+#' the list keep their default value.
+#' @param call Environment reported in error messages.
+#' @return A named list with the elements `min`, `max`, `distinct_count` and
+#' `null_count`.
+#' @noRd
+translate_statistics <- function(statistics, call = caller_env()) {
+  if (length(statistics) != 1 && !is.list(statistics)) {
+    abort("`statistics` must be of length 1.", call = call)
+  }
+  if (is.logical(statistics)) {
+    if (isTRUE(statistics)) {
+      # Default set: everything except the distinct count.
+      statistics <- list(
+        min = TRUE,
+        max = TRUE,
+        distinct_count = FALSE,
+        null_count = TRUE
+      )
+    } else {
+      statistics <- list(
+        min = FALSE,
+        max = FALSE,
+        distinct_count = FALSE,
+        null_count = FALSE
+      )
+    }
+  } else if (is.character(statistics)) {
+    if (statistics == "full") {
+      statistics <- list(
+        min = TRUE,
+        max = TRUE,
+        distinct_count = TRUE,
+        null_count = TRUE
+      )
+    } else {
+      abort("`statistics` must be TRUE/FALSE, \"full\", or a named list.", call = call)
+    }
+  } else if (is.list(statistics)) {
+    default <- list(
+      min = TRUE,
+      max = TRUE,
+      distinct_count = FALSE,
+      null_count = TRUE
+    )
+    # User-supplied entries override the defaults; unknown keys survive the
+    # merge and are rejected right below.
+    statistics <- utils::modifyList(default, statistics)
+    nms <- names(statistics)
+    invalid <- nms[!nms %in% c("min", "max", "distinct_count", "null_count")]
+    if (length(invalid) > 0) {
+      msg <- paste0("`", invalid, "`", collapse = ", ")
+      abort(
+        paste0("In `statistics`, ", msg, " are not valid keys."),
+        call = call
+      )
+    }
+  } else {
+    # Any other type (e.g. a number) used to be returned unchanged.
+    abort("`statistics` must be TRUE/FALSE, \"full\", or a named list.", call = call)
+  }
+  statistics
+}
diff --git a/man/dataframe__cast.Rd b/man/dataframe__cast.Rd
index 10011b1c..f168d5a9 100644
--- a/man/dataframe__cast.Rd
+++ b/man/dataframe__cast.Rd
@@ -6,11 +6,21 @@
 \usage{
 dataframe__cast(..., .strict = TRUE)
 }
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Either a datatype to which +all columns will be cast, or a list where the names are column names and the +values are the datatypes to convert to.} + +\item{.strict}{If \code{TRUE} (default), throw an error if a cast could not be done +(for instance, due to an overflow). Otherwise, return \code{null}.} +} \value{ A polars \link{DataFrame} } \description{ -Cast DataFrame column(s) to the specified dtype +This allows to convert all columns to a datatype or to convert only specific +columns. Contrarily to the Python implementation, it is not possible to +convert all columns of a specific datatype to another datatype. } \examples{ df <- pl$DataFrame( diff --git a/man/lazyframe__bottom_k.Rd b/man/lazyframe__bottom_k.Rd new file mode 100644 index 00000000..903ffd46 --- /dev/null +++ b/man/lazyframe__bottom_k.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__bottom_k} +\alias{lazyframe__bottom_k} +\title{Return the \code{k} smallest rows} +\usage{ +lazyframe__bottom_k(k, ..., by, reverse = FALSE) +} +\arguments{ +\item{k}{Number of rows to return.} + +\item{by}{Column(s) used to determine the bottom rows. Accepts expression +input. Strings are parsed as column names.} + +\item{reverse}{Consider the \code{k} largest elements of the by column(s) +(instead of the k smallest). This can be specified per column by passing a +sequence of booleans.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Non-null elements are always preferred over null elements, regardless of the +value of \code{reverse}. The output is not guaranteed to be in any particular +order, call \code{sort()} after this function if you wish the output to be sorted. +} +\examples{ +lf <- pl$LazyFrame( + a = c("a", "b", "a", "b", "b", "c"), + b = c(2, 1, 1, 3, 2, 1) +) + +# Get the rows which contain the 4 smallest values in column b. 
+lf$bottom_k(4, by = "b")$collect() + +# Get the rows which contain the 4 smallest values when sorting on column a +# and b. +lf$bottom_k(4, by = c("a", "b"))$collect() +} diff --git a/man/lazyframe__cast.Rd b/man/lazyframe__cast.Rd new file mode 100644 index 00000000..56958008 --- /dev/null +++ b/man/lazyframe__cast.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__cast} +\alias{lazyframe__cast} +\title{Cast LazyFrame column(s) to the specified dtype(s)} +\usage{ +lazyframe__cast(..., .strict = TRUE) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Either a datatype to which +all columns will be cast, or a list where the names are column names and the +values are the datatypes to convert to.} + +\item{.strict}{If \code{TRUE} (default), throw an error if a cast could not be done +(for instance, due to an overflow). Otherwise, return \code{null}.} +} +\value{ +A LazyFrame +} +\description{ +This allows converting all columns to a datatype or converting only specific +columns. Contrary to the Python implementation, it is not possible to +convert all columns of a specific datatype to another datatype.
+} +\examples{ +lf <- pl$LazyFrame( + foo = 1:3, + bar = c(6, 7, 8), + ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")) +) + +# Cast only some columns +lf$cast(foo = pl$Float32, bar = pl$UInt8)$collect() + +# Cast all columns to the same type +lf$cast(pl$String)$collect() +} diff --git a/man/lazyframe__clear.Rd b/man/lazyframe__clear.Rd new file mode 100644 index 00000000..91e4d73c --- /dev/null +++ b/man/lazyframe__clear.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__clear} +\alias{lazyframe__clear} +\title{Create an empty or n-row null-filled copy of the LazyFrame} +\usage{ +lazyframe__clear(n = 0) +} +\arguments{ +\item{n}{Number of (empty) rows to return in the cleared frame.} +} +\value{ +A n-row null-filled LazyFrame with an identical schema +} +\description{ +Returns a n-row null-filled LazyFrame with an identical schema. \code{n} can be +greater than the current number of rows in the LazyFrame. +} +\examples{ +df <- pl$LazyFrame( + a = c(NA, 2, 3, 4), + b = c(0.5, NA, 2.5, 13), + c = c(TRUE, TRUE, FALSE, NA) +) + +df$clear() + +df$clear(n = 5) +} diff --git a/man/lazyframe__clone.Rd b/man/lazyframe__clone.Rd new file mode 100644 index 00000000..4e51d5f7 --- /dev/null +++ b/man/lazyframe__clone.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__clone} +\alias{lazyframe__clone} +\title{Clone a LazyFrame} +\usage{ +lazyframe__clone() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +This makes a very cheap deep copy/clone of an existing +\code{\link[=lazyframe__class]{LazyFrame}}. Rarely useful as \code{LazyFrame}s are nearly 100\% +immutable. Any modification of a \code{LazyFrame} should lead to a clone anyways, +but this can be useful when dealing with attributes (see examples). 
+} +\examples{ +df1 <- as_polars_lf(iris) + +# Make a function to take a LazyFrame, add an attribute, and return a LazyFrame +give_attr <- function(data) { + attr(data, "created_on") <- "2024-01-29" + data +} +df2 <- give_attr(df1) + +# Problem: the original LazyFrame also gets the attribute while it shouldn't! +attributes(df1) + +# Use $clone() inside the function to avoid that +give_attr <- function(data) { + data <- data$clone() + attr(data, "created_on") <- "2024-01-29" + data +} +df1 <- as_polars_lf(iris) +df2 <- give_attr(df1) + +# now, the original LazyFrame doesn't get this attribute +attributes(df1) +} diff --git a/man/lazyframe__collect.Rd b/man/lazyframe__collect.Rd index 3ccae6b3..e6139b19 100644 --- a/man/lazyframe__collect.Rd +++ b/man/lazyframe__collect.Rd @@ -68,3 +68,14 @@ lf$group_by("a")$agg(pl$all()$sum())$collect( streaming = TRUE ) } +\seealso{ +\itemize{ +\item \code{\link[=lazyframe__profile]{$profile()}} - same as \verb{$collect()} but also returns +a table with each operation profiled. +\item \code{\link[=lazyframe__collect_in_background]{$collect_in_background()}} - non-blocking +collect returns a future handle. Can also just be used via +\verb{$collect(collect_in_background = TRUE)}. +\item \code{\link[=lazyframe__sink_parquet]{$sink_parquet()}} streams query to a parquet file. +\item \code{\link[=lazyframe__sink_ipc]{$sink_ipc()}} streams query to a arrow file. +} +} diff --git a/man/lazyframe__collect_schema.Rd b/man/lazyframe__collect_schema.Rd new file mode 100644 index 00000000..6a59a51f --- /dev/null +++ b/man/lazyframe__collect_schema.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__collect_schema} +\alias{lazyframe__collect_schema} +\title{Resolve the schema of this LazyFrame} +\usage{ +lazyframe__collect_schema() +} +\value{ +A named list with names indicating column names and values indicating +column data types. 
+} +\description{ +This resolves the query plan but does not trigger computations. +} +\examples{ +lf <- pl$LazyFrame( + foo = 1:3, + bar = 6:8, + ham = c("a", "b", "c") +) + +lf$collect_schema() + +lf$with_columns( + baz = (pl$col("foo") + pl$col("bar"))$cast(pl$String), + pl$col("bar")$cast(pl$Int64) +)$collect_schema() +} diff --git a/man/lazyframe__count.Rd b/man/lazyframe__count.Rd new file mode 100644 index 00000000..a8d52e21 --- /dev/null +++ b/man/lazyframe__count.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__count} +\alias{lazyframe__count} +\title{Return the number of non-null elements for each column} +\usage{ +lazyframe__count() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Return the number of non-null elements for each column +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4)) +lf$count()$collect() +} diff --git a/man/lazyframe__drop.Rd b/man/lazyframe__drop.Rd new file mode 100644 index 00000000..95552531 --- /dev/null +++ b/man/lazyframe__drop.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__drop} +\alias{lazyframe__drop} +\title{Remove columns from the DataFrame} +\usage{ +lazyframe__drop(..., strict = TRUE) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Names of the columns that +should be removed from the dataframe. 
Accepts column selector input.} + +\item{strict}{Validate that all column names exist in the current schema, +and throw an exception if any do not.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Remove columns from the DataFrame +} +\examples{ +# Drop columns by passing the name of those columns +lf <- pl$LazyFrame( + foo = 1:3, + bar = c(6, 7, 8), + ham = c("a", "b", "c") +) +lf$drop("ham")$collect() +lf$drop("ham", "bar")$collect() + +# Drop multiple columns by passing a selector +lf$drop(cs$all())$collect() +} diff --git a/man/lazyframe__drop_nulls.Rd b/man/lazyframe__drop_nulls.Rd new file mode 100644 index 00000000..dd3fcc78 --- /dev/null +++ b/man/lazyframe__drop_nulls.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__drop_nulls} +\alias{lazyframe__drop_nulls} +\title{Drop all rows that contain null values} +\usage{ +lazyframe__drop_nulls(subset = NULL) +} +\arguments{ +\item{subset}{Column name(s) for which null values are considered. If \code{NULL} +(default), use all columns.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +The original order of the remaining rows is preserved. +} +\examples{ +lf <- pl$LazyFrame( + foo = 1:3, + bar = c(6, NA, 8), + ham = c("a", "b", NA) +) + +# The default behavior of this method is to drop rows where any single value +# of the row is null. +lf$drop_nulls()$collect() + +# This behaviour can be constrained to consider only a subset of columns, as +# defined by name or with a selector. 
For example, dropping rows if there is +# a null in any of the integer columns: +lf$drop_nulls(subset = cs$integer())$collect() +} diff --git a/man/lazyframe__explain.Rd b/man/lazyframe__explain.Rd new file mode 100644 index 00000000..582869d2 --- /dev/null +++ b/man/lazyframe__explain.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__explain} +\alias{lazyframe__explain} +\title{Create a string representation of the query plan} +\usage{ +lazyframe__explain( + ..., + format = c("plain", "tree"), + optimized = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + cluster_with_columns = TRUE, + streaming = FALSE +) +} +\arguments{ +\item{...}{Dots which should be empty.} + +\item{format}{The format to use for displaying the logical plan. Must be +either \code{"plain"} (default) or \code{"tree"}.} + +\item{optimized}{Return an optimized query plan. If \code{TRUE} (default), the +subsequent optimization flags control which optimizations run.} + +\item{type_coercion}{A logical indicating whether to apply the type coercion optimization.} + +\item{predicate_pushdown}{A logical indicating whether to apply the predicate pushdown optimization.} + +\item{projection_pushdown}{A logical indicating whether to apply the projection pushdown optimization.} + +\item{simplify_expression}{A logical indicating whether to apply the simplify expression optimization.} + +\item{slice_pushdown}{A logical indicating whether to apply the slice pushdown optimization.} + +\item{comm_subplan_elim}{A logical indicating whether to try to cache branching subplans that occur on self-joins or unions.} + +\item{comm_subexpr_elim}{A logical indicating whether to try to cache common subexpressions.} + +\item{cluster_with_columns}{A logical indicating whether to combine sequential independent calls to with_columns.} + +\item{streaming}{A logical.
If \code{TRUE}, process the query in batches to handle larger-than-memory data. +If \code{FALSE} (default), the entire query is processed in a single batch. +Note that streaming mode is considered unstable. +It may be changed at any point without it being considered a breaking change.} +} +\value{ +A character value containing the query plan. +} +\description{ +The query plan is read from bottom to top. When \code{optimized = FALSE}, the +query as it was written by the user is shown. This is not what Polars runs. +Instead, it applies optimizations that are displayed by default by \verb{$explain()}. +One classic example is the predicate pushdown, which applies the filter as +early as possible (i.e. at the bottom of the plan). +} +\examples{ +lazy_frame <- as_polars_lf(iris) + +# Prepare your query +lazy_query <- lazy_frame$sort("Species")$filter(pl$col("Species") != "setosa") + +# This is the query that was written by the user, without any optimizations +# (use cat() for better printing) +lazy_query$explain(optimized = FALSE) |> cat() + +# This is the query after `polars` optimizes it: instead of sorting first and +# then filtering, it is faster to filter first and then sort the rest. +lazy_query$explain() |> cat() + +# Also possible to see this as tree format +lazy_query$explain(format = "tree") |> cat() +} diff --git a/man/lazyframe__explode.Rd b/man/lazyframe__explode.Rd new file mode 100644 index 00000000..e438e172 --- /dev/null +++ b/man/lazyframe__explode.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__explode} +\alias{lazyframe__explode} +\title{Explode the DataFrame to long format by exploding the given columns} +\usage{ +lazyframe__explode(...) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Column names, expressions, or +a selector defining them. 
The underlying columns being exploded must be of +the \code{List} or \code{Array} data type.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Explode the DataFrame to long format by exploding the given columns +} +\examples{ +lf <- pl$LazyFrame( + letters = c("a", "a", "b", "c"), + numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8)) +) + +lf$explode("numbers")$collect() +} diff --git a/man/lazyframe__fill_nan.Rd b/man/lazyframe__fill_nan.Rd new file mode 100644 index 00000000..05e05a2c --- /dev/null +++ b/man/lazyframe__fill_nan.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__fill_nan} +\alias{lazyframe__fill_nan} +\title{Fill floating point \code{NaN} value with a fill value} +\usage{ +lazyframe__fill_nan(value) +} +\arguments{ +\item{value}{Value used to fill \code{NaN} values.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Fill floating point \code{NaN} value with a fill value +} +\examples{ +lf <- pl$LazyFrame( + a = c(1.5, 2, NaN, 4), + b = c(1.5, NaN, NaN, 4) +) +lf$fill_nan(99)$collect() +} diff --git a/man/lazyframe__filter.Rd b/man/lazyframe__filter.Rd new file mode 100644 index 00000000..22a94990 --- /dev/null +++ b/man/lazyframe__filter.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__filter} +\alias{lazyframe__filter} +\title{Filter the rows in the LazyFrame based on a predicate expression} +\usage{ +lazyframe__filter(...) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Expression that evaluates to +a boolean Series.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +The original order of the remaining rows is preserved. Rows where the filter +does not evaluate to \code{TRUE} are discarded, including nulls. 
+} +\examples{ +lf <- pl$LazyFrame( + foo = c(1, 2, 3, NA, 4, NA, 0), + bar = c(6, 7, 8, NA, NA, 9, 0), + ham = c("a", "b", "c", NA, "d", "e", "f") +) + +# Filter on one condition +lf$filter(pl$col("foo") > 1)$collect() + +# Filter on multiple conditions +lf$filter((pl$col("foo") < 3) & (pl$col("ham") == "a"))$collect() + +# Filter on an OR condition +lf$filter((pl$col("foo") == 1) | (pl$col("ham") == "c"))$collect() + +# Filter by comparing two columns against each other +lf$filter(pl$col("foo") == pl$col("bar"))$collect() +lf$filter(pl$col("foo") != pl$col("bar"))$collect() + +# Notice how the row with null values is filtered out. In order to keep the +# rows with nulls, use: +lf$filter(pl$col("foo")$ne_missing(pl$col("bar")))$collect() +} diff --git a/man/lazyframe__first.Rd b/man/lazyframe__first.Rd new file mode 100644 index 00000000..20d03d55 --- /dev/null +++ b/man/lazyframe__first.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__first} +\alias{lazyframe__first} +\title{Get the first row of the LazyFrame} +\usage{ +lazyframe__first() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Get the first row of the LazyFrame +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$first()$collect() +} diff --git a/man/lazyframe__gather_every.Rd b/man/lazyframe__gather_every.Rd new file mode 100644 index 00000000..7eaf654d --- /dev/null +++ b/man/lazyframe__gather_every.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__gather_every} +\alias{lazyframe__gather_every} +\title{Take every nth row in the LazyFrame} +\usage{ +lazyframe__gather_every(n, offset = 0) +} +\arguments{ +\item{n}{Gather every \code{n}-th row.} + +\item{offset}{Starting index.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Take every nth row in the LazyFrame +} +\examples{ +lf <- pl$LazyFrame(a =
1:4, b = 5:8) +lf$gather_every(2)$collect() + +lf$gather_every(2, offset = 1)$collect() +} diff --git a/man/lazyframe__group_by.Rd b/man/lazyframe__group_by.Rd new file mode 100644 index 00000000..51afd1ca --- /dev/null +++ b/man/lazyframe__group_by.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__group_by} +\alias{lazyframe__group_by} +\title{Start a group by operation} +\usage{ +lazyframe__group_by(..., .maintain_order = FALSE) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Column(s) to group by. +Accepts expression input. Strings are parsed as column names.} + +\item{.maintain_order}{Ensure that the order of the groups is consistent with +the input data. This is slower than a default group by. Setting this to +\code{TRUE} blocks the possibility to run on the streaming engine.} +} +\value{ +A lazy groupby +} +\description{ +Start a group by operation +} +\examples{ +# Group by one column and call agg() to compute the grouped sum of another +# column. +lf <- pl$LazyFrame( + a = c("a", "b", "a", "b", "c"), + b = c(1, 2, 1, 3, 3), + c = c(5, 4, 3, 2, 1) +) +lf$group_by("a")$agg(pl$col("b")$sum())$collect() + +# Set .maintain_order = TRUE to ensure the order of the groups is consistent +# with the input. +lf$group_by("a", .maintain_order = TRUE)$agg(pl$col("b")$sum())$collect() + +# Group by multiple columns by passing a vector of column names. +lf$group_by(c("a", "b"))$agg(pl$col("c")$max())$collect() + +# Or use positional arguments to group by multiple columns in the same way. +# Expressions are also accepted. 
+lf$ + group_by("a", pl$col("b") / 2)$ + agg(pl$col("c")$mean())$collect() +} diff --git a/man/lazyframe__group_by_dynamic.Rd b/man/lazyframe__group_by_dynamic.Rd new file mode 100644 index 00000000..624890ad --- /dev/null +++ b/man/lazyframe__group_by_dynamic.Rd @@ -0,0 +1,178 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__group_by_dynamic} +\alias{lazyframe__group_by_dynamic} +\title{Group based on a date/time or integer column} +\usage{ +lazyframe__group_by_dynamic( + index_column, + ..., + every, + period = NULL, + offset = NULL, + include_boundaries = FALSE, + closed = "left", + label = "left", + group_by = NULL, + start_by = "window" +) +} +\arguments{ +\item{index_column}{Column used to group based on the time window. Often of +type Date/Datetime. This column must be sorted in ascending order (or, if +\code{group_by} is specified, then it must be sorted in ascending order within +each group). +In case of a dynamic group by on indices, the data type needs to be either +Int32 or Int64. Note that Int32 gets temporarily cast to Int64, so if +performance matters, use an Int64 column.} + +\item{...}{Dots which should be empty.} + +\item{every}{Interval of the window.} + +\item{period}{Length of the window. If \code{NULL} (default), it will equal +\code{every}.} + +\item{offset}{Offset of the window, does not take effect if +\code{start_by = "datapoint"}. Defaults to zero.} + +\item{include_boundaries}{Add two columns, \code{"_lower_boundary"} and +\code{"_upper_boundary"}, that show the boundaries of the window. This will +impact performance because it’s harder to parallelize.} + +\item{closed}{Define which sides of the interval are closed (inclusive).
+Default is \code{"left"}.} + +\item{label}{Define which label to use for the window: +\itemize{ +\item \code{"left"}: lower boundary of the window +\item \code{"right"}: upper boundary of the window +\item \code{"datapoint"}: the first value of the index column in the given window. If +you don’t need the label to be at one of the boundaries, choose this option +for maximum performance. +}} + +\item{start_by}{The strategy to determine the start of the first window by: +\itemize{ +\item \code{"window"}: start by taking the earliest timestamp, truncating it with +\code{every}, and then adding \code{offset}. Note that weekly windows start on +Monday. +\item \code{"datapoint"}: start from the first encountered data point. +\item a day of the week (only takes effect if \code{every} contains \code{"w"}): \code{"monday"} +starts the window on the Monday before the first data point, etc. +}} +} +\value{ +A \link[=LazyGroupBy_class]{LazyGroupBy} object +} +\description{ +Time windows are calculated and rows are assigned to windows. Different from +a normal group by is that a row can be member of multiple groups. By +default, the windows look like: +\itemize{ +\item [start, start + period) +\item [start + every, start + every + period) +\item [start + 2\emph{every, start + 2}every + period) +\item … +} +} +\details{ +where \code{start} is determined by \code{start_by}, \code{offset}, \code{every}, and the +earliest datapoint. See the \code{start_by} argument description for details. 
+ +The \code{every}, \code{period}, and \code{offset} arguments are created with the following +string language: +\itemize{ +\item 1ns # 1 nanosecond +\item 1us # 1 microsecond +\item 1ms # 1 millisecond +\item 1s # 1 second +\item 1m # 1 minute +\item 1h # 1 hour +\item 1d # 1 day +\item 1w # 1 calendar week +\item 1mo # 1 calendar month +\item 1y # 1 calendar year +These strings can be combined: +\itemize{ +\item 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds +} +} + +In case of a \code{group_by_dynamic} on an integer column, the windows are +defined by: +\itemize{ +\item 1i # length 1 +\item 10i # length 10 +} +} +\examples{ +lf <- pl$select( + time = pl$datetime_range( + start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + interval = "30m" + ), + n = 0:6 +)$lazy() +lf$collect() + +# Group by windows of 1 hour. +lf$group_by_dynamic("time", every = "1h", closed = "right")$agg( + vals = pl$col("n") +)$collect() + +# The window boundaries can also be added to the aggregation result +lf$group_by_dynamic( + "time", + every = "1h", include_boundaries = TRUE, closed = "right" +)$agg( + pl$col("n")$mean() +)$collect() + +# When closed = "left", the window excludes the right end of interval: +# [lower_bound, upper_bound) +lf$group_by_dynamic("time", every = "1h", closed = "left")$agg( + pl$col("n") +)$collect() + +# When closed = "both" the time values at the window boundaries belong to 2 +# groups. 
+lf$group_by_dynamic("time", every = "1h", closed = "both")$agg( + pl$col("n") +)$collect() + +# Dynamic group bys can also be combined with grouping on normal keys +lf <- lf$with_columns( + groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a")) +) +lf$collect() + +lf$group_by_dynamic( + "time", + every = "1h", + closed = "both", + group_by = "groups", + include_boundaries = TRUE +)$agg(pl$col("n"))$collect() + +# We can also create a dynamic group by based on an index column +lf <- pl$LazyFrame( + idx = 0:5, + A = c("A", "A", "B", "B", "B", "C") +)$with_columns(pl$col("idx")$set_sorted()) +lf$collect() + +lf$group_by_dynamic( + "idx", + every = "2i", + period = "3i", + include_boundaries = TRUE, + closed = "right" +)$agg(A_agg_list = pl$col("A"))$collect() +} +\seealso{ +\itemize{ +\item \code{\link[=lazyframe__rolling]{$rolling()}} +} +} diff --git a/man/lazyframe__head.Rd b/man/lazyframe__head.Rd new file mode 100644 index 00000000..3b202274 --- /dev/null +++ b/man/lazyframe__head.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__head} +\alias{lazyframe__head} +\title{Get the first \code{n} rows} +\usage{ +lazyframe__head(n = 5) +} +\arguments{ +\item{n}{Number of rows to return.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Get the first \code{n} rows +} +\examples{ +lf <- pl$LazyFrame(a = 1:6, b = 7:12) +lf$head()$collect() +lf$head(2)$collect() +} diff --git a/man/lazyframe__interpolate.Rd b/man/lazyframe__interpolate.Rd new file mode 100644 index 00000000..b6cafaf6 --- /dev/null +++ b/man/lazyframe__interpolate.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__interpolate} +\alias{lazyframe__interpolate} +\title{Interpolate intermediate values} +\usage{ +lazyframe__interpolate() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +The interpolation method is 
linear. +} +\examples{ +lf <- pl$LazyFrame( + foo = c(1, NA, 9, 10), + bar = c(6, 7, 9, NA), + ham = c(1, NA, NA, 9) +) + +lf$interpolate()$collect() +} diff --git a/man/lazyframe__join.Rd b/man/lazyframe__join.Rd new file mode 100644 index 00000000..653d5d65 --- /dev/null +++ b/man/lazyframe__join.Rd @@ -0,0 +1,108 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__join} +\alias{lazyframe__join} +\title{Join LazyFrames} +\usage{ +lazyframe__join( + other, + on = NULL, + how = "inner", + ..., + left_on = NULL, + right_on = NULL, + suffix = "_right", + validate = "m:m", + join_nulls = FALSE, + allow_parallel = TRUE, + force_parallel = FALSE, + coalesce = NULL +) +} +\arguments{ +\item{other}{LazyFrame to join with.} + +\item{on}{Either a vector of column names or a list of expressions and/or +strings. Use \code{left_on} and \code{right_on} if the column names to match on are +different between the two DataFrames.} + +\item{how}{One of the following methods: +\itemize{ +\item "inner": returns rows that have matching values in both tables +\item "left": returns all rows from the left table, and the matched rows from +the right table +\item "right": returns all rows from the right table, and the matched rows from +the left table +\item "full": returns all rows when there is a match in either left or right +table +\item "cross": returns the Cartesian product of rows from both tables +\item "semi": returns rows from the left table that have a match in the right +table. +\item "anti": returns rows from the left table that have no match in the right +table. +}} + +\item{...}{Dots which should be empty.} + +\item{left_on, right_on}{Same as \code{on} but only for the left or the right +DataFrame. 
They must have the same length.} + +\item{suffix}{Suffix to add to duplicated column names.} + +\item{validate}{Checks if join is of specified type: +\itemize{ +\item \code{"m:m"} (default): many-to-many, doesn't perform any checks; +\item \code{"1:1"}: one-to-one, check if join keys are unique in both left and right +datasets; +\item \code{"1:m"}: one-to-many, check if join keys are unique in left dataset +\item \code{"m:1"}: many-to-one, check if join keys are unique in right dataset +} + +Note that this is currently not supported by the streaming engine.} + +\item{join_nulls}{Join on null values. By default null values will never +produce matches.} + +\item{allow_parallel}{Allow the physical plan to optionally evaluate the +computation of both DataFrames up to the join in parallel.} + +\item{force_parallel}{Force the physical plan to evaluate the computation of +both DataFrames up to the join in parallel.} + +\item{coalesce}{Coalescing behavior (merging of join columns). +\itemize{ +\item \code{NULL}: join specific. +\item \code{TRUE}: Always coalesce join columns. +\item \code{FALSE}: Never coalesce join columns. +Note that joining on any other expressions than \code{col} will turn off +coalescing. +}} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +This function can do both mutating joins (adding columns based on matching +observations, for example with \code{how = "left"}) and filtering joins (keeping +observations based on matching observations, for example with \code{how = "inner"}). 
+} +\examples{ +lf <- pl$LazyFrame( + foo = 1:3, + bar = c(6, 7, 8), + ham = c("a", "b", "c") +) +other_lf <- pl$LazyFrame( + apple = c("x", "y", "z"), + ham = c("a", "b", "d") +) +lf$join(other_lf, on = "ham")$collect() + +lf$join(other_lf, on = "ham", how = "full")$collect() + +lf$join(other_lf, on = "ham", how = "left", coalesce = TRUE)$collect() + +lf$join(other_lf, on = "ham", how = "semi")$collect() + +lf$join(other_lf, on = "ham", how = "anti")$collect() +} diff --git a/man/lazyframe__join_asof.Rd b/man/lazyframe__join_asof.Rd new file mode 100644 index 00000000..fe3cacd3 --- /dev/null +++ b/man/lazyframe__join_asof.Rd @@ -0,0 +1,166 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__join_asof} +\alias{lazyframe__join_asof} +\title{Perform joins on nearest keys} +\usage{ +lazyframe__join_asof( + other, + ..., + left_on = NULL, + right_on = NULL, + on = NULL, + by_left = NULL, + by_right = NULL, + by = NULL, + strategy = c("backward", "forward", "nearest"), + suffix = "_right", + tolerance = NULL, + allow_parallel = TRUE, + force_parallel = FALSE, + coalesce = TRUE +) +} +\arguments{ +\item{other}{LazyFrame to join with.} + +\item{...}{Dots which should be empty.} + +\item{by_left, by_right}{Same as \code{by} but only for the left or the right +table. They must have the same length.} + +\item{by}{Join on these columns before performing asof join. Either a vector +of column names or a list of expressions and/or strings. Use \code{by_left} and +\code{by_right} if the column names to match on are different between the two +tables.} + +\item{strategy}{Strategy for where to find a match: +\itemize{ +\item \code{"backward"} (default): search for the last row in the right table whose +\code{on} key is less than or equal to the left key. +\item \code{"forward"}: search for the first row in the right table whose \code{on} key is +greater than or equal to the left key.
+\item \code{"nearest"}: search for the last row in the right table whose value is +nearest to the left key. String keys are not currently supported for a +nearest search. +}} + +\item{tolerance}{Numeric tolerance. By setting this the join will only be +done if the near keys are within this distance. If an asof join is done on +columns of dtype "Date", "Datetime", "Duration" or "Time", use the Polars +duration string language (see details).} + +\item{coalesce}{Coalescing behavior (merging of \code{on} / \code{left_on} / +\code{right_on} columns): +\itemize{ +\item \code{TRUE}: Always coalesce join columns; +\item \code{FALSE}: Never coalesce join columns. +Note that joining on any other expressions than \code{col} will turn off +coalescing. +}} +} +\description{ +This is similar to a left-join except that we match on nearest key rather +than equal keys. Both frames must be sorted by the \code{asof_join} key. +} +\section{Polars duration string language}{ + +Polars duration string language is a simple representation of +durations. It is used in many Polars functions that accept durations. + +It has the following format: +\itemize{ +\item 1ns (1 nanosecond) +\item 1us (1 microsecond) +\item 1ms (1 millisecond) +\item 1s (1 second) +\item 1m (1 minute) +\item 1h (1 hour) +\item 1d (1 calendar day) +\item 1w (1 calendar week) +\item 1mo (1 calendar month) +\item 1q (1 calendar quarter) +\item 1y (1 calendar year) +} + +Or combine them: \code{"3d12h4m25s"} # 3 days, 12 hours, 4 minutes, and 25 seconds + +By "calendar day", we mean the corresponding time on the next day +(which may not be 24 hours, due to daylight savings). +Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". 
+} + +\examples{ +gdp <- pl$LazyFrame( + date = as.Date(c("2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1", "2020-1-1")), + gdp = c(4164, 4411, 4566, 4696, 4827) +) + +pop <- pl$LazyFrame( + date = as.Date(c("2016-3-1", "2018-8-1", "2019-1-1")), + population = c(82.19, 82.66, 83.12) +) + +# Optional: make sure both tables are already sorted by the "on" join key +gdp <- gdp$sort("date") +pop <- pop$sort("date") + + +# Note how the dates don’t quite match. If we join them using join_asof and +# strategy = 'backward', then each date from population which doesn’t have +# an exact match is matched with the closest earlier date from gdp: +pop$join_asof(gdp, on = "date", strategy = "backward")$collect() + +# Note how: +# - date 2016-03-01 from population is matched with 2016-01-01 from gdp; +# - date 2018-08-01 from population is matched with 2018-01-01 from gdp. +# You can verify this by passing coalesce = FALSE: +pop$join_asof( + gdp, + on = "date", strategy = "backward", coalesce = FALSE +)$collect() + +# If we instead use strategy = 'forward', then each date from population +# which doesn’t have an exact match is matched with the closest later date +# from gdp: +pop$join_asof(gdp, on = "date", strategy = "forward")$collect() + +# Note how: +# - date 2016-03-01 from population is matched with 2017-05-01 from gdp; +# - date 2018-08-01 from population is matched with 2019-01-01 from gdp. + +# Finally, strategy = 'nearest' gives us a mix of the two results above, as +# each date from population which doesn’t have an exact match is matched +# with the closest date from gdp, regardless of whether it’s earlier or +# later: +pop$join_asof(gdp, on = "date", strategy = "nearest")$collect() + +# Note how: +# - date 2016-03-01 from population is matched with 2016-01-01 from gdp; +# - date 2018-08-01 from population is matched with 2019-01-01 from gdp. + +# The `by` argument allows joining on another column first, before the asof +# join.
In this example we join by country first, then asof join by date, as +# above. +gdp2 <- pl$LazyFrame( + country = rep(c("Germany", "Netherlands"), each = 5), + date = rep( + as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1", "2020-1-1")), + 2 + ), + gdp = c(4164, 4411, 4566, 4696, 4827, 784, 833, 914, 910, 909) +)$sort("country", "date") +gdp2$collect() + +pop2 <- pl$LazyFrame( + country = rep(c("Germany", "Netherlands"), each = 3), + date = rep(as.Date(c("2016-3-1", "2018-8-1", "2019-1-1")), 2), + population = c(82.19, 82.66, 83.12, 17.11, 17.32, 17.40) +)$sort("country", "date") +pop2$collect() + +pop2$join_asof( + gdp2, + by = "country", on = "date", strategy = "nearest" +)$collect() +} diff --git a/man/lazyframe__join_where.Rd b/man/lazyframe__join_where.Rd new file mode 100644 index 00000000..28a6a450 --- /dev/null +++ b/man/lazyframe__join_where.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__join_where} +\alias{lazyframe__join_where} +\title{Perform a join based on one or multiple (in)equality predicates} +\usage{ +lazyframe__join_where(other, ..., suffix = "_right") +} +\arguments{ +\item{other}{LazyFrame to join with.} + +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> (In)Equality condition to +join the two tables on. When a column name occurs in both tables, the proper +suffix must be applied in the predicate. 
For example, if both tables have a +column \code{"x"} that you want to use in the conditions, you must refer to the +column of the right table as \code{"x<suffix>"} (\code{"x_right"} with the default suffix).} + +\item{suffix}{Suffix to append to columns with a duplicate name.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +This performs an inner join, so only rows where all predicates are true are +included in the result, and a row from either LazyFrame may be included +multiple times in the result. + +Note that the row order of the input LazyFrames is not preserved. +} +\examples{ +east <- pl$LazyFrame( + id = c(100, 101, 102), + dur = c(120, 140, 160), + rev = c(12, 14, 16), + cores = c(2, 8, 4) +) + +west <- pl$LazyFrame( + t_id = c(404, 498, 676, 742), + time = c(90, 130, 150, 170), + cost = c(9, 13, 15, 16), + cores = c(4, 2, 1, 4) +) + +east$join_where( + west, + pl$col("dur") < pl$col("time"), + pl$col("rev") < pl$col("cost") +)$collect() +} diff --git a/man/lazyframe__last.Rd b/man/lazyframe__last.Rd new file mode 100644 index 00000000..a1ab582c --- /dev/null +++ b/man/lazyframe__last.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__last} +\alias{lazyframe__last} +\title{Get the last row of the LazyFrame} +\usage{ +lazyframe__last() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Get the last row of the LazyFrame +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$last()$collect() +} diff --git a/man/lazyframe__limit.Rd b/man/lazyframe__limit.Rd new file mode 100644 index 00000000..8f323354 --- /dev/null +++ b/man/lazyframe__limit.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__limit} +\alias{lazyframe__limit} +\title{Get
the first \code{n} rows} +\usage{ +lazyframe__limit(n = 5) +} +\arguments{ +\item{n}{Number of rows to return.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Alias for \code{\link[=lazyframe__head]{$head()}}. +} +\examples{ +lf <- pl$LazyFrame(a = 1:6, b = 7:12) +lf$limit()$collect() +lf$limit(2)$collect() +} diff --git a/man/lazyframe__max.Rd b/man/lazyframe__max.Rd new file mode 100644 index 00000000..f798a561 --- /dev/null +++ b/man/lazyframe__max.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__max} +\alias{lazyframe__max} +\title{Aggregate the columns in the LazyFrame to their maximum value} +\usage{ +lazyframe__max() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Aggregate the columns in the LazyFrame to their maximum value +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$max()$collect() +} diff --git a/man/lazyframe__mean.Rd b/man/lazyframe__mean.Rd new file mode 100644 index 00000000..f19405d0 --- /dev/null +++ b/man/lazyframe__mean.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__mean} +\alias{lazyframe__mean} +\title{Aggregate the columns in the LazyFrame to their mean value} +\usage{ +lazyframe__mean() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Aggregate the columns in the LazyFrame to their mean value +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$mean()$collect() +} diff --git a/man/lazyframe__median.Rd b/man/lazyframe__median.Rd new file mode 100644 index 00000000..7bcf7a69 --- /dev/null +++ b/man/lazyframe__median.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__median} +\alias{lazyframe__median} +\title{Aggregate the columns in the LazyFrame to their median value} +\usage{ +lazyframe__median() +} +\value{ +A polars 
\link{LazyFrame} +} +\description{ +Aggregate the columns in the LazyFrame to their median value +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$median()$collect() +} diff --git a/man/lazyframe__merge_sorted.Rd b/man/lazyframe__merge_sorted.Rd new file mode 100644 index 00000000..1b7eea03 --- /dev/null +++ b/man/lazyframe__merge_sorted.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__merge_sorted} +\alias{lazyframe__merge_sorted} +\title{Take two sorted LazyFrames and merge them by the sorted key} +\usage{ +lazyframe__merge_sorted(other, key) +} +\arguments{ +\item{other}{Other LazyFrame that must be merged.} + +\item{key}{Key that is sorted.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +The output of this operation will also be sorted. It is the caller's +responsibility that the frames are sorted by that key, otherwise the output +will not make sense. The schemas of both LazyFrames must be equal.
+} +\examples{ +lf1 <- pl$LazyFrame( + name = c("steve", "elise", "bob"), + age = c(42, 44, 18) +)$sort("age") + +lf2 <- pl$LazyFrame( + name = c("anna", "megan", "steve", "thomas"), + age = c(21, 33, 42, 20) +)$sort("age") + +lf1$merge_sorted(lf2, key = "age")$collect() +} diff --git a/man/lazyframe__min.Rd b/man/lazyframe__min.Rd new file mode 100644 index 00000000..a2946a86 --- /dev/null +++ b/man/lazyframe__min.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__min} +\alias{lazyframe__min} +\title{Aggregate the columns in the LazyFrame to their minimum value} +\usage{ +lazyframe__min() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Aggregate the columns in the LazyFrame to their minimum value +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$min()$collect() +} diff --git a/man/lazyframe__null_count.Rd b/man/lazyframe__null_count.Rd new file mode 100644 index 00000000..ec9955fe --- /dev/null +++ b/man/lazyframe__null_count.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__null_count} +\alias{lazyframe__null_count} +\title{Return the number of null elements for each column} +\usage{ +lazyframe__null_count() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Return the number of null elements for each column +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4)) +lf$null_count()$collect() +} diff --git a/man/lazyframe__profile.Rd b/man/lazyframe__profile.Rd new file mode 100644 index 00000000..6182fb6d --- /dev/null +++ b/man/lazyframe__profile.Rd @@ -0,0 +1,161 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__profile} +\alias{lazyframe__profile} +\title{Collect and profile a lazy query.} +\usage{ +lazyframe__profile( + type_coercion = TRUE, + predicate_pushdown = 
TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + cluster_with_columns = TRUE, + streaming = FALSE, + no_optimization = FALSE, + collect_in_background = FALSE, + show_plot = FALSE, + truncate_nodes = 0 +) + +lazyframe__profile( + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + cluster_with_columns = TRUE, + streaming = FALSE, + no_optimization = FALSE, + collect_in_background = FALSE, + show_plot = FALSE, + truncate_nodes = 0 +) +} +\arguments{ +\item{type_coercion}{A logical indicating whether to apply the type coercion optimization.} + +\item{predicate_pushdown}{A logical indicating whether to apply the predicate pushdown optimization.} + +\item{projection_pushdown}{A logical indicating whether to apply the projection pushdown optimization.} + +\item{simplify_expression}{A logical indicating whether to apply the simplify expression optimization.} + +\item{slice_pushdown}{A logical indicating whether to apply the slice pushdown optimization.} + +\item{comm_subplan_elim}{A logical indicating whether to try to cache branching subplans that occur on self-joins or unions.} + +\item{comm_subexpr_elim}{A logical indicating whether to try to cache common subexpressions.} + +\item{cluster_with_columns}{A logical indicating whether to combine sequential independent calls to with_columns.} + +\item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data. +If \code{FALSE} (default), the entire query is processed in a single batch. +Note that streaming mode is considered unstable. +It may be changed at any point without it being considered a breaking change.} + +\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.} + +\item{show_plot}{Show a Gantt chart of the profiling result} + +\item{truncate_nodes}{Truncate the label lengths in the Gantt chart to this +number of characters.
If \code{0} (default), do not truncate.} +} +\value{ +List of two \code{DataFrame}s: one with the collected result, the other +with the timings of each step. If \code{show_plot = TRUE}, then the plot is +also stored in the list. + +List of two \code{DataFrame}s: one with the collected result, the other +with the timings of each step. If \code{show_plot = TRUE}, then the plot is +also stored in the list. +} +\description{ +This will run the query and return a list containing the materialized +DataFrame and a DataFrame that contains profiling information of each node +that is executed. + +This will run the query and return a list containing the +materialized DataFrame and a DataFrame that contains profiling information +of each node that is executed. +} +\details{ +The units of the timings are microseconds. + +The units of the timings are microseconds. +} +\examples{ +## Simplest use case +pl$LazyFrame()$select(pl$lit(2) + 2)$profile() + +## Use $profile() to compare two queries + +# -1- map each Species-group with native polars +as_polars_lf(iris)$ + sort("Sepal.Length")$ + group_by("Species", maintain_order = TRUE)$ + agg(pl$col(pl$Float64)$first() + 5)$ + profile() + +# -2- map each Species-group of each numeric column with an R function + +# some R function, prints `.` for each time called by polars +r_func <- \(s) { + cat(".") + s$to_r()[1] + 5 +} + +as_polars_lf(iris)$ + sort("Sepal.Length")$ + group_by("Species", maintain_order = TRUE)$ + agg(pl$col(pl$Float64)$map_elements(r_func))$ + profile() +## Simplest use case +pl$LazyFrame()$select(pl$lit(2) + 2)$profile() + +## Use $profile() to compare two queries + +# -1- map each Species-group with native polars, takes ~120us only +as_polars_lf(iris)$ + sort("Sepal.Length")$ + group_by("Species", maintain_order = TRUE)$ + agg(pl$col(pl$Float64)$first() + 5)$ + profile() + +# -2- map each Species-group of each numeric column with an R function, takes ~7000us (slow!)
+ +# some R function, prints `.` for each time called by polars +r_func <- \(s) { + cat(".") + s$to_r()[1] + 5 +} + +as_polars_lf(iris)$ + sort("Sepal.Length")$ + group_by("Species", maintain_order = TRUE)$ + agg(pl$col(pl$Float64)$map_elements(r_func))$ + profile() +} +\seealso{ +\itemize{ +\item \code{\link[=LazyFrame_collect]{$collect()}} - regular collect. +\item \code{\link[=LazyFrame_collect_in_background]{$collect_in_background()}} - non-blocking +collect returns a future handle. Can also just be used via +\verb{$collect(collect_in_background = TRUE)}. +\item \code{\link[=LazyFrame_sink_parquet]{$sink_parquet()}} streams query to a parquet file. +\item \code{\link[=LazyFrame_sink_ipc]{$sink_ipc()}} streams query to a arrow file. +} + +\itemize{ +\item \code{\link[=lazyframe__collect]{$collect()}} - regular collect. +\item \code{\link[=lazyframe__collect_in_background]{$collect_in_background()}} - non-blocking +collect returns a future handle. Can also just be used via +\verb{$collect(collect_in_background = TRUE)}. +\item \code{\link[=lazyframe__sink_parquet]{$sink_parquet()}} streams query to a parquet file. +\item \code{\link[=lazyframe__sink_ipc]{$sink_ipc()}} streams query to a arrow file. 
+} +} diff --git a/man/lazyframe__quantile.Rd b/man/lazyframe__quantile.Rd new file mode 100644 index 00000000..7621d742 --- /dev/null +++ b/man/lazyframe__quantile.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__quantile} +\alias{lazyframe__quantile} +\title{Aggregate the columns in the LazyFrame to a unique quantile value} +\usage{ +lazyframe__quantile( + quantile, + interpolation = c("nearest", "higher", "lower", "midpoint", "linear") +) +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Aggregate the columns in the LazyFrame to a unique quantile value +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$quantile(0.7)$collect() +} diff --git a/man/lazyframe__rename.Rd b/man/lazyframe__rename.Rd new file mode 100644 index 00000000..454f7e99 --- /dev/null +++ b/man/lazyframe__rename.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__rename} +\alias{lazyframe__rename} +\title{Rename column names} +\usage{ +lazyframe__rename(..., .strict = TRUE) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Either a function that takes +a character vector as input and returns a character vector as output, or +named values where names are old column names and values are the new ones.} + +\item{.strict}{Validate that all column names exist in the current schema, +and throw an error if any do not. (Note that this parameter is a no-op when +passing a function to \code{...}).} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Rename column names +} +\details{ +If existing names are swapped (e.g. 'A' points to 'B' and 'B' points to +'A'), polars will block projection and predicate pushdowns at this node.
+} +\examples{ +lf <- pl$LazyFrame( + foo = 1:3, + bar = 6:8, + ham = letters[1:3] +) + +lf$rename(foo = "apple")$collect() + +lf$rename( + \(column_name) paste0("c", substr(column_name, 2, 100)) +)$collect() +} diff --git a/man/lazyframe__reverse.Rd b/man/lazyframe__reverse.Rd new file mode 100644 index 00000000..d9675b14 --- /dev/null +++ b/man/lazyframe__reverse.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__reverse} +\alias{lazyframe__reverse} +\title{Reverse the LazyFrame} +\usage{ +lazyframe__reverse() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Reverse the LazyFrame +} +\examples{ +lf <- pl$LazyFrame(key = c("a", "b", "c"), val = 1:3) +lf$reverse()$collect() +} diff --git a/man/lazyframe__rolling.Rd b/man/lazyframe__rolling.Rd new file mode 100644 index 00000000..62768257 --- /dev/null +++ b/man/lazyframe__rolling.Rd @@ -0,0 +1,88 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__rolling} +\alias{lazyframe__rolling} +\title{Create rolling groups based on a date/time or integer column} +\usage{ +lazyframe__rolling( + index_column, + ..., + period, + offset = NULL, + closed = "right", + group_by = NULL +) +} +\arguments{ +\item{index_column}{Column used to group based on the time window. Often of +type Date/Datetime. This column must be sorted in ascending order (or, if +\code{group_by} is specified, then it must be sorted in ascending order within +each group). +In case of a dynamic group by on indices, the data type needs to be either +Int32 or Int64. Note that Int32 gets temporarily cast to Int64, so if +performance matters, use an Int64 column.} + +\item{...}{Dots which should be empty.} + +\item{period}{Length of the window - must be non-negative.} + +\item{offset}{Offset of the window.
Default is \code{-period}.} + +\item{closed}{Define which sides of the interval are closed (inclusive). +Default is \code{"right"}.} +} +\value{ +A \link[=LazyGroupBy_class]{LazyGroupBy} object +} +\description{ +Different from \code{group_by_dynamic}, the windows are now determined by the +individual values and are not of constant intervals. For constant intervals +use \code{\link[=lazyframe__group_by_dynamic]{$group_by_dynamic()}}. + +If you have a time series \verb{<t_0, t_1, ..., t_n>}, then by default the +windows created will be: +\itemize{ +\item \verb{(t_0 - period, t_0]} +\item \verb{(t_1 - period, t_1]} +\item … +\item \verb{(t_n - period, t_n]} +} + +whereas if you pass a non-default \code{offset}, then the windows will be: +\itemize{ +\item \verb{(t_0 + offset, t_0 + offset + period]} +\item \verb{(t_1 + offset, t_1 + offset + period]} +\item … +\item \verb{(t_n + offset, t_n + offset + period]} +} +} +\details{ +If you want to compute multiple aggregation statistics over the same dynamic +window, consider using \code{\link[=expr__rolling]{$rolling()}} - this method can cache +the window size computation.
+} +\examples{ +dates <- c( + "2020-01-01 13:45:48", + "2020-01-01 16:42:13", + "2020-01-01 16:45:09", + "2020-01-02 18:12:48", + "2020-01-03 19:45:32", + "2020-01-08 23:16:43" +) + +df <- pl$LazyFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$with_columns( + pl$col("dt")$str$strptime(pl$Datetime()) +) + +df$rolling(index_column = "dt", period = "2d")$agg( + sum_a = pl$col("a")$sum(), + min_a = pl$col("a")$min(), + max_a = pl$col("a")$max() +)$collect() +} +\seealso{ +\itemize{ +\item \code{\link[=lazyframe__group_by_dynamic]{$group_by_dynamic()}} +} +} diff --git a/man/lazyframe__select_seq.Rd b/man/lazyframe__select_seq.Rd new file mode 100644 index 00000000..eec4c8fc --- /dev/null +++ b/man/lazyframe__select_seq.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__select_seq} +\alias{lazyframe__select_seq} +\title{Select columns from this LazyFrame} +\usage{ +lazyframe__select_seq(...) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> +Name-value pairs of objects to be converted to polars \link[=Expr]{expressions} +by the \code{\link[=as_polars_expr]{as_polars_expr()}} function. +Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}. +Each name will be used as the expression name.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +This will run all expressions sequentially instead of in parallel. Use this +when the work per expression is cheap.
+} +\examples{ +lf <- pl$LazyFrame( + foo = 1:3, + bar = 6:8, + ham = letters[1:3] +) +lf$select_seq("foo")$collect() +} diff --git a/man/lazyframe__serialize.Rd b/man/lazyframe__serialize.Rd new file mode 100644 index 00000000..f532da6c --- /dev/null +++ b/man/lazyframe__serialize.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__serialize} +\alias{lazyframe__serialize} +\title{Serialize the logical plan of this LazyFrame to a string in JSON format} +\usage{ +lazyframe__serialize() +} +\value{ +A character value +} +\description{ +Serialize the logical plan of this LazyFrame to a string in JSON format +} +\examples{ +lf <- pl$LazyFrame(a = 1:3)$sum() +lf$serialize() +} diff --git a/man/lazyframe__set_sorted.Rd b/man/lazyframe__set_sorted.Rd new file mode 100644 index 00000000..bf53468a --- /dev/null +++ b/man/lazyframe__set_sorted.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__set_sorted} +\alias{lazyframe__set_sorted} +\title{Indicate that one or multiple columns are sorted} +\usage{ +lazyframe__set_sorted(column, ..., descending = FALSE) +} +\arguments{ +\item{column}{Columns that are sorted.} + +\item{...}{Dots which should be empty.} + +\item{descending}{Whether the columns are sorted in descending order.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +This can speed up future operations, but it can lead to incorrect results if +the data is \strong{not} sorted! Use with care! 
+} diff --git a/man/lazyframe__shift.Rd b/man/lazyframe__shift.Rd new file mode 100644 index 00000000..e4fafbb2 --- /dev/null +++ b/man/lazyframe__shift.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__shift} +\alias{lazyframe__shift} +\title{Shift values by the given number of indices} +\usage{ +lazyframe__shift(n = 1, ..., fill_value = NULL) +} +\arguments{ +\item{n}{Number of indices to shift forward. If a negative value is passed, +values are shifted in the opposite direction instead.} + +\item{...}{Dots which should be empty.} + +\item{fill_value}{Fill the resulting null values with this value. Accepts +expression input. Non-expression inputs are parsed as literals.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Shift values by the given number of indices +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = 5:8) + +# By default, values are shifted forward by one index. +lf$shift()$collect() + +# Pass a negative value to shift in the opposite direction instead. +lf$shift(-2)$collect() + +# Specify fill_value to fill the resulting null values. 
+lf$shift(-2, fill_value = 100)$collect() +} diff --git a/man/lazyframe__sink_csv.Rd b/man/lazyframe__sink_csv.Rd new file mode 100644 index 00000000..98202d20 --- /dev/null +++ b/man/lazyframe__sink_csv.Rd @@ -0,0 +1,137 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__sink_csv} +\alias{lazyframe__sink_csv} +\title{Evaluate the query in streaming mode and write to a CSV file} +\usage{ +lazyframe__sink_csv( + path, + ..., + include_bom = FALSE, + include_header = TRUE, + separator = ",", + line_terminator = "\\n", + quote_char = "\\"", + batch_size = 1024, + datetime_format = NULL, + date_format = NULL, + time_format = NULL, + float_precision = NULL, + null_value = "", + quote_style = "necessary", + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + storage_options = NULL, + retries = 2 +) +} +\arguments{ +\item{path}{A character. File path to which the file should be written.} + +\item{...}{Dots which should be empty.} + +\item{include_bom}{Logical, whether to include UTF-8 BOM in the CSV output.} + +\item{include_header}{Logical, whether to include header in the CSV output.} + +\item{separator}{Separate CSV fields with this symbol.} + +\item{line_terminator}{String used to end each row.} + +\item{quote_char}{Byte to use as quoting character.} + +\item{batch_size}{Number of rows that will be processed per thread.} + +\item{datetime_format}{A format string, with the specifiers defined by the +\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{chrono} +Rust crate.
If no format is specified, the default fractional-second precision +is inferred from the maximum timeunit found in the frame’s Datetime cols (if +any).} + +\item{date_format}{A format string, with the specifiers defined by the +\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{chrono} +Rust crate.} + +\item{time_format}{A format string, with the specifiers defined by the +\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{chrono} +Rust crate.} + +\item{float_precision}{Number of decimal places to write, applied to both +Float32 and Float64 datatypes.} + +\item{null_value}{A string representing null values (defaulting to the empty +string).} + +\item{quote_style}{Determines the quoting strategy used. Must be one of: +\itemize{ +\item \code{"necessary"} (default): This puts quotes around fields only when +necessary. They are necessary when fields contain a quote, delimiter or +record terminator. Quotes are also necessary when writing an empty record +(which is indistinguishable from a record with one empty field). This is +the default. +\item \code{"always"}: This puts quotes around every field. Always. +\item \code{"never"}: This never puts quotes around fields, even if that results in +invalid CSV data (e.g.: by not quoting strings containing the separator). +\item \code{"non_numeric"}: This puts quotes around all fields that are non-numeric. +Namely, when writing a field that does not parse as a valid float or +integer, then quotes will be used even if they aren't strictly necessary. +}} + +\item{maintain_order}{Maintain the order in which data is processed.
Setting +this to \code{FALSE} will be slightly faster.} + +\item{type_coercion}{A logical indicating whether to apply the type coercion optimization.} + +\item{predicate_pushdown}{A logical indicating whether to apply the predicate pushdown optimization.} + +\item{projection_pushdown}{A logical indicating whether to apply the projection pushdown optimization.} + +\item{simplify_expression}{A logical indicating whether to apply the simplify expression optimization.} + +\item{slice_pushdown}{A logical indicating whether to apply the slice pushdown optimization.} + +\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.} + +\item{storage_options}{Named vector containing options that indicate how to +connect to a cloud provider. The cloud providers currently supported are +AWS, GCP, and Azure. +See supported keys here: +\itemize{ +\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws} +\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp} +\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure} +\item Hugging Face (\verb{hf://}): Accepts an API key under the token parameter +\code{c(token = YOUR_TOKEN)} or by setting the \code{HF_TOKEN} environment +variable. +} + +If \code{storage_options} is not provided, Polars will try to infer the +information from environment variables.} + +\item{retries}{Number of retries if accessing a cloud instance fails.} +} +\value{ +Invisibly returns the input LazyFrame +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +This allows streaming results that are larger than RAM to be written to disk.
+} +\examples{ +# sink table 'mtcars' from mem to CSV +tmpf <- tempfile() +pl$LazyFrame(mtcars)$sink_csv(tmpf) + +# stream a query end-to-end +tmpf2 <- tempfile() +pl$scan_csv(tmpf)$select(pl$col("cyl") * 2)$sink_csv(tmpf2) + +# load CSV directly into a DataFrame / memory +pl$scan_csv(tmpf2)$collect() +} diff --git a/man/lazyframe__sink_ipc.Rd b/man/lazyframe__sink_ipc.Rd new file mode 100644 index 00000000..c9c509a9 --- /dev/null +++ b/man/lazyframe__sink_ipc.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__sink_ipc} +\alias{lazyframe__sink_ipc} +\title{Evaluate the query in streaming mode and write to an IPC file} +\usage{ +lazyframe__sink_ipc( + path, + ..., + compression = c("zstd", "lz4", "uncompressed"), + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + storage_options = NULL, + retries = 2 +) +} +\arguments{ +\item{path}{A character. File path to which the file should be written.} + +\item{...}{Dots which should be empty.} + +\item{compression}{\code{NULL} or one of: +\itemize{ +\item \code{"uncompressed"}: same as \code{NULL}. +\item \code{"lz4"}: fast compression/decompression. +\item \code{"zstd"}: good compression performance. +}} + +\item{maintain_order}{Maintain the order in which data is processed. Setting +this to \code{FALSE} will be slightly faster.} + +\item{type_coercion}{A logical indicating whether to apply the type coercion optimization.} + +\item{predicate_pushdown}{A logical indicating whether to apply the predicate pushdown optimization.} + +\item{projection_pushdown}{A logical indicating whether to apply the projection pushdown optimization.} + +\item{simplify_expression}{A logical indicating whether to apply the simplify expression optimization.} + +\item{slice_pushdown}{A logical indicating whether to apply the slice pushdown optimization.} + +\item{no_optimization}{A logical.
If \code{TRUE}, turn off (certain) optimizations.} + +\item{storage_options}{Named vector containing options that indicate how to +connect to a cloud provider. The cloud providers currently supported are +AWS, GCP, and Azure. +See supported keys here: +\itemize{ +\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws} +\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp} +\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure} +\item Hugging Face (\verb{hf://}): Accepts an API key under the token parameter +\code{c(token = YOUR_TOKEN)} or by setting the \code{HF_TOKEN} environment +variable. +} + +If \code{storage_options} is not provided, Polars will try to infer the +information from environment variables.} + +\item{retries}{Number of retries if accessing a cloud instance fails.} +} +\value{ +Invisibly returns the input LazyFrame +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +This allows streaming results that are larger than RAM to be written to disk. 
+} +\examples{ +# sink table 'mtcars' from mem to ipc +tmpf <- tempfile() +as_polars_lf(mtcars)$sink_ipc(tmpf) + +# stream a query end-to-end (not supported yet, https://github.com/pola-rs/polars/issues/1040) +# tmpf2 = tempfile() +# pl$scan_ipc(tmpf)$select(pl$col("cyl") * 2)$sink_ipc(tmpf2) + +# load ipc directly into a DataFrame / memory +# pl$scan_ipc(tmpf2)$collect() +} diff --git a/man/lazyframe__sink_ndjson.Rd b/man/lazyframe__sink_ndjson.Rd new file mode 100644 index 00000000..218c8e2e --- /dev/null +++ b/man/lazyframe__sink_ndjson.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__sink_ndjson} +\alias{lazyframe__sink_ndjson} +\title{Evaluate the query in streaming mode and write to an NDJSON file} +\usage{ +lazyframe__sink_ndjson( + path, + ..., + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + storage_options = NULL, + retries = 2 +) +} +\arguments{ +\item{path}{A character. File path to which the file should be written.} + +\item{...}{Dots which should be empty.} + +\item{maintain_order}{Maintain the order in which data is processed. Setting +this to \code{FALSE} will be slightly faster.} + +\item{type_coercion}{A logical indicating whether to apply the type coercion optimization.} + +\item{predicate_pushdown}{A logical indicating whether to apply the predicate pushdown optimization.} + +\item{projection_pushdown}{A logical indicating whether to apply the projection pushdown optimization.} + +\item{simplify_expression}{A logical indicating whether to apply the simplify expression optimization.} + +\item{slice_pushdown}{A logical indicating whether to apply the slice pushdown optimization.} + +\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.} + +\item{storage_options}{Named vector containing options that indicate how to +connect to a cloud provider.
The cloud providers currently supported are +AWS, GCP, and Azure. +See supported keys here: +\itemize{ +\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws} +\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp} +\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure} +\item Hugging Face (\verb{hf://}): Accepts an API key under the token parameter +\code{c(token = YOUR_TOKEN)} or by setting the \code{HF_TOKEN} environment +variable. +} + +If \code{storage_options} is not provided, Polars will try to infer the +information from environment variables.} + +\item{retries}{Number of retries if accessing a cloud instance fails.} +} +\value{ +Invisibly returns the input LazyFrame +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +This allows streaming results that are larger than RAM to be written to disk. 
+} +\examples{ +# sink table 'mtcars' from mem to NDJSON +tmpf <- tempfile(fileext = ".ndjson") +pl$LazyFrame(mtcars)$sink_ndjson(tmpf) + +# load NDJSON directly into a DataFrame / memory +pl$scan_ndjson(tmpf)$collect() +} diff --git a/man/lazyframe__sink_parquet.Rd b/man/lazyframe__sink_parquet.Rd new file mode 100644 index 00000000..2e04bd4a --- /dev/null +++ b/man/lazyframe__sink_parquet.Rd @@ -0,0 +1,122 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__sink_parquet} +\alias{lazyframe__sink_parquet} +\title{Evaluate the query in streaming mode and write to a Parquet file} +\usage{ +lazyframe__sink_parquet( + path, + ..., + compression = "zstd", + compression_level = 3, + statistics = TRUE, + row_group_size = NULL, + data_page_size = NULL, + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + storage_options = NULL, + retries = 2 +) +} +\arguments{ +\item{path}{A character. File path to which the file should be written.} + +\item{...}{Dots which should be empty.} + +\item{compression}{The compression method. Must be one of: +\itemize{ +\item \code{"lz4"}: fast compression/decompression. +\item \code{"uncompressed"} +\item \code{"snappy"}: this guarantees that the parquet file will be compatible with +older parquet readers. +\item \code{"gzip"} +\item \code{"lzo"} +\item \code{"brotli"} +\item \code{"zstd"}: good compression performance. +}} + +\item{compression_level}{\code{NULL} or integer. The level of compression to use. +Only used if method is one of \code{"gzip"}, \code{"brotli"}, or \code{"zstd"}. Higher +compression means smaller files on disk: +\itemize{ +\item \code{"gzip"}: min-level: 0, max-level: 10. +\item \code{"brotli"}: min-level: 0, max-level: 11. +\item \code{"zstd"}: min-level: 1, max-level: 22. 
+}} + +\item{statistics}{Whether statistics should be written to the Parquet +headers. Possible values: +\itemize{ +\item \code{TRUE}: enable default set of statistics (default) +\item \code{FALSE}: disable all statistics +\item \code{"full"}: calculate and write all available statistics. +\item A named list where all values must be \code{TRUE} or \code{FALSE}, e.g. +\code{list(min = TRUE, max = FALSE)}. Statistics available are \code{"min"}, \code{"max"}, +\code{"distinct_count"}, \code{"null_count"}. +}} + +\item{row_group_size}{Size of the row groups in number of rows. If \code{NULL} +(default), the chunks of the DataFrame are used. Writing in smaller chunks +may reduce memory pressure and improve writing speeds.} + +\item{data_page_size}{Size of the data page in bytes. If \code{NULL} (default), it +is set to 1024^2 bytes.} + +\item{maintain_order}{Maintain the order in which data is processed. Setting +this to \code{FALSE} will be slightly faster.} + +\item{type_coercion}{A logical, indicates type coercion optimization.} + +\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.} + +\item{projection_pushdown}{A logical, indicates projection pushdown optimization.} + +\item{simplify_expression}{A logical, indicates simplify expression optimization.} + +\item{slice_pushdown}{A logical, indicates slice pushdown optimization.} + +\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.} + +\item{storage_options}{Named vector containing options that indicate how to +connect to a cloud provider. The cloud providers currently supported are +AWS, GCP, and Azure. 
+See supported keys here: +\itemize{ +\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws} +\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp} +\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure} +\item Hugging Face (\verb{hf://}): Accepts an API key under the token parameter +\code{c(token = YOUR_TOKEN)} or by setting the \code{HF_TOKEN} environment +variable. +} + +If \code{storage_options} is not provided, Polars will try to infer the +information from environment variables.} + +\item{retries}{Number of retries if accessing a cloud instance fails.} +} +\value{ +Invisibly returns the input LazyFrame +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +This allows streaming results that are larger than RAM to be written to disk. +} +\examples{ +# sink table 'mtcars' from mem to parquet +tmpf <- tempfile() +as_polars_lf(mtcars)$sink_parquet(tmpf) + +# stream a query end-to-end +tmpf2 <- tempfile() +pl$scan_parquet(tmpf)$select(pl$col("cyl") * 2)$sink_parquet(tmpf2) + +# load parquet directly into a DataFrame / memory +pl$scan_parquet(tmpf2)$collect() +} diff --git a/man/lazyframe__slice.Rd b/man/lazyframe__slice.Rd new file mode 100644 index 00000000..c268a768 --- /dev/null +++ b/man/lazyframe__slice.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__slice} +\alias{lazyframe__slice} +\title{Get a slice of the LazyFrame.} +\usage{ +lazyframe__slice(offset, length = NULL) +} +\arguments{ +\item{offset}{Start index. Negative indexing is supported.} + +\item{length}{Length of the slice. 
If \code{NULL} (default), all rows starting at +the offset will be selected.} +} +\value{ +A \link[=lazyframe__class]{LazyFrame} +} +\description{ +Get a slice of the LazyFrame. +} +\examples{ +lf <- pl$LazyFrame(x = c("a", "b", "c"), y = 1:3, z = 4:6) +lf$slice(1, 2)$collect() +} diff --git a/man/lazyframe__sort.Rd b/man/lazyframe__sort.Rd new file mode 100644 index 00000000..cd537530 --- /dev/null +++ b/man/lazyframe__sort.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__sort} +\alias{lazyframe__sort} +\title{Sort the LazyFrame by the given columns} +\usage{ +lazyframe__sort( + ..., + descending = FALSE, + nulls_last = FALSE, + multithreaded = TRUE, + maintain_order = FALSE +) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Column(s) to sort by. Can be +character values indicating column names or Expr(s).} + +\item{descending}{Sort in descending order. When sorting by multiple +columns, this can be specified per column by passing a logical vector.} + +\item{nulls_last}{Place null values last. When sorting by multiple +columns, this can be specified per column by passing a logical vector.} + +\item{multithreaded}{Sort using multiple threads.} + +\item{maintain_order}{Whether the order should be maintained if elements are +equal. If \code{TRUE}, streaming is not possible and performance might be worse +since this requires a stable search.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Sort the LazyFrame by the given columns +} +\examples{ +lf <- pl$LazyFrame( + a = c(1, 2, NA, 4), + b = c(6, 5, 4, 3), + c = c("a", "c", "b", "a") +) + +# Pass a single column name to sort by that column. 
+lf$sort("a")$collect() + +# Sorting by expressions is also supported +lf$sort(pl$col("a") + pl$col("b") * 2, nulls_last = TRUE)$collect() + +# Sort by multiple columns by passing a vector of columns +lf$sort(c("c", "a"), descending = TRUE)$collect() + +# Or use positional arguments to sort by multiple columns in the same way +lf$sort("c", "a", descending = c(FALSE, TRUE))$collect() +} diff --git a/man/lazyframe__std.Rd b/man/lazyframe__std.Rd new file mode 100644 index 00000000..e69e9d76 --- /dev/null +++ b/man/lazyframe__std.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__std} +\alias{lazyframe__std} +\title{Aggregate the columns of this LazyFrame to their standard deviation values} +\usage{ +lazyframe__std(ddof = 1) +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Aggregate the columns of this LazyFrame to their standard deviation values +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$std()$collect() +lf$std(ddof = 0)$collect() +} diff --git a/man/lazyframe__sum.Rd b/man/lazyframe__sum.Rd new file mode 100644 index 00000000..b1391c71 --- /dev/null +++ b/man/lazyframe__sum.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__sum} +\alias{lazyframe__sum} +\title{Aggregate the columns of this LazyFrame to their sum values} +\usage{ +lazyframe__sum() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Aggregate the columns of this LazyFrame to their sum values +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$sum()$collect() +} diff --git a/man/lazyframe__tail.Rd b/man/lazyframe__tail.Rd new file mode 100644 index 00000000..aebd3e0f --- /dev/null +++ b/man/lazyframe__tail.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__tail} +\alias{lazyframe__tail} +\title{Get 
the last \code{n} rows} +\usage{ +lazyframe__tail(n = 5L) + +lazyframe__tail(n = 5L) +} +\arguments{ +\item{n}{Number of rows to return.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Get the last \code{n} rows + +Get the last \code{n} rows. +} +\examples{ +lf <- pl$LazyFrame(a = 1:6, b = 7:12) +lf$tail()$collect() +lf$tail(2)$collect() +lf <- pl$LazyFrame(a = 1:6, b = 7:12) + +lf$tail()$collect() + +lf$tail(2)$collect() +} +\seealso{ +\code{\link[=lazyframe__head]{$head()}} +} diff --git a/man/lazyframe__to_dot.Rd b/man/lazyframe__to_dot.Rd new file mode 100644 index 00000000..eacace78 --- /dev/null +++ b/man/lazyframe__to_dot.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__to_dot} +\alias{lazyframe__to_dot} +\title{Plot the query plan} +\usage{ +lazyframe__to_dot( + ..., + optimized = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + cluster_with_columns = TRUE, + streaming = FALSE +) +} +\arguments{ +\item{...}{Not used.} + +\item{optimized}{Optimize the query plan.} + +\item{type_coercion}{A logical, indicates type coercion optimization.} + +\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.} + +\item{projection_pushdown}{A logical, indicates projection pushdown optimization.} + +\item{simplify_expression}{A logical, indicates simplify expression optimization.} + +\item{slice_pushdown}{A logical, indicates slice pushdown optimization.} + +\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.} + +\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.} + +\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.} + +\item{streaming}{A logical. 
If \code{TRUE}, process the query in batches to handle larger-than-memory data. +If \code{FALSE} (default), the entire query is processed in a single batch. +Note that streaming mode is considered unstable. +It may be changed at any point without it being considered a breaking change.} +} +\value{ +A character vector +} +\description{ +This only returns the "dot" output that can be passed to other packages, such +as \code{DiagrammeR::grViz()}. +} +\examples{ +lf <- pl$LazyFrame( + a = c("a", "b", "a", "b", "b", "c"), + b = 1:6, + c = 6:1 +) + +query <- lf$group_by("a", maintain_order = TRUE)$agg( + pl$all()$sum() +)$sort( + "a" +) + +query$to_dot() |> cat() + +# You could print the graph by using DiagrammeR for example, with +# query$to_dot() |> DiagrammeR::grViz(). +} diff --git a/man/lazyframe__top_k.Rd b/man/lazyframe__top_k.Rd new file mode 100644 index 00000000..60e13e1f --- /dev/null +++ b/man/lazyframe__top_k.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__top_k} +\alias{lazyframe__top_k} +\title{Return the \code{k} largest rows} +\usage{ +lazyframe__top_k(k, ..., by, reverse = FALSE) +} +\arguments{ +\item{k}{Number of rows to return.} + +\item{...}{Dots which should be empty.} + +\item{by}{Column(s) used to determine the top rows. Accepts expression +input. Strings are parsed as column names.} + +\item{reverse}{Consider the \code{k} smallest elements of the \code{by} column(s) +(instead of the \code{k} largest). This can be specified per column by passing a +sequence of booleans.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Non-null elements are always preferred over null elements, regardless of the +value of \code{reverse}. The output is not guaranteed to be in any particular +order, call \code{sort()} after this function if you wish the output to be sorted. 
+} +\examples{ +lf <- pl$LazyFrame( + a = c("a", "b", "a", "b", "b", "c"), + b = c(2, 1, 1, 3, 2, 1) +) + +# Get the rows which contain the 4 largest values in column b. +lf$top_k(4, by = "b")$collect() + +# Get the rows which contain the 4 largest values when sorting on column a +# and b. +lf$top_k(4, by = c("a", "b"))$collect() +} diff --git a/man/lazyframe__unique.Rd b/man/lazyframe__unique.Rd new file mode 100644 index 00000000..22d160d3 --- /dev/null +++ b/man/lazyframe__unique.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__unique} +\alias{lazyframe__unique} +\title{Drop duplicate rows from this LazyFrame} +\usage{ +lazyframe__unique( + subset = NULL, + ..., + keep = c("any", "none", "first", "last"), + maintain_order = FALSE +) +} +\arguments{ +\item{subset}{Column name(s) or selector(s), to consider when identifying +duplicate rows. If \code{NULL} (default), use all columns.} + +\item{...}{Dots which should be empty.} + +\item{keep}{Which of the duplicate rows to keep. Must be one of: +\itemize{ +\item \code{"any"}: does not give any guarantee of which row is kept. This allows +more optimizations. +\item \code{"none"}: don’t keep duplicate rows. +\item \code{"first"}: keep first unique row. +\item \code{"last"}: keep last unique row. +}} + +\item{maintain_order}{Keep the same order as the original LazyFrame. This is +more expensive to compute. 
Setting this to \code{TRUE} blocks the possibility to +run on the streaming engine.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Drop duplicate rows from this LazyFrame +} +\examples{ +lf <- pl$LazyFrame( + foo = c(1, 2, 3, 1), + bar = c("a", "a", "a", "a"), + ham = c("b", "b", "b", "b"), +) +lf$unique(maintain_order = TRUE)$collect() + +lf$unique(subset = c("bar", "ham"), maintain_order = TRUE)$collect() + +lf$unique(keep = "last", maintain_order = TRUE)$collect() +} diff --git a/man/lazyframe__unnest.Rd b/man/lazyframe__unnest.Rd new file mode 100644 index 00000000..844d86f1 --- /dev/null +++ b/man/lazyframe__unnest.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__unnest} +\alias{lazyframe__unnest} +\title{Decompose struct columns into separate columns for each of their fields} +\usage{ +lazyframe__unnest(...) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name of the struct column(s) +that should be unnested.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +The new columns will be inserted into the LazyFrame at the location of the +struct column. 
+} +\examples{ +lf <- pl$LazyFrame( + a = 1:5, + b = c("one", "two", "three", "four", "five"), + c = 6:10 +)$ + select( + pl$struct("b"), + pl$struct(c("a", "c"))$alias("a_and_c") +) +lf$collect() + +lf$unnest("a_and_c")$collect() +lf$unnest(pl$col("a_and_c"))$collect() +} diff --git a/man/lazyframe__unpivot.Rd b/man/lazyframe__unpivot.Rd new file mode 100644 index 00000000..5c37f657 --- /dev/null +++ b/man/lazyframe__unpivot.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__unpivot} +\alias{lazyframe__unpivot} +\title{Unpivot a LazyFrame from wide to long format} +\usage{ +lazyframe__unpivot( + on = NULL, + ..., + index = NULL, + variable_name = NULL, + value_name = NULL +) +} +\arguments{ +\item{on}{Columns to use as measured (value) variables. If \code{on} is +empty all columns that are not in \code{index} will be used.} + +\item{...}{Dots which should be empty.} + +\item{index}{Columns to use as identifier variables.} + +\item{variable_name}{Name to give to the new column containing the names of +the melted columns. Defaults to "variable".} + +\item{value_name}{Name to give to the new column containing the values of +the melted columns. Defaults to \code{"value"}.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +This function is useful to massage a LazyFrame into a format where one or +more columns are identifier variables (\code{index}) while all other columns, +considered measured variables (\code{on}), are “unpivoted” to the row axis +leaving just two non-identifier columns, "variable" and "value". 
+} +\examples{ +lf <- pl$LazyFrame( + a = c("x", "y", "z"), + b = c(1, 3, 5), + c = c(2, 4, 6) +) +lf$unpivot(index = "a", on = c("b", "c"))$collect() +} diff --git a/man/lazyframe__var.Rd b/man/lazyframe__var.Rd new file mode 100644 index 00000000..4e8c3ab5 --- /dev/null +++ b/man/lazyframe__var.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__var} +\alias{lazyframe__var} +\title{Aggregate the columns in the LazyFrame to their variance value} +\usage{ +lazyframe__var(ddof = 1) +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Aggregate the columns in the LazyFrame to their variance value +} +\examples{ +lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1)) +lf$var()$collect() +lf$var(ddof = 0)$collect() +} diff --git a/man/lazyframe__with_columns_seq.Rd b/man/lazyframe__with_columns_seq.Rd new file mode 100644 index 00000000..59dfc908 --- /dev/null +++ b/man/lazyframe__with_columns_seq.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__with_columns_seq} +\alias{lazyframe__with_columns_seq} +\title{Modify/append column(s) of a LazyFrame} +\usage{ +lazyframe__with_columns_seq(...) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> +Name-value pairs of objects to be converted to polars \link[=Expr]{expressions} +by the \code{\link[=as_polars_expr]{as_polars_expr()}} function. +Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}. +Each name will be used as the expression name.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +This will run all expressions sequentially instead of in parallel. Use this +only when the work per expression is cheap. + +Add columns or modify existing ones with expressions. This is similar to +\code{dplyr::mutate()} as it keeps unmentioned columns (unlike \verb{$select()}). 
+ +However, unlike \code{dplyr::mutate()}, one cannot use new variables in subsequent +expressions in the same \verb{$with_columns_seq()} call. For instance, if you create a +variable \code{x}, you will only be able to use it in another \verb{$with_columns_seq()} +or \verb{$select()} call. +} +\examples{ +# Pass an expression to add it as a new column. +lf <- pl$LazyFrame( + a = 1:4, + b = c(0.5, 4, 10, 13), + c = c(TRUE, TRUE, FALSE, TRUE), +) +lf$with_columns_seq((pl$col("a")^2)$alias("a^2"))$collect() + +# Added columns will replace existing columns with the same name. +lf$with_columns_seq(a = pl$col("a")$cast(pl$Float64))$collect() + +# Multiple columns can be added +lf$with_columns_seq( + (pl$col("a")^2)$alias("a^2"), + (pl$col("b") / 2)$alias("b/2"), + (pl$col("c")$not())$alias("not c"), +)$collect() + +# Name expression instead of `$alias()` +lf$with_columns_seq( + `a^2` = pl$col("a")^2, + `b/2` = pl$col("b") / 2, + `not c` = pl$col("c")$not(), +)$collect() + +# Expressions with multiple outputs can automatically be instantiated +# as Structs by enabling the experimental setting `POLARS_AUTO_STRUCTIFY`: +if (requireNamespace("withr", quietly = TRUE)) { + withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), { + lf$drop("c")$with_columns_seq( + diffs = pl$col("a", "b")$diff()$name$suffix("_diff"), + )$collect() + }) +} +} diff --git a/man/lazyframe__with_context.Rd b/man/lazyframe__with_context.Rd new file mode 100644 index 00000000..4a47d8b4 --- /dev/null +++ b/man/lazyframe__with_context.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__with_context} +\alias{lazyframe__with_context} +\title{Add an external context to the computation graph} +\usage{ +lazyframe__with_context(other) +} +\arguments{ +\item{other}{Data/LazyFrame to have access to. 
This can be a list of DataFrames +and LazyFrames.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +This allows expressions to also access columns from DataFrames or LazyFrames +that are not part of this one. +} +\examples{ +lf <- pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA)) +lf_other <- pl$LazyFrame(c = c("foo", "ham")) + +lf$with_context(lf_other)$select( + pl$col("b") + pl$col("c")$first() +)$collect() + +# Fill nulls with the median from another lazyframe: +train_lf <- pl$LazyFrame( + feature_0 = c(-1.0, 0, 1), feature_1 = c(-1.0, 0, 1) +) +test_lf <- pl$LazyFrame( + feature_0 = c(-1.0, NA, 1), feature_1 = c(-1.0, 0, 1) +) + +test_lf$with_context(train_lf$select(pl$all()$name$suffix("_train")))$select( + pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median()) +)$collect() +} diff --git a/man/lazyframe__with_row_index.Rd b/man/lazyframe__with_row_index.Rd new file mode 100644 index 00000000..2a6fc206 --- /dev/null +++ b/man/lazyframe__with_row_index.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{lazyframe__with_row_index} +\alias{lazyframe__with_row_index} +\title{Add a row index as the first column in the LazyFrame} +\usage{ +lazyframe__with_row_index(name = "index", offset = 0) +} +\arguments{ +\item{name}{Name of the index column.} + +\item{offset}{Start the index at this offset. Cannot be negative.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Using this function can have a negative effect on query performance. This +may, for instance, block predicate pushdown optimization. 
+} +\examples{ +lf <- pl$LazyFrame(x = c(1, 3, 5), y = c(2, 4, 6)) +lf$with_row_index()$collect() + +lf$with_row_index("id", offset = 1000)$collect() + +# An index column can also be created using the expressions int_range() +# and len(). +lf$with_columns( + index = pl$int_range(pl$len(), dtype = pl$UInt32) +)$collect() +} diff --git a/man/pl.Rd b/man/pl.Rd index 327dee16..33e6b48d 100644 --- a/man/pl.Rd +++ b/man/pl.Rd @@ -5,7 +5,7 @@ \alias{pl} \title{Polars top-level function namespace} \format{ -An object of class \code{polars_object} of length 74. +An object of class \code{polars_object} of length 75. } \usage{ pl diff --git a/man/pl__deserialize_lf.Rd b/man/pl__deserialize_lf.Rd new file mode 100644 index 00000000..0d71fb6f --- /dev/null +++ b/man/pl__deserialize_lf.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe-frame.R +\name{pl__deserialize_lf} +\alias{pl__deserialize_lf} +\title{Read a logical plan from a file to construct a LazyFrame} +\usage{ +pl__deserialize_lf(source) +} +\arguments{ +\item{source}{String containing the LazyFrame logical plan in JSON format.} +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Read a logical plan from a file to construct a LazyFrame +} +\examples{ +lf <- pl$LazyFrame(a = 1:3)$sum() +ser <- lf$serialize() +pl$deserialize_lf(ser) +} diff --git a/src/init.c b/src/init.c index 21774650..d05a21b1 100644 --- a/src/init.c +++ b/src/init.c @@ -244,6 +244,11 @@ SEXP savvy_when__impl(SEXP c_arg__condition) { return handle_result(res); } +SEXP savvy_deserialize_lf__impl(SEXP c_arg__json) { + SEXP res = savvy_deserialize_lf__ffi(c_arg__json); + return handle_result(res); +} + SEXP savvy_PlRChainedThen_when__impl(SEXP self__, SEXP c_arg__condition) { SEXP res = savvy_PlRChainedThen_when__ffi(self__, c_arg__condition); return handle_result(res); @@ -2294,6 +2299,216 @@ SEXP savvy_PlRLazyFrame_with_columns__impl(SEXP self__, SEXP c_arg__exprs) { return handle_result(res); } 
+SEXP savvy_PlRLazyFrame_to_dot__impl(SEXP self__, SEXP c_arg__optimized) { + SEXP res = savvy_PlRLazyFrame_to_dot__ffi(self__, c_arg__optimized); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_sort__impl(SEXP self__, SEXP c_arg__by_column, SEXP c_arg__descending, SEXP c_arg__nulls_last, SEXP c_arg__maintain_order, SEXP c_arg__multithreaded) { + SEXP res = savvy_PlRLazyFrame_sort__ffi(self__, c_arg__by_column, c_arg__descending, c_arg__nulls_last, c_arg__maintain_order, c_arg__multithreaded); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_top_k__impl(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse) { + SEXP res = savvy_PlRLazyFrame_top_k__ffi(self__, c_arg__k, c_arg__by, c_arg__reverse); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_bottom_k__impl(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse) { + SEXP res = savvy_PlRLazyFrame_bottom_k__ffi(self__, c_arg__k, c_arg__by, c_arg__reverse); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_cache__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_cache__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_profile__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_profile__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_sink_parquet__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__retries, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size, SEXP c_arg__storage_options) { + SEXP res = savvy_PlRLazyFrame_sink_parquet__ffi(self__, c_arg__path, c_arg__compression, c_arg__maintain_order, c_arg__statistics, c_arg__retries, c_arg__compression_level, c_arg__row_group_size, c_arg__data_page_size, c_arg__storage_options); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_sink_ipc__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__retries, SEXP c_arg__compression, 
SEXP c_arg__storage_options) { + SEXP res = savvy_PlRLazyFrame_sink_ipc__ffi(self__, c_arg__path, c_arg__maintain_order, c_arg__retries, c_arg__compression, c_arg__storage_options); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_sink_csv__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__include_bom, SEXP c_arg__include_header, SEXP c_arg__separator, SEXP c_arg__line_terminator, SEXP c_arg__quote_char, SEXP c_arg__maintain_order, SEXP c_arg__batch_size, SEXP c_arg__retries, SEXP c_arg__datetime_format, SEXP c_arg__date_format, SEXP c_arg__time_format, SEXP c_arg__float_scientific, SEXP c_arg__float_precision, SEXP c_arg__null_value, SEXP c_arg__quote_style, SEXP c_arg__storage_options) { + SEXP res = savvy_PlRLazyFrame_sink_csv__ffi(self__, c_arg__path, c_arg__include_bom, c_arg__include_header, c_arg__separator, c_arg__line_terminator, c_arg__quote_char, c_arg__maintain_order, c_arg__batch_size, c_arg__retries, c_arg__datetime_format, c_arg__date_format, c_arg__time_format, c_arg__float_scientific, c_arg__float_precision, c_arg__null_value, c_arg__quote_style, c_arg__storage_options); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_sink_json__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__retries, SEXP c_arg__storage_options) { + SEXP res = savvy_PlRLazyFrame_sink_json__ffi(self__, c_arg__path, c_arg__maintain_order, c_arg__retries, c_arg__storage_options); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_serialize__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_serialize__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_select_seq__impl(SEXP self__, SEXP c_arg__exprs) { + SEXP res = savvy_PlRLazyFrame_select_seq__ffi(self__, c_arg__exprs); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_rolling__impl(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by) { + SEXP res = 
savvy_PlRLazyFrame_rolling__ffi(self__, c_arg__index_column, c_arg__period, c_arg__offset, c_arg__closed, c_arg__by); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_group_by_dynamic__impl(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__every, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__label, SEXP c_arg__include_boundaries, SEXP c_arg__closed, SEXP c_arg__group_by, SEXP c_arg__start_by) { + SEXP res = savvy_PlRLazyFrame_group_by_dynamic__ffi(self__, c_arg__index_column, c_arg__every, c_arg__period, c_arg__offset, c_arg__label, c_arg__include_boundaries, c_arg__closed, c_arg__group_by, c_arg__start_by); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_with_context__impl(SEXP self__, SEXP c_arg__contexts) { + SEXP res = savvy_PlRLazyFrame_with_context__ffi(self__, c_arg__contexts); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_join_asof__impl(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__suffix, SEXP c_arg__coalesce, SEXP c_arg__strategy, SEXP c_arg__left_by, SEXP c_arg__right_by, SEXP c_arg__tolerance, SEXP c_arg__tolerance_str) { + SEXP res = savvy_PlRLazyFrame_join_asof__ffi(self__, c_arg__other, c_arg__left_on, c_arg__right_on, c_arg__allow_parallel, c_arg__force_parallel, c_arg__suffix, c_arg__coalesce, c_arg__strategy, c_arg__left_by, c_arg__right_by, c_arg__tolerance, c_arg__tolerance_str); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_join__impl(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__join_nulls, SEXP c_arg__how, SEXP c_arg__suffix, SEXP c_arg__validate, SEXP c_arg__coalesce) { + SEXP res = savvy_PlRLazyFrame_join__ffi(self__, c_arg__other, c_arg__left_on, c_arg__right_on, c_arg__allow_parallel, c_arg__force_parallel, c_arg__join_nulls, c_arg__how, c_arg__suffix, c_arg__validate, c_arg__coalesce); + 
return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_join_where__impl(SEXP self__, SEXP c_arg__other, SEXP c_arg__predicates, SEXP c_arg__suffix) { + SEXP res = savvy_PlRLazyFrame_join_where__ffi(self__, c_arg__other, c_arg__predicates, c_arg__suffix); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_with_columns_seq__impl(SEXP self__, SEXP c_arg__exprs) { + SEXP res = savvy_PlRLazyFrame_with_columns_seq__ffi(self__, c_arg__exprs); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_rename__impl(SEXP self__, SEXP c_arg__existing, SEXP c_arg__new, SEXP c_arg__strict) { + SEXP res = savvy_PlRLazyFrame_rename__ffi(self__, c_arg__existing, c_arg__new, c_arg__strict); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_reverse__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_reverse__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_shift__impl(SEXP self__, SEXP c_arg__n, SEXP c_arg__fill_value) { + SEXP res = savvy_PlRLazyFrame_shift__ffi(self__, c_arg__n, c_arg__fill_value); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_fill_nan__impl(SEXP self__, SEXP c_arg__fill_value) { + SEXP res = savvy_PlRLazyFrame_fill_nan__ffi(self__, c_arg__fill_value); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_fill_null__impl(SEXP self__, SEXP c_arg__fill_value) { + SEXP res = savvy_PlRLazyFrame_fill_null__ffi(self__, c_arg__fill_value); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_min__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_min__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_max__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_max__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_sum__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_sum__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_mean__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_mean__ffi(self__); + return handle_result(res); +} + +SEXP 
savvy_PlRLazyFrame_std__impl(SEXP self__, SEXP c_arg__ddof) { + SEXP res = savvy_PlRLazyFrame_std__ffi(self__, c_arg__ddof); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_var__impl(SEXP self__, SEXP c_arg__ddof) { + SEXP res = savvy_PlRLazyFrame_var__ffi(self__, c_arg__ddof); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_median__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_median__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_quantile__impl(SEXP self__, SEXP c_arg__quantile, SEXP c_arg__interpolation) { + SEXP res = savvy_PlRLazyFrame_quantile__ffi(self__, c_arg__quantile, c_arg__interpolation); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_explode__impl(SEXP self__, SEXP c_arg__column) { + SEXP res = savvy_PlRLazyFrame_explode__ffi(self__, c_arg__column); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_null_count__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_null_count__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_unique__impl(SEXP self__, SEXP c_arg__maintain_order, SEXP c_arg__keep, SEXP c_arg__subset) { + SEXP res = savvy_PlRLazyFrame_unique__ffi(self__, c_arg__maintain_order, c_arg__keep, c_arg__subset); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_drop_nulls__impl(SEXP self__, SEXP c_arg__subset) { + SEXP res = savvy_PlRLazyFrame_drop_nulls__ffi(self__, c_arg__subset); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_unpivot__impl(SEXP self__, SEXP c_arg__on, SEXP c_arg__index, SEXP c_arg__value_name, SEXP c_arg__variable_name) { + SEXP res = savvy_PlRLazyFrame_unpivot__ffi(self__, c_arg__on, c_arg__index, c_arg__value_name, c_arg__variable_name); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_with_row_index__impl(SEXP self__, SEXP c_arg__name, SEXP c_arg__offset) { + SEXP res = savvy_PlRLazyFrame_with_row_index__ffi(self__, c_arg__name, c_arg__offset); + return handle_result(res); +} + +SEXP 
savvy_PlRLazyFrame_clone__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_clone__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_unnest__impl(SEXP self__, SEXP c_arg__columns) { + SEXP res = savvy_PlRLazyFrame_unnest__ffi(self__, c_arg__columns); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_count__impl(SEXP self__) { + SEXP res = savvy_PlRLazyFrame_count__ffi(self__); + return handle_result(res); +} + +SEXP savvy_PlRLazyFrame_merge_sorted__impl(SEXP self__, SEXP c_arg__other, SEXP c_arg__key) { + SEXP res = savvy_PlRLazyFrame_merge_sorted__ffi(self__, c_arg__other, c_arg__key); + return handle_result(res); +} + SEXP savvy_PlRLazyFrame_new_from_ipc__impl(SEXP c_arg__source, SEXP c_arg__cache, SEXP c_arg__rechunk, SEXP c_arg__try_parse_hive_dates, SEXP c_arg__retries, SEXP c_arg__row_index_offset, SEXP c_arg__n_rows, SEXP c_arg__row_index_name, SEXP c_arg__storage_options, SEXP c_arg__hive_partitioning, SEXP c_arg__hive_schema, SEXP c_arg__file_cache_ttl, SEXP c_arg__include_file_paths) { SEXP res = savvy_PlRLazyFrame_new_from_ipc__ffi(c_arg__source, c_arg__cache, c_arg__rechunk, c_arg__try_parse_hive_dates, c_arg__retries, c_arg__row_index_offset, c_arg__n_rows, c_arg__row_index_name, c_arg__storage_options, c_arg__hive_partitioning, c_arg__hive_schema, c_arg__file_cache_ttl, c_arg__include_file_paths); return handle_result(res); @@ -2558,6 +2773,7 @@ static const R_CallMethodDef CallEntries[] = { {"savvy_time_range__impl", (DL_FUNC) &savvy_time_range__impl, 4}, {"savvy_time_ranges__impl", (DL_FUNC) &savvy_time_ranges__impl, 4}, {"savvy_when__impl", (DL_FUNC) &savvy_when__impl, 1}, + {"savvy_deserialize_lf__impl", (DL_FUNC) &savvy_deserialize_lf__impl, 1}, {"savvy_PlRChainedThen_when__impl", (DL_FUNC) &savvy_PlRChainedThen_when__impl, 2}, {"savvy_PlRChainedThen_otherwise__impl", (DL_FUNC) &savvy_PlRChainedThen_otherwise__impl, 2}, {"savvy_PlRChainedWhen_then__impl", (DL_FUNC) &savvy_PlRChainedWhen_then__impl, 2}, @@ -2968,6 
+3184,48 @@ static const R_CallMethodDef CallEntries[] = { {"savvy_PlRLazyFrame_collect_schema__impl", (DL_FUNC) &savvy_PlRLazyFrame_collect_schema__impl, 1}, {"savvy_PlRLazyFrame_sort_by_exprs__impl", (DL_FUNC) &savvy_PlRLazyFrame_sort_by_exprs__impl, 6}, {"savvy_PlRLazyFrame_with_columns__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_columns__impl, 2}, + {"savvy_PlRLazyFrame_to_dot__impl", (DL_FUNC) &savvy_PlRLazyFrame_to_dot__impl, 2}, + {"savvy_PlRLazyFrame_sort__impl", (DL_FUNC) &savvy_PlRLazyFrame_sort__impl, 6}, + {"savvy_PlRLazyFrame_top_k__impl", (DL_FUNC) &savvy_PlRLazyFrame_top_k__impl, 4}, + {"savvy_PlRLazyFrame_bottom_k__impl", (DL_FUNC) &savvy_PlRLazyFrame_bottom_k__impl, 4}, + {"savvy_PlRLazyFrame_cache__impl", (DL_FUNC) &savvy_PlRLazyFrame_cache__impl, 1}, + {"savvy_PlRLazyFrame_profile__impl", (DL_FUNC) &savvy_PlRLazyFrame_profile__impl, 1}, + {"savvy_PlRLazyFrame_sink_parquet__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_parquet__impl, 10}, + {"savvy_PlRLazyFrame_sink_ipc__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_ipc__impl, 6}, + {"savvy_PlRLazyFrame_sink_csv__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_csv__impl, 18}, + {"savvy_PlRLazyFrame_sink_json__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_json__impl, 5}, + {"savvy_PlRLazyFrame_serialize__impl", (DL_FUNC) &savvy_PlRLazyFrame_serialize__impl, 1}, + {"savvy_PlRLazyFrame_select_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_select_seq__impl, 2}, + {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6}, + {"savvy_PlRLazyFrame_group_by_dynamic__impl", (DL_FUNC) &savvy_PlRLazyFrame_group_by_dynamic__impl, 10}, + {"savvy_PlRLazyFrame_with_context__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_context__impl, 2}, + {"savvy_PlRLazyFrame_join_asof__impl", (DL_FUNC) &savvy_PlRLazyFrame_join_asof__impl, 13}, + {"savvy_PlRLazyFrame_join__impl", (DL_FUNC) &savvy_PlRLazyFrame_join__impl, 11}, + {"savvy_PlRLazyFrame_join_where__impl", (DL_FUNC) &savvy_PlRLazyFrame_join_where__impl, 4}, + 
{"savvy_PlRLazyFrame_with_columns_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_columns_seq__impl, 2}, + {"savvy_PlRLazyFrame_rename__impl", (DL_FUNC) &savvy_PlRLazyFrame_rename__impl, 4}, + {"savvy_PlRLazyFrame_reverse__impl", (DL_FUNC) &savvy_PlRLazyFrame_reverse__impl, 1}, + {"savvy_PlRLazyFrame_shift__impl", (DL_FUNC) &savvy_PlRLazyFrame_shift__impl, 3}, + {"savvy_PlRLazyFrame_fill_nan__impl", (DL_FUNC) &savvy_PlRLazyFrame_fill_nan__impl, 2}, + {"savvy_PlRLazyFrame_fill_null__impl", (DL_FUNC) &savvy_PlRLazyFrame_fill_null__impl, 2}, + {"savvy_PlRLazyFrame_min__impl", (DL_FUNC) &savvy_PlRLazyFrame_min__impl, 1}, + {"savvy_PlRLazyFrame_max__impl", (DL_FUNC) &savvy_PlRLazyFrame_max__impl, 1}, + {"savvy_PlRLazyFrame_sum__impl", (DL_FUNC) &savvy_PlRLazyFrame_sum__impl, 1}, + {"savvy_PlRLazyFrame_mean__impl", (DL_FUNC) &savvy_PlRLazyFrame_mean__impl, 1}, + {"savvy_PlRLazyFrame_std__impl", (DL_FUNC) &savvy_PlRLazyFrame_std__impl, 2}, + {"savvy_PlRLazyFrame_var__impl", (DL_FUNC) &savvy_PlRLazyFrame_var__impl, 2}, + {"savvy_PlRLazyFrame_median__impl", (DL_FUNC) &savvy_PlRLazyFrame_median__impl, 1}, + {"savvy_PlRLazyFrame_quantile__impl", (DL_FUNC) &savvy_PlRLazyFrame_quantile__impl, 3}, + {"savvy_PlRLazyFrame_explode__impl", (DL_FUNC) &savvy_PlRLazyFrame_explode__impl, 2}, + {"savvy_PlRLazyFrame_null_count__impl", (DL_FUNC) &savvy_PlRLazyFrame_null_count__impl, 1}, + {"savvy_PlRLazyFrame_unique__impl", (DL_FUNC) &savvy_PlRLazyFrame_unique__impl, 4}, + {"savvy_PlRLazyFrame_drop_nulls__impl", (DL_FUNC) &savvy_PlRLazyFrame_drop_nulls__impl, 2}, + {"savvy_PlRLazyFrame_unpivot__impl", (DL_FUNC) &savvy_PlRLazyFrame_unpivot__impl, 5}, + {"savvy_PlRLazyFrame_with_row_index__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_row_index__impl, 3}, + {"savvy_PlRLazyFrame_clone__impl", (DL_FUNC) &savvy_PlRLazyFrame_clone__impl, 1}, + {"savvy_PlRLazyFrame_unnest__impl", (DL_FUNC) &savvy_PlRLazyFrame_unnest__impl, 2}, + {"savvy_PlRLazyFrame_count__impl", (DL_FUNC) 
&savvy_PlRLazyFrame_count__impl, 1}, + {"savvy_PlRLazyFrame_merge_sorted__impl", (DL_FUNC) &savvy_PlRLazyFrame_merge_sorted__impl, 3}, {"savvy_PlRLazyFrame_new_from_ipc__impl", (DL_FUNC) &savvy_PlRLazyFrame_new_from_ipc__impl, 13}, {"savvy_PlRLazyFrame_new_from_csv__impl", (DL_FUNC) &savvy_PlRLazyFrame_new_from_csv__impl, 30}, {"savvy_PlRLazyFrame_new_from_parquet__impl", (DL_FUNC) &savvy_PlRLazyFrame_new_from_parquet__impl, 18}, diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index e3a85e8f..5e10feef 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -31,6 +31,7 @@ features = [ "array_any_all", "array_count", "array_to_struct", + "asof_join", "binary_encoding", "business", "cloud", @@ -41,6 +42,8 @@ features = [ "cutqcut", "diagonal_concat", "diff", + "dynamic_group_by", + "dot_diagram", "dot_product", "dtype-full", "dynamic_group_by", @@ -51,6 +54,7 @@ features = [ "find_many", "fused", "hist", + "iejoin", "interpolate", "interpolate_by", "ipc", @@ -71,6 +75,7 @@ features = [ "list_sets", "list_to_struct", "log", + "merge_sorted", "meta", "mode", "moment", @@ -80,6 +85,7 @@ features = [ "parquet", "pct_change", "peaks", + "pivot", "product", "propagate_nans", "random", @@ -94,6 +100,7 @@ features = [ "round_series", "row_hash", "search_sorted", + "semi_anti_join", "serde", "serde-lazy", "sign", diff --git a/src/rust/api.h b/src/rust/api.h index d39fb62b..f51c8bfe 100644 --- a/src/rust/api.h +++ b/src/rust/api.h @@ -40,6 +40,7 @@ SEXP savvy_datetime_ranges__ffi(SEXP c_arg__start, SEXP c_arg__end, SEXP c_arg__ SEXP savvy_time_range__ffi(SEXP c_arg__start, SEXP c_arg__end, SEXP c_arg__every, SEXP c_arg__closed); SEXP savvy_time_ranges__ffi(SEXP c_arg__start, SEXP c_arg__end, SEXP c_arg__every, SEXP c_arg__closed); SEXP savvy_when__ffi(SEXP c_arg__condition); +SEXP savvy_deserialize_lf__ffi(SEXP c_arg__json); // methods and associated functions for PlRChainedThen SEXP savvy_PlRChainedThen_when__ffi(SEXP self__, SEXP c_arg__condition); @@ -462,6 
+463,48 @@ SEXP savvy_PlRLazyFrame_cast_all__ffi(SEXP self__, SEXP c_arg__dtype, SEXP c_arg SEXP savvy_PlRLazyFrame_collect_schema__ffi(SEXP self__); SEXP savvy_PlRLazyFrame_sort_by_exprs__ffi(SEXP self__, SEXP c_arg__by, SEXP c_arg__descending, SEXP c_arg__nulls_last, SEXP c_arg__maintain_order, SEXP c_arg__multithreaded); SEXP savvy_PlRLazyFrame_with_columns__ffi(SEXP self__, SEXP c_arg__exprs); +SEXP savvy_PlRLazyFrame_to_dot__ffi(SEXP self__, SEXP c_arg__optimized); +SEXP savvy_PlRLazyFrame_sort__ffi(SEXP self__, SEXP c_arg__by_column, SEXP c_arg__descending, SEXP c_arg__nulls_last, SEXP c_arg__maintain_order, SEXP c_arg__multithreaded); +SEXP savvy_PlRLazyFrame_top_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse); +SEXP savvy_PlRLazyFrame_bottom_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse); +SEXP savvy_PlRLazyFrame_cache__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_profile__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_sink_parquet__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__retries, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size, SEXP c_arg__storage_options); +SEXP savvy_PlRLazyFrame_sink_ipc__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__retries, SEXP c_arg__compression, SEXP c_arg__storage_options); +SEXP savvy_PlRLazyFrame_sink_csv__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__include_bom, SEXP c_arg__include_header, SEXP c_arg__separator, SEXP c_arg__line_terminator, SEXP c_arg__quote_char, SEXP c_arg__maintain_order, SEXP c_arg__batch_size, SEXP c_arg__retries, SEXP c_arg__datetime_format, SEXP c_arg__date_format, SEXP c_arg__time_format, SEXP c_arg__float_scientific, SEXP c_arg__float_precision, SEXP c_arg__null_value, SEXP c_arg__quote_style, SEXP c_arg__storage_options); +SEXP savvy_PlRLazyFrame_sink_json__ffi(SEXP self__, SEXP c_arg__path, SEXP 
c_arg__maintain_order, SEXP c_arg__retries, SEXP c_arg__storage_options); +SEXP savvy_PlRLazyFrame_serialize__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs); +SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by); +SEXP savvy_PlRLazyFrame_group_by_dynamic__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__every, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__label, SEXP c_arg__include_boundaries, SEXP c_arg__closed, SEXP c_arg__group_by, SEXP c_arg__start_by); +SEXP savvy_PlRLazyFrame_with_context__ffi(SEXP self__, SEXP c_arg__contexts); +SEXP savvy_PlRLazyFrame_join_asof__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__suffix, SEXP c_arg__coalesce, SEXP c_arg__strategy, SEXP c_arg__left_by, SEXP c_arg__right_by, SEXP c_arg__tolerance, SEXP c_arg__tolerance_str); +SEXP savvy_PlRLazyFrame_join__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__join_nulls, SEXP c_arg__how, SEXP c_arg__suffix, SEXP c_arg__validate, SEXP c_arg__coalesce); +SEXP savvy_PlRLazyFrame_join_where__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__predicates, SEXP c_arg__suffix); +SEXP savvy_PlRLazyFrame_with_columns_seq__ffi(SEXP self__, SEXP c_arg__exprs); +SEXP savvy_PlRLazyFrame_rename__ffi(SEXP self__, SEXP c_arg__existing, SEXP c_arg__new, SEXP c_arg__strict); +SEXP savvy_PlRLazyFrame_reverse__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_shift__ffi(SEXP self__, SEXP c_arg__n, SEXP c_arg__fill_value); +SEXP savvy_PlRLazyFrame_fill_nan__ffi(SEXP self__, SEXP c_arg__fill_value); +SEXP savvy_PlRLazyFrame_fill_null__ffi(SEXP self__, SEXP c_arg__fill_value); +SEXP savvy_PlRLazyFrame_min__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_max__ffi(SEXP self__); 
+SEXP savvy_PlRLazyFrame_sum__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_mean__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_std__ffi(SEXP self__, SEXP c_arg__ddof); +SEXP savvy_PlRLazyFrame_var__ffi(SEXP self__, SEXP c_arg__ddof); +SEXP savvy_PlRLazyFrame_median__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_quantile__ffi(SEXP self__, SEXP c_arg__quantile, SEXP c_arg__interpolation); +SEXP savvy_PlRLazyFrame_explode__ffi(SEXP self__, SEXP c_arg__column); +SEXP savvy_PlRLazyFrame_null_count__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_unique__ffi(SEXP self__, SEXP c_arg__maintain_order, SEXP c_arg__keep, SEXP c_arg__subset); +SEXP savvy_PlRLazyFrame_drop_nulls__ffi(SEXP self__, SEXP c_arg__subset); +SEXP savvy_PlRLazyFrame_unpivot__ffi(SEXP self__, SEXP c_arg__on, SEXP c_arg__index, SEXP c_arg__value_name, SEXP c_arg__variable_name); +SEXP savvy_PlRLazyFrame_with_row_index__ffi(SEXP self__, SEXP c_arg__name, SEXP c_arg__offset); +SEXP savvy_PlRLazyFrame_clone__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_unnest__ffi(SEXP self__, SEXP c_arg__columns); +SEXP savvy_PlRLazyFrame_count__ffi(SEXP self__); +SEXP savvy_PlRLazyFrame_merge_sorted__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__key); SEXP savvy_PlRLazyFrame_new_from_ipc__ffi(SEXP c_arg__source, SEXP c_arg__cache, SEXP c_arg__rechunk, SEXP c_arg__try_parse_hive_dates, SEXP c_arg__retries, SEXP c_arg__row_index_offset, SEXP c_arg__n_rows, SEXP c_arg__row_index_name, SEXP c_arg__storage_options, SEXP c_arg__hive_partitioning, SEXP c_arg__hive_schema, SEXP c_arg__file_cache_ttl, SEXP c_arg__include_file_paths); SEXP savvy_PlRLazyFrame_new_from_csv__ffi(SEXP c_arg__source, SEXP c_arg__separator, SEXP c_arg__has_header, SEXP c_arg__ignore_errors, SEXP c_arg__skip_rows, SEXP c_arg__cache, SEXP c_arg__missing_utf8_is_empty_string, SEXP c_arg__low_memory, SEXP c_arg__rechunk, SEXP c_arg__skip_rows_after_header, SEXP c_arg__encoding, SEXP c_arg__try_parse_dates, SEXP c_arg__eol_char, SEXP c_arg__raise_if_empty, SEXP 
c_arg__truncate_ragged_lines, SEXP c_arg__decimal_comma, SEXP c_arg__glob, SEXP c_arg__retries, SEXP c_arg__row_index_offset, SEXP c_arg__comment_prefix, SEXP c_arg__quote_char, SEXP c_arg__null_values, SEXP c_arg__infer_schema_length, SEXP c_arg__row_index_name, SEXP c_arg__n_rows, SEXP c_arg__overwrite_dtype, SEXP c_arg__schema, SEXP c_arg__storage_options, SEXP c_arg__file_cache_ttl, SEXP c_arg__include_file_paths); SEXP savvy_PlRLazyFrame_new_from_parquet__ffi(SEXP c_arg__source, SEXP c_arg__cache, SEXP c_arg__parallel, SEXP c_arg__rechunk, SEXP c_arg__low_memory, SEXP c_arg__use_statistics, SEXP c_arg__try_parse_hive_dates, SEXP c_arg__retries, SEXP c_arg__glob, SEXP c_arg__allow_missing_columns, SEXP c_arg__row_index_offset, SEXP c_arg__storage_options, SEXP c_arg__n_rows, SEXP c_arg__row_index_name, SEXP c_arg__hive_partitioning, SEXP c_arg__schema, SEXP c_arg__hive_schema, SEXP c_arg__include_file_paths); diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs index 743a2f38..dce3a5d1 100644 --- a/src/rust/src/conversion/mod.rs +++ b/src/rust/src/conversion/mod.rs @@ -4,10 +4,10 @@ use crate::prelude::*; use crate::{PlRDataFrame, PlRDataType, PlRExpr, PlRLazyFrame, PlRSeries, RPolarsErr}; use polars::prelude::cloud::CloudOptions; use polars::series::ops::NullBehavior; -use savvy::{ListSexp, NumericScalar, NumericSexp, NumericTypedSexp, StringSexp, TypedSexp}; +use savvy::{ListSexp, NumericScalar, NumericSexp, NumericTypedSexp, Sexp, StringSexp, TypedSexp}; use search_sorted::SearchSortedSide; pub mod base_date; -mod chunked_array; +pub mod chunked_array; pub mod clock; pub mod data_table; @@ -33,6 +33,25 @@ impl From for Wrap { } } +impl TryFrom for Wrap> { + type Error = String; + fn try_from(obj: Sexp) -> Result { + let typed = obj.into_typed(); + let out = match typed { + TypedSexp::Integer(x) => AnyValue::Int64(*(x.to_vec().first().unwrap()) as i64), + TypedSexp::Real(x) => AnyValue::Float64(*(x.to_vec().first().unwrap())), + 
TypedSexp::Logical(x) => AnyValue::Boolean(*(x.to_vec().first().unwrap())), + TypedSexp::String(x) => { + let val = x.to_vec(); + AnyValue::StringOwned((*val.first().unwrap()).into()) + } + TypedSexp::Null(_) => AnyValue::Null, + _ => return Err("Cannot cast to AnyValue".to_string()), + }; + Ok(Wrap(out)) + } +} + impl TryFrom<&str> for PlRDataType { type Error = String; @@ -85,6 +104,20 @@ impl From for Wrap>>> { } } +impl TryFrom<&str> for Wrap { + type Error = String; + + fn try_from(string: &str) -> Result { + let mut utf8_byte_iter = string.as_bytes().iter(); + match (utf8_byte_iter.next(), utf8_byte_iter.next()) { + (Some(s), None) => Ok(Wrap(*s)), + (None, None) => Err(format!("cannot extract single byte from empty string")), + (Some(_), Some(_)) => Err(format!("multi byte-string not allowed")), + (None, Some(_)) => unreachable!("the iter() cannot yield Some after None(depleted)"), + } + } +} + impl TryFrom for Wrap> { type Error = savvy::Error; @@ -601,6 +634,196 @@ impl TryFrom<&str> for Wrap { } } +impl TryFrom<&str> for Wrap { + type Error = String; + + fn try_from(strategy: &str) -> Result { + let parsed = match strategy { + "first" => UniqueKeepStrategy::First, + "last" => UniqueKeepStrategy::Last, + "none" => UniqueKeepStrategy::None, + "any" => UniqueKeepStrategy::Any, + _ => return Err("unreachable".to_string()), + }; + Ok(Wrap(parsed)) + } +} + +impl TryFrom<&str> for Wrap { + type Error = String; + + fn try_from(how: &str) -> Result { + let parsed = match how { + "cross" => JoinType::Cross, + "inner" => JoinType::Inner, + "left" => JoinType::Left, + "right" => JoinType::Right, + "full" => JoinType::Full, + "semi" => JoinType::Semi, + "anti" => JoinType::Anti, + _ => return Err("unreachable".to_string()), + }; + Ok(Wrap(parsed)) + } +} + +impl TryFrom<&str> for Wrap { + type Error = String; + + fn try_from(validation: &str) -> Result { + let parsed = match validation { + "m:m" => JoinValidation::ManyToMany, + "1:m" => JoinValidation::OneToMany, + 
"1:1" => JoinValidation::OneToOne, + "m:1" => JoinValidation::ManyToOne, + _ => return Err("unreachable".to_string()), + }; + Ok(Wrap(parsed)) + } +} + +impl TryFrom<&str> for Wrap