diff --git a/r/NEWS.md b/r/NEWS.md index 4ed9f28a28436..05f934dac68f3 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -21,6 +21,7 @@ * R functions that users write that use functions that Arrow supports in dataset queries now can be used in queries too. Previously, only functions that used arithmetic operators worked. For example, `time_hours <- function(mins) mins / 60` worked, but `time_hours_rounded <- function(mins) round(mins / 60)` did not; now both work. These are automatic translations rather than true user-defined functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `summarize()` supports more complex expressions, and correctly handles cases where column names are reused in expressions. +* The `na_matches` argument to the `dplyr::*_join()` functions is now supported. This argument controls whether `NA` values are considered equal when joining. (#41358) # arrow 16.0.0 diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index f6977e626276b..7087a40c4903a 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -66,12 +66,12 @@ supported_dplyr_methods <- list( compute = NULL, collapse = NULL, distinct = "`.keep_all = TRUE` not supported", - left_join = "the `copy` and `na_matches` arguments are ignored", - right_join = "the `copy` and `na_matches` arguments are ignored", - inner_join = "the `copy` and `na_matches` arguments are ignored", - full_join = "the `copy` and `na_matches` arguments are ignored", - semi_join = "the `copy` and `na_matches` arguments are ignored", - anti_join = "the `copy` and `na_matches` arguments are ignored", + left_join = "the `copy` argument is ignored", + right_join = "the `copy` argument is ignored", + inner_join = "the `copy` argument is ignored", + full_join = "the `copy` argument is ignored", + semi_join = "the `copy` argument is ignored", + anti_join = "the `copy` argument is ignored", count = NULL, tally = NULL, rename_with = NULL, diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 752d3a266b26a..62e2182ffcd52 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -484,8 +484,8 @@ ExecNode_Aggregate <- function(input, options, key_names) { .Call(`_arrow_ExecNode_Aggregate`, input, options, key_names) } -ExecNode_Join <- function(input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right) { - .Call(`_arrow_ExecNode_Join`, input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right) +ExecNode_Join <- function(input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right, na_matches) { + .Call(`_arrow_ExecNode_Join`, input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right, na_matches) } ExecNode_Union <- function(input, right_data) { diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index 2042f800142b7..fda77bca83fc2 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -36,7 +36,7 @@ #' which returns an `arrow` [Table], or `collect()`, which pulls the resulting #' Table into an R `tibble`. #' -#' * [`anti_join()`][dplyr::anti_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`anti_join()`][dplyr::anti_join()]: the `copy` argument is ignored #' * [`arrange()`][dplyr::arrange()] #' * [`collapse()`][dplyr::collapse()] #' * [`collect()`][dplyr::collect()] @@ -45,22 +45,22 @@ #' * [`distinct()`][dplyr::distinct()]: `.keep_all = TRUE` not supported #' * [`explain()`][dplyr::explain()] #' * [`filter()`][dplyr::filter()] -#' * [`full_join()`][dplyr::full_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`full_join()`][dplyr::full_join()]: the `copy` argument is ignored #' * [`glimpse()`][dplyr::glimpse()] #' * [`group_by()`][dplyr::group_by()] #' * [`group_by_drop_default()`][dplyr::group_by_drop_default()] #' * [`group_vars()`][dplyr::group_vars()] #' * [`groups()`][dplyr::groups()] -#' * [`inner_join()`][dplyr::inner_join()]: the `copy` and `na_matches` arguments are ignored -#' * [`left_join()`][dplyr::left_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`inner_join()`][dplyr::inner_join()]: the `copy` argument is ignored +#' * [`left_join()`][dplyr::left_join()]: the `copy` argument is ignored #' * [`mutate()`][dplyr::mutate()]: window functions (e.g. things that require aggregation within groups) not currently supported #' * [`pull()`][dplyr::pull()]: the `name` argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow [ChunkedArray] in a future release. Provide `as_vector = TRUE/FALSE` to control this behavior, or set `options(arrow.pull_as_vector)` globally. #' * [`relocate()`][dplyr::relocate()] #' * [`rename()`][dplyr::rename()] #' * [`rename_with()`][dplyr::rename_with()] -#' * [`right_join()`][dplyr::right_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`right_join()`][dplyr::right_join()]: the `copy` argument is ignored #' * [`select()`][dplyr::select()] -#' * [`semi_join()`][dplyr::semi_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`semi_join()`][dplyr::semi_join()]: the `copy` argument is ignored #' * [`show_query()`][dplyr::show_query()] #' * [`slice_head()`][dplyr::slice_head()]: slicing within groups not supported; Arrow datasets do not have row order, so head is non-deterministic; `prop` only supported on queries where `nrow()` is knowable without evaluating #' * [`slice_max()`][dplyr::slice_max()]: slicing within groups not supported; `with_ties = TRUE` (dplyr default) is not supported; `prop` only supported on queries where `nrow()` is knowable without evaluating diff --git a/r/R/dplyr-join.R b/r/R/dplyr-join.R index 39237f574bd28..e76e041a54277 100644 --- a/r/R/dplyr-join.R +++ b/r/R/dplyr-join.R @@ -25,14 +25,15 @@ do_join <- function(x, suffix = c(".x", ".y"), ..., keep = FALSE, - na_matches, + na_matches = c("na", "never"), join_type) { # TODO: handle `copy` arg: ignore? - # TODO: handle `na_matches` arg x <- as_adq(x) y <- as_adq(y) by <- handle_join_by(by, x, y) + na_matches <- match.arg(na_matches) + # For outer joins, we need to output the join keys on both sides so we # can coalesce them afterwards. left_output <- if (!keep && join_type == "RIGHT_OUTER") { @@ -54,7 +55,8 @@ do_join <- function(x, left_output = left_output, right_output = right_output, suffix = suffix, - keep = keep + keep = keep, + na_matches = na_matches == "na" ) collapse.arrow_dplyr_query(x) } diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 0f8a84f9b867e..fb48d790fd36e 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -148,7 +148,8 @@ ExecPlan <- R6Class("ExecPlan", left_output = .data$join$left_output, right_output = .data$join$right_output, left_suffix = .data$join$suffix[[1]], - right_suffix = .data$join$suffix[[2]] + right_suffix = .data$join$suffix[[2]], + na_matches = .data$join$na_matches ) } @@ -307,7 +308,7 @@ ExecNode <- R6Class("ExecNode", out$extras$source_schema$metadata[["r"]]$attributes <- NULL out }, - Join = function(type, right_node, by, left_output, right_output, left_suffix, right_suffix) { + Join = function(type, right_node, by, left_output, right_output, left_suffix, right_suffix, na_matches = TRUE) { self$preserve_extras( ExecNode_Join( self, @@ -318,7 +319,8 @@ ExecNode <- R6Class("ExecNode", left_output = left_output, right_output = right_output, output_suffix_for_left = left_suffix, - output_suffix_for_right = right_suffix + output_suffix_for_right = right_suffix, + na_matches = na_matches ) ) }, diff --git a/r/man/acero.Rd b/r/man/acero.Rd index 365795d9fc65c..ca51ef56334eb 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -23,7 +23,7 @@ the query on the data. To run the query, call either \code{compute()}, which returns an \code{arrow} \link{Table}, or \code{collect()}, which pulls the resulting Table into an R \code{tibble}. \itemize{ -\item \code{\link[dplyr:filter-joins]{anti_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:filter-joins]{anti_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:arrange]{arrange()}} \item \code{\link[dplyr:compute]{collapse()}} \item \code{\link[dplyr:compute]{collect()}} @@ -32,22 +32,22 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} not supported \item \code{\link[dplyr:explain]{explain()}} \item \code{\link[dplyr:filter]{filter()}} -\item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:glimpse]{glimpse()}} \item \code{\link[dplyr:group_by]{group_by()}} \item \code{\link[dplyr:group_by_drop_default]{group_by_drop_default()}} \item \code{\link[dplyr:group_data]{group_vars()}} \item \code{\link[dplyr:group_data]{groups()}} -\item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} and \code{na_matches} arguments are ignored -\item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} argument is ignored +\item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:mutate]{mutate()}}: window functions (e.g. things that require aggregation within groups) not currently supported \item \code{\link[dplyr:pull]{pull()}}: the \code{name} argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow \link{ChunkedArray} in a future release. Provide \code{as_vector = TRUE/FALSE} to control this behavior, or set \code{options(arrow.pull_as_vector)} globally. \item \code{\link[dplyr:relocate]{relocate()}} \item \code{\link[dplyr:rename]{rename()}} \item \code{\link[dplyr:rename]{rename_with()}} -\item \code{\link[dplyr:mutate-joins]{right_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:mutate-joins]{right_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:select]{select()}} -\item \code{\link[dplyr:filter-joins]{semi_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:filter-joins]{semi_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:explain]{show_query()}} \item \code{\link[dplyr:slice]{slice_head()}}: slicing within groups not supported; Arrow datasets do not have row order, so head is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating \item \code{\link[dplyr:slice]{slice_max()}}: slicing within groups not supported; \code{with_ties = TRUE} (dplyr default) is not supported; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index a4c4b614d6d75..d5aec50219e0b 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1163,8 +1163,8 @@ extern "C" SEXP _arrow_ExecNode_Aggregate(SEXP input_sexp, SEXP options_sexp, SE // compute-exec.cpp #if defined(ARROW_R_WITH_ACERO) -std::shared_ptr ExecNode_Join(const std::shared_ptr& input, acero::JoinType join_type, const std::shared_ptr& right_data, std::vector left_keys, std::vector right_keys, std::vector left_output, std::vector right_output, std::string output_suffix_for_left, std::string output_suffix_for_right); -extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP join_type_sexp, SEXP right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp, SEXP output_suffix_for_left_sexp, SEXP output_suffix_for_right_sexp){ +std::shared_ptr ExecNode_Join(const std::shared_ptr& input, acero::JoinType join_type, const std::shared_ptr& right_data, std::vector left_keys, std::vector right_keys, std::vector left_output, std::vector right_output, std::string output_suffix_for_left, std::string output_suffix_for_right, bool na_matches); +extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP join_type_sexp, SEXP right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp, SEXP output_suffix_for_left_sexp, SEXP output_suffix_for_right_sexp, SEXP na_matches_sexp){ BEGIN_CPP11 arrow::r::Input&>::type input(input_sexp); arrow::r::Input::type join_type(join_type_sexp); @@ -1175,11 +1175,12 @@ BEGIN_CPP11 arrow::r::Input>::type right_output(right_output_sexp); arrow::r::Input::type output_suffix_for_left(output_suffix_for_left_sexp); arrow::r::Input::type output_suffix_for_right(output_suffix_for_right_sexp); - return cpp11::as_sexp(ExecNode_Join(input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right)); + arrow::r::Input::type na_matches(na_matches_sexp); + return cpp11::as_sexp(ExecNode_Join(input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right, na_matches)); END_CPP11 } #else -extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP join_type_sexp, SEXP right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp, SEXP output_suffix_for_left_sexp, SEXP output_suffix_for_right_sexp){ +extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP join_type_sexp, SEXP right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp, SEXP output_suffix_for_left_sexp, SEXP output_suffix_for_right_sexp, SEXP na_matches_sexp){ Rf_error("Cannot call ExecNode_Join(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -5790,7 +5791,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 3}, - { "_arrow_ExecNode_Join", (DL_FUNC) &_arrow_ExecNode_Join, 9}, + { "_arrow_ExecNode_Join", (DL_FUNC) &_arrow_ExecNode_Join, 10}, { "_arrow_ExecNode_Union", (DL_FUNC) &_arrow_ExecNode_Union, 2}, { "_arrow_ExecNode_Fetch", (DL_FUNC) &_arrow_ExecNode_Fetch, 3}, { "_arrow_ExecNode_OrderBy", (DL_FUNC) &_arrow_ExecNode_OrderBy, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index e0b3c62c47d7f..d0c50315c299f 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -411,10 +411,17 @@ std::shared_ptr ExecNode_Join( const std::shared_ptr& right_data, std::vector left_keys, std::vector right_keys, std::vector left_output, std::vector right_output, - std::string output_suffix_for_left, std::string output_suffix_for_right) { + std::string output_suffix_for_left, std::string output_suffix_for_right, + bool na_matches) { std::vector left_refs, right_refs, left_out_refs, right_out_refs; + std::vector key_cmps; for (auto&& name : left_keys) { left_refs.emplace_back(std::move(name)); + // Populate key_cmps in this loop, one for each key + // Note that Acero supports having different values for each key, but dplyr + // only supports one value for all keys, so we're only going to support that + // for now. + key_cmps.emplace_back(na_matches ? acero::JoinKeyCmp::IS : acero::JoinKeyCmp::EQ); } for (auto&& name : right_keys) { right_refs.emplace_back(std::move(name)); @@ -434,10 +441,11 @@ std::shared_ptr ExecNode_Join( return MakeExecNodeOrStop( "hashjoin", input->plan(), {input.get(), right_data.get()}, - acero::HashJoinNodeOptions{ - join_type, std::move(left_refs), std::move(right_refs), - std::move(left_out_refs), std::move(right_out_refs), compute::literal(true), - std::move(output_suffix_for_left), std::move(output_suffix_for_right)}); + acero::HashJoinNodeOptions{join_type, std::move(left_refs), std::move(right_refs), + std::move(left_out_refs), std::move(right_out_refs), + std::move(key_cmps), compute::literal(true), + std::move(output_suffix_for_left), + std::move(output_suffix_for_right)}); } // [[acero::export]] diff --git a/r/tests/testthat/test-dplyr-join.R b/r/tests/testthat/test-dplyr-join.R index e3e1e98cfca15..9a1c8b7b80fea 100644 --- a/r/tests/testthat/test-dplyr-join.R +++ b/r/tests/testthat/test-dplyr-join.R @@ -441,3 +441,35 @@ test_that("full joins handle keep", { small_dataset_df ) }) + +left <- tibble::tibble( + x = c(1, NA, 3), +) +right <- tibble::tibble( + x = c(1, NA, 3), + y = c("a", "b", "c") +) +na_matches_na <- right +na_matches_never <- tibble::tibble( + x = c(1, NA, 3), + y = c("a", NA, "c") +) +test_that("na_matches argument to join: na (default)", { + expect_equal( + arrow_table(left) %>% + left_join(right, by = "x", na_matches = "na") %>% + arrange(x) %>% + collect(), + na_matches_na %>% arrange(x) + ) +}) + +test_that("na_matches argument to join: never", { + expect_equal( + arrow_table(left) %>% + left_join(right, by = "x", na_matches = "never") %>% + arrange(x) %>% + collect(), + na_matches_never %>% arrange(x) + ) +})