diff --git a/NEWS.md b/NEWS.md index ad8693cc..7cccf667 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,10 @@ * `update_snapshot()` has been optimized and now runs faster on all the supported backends (#137). +* `*_joins()` have been more robust: + * `dbplyr` is now used internally which improves `full_join` and adds `anti_join` and `semi_join` (#157). + * When not supplying a `na_by` argument no input validation is made and unmodified `dplyr::*_join()` is called (#156). + ## Documentation * A vignette including benchmarks of `update_snapshot()` across various backends is added (#138). diff --git a/R/db_joins.R b/R/db_joins.R index 0d207903..700a256a 100644 --- a/R/db_joins.R +++ b/R/db_joins.R @@ -1,138 +1,22 @@ -#' Generate sql_on statement for na joins -#' -#' @description -#' This function generates a much faster SQL statement for NA join compared to dbplyr's _join with na_matches = "na". -#' @inheritParams left_join -#' @return -#' A sql_on statement to join by such that "NA" are matched with "NA" given the columns listed in "by" and "na_by". -#' @noRd -join_na_sql <- function(x, by, na_by) { - UseMethod("join_na_sql") -} - -join_na_not_distinct <- function(by, na_by = NULL) { - sql_on <- "" - if (!missing(by)) { - for (i in seq_along(by)) { - sql_on <- paste0(sql_on, '"LHS"."', by[i], '" = "RHS"."', by[i], '"') - if (i < length(by) || !is.null(na_by)) { - sql_on <- paste(sql_on, "\nAND ") - } - } - } - - if (!missing(na_by)) { - for (i in seq_along(na_by)) { - sql_on <- paste0(sql_on, '"LHS"."', na_by[i], '" IS NOT DISTINCT FROM "RHS"."', na_by[i], '"') - if (i < length(na_by)) { - sql_on <- paste(sql_on, "\nAND ") - } - } - } - - return(sql_on) -} - -join_na_not_null <- function(by, na_by = NULL) { - sql_on <- "" - if (!missing(by)) { - for (i in seq_along(by)) { - sql_on <- paste0(sql_on, '"LHS"."', by[i], '" = "RHS"."', by[i], '"') - if (i < length(by) || !is.null(na_by)) { - sql_on <- paste(sql_on, "\nAND ") - } - } - } - - if (!missing(na_by)) { - for (i in seq_along(na_by)) { - sql_on <- paste0(sql_on, - '("LHS"."', na_by[i], '" IS NULL AND "RHS"."', na_by[i], '" IS NULL ', - 'OR "LHS"."', na_by[i], '" = "RHS"."', na_by[i], '")') - if (i < length(na_by)) { - sql_on <- paste(sql_on, "\nAND ") - } - } - } - - return(sql_on) -} - -#' @noRd -join_na_sql.tbl_dbi <- function(x, by, na_by) { - return(join_na_not_distinct(by = by, na_by = na_by)) -} - -#' @noRd -`join_na_sql.tbl_Microsoft SQL Server` <- function(x, by, na_by) { - return(join_na_not_null(by = by, na_by = na_by)) -} - -#' Get colnames to select -#' -#' @inheritParams left_join -#' @param left (`logical(1)`)\cr -#' Is the join a left (alternatively right) join? -#' @return -#' A named character vector indicating which columns to select from x and y. -#' @noRd -select_na_sql <- function(x, y, by, na_by, left = TRUE) { - - all_by <- c(by, na_by) # Variables to be common after join - cx <- dplyr::setdiff(colnames(x), colnames(y)) # Variables only in x - cy <- dplyr::setdiff(colnames(y), colnames(x)) # Variables only in y - - sql_select <- - c(paste0(colnames(x), ifelse(colnames(x) %in% cx, "", ".x")), - paste0(colnames(y), ifelse(colnames(y) %in% cy, "", ".y"))[!colnames(y) %in% all_by]) |> - stats::setNames(c(colnames(x), - paste0(colnames(y), ifelse(colnames(y) %in% colnames(x), ".y", ""))[!colnames(y) %in% all_by])) - - return(sql_select) -} - - -#' Warn users that SQL does not match on NA by default -#' -#' @return -#' A warning that *_joins on SQL backends does not match NA by default. -#' @noRd -join_warn <- function() { - if (interactive() && identical(parent.frame(n = 2), globalenv())) { - rlang::warn(paste("*_joins in database-backend does not match NA by default.\n", - "If your data contains NA, the columns with NA values must be supplied to \"na_by\",", - "or you must specifiy na_matches = \"na\""), - .frequency = "once", .frequency_id = "*_join NA warning") - } -} - - -#' Warn users that SQL joins by NA is experimental -#' -#' @return -#' A warning that *_joins are still experimental. -#' @noRd -join_warn_experimental <- function() { - if (interactive() && identical(parent.frame(n = 2), globalenv())) { - rlang::warn("*_joins with na_by is still experimental. Please report issues.", - .frequency = "once", .frequency_id = "*_join NA warning") - } -} - - #' SQL Joins #' #' @name joins #' #' @description +#' `r lifecycle::badge("experimental")` +#' #' Overloads the dplyr `*_join` to accept an `na_by` argument. #' By default, joining using SQL does not match on `NA` / `NULL`. -#' dbplyr `*_join`s has the option "na_matches = na" to match on `NA` / `NULL` but this is very inefficient in some -#' cases. -#' This function does the matching more efficiently: +#' dbplyr `*_join`s has the option "na_matches = na" to match on `NA` / `NULL` but this operation is substantially +#' slower since it turns all equality comparisons to identical comparisons. +#' +#' This function does the matching more efficiently by allowing the user to specify which column contains +#' `NA` / `NULL` values and which does not: #' If a column contains `NA` / `NULL`, the names of these columns can be passed via the `na_by` argument and -#' efficiently match as if "na_matches = na". -#' If no `na_by` argument is given is given, the function defaults to using `dplyr::*_join`. +#' efficiently match as if `na_matches = "na"`. +#' Columns without `NA` / `NULL` values is passed via the `by` argument and will be matched `na_matches = "never"`. +#' +#' If no `na_by` argument is given, the function defaults to using `dplyr::*_join` without modification. #' #' @inheritParams dbplyr::join.tbl_sql #' @return Another \code{tbl_lazy}. Use \code{\link[dplyr:show_query]{show_query()}} to see the generated @@ -159,179 +43,345 @@ join_warn_experimental <- function() { #' # But you can activate R's usual behaviour with the na_matches argument #' left_join(db, label, by = "x", na_matches = "na") #' -#' # By default, joins are equijoins, but you can use `sql_on` to +#' # By default, joins are equijoins, but you can use `dplyr::join_by()` to #' # express richer relationships -#' db1 <- memdb_frame(x = 1:5) -#' db2 <- memdb_frame(x = 1:3, y = letters[1:3]) +#' db1 <- memdb_frame(id = 1:5) +#' db2 <- memdb_frame(id = 1:3, y = letters[1:3]) #' #' left_join(db1, db2) |> show_query() -#' left_join(db1, db2, sql_on = "LHS.x < RHS.x") |> show_query() +#' left_join(db1, db2, by = join_by(x$id < y$id)) |> show_query() #' @seealso [dplyr::mutate-joins] which this function wraps. #' @seealso [dbplyr::join.tbl_sql] which this function wraps. #' @seealso [dplyr::show_query] #' @exportS3Method dplyr::inner_join inner_join.tbl_sql <- function(x, y, by = NULL, ...) { - - # Check arguments - assert_data_like(x) - assert_data_like(y) - checkmate::assert_character(by, null.ok = TRUE) - .dots <- list(...) if (!"na_by" %in% names(.dots)) { - if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn() + join_warn() return(NextMethod("inner_join")) } - join_warn_experimental() - - args <- as.list(rlang::current_env()) |> - append(.dots) - - .renamer <- select_na_sql(x, y, by, .dots$na_by) - - # Remove na_by from args to avoid infinite loops - args$na_by <- NULL - args$sql_on <- join_na_sql(x, by, .dots$na_by) - - join_result <- do.call(dplyr::inner_join, args = args) |> - dplyr::rename(!!.renamer) |> - dplyr::select(tidyselect::all_of(names(.renamer))) + # Prepare the combined join + out <- do.call(dplyr::inner_join, args = join_args(x, y, by, .dots)) + out$lazy_query$vars <- join_na_select_fix(out$lazy_query$vars, .dots$na_by) - return(join_result) + return(out) } #' @rdname joins #' @exportS3Method dplyr::left_join left_join.tbl_sql <- function(x, y, by = NULL, ...) { - - # Check arguments - assert_data_like(x) - assert_data_like(y) - checkmate::assert_character(by, null.ok = TRUE) - .dots <- list(...) if (!"na_by" %in% names(.dots)) { - if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn() - + join_warn() return(NextMethod("left_join")) } - join_warn_experimental() + out <- do.call(dplyr::left_join, args = join_args(x, y, by, .dots)) + out$lazy_query$vars <- join_na_select_fix(out$lazy_query$vars, .dots$na_by) - args <- as.list(rlang::current_env()) |> - append(.dots) + return(out) +} - .renamer <- select_na_sql(x, y, by, .dots$na_by) +#' @rdname joins +#' @exportS3Method dplyr::right_join +right_join.tbl_sql <- function(x, y, by = NULL, ...) { + .dots <- list(...) - # Remove na_by from args to avoid infinite loops - args$na_by <- NULL - args$sql_on <- join_na_sql(x, by, .dots$na_by) + if (!"na_by" %in% names(.dots)) { + join_warn() + return(NextMethod("right_join")) + } - join_result <- do.call(dplyr::left_join, args = args) |> - dplyr::rename(!!.renamer) |> - dplyr::select(tidyselect::all_of(names(.renamer))) + out <- do.call(dplyr::right_join, args = join_args(x, y, by, .dots)) + out$lazy_query$vars <- join_na_select_fix(out$lazy_query$vars, .dots$na_by, right = TRUE) - return(join_result) + return(out) } + #' @rdname joins -#' @exportS3Method dplyr::right_join -right_join.tbl_sql <- function(x, y, by = NULL, ...) { +#' @exportS3Method dplyr::full_join +full_join.tbl_sql <- function(x, y, by = NULL, ...) { + .dots <- list(...) + + if (!"na_by" %in% names(.dots)) { + join_warn() + return(NextMethod("full_join")) + } + + out <- do.call(dplyr::full_join, args = join_args(x, y, by, .dots)) + out$lazy_query$vars <- join_na_select_fix(out$lazy_query$vars, .dots$na_by) + + return(out) +} - # Check arguments - assert_data_like(x) - assert_data_like(y) - checkmate::assert_character(by, null.ok = TRUE) +#' @rdname joins +#' @exportS3Method dplyr::semi_join +semi_join.tbl_sql <- function(x, y, by = NULL, ...) { .dots <- list(...) if (!"na_by" %in% names(.dots)) { - if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn() + join_warn() + return(NextMethod("semi_join")) + } - return(NextMethod("right_join")) + out <- do.call(dplyr::semi_join, args = join_args(x, y, by, .dots)) + + return(out) +} + + +#' @rdname joins +#' @exportS3Method dplyr::anti_join +anti_join.tbl_sql <- function(x, y, by = NULL, ...) { + .dots <- list(...) + + if (!"na_by" %in% names(.dots)) { + join_warn() + return(NextMethod("anti_join")) } - join_warn_experimental() + out <- do.call(dplyr::anti_join, args = join_args(x, y, by, .dots)) - args <- as.list(rlang::current_env()) |> - append(.dots) + return(out) +} - .renamer <- select_na_sql(x, y, by, .dots$na_by) - # Remove na_by from args to avoid infinite loops - args$na_by <- NULL - args$sql_on <- join_na_sql(x, by, .dots$na_by) +#' Warn users that SQL does not match on NA by default +#' +#' @return +#' A warning that *_joins on SQL backends does not match NA by default. +#' @noRd +join_warn <- function() { + if (interactive() && identical(parent.frame(n = 2), globalenv())) { + rlang::warn( + paste( + "*_joins in database-backend does not match NA by default.\n", + "If your data contains NA, the columns with NA values must be supplied to \"na_by\",", + "or you must specify na_matches = \"na\"" + ), + .frequency = "once", + .frequency_id = "*_join NA warning" + ) + } +} - join_result <- do.call(dplyr::right_join, args = args) |> - dplyr::rename(!!.renamer) |> - dplyr::select(tidyselect::all_of(names(.renamer))) - return(join_result) +#' Warn users that SQL joins by NA is experimental +#' +#' @return +#' A warning that *_joins are still experimental. +#' @noRd +join_warn_experimental <- function() { + if (interactive() && identical(parent.frame(n = 2), globalenv())) { + rlang::warn( + "*_joins with na_by is still experimental. Please report issues.", + .frequency = "once", + .frequency_id = "*_join NA warning" + ) + } } -#' @rdname joins -#' @exportS3Method dplyr::full_join -full_join.tbl_sql <- function(x, y, by = NULL, ...) { +#' Construct the arguments to `*_join` that accounts for the na matching +#' @param x (`tbl_sql`) \cr +#' The left table to join. +#' @param y (`tbl_sql`) \cr +#' The right table to join. +#' @param by (`dbplyr_join_by` or `character`) \cr +#' The columns to match on without NA values. +#' @param .dots (`list`) \cr +#' Arguments passed to the `*_join` function. +#' @noRd +join_args <- function(x, y, by, .dots) { - # Check arguments - assert_data_like(x) - assert_data_like(y) - checkmate::assert_character(by, null.ok = TRUE) + # Remove the na matching args, and let join_na_sql combine the `by` and `na_by` statements + by <- join_na_sql(x, y, by, .dots) + args <- append(list(x = x, y = y, by = by), purrr::discard_at(.dots, c("na_by", "na_matches"))) - .dots <- list(...) + return(args) +} - if ("na_by" %in% names(.dots)) { - join_warn_experimental() - # Full joins are hard... - out <- dplyr::union(dplyr::left_join(x, y, by = by, na_by = .dots$na_by), - dplyr::right_join(x, y, by = by, na_by = .dots$na_by)) - return(out) - } else { - if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn() - return(dplyr::full_join(x, y, by = by, ...)) + +#' Merge two `dplyr_join_by` objects +#' @param by (`dplyr_join_by` or `character`) \cr +#' The columns to match on without NA values. +#' @param na_by (`dplyr_join_by` or `character`) \cr +#' The columns to match on NA. +#' @noRd +join_merger <- function(by, na_by) { + + # Early return if only one by statement is given + if (is.null(by) && is.null(na_by)) { + stop("Both by and na_by cannot be NULL") + } else if (is.null(by)) { + return(na_by) + } else if (is.null(na_by)) { + return(by) } + + # Combine the by and na_by statements by unclassing, merging and reclassing + combined_join <- list( + "exprs" = c(purrr::pluck(by, "exprs"), purrr::pluck(na_by, "exprs")) + ) |> + utils::modifyList( + purrr::map2(purrr::discard_at(by, "exprs"), purrr::discard_at(na_by, "exprs"), ~ c(.x, .y)) + ) + class(combined_join) <- "dplyr_join_by" + + return(combined_join) } -#' @rdname joins -#' @exportS3Method dplyr::semi_join -semi_join.tbl_sql <- function(x, y, by = NULL, ...) { +#' Generate `dplyr_join_by` statement for na joins +#' +#' @description +#' This function creates a `dplyr_join_by` object to join by where the statements supplied in `by` are treated as not +#' having NA values while the columns listed in `na_by` are treated as having NA values. +#' This latter translation corresponds to using `dplyr::*_join` with `na_matches = "na"`. +#' @inheritParams left_join +#' @param na_by (`character`)\cr +#' The columns to match on NA. If a column contains NA, the names of these columns can be passed via the `na_by` +#' argument. These will then be matched as if with the `na_matches = "na"` argument. +#' @return +#' A `dplyr_join_by` object to join by such that "NA" are matched with "NA" given the columns listed in `by` and +#' `na_by`. +#' @noRd +join_na_sql <- function(x, y, by = NULL, .dots = NULL) { + + # Early return if no na_by statement is given + if (is.null(.dots$na_by)) { + return(by) + } else { + na_by <- .dots$na_by + } # Check arguments - assert_data_like(x) - assert_data_like(y) - checkmate::assert_character(by, null.ok = TRUE) + checkmate::assert( + checkmate::check_character(by, null.ok = TRUE), + checkmate::check_class(by, "dplyr_join_by", null.ok = TRUE) + ) + checkmate::assert( + checkmate::check_character(na_by, null.ok = TRUE), + checkmate::check_class(na_by, "dplyr_join_by", null.ok = TRUE) + ) - .dots <- list(...) + join_warn_experimental() - if ("na_by" %in% names(.dots)) { - stop("Not implemented") - } else { - if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn() - return(dplyr::semi_join(x, y, by = by, ...)) + # Convert to dplyr_join_by if not already + if (!is.null(by) && !inherits(by, "dplyr_join_by")) { + by <- dplyr::join_by(!!!by) } + + if (!is.null(na_by) && !inherits(na_by, "dplyr_join_by")) { + na_by <- dplyr::join_by(!!!na_by) + } + + # Get the translation for matching the na_by component of the join + subquery_args <- purrr::discard_at(.dots, "na_by") |> + utils::modifyList( + list( + x = x, + y = y, + by = join_merger(by, na_by), + na_matches = "na" + ) + ) + na_subquery <- dbplyr::remote_query(do.call(dplyr::inner_join, args = subquery_args)) + + # Determine the NA matching statement by extracting from the translated query. + # E.g. on RSQlite, the keyword "IS" checks if arguments are identical + # and on PostgreSQL, the keyword "IS NOT DISTINCT FROM" checks if arguments are identical. + na_matching <- na_subquery |> + stringr::str_remove_all(stringr::fixed("\n")) |> # Remove newlines from the formatted query + stringr::str_replace_all(r"{\s{2,}}", " ") |> # Remove multiple spaces from the formatted query + stringr::str_extract(r"{(?<=ON \().*(?=\))}") # Extract the contents of the ON statement + + print("ON subquery") + print(na_matching) + + na_matching <- na_matching |> + stringr::str_extract(pattern = r"{(?:["'`]\s)([\w\s]+)(?:\s["'`])}", group = 1) # First non quoted word(s) + + # Replace NA equals with NA matching statement + na_by$condition[na_by$condition == "=="] <- na_matching + + return(join_merger(by, na_by)) } -#' @rdname joins -#' @exportS3Method dplyr::anti_join -anti_join.tbl_sql <- function(x, y, by = NULL, ...) { +#' Manually fixes the select component of the `lazy_query` after overwriting the `by` statement. +#' +#' @description +#' After overwriting the `by` statement in the `lazy_query`, the `vars` component of the `lazy_query` is not +#' consistent with the new non-overwritten `by` statement. +#' As a result, columns which are matched in the join are included as both `.x` and ``, instead of just +#' as ``. +#' This function fixes the `vars` component of the `lazy_query` to remove the doubly selected columns and rename +#' to the expected name. +#' @param vars (`tibble`)\cr +#' The `vars` component of the `lazy_query`. +#' @param na_by (`dplyr_join_by`)\cr +#' The `na_by` statement used in the join. +#' @param right (`logical`)\cr +#' If the join is a right join. +#' @return +#' A `tibble` with the `vars` component of the `lazy_query` fixed to remove doubly selected columns. +#' @noRd +join_na_select_fix <- function(vars, na_by, right = FALSE) { + if (is.null(na_by)) return(vars) - # Check arguments - assert_data_like(x) - assert_data_like(y) - checkmate::assert_character(by, null.ok = TRUE) + if (!inherits(na_by, "dplyr_join_by")) na_by <- dplyr::join_by(!!!na_by) - .dots <- list(...) + # All equality joins in `na_by` are incorrectly translated + doubly_selected_columns <- na_by |> + purrr::discard_at("exprs") |> + tibble::as_tibble() |> + dplyr::filter(.data$condition == "==", .data$x == .data$y) |> + dplyr::pull("x") - if ("na_by" %in% names(.dots)) { - stop("Not implemented") + if (length(doubly_selected_columns) == 0) { + updated_vars <- vars # no doubly selected columns } else { - if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn() - return(dplyr::anti_join(x, y, by = by, ...)) + + # The vars table structure is not consistent between dplyr join types + # There are two formats which we needs to manage independently. + if (checkmate::test_names(names(vars), identical.to = c("name", "x", "y"))) { + updated_vars <- rbind( + tibble::tibble( + "name" = doubly_selected_columns, + "x" = ifelse(right, NA, doubly_selected_columns), + "y" = doubly_selected_columns + ), + dplyr::filter(vars, .data$x %in% !!doubly_selected_columns | .data$y %in% !!doubly_selected_columns) + ) |> + dplyr::symdiff(vars) + + # Reorder our updated columns to match the original order + updated_vars <- updated_vars[ + order(match(updated_vars$name, unique(purrr::pmap_chr(vars, ~ dplyr::coalesce(..2, ..3))))), + ] + + } else if (checkmate::test_names(names(vars), identical.to = c("name", "table", "var"))) { + updated_vars <- rbind( + tibble::tibble( + "name" = doubly_selected_columns, + "table" = 1, + "var" = doubly_selected_columns + ), + dplyr::filter(vars, .data$var %in% !!doubly_selected_columns) + ) |> + dplyr::symdiff(vars) + + # Reorder our updated columns to match the original order + updated_vars <- updated_vars[order(match(updated_vars$name, unique(vars$var))), ] + + } } + + return(updated_vars) } diff --git a/man/joins.Rd b/man/joins.Rd index dd54252b..5af76132 100644 --- a/man/joins.Rd +++ b/man/joins.Rd @@ -62,14 +62,20 @@ query, and use \code{\link[dbplyr:collect.tbl_sql]{collect()}} to execute the qu and return data to R. } \description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + Overloads the dplyr \verb{*_join} to accept an \code{na_by} argument. By default, joining using SQL does not match on \code{NA} / \code{NULL}. -dbplyr \verb{*_join}s has the option "na_matches = na" to match on \code{NA} / \code{NULL} but this is very inefficient in some -cases. -This function does the matching more efficiently: +dbplyr \verb{*_join}s has the option "na_matches = na" to match on \code{NA} / \code{NULL} but this operation is substantially +slower since it turns all equality comparisons to identical comparisons. + +This function does the matching more efficiently by allowing the user to specify which column contains +\code{NA} / \code{NULL} values and which does not: If a column contains \code{NA} / \code{NULL}, the names of these columns can be passed via the \code{na_by} argument and -efficiently match as if "na_matches = na". -If no \code{na_by} argument is given is given, the function defaults to using \verb{dplyr::*_join}. +efficiently match as if \code{na_matches = "na"}. +Columns without \code{NA} / \code{NULL} values is passed via the \code{by} argument and will be matched \code{na_matches = "never"}. + +If no \code{na_by} argument is given, the function defaults to using \verb{dplyr::*_join} without modification. } \examples{ \dontshow{if (requireNamespace("RSQLite", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} @@ -93,13 +99,13 @@ If no \code{na_by} argument is given is given, the function defaults to using \v # But you can activate R's usual behaviour with the na_matches argument left_join(db, label, by = "x", na_matches = "na") - # By default, joins are equijoins, but you can use `sql_on` to + # By default, joins are equijoins, but you can use `dplyr::join_by()` to # express richer relationships - db1 <- memdb_frame(x = 1:5) - db2 <- memdb_frame(x = 1:3, y = letters[1:3]) + db1 <- memdb_frame(id = 1:5) + db2 <- memdb_frame(id = 1:3, y = letters[1:3]) left_join(db1, db2) |> show_query() - left_join(db1, db2, sql_on = "LHS.x < RHS.x") |> show_query() + left_join(db1, db2, by = join_by(x$id < y$id)) |> show_query() \dontshow{\}) # examplesIf} } \seealso{ diff --git a/pak.lock b/pak.lock index 96a02e9c..6b153271 100644 --- a/pak.lock +++ b/pak.lock @@ -7,7 +7,7 @@ { "ref": "askpass", "package": "askpass", - "version": "1.2.0", + "version": "1.2.1", "type": "standard", "direct": false, "binary": true, @@ -19,10 +19,10 @@ "RemoteRef": "askpass", "RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest", "RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04", - "RemoteSha": "1.2.0" + "RemoteSha": "1.2.1" }, - "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/askpass_1.2.0.tar.gz", - "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/askpass_1.2.0.tar.gz", + "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/askpass_1.2.1.tar.gz", + "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/askpass_1.2.1.tar.gz", "platform": "x86_64-pc-linux-gnu-ubuntu-22.04", "rversion": "4.4", "directpkg": false, @@ -534,7 +534,7 @@ { "ref": "commonmark", "package": "commonmark", - "version": "1.9.1", + "version": "1.9.2", "type": "standard", "direct": false, "binary": true, @@ -546,10 +546,10 @@ "RemoteRef": "commonmark", "RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest", "RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04", - "RemoteSha": "1.9.1" + "RemoteSha": "1.9.2" }, - "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/commonmark_1.9.1.tar.gz", - "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/commonmark_1.9.1.tar.gz", + "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/commonmark_1.9.2.tar.gz", + "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/commonmark_1.9.2.tar.gz", "platform": "x86_64-pc-linux-gnu-ubuntu-22.04", "rversion": "4.4", "directpkg": false, @@ -702,7 +702,7 @@ { "ref": "data.table", "package": "data.table", - "version": "1.16.0", + "version": "1.16.2", "type": "standard", "direct": false, "binary": true, @@ -714,10 +714,10 @@ "RemoteRef": "data.table", "RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest", "RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04", - "RemoteSha": "1.16.0" + "RemoteSha": "1.16.2" }, - "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/data.table_1.16.0.tar.gz", - "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/data.table_1.16.0.tar.gz", + "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/data.table_1.16.2.tar.gz", + "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/data.table_1.16.2.tar.gz", "platform": "x86_64-pc-linux-gnu-ubuntu-22.04", "rversion": "4.4", "directpkg": false, @@ -1012,7 +1012,7 @@ { "ref": "evaluate", "package": "evaluate", - "version": "1.0.0", + "version": "1.0.1", "type": "standard", "direct": false, "binary": true, @@ -1024,10 +1024,10 @@ "RemoteRef": "evaluate", "RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest", "RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04", - "RemoteSha": "1.0.0" + "RemoteSha": "1.0.1" }, - "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/evaluate_1.0.0.tar.gz", - "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/evaluate_1.0.0.tar.gz", + "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/evaluate_1.0.1.tar.gz", + "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/evaluate_1.0.1.tar.gz", "platform": "x86_64-pc-linux-gnu-ubuntu-22.04", "rversion": "4.4", "directpkg": false, @@ -1683,7 +1683,7 @@ { "ref": "hunspell", "package": "hunspell", - "version": "3.0.4", + "version": "3.0.5", "type": "standard", "direct": false, "binary": true, @@ -1695,10 +1695,10 @@ "RemoteRef": "hunspell", "RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest", "RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04", - "RemoteSha": "3.0.4" + "RemoteSha": "3.0.5" }, - "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/hunspell_3.0.4.tar.gz", - "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/hunspell_3.0.4.tar.gz", + "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/hunspell_3.0.5.tar.gz", + "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/hunspell_3.0.5.tar.gz", "platform": "x86_64-pc-linux-gnu-ubuntu-22.04", "rversion": "4.4", "directpkg": false, @@ -3619,7 +3619,7 @@ { "ref": "spelling", "package": "spelling", - "version": "2.3.0", + "version": "2.3.1", "type": "standard", "direct": false, "binary": true, @@ -3631,10 +3631,10 @@ "RemoteRef": "spelling", "RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest", "RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04", - "RemoteSha": "2.3.0" + "RemoteSha": "2.3.1" }, - "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/spelling_2.3.0.tar.gz", - "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/spelling_2.3.0.tar.gz", + "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/spelling_2.3.1.tar.gz", + "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/spelling_2.3.1.tar.gz", "platform": "x86_64-pc-linux-gnu-ubuntu-22.04", "rversion": "4.4", "directpkg": false, @@ -3719,7 +3719,7 @@ { "ref": "sys", "package": "sys", - "version": "3.4.2", + "version": "3.4.3", "type": "standard", "direct": false, "binary": true, @@ -3731,10 +3731,10 @@ "RemoteRef": "sys", "RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest", "RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04", - "RemoteSha": "3.4.2" + "RemoteSha": "3.4.3" }, - "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/sys_3.4.2.tar.gz", - "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/sys_3.4.2.tar.gz", + "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/sys_3.4.3.tar.gz", + "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/sys_3.4.3.tar.gz", "platform": "x86_64-pc-linux-gnu-ubuntu-22.04", "rversion": "4.4", "directpkg": false, @@ -4340,7 +4340,7 @@ { "ref": "xfun", "package": "xfun", - "version": "0.47", + "version": "0.48", "type": "standard", "direct": false, "binary": true, @@ -4352,10 +4352,10 @@ "RemoteRef": "xfun", "RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest", "RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04", - "RemoteSha": "0.47" + "RemoteSha": "0.48" }, - "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/xfun_0.47.tar.gz", - "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/xfun_0.47.tar.gz", + "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/xfun_0.48.tar.gz", + "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/xfun_0.48.tar.gz", "platform": "x86_64-pc-linux-gnu-ubuntu-22.04", "rversion": "4.4", "directpkg": false, diff --git a/tests/testthat/test-db_joins.R b/tests/testthat/test-db_joins.R index 320c7c25..ed4abe77 100644 --- a/tests/testthat/test-db_joins.R +++ b/tests/testthat/test-db_joins.R @@ -1,28 +1,6 @@ -test_that("*_join() works", { +test_that("*_join() works with character `by` and `na_by`", { for (conn in get_test_conns()) { - # Define two test datasets - x <- get_table(conn, "__mtcars") |> - dplyr::select(name, mpg, cyl, hp, vs, am, gear, carb) - - y <- get_table(conn, "__mtcars") |> - dplyr::select(name, drat, wt, qsec) - - - # Test the implemented joins - q <- dplyr::left_join(x, y, by = "name") |> dplyr::collect() - qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y), by = "name") - expect_equal(q, qr) - - q <- dplyr::right_join(x, y, by = "name") |> dplyr::collect() - qr <- dplyr::right_join(dplyr::collect(x), dplyr::collect(y), by = "name") - expect_equal(q, qr) - - q <- dplyr::inner_join(x, y, by = "name") |> dplyr::collect() - qr <- dplyr::inner_join(dplyr::collect(x), dplyr::collect(y), by = "name") - expect_equal(q, qr) - - # Create two more synthetic test data set with NA data # First test case @@ -43,7 +21,7 @@ test_that("*_join() works", { dplyr::arrange(number, t, letter) qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y), by = "number", multiple = "all") |> dplyr::arrange(number, t, letter) - expect_mapequal(q, qr) + expect_equal(q, qr) q <- dplyr::right_join(x, y, na_by = "number") |> dplyr::collect() |> @@ -115,3 +93,131 @@ test_that("*_join() works", { connection_clean_up(conn) } }) + + +test_that("*_join() does not break any dplyr joins when no `na_by` argument is given", { + for (conn in get_test_conns()) { + + # Define two test datasets + x <- get_table(conn, "__mtcars") |> + dplyr::select(name, mpg, cyl, hp, vs, am, gear, carb) + + y <- get_table(conn, "__mtcars") |> + dplyr::select(name, drat, wt, qsec) + + # Test the standard joins + # left_join + qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::left_join(x, y, by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::left_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # right_join + qr <- dplyr::right_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::right_join(x, y, by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::right_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # inner_join + qr <- dplyr::inner_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::inner_join(x, y, by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::inner_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # full_join + qr <- dplyr::full_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::full_join(x, y, by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::full_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # semi_join + qr <- dplyr::semi_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::semi_join(x, y, by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::semi_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # anti_join + qr <- dplyr::anti_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::anti_join(x, y, by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::anti_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + connection_clean_up(conn) + } +}) + + +test_that("*_join() with only `na_by` works identically as if `by` was given instead when no data is NA", { + for (conn in get_test_conns()) { + + # Define two test datasets + x <- get_table(conn, "__mtcars") |> + dplyr::select(name, mpg, cyl, hp, vs, am, gear, carb) + + y <- get_table(conn, "__mtcars") |> + dplyr::select(name, drat, wt, qsec) + + # Test the standard joins + # left_join + qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::left_join(x, y, na_by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::left_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # right_join + qr <- dplyr::right_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::right_join(x, y, na_by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::right_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # inner_join + qr <- dplyr::inner_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::inner_join(x, y, na_by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::inner_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # full_join + qr <- dplyr::full_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::full_join(x, y, na_by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::full_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # semi_join + qr <- dplyr::semi_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::semi_join(x, y, na_by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::semi_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + # anti_join + qr <- dplyr::anti_join(dplyr::collect(x), dplyr::collect(y), by = "name") + q <- dplyr::anti_join(x, y, na_by = "name") |> dplyr::collect() + expect_equal(q, qr) + + q <- dplyr::anti_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect() + expect_equal(q, qr) + + connection_clean_up(conn) + } +}) diff --git a/tests/testthat/test-filter_keys.R b/tests/testthat/test-filter_keys.R index ad6952cc..76b80b4a 100644 --- a/tests/testthat/test-filter_keys.R +++ b/tests/testthat/test-filter_keys.R @@ -55,7 +55,7 @@ test_that("filter_keys() works with copy = TRUE", { dplyr::collect()) # The above filter_keys with `copy = TRUE` generates a dbplyr_### table. - # We manually remove this since we expect it. If more arrise, we will get an error. + # We manually remove this since we expect it. If more arise, we will get an error. DBI::dbRemoveTable(conn, id(utils::head(get_tables(conn, "dbplyr_"), 1))) connection_clean_up(conn)