diff --git a/NEWS.md b/NEWS.md
index ad8693cc..7cccf667 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,6 +8,10 @@
* `update_snapshot()` has been optimized and now runs faster on all the supported backends (#137).
+* `*_joins()` have been more robust:
+ * `dbplyr` is now used internally which improves `full_join` and adds `anti_join` and `semi_join` (#157).
+ * When not supplying a `na_by` argument no input validation is made and unmodified `dplyr::*_join()` is called (#156).
+
## Documentation
* A vignette including benchmarks of `update_snapshot()` across various backends is added (#138).
diff --git a/R/db_joins.R b/R/db_joins.R
index 0d207903..700a256a 100644
--- a/R/db_joins.R
+++ b/R/db_joins.R
@@ -1,138 +1,22 @@
-#' Generate sql_on statement for na joins
-#'
-#' @description
-#' This function generates a much faster SQL statement for NA join compared to dbplyr's _join with na_matches = "na".
-#' @inheritParams left_join
-#' @return
-#' A sql_on statement to join by such that "NA" are matched with "NA" given the columns listed in "by" and "na_by".
-#' @noRd
-join_na_sql <- function(x, by, na_by) {
- UseMethod("join_na_sql")
-}
-
-join_na_not_distinct <- function(by, na_by = NULL) {
- sql_on <- ""
- if (!missing(by)) {
- for (i in seq_along(by)) {
- sql_on <- paste0(sql_on, '"LHS"."', by[i], '" = "RHS"."', by[i], '"')
- if (i < length(by) || !is.null(na_by)) {
- sql_on <- paste(sql_on, "\nAND ")
- }
- }
- }
-
- if (!missing(na_by)) {
- for (i in seq_along(na_by)) {
- sql_on <- paste0(sql_on, '"LHS"."', na_by[i], '" IS NOT DISTINCT FROM "RHS"."', na_by[i], '"')
- if (i < length(na_by)) {
- sql_on <- paste(sql_on, "\nAND ")
- }
- }
- }
-
- return(sql_on)
-}
-
-join_na_not_null <- function(by, na_by = NULL) {
- sql_on <- ""
- if (!missing(by)) {
- for (i in seq_along(by)) {
- sql_on <- paste0(sql_on, '"LHS"."', by[i], '" = "RHS"."', by[i], '"')
- if (i < length(by) || !is.null(na_by)) {
- sql_on <- paste(sql_on, "\nAND ")
- }
- }
- }
-
- if (!missing(na_by)) {
- for (i in seq_along(na_by)) {
- sql_on <- paste0(sql_on,
- '("LHS"."', na_by[i], '" IS NULL AND "RHS"."', na_by[i], '" IS NULL ',
- 'OR "LHS"."', na_by[i], '" = "RHS"."', na_by[i], '")')
- if (i < length(na_by)) {
- sql_on <- paste(sql_on, "\nAND ")
- }
- }
- }
-
- return(sql_on)
-}
-
-#' @noRd
-join_na_sql.tbl_dbi <- function(x, by, na_by) {
- return(join_na_not_distinct(by = by, na_by = na_by))
-}
-
-#' @noRd
-`join_na_sql.tbl_Microsoft SQL Server` <- function(x, by, na_by) {
- return(join_na_not_null(by = by, na_by = na_by))
-}
-
-#' Get colnames to select
-#'
-#' @inheritParams left_join
-#' @param left (`logical(1)`)\cr
-#' Is the join a left (alternatively right) join?
-#' @return
-#' A named character vector indicating which columns to select from x and y.
-#' @noRd
-select_na_sql <- function(x, y, by, na_by, left = TRUE) {
-
- all_by <- c(by, na_by) # Variables to be common after join
- cx <- dplyr::setdiff(colnames(x), colnames(y)) # Variables only in x
- cy <- dplyr::setdiff(colnames(y), colnames(x)) # Variables only in y
-
- sql_select <-
- c(paste0(colnames(x), ifelse(colnames(x) %in% cx, "", ".x")),
- paste0(colnames(y), ifelse(colnames(y) %in% cy, "", ".y"))[!colnames(y) %in% all_by]) |>
- stats::setNames(c(colnames(x),
- paste0(colnames(y), ifelse(colnames(y) %in% colnames(x), ".y", ""))[!colnames(y) %in% all_by]))
-
- return(sql_select)
-}
-
-
-#' Warn users that SQL does not match on NA by default
-#'
-#' @return
-#' A warning that *_joins on SQL backends does not match NA by default.
-#' @noRd
-join_warn <- function() {
- if (interactive() && identical(parent.frame(n = 2), globalenv())) {
- rlang::warn(paste("*_joins in database-backend does not match NA by default.\n",
- "If your data contains NA, the columns with NA values must be supplied to \"na_by\",",
- "or you must specifiy na_matches = \"na\""),
- .frequency = "once", .frequency_id = "*_join NA warning")
- }
-}
-
-
-#' Warn users that SQL joins by NA is experimental
-#'
-#' @return
-#' A warning that *_joins are still experimental.
-#' @noRd
-join_warn_experimental <- function() {
- if (interactive() && identical(parent.frame(n = 2), globalenv())) {
- rlang::warn("*_joins with na_by is still experimental. Please report issues.",
- .frequency = "once", .frequency_id = "*_join NA warning")
- }
-}
-
-
#' SQL Joins
#'
#' @name joins
#'
#' @description
+#' `r lifecycle::badge("experimental")`
+#'
#' Overloads the dplyr `*_join` to accept an `na_by` argument.
#' By default, joining using SQL does not match on `NA` / `NULL`.
-#' dbplyr `*_join`s has the option "na_matches = na" to match on `NA` / `NULL` but this is very inefficient in some
-#' cases.
-#' This function does the matching more efficiently:
+#' dbplyr `*_join`s has the option "na_matches = na" to match on `NA` / `NULL` but this operation is substantially
+#' slower since it turns all equality comparisons to identical comparisons.
+#'
+#' This function does the matching more efficiently by allowing the user to specify which column contains
+#' `NA` / `NULL` values and which does not:
#' If a column contains `NA` / `NULL`, the names of these columns can be passed via the `na_by` argument and
-#' efficiently match as if "na_matches = na".
-#' If no `na_by` argument is given is given, the function defaults to using `dplyr::*_join`.
+#' efficiently match as if `na_matches = "na"`.
+#' Columns without `NA` / `NULL` values is passed via the `by` argument and will be matched `na_matches = "never"`.
+#'
+#' If no `na_by` argument is given, the function defaults to using `dplyr::*_join` without modification.
#'
#' @inheritParams dbplyr::join.tbl_sql
#' @return Another \code{tbl_lazy}. Use \code{\link[dplyr:show_query]{show_query()}} to see the generated
@@ -159,179 +43,345 @@ join_warn_experimental <- function() {
#' # But you can activate R's usual behaviour with the na_matches argument
#' left_join(db, label, by = "x", na_matches = "na")
#'
-#' # By default, joins are equijoins, but you can use `sql_on` to
+#' # By default, joins are equijoins, but you can use `dplyr::join_by()` to
#' # express richer relationships
-#' db1 <- memdb_frame(x = 1:5)
-#' db2 <- memdb_frame(x = 1:3, y = letters[1:3])
+#' db1 <- memdb_frame(id = 1:5)
+#' db2 <- memdb_frame(id = 1:3, y = letters[1:3])
#'
#' left_join(db1, db2) |> show_query()
-#' left_join(db1, db2, sql_on = "LHS.x < RHS.x") |> show_query()
+#' left_join(db1, db2, by = join_by(x$id < y$id)) |> show_query()
#' @seealso [dplyr::mutate-joins] which this function wraps.
#' @seealso [dbplyr::join.tbl_sql] which this function wraps.
#' @seealso [dplyr::show_query]
#' @exportS3Method dplyr::inner_join
inner_join.tbl_sql <- function(x, y, by = NULL, ...) {
-
- # Check arguments
- assert_data_like(x)
- assert_data_like(y)
- checkmate::assert_character(by, null.ok = TRUE)
-
.dots <- list(...)
if (!"na_by" %in% names(.dots)) {
- if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn()
+ join_warn()
return(NextMethod("inner_join"))
}
- join_warn_experimental()
-
- args <- as.list(rlang::current_env()) |>
- append(.dots)
-
- .renamer <- select_na_sql(x, y, by, .dots$na_by)
-
- # Remove na_by from args to avoid infinite loops
- args$na_by <- NULL
- args$sql_on <- join_na_sql(x, by, .dots$na_by)
-
- join_result <- do.call(dplyr::inner_join, args = args) |>
- dplyr::rename(!!.renamer) |>
- dplyr::select(tidyselect::all_of(names(.renamer)))
+ # Prepare the combined join
+ out <- do.call(dplyr::inner_join, args = join_args(x, y, by, .dots))
+ out$lazy_query$vars <- join_na_select_fix(out$lazy_query$vars, .dots$na_by)
- return(join_result)
+ return(out)
}
#' @rdname joins
#' @exportS3Method dplyr::left_join
left_join.tbl_sql <- function(x, y, by = NULL, ...) {
-
- # Check arguments
- assert_data_like(x)
- assert_data_like(y)
- checkmate::assert_character(by, null.ok = TRUE)
-
.dots <- list(...)
if (!"na_by" %in% names(.dots)) {
- if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn()
-
+ join_warn()
return(NextMethod("left_join"))
}
- join_warn_experimental()
+ out <- do.call(dplyr::left_join, args = join_args(x, y, by, .dots))
+ out$lazy_query$vars <- join_na_select_fix(out$lazy_query$vars, .dots$na_by)
- args <- as.list(rlang::current_env()) |>
- append(.dots)
+ return(out)
+}
- .renamer <- select_na_sql(x, y, by, .dots$na_by)
+#' @rdname joins
+#' @exportS3Method dplyr::right_join
+right_join.tbl_sql <- function(x, y, by = NULL, ...) {
+ .dots <- list(...)
- # Remove na_by from args to avoid infinite loops
- args$na_by <- NULL
- args$sql_on <- join_na_sql(x, by, .dots$na_by)
+ if (!"na_by" %in% names(.dots)) {
+ join_warn()
+ return(NextMethod("right_join"))
+ }
- join_result <- do.call(dplyr::left_join, args = args) |>
- dplyr::rename(!!.renamer) |>
- dplyr::select(tidyselect::all_of(names(.renamer)))
+ out <- do.call(dplyr::right_join, args = join_args(x, y, by, .dots))
+ out$lazy_query$vars <- join_na_select_fix(out$lazy_query$vars, .dots$na_by, right = TRUE)
- return(join_result)
+ return(out)
}
+
#' @rdname joins
-#' @exportS3Method dplyr::right_join
-right_join.tbl_sql <- function(x, y, by = NULL, ...) {
+#' @exportS3Method dplyr::full_join
+full_join.tbl_sql <- function(x, y, by = NULL, ...) {
+ .dots <- list(...)
+
+ if (!"na_by" %in% names(.dots)) {
+ join_warn()
+ return(NextMethod("full_join"))
+ }
+
+ out <- do.call(dplyr::full_join, args = join_args(x, y, by, .dots))
+ out$lazy_query$vars <- join_na_select_fix(out$lazy_query$vars, .dots$na_by)
+
+ return(out)
+}
- # Check arguments
- assert_data_like(x)
- assert_data_like(y)
- checkmate::assert_character(by, null.ok = TRUE)
+#' @rdname joins
+#' @exportS3Method dplyr::semi_join
+semi_join.tbl_sql <- function(x, y, by = NULL, ...) {
.dots <- list(...)
if (!"na_by" %in% names(.dots)) {
- if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn()
+ join_warn()
+ return(NextMethod("semi_join"))
+ }
- return(NextMethod("right_join"))
+ out <- do.call(dplyr::semi_join, args = join_args(x, y, by, .dots))
+
+ return(out)
+}
+
+
+#' @rdname joins
+#' @exportS3Method dplyr::anti_join
+anti_join.tbl_sql <- function(x, y, by = NULL, ...) {
+ .dots <- list(...)
+
+ if (!"na_by" %in% names(.dots)) {
+ join_warn()
+ return(NextMethod("anti_join"))
}
- join_warn_experimental()
+ out <- do.call(dplyr::anti_join, args = join_args(x, y, by, .dots))
- args <- as.list(rlang::current_env()) |>
- append(.dots)
+ return(out)
+}
- .renamer <- select_na_sql(x, y, by, .dots$na_by)
- # Remove na_by from args to avoid infinite loops
- args$na_by <- NULL
- args$sql_on <- join_na_sql(x, by, .dots$na_by)
+#' Warn users that SQL does not match on NA by default
+#'
+#' @return
+#' A warning that *_joins on SQL backends does not match NA by default.
+#' @noRd
+join_warn <- function() {
+ if (interactive() && identical(parent.frame(n = 2), globalenv())) {
+ rlang::warn(
+ paste(
+ "*_joins in database-backend does not match NA by default.\n",
+ "If your data contains NA, the columns with NA values must be supplied to \"na_by\",",
+ "or you must specify na_matches = \"na\""
+ ),
+ .frequency = "once",
+ .frequency_id = "*_join NA warning"
+ )
+ }
+}
- join_result <- do.call(dplyr::right_join, args = args) |>
- dplyr::rename(!!.renamer) |>
- dplyr::select(tidyselect::all_of(names(.renamer)))
- return(join_result)
+#' Warn users that SQL joins by NA is experimental
+#'
+#' @return
+#' A warning that *_joins are still experimental.
+#' @noRd
+join_warn_experimental <- function() {
+ if (interactive() && identical(parent.frame(n = 2), globalenv())) {
+ rlang::warn(
+ "*_joins with na_by is still experimental. Please report issues.",
+ .frequency = "once",
+ .frequency_id = "*_join NA warning"
+ )
+ }
}
-#' @rdname joins
-#' @exportS3Method dplyr::full_join
-full_join.tbl_sql <- function(x, y, by = NULL, ...) {
+#' Construct the arguments to `*_join` that accounts for the na matching
+#' @param x (`tbl_sql`) \cr
+#' The left table to join.
+#' @param y (`tbl_sql`) \cr
+#' The right table to join.
+#' @param by (`dbplyr_join_by` or `character`) \cr
+#' The columns to match on without NA values.
+#' @param .dots (`list`) \cr
+#' Arguments passed to the `*_join` function.
+#' @noRd
+join_args <- function(x, y, by, .dots) {
- # Check arguments
- assert_data_like(x)
- assert_data_like(y)
- checkmate::assert_character(by, null.ok = TRUE)
+ # Remove the na matching args, and let join_na_sql combine the `by` and `na_by` statements
+ by <- join_na_sql(x, y, by, .dots)
+ args <- append(list(x = x, y = y, by = by), purrr::discard_at(.dots, c("na_by", "na_matches")))
- .dots <- list(...)
+ return(args)
+}
- if ("na_by" %in% names(.dots)) {
- join_warn_experimental()
- # Full joins are hard...
- out <- dplyr::union(dplyr::left_join(x, y, by = by, na_by = .dots$na_by),
- dplyr::right_join(x, y, by = by, na_by = .dots$na_by))
- return(out)
- } else {
- if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn()
- return(dplyr::full_join(x, y, by = by, ...))
+
+#' Merge two `dplyr_join_by` objects
+#' @param by (`dplyr_join_by` or `character`) \cr
+#' The columns to match on without NA values.
+#' @param na_by (`dplyr_join_by` or `character`) \cr
+#' The columns to match on NA.
+#' @noRd
+join_merger <- function(by, na_by) {
+
+ # Early return if only one by statement is given
+ if (is.null(by) && is.null(na_by)) {
+ stop("Both by and na_by cannot be NULL")
+ } else if (is.null(by)) {
+ return(na_by)
+ } else if (is.null(na_by)) {
+ return(by)
}
+
+ # Combine the by and na_by statements by unclassing, merging and reclassing
+ combined_join <- list(
+ "exprs" = c(purrr::pluck(by, "exprs"), purrr::pluck(na_by, "exprs"))
+ ) |>
+ utils::modifyList(
+ purrr::map2(purrr::discard_at(by, "exprs"), purrr::discard_at(na_by, "exprs"), ~ c(.x, .y))
+ )
+ class(combined_join) <- "dplyr_join_by"
+
+ return(combined_join)
}
-#' @rdname joins
-#' @exportS3Method dplyr::semi_join
-semi_join.tbl_sql <- function(x, y, by = NULL, ...) {
+#' Generate `dplyr_join_by` statement for na joins
+#'
+#' @description
+#' This function creates a `dplyr_join_by` object to join by where the statements supplied in `by` are treated as not
+#' having NA values while the columns listed in `na_by` are treated as having NA values.
+#' This latter translation corresponds to using `dplyr::*_join` with `na_matches = "na"`.
+#' @inheritParams left_join
+#' @param na_by (`character`)\cr
+#' The columns to match on NA. If a column contains NA, the names of these columns can be passed via the `na_by`
+#' argument. These will then be matched as if with the `na_matches = "na"` argument.
+#' @return
+#' A `dplyr_join_by` object to join by such that "NA" are matched with "NA" given the columns listed in `by` and
+#' `na_by`.
+#' @noRd
+join_na_sql <- function(x, y, by = NULL, .dots = NULL) {
+
+ # Early return if no na_by statement is given
+ if (is.null(.dots$na_by)) {
+ return(by)
+ } else {
+ na_by <- .dots$na_by
+ }
# Check arguments
- assert_data_like(x)
- assert_data_like(y)
- checkmate::assert_character(by, null.ok = TRUE)
+ checkmate::assert(
+ checkmate::check_character(by, null.ok = TRUE),
+ checkmate::check_class(by, "dplyr_join_by", null.ok = TRUE)
+ )
+ checkmate::assert(
+ checkmate::check_character(na_by, null.ok = TRUE),
+ checkmate::check_class(na_by, "dplyr_join_by", null.ok = TRUE)
+ )
- .dots <- list(...)
+ join_warn_experimental()
- if ("na_by" %in% names(.dots)) {
- stop("Not implemented")
- } else {
- if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn()
- return(dplyr::semi_join(x, y, by = by, ...))
+ # Convert to dplyr_join_by if not already
+ if (!is.null(by) && !inherits(by, "dplyr_join_by")) {
+ by <- dplyr::join_by(!!!by)
}
+
+ if (!is.null(na_by) && !inherits(na_by, "dplyr_join_by")) {
+ na_by <- dplyr::join_by(!!!na_by)
+ }
+
+ # Get the translation for matching the na_by component of the join
+ subquery_args <- purrr::discard_at(.dots, "na_by") |>
+ utils::modifyList(
+ list(
+ x = x,
+ y = y,
+ by = join_merger(by, na_by),
+ na_matches = "na"
+ )
+ )
+ na_subquery <- dbplyr::remote_query(do.call(dplyr::inner_join, args = subquery_args))
+
+ # Determine the NA matching statement by extracting from the translated query.
+ # E.g. on RSQlite, the keyword "IS" checks if arguments are identical
+ # and on PostgreSQL, the keyword "IS NOT DISTINCT FROM" checks if arguments are identical.
+ na_matching <- na_subquery |>
+ stringr::str_remove_all(stringr::fixed("\n")) |> # Remove newlines from the formatted query
+ stringr::str_replace_all(r"{\s{2,}}", " ") |> # Remove multiple spaces from the formatted query
+ stringr::str_extract(r"{(?<=ON \().*(?=\))}") # Extract the contents of the ON statement
+
+ print("ON subquery")
+ print(na_matching)
+
+ na_matching <- na_matching |>
+ stringr::str_extract(pattern = r"{(?:["'`]\s)([\w\s]+)(?:\s["'`])}", group = 1) # First non quoted word(s)
+
+ # Replace NA equals with NA matching statement
+ na_by$condition[na_by$condition == "=="] <- na_matching
+
+ return(join_merger(by, na_by))
}
-#' @rdname joins
-#' @exportS3Method dplyr::anti_join
-anti_join.tbl_sql <- function(x, y, by = NULL, ...) {
+#' Manually fixes the select component of the `lazy_query` after overwriting the `by` statement.
+#'
+#' @description
+#' After overwriting the `by` statement in the `lazy_query`, the `vars` component of the `lazy_query` is not
+#' consistent with the new non-overwritten `by` statement.
+#' As a result, columns which are matched in the join are included as both `
.x` and ``, instead of just
+#' as ``.
+#' This function fixes the `vars` component of the `lazy_query` to remove the doubly selected columns and rename
+#' to the expected name.
+#' @param vars (`tibble`)\cr
+#' The `vars` component of the `lazy_query`.
+#' @param na_by (`dplyr_join_by`)\cr
+#' The `na_by` statement used in the join.
+#' @param right (`logical`)\cr
+#' If the join is a right join.
+#' @return
+#' A `tibble` with the `vars` component of the `lazy_query` fixed to remove doubly selected columns.
+#' @noRd
+join_na_select_fix <- function(vars, na_by, right = FALSE) {
+ if (is.null(na_by)) return(vars)
- # Check arguments
- assert_data_like(x)
- assert_data_like(y)
- checkmate::assert_character(by, null.ok = TRUE)
+ if (!inherits(na_by, "dplyr_join_by")) na_by <- dplyr::join_by(!!!na_by)
- .dots <- list(...)
+ # All equality joins in `na_by` are incorrectly translated
+ doubly_selected_columns <- na_by |>
+ purrr::discard_at("exprs") |>
+ tibble::as_tibble() |>
+ dplyr::filter(.data$condition == "==", .data$x == .data$y) |>
+ dplyr::pull("x")
- if ("na_by" %in% names(.dots)) {
- stop("Not implemented")
+ if (length(doubly_selected_columns) == 0) {
+ updated_vars <- vars # no doubly selected columns
} else {
- if (inherits(x, "tbl_dbi") || inherits(y, "tbl_dbi")) join_warn()
- return(dplyr::anti_join(x, y, by = by, ...))
+
+ # The vars table structure is not consistent between dplyr join types
+ # There are two formats which we needs to manage independently.
+ if (checkmate::test_names(names(vars), identical.to = c("name", "x", "y"))) {
+ updated_vars <- rbind(
+ tibble::tibble(
+ "name" = doubly_selected_columns,
+ "x" = ifelse(right, NA, doubly_selected_columns),
+ "y" = doubly_selected_columns
+ ),
+ dplyr::filter(vars, .data$x %in% !!doubly_selected_columns | .data$y %in% !!doubly_selected_columns)
+ ) |>
+ dplyr::symdiff(vars)
+
+ # Reorder our updated columns to match the original order
+ updated_vars <- updated_vars[
+ order(match(updated_vars$name, unique(purrr::pmap_chr(vars, ~ dplyr::coalesce(..2, ..3))))),
+ ]
+
+ } else if (checkmate::test_names(names(vars), identical.to = c("name", "table", "var"))) {
+ updated_vars <- rbind(
+ tibble::tibble(
+ "name" = doubly_selected_columns,
+ "table" = 1,
+ "var" = doubly_selected_columns
+ ),
+ dplyr::filter(vars, .data$var %in% !!doubly_selected_columns)
+ ) |>
+ dplyr::symdiff(vars)
+
+ # Reorder our updated columns to match the original order
+ updated_vars <- updated_vars[order(match(updated_vars$name, unique(vars$var))), ]
+
+ }
}
+
+ return(updated_vars)
}
diff --git a/man/joins.Rd b/man/joins.Rd
index dd54252b..5af76132 100644
--- a/man/joins.Rd
+++ b/man/joins.Rd
@@ -62,14 +62,20 @@ query, and use \code{\link[dbplyr:collect.tbl_sql]{collect()}} to execute the qu
and return data to R.
}
\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
Overloads the dplyr \verb{*_join} to accept an \code{na_by} argument.
By default, joining using SQL does not match on \code{NA} / \code{NULL}.
-dbplyr \verb{*_join}s has the option "na_matches = na" to match on \code{NA} / \code{NULL} but this is very inefficient in some
-cases.
-This function does the matching more efficiently:
+dbplyr \verb{*_join}s has the option "na_matches = na" to match on \code{NA} / \code{NULL} but this operation is substantially
+slower since it turns all equality comparisons to identical comparisons.
+
+This function does the matching more efficiently by allowing the user to specify which column contains
+\code{NA} / \code{NULL} values and which does not:
If a column contains \code{NA} / \code{NULL}, the names of these columns can be passed via the \code{na_by} argument and
-efficiently match as if "na_matches = na".
-If no \code{na_by} argument is given is given, the function defaults to using \verb{dplyr::*_join}.
+efficiently match as if \code{na_matches = "na"}.
+Columns without \code{NA} / \code{NULL} values is passed via the \code{by} argument and will be matched \code{na_matches = "never"}.
+
+If no \code{na_by} argument is given, the function defaults to using \verb{dplyr::*_join} without modification.
}
\examples{
\dontshow{if (requireNamespace("RSQLite", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
@@ -93,13 +99,13 @@ If no \code{na_by} argument is given is given, the function defaults to using \v
# But you can activate R's usual behaviour with the na_matches argument
left_join(db, label, by = "x", na_matches = "na")
- # By default, joins are equijoins, but you can use `sql_on` to
+ # By default, joins are equijoins, but you can use `dplyr::join_by()` to
# express richer relationships
- db1 <- memdb_frame(x = 1:5)
- db2 <- memdb_frame(x = 1:3, y = letters[1:3])
+ db1 <- memdb_frame(id = 1:5)
+ db2 <- memdb_frame(id = 1:3, y = letters[1:3])
left_join(db1, db2) |> show_query()
- left_join(db1, db2, sql_on = "LHS.x < RHS.x") |> show_query()
+ left_join(db1, db2, by = join_by(x$id < y$id)) |> show_query()
\dontshow{\}) # examplesIf}
}
\seealso{
diff --git a/pak.lock b/pak.lock
index 96a02e9c..6b153271 100644
--- a/pak.lock
+++ b/pak.lock
@@ -7,7 +7,7 @@
{
"ref": "askpass",
"package": "askpass",
- "version": "1.2.0",
+ "version": "1.2.1",
"type": "standard",
"direct": false,
"binary": true,
@@ -19,10 +19,10 @@
"RemoteRef": "askpass",
"RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest",
"RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04",
- "RemoteSha": "1.2.0"
+ "RemoteSha": "1.2.1"
},
- "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/askpass_1.2.0.tar.gz",
- "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/askpass_1.2.0.tar.gz",
+ "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/askpass_1.2.1.tar.gz",
+ "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/askpass_1.2.1.tar.gz",
"platform": "x86_64-pc-linux-gnu-ubuntu-22.04",
"rversion": "4.4",
"directpkg": false,
@@ -534,7 +534,7 @@
{
"ref": "commonmark",
"package": "commonmark",
- "version": "1.9.1",
+ "version": "1.9.2",
"type": "standard",
"direct": false,
"binary": true,
@@ -546,10 +546,10 @@
"RemoteRef": "commonmark",
"RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest",
"RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04",
- "RemoteSha": "1.9.1"
+ "RemoteSha": "1.9.2"
},
- "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/commonmark_1.9.1.tar.gz",
- "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/commonmark_1.9.1.tar.gz",
+ "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/commonmark_1.9.2.tar.gz",
+ "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/commonmark_1.9.2.tar.gz",
"platform": "x86_64-pc-linux-gnu-ubuntu-22.04",
"rversion": "4.4",
"directpkg": false,
@@ -702,7 +702,7 @@
{
"ref": "data.table",
"package": "data.table",
- "version": "1.16.0",
+ "version": "1.16.2",
"type": "standard",
"direct": false,
"binary": true,
@@ -714,10 +714,10 @@
"RemoteRef": "data.table",
"RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest",
"RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04",
- "RemoteSha": "1.16.0"
+ "RemoteSha": "1.16.2"
},
- "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/data.table_1.16.0.tar.gz",
- "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/data.table_1.16.0.tar.gz",
+ "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/data.table_1.16.2.tar.gz",
+ "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/data.table_1.16.2.tar.gz",
"platform": "x86_64-pc-linux-gnu-ubuntu-22.04",
"rversion": "4.4",
"directpkg": false,
@@ -1012,7 +1012,7 @@
{
"ref": "evaluate",
"package": "evaluate",
- "version": "1.0.0",
+ "version": "1.0.1",
"type": "standard",
"direct": false,
"binary": true,
@@ -1024,10 +1024,10 @@
"RemoteRef": "evaluate",
"RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest",
"RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04",
- "RemoteSha": "1.0.0"
+ "RemoteSha": "1.0.1"
},
- "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/evaluate_1.0.0.tar.gz",
- "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/evaluate_1.0.0.tar.gz",
+ "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/evaluate_1.0.1.tar.gz",
+ "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/evaluate_1.0.1.tar.gz",
"platform": "x86_64-pc-linux-gnu-ubuntu-22.04",
"rversion": "4.4",
"directpkg": false,
@@ -1683,7 +1683,7 @@
{
"ref": "hunspell",
"package": "hunspell",
- "version": "3.0.4",
+ "version": "3.0.5",
"type": "standard",
"direct": false,
"binary": true,
@@ -1695,10 +1695,10 @@
"RemoteRef": "hunspell",
"RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest",
"RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04",
- "RemoteSha": "3.0.4"
+ "RemoteSha": "3.0.5"
},
- "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/hunspell_3.0.4.tar.gz",
- "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/hunspell_3.0.4.tar.gz",
+ "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/hunspell_3.0.5.tar.gz",
+ "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/hunspell_3.0.5.tar.gz",
"platform": "x86_64-pc-linux-gnu-ubuntu-22.04",
"rversion": "4.4",
"directpkg": false,
@@ -3619,7 +3619,7 @@
{
"ref": "spelling",
"package": "spelling",
- "version": "2.3.0",
+ "version": "2.3.1",
"type": "standard",
"direct": false,
"binary": true,
@@ -3631,10 +3631,10 @@
"RemoteRef": "spelling",
"RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest",
"RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04",
- "RemoteSha": "2.3.0"
+ "RemoteSha": "2.3.1"
},
- "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/spelling_2.3.0.tar.gz",
- "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/spelling_2.3.0.tar.gz",
+ "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/spelling_2.3.1.tar.gz",
+ "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/spelling_2.3.1.tar.gz",
"platform": "x86_64-pc-linux-gnu-ubuntu-22.04",
"rversion": "4.4",
"directpkg": false,
@@ -3719,7 +3719,7 @@
{
"ref": "sys",
"package": "sys",
- "version": "3.4.2",
+ "version": "3.4.3",
"type": "standard",
"direct": false,
"binary": true,
@@ -3731,10 +3731,10 @@
"RemoteRef": "sys",
"RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest",
"RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04",
- "RemoteSha": "3.4.2"
+ "RemoteSha": "3.4.3"
},
- "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/sys_3.4.2.tar.gz",
- "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/sys_3.4.2.tar.gz",
+ "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/sys_3.4.3.tar.gz",
+ "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/sys_3.4.3.tar.gz",
"platform": "x86_64-pc-linux-gnu-ubuntu-22.04",
"rversion": "4.4",
"directpkg": false,
@@ -4340,7 +4340,7 @@
{
"ref": "xfun",
"package": "xfun",
- "version": "0.47",
+ "version": "0.48",
"type": "standard",
"direct": false,
"binary": true,
@@ -4352,10 +4352,10 @@
"RemoteRef": "xfun",
"RemoteRepos": "https://packagemanager.posit.co/cran/__linux__/jammy/latest",
"RemotePkgPlatform": "x86_64-pc-linux-gnu-ubuntu-22.04",
- "RemoteSha": "0.47"
+ "RemoteSha": "0.48"
},
- "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/xfun_0.47.tar.gz",
- "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/xfun_0.47.tar.gz",
+ "sources": "https://packagemanager.posit.co/cran/__linux__/jammy/latest/src/contrib/xfun_0.48.tar.gz",
+ "target": "src/contrib/x86_64-pc-linux-gnu-ubuntu-22.04/4.4/xfun_0.48.tar.gz",
"platform": "x86_64-pc-linux-gnu-ubuntu-22.04",
"rversion": "4.4",
"directpkg": false,
diff --git a/tests/testthat/test-db_joins.R b/tests/testthat/test-db_joins.R
index 320c7c25..ed4abe77 100644
--- a/tests/testthat/test-db_joins.R
+++ b/tests/testthat/test-db_joins.R
@@ -1,28 +1,6 @@
-test_that("*_join() works", {
+test_that("*_join() works with character `by` and `na_by`", {
for (conn in get_test_conns()) {
- # Define two test datasets
- x <- get_table(conn, "__mtcars") |>
- dplyr::select(name, mpg, cyl, hp, vs, am, gear, carb)
-
- y <- get_table(conn, "__mtcars") |>
- dplyr::select(name, drat, wt, qsec)
-
-
- # Test the implemented joins
- q <- dplyr::left_join(x, y, by = "name") |> dplyr::collect()
- qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y), by = "name")
- expect_equal(q, qr)
-
- q <- dplyr::right_join(x, y, by = "name") |> dplyr::collect()
- qr <- dplyr::right_join(dplyr::collect(x), dplyr::collect(y), by = "name")
- expect_equal(q, qr)
-
- q <- dplyr::inner_join(x, y, by = "name") |> dplyr::collect()
- qr <- dplyr::inner_join(dplyr::collect(x), dplyr::collect(y), by = "name")
- expect_equal(q, qr)
-
-
# Create two more synthetic test data set with NA data
# First test case
@@ -43,7 +21,7 @@ test_that("*_join() works", {
dplyr::arrange(number, t, letter)
qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y), by = "number", multiple = "all") |>
dplyr::arrange(number, t, letter)
- expect_mapequal(q, qr)
+ expect_equal(q, qr)
q <- dplyr::right_join(x, y, na_by = "number") |>
dplyr::collect() |>
@@ -115,3 +93,131 @@ test_that("*_join() works", {
connection_clean_up(conn)
}
})
+
+
+test_that("*_join() does not break any dplyr joins when no `na_by` argument is given", {
+ for (conn in get_test_conns()) {
+
+ # Define two test datasets
+ x <- get_table(conn, "__mtcars") |>
+ dplyr::select(name, mpg, cyl, hp, vs, am, gear, carb)
+
+ y <- get_table(conn, "__mtcars") |>
+ dplyr::select(name, drat, wt, qsec)
+
+ # Test the standard joins
+ # left_join
+ qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::left_join(x, y, by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::left_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # right_join
+ qr <- dplyr::right_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::right_join(x, y, by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::right_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # inner_join
+ qr <- dplyr::inner_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::inner_join(x, y, by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::inner_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # full_join
+ qr <- dplyr::full_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::full_join(x, y, by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::full_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # semi_join
+ qr <- dplyr::semi_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::semi_join(x, y, by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::semi_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # anti_join
+ qr <- dplyr::anti_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::anti_join(x, y, by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::anti_join(x, y, by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ connection_clean_up(conn)
+ }
+})
+
+
+test_that("*_join() with only `na_by` works identically as if `by` was given instead when no data is NA", {
+ for (conn in get_test_conns()) {
+
+ # Define two test datasets
+ x <- get_table(conn, "__mtcars") |>
+ dplyr::select(name, mpg, cyl, hp, vs, am, gear, carb)
+
+ y <- get_table(conn, "__mtcars") |>
+ dplyr::select(name, drat, wt, qsec)
+
+ # Test the standard joins
+ # left_join
+ qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::left_join(x, y, na_by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::left_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # right_join
+ qr <- dplyr::right_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::right_join(x, y, na_by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::right_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # inner_join
+ qr <- dplyr::inner_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::inner_join(x, y, na_by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::inner_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # full_join
+ qr <- dplyr::full_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::full_join(x, y, na_by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::full_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # semi_join
+ qr <- dplyr::semi_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::semi_join(x, y, na_by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::semi_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ # anti_join
+ qr <- dplyr::anti_join(dplyr::collect(x), dplyr::collect(y), by = "name")
+ q <- dplyr::anti_join(x, y, na_by = "name") |> dplyr::collect()
+ expect_equal(q, qr)
+
+ q <- dplyr::anti_join(x, y, na_by = dplyr::join_by(x$name == y$name)) |> dplyr::collect()
+ expect_equal(q, qr)
+
+ connection_clean_up(conn)
+ }
+})
diff --git a/tests/testthat/test-filter_keys.R b/tests/testthat/test-filter_keys.R
index ad6952cc..76b80b4a 100644
--- a/tests/testthat/test-filter_keys.R
+++ b/tests/testthat/test-filter_keys.R
@@ -55,7 +55,7 @@ test_that("filter_keys() works with copy = TRUE", {
dplyr::collect())
# The above filter_keys with `copy = TRUE` generates a dbplyr_### table.
- # We manually remove this since we expect it. If more arrise, we will get an error.
+ # We manually remove this since we expect it. If more arise, we will get an error.
DBI::dbRemoveTable(conn, id(utils::head(get_tables(conn, "dbplyr_"), 1)))
connection_clean_up(conn)