Skip to content

Commit

Permalink
apacheGH-41834: [R] Better error handling in dplyr code (apache#41576)
Browse files Browse the repository at this point in the history
* GitHub Issue: apache#41834
  • Loading branch information
nealrichardson authored May 29, 2024
1 parent 4a2df66 commit 774ee0f
Show file tree
Hide file tree
Showing 38 changed files with 804 additions and 823 deletions.
6 changes: 5 additions & 1 deletion r/R/dplyr-across.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@ expand_across <- function(.data, quos_in, exclude_cols = NULL) {
)

if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) {
abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow")
arrow_not_supported(
"`...` argument to `across()` is deprecated in dplyr and",
body = c(">" = "Convert your call into a function or formula including the arguments"),
call = rlang::caller_call()
)
}

if (!is.null(across_call[[".cols"]])) {
Expand Down
87 changes: 44 additions & 43 deletions r/R/dplyr-arrange.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,47 +19,46 @@
# The following S3 methods are registered on load if dplyr is present

arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) {
call <- match.call()
.data <- as_adq(.data)
exprs <- expand_across(.data, quos(...))
try_arrow_dplyr({
.data <- as_adq(.data)
exprs <- expand_across(.data, quos(...))

if (.by_group) {
# when the data is grouped and .by_group is TRUE, order the result by
# the grouping columns first
exprs <- c(quos(!!!dplyr::groups(.data)), exprs)
}
if (length(exprs) == 0) {
# Nothing to do
return(.data)
}
.data <- as_adq(.data)
# find and remove any dplyr::desc() and tidy-eval
# the arrange expressions inside an Arrow data_mask
sorts <- vector("list", length(exprs))
descs <- logical(0)
mask <- arrow_mask(.data)
for (i in seq_along(exprs)) {
x <- find_and_remove_desc(exprs[[i]])
exprs[[i]] <- x[["quos"]]
sorts[[i]] <- arrow_eval(exprs[[i]], mask)
names(sorts)[i] <- format_expr(exprs[[i]])
if (inherits(sorts[[i]], "try-error")) {
msg <- paste("Expression", names(sorts)[i], "not supported in Arrow")
return(abandon_ship(call, .data, msg))
if (.by_group) {
# when the data is grouped and .by_group is TRUE, order the result by
# the grouping columns first
exprs <- c(quos(!!!dplyr::groups(.data)), exprs)
}
if (length(mask$.aggregations)) {
# dplyr lets you arrange on e.g. x < mean(x), but we haven't implemented it.
# But we could, the same way it works in mutate() via join, if someone asks.
# Until then, just error.
# TODO: add a test for this
msg <- paste("Expression", format_expr(expr), "not supported in arrange() in Arrow")
return(abandon_ship(call, .data, msg))
if (length(exprs) == 0) {
# Nothing to do
return(.data)
}
descs[i] <- x[["desc"]]
}
.data$arrange_vars <- c(sorts, .data$arrange_vars)
.data$arrange_desc <- c(descs, .data$arrange_desc)
.data
.data <- as_adq(.data)
# find and remove any dplyr::desc() and tidy-eval
# the arrange expressions inside an Arrow data_mask
sorts <- vector("list", length(exprs))
descs <- logical(0)
mask <- arrow_mask(.data)
for (i in seq_along(exprs)) {
x <- find_and_remove_desc(exprs[[i]])
exprs[[i]] <- x[["quos"]]
sorts[[i]] <- arrow_eval(exprs[[i]], mask)
names(sorts)[i] <- format_expr(exprs[[i]])
if (length(mask$.aggregations)) {
# dplyr lets you arrange on e.g. x < mean(x), but we haven't implemented it.
# But we could, the same way it works in mutate() via join, if someone asks.
# Until then, just error.
# TODO: add a test for this
arrow_not_supported(
.actual_msg = "Expression not supported in arrange() in Arrow",
call = expr
)
}
descs[i] <- x[["desc"]]
}
.data$arrange_vars <- c(sorts, .data$arrange_vars)
.data$arrange_desc <- c(descs, .data$arrange_desc)
.data
})
}
arrange.Dataset <- arrange.ArrowTabular <- arrange.RecordBatchReader <- arrange.arrow_dplyr_query

Expand All @@ -73,10 +72,9 @@ find_and_remove_desc <- function(quosure) {
expr <- quo_get_expr(quosure)
descending <- FALSE
if (length(all.vars(expr)) < 1L) {
stop(
"Expression in arrange() does not contain any field names: ",
deparse(expr),
call. = FALSE
validation_error(
"Expression in arrange() does not contain any field names",
call = quosure
)
}
# Use a while loop to remove any number of nested pairs of enclosing
Expand All @@ -90,7 +88,10 @@ find_and_remove_desc <- function(quosure) {
# ensure desc() has only one argument (when an R expression is a function
# call, length == 2 means it has exactly one argument)
if (length(expr) > 2) {
stop("desc() expects only one argument", call. = FALSE)
validation_error(
"desc() expects only one argument",
call = expr
)
}
# remove desc() and toggle descending
expr <- expr[[2]]
Expand Down
31 changes: 16 additions & 15 deletions r/R/dplyr-datetime-helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
check_time_locale <- function(locale = Sys.getlocale("LC_TIME")) {
if (tolower(Sys.info()[["sysname"]]) == "windows" && locale != "C") {
# MingW C++ std::locale only supports "C" and "POSIX"
stop(paste0(
"On Windows, time locales other than 'C' are not supported in Arrow. ",
"Consider setting `Sys.setlocale('LC_TIME', 'C')`"
))
arrow_not_supported(
"On Windows, time locales other than 'C'",
body = c(">" = "Consider setting `Sys.setlocale('LC_TIME', 'C')`")
)
}
locale
}
Expand Down Expand Up @@ -56,13 +56,15 @@ duration_from_chunks <- function(chunks) {
matched_chunks <- accepted_chunks[pmatch(names(chunks), accepted_chunks, duplicates.ok = TRUE)]

if (any(is.na(matched_chunks))) {
abort(
paste0(
"named `difftime` units other than: ",
oxford_paste(accepted_chunks, quote_symbol = "`"),
" not supported in Arrow. \nInvalid `difftime` parts: ",
arrow_not_supported(
paste(
"named `difftime` units other than:",
oxford_paste(accepted_chunks, quote_symbol = "`")
),
body = c(i = paste(
"Invalid `difftime` parts:",
oxford_paste(names(chunks[is.na(matched_chunks)]), quote_symbol = "`")
)
))
)
}

Expand Down Expand Up @@ -114,7 +116,6 @@ binding_as_date_character <- function(x,
}

binding_as_date_numeric <- function(x, origin = "1970-01-01") {

# Arrow does not support direct casting from double to date32(), but for
# integer-like values we can go via int32()
# TODO: revisit after ARROW-15798
Expand Down Expand Up @@ -442,7 +443,7 @@ parse_period_unit <- function(x) {
unit <- as.integer(pmatch(str_unit_start, known_units)) - 1L

if (any(is.na(unit))) {
abort(
validation_error(
sprintf(
"Invalid period name: '%s'",
str_unit,
Expand Down Expand Up @@ -484,13 +485,13 @@ parse_period_unit <- function(x) {
# more special cases: lubridate imposes sensible maximum
# values on the number of seconds, minutes and hours
if (unit == 3L && multiple > 60) {
abort("Rounding with second > 60 is not supported")
validation_error("Rounding with second > 60 is not supported")
}
if (unit == 4L && multiple > 60) {
abort("Rounding with minute > 60 is not supported")
validation_error("Rounding with minute > 60 is not supported")
}
if (unit == 5L && multiple > 24) {
abort("Rounding with hour > 24 is not supported")
validation_error("Rounding with hour > 24 is not supported")
}

list(unit = unit, multiple = multiple)
Expand Down
Loading

0 comments on commit 774ee0f

Please sign in to comment.