From b92c41c43473edc58b75339d61afbf68c0c55940 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Mon, 15 Feb 2021 06:48:40 -0800 Subject: [PATCH 01/10] Use data.table for all j-index subsetting #157 --- R/SoilProfileCollection-operators.R | 43 +++++++++++++++-------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/R/SoilProfileCollection-operators.R b/R/SoilProfileCollection-operators.R index 133aeca29..8a8af1d69 100644 --- a/R/SoilProfileCollection-operators.R +++ b/R/SoilProfileCollection-operators.R @@ -124,7 +124,9 @@ setMethod("[", signature(x = "SoilProfileCollection", if (!missing(j)) { # faster replacement of j subsetting of horizon data - if (aqp_df_class(x) == "data.table") { + # if (aqp_df_class(x) == "data.table") { + + h <- as.data.table(h) # local vars to make R CMD check happy .N <- NULL @@ -132,7 +134,7 @@ setMethod("[", signature(x = "SoilProfileCollection", V1 <- NULL # data.table can do this much more efficiently - if (requireNamespace("data.table", quietly = TRUE)) { + # if (requireNamespace("data.table", quietly = TRUE)) { idn <- idname(x) # by list @horizons idname (essentially iterating over profiles) @@ -154,25 +156,24 @@ setMethod("[", signature(x = "SoilProfileCollection", pids <- h[, .I[any(1:.N %in% j)][1], by = bylist] i.idx <- pids[, .I[!is.na(V1)]] } - } - - } else { - # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so) - j.res <- as.list(aggregate( - h[[hzidname(x)]], - by = list(h[[idname(x)]]), - FUN = function(hh) { - list(1:length(hh) %in% j) - }, - drop = FALSE - )$x) - - ## https://github.com/ncss-tech/aqp/issues/89 - # fix #89, where i with no matching j e.g. @site data returned - i.idx <- which(as.logical(lapply(j.res, function(jr) { any(jr) }))) - - j.idx <- which(do.call('c', j.res)) - } + # } + # } else { + # # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so) + # j.res <- as.list(aggregate( + # h[[hzidname(x)]], + # by = list(h[[idname(x)]]), + # FUN = function(hh) { + # list(1:length(hh) %in% j) + # }, + # drop = FALSE + # )$x) + # + # ## https://github.com/ncss-tech/aqp/issues/89 + # # fix #89, where i with no matching j e.g. @site data returned + # i.idx <- which(as.logical(lapply(j.res, function(jr) { any(jr) }))) + # + # j.idx <- which(do.call('c', j.res)) + # } # find any index out of bounds and ignore them # j.idx.bad <- which(abs(j.idx) > nrow(h)) From 3f956bfad8005c87e558ca16a84ad7b0f1107362 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Mon, 15 Feb 2021 07:35:52 -0800 Subject: [PATCH 02/10] Significant optimization for j-index subsetting of SPC --- R/SoilProfileCollection-operators.R | 6 +- misc/aqp2/jindex-benchmarks.R | 146 ++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 3 deletions(-) create mode 100644 misc/aqp2/jindex-benchmarks.R diff --git a/R/SoilProfileCollection-operators.R b/R/SoilProfileCollection-operators.R index 8a8af1d69..c6d3e4f74 100644 --- a/R/SoilProfileCollection-operators.R +++ b/R/SoilProfileCollection-operators.R @@ -152,10 +152,10 @@ setMethod("[", signature(x = "SoilProfileCollection", if (length(j.idx) == 0) { i.idx <- numeric(0) } else { - # determine which profile IDs KEEP - pids <- h[, .I[any(1:.N %in% j)][1], by = bylist] - i.idx <- pids[, .I[!is.na(V1)]] + # determine which profiles to KEEP + i.idx <- which(profile_id(x) %in% unique(h[j.idx,][[idn]])) } + # } # } else { # # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so) diff --git a/misc/aqp2/jindex-benchmarks.R b/misc/aqp2/jindex-benchmarks.R new file mode 100644 index 000000000..c3e9b6125 --- /dev/null +++ b/misc/aqp2/jindex-benchmarks.R @@ -0,0 +1,146 @@ +spc_j_DT <- function(x, i, j) { + + # this is already live in AQP for data.table SPCs -- is it worth enabling globally? + + h <- data.table::as.data.table(horizons(x)) + + # local vars to make R CMD check happy + .N <- NULL + .I <- NULL + V1 <- NULL + + # data.table can do this much more efficiently + # if (requireNamespace("data.table", quietly = TRUE)) { + idn <- idname(x) + + # by list @horizons idname (essentially iterating over profiles) + bylist <- list(h[[idn]]) + names(bylist) <- idn + + # figured out the data.table way to do this + # not using := or . anymore + + # determine j indices to KEEP + j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 + + # determine which site indices to keep + # in case all horizons are removed, remove sites too + if (length(j.idx) == 0) { + i.idx <- numeric(0) + } else { + # determine which profile IDs KEEP + pids <- h[, .I[any(1:.N %in% j)][1], by = bylist] + i.idx <- pids[, .I[!is.na(V1)]] + } + return(list(i.idx, j.idx)) +} + +# benchmark faster replacement of j subsetting of horizon data + +spc_j_DT_2 <- function(x, i, j) { + + # this is already live in AQP for data.table SPCs + # question: is it worth enabling globally? + + h <- data.table::as.data.table(horizons(x)) + + # local vars to make R CMD check happy + .N <- NULL + .I <- NULL + V1 <- NULL + + idn <- idname(x) + + # by list @horizons idname (essentially iterating over profiles) + bylist <- list(h[[idn]]) + names(bylist) <- idn + + # figured out the data.table way to do this + # not using := or . anymore + + # determine j indices to KEEP + j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 + + # determine which site indices to keep + # in case all horizons are removed, remove sites too + if (length(j.idx) == 0) { + i.idx <- numeric(0) + } else { + # determine which profile IDs KEEP + i.idx <- which(profile_id(x) %in% unique(h[j.idx,][[idn]])) + } + return(list(i.idx, j.idx)) +} + +spc_j_base <- function(x, i, j) { + h <- horizons(x) + # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so) + j.res <- as.list(aggregate( + h[[hzidname(x)]], + by = list(h[[idname(x)]]), + FUN = function(hh) { + list(1:length(hh) %in% j) + }, + drop = FALSE + )$x) + + ## https://github.com/ncss-tech/aqp/issues/89 + # fix #89, where i with no matching j e.g. @site data returned + i.idx <- which(as.logical(lapply(j.res, function(jr) { any(jr) }))) + j.idx <- which(do.call('c', j.res)) + + return(list(i.idx, j.idx)) +} + +library(aqp) + +data(sp4) +depths(sp4) <- id ~ top + bottom + +# base outperforms old DT with 10 profiles, but new solution slightly better +microbenchmark::microbenchmark(spc_j_base(sp4, 1:10, 2), + spc_j_DT(sp4, 1:10, 2), + spc_j_DT_2(sp4, 1:10, 2)) + +# whether the SPC is a "data.table SPC" ahead of time does not affect overhead much +sp4_DT <- data.table::as.data.table(horizons(sp4)) +depths(sp4_DT) <- id ~ top + bottom + +bench::mark(spc_j_base(sp4_DT, 1:10, 2:3), + spc_j_DT(sp4_DT, 1:10, 2:3), + spc_j_DT_2(sp4_DT, 1:10, 2:3)) + +# basically neck-and-neck at 1000 profiles; closing the gap of base over DT from ~50% to about ~5% +# the new solution is 200% faster than both old ones +thousand <- as.data.frame(data.table::rbindlist(lapply(1:1000, random_profile))) +depths(thousand) <- id ~ top + bottom + +bench::mark(spc_j_base(thousand, 1:10, 2:3), + spc_j_DT(thousand, 1:10, 2:3), + spc_j_DT_2(thousand, 1:10, 2:3)) + +# ten thousand profiles, we see a benefit from DT (old: ~20% faster than base; new 300% faster) +tenthousand <- as.data.frame(data.table::rbindlist(lapply(1:10000, random_profile))) +depths(tenthousand) <- id ~ top + bottom + +bench::mark(spc_j_base(tenthousand, 1:10, 2:3), + spc_j_DT(tenthousand, 1:10, 2:3), + spc_j_DT_2(tenthousand, 1:10, 2:3)) + +# hundred thousand profiles, we see a similar benefit from DT -- 2 to 3 times faster +hundredthousand <- as.data.frame(data.table::rbindlist(lapply(1:100000, random_profile))) +depths(hundredthousand) <- id ~ top + bottom + +bench::mark(spc_j_base(hundredthousand, 1:10, 2:3), + spc_j_DT(hundredthousand, 1:10, 2:3), + spc_j_DT_2(hundredthousand, 1:10, 2:3)) + +# # million profiles, we see a benefit from DT +# million <- as.data.frame(data.table::rbindlist(lapply(1:1000000, random_profile))) +# depths(million) <- id ~ top + bottom +# +# bench::mark( +# spc_j_base(million, 1:10, 2:3), +# spc_j_DT(million, 1:10, 2:3), +# spc_j_DT_2(hundredthousand, 1:10, 2:3) +# ) From 7ab55da66173307d73b7917e3f80141d4f41e9b7 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Mon, 15 Feb 2021 08:33:48 -0800 Subject: [PATCH 03/10] Add more competitive version of base function for j-index --- misc/aqp2/jindex-benchmarks.R | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/misc/aqp2/jindex-benchmarks.R b/misc/aqp2/jindex-benchmarks.R index c3e9b6125..ca858e560 100644 --- a/misc/aqp2/jindex-benchmarks.R +++ b/misc/aqp2/jindex-benchmarks.R @@ -68,6 +68,7 @@ spc_j_DT_2 <- function(x, i, j) { } else { # determine which profile IDs KEEP i.idx <- which(profile_id(x) %in% unique(h[j.idx,][[idn]])) + # i.idx <- which(vapply(j.res, function(jr) { any(jr) }, FUN.VALUE = logical(1))) } return(list(i.idx, j.idx)) } @@ -92,21 +93,44 @@ spc_j_base <- function(x, i, j) { return(list(i.idx, j.idx)) } +spc_j_base_2 <- function(x, i, j) { + h <- horizons(x) + idn <- idname(x) + + # slightly more competitive version of base + j.res <- as.list(aggregate( + seq_len(nrow(h)), + by = list(h[[idn]]), + FUN = function(hh) { + (1:length(hh) %in% j) + } + )$x) + + # i.idx <- profile_id(x) %in% unique(h[unlist(j.res), ][[idn]]) # not faster? + i.idx <- which(vapply(j.res, function(jr) { any(jr) }, FUN.VALUE = logical(1))) + + j.idx <- which(unlist(j.res)) + + return(list(i.idx, j.idx)) +} + library(aqp) data(sp4) depths(sp4) <- id ~ top + bottom # base outperforms old DT with 10 profiles, but new solution slightly better -microbenchmark::microbenchmark(spc_j_base(sp4, 1:10, 2), - spc_j_DT(sp4, 1:10, 2), - spc_j_DT_2(sp4, 1:10, 2)) +microbenchmark::microbenchmark(spc_j_base(sp4, 1:10, 2:3), + spc_j_base_2(sp4, 1:10, 2:3), + spc_j_DT(sp4, 1:10, 2:3), + spc_j_DT_2(sp4, 1:10, 2:3)) # whether the SPC is a "data.table SPC" ahead of time does not affect overhead much sp4_DT <- data.table::as.data.table(horizons(sp4)) depths(sp4_DT) <- id ~ top + bottom bench::mark(spc_j_base(sp4_DT, 1:10, 2:3), + spc_j_base_2(sp4_DT, 1:10, 2:3), spc_j_DT(sp4_DT, 1:10, 2:3), spc_j_DT_2(sp4_DT, 1:10, 2:3)) @@ -116,6 +140,7 @@ thousand <- as.data.frame(data.table::rbindlist(lapply(1:1000, random_profile))) depths(thousand) <- id ~ top + bottom bench::mark(spc_j_base(thousand, 1:10, 2:3), + spc_j_base_2(thousand, 1:10, 2:3), spc_j_DT(thousand, 1:10, 2:3), spc_j_DT_2(thousand, 1:10, 2:3)) @@ -124,6 +149,7 @@ tenthousand <- as.data.frame(data.table::rbindlist(lapply(1:10000, random_profil depths(tenthousand) <- id ~ top + bottom bench::mark(spc_j_base(tenthousand, 1:10, 2:3), + spc_j_base_2(tenthousand, 1:10, 2:3), spc_j_DT(tenthousand, 1:10, 2:3), spc_j_DT_2(tenthousand, 1:10, 2:3)) From 58ba3543285c9ba269a52fcabe718edfafa8ba61 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Mon, 15 Feb 2021 08:54:55 -0800 Subject: [PATCH 04/10] spc_j_base_2: 100k benchmark ~0.1s faster than original, nearly 3x slower than new DT --- misc/aqp2/jindex-benchmarks.R | 1 + 1 file changed, 1 insertion(+) diff --git a/misc/aqp2/jindex-benchmarks.R b/misc/aqp2/jindex-benchmarks.R index ca858e560..f1f534cc7 100644 --- a/misc/aqp2/jindex-benchmarks.R +++ b/misc/aqp2/jindex-benchmarks.R @@ -158,6 +158,7 @@ hundredthousand <- as.data.frame(data.table::rbindlist(lapply(1:100000, random_p depths(hundredthousand) <- id ~ top + bottom bench::mark(spc_j_base(hundredthousand, 1:10, 2:3), + spc_j_base_2(hundredthousand, 1:10, 2:3), spc_j_DT(hundredthousand, 1:10, 2:3), spc_j_DT_2(hundredthousand, 1:10, 2:3)) From bcbbc208290e643709cee13df44fb02f7884d76c Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Mon, 15 Feb 2021 09:35:57 -0800 Subject: [PATCH 05/10] spc_j_base_3: more competitive for small SPCs --- misc/aqp2/jindex-benchmarks.R | 38 ++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/misc/aqp2/jindex-benchmarks.R b/misc/aqp2/jindex-benchmarks.R index f1f534cc7..fad3e4b50 100644 --- a/misc/aqp2/jindex-benchmarks.R +++ b/misc/aqp2/jindex-benchmarks.R @@ -114,6 +114,25 @@ spc_j_base_2 <- function(x, i, j) { return(list(i.idx, j.idx)) } +spc_j_base_3 <- function(x, i, j) { + h <- horizons(x) + idn <- idname(x) + + # slightly more competitive version of base + j.res <- aggregate( + seq_len(nrow(h)), + by = list(h[[idn]]), + FUN = function(hh) { + (1:length(hh) %in% j) + } + )$x + + i.idx <- which(vapply(j.res, any, FUN.VALUE = logical(1))) + j.idx <- which(unlist(j.res)) + + return(list(i.idx, j.idx)) +} + library(aqp) data(sp4) @@ -122,6 +141,7 @@ depths(sp4) <- id ~ top + bottom # base outperforms old DT with 10 profiles, but new solution slightly better microbenchmark::microbenchmark(spc_j_base(sp4, 1:10, 2:3), spc_j_base_2(sp4, 1:10, 2:3), + spc_j_base_3(sp4, 1:10, 2:3), spc_j_DT(sp4, 1:10, 2:3), spc_j_DT_2(sp4, 1:10, 2:3)) @@ -131,6 +151,7 @@ depths(sp4_DT) <- id ~ top + bottom bench::mark(spc_j_base(sp4_DT, 1:10, 2:3), spc_j_base_2(sp4_DT, 1:10, 2:3), + spc_j_base_3(sp4_DT, 1:10, 2:3), spc_j_DT(sp4_DT, 1:10, 2:3), spc_j_DT_2(sp4_DT, 1:10, 2:3)) @@ -141,6 +162,7 @@ depths(thousand) <- id ~ top + bottom bench::mark(spc_j_base(thousand, 1:10, 2:3), spc_j_base_2(thousand, 1:10, 2:3), + spc_j_base_3(thousand, 1:10, 2:3), spc_j_DT(thousand, 1:10, 2:3), spc_j_DT_2(thousand, 1:10, 2:3)) @@ -150,17 +172,19 @@ depths(tenthousand) <- id ~ top + bottom bench::mark(spc_j_base(tenthousand, 1:10, 2:3), spc_j_base_2(tenthousand, 1:10, 2:3), + spc_j_base_3(tenthousand, 1:10, 2:3), spc_j_DT(tenthousand, 1:10, 2:3), spc_j_DT_2(tenthousand, 1:10, 2:3)) # hundred thousand profiles, we see a similar benefit from DT -- 2 to 3 times faster -hundredthousand <- as.data.frame(data.table::rbindlist(lapply(1:100000, random_profile))) -depths(hundredthousand) <- id ~ top + bottom - -bench::mark(spc_j_base(hundredthousand, 1:10, 2:3), - spc_j_base_2(hundredthousand, 1:10, 2:3), - spc_j_DT(hundredthousand, 1:10, 2:3), - spc_j_DT_2(hundredthousand, 1:10, 2:3)) +# hundredthousand <- as.data.frame(data.table::rbindlist(lapply(1:100000, random_profile))) +# depths(hundredthousand) <- id ~ top + bottom +# +# bench::mark(spc_j_base(hundredthousand, 1:10, 2:3), +# spc_j_base_2(hundredthousand, 1:10, 2:3), +# spc_j_base_3(hundredthousand, 1:10, 2:3), +# spc_j_DT(hundredthousand, 1:10, 2:3), +# spc_j_DT_2(hundredthousand, 1:10, 2:3)) # # million profiles, we see a benefit from DT # million <- as.data.frame(data.table::rbindlist(lapply(1:1000000, random_profile))) From f109923c9ce45f601ea0d394305d25b4a363c6c9 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Tue, 16 Feb 2021 21:55:09 -0800 Subject: [PATCH 06/10] add demo of SPC non-standard eval "k-index" .LAST keyword #199 #201 --- R/SoilProfileCollection-operators.R | 514 +++++++++++++++------------- man/singlebracket.Rd | 20 +- 2 files changed, 288 insertions(+), 246 deletions(-) diff --git a/R/SoilProfileCollection-operators.R b/R/SoilProfileCollection-operators.R index c6d3e4f74..876194925 100644 --- a/R/SoilProfileCollection-operators.R +++ b/R/SoilProfileCollection-operators.R @@ -16,254 +16,288 @@ #' #' @description You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j]}. Subset operations are propagated to other slots when they result in removal of sites from a collection. #' -#' \code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. +#' - \code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. #' -#' \code{j} refers to the horizon or "slice" index. This index is most useful when either a) working with \code{slice}'d SoilProfileCollection or b) working with single-profile collections. \code{j} returns the layer in the specified index positions for all profiles in a collection. So, for instance, if \code{spc} contained 100 profiles, \code{spc[,2]} would return 100 profiles, but just the second horizon from each of the profiles ... assuming each profile had at least two horizons! The single horizon profiles would be dropped from the collection. +#' - \code{j} refers to the horizon or "slice" index. This index is most useful when either a) working with \code{slice}'d SoilProfileCollection or b) working with single-profile collections. \code{j} returns the layer in the specified index positions for all profiles in a collection. #' +#' - `...` is an area to specify an expression that is evaluated in the subset. Currently supported +#' +#' - `.LAST`: return the last horizon from each profile. This uses `i` but ignores the regular `j` index. #' #' @param x a SoilProfileCollection #' @param i a numeric or logical value denoting profile indices to select in a subset -#' @param j a numeric or logical value denoting profile indices to select in a subset +#' @param j a numeric or logical value denoting horizon indices to select in a subset +#' @param ... non-standard expressions to evaluate in a subset; currently supported: `.LAST` (last horizon in each profile) +#' @param drop not used #' @rdname singlebracket +# SPC extract: "[" '[' single bracket SPC object extract method setMethod("[", signature(x = "SoilProfileCollection", i = "ANY", - j = "ANY"), - function(x, i, j) { - # check for missing i and j - if (missing(i) & missing(j)) { - stop('must provide either a profile index or horizon/slice index, or both', - call. = FALSE) - } - - # convert to integer - if (!missing(i)) { - if (any(is.na(i))) { - stop('NA not permitted in profile index', call. = FALSE) - } - - # convert logical to integer - # (thanks Jos? Padarian for the suggestion!) - if (is.logical(i)) { - i <- (1:length(x))[i] - } - - can.cast <- is.numeric(i) - if (can.cast) { - if (all(abs(i - round(i)) < .Machine$double.eps ^ 0.5)) { - i <- as.integer(i) - } else { - stop("Numeric site index does not contain whole numbers.") - } - } else { - stop("Failed to coerce site index to integer.") - } - } else { - # if no index is provided, the user wants all profiles - i <- 1:length(x) - } - - if (!missing(j)) { - # AGB added logical handling to horizon index - if (is.logical(j)) { - j <- (1:length(x))[j] - } - - can.cast <- is.numeric(j) - if (can.cast) { - if (all(abs(j - round(j)) < .Machine$double.eps ^ 0.5)) { - j <- as.integer(j) - } else { - stop("Numeric horizon/slice index does not contain whole numbers.") - } - } else { - stop("Failed to coerce horizon/slice index to integer.") - } - - if (any(is.na(j))) { - stop('NA not permitted in horizon/slice index', call. = FALSE) - } - } - - # extract all site and horizon data - h <- x@horizons - s.all <- x@site - - # extract requested profile IDs - p.ids <- s.all[[idname(x)]][unique(i)] - - # keep only the requested horizon data (filtered by profile ID) - h <- .as.data.frame.aqp(h, aqp_df_class(x))[h[[idname(x)]] %in% p.ids,] - - # keep only the requested site data, (filtered by profile ID) - s.i <- which(s.all[[idname(x)]] %in% p.ids) - - # need to use drop=FALSE when @site contains only a single column - s <- s.all[s.i, , drop = FALSE] - - # subset spatial data, but only if valid - if (validSpatialData(x)) { - sp <- x@sp[i] - } else { - # copy empty SpatialPoints object - sp <- x@sp - } - - # subset diagnostic data - d <- diagnostic_hz(x) - if (length(d) > 0) { - d <- d[which(d[[idname(x)]] %in% p.ids),] - } - - # subset restriction data - r <- restrictions(x) - if (length(r) > 0) { - r <- r[which(r[[idname(x)]] %in% p.ids),] - } - - # subset horizons/slices based on j --> only when j is given - if (!missing(j)) { - - # faster replacement of j subsetting of horizon data - # if (aqp_df_class(x) == "data.table") { - - h <- as.data.table(h) - - # local vars to make R CMD check happy - .N <- NULL - .I <- NULL - V1 <- NULL - - # data.table can do this much more efficiently - # if (requireNamespace("data.table", quietly = TRUE)) { - idn <- idname(x) - - # by list @horizons idname (essentially iterating over profiles) - bylist <- list(h[[idn]]) - names(bylist) <- idn - - # figured out the data.table way to do this - # not using := or . anymore - - # determine j indices to KEEP - j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 - - # determine which site indices to keep - # in case all horizons are removed, remove sites too - if (length(j.idx) == 0) { - i.idx <- numeric(0) - } else { - # determine which profiles to KEEP - i.idx <- which(profile_id(x) %in% unique(h[j.idx,][[idn]])) - } - - # } - # } else { - # # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so) - # j.res <- as.list(aggregate( - # h[[hzidname(x)]], - # by = list(h[[idname(x)]]), - # FUN = function(hh) { - # list(1:length(hh) %in% j) - # }, - # drop = FALSE - # )$x) - # - # ## https://github.com/ncss-tech/aqp/issues/89 - # # fix #89, where i with no matching j e.g. @site data returned - # i.idx <- which(as.logical(lapply(j.res, function(jr) { any(jr) }))) - # - # j.idx <- which(do.call('c', j.res)) - # } - - # find any index out of bounds and ignore them - # j.idx.bad <- which(abs(j.idx) > nrow(h)) - # i.idx.bad <- which(abs(i.idx) > nrow(s)) - # - # if (length(i.idx)) - # i.idx <- i.idx[-i.idx.bad] - # - # if (length(j.idx)) - # j.idx <- j.idx[-j.idx.bad] - - # do horizon subset with j index - h <- h[j.idx, ] - - # if profiles have been removed based on the j-index constraints - if (length(i.idx) > 0) { - # remove sites that have no matching j - s <- s[i.idx, , drop = FALSE] - h.ids <- s[[idname(x)]] - - # remove also: diagnostics - d.idx <- which(d[[idname(x)]] %in% h.ids) - if (length(d.idx) > 0) { - d <- d[-d.idx, , drop = FALSE] - } - - # restrictions - r.idx <- which(r[[idname(x)]] %in% h.ids) - if (length(r.idx) > 0) { - r <- r[-r.idx, , drop = FALSE] - } - - # spatial - if (validSpatialData(x)) { - sp <- sp[i.idx,] - } - } - } - - rownames(h) <- NULL - - # rebuild SPC object from slots - res <- SoilProfileCollection( - idcol = idname(x), - hzidcol = hzidname(x), - depthcols = horizonDepths(x), - metadata = aqp::metadata(x), - horizons = .as.data.frame.aqp(h, aqp_df_class(x)), - site = .as.data.frame.aqp(s, aqp_df_class(x)), - sp = sp, - diagnostic = .as.data.frame.aqp(d, aqp_df_class(x)), - restrictions = .as.data.frame.aqp(r, aqp_df_class(x)) - ) - - # fill in any missing data.frame class or group var - o.df.class <- aqp::metadata(x)$aqp_df_class - if(length(o.df.class) == 0) { - o.df.class <- "data.frame" - } - - o.group.by <- aqp::metadata(x)$aqp_group_by - if(length(o.group.by) == 0) { - o.group.by <- "" - } - - metadata(res)$aqp_df_class <- o.df.class - metadata(res)$aqp_group_by <- o.group.by - - # preserve slots that may have been customized relative to defaults - # in prototype or resulting from construction of SPC - suppressMessages(hzidname(res) <- hzidname(x)) - suppressMessages(hzdesgnname(res) <- hzdesgnname(x)) - suppressMessages(hztexclname(res) <- hztexclname(x)) - - # there should be as many records in @site as there are profile IDs - pid.res <- profile_id(res) - site.res <- site(res)[[idname(res)]] - - if (length(pid.res) != length(site.res)) { - message("Some profiles have been removed from the collection.") - } - - # the order of profile_ids should be the same as in @site - if (!all(pid.res == site.res)) { - warning("profile ID order does not match order in @site", - call. = FALSE) - } - - return(res) - }) + j = "ANY"), function(x, i, j, ...) { + + # capture k right away + ksubflag <- FALSE + + # 2nd value is first user-supplied expression + ksub <- as.character(substitute(list(...)))[2] + + # handle special .LAST horizon keyword in "k" index + if (!is.na(ksub) & ksub == ".LAST") { + if (missing(j)) + j <- 9999 # dummy value to trigger j indexing + ksubflag <- TRUE # only support .LAST for now + } + + # check for missing i and j + # if (missing(i) & missing(j) | is.na(ksub)) { + # stop('must provide either a profile index, horizon/slice index or a special expression such as .LAST', + # call. = FALSE) + # } + + # convert to integer + if (!missing(i)) { + if (any(is.na(i))) { + stop('NA not permitted in profile index', call. = FALSE) + } + + # convert logical to integer + # (thanks Jos? Padarian for the suggestion!) + if (is.logical(i)) { + i <- (1:length(x))[i] + } + + can.cast <- is.numeric(i) + if (can.cast) { + if (all(abs(i - round(i)) < .Machine$double.eps ^ 0.5)) { + i <- as.integer(i) + } else { + stop("Numeric site index does not contain whole numbers.") + } + } else { + stop("Failed to coerce site index to integer.") + } + } else { + # if no index is provided, the user wants all profiles + i <- 1:length(x) + } + + + # this block does processing for logical or numeric j + # not special symbols like .LAST + if (!missing(j) & !ksubflag) { + + # handle special symbols in j index + # currently supported: + # - .LAST: last horizon index per profile + + # AGB added logical handling to horizon index + if (is.logical(j)) { + j <- (1:length(x))[j] + } + + can.cast <- is.numeric(j) + if (can.cast) { + if (all(abs(j - round(j)) < .Machine$double.eps ^ 0.5)) { + j <- as.integer(j) + } else { + stop("Numeric horizon/slice index does not contain whole numbers.") + } + } else { + stop("Failed to coerce horizon/slice index to integer.") + } + + if (any(is.na(j))) { + stop('NA not permitted in horizon/slice index', call. = FALSE) + } + } + + # extract all site and horizon data + h <- x@horizons + s.all <- x@site + + # extract requested profile IDs + p.ids <- s.all[[idname(x)]][unique(i)] + + # keep only the requested horizon data (filtered by profile ID) + h <- .as.data.frame.aqp(h, aqp_df_class(x))[h[[idname(x)]] %in% p.ids,] + + # keep only the requested site data, (filtered by profile ID) + s.i <- which(s.all[[idname(x)]] %in% p.ids) + + # need to use drop=FALSE when @site contains only a single column + s <- s.all[s.i, , drop = FALSE] + + # subset spatial data, but only if valid + if (validSpatialData(x)) { + sp <- x@sp[i] + } else { + # copy empty SpatialPoints object + sp <- x@sp + } + + # subset diagnostic data + d <- diagnostic_hz(x) + if (length(d) > 0) { + d <- d[which(d[[idname(x)]] %in% p.ids),] + } + + # subset restriction data + r <- restrictions(x) + if (length(r) > 0) { + r <- r[which(r[[idname(x)]] %in% p.ids),] + } + + # subset horizons/slices based on j --> only when j is given + if (!missing(j)) { + + # faster replacement of j subsetting of horizon data + # if (aqp_df_class(x) == "data.table") { + + h <- data.table::as.data.table(h) + + # local vars to make R CMD check happy + .N <- NULL + .I <- NULL + V1 <- NULL + + # data.table can do this much more efficiently + # if (requireNamespace("data.table", quietly = TRUE)) { + idn <- idname(x) + + # by list @horizons idname (essentially iterating over profiles) + bylist <- list(h[[idn]]) + names(bylist) <- idn + + # figured out the data.table way to do this + # not using := or . anymore + + # determine j indices to KEEP + if (ksubflag) { + # trigger special last horizon case + j.idx <- h[, .I[.N], by = bylist]$V1 + } else { + # get row indices of horizon data corresponding to j within profiles + j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 + } + + # determine which site indices to keep + # in case all horizons are removed, remove sites too + if (length(j.idx) == 0) { + i.idx <- numeric(0) + } else { + # determine which profiles to KEEP + i.idx <- which(profile_id(x) %in% unique(h[j.idx,][[idn]])) + } + + # } + # } else { + # # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so) + # j.res <- as.list(aggregate( + # h[[hzidname(x)]], + # by = list(h[[idname(x)]]), + # FUN = function(hh) { + # list(1:length(hh) %in% j) + # }, + # drop = FALSE + # )$x) + # + # ## https://github.com/ncss-tech/aqp/issues/89 + # # fix #89, where i with no matching j e.g. @site data returned + # i.idx <- which(as.logical(lapply(j.res, function(jr) { any(jr) }))) + # + # j.idx <- which(do.call('c', j.res)) + # } + + # find any index out of bounds and ignore them + # j.idx.bad <- which(abs(j.idx) > nrow(h)) + # i.idx.bad <- which(abs(i.idx) > nrow(s)) + # + # if (length(i.idx)) + # i.idx <- i.idx[-i.idx.bad] + # + # if (length(j.idx)) + # j.idx <- j.idx[-j.idx.bad] + + # do horizon subset with j index + h <- h[j.idx, ] + + # if profiles have been removed based on the j-index constraints + if (length(i.idx) > 0) { + # remove sites that have no matching j + i.idx <- unique(c(s.i, i.idx)) + s <- s.all[i.idx, , drop = FALSE] + h.ids <- s[[idname(x)]] + + # remove also: diagnostics + d.idx <- which(d[[idname(x)]] %in% h.ids) + if (length(d.idx) > 0) { + d <- d[-d.idx, , drop = FALSE] + } + + # restrictions + r.idx <- which(r[[idname(x)]] %in% h.ids) + if (length(r.idx) > 0) { + r <- r[-r.idx, , drop = FALSE] + } + + # spatial + if (validSpatialData(x)) { + sp <- sp[i.idx,] + } + } + } + + rownames(h) <- NULL + + # rebuild SPC object from slots + res <- SoilProfileCollection( + idcol = idname(x), + hzidcol = hzidname(x), + depthcols = horizonDepths(x), + metadata = aqp::metadata(x), + horizons = .as.data.frame.aqp(h, aqp_df_class(x)), + site = .as.data.frame.aqp(s, aqp_df_class(x)), + sp = sp, + diagnostic = .as.data.frame.aqp(d, aqp_df_class(x)), + restrictions = .as.data.frame.aqp(r, aqp_df_class(x)) + ) + + # fill in any missing data.frame class or group var + o.df.class <- aqp::metadata(x)$aqp_df_class + if(length(o.df.class) == 0) { + o.df.class <- "data.frame" + } + + o.group.by <- aqp::metadata(x)$aqp_group_by + if(length(o.group.by) == 0) { + o.group.by <- "" + } + + metadata(res)$aqp_df_class <- o.df.class + metadata(res)$aqp_group_by <- o.group.by + + # preserve slots that may have been customized relative to defaults + # in prototype or resulting from construction of SPC + suppressMessages(hzidname(res) <- hzidname(x)) + suppressMessages(hzdesgnname(res) <- hzdesgnname(x)) + suppressMessages(hztexclname(res) <- hztexclname(x)) + + # there should be as many records in @site as there are profile IDs + pid.res <- profile_id(res) + site.res <- site(res)[[idname(res)]] + + if (length(pid.res) != length(site.res)) { + message("Some profiles have been removed from the collection.") + } + + # the order of profile_ids should be the same as in @site + if (!all(pid.res == site.res)) { + warning("profile ID order does not match order in @site", + call. = FALSE) + } + + return(res) + }) #### #### double bracket SoilProfileCollection methods diff --git a/man/singlebracket.Rd b/man/singlebracket.Rd index 8fd8bc6e5..1cfefe483 100644 --- a/man/singlebracket.Rd +++ b/man/singlebracket.Rd @@ -4,19 +4,27 @@ \alias{[,SoilProfileCollection-method} \title{Matrix/data.frame-like access to profiles and horizons in a SoilProfileCollection} \usage{ -\S4method{[}{SoilProfileCollection}(x, i, j) +\S4method{[}{SoilProfileCollection}(x, i, j, ..., drop = TRUE) } \arguments{ \item{x}{a SoilProfileCollection} \item{i}{a numeric or logical value denoting profile indices to select in a subset} -\item{j}{a numeric or logical value denoting profile indices to select in a subset} +\item{j}{a numeric or logical value denoting horizon indices to select in a subset} + +\item{...}{non-standard expressions to evaluate in a subset; currently supported: \code{.LAST} (last horizon in each profile)} + +\item{drop}{not used} } \description{ You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j]}. Subset operations are propagated to other slots when they result in removal of sites from a collection. - -\code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. - -\code{j} refers to the horizon or "slice" index. This index is most useful when either a) working with \code{slice}'d SoilProfileCollection or b) working with single-profile collections. \code{j} returns the layer in the specified index positions for all profiles in a collection. So, for instance, if \code{spc} contained 100 profiles, \code{spc[,2]} would return 100 profiles, but just the second horizon from each of the profiles ... assuming each profile had at least two horizons! The single horizon profiles would be dropped from the collection. +\itemize{ +\item \code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. +\item \code{j} refers to the horizon or "slice" index. This index is most useful when either a) working with \code{slice}'d SoilProfileCollection or b) working with single-profile collections. \code{j} returns the layer in the specified index positions for all profiles in a collection. +\item \code{...} is an area to specify an expression that is evaluated in the subset. Currently supported +\itemize{ +\item \code{.LAST}: return the last horizon from each profile. This uses \code{i} but ignores the regular \code{j} index. +} +} } From 808f55a9058d235c405237d822b98dff198bdb35 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Sun, 21 Feb 2021 09:42:15 -0800 Subject: [PATCH 07/10] add experimental support for .FIRST, .HZID and multiple special keywords --- R/SoilProfileCollection-operators.R | 103 +++++++++++++++++----------- misc/aqp2/test-k-keywords.R | 47 +++++++++++++ 2 files changed, 110 insertions(+), 40 deletions(-) create mode 100644 misc/aqp2/test-k-keywords.R diff --git a/R/SoilProfileCollection-operators.R b/R/SoilProfileCollection-operators.R index 876194925..21ab69767 100644 --- a/R/SoilProfileCollection-operators.R +++ b/R/SoilProfileCollection-operators.R @@ -39,21 +39,25 @@ setMethod("[", signature(x = "SoilProfileCollection", ksubflag <- FALSE # 2nd value is first user-supplied expression - ksub <- as.character(substitute(list(...)))[2] - - # handle special .LAST horizon keyword in "k" index - if (!is.na(ksub) & ksub == ".LAST") { - if (missing(j)) - j <- 9999 # dummy value to trigger j indexing - ksubflag <- TRUE # only support .LAST for now + kargs <- substitute(list(...)) + ksub <- as.character(kargs)[2:length(kargs)] + + # handle special keywords in "k" index + for(k in ksub) { + if (!is.na(k)) { + switch(k, + ".FIRST" = { + j <- 1 + }, + ".LAST" = { + ksubflag <- TRUE + }, + ".HZID" = { + ksubflag <- TRUE + }) + } } - # check for missing i and j - # if (missing(i) & missing(j) | is.na(ksub)) { - # stop('must provide either a profile index, horizon/slice index or a special expression such as .LAST', - # call. = FALSE) - # } - # convert to integer if (!missing(i)) { if (any(is.na(i))) { @@ -119,36 +123,47 @@ setMethod("[", signature(x = "SoilProfileCollection", p.ids <- s.all[[idname(x)]][unique(i)] # keep only the requested horizon data (filtered by profile ID) - h <- .as.data.frame.aqp(h, aqp_df_class(x))[h[[idname(x)]] %in% p.ids,] + h <- .as.data.frame.aqp(h, aqp_df_class(x)) - # keep only the requested site data, (filtered by profile ID) - s.i <- which(s.all[[idname(x)]] %in% p.ids) + # + if (ksubflag && ".HZID" %in% ksub) { - # need to use drop=FALSE when @site contains only a single column - s <- s.all[s.i, , drop = FALSE] + h <- h # do nothing, short circuit - # subset spatial data, but only if valid - if (validSpatialData(x)) { - sp <- x@sp[i] } else { - # copy empty SpatialPoints object - sp <- x@sp - } - # subset diagnostic data - d <- diagnostic_hz(x) - if (length(d) > 0) { - d <- d[which(d[[idname(x)]] %in% p.ids),] - } + h <- h[h[[idname(x)]] %in% p.ids,] + + # keep only the requested site data, (filtered by profile ID) + s.i <- which(s.all[[idname(x)]] %in% p.ids) + + # need to use drop=FALSE when @site contains only a single column + s <- s.all[s.i, , drop = FALSE] + + # subset spatial data, but only if valid + if (validSpatialData(x)) { + sp <- x@sp[i] + } else { + # copy empty SpatialPoints object + sp <- x@sp + } + + # subset diagnostic data + d <- diagnostic_hz(x) + if (length(d) > 0) { + d <- d[which(d[[idname(x)]] %in% p.ids),] + } + + # subset restriction data + r <- restrictions(x) + if (length(r) > 0) { + r <- r[which(r[[idname(x)]] %in% p.ids),] + } - # subset restriction data - r <- restrictions(x) - if (length(r) > 0) { - r <- r[which(r[[idname(x)]] %in% p.ids),] } # subset horizons/slices based on j --> only when j is given - if (!missing(j)) { + if (!missing(j) | ksubflag) { # faster replacement of j subsetting of horizon data # if (aqp_df_class(x) == "data.table") { @@ -168,16 +183,24 @@ setMethod("[", signature(x = "SoilProfileCollection", bylist <- list(h[[idn]]) names(bylist) <- idn - # figured out the data.table way to do this - # not using := or . anymore - # determine j indices to KEEP - if (ksubflag) { + if (ksubflag && ".LAST" %in% ksub) { # trigger special last horizon case j.idx <- h[, .I[.N], by = bylist]$V1 } else { - # get row indices of horizon data corresponding to j within profiles - j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 + if (missing(j)) { + # default is an index to each horizon + # note: i-subset happens here + j.idx <- seq_len(nrow(h))[h[[idname(x)]] %in% p.ids] + } else { + # get row indices of horizon data corresponding to j within profiles + j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 + } + } + + # short circuit for horizon-slot indices + if (ksubflag && ".HZID" %in% ksub) { + return(j.idx) } # determine which site indices to keep diff --git a/misc/aqp2/test-k-keywords.R b/misc/aqp2/test-k-keywords.R new file mode 100644 index 000000000..d0c36e1d4 --- /dev/null +++ b/misc/aqp2/test-k-keywords.R @@ -0,0 +1,47 @@ +# DEMO: base non-standard eval of keyword in ... "k-index": SPC[i, j, ...] +# version 1: add support for .LAST +# version 2: add support for .FIRST, .HZID and multiple special keywords + +library(aqp, warn = FALSE) + +data(sp4) +depths(sp4) <- id ~ top + bottom + +# .LAST +sp4[, , .LAST] +sp4[, , .HZID] +sp4[, , .LAST, .HZID] +sp4[, , .FIRST, .HZID] # this just sets j <- 1 +sp4[, 1000, .FIRST, .HZID] # this sets j <- 1; ignores j input if given +sp4[, 1000, .LAST, .HZID] # this ignores j input if given + +# horizon index of 2nd horizon in each profile +sp4[5:10, 2, .HZID] + +# this is more realistic, perhaps: +fivetoten <- sp4[5:10,] # or some more complicated subset() + +# third horizon ID, index to horizon data.frame +horizons(fivetoten)[fivetoten[,3,,.HZID],] + +# use it to implement #199 +getLastHorizonID_v1 <- function(x) { + res <- hzID(x[, , .LAST]) + names(res) <- profile_id(x) + return(res) +} + +# ~4x more efficient using j-index shortcut +getLastHorizonID_v2 <- function(x) { + res <- hzID(x)[x[, , .LAST, .HZID]] + names(res) <- profile_id(x) + return(res) +} + +bench::mark(getLastHorizonID_v1(sp4), + getLastHorizonID_v2(sp4)) + +# bigger <- data.table::rbindlist(lapply(1:100000, random_profile)) +# depths(bigger) <- id ~ top + bottom +# bench::mark(x <- getLastHorizonID_v1(bigger), +# x <- getLastHorizonID_v2(bigger)) From aa1149129754c72272d6dcf1b9cf276b7bd5da0a Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Sun, 21 Feb 2021 10:54:26 -0800 Subject: [PATCH 08/10] better logic for i+j+k --- R/SoilProfileCollection-operators.R | 33 +++++++++++++++-------------- misc/aqp2/test-k-keywords.R | 5 +++-- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/R/SoilProfileCollection-operators.R b/R/SoilProfileCollection-operators.R index 21ab69767..fbafaf31e 100644 --- a/R/SoilProfileCollection-operators.R +++ b/R/SoilProfileCollection-operators.R @@ -90,10 +90,6 @@ setMethod("[", signature(x = "SoilProfileCollection", # not special symbols like .LAST if (!missing(j) & !ksubflag) { - # handle special symbols in j index - # currently supported: - # - .LAST: last horizon index per profile - # AGB added logical handling to horizon index if (is.logical(j)) { j <- (1:length(x))[j] @@ -124,15 +120,15 @@ setMethod("[", signature(x = "SoilProfileCollection", # keep only the requested horizon data (filtered by profile ID) h <- .as.data.frame.aqp(h, aqp_df_class(x)) + pidx <- h[[idname(x)]] %in% p.ids + h <- h[pidx,] - # if (ksubflag && ".HZID" %in% ksub) { - h <- h # do nothing, short circuit + j.idx.allowed <- seq_len(nrow(x@horizons))[pidx] } else { - - h <- h[h[[idname(x)]] %in% p.ids,] + j.idx.allowed <- seq_len(nrow(h)) # keep only the requested site data, (filtered by profile ID) s.i <- which(s.all[[idname(x)]] %in% p.ids) @@ -175,24 +171,29 @@ setMethod("[", signature(x = "SoilProfileCollection", .I <- NULL V1 <- NULL - # data.table can do this much more efficiently - # if (requireNamespace("data.table", quietly = TRUE)) { idn <- idname(x) # by list @horizons idname (essentially iterating over profiles) bylist <- list(h[[idn]]) names(bylist) <- idn + # handle special symbols in j index + # currently supported: + # - .LAST: last horizon index per profile + # - .HZID: return horizon slot row index (short circuit) + # - .FIRST: first horizon index per profile + # the above can be combined. .LAST takes precedent over .FIRST. + # determine j indices to KEEP + + # default is an index to each horizon + j.idx <- j.idx.allowed + if (ksubflag && ".LAST" %in% ksub) { # trigger special last horizon case j.idx <- h[, .I[.N], by = bylist]$V1 } else { - if (missing(j)) { - # default is an index to each horizon - # note: i-subset happens here - j.idx <- seq_len(nrow(h))[h[[idname(x)]] %in% p.ids] - } else { + if (!missing(j)) { # get row indices of horizon data corresponding to j within profiles j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 } @@ -200,7 +201,7 @@ setMethod("[", signature(x = "SoilProfileCollection", # short circuit for horizon-slot indices if (ksubflag && ".HZID" %in% ksub) { - return(j.idx) + return(j.idx.allowed[j.idx]) } # determine which site indices to keep diff --git a/misc/aqp2/test-k-keywords.R b/misc/aqp2/test-k-keywords.R index d0c36e1d4..c4e79f303 100644 --- a/misc/aqp2/test-k-keywords.R +++ b/misc/aqp2/test-k-keywords.R @@ -12,6 +12,7 @@ sp4[, , .LAST] sp4[, , .HZID] sp4[, , .LAST, .HZID] sp4[, , .FIRST, .HZID] # this just sets j <- 1 +sp4[, 1, , .HZID] sp4[, 1000, .FIRST, .HZID] # this sets j <- 1; ignores j input if given sp4[, 1000, .LAST, .HZID] # this ignores j input if given @@ -31,7 +32,7 @@ getLastHorizonID_v1 <- function(x) { return(res) } -# ~4x more efficient using j-index shortcut +# ~3x more efficient using j-index shortcut getLastHorizonID_v2 <- function(x) { res <- hzID(x)[x[, , .LAST, .HZID]] names(res) <- profile_id(x) @@ -41,7 +42,7 @@ getLastHorizonID_v2 <- function(x) { bench::mark(getLastHorizonID_v1(sp4), getLastHorizonID_v2(sp4)) -# bigger <- data.table::rbindlist(lapply(1:100000, random_profile)) +# bigger <- data.table::rbindlist(lapply(1:1000, random_profile)) # depths(bigger) <- id ~ top + bottom # bench::mark(x <- getLastHorizonID_v1(bigger), # x <- getLastHorizonID_v2(bigger)) From 1bd73e90571f54a0ad6a7bd54e9bc993525b080f Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Wed, 24 Feb 2021 09:49:28 -0800 Subject: [PATCH 09/10] Convert demo to tests --- tests/testthat/test-SPC-k-keywords.R | 35 ++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tests/testthat/test-SPC-k-keywords.R diff --git a/tests/testthat/test-SPC-k-keywords.R b/tests/testthat/test-SPC-k-keywords.R new file mode 100644 index 000000000..be34d3176 --- /dev/null +++ b/tests/testthat/test-SPC-k-keywords.R @@ -0,0 +1,35 @@ +# base non-standard eval of keyword in ... "k-index": SPC[i, j, ...] +# support for .LAST, .FIRST, .HZID special keywords +context(".LAST, .FIRST, .HZID k-keywords for SoilProfileCollection objects") + +# define special symbols in global env +# (not needed for tests, but needed wherever they are used in package) +.FIRST <- NULL +.LAST <- NULL +.HZID <- NULL + +data(sp4) +depths(sp4) <- id ~ top + bottom + +# .LAST +expect_equal(length(sp4[, , .LAST]), 10) + +expect_equal(sp4[, , .HZID], 1:30) + +expect_equal(sp4[, , .LAST, .HZID], + c(4L, 6L, 9L, 13L, 16L, 18L, 20L, 22L, 27L, 30L)) +# .FIRST sets j <- 1 +expect_equal(sp4[, , .FIRST, .HZID], + sp4[, 1, , .HZID]) + +# .FIRST ignores j input if given +expect_equal(sp4[, 1000, .FIRST, .HZID], + sp4[, 1, , .HZID]) + +# .LAST ignores j input if given +expect_equal(sp4[, , .LAST, .HZID], + sp4[, 1000, .LAST, .HZID]) + +# horizon index of 2nd horizon in each profile +expect_equal(sp4[5:10, 2, .HZID], + c(15L, 18L, 20L, 22L, 24L, 29L)) From b1ea38faf5cd7012383be2b5bac601d6a7c3f727 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Wed, 24 Feb 2021 09:56:29 -0800 Subject: [PATCH 10/10] Docs --- R/SoilProfileCollection-operators.R | 9 ++++++--- man/singlebracket.Rd | 8 +++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/R/SoilProfileCollection-operators.R b/R/SoilProfileCollection-operators.R index fbafaf31e..bca484a1f 100644 --- a/R/SoilProfileCollection-operators.R +++ b/R/SoilProfileCollection-operators.R @@ -14,7 +14,7 @@ #' #' @aliases [,SoilProfileCollection-method #' -#' @description You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j]}. Subset operations are propagated to other slots when they result in removal of sites from a collection. +#' @description You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j, ...]}. Subset operations are propagated to other slots (such as diagnostics or spatial) when they result in removal of sites from a collection. #' #' - \code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. #' @@ -22,12 +22,15 @@ #' #' - `...` is an area to specify an expression that is evaluated in the subset. Currently supported #' -#' - `.LAST`: return the last horizon from each profile. This uses `i` but ignores the regular `j` index. +#' - `.LAST` (last horizon in each profile): return the last horizon from each profile. This uses `i` but ignores the regular `j` index. +#' - `.FIRST` (first horizon in each profile): return the last horizon from each profile. This uses `i` but ignores the regular `j` index. +#' - `.HZID` (horizon index not `SoilProfileCollection` result): return the horizon indices corresponding to `i`+`j`+`...` ("k") constraints #' #' @param x a SoilProfileCollection #' @param i a numeric or logical value denoting profile indices to select in a subset #' @param j a numeric or logical value denoting horizon indices to select in a subset -#' @param ... non-standard expressions to evaluate in a subset; currently supported: `.LAST` (last horizon in each profile) +#' @param ... non-standard expressions to evaluate in a subset +#' #' @param drop not used #' @rdname singlebracket # SPC extract: "[" '[' single bracket SPC object extract method diff --git a/man/singlebracket.Rd b/man/singlebracket.Rd index 1cfefe483..33a40a97f 100644 --- a/man/singlebracket.Rd +++ b/man/singlebracket.Rd @@ -13,18 +13,20 @@ \item{j}{a numeric or logical value denoting horizon indices to select in a subset} -\item{...}{non-standard expressions to evaluate in a subset; currently supported: \code{.LAST} (last horizon in each profile)} +\item{...}{non-standard expressions to evaluate in a subset} \item{drop}{not used} } \description{ -You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j]}. Subset operations are propagated to other slots when they result in removal of sites from a collection. +You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j, ...]}. Subset operations are propagated to other slots (such as diagnostics or spatial) when they result in removal of sites from a collection. \itemize{ \item \code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. \item \code{j} refers to the horizon or "slice" index. This index is most useful when either a) working with \code{slice}'d SoilProfileCollection or b) working with single-profile collections. \code{j} returns the layer in the specified index positions for all profiles in a collection. \item \code{...} is an area to specify an expression that is evaluated in the subset. Currently supported \itemize{ -\item \code{.LAST}: return the last horizon from each profile. This uses \code{i} but ignores the regular \code{j} index. +\item \code{.LAST} (last horizon in each profile): return the last horizon from each profile. This uses \code{i} but ignores the regular \code{j} index. +\item \code{.FIRST} (first horizon in each profile): return the last horizon from each profile. This uses \code{i} but ignores the regular \code{j} index. +\item \code{.HZID} (horizon index not \code{SoilProfileCollection} result): return the horizon indices corresponding to \code{i}+\code{j}+\code{...} ("k") constraints } } }