Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ak/check derived task ids/189 #191

Merged
merged 7 commits into from
Jan 15, 2025
62 changes: 42 additions & 20 deletions R/check_tbl_value_col_ascending.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
#' If not, the check is skipped and a `<message/check_info>` condition class
#' object is returned.
#'
#' @inherit check_tbl_value_col params
#' @inherit check_tbl_values params
#' @inherit check_tbl_col_types return
#' @export
check_tbl_value_col_ascending <- function(tbl, file_path, hub_path, round_id,
derived_task_ids = get_hub_derived_task_ids(hub_path)) {
check_output_types <- intersect(c("cdf", "quantile"), unique(tbl[["output_type"]]))

# Exit early if there are no values to check
no_values_to_check <- all(!c("cdf", "quantile") %in% tbl[["output_type"]])
if (no_values_to_check) {
if (length(check_output_types) == 0L) {
return(
capture_check_info(
file_path,
Expand All @@ -27,28 +27,25 @@ check_tbl_value_col_ascending <- function(tbl, file_path, hub_path, round_id,
}
annakrystalli marked this conversation as resolved.
Show resolved Hide resolved

config_tasks <- hubUtils::read_config(hub_path, "tasks")
not_value <- names(tbl) != "value"
tbl[not_value] <- hubData::coerce_to_character(tbl[not_value])

if (!is.null(derived_task_ids)) {
tbl[derived_task_ids] <- NA_character_
}
round_output_types <- get_round_output_type_names(config_tasks, round_id)
only_cdf_or_quantile <- intersect(c("cdf", "quantile"), round_output_types)
# FIX for <https://github.com/hubverse-org/hubValidations/issues/78>
# This function uses an inner join to auto-sort the table by model task,
# splitting by output type. We can use that to loop through the check.
output_type_tbls <- match_tbl_to_model_task(
tbl,
config_tasks = config_tasks,
round_id = round_id,
output_types = only_cdf_or_quantile,
derived_task_ids = derived_task_ids

# Check that values are non-decreasing for each output type separately to reduce
# memory pressure
error_tbl <- purrr::map(
check_output_types,
\(.x) {
check_values_ascending_by_output_type(
.x, tbl,
config_tasks, round_id,
derived_task_ids
)
}
) %>%
purrr::compact()
error_tbl <- purrr::map(output_type_tbls, check_values_ascending) %>%
purrr::list_rbind()


check <- nrow(error_tbl) == 0L

if (check) {
Expand All @@ -70,9 +67,34 @@ check_tbl_value_col_ascending <- function(tbl, file_path, hub_path, round_id,
)
}

#' Check that values for each model task in specific output types are ascending
#'
#' This function allows us to map over individual output types one at a time to
#' reduce memory pressure.
#' @param output_type the output type(s) to check. Must be a character vector
#' @noRd
check_values_ascending_by_output_type <- function(output_type, tbl,
config_tasks, round_id,
derived_task_ids) {
# FIX for <https://github.com/hubverse-org/hubValidations/issues/78>
# This function uses an inner join to auto-sort the table by model task,
# splitting by output type. We can use that to loop through the check.
output_type_tbls <- match_tbl_to_model_task(
tbl,
config_tasks = config_tasks,
round_id = round_id,
output_types = output_type,
derived_task_ids = derived_task_ids
) %>%
purrr::compact()

purrr::map(output_type_tbls, check_values_ascending) %>%
purrr::list_rbind()
annakrystalli marked this conversation as resolved.
Show resolved Hide resolved
}

#' Check that values for each model task are ascending
#'
#' @param tbl a table with a single output type
#' @param tbl an all character table with a single output type
#' @return
#' - If the check succeeds, and all values are non-decreasing: NULL
#' - If the check fails, a summary table showing the model tasks that
Expand Down
2 changes: 1 addition & 1 deletion R/match_tbl_to_model_task.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ match_tbl_to_model_task <- function(tbl, config_tasks, round_id,
config_tasks,
round_id = round_id,
required_vals_only = FALSE,
all_character = TRUE,
all_character = all_character,
annakrystalli marked this conversation as resolved.
Show resolved Hide resolved
as_arrow_table = FALSE,
bind_model_tasks = FALSE,
output_types = output_types,
Expand Down
2 changes: 1 addition & 1 deletion R/validate_model_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,

checks$value_col_non_desc <- try_check(
check_tbl_value_col_ascending(
tbl,
tbl_chr,
file_path = file_path,
hub_path = hub_path,
round_id = round_id,
Expand Down
2 changes: 1 addition & 1 deletion man/check_tbl_value_col_ascending.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 25 additions & 10 deletions tests/testthat/test-check_tbl_value_col_ascending.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ test_that("check_tbl_value_col_ascending works", {
hub_path <- system.file("testhubs/simple", package = "hubValidations")
file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl <- read_model_out_file(file_path, hub_path, coerce_types = "chr")

expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Expand All @@ -12,7 +12,7 @@ test_that("check_tbl_value_col_ascending works", {
file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet"
file_meta <- parse_file_name(file_path)

tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl <- read_model_out_file(file_path, hub_path, coerce_types = "chr")

expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Expand All @@ -22,7 +22,7 @@ test_that("check_tbl_value_col_ascending works", {
test_that("check_tbl_value_col_ascending works when output type IDs not ordered", {
hub_path <- test_path("testdata/hub-unordered/")
file_path <- "ISI-NotOrdered/2024-01-10-ISI-NotOrdered.csv"
tbl <- read_model_out_file(file_path, hub_path)
tbl <- read_model_out_file(file_path, hub_path, coerce_types = "chr")
file_meta <- parse_file_name(file_path)
expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Expand All @@ -33,7 +33,7 @@ test_that("check_tbl_value_col_ascending errors correctly", {
hub_path <- system.file("testhubs/simple", package = "hubValidations")
file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl <- read_model_out_file(file_path, hub_path, coerce_types = "chr")

tbl$value[c(1, 10)] <- 150

Expand All @@ -44,7 +44,7 @@ test_that("check_tbl_value_col_ascending errors correctly", {
hub_path <- system.file("testhubs/flusight", package = "hubUtils")
file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl <- read_model_out_file(file_path, hub_path, coerce_types = "chr")
tbl_error <- tbl
# TODO: 2025-01-07 investigate the purpose of adding an invalid target, which
# causes the test to fail
Expand Down Expand Up @@ -72,7 +72,7 @@ test_that("check_tbl_value_col_ascending skips correctly", {
hub_path <- system.file("testhubs/simple", package = "hubValidations")
file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl <- read_model_out_file(file_path, hub_path, coerce_types = "chr")
tbl <- tbl[tbl$output_type == "mean", ]

expect_snapshot(
Expand Down Expand Up @@ -116,9 +116,10 @@ test_that("(#78) check_tbl_value_col_ascending will sort even if the data doesn'
convert_to_cdf <- function(x) {
ifelse(x == "quantile", "cdf", x)
}
tbl <- hubValidations::read_model_out_file(file_path, hub_path) %>%
tbl <- read_model_out_file(file_path, hub_path) %>%
dplyr::mutate(output_type_id = make_unsortable(.data[["output_type_id"]])) %>%
dplyr::mutate(output_type = convert_to_cdf(.data[["output_type"]]))
dplyr::mutate(output_type = convert_to_cdf(.data[["output_type"]])) %>%
hubData::coerce_to_character()

# validating when it is sorted -----------------------------------------
res <- check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Expand Down Expand Up @@ -161,14 +162,28 @@ test_that("(#78) check_tbl_value_col_ascending will sort even if the data doesn'
expect_equal(actual, expected, ignore_attr = TRUE)
})


test_that("(#78) check_tbl_value_col_ascending works when output type IDs differ by target", {
hub_path <- test_path("testdata/hub-diff-otid-per-task/")
file_path <- "ISI-NotOrdered/2024-01-10-ILI-model.csv"
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl <- read_model_out_file(file_path, hub_path, coerce_types = "chr")
file_meta <- parse_file_name(file_path)

res_ok <- check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
expect_s3_class(res_ok, "check_success")
expect_null(res_ok$error_tbl)
})

test_that("(#189) check_tbl_value_col_ascending ignores derived task IDs", {
hub_path <- test_path("testdata/hub-177")
file_path <- "FluSight-baseline/2024-12-14-FluSight-baseline.parquet"
tbl <- read_model_out_file(file_path, hub_path, coerce_types = "chr")
file_meta <- parse_file_name(file_path)

# Introduce invalid value to derived task id that should be ignored when using
# `derived_task_ids`.
tbl[1, "target_end_date"] <- "random_date"
annakrystalli marked this conversation as resolved.
Show resolved Hide resolved

res_ok <- check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
expect_s3_class(res_ok, "check_success")
expect_null(res_ok$error_tbl)
})