From c6635a8e7fdc14c3d40cfad09cd9a142e092c98a Mon Sep 17 00:00:00 2001 From: Cam Race <52536248+cjrace@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:50:43 +0100 Subject: [PATCH] Fix/41 48 download dataset preview (#50) * Update notes around using download_dataset as a previewing function * I don't know why integer checking isn't in base R * update download to preview and ditch httr for httr2 combined with readr * Increment version number to 0.3.1.9000 * response to PR comments --- DESCRIPTION | 6 +- NAMESPACE | 2 +- NEWS.md | 4 +- R/{download_dataset.R => preview_dataset.R} | 66 +++++++++++++-------- R/utils.R | 23 +++++++ _pkgdown.yml | 2 +- man/check_integer.Rd | 25 ++++++++ man/download_dataset.Rd | 48 --------------- man/preview_dataset.Rd | 62 +++++++++++++++++++ tests/testthat/test-download_dataset.R | 12 ---- tests/testthat/test-preview_dataset.R | 66 +++++++++++++++++++++ vignettes/ees-api-workflow.Rmd | 42 +++++++------ 12 files changed, 251 insertions(+), 107 deletions(-) rename R/{download_dataset.R => preview_dataset.R} (52%) create mode 100644 man/check_integer.Rd delete mode 100644 man/download_dataset.Rd create mode 100644 man/preview_dataset.Rd delete mode 100644 tests/testthat/test-download_dataset.R create mode 100644 tests/testthat/test-preview_dataset.R diff --git a/DESCRIPTION b/DESCRIPTION index 2caeb9e..5bcac5f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: eesyapi Title: EES-y API -Version: 0.3.1 +Version: 0.3.1.9000 Authors@R: c( person("Rich", "Bielby", , "richard.bielby@education.gov.uk", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9070-9969")), @@ -17,18 +17,20 @@ Imports: data.table, dplyr, httr, + httr2, jsonlite, magrittr, + readr, rlang, stringr Suggests: knitr, - readr, rmarkdown, testthat (>= 3.0.0) VignetteBuilder: knitr Config/testthat/edition: 3 Encoding: UTF-8 +Language: en-GB Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index 0a2ddcb..5bb4807 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,7 +4,6 @@ export(api_url) export(api_url_pages) export(api_url_query) export(convert_api_filter_type) -export(download_dataset) export(example_data_raw) export(example_geography_query) export(example_id) @@ -33,6 +32,7 @@ export(parse_tojson_params) export(parse_tojson_time_periods) export(parse_tourl_filter_in) export(post_dataset) +export(preview_dataset) export(query_dataset) export(validate_ees_filter_type) export(validate_ees_id) diff --git a/NEWS.md b/NEWS.md index 8bfc971..030035c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,9 @@ +# eesyapi (development version) + # eesyapi 0.3.1 * Added parsing of SQIDs in retrieved data to provide human readable content -* Created function, `download_dataset()`, to connect to csv endpoint for downloading data set csv file +* Created function, `preview_dataset()`, to connect to csv endpoint for downloading data set csv file * Added first draft of example workflow for querying a data set # eesyapi 0.3.0 diff --git a/R/download_dataset.R b/R/preview_dataset.R similarity index 52% rename from R/download_dataset.R rename to R/preview_dataset.R index 9227bf3..1cb07d8 100644 --- a/R/download_dataset.R +++ b/R/preview_dataset.R @@ -1,38 +1,51 @@ -#' Download the raw CSV for an API data set +#' Preview the raw CSV for an API data set #' -#' This gives a super quick way to just fetch the whole file in a human -#' readable format. +#' This gives a super quick way to just fetch the file in a human readable +#' format. #' #' @description #' This function is mostly designed for exploring the API, and is unlikely to #' be suitable for long term production use. #' -#' There is no filtering down of the file so you will always get the whole file -#' and in some instances this may be very large. +#' You can set the number of rows to preview using the n_max parameter. This +#' uses the n_max from `readr::read_csv()` under the hood. #' #' As there are no IDs involved, this is brittle and code relying on this #' function will likely break whenever there is renaming of variables or items #' in the data. #' #' It is recommended to take the time to set up custom queries using the -#' `query_dataset()` function instead. If you are using this function for more -#' than exploratory purposes, make sure you subscribe to the data set you're -#' downloading and then keep track of any updates to the data. +#' `query_dataset()` function instead. +#' +#' If you are using this function for more than exploratory purposes, make +#' sure you subscribe to the data set you're downloading and then keep track +#' of any updates to the data. #' #' @param dataset_id ID of data set #' @param dataset_version Version number of data set #' @param api_version EES API version -#' @param verbose Run with additional contextual messaging, logical, default = FALSE +#' @param n_max maximum number of rows to preview, 10 by default, Inf will get +#' all available rows +#' @param verbose Run with additional contextual messaging, logical, +#' default = FALSE #' #' @return data.frame #' @export #' #' @examples -#' download_dataset(example_id("dataset")) -download_dataset <- function( +#' # Preview first 10 rows +#' preview_dataset(example_id("dataset")) +#' +#' # Get 2 rows +#' preview_dataset(example_id("dataset"), n_max = 2) +#' +#' # Get all rows +#' preview_dataset(example_id("dataset"), n_max = Inf) +preview_dataset <- function( dataset_id, dataset_version = NULL, api_version = NULL, + n_max = 10, verbose = FALSE) { # Validation ---------------------------------------------------------------- if (!is.null(dataset_version)) { @@ -48,6 +61,12 @@ download_dataset <- function( stop("verbose must be a logical value, either TRUE or FALSE") } + if (n_max != Inf) { + if (!check_integer(n_max)) { + stop("n_max must be a positive integer value, e.g. 15, or Inf") + } + } + eesyapi::validate_ees_id(dataset_id, level = "dataset") # Generate query ------------------------------------------------------------ @@ -57,25 +76,24 @@ download_dataset <- function( verbose = verbose ) + # Check we can request successfully ----------------------------------------- toggle_message("Requesting data...", verbose = verbose) - response <- httr::GET(query_url) + response <- query_url |> + httr2::request() |> + httr2::req_perform() eesyapi::http_request_error(response, verbose = verbose) - toggle_message("Parsing response...", verbose = verbose) - - # Parse into data.frame ----------------------------------------------------- - output <- httr::content( - response, - - # All EES CSVs should be UTF-8 and are validated on import - encoding = "UTF-8", + # Read in the CSV ----------------------------------------------------------- + toggle_message("Reading response...", verbose = verbose) - # httr uses read_csv() underneath, controlling read_csv() verbosity - show_col_types = verbose, - progress = verbose - ) |> + output <- query_url |> + readr::read_csv( + show_col_types = verbose, + progress = verbose, + n_max = n_max + ) |> as.data.frame() return(output) diff --git a/R/utils.R b/R/utils.R index f3a0b84..5a350f1 100644 --- a/R/utils.R +++ b/R/utils.R @@ -16,3 +16,26 @@ toggle_message <- function(..., verbose) { message(...) } } + +#' Check if a value is an integer +#' +#' is.integer checks the object class, not the value, so credit to VitoshKa +#' on stack overflow for the core of this function... +#' +#' https://stackoverflow.com/questions/3476782/check-if-the-number-is-integer +#' +#' looks like it's been adopted in installr too, avoiding needing that as a +#' dependency by putting the code we need here. +#' +#' @param x a value to test +#' +#' @return logical, false if not an integer, true if an integer +#' @keywords internal +check_integer <- function(x) { + if (!is.double(x)) { + # Return early if wrapped in quotes + return(FALSE) + } else { + !grepl("[^[:digit:]]", format(x, digits = 20, scientific = FALSE)) + } +} diff --git a/_pkgdown.yml b/_pkgdown.yml index d74a49a..8e5df8e 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -12,7 +12,7 @@ reference: - get_publications - get_data_catalogue - get_meta - - download_dataset + - preview_dataset - query_dataset - title: Support for generating API URLs and interpreting responses diff --git a/man/check_integer.Rd b/man/check_integer.Rd new file mode 100644 index 0000000..2edbeed --- /dev/null +++ b/man/check_integer.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{check_integer} +\alias{check_integer} +\title{Check if a value is an integer} +\usage{ +check_integer(x) +} +\arguments{ +\item{x}{a value to test} +} +\value{ +logical, false if not an integer, true if an integer +} +\description{ +is.integer checks the object class, not the value, so credit to VitoshKa +on stack overflow for the core of this function... +} +\details{ +https://stackoverflow.com/questions/3476782/check-if-the-number-is-integer + +looks like it's been adopted in installr too, avoiding needing that as a +dependency by putting the code we need here. +} +\keyword{internal} diff --git a/man/download_dataset.Rd b/man/download_dataset.Rd deleted file mode 100644 index c39dd0e..0000000 --- a/man/download_dataset.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/download_dataset.R -\name{download_dataset} -\alias{download_dataset} -\title{Download the raw CSV for an API data set} -\usage{ -download_dataset( - dataset_id, - dataset_version = NULL, - api_version = NULL, - verbose = FALSE -) -} -\arguments{ -\item{dataset_id}{ID of data set} - -\item{dataset_version}{Version number of data set} - -\item{api_version}{EES API version} - -\item{verbose}{Run with additional contextual messaging, logical, default = FALSE} -} -\value{ -data.frame -} -\description{ -This function is mostly designed for exploring the API, and is unlikely to -be suitable for long term production use. - -There is no filtering down of the file so you will always get the whole file -and in some instances this may be very large. - -As there are no IDs involved, this is brittle and code relying on this -function will likely break whenever there is renaming of variables or items -in the data. - -It is recommended to take the time to set up custom queries using the -\code{query_dataset()} function instead. If you are using this function for more -than exploratory purposes, make sure you subscribe to the data set you're -downloading and then keep track of any updates to the data. -} -\details{ -This gives a super quick way to just fetch the whole file in a human -readable format. -} -\examples{ -download_dataset(example_id("dataset")) -} diff --git a/man/preview_dataset.Rd b/man/preview_dataset.Rd new file mode 100644 index 0000000..0d06b94 --- /dev/null +++ b/man/preview_dataset.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preview_dataset.R +\name{preview_dataset} +\alias{preview_dataset} +\title{Preview the raw CSV for an API data set} +\usage{ +preview_dataset( + dataset_id, + dataset_version = NULL, + api_version = NULL, + n_max = 10, + verbose = FALSE +) +} +\arguments{ +\item{dataset_id}{ID of data set} + +\item{dataset_version}{Version number of data set} + +\item{api_version}{EES API version} + +\item{n_max}{maximum number of rows to preview, 10 by default, Inf will get +all available rows} + +\item{verbose}{Run with additional contextual messaging, logical, +default = FALSE} +} +\value{ +data.frame +} +\description{ +This function is mostly designed for exploring the API, and is unlikely to +be suitable for long term production use. + +You can set the number of rows to preview using the n_max parameter. This +uses the n_max from \code{readr::read_csv()} under the hood. + +As there are no IDs involved, this is brittle and code relying on this +function will likely break whenever there is renaming of variables or items +in the data. + +It is recommended to take the time to set up custom queries using the +\code{query_dataset()} function instead. + +If you are using this function for more than exploratory purposes, make +sure you subscribe to the data set you're downloading and then keep track +of any updates to the data. +} +\details{ +This gives a super quick way to just fetch the file in a human readable +format. +} +\examples{ +# Preview first 10 rows +preview_dataset(example_id("dataset")) + +# Get 2 rows +preview_dataset(example_id("dataset"), n_max = 2) + +# Get all rows +preview_dataset(example_id("dataset"), n_max = Inf) +} diff --git a/tests/testthat/test-download_dataset.R b/tests/testthat/test-download_dataset.R deleted file mode 100644 index 884af3d..0000000 --- a/tests/testthat/test-download_dataset.R +++ /dev/null @@ -1,12 +0,0 @@ -test_that("Returns a data frame and has no errors", { - expect_true(class(download_dataset(example_id("dataset"))) == "data.frame") - expect_no_error(download_dataset(example_id("dataset"))) -}) - -test_that("Incorrect inputs cause errors", { - expect_error(download_dataset("ark-of-the-covenent")) - expect_error( - download_dataset(example_id("dataset"), verbose = "chatty"), - "verbose must be a logical value, either TRUE or FALSE" - ) -}) diff --git a/tests/testthat/test-preview_dataset.R b/tests/testthat/test-preview_dataset.R new file mode 100644 index 0000000..517413a --- /dev/null +++ b/tests/testthat/test-preview_dataset.R @@ -0,0 +1,66 @@ +test_that("Returns a data frame and has no errors", { + expect_true(class(preview_dataset(example_id("dataset"))) == "data.frame") + expect_no_error(preview_dataset(example_id("dataset"))) +}) + +test_that("Incorrect inputs cause errors", { + expect_error(preview_dataset("ark-of-the-covenent")) + + expect_error( + preview_dataset(example_id("dataset"), verbose = "chatty"), + "verbose must be a logical value, either TRUE or FALSE" + ) + + expect_error( + preview_dataset(example_id("dataset"), n_max = 20.2), + "n_max must be a positive integer value, e.g. 15, or Inf" + ) + + expect_error( + preview_dataset(example_id("dataset"), n_max = "20"), + "n_max must be a positive integer value, e.g. 15, or Inf" + ) + + expect_error( + preview_dataset(example_id("dataset"), n_max = -2), + "n_max must be a positive integer value, e.g. 15, or Inf" + ) + + expect_error( + preview_dataset(example_id("dataset"), n_max = "fifty"), + "n_max must be a positive integer value, e.g. 15, or Inf" + ) + + expect_error( + preview_dataset(example_id("dataset"), n_max = -Inf), + "n_max must be a positive integer value, e.g. 15, or Inf" + ) +}) + +test_that("only previews a specified number of rows", { + expect_equal( + preview_dataset(example_id("dataset")) |> nrow(), + 10 + ) + + expect_equal( + preview_dataset(example_id("dataset"), n_max = 0) |> nrow(), + 0 + ) + + expect_equal( + preview_dataset(example_id("dataset"), n_max = 42) |> nrow(), + 42 + ) +}) + +test_that("returns all rows", { + # In truth, the test doesn't check all rows as we don't reliably know that + # number, so just that it returns more than an arbitrary number above the + # default + expect_gt( + preview_dataset(example_id("dataset"), n_max = Inf) |> + nrow(), + 200 + ) +}) diff --git a/vignettes/ees-api-workflow.Rmd b/vignettes/ees-api-workflow.Rmd index 9fda99b..f769321 100644 --- a/vignettes/ees-api-workflow.Rmd +++ b/vignettes/ees-api-workflow.Rmd @@ -75,6 +75,30 @@ taken to it's details page in the catalogue. On this page (pictured below), you' ![An example of an API data set's details](ees-data-catalogue-details-api-data-set.png) +## Preview underlying data + +In the EES API you have the option to preview the underlying data for a given data set. +This bypasses the use of IDs, making it a simpler short-term option to explore the available data. + +Using `eesyapi`, you can preview a data set using `preview_dataset()` and providing the data set +ID as follows: + +``` +eesyapi::preview_dataset("7c0e9201-c7c0-ff73-bee4-304e731ec0e6") +``` + +This will return a data frame containing the first 10 rows of the data in the form that DfE +analysts uploaded it to EES in. + +While you can set n_max to Inf, and get the whole data set using this, you should avoid relying on +the `preview_dataset()` function in your pipelines or dashboards and instead use it to explore the +data as you create your own custom query using `query_dataset()`. + +If you only read the whole file in using the preview you risk: + +* code breaking whenever element labels change +* reading in too much data into active memory, causing R to slow or even crash + ## Getting the data set meta data When querying a data set via the API, column names (indicators and filters) and the options available within columns (filter items) are referenced @@ -183,24 +207,6 @@ eesyapi::get_meta(dataset_id = "7c0e9201-c7c0-ff73-bee4-304e731ec0e6") |> ## Retrieving data from a data set -### Downloading an entire data set in a single file - -Whilst specific queries (as detailed in the following sections) are best used for automated pipeline -workflows, the EES API also offers the option -to just download the entirety of the underlying data file for a given data set. This bypasses the -use of IDs, making it a simpler short-term option to retrieve data, but with the risk of creating -breaking changes in automated pipelines should element labels change. - -Using `eesyapi`, you can retrieve a data set using `download_dataset()` and providing the data set -ID as follows: - -``` -download_dataset("7c0e9201-c7c0-ff73-bee4-304e731ec0e6") -``` - -This will return a data frame containing the data in the form that DfE analysts uploaded it to -EES in. - ### Using `query_dataset()` The recommended go-to option for retrieving data is the `query_dataset()` function. This provides