From c6635a8e7fdc14c3d40cfad09cd9a142e092c98a Mon Sep 17 00:00:00 2001
From: Cam Race <52536248+cjrace@users.noreply.github.com>
Date: Tue, 22 Oct 2024 14:50:43 +0100
Subject: [PATCH] Fix/41 48 download dataset preview (#50)

* Update notes around using download_dataset as a previewing function

* I don't know why integer checking isn't in base R

* update download to preview and ditch httr for httr2 combined with readr

* Increment version number to 0.3.1.9000

* response to PR comments
---
 DESCRIPTION                                 |  6 +-
 NAMESPACE                                   |  2 +-
 NEWS.md                                     |  4 +-
 R/{download_dataset.R => preview_dataset.R} | 66 +++++++++++++--------
 R/utils.R                                   | 23 +++++++
 _pkgdown.yml                                |  2 +-
 man/check_integer.Rd                        | 25 ++++++++
 man/download_dataset.Rd                     | 48 ---------------
 man/preview_dataset.Rd                      | 62 +++++++++++++++++++
 tests/testthat/test-download_dataset.R      | 12 ----
 tests/testthat/test-preview_dataset.R       | 66 +++++++++++++++++++++
 vignettes/ees-api-workflow.Rmd              | 42 +++++++------
 12 files changed, 251 insertions(+), 107 deletions(-)
 rename R/{download_dataset.R => preview_dataset.R} (52%)
 create mode 100644 man/check_integer.Rd
 delete mode 100644 man/download_dataset.Rd
 create mode 100644 man/preview_dataset.Rd
 delete mode 100644 tests/testthat/test-download_dataset.R
 create mode 100644 tests/testthat/test-preview_dataset.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 2caeb9e..5bcac5f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: eesyapi
 Title: EES-y API
-Version: 0.3.1
+Version: 0.3.1.9000
 Authors@R: c(
     person("Rich", "Bielby", , "richard.bielby@education.gov.uk", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-9070-9969")),
@@ -17,18 +17,20 @@ Imports:
     data.table,
     dplyr,
     httr,
+    httr2,
     jsonlite,
     magrittr,
+    readr,
     rlang,
     stringr
 Suggests: 
     knitr,
-    readr,
     rmarkdown,
     testthat (>= 3.0.0)
 VignetteBuilder: 
     knitr
 Config/testthat/edition: 3
 Encoding: UTF-8
+Language: en-GB
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
index 0a2ddcb..5bb4807 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,7 +4,6 @@ export(api_url)
 export(api_url_pages)
 export(api_url_query)
 export(convert_api_filter_type)
-export(download_dataset)
 export(example_data_raw)
 export(example_geography_query)
 export(example_id)
@@ -33,6 +32,7 @@ export(parse_tojson_params)
 export(parse_tojson_time_periods)
 export(parse_tourl_filter_in)
 export(post_dataset)
+export(preview_dataset)
 export(query_dataset)
 export(validate_ees_filter_type)
 export(validate_ees_id)
diff --git a/NEWS.md b/NEWS.md
index 8bfc971..030035c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,9 @@
+# eesyapi (development version)
+
 # eesyapi 0.3.1
 
 * Added parsing of SQIDs in retrieved data to provide human readable content
-* Created function, `download_dataset()`, to connect to csv endpoint for downloading data set csv file
+* Created function, `preview_dataset()`, to connect to csv endpoint for downloading data set csv file
 * Added first draft of example workflow for querying a data set
 
 # eesyapi 0.3.0
diff --git a/R/download_dataset.R b/R/preview_dataset.R
similarity index 52%
rename from R/download_dataset.R
rename to R/preview_dataset.R
index 9227bf3..1cb07d8 100644
--- a/R/download_dataset.R
+++ b/R/preview_dataset.R
@@ -1,38 +1,51 @@
-#' Download the raw CSV for an API data set
+#' Preview the raw CSV for an API data set
 #'
-#' This gives a super quick way to just fetch the whole file in a human
-#' readable format.
+#' This gives a super quick way to just fetch the file in a human readable
+#' format.
 #'
 #' @description
 #' This function is mostly designed for exploring the API, and is unlikely to
 #' be suitable for long term production use.
 #'
-#' There is no filtering down of the file so you will always get the whole file
-#' and in some instances this may be very large.
+#' You can set the number of rows to preview using the n_max parameter. This
+#' uses the n_max from `readr::read_csv()` under the hood.
 #'
 #' As there are no IDs involved, this is brittle and code relying on this
 #' function will likely break whenever there is renaming of variables or items
 #' in the data.
 #'
 #' It is recommended to take the time to set up custom queries using the
-#' `query_dataset()` function instead. If you are using this function for more
-#' than exploratory purposes, make sure you subscribe to the data set you're
-#' downloading and then keep track of any updates to the data.
+#' `query_dataset()` function instead.
+#'
+#' If you are using this function for more than exploratory purposes, make
+#' sure you subscribe to the data set you're downloading and then keep track
+#' of any updates to the data.
 #'
 #' @param dataset_id ID of data set
 #' @param dataset_version Version number of data set
 #' @param api_version EES API version
-#' @param verbose Run with additional contextual messaging, logical, default = FALSE
+#' @param n_max maximum number of rows to preview, 10 by default, Inf will get
+#'  all available rows
+#' @param verbose Run with additional contextual messaging, logical,
+#' default = FALSE
 #'
 #' @return data.frame
 #' @export
 #'
 #' @examples
-#' download_dataset(example_id("dataset"))
-download_dataset <- function(
+#' # Preview first 10 rows
+#' preview_dataset(example_id("dataset"))
+#'
+#' # Get 2 rows
+#' preview_dataset(example_id("dataset"), n_max = 2)
+#'
+#' # Get all rows
+#' preview_dataset(example_id("dataset"), n_max = Inf)
+preview_dataset <- function(
     dataset_id,
     dataset_version = NULL,
     api_version = NULL,
+    n_max = 10,
     verbose = FALSE) {
   # Validation ----------------------------------------------------------------
   if (!is.null(dataset_version)) {
@@ -48,6 +61,12 @@ download_dataset <- function(
     stop("verbose must be a logical value, either TRUE or FALSE")
   }
 
+  if (n_max != Inf) {
+    if (!check_integer(n_max)) {
+      stop("n_max must be a positive integer value, e.g. 15, or Inf")
+    }
+  }
+
   eesyapi::validate_ees_id(dataset_id, level = "dataset")
 
   # Generate query ------------------------------------------------------------
@@ -57,25 +76,24 @@ download_dataset <- function(
     verbose = verbose
   )
 
+  # Check we can request successfully -----------------------------------------
   toggle_message("Requesting data...", verbose = verbose)
 
-  response <- httr::GET(query_url)
+  response <- query_url |>
+    httr2::request() |>
+    httr2::req_perform()
 
   eesyapi::http_request_error(response, verbose = verbose)
 
-  toggle_message("Parsing response...", verbose = verbose)
-
-  # Parse into data.frame -----------------------------------------------------
-  output <- httr::content(
-    response,
-
-    # All EES CSVs should be UTF-8 and are validated on import
-    encoding = "UTF-8",
+  # Read in the CSV -----------------------------------------------------------
+  toggle_message("Reading response...", verbose = verbose)
 
-    # httr uses read_csv() underneath, controlling read_csv() verbosity
-    show_col_types = verbose,
-    progress = verbose
-  ) |>
+  output <- query_url |>
+    readr::read_csv(
+      show_col_types = verbose,
+      progress = verbose,
+      n_max = n_max
+    ) |>
     as.data.frame()
 
   return(output)
diff --git a/R/utils.R b/R/utils.R
index f3a0b84..5a350f1 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -16,3 +16,26 @@ toggle_message <- function(..., verbose) {
     message(...)
   }
 }
+
+#' Check if a value is an integer
+#'
+#' is.integer checks the object class, not the value, so credit to VitoshKa
+#' on stack overflow for the core of this function...
+#'
+#' https://stackoverflow.com/questions/3476782/check-if-the-number-is-integer
+#'
+#' looks like it's been adopted in installr too, avoiding needing that as a
+#' dependency by putting the code we need here.
+#'
+#' @param x a value to test
+#'
+#' @return logical, false if not an integer, true if an integer
+#' @keywords internal
+check_integer <- function(x) {
+  # Accept integer (10L) and double (10) storage; reject non-numeric input
+  # such as a quoted "10". Any non-digit in the decimal rendering fails.
+  if (!is.numeric(x)) {
+    return(FALSE)
+  }
+  !grepl("[^[:digit:]]", format(x, digits = 20, scientific = FALSE))
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index d74a49a..8e5df8e 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -12,7 +12,7 @@ reference:
   - get_publications
   - get_data_catalogue
   - get_meta
-  - download_dataset
+  - preview_dataset
   - query_dataset
 
 - title: Support for generating API URLs and interpreting responses
diff --git a/man/check_integer.Rd b/man/check_integer.Rd
new file mode 100644
index 0000000..2edbeed
--- /dev/null
+++ b/man/check_integer.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{check_integer}
+\alias{check_integer}
+\title{Check if a value is an integer}
+\usage{
+check_integer(x)
+}
+\arguments{
+\item{x}{a value to test}
+}
+\value{
+logical, false if not an integer, true if an integer
+}
+\description{
+is.integer checks the object class, not the value, so credit to VitoshKa
+on stack overflow for the core of this function...
+}
+\details{
+https://stackoverflow.com/questions/3476782/check-if-the-number-is-integer
+
+looks like it's been adopted in installr too, avoiding needing that as a
+dependency by putting the code we need here.
+}
+\keyword{internal}
diff --git a/man/download_dataset.Rd b/man/download_dataset.Rd
deleted file mode 100644
index c39dd0e..0000000
--- a/man/download_dataset.Rd
+++ /dev/null
@@ -1,48 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/download_dataset.R
-\name{download_dataset}
-\alias{download_dataset}
-\title{Download the raw CSV for an API data set}
-\usage{
-download_dataset(
-  dataset_id,
-  dataset_version = NULL,
-  api_version = NULL,
-  verbose = FALSE
-)
-}
-\arguments{
-\item{dataset_id}{ID of data set}
-
-\item{dataset_version}{Version number of data set}
-
-\item{api_version}{EES API version}
-
-\item{verbose}{Run with additional contextual messaging, logical, default = FALSE}
-}
-\value{
-data.frame
-}
-\description{
-This function is mostly designed for exploring the API, and is unlikely to
-be suitable for long term production use.
-
-There is no filtering down of the file so you will always get the whole file
-and in some instances this may be very large.
-
-As there are no IDs involved, this is brittle and code relying on this
-function will likely break whenever there is renaming of variables or items
-in the data.
-
-It is recommended to take the time to set up custom queries using the
-\code{query_dataset()} function instead. If you are using this function for more
-than exploratory purposes, make sure you subscribe to the data set you're
-downloading and then keep track of any updates to the data.
-}
-\details{
-This gives a super quick way to just fetch the whole file in a human
-readable format.
-}
-\examples{
-download_dataset(example_id("dataset"))
-}
diff --git a/man/preview_dataset.Rd b/man/preview_dataset.Rd
new file mode 100644
index 0000000..0d06b94
--- /dev/null
+++ b/man/preview_dataset.Rd
@@ -0,0 +1,62 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/preview_dataset.R
+\name{preview_dataset}
+\alias{preview_dataset}
+\title{Preview the raw CSV for an API data set}
+\usage{
+preview_dataset(
+  dataset_id,
+  dataset_version = NULL,
+  api_version = NULL,
+  n_max = 10,
+  verbose = FALSE
+)
+}
+\arguments{
+\item{dataset_id}{ID of data set}
+
+\item{dataset_version}{Version number of data set}
+
+\item{api_version}{EES API version}
+
+\item{n_max}{maximum number of rows to preview, 10 by default, Inf will get
+all available rows}
+
+\item{verbose}{Run with additional contextual messaging, logical,
+default = FALSE}
+}
+\value{
+data.frame
+}
+\description{
+This function is mostly designed for exploring the API, and is unlikely to
+be suitable for long term production use.
+
+You can set the number of rows to preview using the n_max parameter. This
+uses the n_max from \code{readr::read_csv()} under the hood.
+
+As there are no IDs involved, this is brittle and code relying on this
+function will likely break whenever there is renaming of variables or items
+in the data.
+
+It is recommended to take the time to set up custom queries using the
+\code{query_dataset()} function instead.
+
+If you are using this function for more than exploratory purposes, make
+sure you subscribe to the data set you're downloading and then keep track
+of any updates to the data.
+}
+\details{
+This gives a super quick way to just fetch the file in a human readable
+format.
+}
+\examples{
+# Preview first 10 rows
+preview_dataset(example_id("dataset"))
+
+# Get 2 rows
+preview_dataset(example_id("dataset"), n_max = 2)
+
+# Get all rows
+preview_dataset(example_id("dataset"), n_max = Inf)
+}
diff --git a/tests/testthat/test-download_dataset.R b/tests/testthat/test-download_dataset.R
deleted file mode 100644
index 884af3d..0000000
--- a/tests/testthat/test-download_dataset.R
+++ /dev/null
@@ -1,12 +0,0 @@
-test_that("Returns a data frame and has no errors", {
-  expect_true(class(download_dataset(example_id("dataset"))) == "data.frame")
-  expect_no_error(download_dataset(example_id("dataset")))
-})
-
-test_that("Incorrect inputs cause errors", {
-  expect_error(download_dataset("ark-of-the-covenent"))
-  expect_error(
-    download_dataset(example_id("dataset"), verbose = "chatty"),
-    "verbose must be a logical value, either TRUE or FALSE"
-  )
-})
diff --git a/tests/testthat/test-preview_dataset.R b/tests/testthat/test-preview_dataset.R
new file mode 100644
index 0000000..517413a
--- /dev/null
+++ b/tests/testthat/test-preview_dataset.R
@@ -0,0 +1,66 @@
+test_that("Returns a data frame and has no errors", {
+  expect_s3_class(preview_dataset(example_id("dataset")), "data.frame")
+  expect_no_error(preview_dataset(example_id("dataset")))
+})
+
+test_that("Incorrect inputs cause errors", {
+  expect_error(preview_dataset("ark-of-the-covenent"))
+
+  expect_error(
+    preview_dataset(example_id("dataset"), verbose = "chatty"),
+    "verbose must be a logical value, either TRUE or FALSE"
+  )
+
+  expect_error(
+    preview_dataset(example_id("dataset"), n_max = 20.2),
+    "n_max must be a positive integer value, e.g. 15, or Inf"
+  )
+
+  expect_error(
+    preview_dataset(example_id("dataset"), n_max = "20"),
+    "n_max must be a positive integer value, e.g. 15, or Inf"
+  )
+
+  expect_error(
+    preview_dataset(example_id("dataset"), n_max = -2),
+    "n_max must be a positive integer value, e.g. 15, or Inf"
+  )
+
+  expect_error(
+    preview_dataset(example_id("dataset"), n_max = "fifty"),
+    "n_max must be a positive integer value, e.g. 15, or Inf"
+  )
+
+  expect_error(
+    preview_dataset(example_id("dataset"), n_max = -Inf),
+    "n_max must be a positive integer value, e.g. 15, or Inf"
+  )
+})
+
+test_that("only previews a specified number of rows", {
+  expect_equal(
+    preview_dataset(example_id("dataset")) |> nrow(),
+    10
+  )
+
+  expect_equal(
+    preview_dataset(example_id("dataset"), n_max = 0) |> nrow(),
+    0
+  )
+
+  expect_equal(
+    preview_dataset(example_id("dataset"), n_max = 42) |> nrow(),
+    42
+  )
+})
+
+test_that("returns all rows", {
+  # In truth, the test doesn't check all rows as we don't reliably know that
+  # number, so just that it returns more than an arbitrary number above the
+  # default
+  expect_gt(
+    preview_dataset(example_id("dataset"), n_max = Inf) |>
+      nrow(),
+    200
+  )
+})
diff --git a/vignettes/ees-api-workflow.Rmd b/vignettes/ees-api-workflow.Rmd
index 9fda99b..f769321 100644
--- a/vignettes/ees-api-workflow.Rmd
+++ b/vignettes/ees-api-workflow.Rmd
@@ -75,6 +75,30 @@ taken to it's details page in the catalogue. On this page (pictured below), you'
 
 ![An example of an API data set's details](ees-data-catalogue-details-api-data-set.png)
 
+## Preview underlying data
+
+In the EES API you have the option to preview the underlying data for a given data set. 
+This bypasses the use of IDs, making it a simpler short-term option to explore the available data.
+
+Using `eesyapi`, you can preview a data set using `preview_dataset()` and providing the data set 
+ID as follows:
+
+```
+eesyapi::preview_dataset("7c0e9201-c7c0-ff73-bee4-304e731ec0e6")
+```
+
+This will return a data frame containing the first 10 rows of the data in the form that DfE 
+analysts uploaded it to EES in.
+
+While you can set `n_max` to `Inf` and get the whole data set this way, you should avoid relying on
+the `preview_dataset()` function in your pipelines or dashboards and instead use it to explore the
+data as you create your own custom query using `query_dataset()`.
+
+If you rely on reading the whole file in via the preview, you risk:
+
+* code breaking whenever element labels change
+* reading too much data into active memory, causing R to slow or even crash
+
 ## Getting the data set meta data
 
 When querying a data set via the API, column names (indicators and filters) and the options available within columns (filter items) are referenced 
@@ -183,24 +207,6 @@ eesyapi::get_meta(dataset_id = "7c0e9201-c7c0-ff73-bee4-304e731ec0e6") |>
 
 ## Retrieving data from a data set
 
-### Downloading an entire data set in a single file
-
-Whilst specific queries (as detailed in the following sections) are best used for automated pipeline 
-workflows, the EES API also offers the option
-to just download the entirety of the underlying data file for a given data set. This bypasses the 
-use of IDs, making it a simpler short-term option to retrieve data, but with the risk of creating
-breaking changes in automated pipelines should element labels change.
-
-Using `eesyapi`, you can retrieve a data set using `download_dataset()` and providing the data set 
-ID as follows:
-
-```
-download_dataset("7c0e9201-c7c0-ff73-bee4-304e731ec0e6")
-```
-
-This will return a data frame containing the data in the form that DfE analysts uploaded it to 
-EES in.
-
 ### Using `query_dataset()`
 
 The recommended go-to option for retrieving data is the `query_dataset()` function. This provides