Merge branch 'main' into readme_template

dfe-analytical-services · Nov 28, 2024 · 8321224 · 8321224
2 parents 15ffe10 + 7010d10
commit 8321224
Show file tree

Hide file tree

Showing 13 changed files with 459 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: dfeR
 Title: Common DfE R tasks
-Version: 0.6.1
+Version: 0.6.1.9000
 Authors@R: c(
     person("Cam", "Race", , "[email protected]", role = c("aut", "cre")),
     person("Laura", "Selby", , "[email protected]", role = "aut"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -20,6 +20,7 @@ export(pretty_num_table)
 export(pretty_time_taken)
 export(round_five_up)
 export(toggle_message)
+export(z_replace)
 import(renv, except = run)
 importFrom(emoji,emoji)
 importFrom(lifecycle,deprecated)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# dfeR (development version)
+
+- Added lookup data `geog_time_identifiers`.
+- Added `z_replace()` to replace `NA` values in tables, except for those in geography and time columns whose names match ones in `geog_time_identifiers`.
+
 # dfeR 0.6.1
 
 Patch to update the pretty_num() function so that the `dp` argument's default is 0. 

diff --git a/R/datasets_documentation.R b/R/datasets_documentation.R
@@ -104,3 +104,15 @@
 #' from
 #' https://geoportal.statistics.gov.uk/search?q=NAC_RGN
 "regions"
+
+#' Potential names for geography and time columns
+#'
+#' Potential names for geography and time columns in line with the ones used for
+#' the explore education statistics data screener.
+#'
+#' Used by [z_replace()] to decide which columns keep their `NA` values.
+#' @format ## `geog_time_identifiers`
+#' A character vector with 38 potential column names in snake case format.
+#' @source curated by explore.statistics@@education.gov.uk.
+#' \href{https://shorturl.at/j4532}{Get guidance on time and geography data.}
+"geog_time_identifiers"
diff --git a/R/z_replace.R b/R/z_replace.R
@@ -0,0 +1,152 @@
+#' Replaces `NA` values in tables
+#'
+#' @description
+#' Replaces `NA` values in tables except for ones in time and geography
+#' columns that must be included in DfE official statistics.
+#' \href{https://shorturl.at/chy76}{Get more guidance on Open Data Standards.}
+#'
+#' @details
+#' Names of geography and time columns that are used in this function can be
+#' found in `dfeR::geog_time_identifiers`.
+#'
+#' Every column that is not excluded is converted to character before its
+#' `NA` values are replaced, so numeric columns are returned as character.
+#'
+#' An error is raised if `data` has no rows, or if a column name matches one
+#' in `dfeR::geog_time_identifiers` once case and punctuation are ignored but
+#' is not written in exactly the same snake_case form.
+#'
+#' @param data name of the table that you want to replace NA values in
+#' @param replacement_alt optional - a single character value to use as the
+#' NA replacement if you want it to be different to "z"
+#' @param exclude_columns optional - additional columns to exclude from
+#' NA replacement.
+#' Column names that match ones found in `dfeR::geog_time_identifiers`
+#' will always be excluded because any missing data for these columns
+#' need more explicit codes to explain why data is not available.
+#'
+#' @return table with "z" or an alternate replacement value instead of `NA`
+#' values for columns that are not for time or geography.
+#' @export
+#' @seealso [dfeR::geog_time_identifiers]
+#' @examples
+#' # Create a table for the example
+#'
+#' df <- data.frame(
+#'   time_period = c(2022, 2022, 2022),
+#'   time_identifier = c("Calendar year", "Calendar year", "Calendar year"),
+#'   geographic_level = c("National", "Regional", "Regional"),
+#'   country_code = c("E92000001", "E92000001", "E92000001"),
+#'   country_name = c("England", "England", "England"),
+#'   region_code = c(NA, "E12000001", "E12000002"),
+#'   region_name = c(NA, "North East", "North West"),
+#'   mystery_count = c(42, 25, NA)
+#' )
+#'
+#' z_replace(df)
+#'
+#' # Use a different replacement value
+#' z_replace(df, replacement_alt = "c")
+#'
+z_replace <- function(data,
+                      replacement_alt = NULL,
+                      exclude_columns = NULL) {
+  # Stop early when there is nothing to process
+  if (nrow(data) < 1) {
+    stop("Table is empty or contains no rows.")
+  }
+
+  # Reference names for geography and time columns
+  geog_time_identifiers <- dfeR::geog_time_identifiers
+
+  # Reduce names to a comparable form: punctuation becomes a space, double
+  # spaces are collapsed in a single pass, then the result is lower-cased
+  # with "_" in place of each space.
+  # NOTE(review): a run of 3+ spaces is not fully collapsed by the single
+  # pass - confirm whether that is intended.
+  standardise_names <- function(col_names) {
+    col_names <- gsub("[[:punct:]]", " ", col_names)
+    col_names <- gsub("  ", " ", col_names)
+    gsub(" ", "_", tolower(col_names))
+  }
+
+  data_col_names <- colnames(data)
+
+  # TRUE where a column is a geography/time column once formatting is ignored
+  is_identifier <- standardise_names(data_col_names) %in%
+    standardise_names(geog_time_identifiers)
+  # TRUE where the column name is already written exactly as expected
+  is_exact_match <- data_col_names %in% geog_time_identifiers
+
+  # A badly formatted identifier column would not be excluded below and would
+  # have its NA values overwritten, so refuse to continue
+  if (any(is_identifier & !is_exact_match)) {
+    stop(
+      "Your table has geography and/or time column(s) that are not ",
+      "in snake_case.\nPlease amend your column names to match the formatting",
+      " to dfeR::geog_time_identifiers."
+    )
+  }
+
+  # Default to "z"; otherwise insist on exactly one character value
+  if (is.null(replacement_alt)) {
+    replacement_alt <- "z"
+  } else if (!is.character(replacement_alt)) {
+    stop(
+      "You provided a ", data.class(replacement_alt),
+      " input for replacement_alt.\n",
+      "Please replace it with a character vector."
+    )
+  } else if (length(replacement_alt) > 1) {
+    stop(
+      "You provided multiple values for replacement_alt.\n",
+      "Please, only provide a single value."
+    )
+  } else if (length(replacement_alt) < 1) {
+    # character(0) would otherwise only fail later, inside if_else()
+    stop(
+      "You provided an empty vector for replacement_alt.\n",
+      "Please, only provide a single value."
+    )
+  }
+
+  # Columns left untouched: identifiers plus any the caller asked to skip
+  # (c() drops a NULL exclude_columns, so no separate branch is needed)
+  protected_columns <- c(geog_time_identifiers, exclude_columns)
+
+  # Convert every remaining column to character, then swap NA for the
+  # replacement value
+  data %>%
+    dplyr::mutate(dplyr::across(
+      -tidyselect::any_of(protected_columns),
+      ~ {
+        values <- as.character(.)
+        dplyr::if_else(is.na(values), replacement_alt, values)
+      }
+    ))
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -14,6 +14,7 @@ reference:
   - wd_pcon_lad_la_rgn_ctry
   - countries
   - regions
+  - geog_time_identifiers
 
 - title: Database connection
   desc: Helpful functions for connecting to databases in DfE
@@ -51,3 +52,8 @@ reference:
   - comma_sep
   - get_ons_api_data
   - toggle_message
+
+- title: Replace NA values
+  desc: Replace NA values with the default "z" or an alternative replacement
+  contents:
+  - z_replace
diff --git a/data-raw/geog_time_identifiers.R b/data-raw/geog_time_identifiers.R
@@ -0,0 +1,22 @@
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Build the geog_time_identifiers lookup of potential geography/time columns
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+# Character vector of possible geography and time column names (snake_case)
+geog_time_identifiers <- c(
+  "geographic_level", "country_code", "region_code", "new_la_code", "lad_code",
+  "pcon_code", "lsip_code", "local_enterprise_partnership_code",
+  "english_devolved_area_code", "opportunity_area_code", "ward_code",
+  "trust_id", "sponsor_id", "school_urn", "provider_ukprn", "institution_id",
+  "planning_area_code", "country_name", "region_name", "la_name", "lad_name",
+  "rsc_region_lead_name", "pcon_name", "lsip_name",
+  "local_enterprise_partnership_name", "english_devolved_area_name",
+  "opportunity_area_name", "ward_name", "trust_name", "sponsor_name",
+  "school_name", "provider_name", "institution_name", "planning_area_name",
+  "old_la_code", "school_laestab", "time_period", "time_identifier"
+)
+
+# Save the vector as package data, replacing any previously saved copy
+
+usethis::use_data(geog_time_identifiers, overwrite = TRUE)
diff --git a/data/geog_time_identifiers.rda b/data/geog_time_identifiers.rda
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -43,6 +43,7 @@ rgn
 sep
 ser
 shorthands
+shorturl
 sql
 uk
 utla

diff --git a/man/geog_time_identifiers.Rd b/man/geog_time_identifiers.Rd
diff --git a/man/pretty_num.Rd b/man/pretty_num.Rd
diff --git a/man/z_replace.Rd b/man/z_replace.Rd
-Original file line number
+Diff line change
@@ Expand Up / @@ -43,6 +43,7 @@ rgn @@
     sep
     ser
     shorthands
+    shorturl
     sql
     uk
     utla
@@ Expand Down @@