Skip to content

Commit

Permalink
Merge branch 'main' into readme_template
Browse files Browse the repository at this point in the history
  • Loading branch information
Lsnaathorst1 authored Nov 28, 2024
2 parents 15ffe10 + 7010d10 commit 8321224
Show file tree
Hide file tree
Showing 13 changed files with 459 additions and 2 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: dfeR
Title: Common DfE R tasks
Version: 0.6.1
Version: 0.6.1.9000
Authors@R: c(
person("Cam", "Race", , "[email protected]", role = c("aut", "cre")),
person("Laura", "Selby", , "[email protected]", role = "aut"),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export(pretty_num_table)
export(pretty_time_taken)
export(round_five_up)
export(toggle_message)
export(z_replace)
import(renv, except = run)
importFrom(emoji,emoji)
importFrom(lifecycle,deprecated)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# dfeR (development version)

- Added lookup data `geog_time_identifiers`.
- Added `z_replace()` to replace `NA` values in tables, except for those in geography and time columns whose names match the ones in `geog_time_identifiers`.

# dfeR 0.6.1

Patch to update the pretty_num() function so that the `dp` argument's default is 0.
Expand Down
12 changes: 12 additions & 0 deletions R/datasets_documentation.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,15 @@
#' from
#' https://geoportal.statistics.gov.uk/search?q=NAC_RGN
"regions"

#' Potential names for geography and time columns
#'
#' Potential names for geography and time columns in line with the ones used for
#' the explore education statistics data screener.
#'
#' @format ## `geog_time_identifiers`
#' A character vector with 38 potential column names in snake case format.
#' @source Curated by explore.statistics@@education.gov.uk.
#' \href{https://shorturl.at/j4532}{Get guidance on time and geography data.}
"geog_time_identifiers"
152 changes: 152 additions & 0 deletions R/z_replace.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#' Replaces `NA` values in tables
#'
#' @description
#' Replaces `NA` values in tables except for ones in time and geography
#' columns that must be included in DfE official statistics.
#' \href{https://shorturl.at/chy76}{Get more guidance on Open Data Standards.}
#'
#' @details
#' Names of geography and time columns that are used in this function can be
#' found in `dfeR::geog_time_identifiers`.
#'
#' Every column that is not excluded is converted to character before its
#' `NA` values are replaced, so numeric columns are returned as character
#' columns.
#'
#' The function stops with an error if:
#' * `data` is not a data frame, or has no rows,
#' * a column name matches one of `dfeR::geog_time_identifiers` once case,
#'   punctuation and spacing are ignored, but is not written exactly as it
#'   appears there (i.e. it is not in snake_case),
#' * `replacement_alt` is not a character vector, or has more than one value.
#'
#' @param data data frame (or tibble) that you want to replace NA values in
#' @param replacement_alt optional - if you want the NA replacement
#' value to be different to "z". Must be a single character value.
#' @param exclude_columns optional - additional columns to exclude from
#' NA replacement.
#' Column names that match ones found in `dfeR::geog_time_identifiers`
#' will always be excluded because any missing data for these columns
#' need more explicit codes to explain why data is not available.
#'
#' @return table with "z" or an alternate replacement value instead of `NA`
#' values for columns that are not for time or geography. Columns that were
#' not excluded are returned as character columns.
#' @export
#' @seealso [dfeR::geog_time_identifiers]
#' @examples
#' # Create a table for the example
#'
#' df <- data.frame(
#'   time_period = c(2022, 2022, 2022),
#'   time_identifier = c("Calendar year", "Calendar year", "Calendar year"),
#'   geographic_level = c("National", "Regional", "Regional"),
#'   country_code = c("E92000001", "E92000001", "E92000001"),
#'   country_name = c("England", "England", "England"),
#'   region_code = c(NA, "E12000001", "E12000002"),
#'   region_name = c(NA, "North East", "North West"),
#'   mystery_count = c(42, 25, NA)
#' )
#'
#' z_replace(df)
#'
#' # Use a different replacement value
#' z_replace(df, replacement_alt = "c")
#'
z_replace <- function(data,
                      replacement_alt = NULL,
                      exclude_columns = NULL) {
  # Fail with a readable message instead of the "argument is of length zero"
  # error that nrow() on a non-table input would otherwise trigger
  if (!is.data.frame(data)) {
    stop("data must be a data frame or tibble.")
  }

  # Check if the table has rows - if not, stop the process
  if (nrow(data) < 1) {
    stop("Table is empty or contains no rows.")
  }

  # load in potential column names
  geog_time_identifiers <- dfeR::geog_time_identifiers

  # Standardise names so that differences in case, punctuation and spacing
  # are ignored: punctuation becomes a space, runs of spaces are collapsed
  # to a single space, and the result is lower-cased and joined with "_"
  standardise_names <- function(col_names) {
    col_names <- gsub("[[:punct:]]", " ", col_names)
    col_names <- gsub(" +", " ", col_names)
    gsub(" ", "_", tolower(col_names))
  }

  data_col_names <- colnames(data)

  # TRUE when the column is a geography / time identifier, however formatted
  is_identifier <- standardise_names(data_col_names) %in%
    standardise_names(geog_time_identifiers)
  # TRUE when the column is written exactly as in geog_time_identifiers
  is_exact_match <- data_col_names %in% geog_time_identifiers

  # An identifier column that is not an exact match is badly formatted
  if (any(is_identifier & !is_exact_match)) {
    stop(
      "Your table has geography and/or time column(s) that are not ",
      "in snake_case.\nPlease amend your column names to match the formatting ",
      "to dfeR::geog_time_identifiers."
    )
  }

  # check for alt NA replacement
  if (is.null(replacement_alt)) {
    # if no alt provided, use z
    replacement_alt <- "z"
  } else if (!is.character(replacement_alt)) {
    stop(
      "You provided a ", data.class(replacement_alt),
      " input for replacement_alt.\n",
      "Please replace it with a character vector."
    )
  } else if (length(replacement_alt) > 1) {
    stop(
      "You provided multiple values for replacement_alt.\n",
      "Please, only provide a single value."
    )
  }

  # Columns that must keep their NA values. exclude_columns may be NULL, in
  # which case c() simply returns the geography and time identifiers
  protected_columns <- c(geog_time_identifiers, exclude_columns)

  # Convert every other column to character, then swap NA for the replacement.
  # The NA test is made after the conversion, so a value that only becomes a
  # non-NA string once converted (e.g. NaN -> "NaN") is left as it is.
  dplyr::mutate(
    data,
    dplyr::across(
      -tidyselect::any_of(protected_columns),
      function(column) {
        column <- as.character(column)
        dplyr::if_else(is.na(column), replacement_alt, column)
      }
    )
  )
}
6 changes: 6 additions & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ reference:
- wd_pcon_lad_la_rgn_ctry
- countries
- regions
- geog_time_identifiers

- title: Database connection
desc: Helpful functions for connecting to databases in DfE
Expand Down Expand Up @@ -51,3 +52,8 @@ reference:
- comma_sep
- get_ons_api_data
- toggle_message

- title: Replace NA values
desc: Replace NA values with the default "z" or an alternative replacement
contents:
- z_replace
22 changes: 22 additions & 0 deletions data-raw/geog_time_identifiers.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Build the lookup of potential geography and time column names
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# All possible geography and time column names, in snake_case.
# The order of the entries is kept as it was originally written.
geog_time_identifiers <- c(
  # geography codes and ids
  "geographic_level",
  "country_code",
  "region_code",
  "new_la_code",
  "lad_code",
  "pcon_code",
  "lsip_code",
  "local_enterprise_partnership_code",
  "english_devolved_area_code",
  "opportunity_area_code",
  "ward_code",
  "trust_id",
  "sponsor_id",
  "school_urn",
  "provider_ukprn",
  "institution_id",
  "planning_area_code",
  # geography names
  "country_name",
  "region_name",
  "la_name",
  "lad_name",
  "rsc_region_lead_name",
  "pcon_name",
  "lsip_name",
  "local_enterprise_partnership_name",
  "english_devolved_area_name",
  "opportunity_area_name",
  "ward_name",
  "trust_name",
  "sponsor_name",
  "school_name",
  "provider_name",
  "institution_name",
  "planning_area_name",
  # further codes
  "old_la_code",
  "school_laestab",
  # time
  "time_period",
  "time_identifier"
)

# Save the vector into the package's data folder, replacing any existing copy

usethis::use_data(geog_time_identifiers, overwrite = TRUE)
Binary file added data/geog_time_identifiers.rda
Binary file not shown.
1 change: 1 addition & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ rgn
sep
ser
shorthands
shorturl
sql
uk
utla
Expand Down
24 changes: 24 additions & 0 deletions man/geog_time_identifiers.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/pretty_num.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

56 changes: 56 additions & 0 deletions man/z_replace.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 8321224

Please sign in to comment.