-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into readme_template
- Loading branch information
Showing
13 changed files
with
459 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
Type: Package | ||
Package: dfeR | ||
Title: Common DfE R tasks | ||
Version: 0.6.1 | ||
Version: 0.6.1.9000 | ||
Authors@R: c( | ||
person("Cam", "Race", , "[email protected]", role = c("aut", "cre")), | ||
person("Laura", "Selby", , "[email protected]", role = "aut"), | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
#' Replaces `NA` values in tables
#'
#' @description
#' Replaces `NA` values in tables except for ones in time and geography
#' columns that must be included in DfE official statistics.
#' \href{https://shorturl.at/chy76}{Get more guidance on Open Data Standards.}
#'
#' @details
#' Names of geography and time columns that are used in this function can be
#' found in `dfeR::geog_time_identifiers`.
#'
#' Every column that is not excluded is converted to character before its
#' `NA` values are replaced, so numeric columns are returned as character.
#'
#' The function stops with an error if:
#' * `data` is not a data frame, or has no rows;
#' * `replacement_alt` is not a character vector, or has more than one value;
#' * a column name matches one of `dfeR::geog_time_identifiers` once case,
#'   punctuation and spacing are ignored, but is not written exactly as it
#'   appears there (for example `Time Period` instead of `time_period`).
#'
#' @param data name of the table that you want to replace NA values in
#' @param replacement_alt optional - if you want the NA replacement
#' value to be different to "z". Must be a single character value.
#' @param exclude_columns optional - character vector of additional columns
#' to exclude from NA replacement.
#' Column names that match ones found in `dfeR::geog_time_identifiers`
#' will always be excluded because any missing data for these columns
#' need more explicit codes to explain why data is not available.
#'
#' @return table with "z" or an alternate replacement value instead of `NA`
#' values for columns that are not for time or geography.
#' @export
#' @seealso [dfeR::geog_time_identifiers]
#' @examples
#' # Create a table for the example
#'
#' df <- data.frame(
#'   time_period = c(2022, 2022, 2022),
#'   time_identifier = c("Calendar year", "Calendar year", "Calendar year"),
#'   geographic_level = c("National", "Regional", "Regional"),
#'   country_code = c("E92000001", "E92000001", "E92000001"),
#'   country_name = c("England", "England", "England"),
#'   region_code = c(NA, "E12000001", "E12000002"),
#'   region_name = c(NA, "North East", "North West"),
#'   mystery_count = c(42, 25, NA)
#' )
#'
#' z_replace(df)
#'
#' # Use a different replacement value
#' z_replace(df, replacement_alt = "c")
#'
z_replace <- function(data,
                      replacement_alt = NULL,
                      exclude_columns = NULL) {
  # Fail with a clear message instead of a cryptic error from nrow() / mutate()
  if (!is.data.frame(data)) {
    stop("data must be a data frame or tibble.")
  }

  # Check if the table has rows - if not, stop the process
  if (nrow(data) < 1) {
    stop("Table is empty or contains no rows.")
  }

  # Validate the replacement value up front so bad arguments fail fast.
  # If no alternative is provided, use "z".
  if (is.null(replacement_alt)) {
    replacement_alt <- "z"
  } else if (!is.character(replacement_alt)) {
    stop(
      "You provided a ", data.class(replacement_alt),
      " input for replacement_alt.\n",
      "Please replace it with a character vector."
    )
  } else if (length(replacement_alt) > 1) {
    stop(
      "You provided multiple values for replacement_alt.\n",
      "Please, only provide a single value."
    )
  }

  # Reference names of the geography and time columns
  geog_time_identifiers <- dfeR::geog_time_identifiers

  # Lower-case the names and collapse every run of punctuation and/or spaces
  # into a single underscore, so "Time Period", "time.period" and
  # "time__period" all standardise to "time_period"
  standardise_names <- function(col_names) {
    gsub("[[:punct:] ]+", "_", tolower(col_names))
  }

  data_col_names <- colnames(data)

  # A column is an identifier if its standardised name matches a standardised
  # reference name; it is correctly formatted only if the raw name matches too
  is_identifier <- standardise_names(data_col_names) %in%
    standardise_names(geog_time_identifiers)
  is_exact_match <- data_col_names %in% geog_time_identifiers

  if (any(is_identifier & !is_exact_match)) {
    stop(
      "Your table has geography and/or time column(s) that are not ",
      "in snake_case.\nPlease amend your column names to match the ",
      "formatting to dfeR::geog_time_identifiers."
    )
  }

  # c() drops NULL, so this is just the reference names when exclude_columns
  # is not supplied
  protected_columns <- c(geog_time_identifiers, exclude_columns)

  # Convert every non-protected column to character and swap NA for the
  # replacement value in a single pass
  dplyr::mutate(
    data,
    dplyr::across(
      -tidyselect::any_of(protected_columns),
      ~ {
        values <- as.character(.x)
        dplyr::if_else(is.na(values), replacement_alt, values)
      }
    )
  )
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Build the reference vector of geography and time column names
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# The groups below are concatenated in this order, so the saved vector keeps
# the same element order as a single flat definition would.

# geographic level plus code / identifier columns
geog_code_columns <- c(
  "geographic_level",
  "country_code",
  "region_code",
  "new_la_code",
  "lad_code",
  "pcon_code",
  "lsip_code",
  "local_enterprise_partnership_code",
  "english_devolved_area_code",
  "opportunity_area_code",
  "ward_code",
  "trust_id",
  "sponsor_id",
  "school_urn",
  "provider_ukprn",
  "institution_id",
  "planning_area_code"
)

# name columns
geog_name_columns <- c(
  "country_name",
  "region_name",
  "la_name",
  "lad_name",
  "rsc_region_lead_name",
  "pcon_name",
  "lsip_name",
  "local_enterprise_partnership_name",
  "english_devolved_area_name",
  "opportunity_area_name",
  "ward_name",
  "trust_name",
  "sponsor_name",
  "school_name",
  "provider_name",
  "institution_name",
  "planning_area_name"
)

# remaining identifier columns and the time columns
other_id_and_time_columns <- c(
  "old_la_code",
  "school_laestab",
  "time_period",
  "time_identifier"
)

# combined vector of possible time and geography column names
geog_time_identifiers <- c(
  geog_code_columns,
  geog_name_columns,
  other_id_and_time_columns
)

# save the vector as package data, replacing any existing copy
usethis::use_data(geog_time_identifiers, overwrite = TRUE)
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,7 @@ rgn | |
sep | ||
ser | ||
shorthands | ||
shorturl | ||
sql | ||
uk | ||
utla | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.