diff --git a/R/LOS_model.R b/R/LOS_model.R index 785fd14..32610d0 100644 --- a/R/LOS_model.R +++ b/R/LOS_model.R @@ -16,7 +16,7 @@ #' \item{Death}{Binary for death status: 0 = survived, 1= died in hospital} #' } #' -#' @source Generated by Chris Mainey \email{chris.mainey@uhb.nhs.uk}, Feb-2019 +#' @source Generated by Chris Mainey, Feb-2019 #' #' @usage data(LOS_model) #' diff --git a/R/ae_attendances.R b/R/ae_attendances.R index f7199b3..0b4d7fa 100644 --- a/R/ae_attendances.R +++ b/R/ae_attendances.R @@ -4,8 +4,10 @@ #' in England for the years 2016/17 through 2018/19 (Apr-Mar). The data has been #' tidied to be easily usable within the tidyverse of packages. #' -#' Data sourced from \href{https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/}{NHS England Statistical Work Areas} -#' which is available under the \href{https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/}{Open Government Licence v3.0} +#' Data sourced from +#' \href{https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/}{NHS England Statistical Work Areas} +#' which is available under the +#' \href{https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/}{Open Government Licence v3.0} #' #' @docType data #' @@ -14,12 +16,18 @@ #' @format Tibble with six columns #' \describe{ #' \item{period}{The month that this data relates to} -#' \item{org_code}{The \href{https://digital.nhs.uk/services/organisation-data-service}{ODS} code for this provider} -#' \item{type}{The \href{https://web.archive.org/web/20200128111444/https://www.datadictionary.nhs.uk/data_dictionary/attributes/a/acc/accident_and_emergency_department_type_de.asp}{department type}. 
+#' \item{org_code}{The +#' \href{https://digital.nhs.uk/services/organisation-data-service}{ODS} +#' code for this provider} +#' \item{type}{The +#' \href{https://web.archive.org/web/20200128111444/https://www.datadictionary.nhs.uk/data_dictionary/attributes/a/acc/accident_and_emergency_department_type_de.asp}{department type}. #' either 1, 2 or other} -#' \item{attendances}{the number of patients who attended this department in this month} -#' \item{breaches}{the number of patients who breaches the 4 hour target in this month} -#' \item{admissions}{the number of patients admitted from A&E to the hospital in this month} +#' \item{attendances}{the number of patients who attended this department in +#' this month} +#' \item{breaches}{the number of patients who breaches the 4 hour target in +#' this month} +#' \item{admissions}{the number of patients admitted from A&E to the hospital +#' in this month} #' } #' #' @source \href{https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/}{NHS England Statistical Work Areas} @@ -28,6 +36,7 @@ #' #' @examples #' data(ae_attendances) +#' #' library(dplyr) #' library(ggplot2) #' library(scales) diff --git a/R/covid19.R b/R/covid19.R index 7abd2b6..86886fe 100644 --- a/R/covid19.R +++ b/R/covid19.R @@ -42,15 +42,21 @@ #' library(scales) #' #' # Create a plot of the performance for England over time -#' covid19 %>% +#' covid19 |> #' filter(countries_and_territories == -#' c("United_Kingdom", "Italy", "France", "Germany", "Spain")) %>% -#' ggplot(aes(x = date_reported, y = cases, col = countries_and_territories)) + +#' c("United_Kingdom", "Italy", "France", "Germany", "Spain")) |> +#' ggplot(aes( +#' x = date_reported, +#' y = cases, +#' col = countries_and_territories +#' )) + #' geom_line() + #' scale_color_discrete("Country") + #' scale_y_continuous(labels = comma) + #' labs( -#' y = "Cases", x = "Date", title = "Covid-19 cases for selected countries", +#' y = "Cases", +#' x = "Date", +#' 
title = "Covid-19 cases for selected countries", #' alt = "A plot of covid-19 cases in France, Germany, Italy, Spain & the UK" #' ) + #' theme_minimal() diff --git a/R/ons_mortality.R b/R/ons_mortality.R index 5f93161..5f44880 100644 --- a/R/ons_mortality.R +++ b/R/ons_mortality.R @@ -1,10 +1,14 @@ #' Deaths registered weekly in England and Wales, provisional #' -#' Provisional counts of the number of deaths registered in England and Wales, by age, sex and region, in the latest weeks for which data are available. +#' Provisional counts of the number of deaths registered in England and Wales, +#' by age, sex and region, from week commencing 8th January 2010 to +#' 3rd April 2020. #' #' Source and licence acknowledgement -#' This data has been made available through Office of National Statistics under the Open Government -#' Licence \url{http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/} +#' +#' This data has been made available through Office of National Statistics under +#' the Open Government Licence +#' \url{http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/} #' #' #' @docType data @@ -13,14 +17,19 @@ #' #' @format Data frame with five columns #' \describe{ -#' \item{category_1}{character, containing the names of the groups for counts, for example "Total deaths", "all ages".} -#' \item{category_2}{character, subcategory of names of groups where necessary, for example details of region: "East", details of age bands "15-44".} -#' \item{counts}{numeric, numbers of deaths in whole numbers and average numbers with decimal points. 
To retain the integrity of the format this column data is left as character.} +#' \item{category_1}{character, containing the names of the groups for counts, +#' for example "Total deaths", "all ages".} +#' \item{category_2}{character, subcategory of names of groups where necessary, +#' for example details of region: "East", details of age bands "15-44".} +#' \item{counts}{numeric, numbers of deaths in whole numbers and average numbers +#' with decimal points. To retain the integrity of the format this column data +#' is left as character.} #' \item{date}{date, format is yyyy-mm-dd; all dates are a Friday.} #' \item{week_no}{integer, each week in a year is numbered sequentially.} #' } #' -#' @source Collected by Zoë Turner \email{zoe.turner3@nhs.net}, Apr-2020 from \url{https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales} +#' @source Collected by Zoë Turner, Apr-2020 from +#' \url{https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales} #' #' @usage data(ons_mortality) #' diff --git a/R/stranded_patient_model.R b/R/stranded_patient_model.R index 795ec48..8862224 100644 --- a/R/stranded_patient_model.R +++ b/R/stranded_patient_model.R @@ -1,6 +1,7 @@ #' Stranded Patient (Patients flagged as having a greater than 7 day LOS) Model #' -#' This model is to be used as a machine learning classification model, for supervised learning. The binary outcome is stranded vs not stranded patients. +#' This model is to be used as a machine learning classification model, for +#' supervised learning. The binary outcome is stranded vs not stranded patients. 
#' #' #' @docType data @@ -12,12 +13,16 @@ #' \item{stranded.label}{Outcome variable - whether the patient is stranded or not} #' \item{age}{Patient age on admission} #' \item{care.home.referral}{Whether than have been referred from a care home} -#' \item{medicallysafe}{Medically safe for discharge - means the patient is assessed as safe, but has not been discharged yet} -#' \item{hcop}{Indicates whether they have been triaged from a Health Care for Older People specialty} -#' \item{mental_health_care}{Flag to indicate whether they need mental health support and care} +#' \item{medicallysafe}{Medically safe for discharge - means the patient is +#' assessed as safe, but has not been discharged yet} +#' \item{hcop}{Indicates whether they have been triaged from a Health Care for +#' Older People specialty} +#' \item{mental_health_care}{Flag to indicate whether they need mental health +#' support and care} #' \item{periods_of_previous_care}{Count of the number of previous spells of care} #' \item{admit_date}{Date they were admitted to hospital} -#' \item{frailty_index}{An initial index assessment to say if the patient is frail or not. This is needed for alignment of service provision.} +#' \item{frailty_index}{An initial index assessment to say if the patient is +#' frail or not. This is needed for alignment of service provision.} #' } #' #' @source Synthetically generated by Gary Hutson, Mar-2021. @@ -25,10 +30,11 @@ #' @usage data(stranded_data) #' #' @examples -#' library(magrittr) #' library(dplyr) -#' data("stranded_data") -#' stranded_data %>% +#' +#' data(stranded_data) +#' +#' stranded_data |> #' glimpse() #' "stranded_data" diff --git a/R/synthetic_news_data.R b/R/synthetic_news_data.R index 558aa0e..962a77d 100644 --- a/R/synthetic_news_data.R +++ b/R/synthetic_news_data.R @@ -1,7 +1,8 @@ #' Synthetic National Early Warning Scores Data #' #' Synthetic NEWS data to show as the results of the NHSR_synpop package. 
-#' These datasets have been synthetically generated by this package to be utilised in the NHSRDatasets package. +#' These datasets have been synthetically generated by this package to be +#' utilised in the NHSRDatasets package. #' #' @docType data #' @@ -28,9 +29,11 @@ #' @usage data(synthetic_news_data) #' #' @examples -#' library(magrittr) +#' #' library(dplyr) +#' #' data("synthetic_news_data") -#' synthetic_news_data %>% +#' +#' synthetic_news_data |> #' glimpse() "synthetic_news_data" diff --git a/README.Rmd b/README.Rmd index 1d8c0d5..a083e50 100644 --- a/README.Rmd +++ b/README.Rmd @@ -11,10 +11,10 @@ knitr::opts_chunk$set( ) ``` -# NHS-R Community Datasets +# NHS-R Community Datasets -[![All Contributors](https://img.shields.io/badge/all_contributors-34-orange.svg?style=flat-square)](#contributors-) +[![All Contributors](https://img.shields.io/github/all-contributors/nhs-r-community/NHSRdatasets?color=ee8449&style=flat-square)](#contributors) @@ -50,8 +50,8 @@ To install the development version from [GitHub](https://github.com/) with: remotes::install_github("nhs-r-community/NHSRdatasets") ``` -Once installed go to the Getting Started article from the [website](https://nhs-r-community.github.io/NHSRdatasets) and the same link can -be found on the top right of the GitHub Repository. +Once installed go to the Get Started article from the [website](https://nhs-r-community.github.io/NHSRdatasets) (the same link can +be found on the top right of the GitHub Repository). 
## Datasets available diff --git a/README.md b/README.md index 14cd36c..be908d2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,11 @@ -# NHS-R Community Datasets +# NHS-R Community Datasets + + + +[![All +Contributors](https://img.shields.io/github/all-contributors/projectOwner/projectName?color=ee8449&style=flat-square)](#contributors) + diff --git a/data-raw/ons-mortality.R b/data-raw/ons-mortality.R index 552f351..65f8c81 100644 --- a/data-raw/ons-mortality.R +++ b/data-raw/ons-mortality.R @@ -15,77 +15,89 @@ download.file( "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2019/publishedweek522019.xls", destfile = "2019Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) # 2018 download.file( "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2018/publishedweek522018withupdatedrespiratoryrow.xls", destfile = "2018Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) # 2017 download.file( "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2017/publishedweek522017.xls", destfile = "2017Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) # 2016 download.file( "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2016/publishedweek522016.xls", destfile = "2016Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) # 2015 download.file( 
"https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2015/publishedweek2015.xls", destfile = "2015Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) # 2014 download.file( "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2014/publishedweek2014.xls", destfile = "2014Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) # 2013 download.file( "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2013/publishedweek2013.xls", destfile = "2013Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) # 2012 download.file( "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2012/publishedweek2012.xls", destfile = "2012Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) # 2011 download.file( "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2011/publishedweek2011.xls", destfile = "2011Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) # 2010 download.file( "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2010/publishedweek2010.xls", destfile = "2010Mortality.xls", method = "wininet", - mode = "wb") + mode = "wb" +) library(readxl) library(dplyr) -files_list <- list.files(path = "working_files", - pattern = "*.xls", - full.names = TRUE) +files_list <- list.files( + 
path = "working_files", + pattern = "*.xls", + full.names = TRUE +) read_then_csv <- function(sheet, path) { @@ -98,8 +110,7 @@ read_then_csv <- function(sheet, path) { } -for(j in 1:length(files_list)){ - +for (j in 1:length(files_list)) { path <- paste0(files_list[j]) path %>% @@ -111,13 +122,13 @@ for(j in 1:length(files_list)){ ## Loading the weekly figure worksheets -files_list_sheets <- list.files(path = "working_files", - pattern = "Weekly", - full.names = TRUE - ) - -for(i in files_list_sheets) { +files_list_sheets <- list.files( + path = "working_files", + pattern = "Weekly", + full.names = TRUE +) +for (i in files_list_sheets) { x <- read_csv((i), col_types = cols(.default = col_character())) assign(i, x) @@ -131,115 +142,127 @@ for(i in files_list_sheets) { # Column names that are not related to data points to be removed. This is the same for all years' spreadsheets. # Note that single quotes are used for the categories as one sentence includes '' in the text (4). -remove_lookup <- c('week over the previous five years1', - 'Deaths by underlying cause2,3', - 'Footnotes', - '1 This average is based on the actual number of death registrations recorded for each corresponding week over the previous five years. Moveable public holidays, when register offices are closed, affect the number of registrations made in the published weeks and in the corresponding weeks in previous years.', - '2 Counts of deaths by underlying cause exclude deaths at age under 28 days.', - '3 Coding of deaths by underlying cause for the latest week is not yet complete.', - "4Does not include deaths where age is either missing or not yet fully coded. For this reason counts of 'Persons', 'Males' and 'Females' may not sum to 'Total Deaths, all ages'.", - '5 Does not include deaths of those resident outside England and Wales or those records where the place of residence is either missing or not yet fully coded. 
For this reason counts for "Deaths by Region of usual residence" may not sum to "Total deaths, all ages".', - 'Source: Office for National Statistics', - 'Deaths by age group' +remove_lookup <- c( + "week over the previous five years1", + "Deaths by underlying cause2,3", + "Footnotes", + "1 This average is based on the actual number of death registrations recorded for each corresponding week over the previous five years. Moveable public holidays, when register offices are closed, affect the number of registrations made in the published weeks and in the corresponding weeks in previous years.", + "2 Counts of deaths by underlying cause exclude deaths at age under 28 days.", + "3 Coding of deaths by underlying cause for the latest week is not yet complete.", + "4Does not include deaths where age is either missing or not yet fully coded. For this reason counts of 'Persons', 'Males' and 'Females' may not sum to 'Total Deaths, all ages'.", + '5 Does not include deaths of those resident outside England and Wales or those records where the place of residence is either missing or not yet fully coded. 
For this reason counts for "Deaths by Region of usual residence" may not sum to "Total deaths, all ages".', + "Source: Office for National Statistics", + "Deaths by age group" ) -formatFunction <- function(file){ - +formatFunction <- function(file) { ONS <- file %>% - clean_names %>% - remove_empty(c("rows","cols")) %>% + clean_names() %>% + remove_empty(c("rows", "cols")) %>% filter(!contents %in% remove_lookup) %>% - # useful categories are found in the column contents but also include the footnote number - mutate(Category = case_when(is.na(x2) & str_detect(contents, "Region") ~ "Region", - is.na(x2) & str_detect(contents, "Persons") ~ "Persons", - is.na(x2) & str_detect(contents, "Females") ~ "Females", - is.na(x2) & str_detect(contents, "Males") ~ "Males") - ) %>% + mutate(Category = case_when( + is.na(x2) & str_detect(contents, "Region") ~ "Region", + is.na(x2) & str_detect(contents, "Persons") ~ "Persons", + is.na(x2) & str_detect(contents, "Females") ~ "Females", + is.na(x2) & str_detect(contents, "Males") ~ "Males" + )) %>% select(contents, Category, everything()) %>% - # to ensure data like Persons, Males and Females fill(Category) %>% - # categories with Persons, Males and Females in the original column do not correspond directly to data points (wide form data) so are removed by referring to str_detect to find the word - filter(!str_detect(contents, "Persons"), - !str_detect(contents, "Males"), - !str_detect(contents, "Females")) %>% - + filter( + !str_detect(contents, "Persons"), + !str_detect(contents, "Males"), + !str_detect(contents, "Females") + ) %>% # the two columns for Category and contents are merged to Categories to bring the Category column first. 
Some categories don't have subcategories and these are preceded by NA_ with this merge unite("Categories", Category, contents) %>% filter(Categories != "Region_Deaths by Region of usual residence 5") %>% - # the NA_ is removed from some of the category names - mutate(Categories = case_when(str_detect(Categories, "NA_") ~ str_replace(Categories, "NA_", ""), - TRUE ~ Categories)) + mutate(Categories = case_when( + str_detect(Categories, "NA_") ~ str_replace(Categories, "NA_", ""), + TRUE ~ Categories + )) # Push date row to column names onsFormattedJanitor <- row_to_names(ONS, 3) # move data from wide to long form using pivot_longer x <- onsFormattedJanitor %>% - pivot_longer(cols = -`Week ended`, - names_to = "allDates", - values_to = "counts") %>% - + pivot_longer( + cols = -`Week ended`, + names_to = "allDates", + values_to = "counts" + ) %>% # some spreadsheets import with Excel serial numbers for dates and others as dates, janitor is used to correct this - mutate(realDate = dmy(allDates), - ExcelSerialDate = case_when(stri_length(allDates) == 5 ~ excel_numeric_to_date(as.numeric(allDates), date_system = "modern")), - date = case_when(is.na(realDate) ~ ExcelSerialDate, - TRUE ~ realDate)) %>% + mutate( + realDate = dmy(allDates), + ExcelSerialDate = case_when(stri_length(allDates) == 5 ~ excel_numeric_to_date(as.numeric(allDates), date_system = "modern")), + date = case_when( + is.na(realDate) ~ ExcelSerialDate, + TRUE ~ realDate + ) + ) %>% group_by(`Week ended`) %>% - # the week number is replaced as this was lost with the moving of dates to the column headers mutate(week_no = row_number()) %>% ungroup() %>% - # Category is a staging name as this is followed by a splitting of the column into category_1 and category_2 rename(Category = `Week ended`) %>% - # to split the columns there are various characters used as a split point ",", "-", and ":" in the respiratory category the version is denoted by "v" - mutate(category_1 = case_when(str_detect(Category, ",") ~ 
- substr(Category,1,str_locate(Category, ",") -1), - str_detect(Category, ":") ~ - substr(Category,1,str_locate(Category, ":") -1), - str_detect(Category, "_") ~ - substr(Category,1,str_locate(Category, "_") -1), - str_detect(Category, "respiratory") ~ - "All respiratory diseases (ICD-10 J00-J99) ICD-10"), - category_2 = case_when(str_detect(Category, ",") ~ - substr(Category,str_locate(Category, ", ") +2, str_length(Category)), - str_detect(Category, ":") ~ - substr(Category,str_locate(Category, ": ") +2, str_length(Category)), - str_detect(Category, "_") ~ - substr(Category,str_locate(Category, "_") +1, str_length(Category)), - str_detect(Category, "respiratory") ~ - substr(Category,str_locate(Category, "v"), str_length(Category)) ), - - # the data for Total deaths: average of corresponding week over the previous 5 years is split over two cells in the spreasheet - category_2 = recode(category_2, - "average of corresponding" = "average of same week over 5 years")) %>% - select(category_1, - category_2, - counts, - date, - week_no + mutate( + category_1 = case_when( + str_detect(Category, ",") ~ + substr(Category, 1, str_locate(Category, ",") - 1), + str_detect(Category, ":") ~ + substr(Category, 1, str_locate(Category, ":") - 1), + str_detect(Category, "_") ~ + substr(Category, 1, str_locate(Category, "_") - 1), + str_detect(Category, "respiratory") ~ + "All respiratory diseases (ICD-10 J00-J99) ICD-10" + ), + category_2 = case_when( + str_detect(Category, ",") ~ + substr(Category, str_locate(Category, ", ") + 2, str_length(Category)), + str_detect(Category, ":") ~ + substr(Category, str_locate(Category, ": ") + 2, str_length(Category)), + str_detect(Category, "_") ~ + substr(Category, str_locate(Category, "_") + 1, str_length(Category)), + str_detect(Category, "respiratory") ~ + substr(Category, str_locate(Category, "v"), str_length(Category)) + ), + + # the data for Total deaths: average of corresponding week over the previous 5 years is split over two cells in 
the spreadsheet + category_2 = recode(category_2, + "average of corresponding" = "average of same week over 5 years" + ) + ) %>% + select( + category_1, + category_2, + counts, + date, + week_no ) %>% - # 2011 requires this code to remove rows where there are no counts and because there are 2 rows relating to respiratory death numbers (see the Respiratory section) the previous methodology has been included in the spreadsheet with : for counts. This code does not affect other years' data/ - filter(!is.na(counts), - counts != ":") %>% + filter( + !is.na(counts), + counts != ":" + ) %>% fill(category_1) return(x) - } Mortality2010 <- formatFunction(`working_files/Weekly/2010Mortality-Weekly Figures 2010.csv`) # 2011 has two lines relating to respiratory, v 2001 only has one data point and the rest of the year is 2010 Mortality2011 <- formatFunction(`working_files/Weekly/2011Mortality-Weekly Figures 2011.csv`) %>% - mutate(category_2 = case_when(is.na(category_2) & category_1 == "All respiratory diseases (ICD-10 J00-J99) ICD-10" ~ "v 2010", - TRUE ~ category_2)) + mutate(category_2 = case_when( + is.na(category_2) & category_1 == "All respiratory diseases (ICD-10 J00-J99) ICD-10" ~ "v 2010", + TRUE ~ category_2 + )) Mortality2012 <- formatFunction(`working_files/Weekly/2012Mortality-Weekly Figures 2012.csv`) Mortality2013 <- formatFunction(`working_files/Weekly/2013Mortality-Weekly Figures 2013.csv`) @@ -249,79 +272,96 @@ Mortality2015 <- formatFunction(`working_files/Weekly/2015Mortality-Weekly Figur #### Format data 2016 - 2019 -formatFunction2016 <- function(file){ - +formatFunction2016 <- function(file) { ONS <- file %>% - clean_names %>% - + clean_names() %>% # An extra column has been added for region codes (not included in the dataset) meaning contents are now found in the janitor generated column name x2 - mutate(x2 = case_when(is.na(x2) ~ contents, - TRUE ~ x2)) %>% - remove_empty(c("rows","cols")) %>% + mutate(x2 = case_when( + is.na(x2) ~ contents, + TRUE ~ 
x2 + )) %>% + remove_empty(c("rows", "cols")) %>% select(-contents) %>% filter(!x2 %in% remove_lookup) %>% - # Region has changed to region - mutate(Category = case_when(is.na(x3) & str_detect(x2, "region") ~ "Region", - is.na(x3) & str_detect(x2, "Persons") ~ "Persons", - is.na(x3) & str_detect(x2, "Females") ~ "Females", - is.na(x3) & str_detect(x2, "Males") ~ "Males", - TRUE ~ NA_character_) - ) %>% + mutate(Category = case_when( + is.na(x3) & str_detect(x2, "region") ~ "Region", + is.na(x3) & str_detect(x2, "Persons") ~ "Persons", + is.na(x3) & str_detect(x2, "Females") ~ "Females", + is.na(x3) & str_detect(x2, "Males") ~ "Males", + TRUE ~ NA_character_ + )) %>% select(x2, Category, everything()) %>% fill(Category) %>% - filter(!str_detect(x2, 'Persons'), - !str_detect(x2, 'Males'), - !str_detect(x2, 'Females')) %>% + filter( + !str_detect(x2, "Persons"), + !str_detect(x2, "Males"), + !str_detect(x2, "Females") + ) %>% unite("Categories", Category, x2) %>% - filter(Categories != 'Region_Deaths by Region of usual residence 5') %>% - mutate(Categories = case_when(str_detect(Categories, "NA_") ~ str_replace(Categories, "NA_", ""), - TRUE ~ Categories)) + filter(Categories != "Region_Deaths by Region of usual residence 5") %>% + mutate(Categories = case_when( + str_detect(Categories, "NA_") ~ str_replace(Categories, "NA_", ""), + TRUE ~ Categories + )) # Push date row to column names onsFormattedJanitor <- row_to_names(ONS, 3) # move data from wide to long form using pivot_longer x <- onsFormattedJanitor %>% - pivot_longer(cols = -`Week ended`, - names_to = "allDates", - values_to = "counts") %>% - mutate(realDate = dmy(allDates), - ExcelSerialDate = case_when(stri_length(allDates) == 5 ~ excel_numeric_to_date(as.numeric(allDates), date_system = "modern")), - date = case_when(is.na(realDate) ~ ExcelSerialDate, - TRUE ~ realDate)) %>% + pivot_longer( + cols = -`Week ended`, + names_to = "allDates", + values_to = "counts" + ) %>% + mutate( + realDate = dmy(allDates), 
+ ExcelSerialDate = case_when(stri_length(allDates) == 5 ~ excel_numeric_to_date(as.numeric(allDates), date_system = "modern")), + date = case_when( + is.na(realDate) ~ ExcelSerialDate, + TRUE ~ realDate + ) + ) %>% group_by(`Week ended`) %>% mutate(week_no = row_number()) %>% ungroup() %>% rename(Category = `Week ended`) %>% - mutate(category_1 = case_when(str_detect(Category, ",") ~ - substr(Category,1,str_locate(Category, ",") -1), - str_detect(Category, ":") ~ - substr(Category,1,str_locate(Category, ":") -1), - str_detect(Category, "_") ~ - substr(Category,1,str_locate(Category, "_") -1), - str_detect(Category, "respiratory") ~ - "All respiratory diseases (ICD-10 J00-J99) ICD-10"), - category_2 = case_when(str_detect(Category, ",") ~ - substr(Category,str_locate(Category, ", ") +2, str_length(Category)), - str_detect(Category, ":") ~ - substr(Category,str_locate(Category, ": ") +2, str_length(Category)), - str_detect(Category, "_") ~ - substr(Category,str_locate(Category, "_") +1, str_length(Category)), - str_detect(Category, "respiratory") ~ - substr(Category,str_locate(Category, "v"), str_length(Category)) ), - category_2 = recode(category_2, - "average of corresponding" = "average of same week over 5 years")) %>% - select(category_1, - category_2, - counts, - date, - week_no + mutate( + category_1 = case_when( + str_detect(Category, ",") ~ + substr(Category, 1, str_locate(Category, ",") - 1), + str_detect(Category, ":") ~ + substr(Category, 1, str_locate(Category, ":") - 1), + str_detect(Category, "_") ~ + substr(Category, 1, str_locate(Category, "_") - 1), + str_detect(Category, "respiratory") ~ + "All respiratory diseases (ICD-10 J00-J99) ICD-10" + ), + category_2 = case_when( + str_detect(Category, ",") ~ + substr(Category, str_locate(Category, ", ") + 2, str_length(Category)), + str_detect(Category, ":") ~ + substr(Category, str_locate(Category, ": ") + 2, str_length(Category)), + str_detect(Category, "_") ~ + substr(Category, str_locate(Category, "_") 
+ 1, str_length(Category)), + str_detect(Category, "respiratory") ~ + substr(Category, str_locate(Category, "v"), str_length(Category)) + ), + category_2 = recode(category_2, + "average of corresponding" = "average of same week over 5 years" + ) + ) %>% + select( + category_1, + category_2, + counts, + date, + week_no ) %>% filter(!is.na(counts)) return(x) - } @@ -343,7 +383,8 @@ ons_mortality <- do.call("rbind", list( Mortality2016, Mortality2017, Mortality2018, - Mortality2019)) + Mortality2019 +)) ## Load the data @@ -356,13 +397,13 @@ ons_mortality %>% mutate(year = year(date)) %>% select(year, category_1, category_2) %>% group_by(year, category_1, category_2) %>% - filter(category_1 == 'All respiratory diseases (ICD-10 J00-J99) ICD-10') %>% + filter(category_1 == "All respiratory diseases (ICD-10 J00-J99) ICD-10") %>% slice(1) ons_mortality %>% select(category_1, category_2) %>% group_by(category_1, category_2) %>% - filter(category_1 %in% c('Persons','Females','Males')) %>% + filter(category_1 %in% c("Persons", "Females", "Males")) %>% slice(1) unlink("working_files", recursive = TRUE) diff --git a/man/LOS_model.Rd b/man/LOS_model.Rd index fda7dad..fafaf4e 100644 --- a/man/LOS_model.Rd +++ b/man/LOS_model.Rd @@ -15,7 +15,7 @@ Data frame with five columns } } \source{ -Generated by Chris Mainey \email{chris.mainey@uhb.nhs.uk}, Feb-2019 +Generated by Chris Mainey, Feb-2019 } \usage{ data(LOS_model) diff --git a/man/ae_attendances.Rd b/man/ae_attendances.Rd index 3dcc5ee..44a2a5b 100644 --- a/man/ae_attendances.Rd +++ b/man/ae_attendances.Rd @@ -8,12 +8,18 @@ Tibble with six columns \describe{ \item{period}{The month that this data relates to} -\item{org_code}{The \href{https://digital.nhs.uk/services/organisation-data-service}{ODS} code for this provider} -\item{type}{The \href{https://web.archive.org/web/20200128111444/https://www.datadictionary.nhs.uk/data_dictionary/attributes/a/acc/accident_and_emergency_department_type_de.asp}{department type}. 
+\item{org_code}{The +\href{https://digital.nhs.uk/services/organisation-data-service}{ODS} +code for this provider} +\item{type}{The +\href{https://web.archive.org/web/20200128111444/https://www.datadictionary.nhs.uk/data_dictionary/attributes/a/acc/accident_and_emergency_department_type_de.asp}{department type}. either 1, 2 or other} -\item{attendances}{the number of patients who attended this department in this month} -\item{breaches}{the number of patients who breaches the 4 hour target in this month} -\item{admissions}{the number of patients admitted from A&E to the hospital in this month} +\item{attendances}{the number of patients who attended this department in +this month} +\item{breaches}{the number of patients who breaches the 4 hour target in +this month} +\item{admissions}{the number of patients admitted from A&E to the hospital +in this month} } } \source{ @@ -28,11 +34,14 @@ in England for the years 2016/17 through 2018/19 (Apr-Mar). The data has been tidied to be easily usable within the tidyverse of packages. 
} \details{ -Data sourced from \href{https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/}{NHS England Statistical Work Areas} -which is available under the \href{https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/}{Open Government Licence v3.0} +Data sourced from +\href{https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/}{NHS England Statistical Work Areas} +which is available under the +\href{https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/}{Open Government Licence v3.0} } \examples{ data(ae_attendances) + library(dplyr) library(ggplot2) library(scales) diff --git a/man/covid19.Rd b/man/covid19.Rd index 7b360eb..a20523c 100644 --- a/man/covid19.Rd +++ b/man/covid19.Rd @@ -47,15 +47,21 @@ library(ggplot2) library(scales) # Create a plot of the performance for England over time -covid19 \%>\% +covid19 |> filter(countries_and_territories == - c("United_Kingdom", "Italy", "France", "Germany", "Spain")) \%>\% - ggplot(aes(x = date_reported, y = cases, col = countries_and_territories)) + + c("United_Kingdom", "Italy", "France", "Germany", "Spain")) |> + ggplot(aes( + x = date_reported, + y = cases, + col = countries_and_territories + )) + geom_line() + scale_color_discrete("Country") + scale_y_continuous(labels = comma) + labs( - y = "Cases", x = "Date", title = "Covid-19 cases for selected countries", + y = "Cases", + x = "Date", + title = "Covid-19 cases for selected countries", alt = "A plot of covid-19 cases in France, Germany, Italy, Spain & the UK" ) + theme_minimal() diff --git a/man/ons_mortality.Rd b/man/ons_mortality.Rd index 5d6de74..7859e0e 100644 --- a/man/ons_mortality.Rd +++ b/man/ons_mortality.Rd @@ -7,26 +7,35 @@ \format{ Data frame with five columns \describe{ -\item{category_1}{character, containing the names of the groups for counts, for example "Total deaths", "all ages".} -\item{category_2}{character, subcategory of names 
of groups where necessary, for example details of region: "East", details of age bands "15-44".} -\item{counts}{numeric, numbers of deaths in whole numbers and average numbers with decimal points. To retain the integrity of the format this column data is left as character.} +\item{category_1}{character, containing the names of the groups for counts, +for example "Total deaths", "all ages".} +\item{category_2}{character, subcategory of names of groups where necessary, +for example details of region: "East", details of age bands "15-44".} +\item{counts}{numeric, numbers of deaths in whole numbers and average numbers +with decimal points. To retain the integrity of the format this column data +is left as character.} \item{date}{date, format is yyyy-mm-dd; all dates are a Friday.} \item{week_no}{integer, each week in a year is numbered sequentially.} } } \source{ -Collected by Zoë Turner \email{zoe.turner3@nhs.net}, Apr-2020 from \url{https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales} +Collected by Zoë Turner, Apr-2020 from +\url{https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales} } \usage{ data(ons_mortality) } \description{ -Provisional counts of the number of deaths registered in England and Wales, by age, sex and region, in the latest weeks for which data are available. +Provisional counts of the number of deaths registered in England and Wales, +by age, sex and region, from week commencing 8th January 2010 to +3rd April 2020. 
} \details{ Source and licence acknowledgement -This data has been made available through Office of National Statistics under the Open Government -Licence \url{http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/} + +This data has been made available through Office of National Statistics under +the Open Government Licence +\url{http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/} } \examples{ data(ons_mortality) diff --git a/man/stranded_data.Rd b/man/stranded_data.Rd index f39b016..74b108a 100644 --- a/man/stranded_data.Rd +++ b/man/stranded_data.Rd @@ -10,12 +10,16 @@ Tibble with nine columns (1 x outcome and 8 predictors) \item{stranded.label}{Outcome variable - whether the patient is stranded or not} \item{age}{Patient age on admission} \item{care.home.referral}{Whether than have been referred from a care home} -\item{medicallysafe}{Medically safe for discharge - means the patient is assessed as safe, but has not been discharged yet} -\item{hcop}{Indicates whether they have been triaged from a Health Care for Older People specialty} -\item{mental_health_care}{Flag to indicate whether they need mental health support and care} +\item{medicallysafe}{Medically safe for discharge - means the patient is +assessed as safe, but has not been discharged yet} +\item{hcop}{Indicates whether they have been triaged from a Health Care for +Older People specialty} +\item{mental_health_care}{Flag to indicate whether they need mental health +support and care} \item{periods_of_previous_care}{Count of the number of previous spells of care} \item{admit_date}{Date they were admitted to hospital} -\item{frailty_index}{An initial index assessment to say if the patient is frail or not. This is needed for alignment of service provision.} +\item{frailty_index}{An initial index assessment to say if the patient is +frail or not. 
This is needed for alignment of service provision.} } } \source{ @@ -25,13 +29,15 @@ Synthetically generated by Gary Hutson, Mar-2021. data(stranded_data) } \description{ -This model is to be used as a machine learning classification model, for supervised learning. The binary outcome is stranded vs not stranded patients. +This model is to be used as a machine learning classification model, for +supervised learning. The binary outcome is stranded vs not stranded patients. } \examples{ -library(magrittr) library(dplyr) -data("stranded_data") -stranded_data \%>\% + +data(stranded_data) + +stranded_data |> glimpse() } diff --git a/man/synthetic_news_data.Rd b/man/synthetic_news_data.Rd index a04aa89..55bb05b 100644 --- a/man/synthetic_news_data.Rd +++ b/man/synthetic_news_data.Rd @@ -29,13 +29,16 @@ data(synthetic_news_data) } \description{ Synthetic NEWS data to show as the results of the NHSR_synpop package. -These datasets have been synthetically generated by this package to be utilised in the NHSRDatasets package. +These datasets have been synthetically generated by this package to be +utilised in the NHSRDatasets package. 
} \examples{ -library(magrittr) + library(dplyr) + data("synthetic_news_data") -synthetic_news_data \%>\% + +synthetic_news_data |> glimpse() } \keyword{datasets} diff --git a/vignettes/NHSRdatasets.Rmd b/vignettes/NHSRdatasets.Rmd index f619137..6dbd943 100644 --- a/vignettes/NHSRdatasets.Rmd +++ b/vignettes/NHSRdatasets.Rmd @@ -27,7 +27,7 @@ library(NHSRdatasets) To view the datasets (which is useful as CRAN and the GitHub versions may differ), type: -```{r} +``` NHSRdatasets:: ``` @@ -66,7 +66,7 @@ Tibbles in an RMarkdown or Quarto output, like this vignette, will print every r To restrict the number of rows in an RMarkdown output like html the code will have to be: ```{r eval=TRUE} -NHSRdatasets::ae_attendances |> +NHSRdatasets::ae_attendances |> head(n = 2) ``` @@ -84,6 +84,13 @@ dat <- NHSRdatasets::ae_attendances A new object called `dat` will appear in the Environment tab of the top right pane of RStudio (if you have the default layout). The object also says it has 12765 obs (which are rows) and 6 variables (which are columns). + +To see a similar summary in the RStudio Console using code, type: + +```{r} +glimpse(dat) +``` + Clicking on the blue circle with a white arrow next to the word `dat` will expand the view in the Environment tab and clicking on the word `dat` will open the data in a new tab in the top left panel. If you are not using RStudio and want to use the Console where code is run directly use: