diff --git a/R/AugmentConceptFiles.R b/R/AugmentConceptFiles.R index ccbd656..50139c5 100644 --- a/R/AugmentConceptFiles.R +++ b/R/AugmentConceptFiles.R @@ -36,7 +36,8 @@ augmentConceptFiles <- function(releaseFolder) { if (file.exists(dataQualityResultsFile)) { writeLines("updating concept files with data quality results") dataQualityResults <- jsonlite::fromJSON(dataQualityResultsFile) - results <- dataQualityResults$CheckResults + results <- dataQualityResults$CheckResults %>% + dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper) # augment achilles concept files with data quality failure count for relevant concept checks conceptAggregates <- results %>% filter(!is.na(results$CONCEPT_ID) && results$FAILED==1) %>% count(CONCEPT_ID,tolower(CDM_TABLE_NAME)) diff --git a/R/BuildDataQualityHistoryIndex.R b/R/BuildDataQualityHistoryIndex.R index 9a3642d..d2ff9db 100644 --- a/R/BuildDataQualityHistoryIndex.R +++ b/R/BuildDataQualityHistoryIndex.R @@ -32,16 +32,19 @@ buildDataQualityHistoryIndex <- stratified_index <- data.table::data.table() addResultsToIndex <- function(json) { - cdm_source_name <- json$Metadata[1,"CDM_SOURCE_NAME"] - cdm_source_abbreviation <- json$Metadata[1,"CDM_SOURCE_ABBREVIATION"] - vocabulary_version <- json$Metadata[1,"VOCABULARY_VERSION"] - cdm_release_date <- format(lubridate::ymd(json$Metadata[1,"CDM_RELEASE_DATE"]),"%Y-%m-%d") + thisMetadata <- json$Metadata %>% + dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper) + cdm_source_name <- thisMetadata[1,"CDM_SOURCE_NAME"] + cdm_source_abbreviation <- thisMetadata[1,"CDM_SOURCE_ABBREVIATION"] + vocabulary_version <- thisMetadata[1,"VOCABULARY_VERSION"] + cdm_release_date <- format(lubridate::ymd(thisMetadata[1,"CDM_RELEASE_DATE"]),"%Y-%m-%d") count_passed <- as.numeric(json$Overview$countPassed) count_failed <- as.numeric(json$Overview$countOverallFailed) count_total <- count_passed + count_failed dqd_execution_date <- format(lubridate::ymd_hms(json$endTimestamp),"%Y-%m-%d") stratifiedAggregates <- json$CheckResults %>% + dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper) %>% filter(FAILED==1) %>% group_by(CATEGORY, toupper(CDM_TABLE_NAME)) %>% summarise(count_value=n()) diff --git a/R/BuildDataQualityIndex.R b/R/BuildDataQualityIndex.R index 0c4181c..dbef3c7 100644 --- a/R/BuildDataQualityIndex.R +++ b/R/BuildDataQualityIndex.R @@ -38,7 +38,7 @@ buildDataQualityIndex <- function(sourceFolders, outputFolder) { # iterate on sources for (sourceFolder in sourceFolders) { - historicalIndex <- AresIndexer::buildDataQualityHistoryIndex(sourceFolder) + historicalIndex <- buildDataQualityHistoryIndex(sourceFolder) historicalFile <- file.path(sourceFolder, "data-quality-index.json") write(jsonlite::toJSON(historicalIndex),historicalFile) @@ -51,7 +51,11 @@ buildDataQualityIndex <- function(sourceFolders, outputFolder) { # process each data quality result file if (file.exists(dataQualityResultsFile)) { dataQualityResults <- jsonlite::fromJSON(dataQualityResultsFile) - results <- dataQualityResults$CheckResults + results <- dataQualityResults$CheckResults %>% + dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper) + + metadata <- dataQualityResults$Metadata %>% + dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper) # for each release, generate a summary of failures by cdm_table_name domainAggregates <- results %>% filter(FAILED==1) %>% count(tolower(CDM_TABLE_NAME)) @@ -66,11 +70,11 @@ buildDataQualityIndex <- function(sourceFolders, outputFolder) { results[,colName] <- NA } sourceFailures <- results[results[,"FAILED"]==1,outColNames] - sourceFailures$CDM_SOURCE_NAME <- dataQualityResults$Metadata$CDM_SOURCE_NAME - sourceFailures$CDM_SOURCE_ABBREVIATION <- dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION - sourceFailures$CDM_SOURCE_KEY <- gsub(" ","_",dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION) - sourceFailures$RELEASE_NAME <- format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y-%m-%d") - sourceFailures$RELEASE_ID <- format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y%m%d") + sourceFailures$CDM_SOURCE_NAME <- metadata$CDM_SOURCE_NAME + sourceFailures$CDM_SOURCE_ABBREVIATION <- metadata$CDM_SOURCE_ABBREVIATION + sourceFailures$CDM_SOURCE_KEY <- gsub(" ", "_", metadata$CDM_SOURCE_ABBREVIATION) + sourceFailures$RELEASE_NAME <- format(lubridate::ymd(metadata$CDM_RELEASE_DATE),"%Y-%m-%d") + sourceFailures$RELEASE_ID <- format(lubridate::ymd(metadata$CDM_RELEASE_DATE),"%Y%m%d") networkIndex <- rbind(networkIndex, sourceFailures) } else { writeLines(paste("missing data quality result file ",dataQualityResultsFile)) diff --git a/R/BuildNetworkIndex.R b/R/BuildNetworkIndex.R index c047532..e61abb2 100644 --- a/R/BuildNetworkIndex.R +++ b/R/BuildNetworkIndex.R @@ -94,20 +94,23 @@ buildNetworkIndex <- function(sourceFolders, outputFolder) { writeLines(paste("missing observation period results file ", observationPeriodResultsFile)) } - source$cdm_source_name <- dataQualityResults$Metadata$CDM_SOURCE_NAME - source$cdm_source_abbreviation <- dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION + thisMetadata <- dataQualityResults$Metadata %>% + dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper) + + source$cdm_source_name <- thisMetadata$CDM_SOURCE_NAME + source$cdm_source_abbreviation <- thisMetadata$CDM_SOURCE_ABBREVIATION source$cdm_source_key <- gsub(" ", "_", source$cdm_source_abbreviation) - source$cdm_holder <- dataQualityResults$Metadata$CDM_HOLDER - source$source_description <- dataQualityResults$Metadata$SOURCE_DESCRIPTION + source$cdm_holder <- thisMetadata$CDM_HOLDER + source$source_description <- thisMetadata$SOURCE_DESCRIPTION source$releases <- rbind( source$releases, list( - release_name = format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y-%m-%d"), - release_id = format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y%m%d"), - cdm_version = dataQualityResults$Metadata$CDM_VERSION, - vocabulary_version = dataQualityResults$Metadata$VOCABULARY_VERSION, - dqd_version = dataQualityResults$Metadata$DQD_VERSION, + release_name = format(lubridate::ymd(thisMetadata$CDM_RELEASE_DATE),"%Y-%m-%d"), + release_id = format(lubridate::ymd(thisMetadata$CDM_RELEASE_DATE),"%Y%m%d"), + cdm_version = thisMetadata$CDM_VERSION, + vocabulary_version = thisMetadata$VOCABULARY_VERSION, + dqd_version = thisMetadata$DQD_VERSION, count_data_quality_issues = dataQualityResults$Overview$countOverallFailed, count_data_quality_checks = dataQualityResults$Overview$countTotal, dqd_execution_date = format(lubridate::ymd_hms(dataQualityResults$endTimestamp),"%Y-%m-%d"), diff --git a/R/BuildNetworkPerformanceIndex.R b/R/BuildNetworkPerformanceIndex.R index aefa128..70e3fa7 100644 --- a/R/BuildNetworkPerformanceIndex.R +++ b/R/BuildNetworkPerformanceIndex.R @@ -65,9 +65,12 @@ buildNetworkPerformanceIndex <- performanceTable <- merge(x=performanceTable,y=analysisDetails,by="TASK",all.x=TRUE) - dqdTable <- dplyr::select(dqdData, c("CheckResults.checkId", "CheckResults.EXECUTION_TIME", "CheckResults.CATEGORY")) %>% - rename(TASK = CheckResults.checkId, TIMING = CheckResults.EXECUTION_TIME, CATEGORY = CheckResults.CATEGORY) %>% mutate(PACKAGE = "DQD") %>% - mutate_at("TIMING", str_replace, " secs", "") + dqdTable <- dqdData %>% + dplyr::select(TASK = CheckResults.checkId, + TIMING = CheckResults.executionTime, + CATEGORY = CheckResults.category) %>% + dplyr::mutate(PACKAGE = "DQD") %>% + dplyr::mutate_at("TIMING", str_replace, " secs", "") mergedTable <- rbind(performanceTable, dqdTable)