From 05e79a3adfe4a210c8a4bdc9d323a165bde79046 Mon Sep 17 00:00:00 2001 From: Kyrylo Simonov Date: Wed, 26 Apr 2023 11:23:55 -0500 Subject: [PATCH] Workaround for changed column names in DQD 2.1 See issue OHDSI/AresIndexer#30. The fix is borrowed from https://github.com/OHDSI/AresIndexer/pull/35. --- R/AugmentConceptFiles.R | 2 +- R/BuildDataQualityHistoryIndex.R | 12 ++++++------ R/BuildDataQualityIndex.R | 23 ++++++++++++++++------- R/BuildNetworkIndex.R | 18 +++++++++--------- R/BuildNetworkPerformanceIndex.R | 4 ++-- 5 files changed, 34 insertions(+), 25 deletions(-) diff --git a/R/AugmentConceptFiles.R b/R/AugmentConceptFiles.R index ccbd656..81357d5 100644 --- a/R/AugmentConceptFiles.R +++ b/R/AugmentConceptFiles.R @@ -39,7 +39,7 @@ augmentConceptFiles <- function(releaseFolder) { results <- dataQualityResults$CheckResults # augment achilles concept files with data quality failure count for relevant concept checks - conceptAggregates <- results %>% filter(!is.na(results$CONCEPT_ID) && results$FAILED==1) %>% count(CONCEPT_ID,tolower(CDM_TABLE_NAME)) + conceptAggregates <- results %>% filter(!is.na(results$conceptId) & results$failed==1) %>% count(conceptId,tolower(cdmTableName)) names(conceptAggregates) <- c("concept_id","cdm_table_name", "count_failed") writeLines(paste0(nrow(conceptAggregates), " concept level data quality issues found.")) if (nrow(conceptAggregates) > 0) { diff --git a/R/BuildDataQualityHistoryIndex.R b/R/BuildDataQualityHistoryIndex.R index 9a3642d..f17a15b 100644 --- a/R/BuildDataQualityHistoryIndex.R +++ b/R/BuildDataQualityHistoryIndex.R @@ -32,18 +32,18 @@ buildDataQualityHistoryIndex <- stratified_index <- data.table::data.table() addResultsToIndex <- function(json) { - cdm_source_name <- json$Metadata[1,"CDM_SOURCE_NAME"] - cdm_source_abbreviation <- json$Metadata[1,"CDM_SOURCE_ABBREVIATION"] - vocabulary_version <- json$Metadata[1,"VOCABULARY_VERSION"] - cdm_release_date <- 
format(lubridate::ymd(json$Metadata[1,"CDM_RELEASE_DATE"]),"%Y-%m-%d") + cdm_source_name <- json$Metadata[1,"cdmSourceName"] + cdm_source_abbreviation <- json$Metadata[1,"cdmSourceAbbreviation"] + vocabulary_version <- json$Metadata[1,"vocabularyVersion"] + cdm_release_date <- format(lubridate::ymd(json$Metadata[1,"cdmReleaseDate"]),"%Y-%m-%d") count_passed <- as.numeric(json$Overview$countPassed) count_failed <- as.numeric(json$Overview$countOverallFailed) count_total <- count_passed + count_failed dqd_execution_date <- format(lubridate::ymd_hms(json$endTimestamp),"%Y-%m-%d") stratifiedAggregates <- json$CheckResults %>% - filter(FAILED==1) %>% - group_by(CATEGORY, toupper(CDM_TABLE_NAME)) %>% + filter(failed==1) %>% + group_by(category, toupper(cdmTableName)) %>% summarise(count_value=n()) names(stratifiedAggregates) <- c("category", "cdm_table_name", "count_value") stratifiedAggregates$dqd_execution_date <- dqd_execution_date diff --git a/R/BuildDataQualityIndex.R b/R/BuildDataQualityIndex.R index 0c4181c..268b365 100644 --- a/R/BuildDataQualityIndex.R +++ b/R/BuildDataQualityIndex.R @@ -54,23 +54,32 @@ buildDataQualityIndex <- function(sourceFolders, outputFolder) { results <- dataQualityResults$CheckResults # for each release, generate a summary of failures by cdm_table_name - domainAggregates <- results %>% filter(FAILED==1) %>% count(tolower(CDM_TABLE_NAME)) + domainAggregates <- results %>% filter(failed==1) %>% count(tolower(cdmTableName)) names(domainAggregates) <- c("cdm_table_name", "count_failed") data.table::fwrite(domainAggregates, file.path(releaseFolder,"domain-issues.csv")) # collect all failures from this result file for network analysis + results$CHECK_NAME <- results$checkName + results$CHECK_LEVEL <- results$checkLevel + results$CDM_TABLE_NAME <- results$cdmTableName + results$CATEGORY <- results$category + results$SUBCATEGORY <- results$subcategory + results$CONTEXT <- results$context + results$CDM_FIELD_NAME <- results$cdmFieldName + 
results$CONCEPT_ID <- results$conceptId + results$UNIT_CONCEPT_ID <- results$unitConceptId outColNames <- c("CHECK_NAME", "CHECK_LEVEL", "CDM_TABLE_NAME", "CATEGORY", "SUBCATEGORY", "CONTEXT", "CDM_FIELD_NAME", "CONCEPT_ID", "UNIT_CONCEPT_ID") missingColNames <- setdiff(outColNames, names(results)) for (colName in missingColNames) { writeLines(paste0("Expected column is missing in DQD results. Adding column with NA values: ", colName)) results[,colName] <- NA } - sourceFailures <- results[results[,"FAILED"]==1,outColNames] - sourceFailures$CDM_SOURCE_NAME <- dataQualityResults$Metadata$CDM_SOURCE_NAME - sourceFailures$CDM_SOURCE_ABBREVIATION <- dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION - sourceFailures$CDM_SOURCE_KEY <- gsub(" ","_",dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION) - sourceFailures$RELEASE_NAME <- format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y-%m-%d") - sourceFailures$RELEASE_ID <- format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y%m%d") + sourceFailures <- results[results[,"failed"]==1,outColNames] + sourceFailures$CDM_SOURCE_NAME <- dataQualityResults$Metadata$cdmSourceName + sourceFailures$CDM_SOURCE_ABBREVIATION <- dataQualityResults$Metadata$cdmSourceAbbreviation + sourceFailures$CDM_SOURCE_KEY <- gsub(" ","_",dataQualityResults$Metadata$cdmSourceAbbreviation) + sourceFailures$RELEASE_NAME <- format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y-%m-%d") + sourceFailures$RELEASE_ID <- format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y%m%d") networkIndex <- rbind(networkIndex, sourceFailures) } else { writeLines(paste("missing data quality result file ",dataQualityResultsFile)) diff --git a/R/BuildNetworkIndex.R b/R/BuildNetworkIndex.R index c047532..fba3694 100644 --- a/R/BuildNetworkIndex.R +++ b/R/BuildNetworkIndex.R @@ -94,20 +94,20 @@ buildNetworkIndex <- function(sourceFolders, outputFolder) { writeLines(paste("missing observation period results 
file ", observationPeriodResultsFile)) } - source$cdm_source_name <- dataQualityResults$Metadata$CDM_SOURCE_NAME - source$cdm_source_abbreviation <- dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION + source$cdm_source_name <- dataQualityResults$Metadata$cdmSourceName + source$cdm_source_abbreviation <- dataQualityResults$Metadata$cdmSourceAbbreviation source$cdm_source_key <- gsub(" ", "_", source$cdm_source_abbreviation) - source$cdm_holder <- dataQualityResults$Metadata$CDM_HOLDER - source$source_description <- dataQualityResults$Metadata$SOURCE_DESCRIPTION + source$cdm_holder <- dataQualityResults$Metadata$cdmHolder + source$source_description <- dataQualityResults$Metadata$sourceDescription source$releases <- rbind( source$releases, list( - release_name = format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y-%m-%d"), - release_id = format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y%m%d"), - cdm_version = dataQualityResults$Metadata$CDM_VERSION, - vocabulary_version = dataQualityResults$Metadata$VOCABULARY_VERSION, - dqd_version = dataQualityResults$Metadata$DQD_VERSION, + release_name = format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y-%m-%d"), + release_id = format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y%m%d"), + cdm_version = dataQualityResults$Metadata$cdmVersion, + vocabulary_version = dataQualityResults$Metadata$vocabularyVersion, + dqd_version = dataQualityResults$Metadata$dqdVersion, count_data_quality_issues = dataQualityResults$Overview$countOverallFailed, count_data_quality_checks = dataQualityResults$Overview$countTotal, dqd_execution_date = format(lubridate::ymd_hms(dataQualityResults$endTimestamp),"%Y-%m-%d"), diff --git a/R/BuildNetworkPerformanceIndex.R b/R/BuildNetworkPerformanceIndex.R index aefa128..04d0efe 100644 --- a/R/BuildNetworkPerformanceIndex.R +++ b/R/BuildNetworkPerformanceIndex.R @@ -65,8 +65,8 @@ buildNetworkPerformanceIndex <- performanceTable <- 
merge(x=performanceTable,y=analysisDetails,by="TASK",all.x=TRUE) - dqdTable <- dplyr::select(dqdData, c("CheckResults.checkId", "CheckResults.EXECUTION_TIME", "CheckResults.CATEGORY")) %>% - rename(TASK = CheckResults.checkId, TIMING = CheckResults.EXECUTION_TIME, CATEGORY = CheckResults.CATEGORY) %>% mutate(PACKAGE = "DQD") %>% + dqdTable <- dplyr::select(dqdData, c("CheckResults.checkId", "CheckResults.executionTime", "CheckResults.category")) %>% + rename(TASK = CheckResults.checkId, TIMING = CheckResults.executionTime, CATEGORY = CheckResults.category) %>% mutate(PACKAGE = "DQD") %>% mutate_at("TIMING", str_replace, " secs", "") mergedTable <- rbind(performanceTable, dqdTable)