diff --git a/DESCRIPTION b/DESCRIPTION index 179b382..1c55d86 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,6 @@ Depends: Imports: Andromeda, DatabaseConnector (>= 6.3.1), - digest, FeatureExtraction (>= 3.5.0), SqlRender (>= 1.9.0), ParallelLogger (>= 3.0.0), @@ -29,7 +28,6 @@ Imports: rlang Suggests: devtools, - formatR, testthat, Eunomia, kableExtra, diff --git a/NAMESPACE b/NAMESPACE index 1ef9149..2e3185b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand -export(computeAggregateCovariateAnalyses) +export(cleanIncremental) +export(cleanNonIncremental) export(computeDechallengeRechallengeAnalyses) export(computeRechallengeFailCaseSeriesAnalyses) export(computeTimeToEventAnalyses) diff --git a/R/AggregateCovariates.R b/R/AggregateCovariates.R index 2525f9b..dbfbe0c 100644 --- a/R/AggregateCovariates.R +++ b/R/AggregateCovariates.R @@ -25,6 +25,7 @@ #' @param caseCovariateSettings An object created using \code{createDuringCovariateSettings} #' @param casePreTargetDuration The number of days prior to case index we use for FeatureExtraction #' @param casePostOutcomeDuration The number of days prior to case index we use for FeatureExtraction +#' @param extractNonCaseCovariates Whether to extract aggregate covariates and counts for patients in the targets and outcomes in addition to the cases #' #' @return #' A list with the settings @@ -79,7 +80,8 @@ createAggregateCovariateSettings <- function( useVisitConceptCountDuring = T ), casePreTargetDuration = 365, - casePostOutcomeDuration = 365 + casePostOutcomeDuration = 365, + extractNonCaseCovariates = T ) { errorMessages <- checkmate::makeAssertCollection() # check targetIds is a vector of int/double @@ -144,49 +146,35 @@ createAggregateCovariateSettings <- function( # create list result <- list( targetIds = targetIds, - outcomeIds = outcomeIds, minPriorObservation = minPriorObservation, + + outcomeIds = outcomeIds, outcomeWashoutDays = outcomeWashoutDays, + riskWindowStart = riskWindowStart, startAnchor = startAnchor, riskWindowEnd = riskWindowEnd, endAnchor = endAnchor, - covariateSettings = covariateSettings, - caseCovariateSettings = caseCovariateSettings, - casePreTargetDuration = casePreTargetDuration, - casePostOutcomeDuration = casePostOutcomeDuration + + covariateSettings = covariateSettings, # risk factors + caseCovariateSettings = caseCovariateSettings, # case series + casePreTargetDuration = casePreTargetDuration, # case series + casePostOutcomeDuration = casePostOutcomeDuration, # case series, + + extractNonCaseCovariates = extractNonCaseCovariates ) class(result) <- "aggregateCovariateSettings" return(result) } -#' Compute aggregate covariate study -#' -#' @template ConnectionDetails -#' @param cdmDatabaseSchema The schema with the OMOP CDM data -#' @param cdmVersion The version of the OMOP CDM -#' @template TargetOutcomeTables -#' @template TempEmulationSchema -#' @param aggregateCovariateSettings The settings for the AggregateCovariate study -#' @param databaseId Unique identifier for the database (string) -#' @param outputFolder The location to save the results as csv files -#' @param runId (depreciated) Unique identifier for the covariate setting -#' @param threads The number of threads to run in parallel -#' @param incrementalFile (optional) A file that tracks completed studies -#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. 
This -#' will help reduce the file size of the characterization output, but will remove information -#' on covariates that have very low values. The default is 0. -#' @param runExtractTfeatures Whether to extract the target cohort features -#' @param runExtractOfeatures Whether to extract the outcome cohort features -#' @param runExtractCaseFeatures Whether to extract the case cohort features -#' @param settingIds (not recommended to use) User can specify the lookup ids for the settings -#' -#' @return -#' The descriptive results for each target cohort in the settings. -#' -#' @export -computeAggregateCovariateAnalyses <- function( +createExecutionIds <- function(size){ + executionIds <- gsub(" ", "", gsub("[[:punct:]]", "",paste(Sys.time(), sample(1000000,size), sep = ''))) + return(executionIds) +} + + +computeTargetAggregateCovariateAnalyses <- function( connectionDetails = NULL, cdmDatabaseSchema, cdmVersion = 5, @@ -195,274 +183,818 @@ computeAggregateCovariateAnalyses <- function( outcomeDatabaseSchema = targetDatabaseSchema, # remove outcomeTable = targetTable, # remove tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), - aggregateCovariateSettings, + settings, databaseId = "database 1", outputFolder = file.path(getwd(),'characterization_results'), - runId = 1, # not needed - threads = 1, - incrementalFile = NULL, minCharacterizationMean = 0, - runExtractTfeatures = T, - runExtractOfeatures = T, - runExtractCaseFeatures = T, - settingIds = NULL + ... +) { + + # get settings + settingId <- unique(settings$settingId) + targetIds <- unique(settings$targetId) + minPriorObservation <- settings$minPriorObservation + covariateSettings <- settings$covariateSettings + + # create cohortDetails - all Ts, minPriorObservation, twice (type = Tall, Target) + cohortDetails <- data.frame( + settingId = settingId, + targetCohortId = rep(targetIds,2), + outcomeCohortId = 0, # cannot be NA due to pk/index + cohortType = c(rep('Target',length(targetIds)),rep('Tall',length(targetIds))), + cohortDefinitionId = 1:(length(targetIds)*2), + minPriorObservation = minPriorObservation, + outcomeWashoutDays = NA, + casePreTargetDuration = NA, + casePostOutcomeDuration = NA, + riskWindowStart = NA, + startAnchor = NA, + riskWindowEnd = NA, + endAnchor = NA, + covariateSettingJson = covariateSettings, + caseCovariateSettingJson = NA + ) + + connection <- DatabaseConnector::connect( + connectionDetails = connectionDetails + ) + on.exit( + DatabaseConnector::disconnect(connection) + ) + + # create the temp table with cohort_details + DatabaseConnector::insertTable( + data = cohortDetails[,c('settingId','targetCohortId','outcomeCohortId','cohortType','cohortDefinitionId')], + camelCaseToSnakeCase = T, + connection = connection, + tableName = '#cohort_details', + tempTable = T, + dropTableIfExists = T, + createTable = T, + progressBar = F + ) + + message("Computing aggregate target cohorts") + start <- Sys.time() + + sql <- SqlRender::loadRenderTranslateSql( + sqlFilename = "TargetCohorts.sql", + packageName = "Characterization", + dbms = connectionDetails$dbms, + cdm_database_schema = cdmDatabaseSchema, + tempEmulationSchema = tempEmulationSchema, + target_database_schema = targetDatabaseSchema, + target_table = targetTable, + target_ids = paste(targetIds, collapse = ",", sep = ","), + min_prior_observation = minPriorObservation + ) + + DatabaseConnector::executeSql( + connection = connection, + sql = sql, + progressBar = FALSE, + reportOverallTime = FALSE + ) + completionTime <- Sys.time() - start + + 
message(paste0('Computing target cohorts took ',round(completionTime,digits = 1), ' ', units(completionTime))) + ## get counts + message("Extracting target cohort counts") + sql <- "select + cohort_definition_id, + count(*) row_count, + count(distinct subject_id) person_count, + min(datediff(day, cohort_start_date, cohort_end_date)) min_exposure_time, + avg(datediff(day, cohort_start_date, cohort_end_date)) mean_exposure_time, + max(datediff(day, cohort_start_date, cohort_end_date)) max_exposure_time + from + (select * from #agg_cohorts_before union select * from #agg_cohorts_extras) temp + group by cohort_definition_id;" + sql <- SqlRender::translate( + sql = sql, + targetDialect = connectionDetails$dbms + ) + counts <- DatabaseConnector::querySql( + connection = connection, + sql = sql, + snakeCaseToCamelCase = T, + ) + + message("Computing aggregate target covariate results") + + result <- FeatureExtraction::getDbCovariateData( + connection = connection, + oracleTempSchema = tempEmulationSchema, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortTable = "#agg_cohorts_before", + cohortTableIsTemp = T, + cohortIds = -1, + covariateSettings = ParallelLogger::convertJsonToSettings(covariateSettings), + cdmVersion = cdmVersion, + aggregated = T, + minCharacterizationMean = minCharacterizationMean + ) + + # drop temp tables + message("Dropping temp tables") + sql <- SqlRender::loadRenderTranslateSql( + sqlFilename = "DropTargetCovariate.sql", + packageName = "Characterization", + dbms = connectionDetails$dbms, + tempEmulationSchema = tempEmulationSchema + ) + DatabaseConnector::executeSql( + connection = connection, + sql = sql, progressBar = FALSE, + reportOverallTime = FALSE + ) + + # export all results to csv files + exportAndromedaToCsv( + andromeda = result, + outputFolder = outputFolder, + cohortDetails = cohortDetails, + counts = counts, + databaseId = databaseId, + minCharacterizationMean = minCharacterizationMean + ) + + return(invisible(T)) +} + + +computeCaseAggregateCovariateAnalyses <- function( + connectionDetails = NULL, + cdmDatabaseSchema, + cdmVersion = 5, + targetDatabaseSchema, + targetTable, + outcomeDatabaseSchema = targetDatabaseSchema, # remove + outcomeTable = targetTable, # remove + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), + settings, + databaseId = "database 1", + outputFolder = file.path(getwd(),'characterization_results'), + minCharacterizationMean = 0, + ... 
) { # check inputs + # create cohortDetails - all Ts, minPriorObservation, twice (type = Tall, Target) + + + # get settings + targetIds <- unique(settings$targetId) + outcomeIds <- unique(settings$outcomeId) + minPriorObservation <- settings$minPriorObservation + outcomeWashoutDays <- settings$outcomeWashoutDays + casePreTargetDuration <- settings$casePreTargetDuration + casePostOutcomeDuration <- settings$casePostOutcomeDuration + covariateSettings <- settings$covariateSettings # json + caseCovariateSettings <- settings$caseCovariateSettings # json + + # 'cohortType' + cohortDetails <- expand.grid( + targetCohortId = unique(settings$targetId), + outcomeCohortId = unique(settings$outcomeId), + cohortType = c('Cases', 'CasesBefore', 'CasesAfter', 'CasesBetween') + ) + + cohortDetails$cohortDefinitionId <- 1:nrow(cohortDetails) + cohortDetails$minPriorObservation <- settings$minPriorObservation + cohortDetails$outcomeWashoutDays <- settings$outcomeWashoutDays + cohortDetails$casePreTargetDuration <- settings$casePreTargetDuration + cohortDetails$casePostOutcomeDuration <- settings$casePostOutcomeDuration + cohortDetails$covariateSettingJson <- settings$covariateSettings + cohortDetails$caseCovariateSettingJson <- settings$caseCovariateSettings + + tars <- settings$tar + tars$settingId <- unique(settings$settingId) + + # add executionIds + cohortDetails <- merge(cohortDetails, tars) + + # add 'Exclude' with random tar + cohortDetailsExtra <- expand.grid( + targetCohortId = unique(settings$targetId), + outcomeCohortId = unique(settings$outcomeId), + cohortType = 'Exclude', + minPriorObservation = settings$minPriorObservation, + outcomeWashoutDays = settings$outcomeWashoutDays, + casePreTargetDuration = settings$casePreTargetDuration, + casePostOutcomeDuration = settings$casePostOutcomeDuration, + covariateSettingJson = settings$covariateSettings, + caseCovariateSettingJson = settings$caseCovariateSettings, + riskWindowStart = tars$riskWindowStart[1], + startAnchor = tars$startAnchor[1], + riskWindowEnd = tars$riskWindowEnd[1], + endAnchor = tars$endAnchor[1], + settingId = settings$settingId[1] + ) + cohortDetailsExtra$cohortDefinitionId <- max(cohortDetails$cohortDefinitionId) + (1:nrow(cohortDetailsExtra)) + cohortDetails <- rbind(cohortDetails, cohortDetailsExtra[colnames(cohortDetails)]) + + connection <- DatabaseConnector::connect( + connectionDetails = connectionDetails + ) + on.exit( + DatabaseConnector::disconnect(connection) + ) + + # create the temp table with cohort_details + DatabaseConnector::insertTable( + data = cohortDetails[,c('targetCohortId','outcomeCohortId','cohortType','cohortDefinitionId', 'settingId')], + camelCaseToSnakeCase = T, + connection = connection, + tableName = '#cohort_details', + tempTable = T, + dropTableIfExists = T, + createTable = T, + progressBar = F + ) + + message("Computing aggregate case covariate cohorts") start <- Sys.time() - # if not a list of aggregateCovariateSettings make it one - if(!inherits(aggregateCovariateSettings, 'list')){ - aggregateCovariateSettings <- list(aggregateCovariateSettings) - } + # this is run for all tars + sql <- SqlRender::loadRenderTranslateSql( + sqlFilename = "CaseCohortsPart1.sql", + packageName = "Characterization", + dbms = connectionDetails$dbms, + cdm_database_schema = cdmDatabaseSchema, + tempEmulationSchema = tempEmulationSchema, + target_database_schema = targetDatabaseSchema, + target_table = targetTable, + target_ids = paste(targetIds, collapse = ",", sep = ","), + outcome_database_schema = 
outcomeDatabaseSchema, + outcome_table = outcomeTable, + outcome_ids = paste(outcomeIds, collapse = ",", sep = ","), + min_prior_observation = minPriorObservation, + outcome_washout_days = outcomeWashoutDays + ) + DatabaseConnector::executeSql( + connection = connection, + sql = sql, + progressBar = FALSE, + reportOverallTime = FALSE + ) - # check that covariateSettings are identical across aggregateCovariateSettings - if(length(aggregateCovariateSettings)>1){ - if(sum(unlist(lapply(2:length(aggregateCovariateSettings), - function(i){ - identical( - aggregateCovariateSettings[[1]]$covariateSettings, - aggregateCovariateSettings[[i]]$covariateSettings - ) - }))) != (length(aggregateCovariateSettings)-1)){ - stop('Cannot have different covariate settings') - } + # extract the excluded people into excluded_covariates, excluded_covariates_continuous, + # excluded_analysis_ref, excluded_covariate_ref + + # loop over settingId which contains tars: + for(i in 1:nrow(tars)){ + sql <- SqlRender::loadRenderTranslateSql( + sqlFilename = "CaseCohortsPart2.sql", + packageName = "Characterization", + dbms = connectionDetails$dbms, + tempEmulationSchema = tempEmulationSchema, + first = i==1, + case_pre_target_duration = casePreTargetDuration, + case_post_outcome_duration = casePostOutcomeDuration, + setting_id = tars$settingId[i], + tar_start = tars$riskWindowStart[i], + tar_start_anchor = ifelse(tars$startAnchor[i] == 'cohort start','cohort_start_date','cohort_end_date'), ##TODO change? + tar_end = tars$riskWindowEnd[i], + tar_end_anchor = ifelse(tars$endAnchor[i] == 'cohort start','cohort_start_date','cohort_end_date') ##TODO change? + ) + DatabaseConnector::executeSql( + connection = connection, + sql = sql, + progressBar = FALSE, + reportOverallTime = FALSE + ) } + completionTime <- Sys.time() - start + + message(paste0('Computing case cohorts took ',round(completionTime,digits = 1), ' ', units(completionTime))) + + ## get counts + message("Extracting case cohort counts") + sql <- "select + cohort_definition_id, + count(*) row_count, + count(distinct subject_id) person_count, + min(datediff(day, cohort_start_date, cohort_end_date)) min_exposure_time, + avg(datediff(day, cohort_start_date, cohort_end_date)) mean_exposure_time, + max(datediff(day, cohort_start_date, cohort_end_date)) max_exposure_time + from #cases + group by cohort_definition_id;" + sql <- SqlRender::translate( + sql = sql, + targetDialect = connectionDetails$dbms + ) + counts <- DatabaseConnector::querySql( + connection = connection, + sql = sql, + snakeCaseToCamelCase = T, + ) - # create covariateSetting lookups - #================================= - covariateSettingsList <- extractCovariateList(aggregateCovariateSettings) - caseCovariateSettingsList <- extractCaseCovariateList(aggregateCovariateSettings) - - #================================= - - # extract the combinations of - # T,O,type, tar, washout, min prior obs, case prior, case post - # case settings, covariate settings, cov hash, case cov hash - cohortDetails <- do.call( - what = 'rbind', - args = lapply( - X = aggregateCovariateSettings, - FUN = extractCombinationSettings, - caseCovariateSettingsList = caseCovariateSettingsList, - covariateSettingsList = covariateSettingsList - ) - ) + message("Computing aggregate before case covariate results") + + result <- FeatureExtraction::getDbCovariateData( + connection = connection, + oracleTempSchema = tempEmulationSchema, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortTable = "#cases", + cohortTableIsTemp = T, + cohortIds = -1, 
+ covariateSettings = ParallelLogger::convertJsonToSettings(covariateSettings), + cdmVersion = cdmVersion, + aggregated = T, + minCharacterizationMean = minCharacterizationMean + ) - #remove redundancy - cohortDetails <- unique(cohortDetails) + message("Computing aggregate during case covariate results") + + result2 <- FeatureExtraction::getDbCovariateData( + connection = connection, + oracleTempSchema = tempEmulationSchema, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortTable = "#case_series", + cohortTableIsTemp = T, + cohortIds = -1, + covariateSettings = ParallelLogger::convertJsonToSettings(caseCovariateSettings), + cdmVersion = cdmVersion, + aggregated = T, + minCharacterizationMean = minCharacterizationMean + ) - # get settings - settingColumns <- c( - 'minPriorObservation', 'outcomeWashoutDays', - 'casePreTargetDuration', 'casePostOutcomeDuration', - 'riskWindowStart','startAnchor', - 'riskWindowEnd','endAnchor', - 'covariateSettingsHash', 'caseCovariateSettingsHash' - ) - settings <- unique(cohortDetails[,settingColumns]) - if(is.null(settingIds)){ - settings$settingId <- 1:nrow(settings) - } else{ - if(nrow(settings) == settingIds){ - settings$settingId <- settingIds - } else{ - stop('SettingsIds input but wrong length') - } - } - # add settingId to cohortDetails - cohortDetails <- merge( - x = cohortDetails, - y = settings, - by = settingColumns + # drop temp tables + message("Dropping temp tables") + sql <- SqlRender::loadRenderTranslateSql( + sqlFilename = "DropCaseCovariate.sql", + packageName = "Characterization", + dbms = connectionDetails$dbms, + tempEmulationSchema = tempEmulationSchema + ) + DatabaseConnector::executeSql( + connection = connection, + sql = sql, progressBar = FALSE, + reportOverallTime = FALSE ) - # save settings, cohort_details - saveSettings( + # export all results to csv files + message("Exporting results to csv") + exportAndromedaToCsv( # TODO combine export of result and result2 + andromeda = result, outputFolder = outputFolder, cohortDetails = cohortDetails, + counts = counts, databaseId = databaseId, - covariateSettingsList = covariateSettingsList, - caseCovariateSettingsList = caseCovariateSettingsList - ) + minCharacterizationMean = minCharacterizationMean + ) + exportAndromedaToCsv( + andromeda = result2, + outputFolder = outputFolder, + cohortDetails = cohortDetails, + counts = NULL, # previously added + databaseId = databaseId, + minCharacterizationMean = minCharacterizationMean, + includeSettings = F + ) + return(invisible(T)) +} - # add cohortDefinitionId - cohortDetails$cohortDefinitionId <- 1:nrow(cohortDetails) - # add folder Id before incremental so folders the same - # each time - cohortDetails <- addFolderId( - cohortDetails = cohortDetails, - outputFolder = file.path(outputFolder,'execution'), - threads = threads - ) - - # if running incremental remove previously executed - # analyses - if(!is.null(incrementalFile)){ - if(file.exists(incrementalFile)){ - executedDetails <- loadIncremental(incrementalFile) - cohortDetails <- removeExecuted( - cohortDetails = cohortDetails, - executedDetails = executedDetails - ) - } +exportAndromedaToCsv <- function( + andromeda, + outputFolder, + cohortDetails, + counts, + databaseId, + minCharacterizationMean, + batchSize = 100000, + minCellCount = 0, + includeSettings = T +){ + + saveLocation <- outputFolder + if(!dir.exists(saveLocation)){ + dir.create(saveLocation, recursive = T) } - # stop if results already exist - if(nrow(cohortDetails) == 0){ - message('All results have been previosuly 
executed') - } else{ - - # RUN ANALYSES - message('Creating new cluster') - cluster <- ParallelLogger::makeCluster( - numberOfThreads = threads, - singleThreadToMain = T, - setAndromedaTempFolder = T - ) + ids <- data.frame( + settingId = unique(cohortDetails$settingId), + databaseId = databaseId + ) - # 1) get the T and folders to get jobs run in parallel - if(runExtractTfeatures){ - start <- Sys.time() - ind <- cohortDetails$cohortType %in% c('Target','Tall') - if( sum(ind) > 0 ){ - message('Running target cohort features') - runs <- unique(cohortDetails$folderId[ind]) - inputList <- lapply( - X = runs, - FUN = function(folderId){ - list( - connectionDetails = connectionDetails, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortDetails = cohortDetails[cohortDetails$folderId == folderId,], - covariateSettingsList = covariateSettingsList, - targetDatabaseSchema = targetDatabaseSchema, - targetTable = targetTable, - tempEmulationSchema = tempEmulationSchema, - minCharacterizationMean = minCharacterizationMean, - cdmVersion = cdmVersion, - databaseId = databaseId, - incrementalFile = incrementalFile - ) - } + # analysis_ref and covariate_ref + # add database_id and setting_id + if(!is.null(andromeda$analysisRef)){ + Andromeda::batchApply( + tbl = andromeda$analysisRef, + fun = function(x){ + data <- merge(x, ids) + colnames(data) <- SqlRender::camelCaseToSnakeCase(colnames(data)) + + if(file.exists(file.path(saveLocation, 'analysis_ref.csv'))){ + append <- T + } else{ + append = F + } + readr::write_csv( + x = formatDouble(data), + file = file.path(saveLocation, 'analysis_ref.csv'), + append = append ) + }, + batchSize = batchSize + ) + } - ParallelLogger::clusterApply( - cluster = cluster, - x = inputList, - fun = extractTargetFeatures - ) - end <- Sys.time() - start - message( - paste0( - 'Extracting target cohort features took ', - round(end, digits = 2), ' ', - units(end) - ) + if(!is.null(andromeda$covariateRef)){ + Andromeda::batchApply( + tbl = andromeda$covariateRef, + fun = function(x){ + data <- merge(x, ids) + colnames(data) <- SqlRender::camelCaseToSnakeCase(colnames(data)) + + if(file.exists(file.path(saveLocation, 'covariate_ref.csv'))){ + append <- T + } else{ + append = F + } + readr::write_csv( + x = formatDouble(data), + file = file.path(saveLocation, 'covariate_ref.csv'), + append = append ) - } - } + }, + batchSize = batchSize + ) + } - # 2) get the outcomes and folders to get jobs run in parallel - if(runExtractOfeatures){ - start <- Sys.time() - ind <- cohortDetails$cohortType %in% c('Outcome','Oall') - if( sum(ind) > 0 ){ - message('Running outcome cohort features') - runs <- unique(cohortDetails$folderId[ind]) - inputList <- lapply( - X = runs, - FUN = function(folderId){ - list( - connectionDetails = connectionDetails, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortDetails = cohortDetails[cohortDetails$folderId == folderId,], - covariateSettingsList = covariateSettingsList, - outcomeDatabaseSchema = outcomeDatabaseSchema, - outcomeTable = outcomeTable, - tempEmulationSchema = tempEmulationSchema, - minCharacterizationMean = minCharacterizationMean, - cdmVersion = cdmVersion, - databaseId = databaseId, - incrementalFile = incrementalFile - ) + # covariates and covariate_continuous + extras <- cohortDetails[, c('cohortDefinitionId','settingId', 'targetCohortId', 'outcomeCohortId', 'cohortType')] + extras$databaseId <- databaseId + extras$minCharacterizationMean <- minCharacterizationMean + + # add database_id, setting_id, target_cohort_id, outcome_cohort_id and 
cohort_type + if(!is.null(andromeda$covariates)){ + Andromeda::batchApply( + tbl = andromeda$covariates, + fun = function(x){ + data <- merge(x, extras, by = 'cohortDefinitionId') + data <- data %>% dplyr::select(-"cohortDefinitionId") + colnames(data) <- SqlRender::camelCaseToSnakeCase(colnames(data)) + + # censor minCellCount columns sum_value + removeInd <- data$sum_value < minCellCount + if (sum(removeInd) > 0) { + ParallelLogger::logInfo(paste0("Removing sum_value counts less than ", minCellCount)) + if (sum(removeInd) > 0) { + data$sum_value[removeInd] <- -1 } + } + + if(file.exists(file.path(saveLocation, 'covariates.csv'))){ + append <- T + } else{ + append = F + } + readr::write_csv( + x = formatDouble(data), + file = file.path(saveLocation, 'covariates.csv'), + append = append ) + }, + batchSize = batchSize + ) + } - ParallelLogger::clusterApply( - cluster = cluster, - x = inputList, - fun = extractOutcomeFeatures - ) - end <- Sys.time() - start - message( - paste0( - 'Extracting outcome cohort features took ', - round(end, digits = 2), ' ', - units(end) - ) + if(!is.null(andromeda$covariatesContinuous)){ + Andromeda::batchApply( + tbl = andromeda$covariatesContinuous, + fun = function(x){ + data <- merge(x, extras %>% dplyr::select(-"minCharacterizationMean"), by = 'cohortDefinitionId') + data <- data %>% dplyr::select(-"cohortDefinitionId") + colnames(data) <- SqlRender::camelCaseToSnakeCase(colnames(data)) + + # count_value + removeInd <- data$count_value < minCellCount + if (sum(removeInd) > 0) { + ParallelLogger::logInfo(paste0("Removing count_value counts less than ", minCellCount)) + if (sum(removeInd) > 0) { + data$count_value[removeInd] <- -1 + } + } + + if(file.exists(file.path(saveLocation, 'covariates_continuous.csv'))){ + append <- T + } else{ + append = F + } + readr::write_csv( + x = formatDouble(data), + file = file.path(saveLocation, 'covariates_continuous.csv'), + append = append ) - } + }, + batchSize = batchSize + ) + } + + # cohort_counts: + if(!is.null(counts)){ + cohortCounts <- cohortDetails %>% dplyr::select( + 'targetCohortId', + 'outcomeCohortId', + 'cohortType', + 'cohortDefinitionId', + 'riskWindowStart', + 'riskWindowEnd', + 'startAnchor', + 'endAnchor', + 'minPriorObservation', + 'outcomeWashoutDays' + ) %>% + dplyr::mutate( + databaseId = !!databaseId + ) %>% + dplyr::inner_join(counts, by = 'cohortDefinitionId') %>% + dplyr::select(-"cohortDefinitionId") + cohortCounts <- unique(cohortCounts) + colnames(cohortCounts) <- SqlRender::camelCaseToSnakeCase(colnames(cohortCounts)) + + # TODO apply minCellCount to columns row_count, person_count + + if(file.exists(file.path(saveLocation, 'cohort_counts.csv'))){ + append <- T + } else{ + append = F } + readr::write_csv( + x = formatDouble(cohortCounts), + file = file.path(saveLocation, 'cohort_counts.csv'), + append = append + ) + } + if(includeSettings){ + settings <- cohortDetails %>% + dplyr::select('settingId', 'minPriorObservation', 'outcomeWashoutDays', + 'riskWindowStart', 'riskWindowEnd', 'startAnchor', 'endAnchor', + 'casePreTargetDuration', 'casePostOutcomeDuration', + 'covariateSettingJson', 'caseCovariateSettingJson') %>% + dplyr::mutate(databaseId = !!databaseId) %>% + dplyr::distinct() + colnames(settings) <- SqlRender::camelCaseToSnakeCase(colnames(settings)) + + # add setting.csv with cohortDetails plus database + readr::write_csv( + x = settings, + file = file.path(saveLocation, 'settings.csv'), + append = F + ) - # 3) get the cases and folders to get jobs run in parallel - 
if(runExtractCaseFeatures){ - start <- Sys.time() - ind <- !cohortDetails$cohortType %in% c('Outcome','Oall', 'Target', 'Tall') - if( sum(ind) > 0 ){ - message('Running case cohort features') - runs <- unique(cohortDetails$folderId[ind]) - inputList <- lapply( - X = runs, - FUN = function(folderId){ - list( - connectionDetails = connectionDetails, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortDetails = cohortDetails[cohortDetails$folderId == folderId,], - covariateSettingsList = covariateSettingsList, - caseCovariateSettingsList = caseCovariateSettingsList, - targetDatabaseSchema = targetDatabaseSchema, - targetTable = targetTable, - outcomeDatabaseSchema = outcomeDatabaseSchema, - outcomeTable = outcomeTable, - tempEmulationSchema = tempEmulationSchema, - minCharacterizationMean = minCharacterizationMean, - cdmVersion = cdmVersion, - databaseId = databaseId, - incrementalFile = incrementalFile - ) - } - ) + cohortDetails <- cohortDetails %>% + dplyr::select('settingId', 'targetCohortId', + 'outcomeCohortId','cohortType') %>% + dplyr::mutate(databaseId = !!databaseId) %>% + dplyr::distinct() + colnames(cohortDetails) <- SqlRender::camelCaseToSnakeCase(colnames(cohortDetails)) + + # add cohort_details.csv with cohortDetails plus database + readr::write_csv( + x = cohortDetails, + file = file.path(saveLocation, 'cohort_details.csv'), + append = F + ) + } - ParallelLogger::clusterApply( - cluster = cluster, - x = inputList, - fun = extractCaseFeatures - ) - end <- Sys.time() - start - message( - paste0( - 'Extracting case cohort features took ', - round(end, digits = 2), ' ', - units(end) - ) - ) + return(invisible(T)) +} + + + + +combineCovariateSettingsJsons <- function(covariateSettingsJsonList){ + + # get unique + covariateSettingsJsonList <- unique(covariateSettingsJsonList) + + # first convert from json + covariateSettings <- lapply( + X = covariateSettingsJsonList, + FUN = function(x){ParallelLogger::convertJsonToSettings(x)} + ) + + # then combine the covariates + singleSettings <- which(unlist(lapply(covariateSettings, function(x) inherits(x, 'covariateSettings')))) + multipleSettings <- which(unlist(lapply(covariateSettings, function(x) inherits(x, 'list')))) + + covariateSettingList <- list() + if(length(singleSettings)>0){ + for(i in singleSettings){ + covariateSettingList[[length(covariateSettingList) + 1]] <- covariateSettings[[i]] + } + } + if(length(multipleSettings) > 0){ + for(i in multipleSettings){ + settingList <- covariateSettings[[i]] + for(j in 1:length(settingList)){ + if(inherits(settingList[[j]], 'covariateSettings')){ + covariateSettingList[[length(covariateSettingList) + 1]] <- settingList[[j]] + } else{ + message('Incorrect covariate settings found') # stop? 
+ } } } + } - # finish cluster - message('Stopping cluster') - ParallelLogger::stopCluster(cluster = cluster) + # check for covariates with same id but different + endDays <- unique(unlist(lapply(covariateSettingList, function(x){x$endDays}))) + if(length(endDays) > 1){ + stop('Covariate settings for aggregate covariates using different end days') + } + longTermStartDays <- unique(unlist(lapply(covariateSettingList, function(x){x$longTermStartDays}))) + if(length(longTermStartDays) > 1){ + stop('Covariate settings for aggregate covariates using different longTermStartDays') + } + mediumTermStartDays <- unique(unlist(lapply(covariateSettingList, function(x){x$mediumTermStartDays}))) + if(length(mediumTermStartDays) > 1){ + stop('Covariate settings for aggregate covariates using different mediumTermStartDays') + } + shortTermStartDays <- unique(unlist(lapply(covariateSettingList, function(x){x$shortTermStartDays}))) + if(length(shortTermStartDays) > 1){ + stop('Covariate settings for aggregate covariates using different shortTermStartDays') } - # combine csvs into one - message('Aggregating csv files') - aggregateCsvs( - outputFolder = outputFolder - ) - return(invisible(TRUE)) + # convert to json + covariateSettingList <- as.character(ParallelLogger::convertSettingsToJson(covariateSettingList)) + return(covariateSettingList) } +getAggregateCovariatesJobs <- function( + characterizationSettings, + threads +){ + + characterizationSettings <- characterizationSettings$aggregateCovariateSettings + if(length(characterizationSettings) == 0){ + return(NULL) + } + ind <- 1:length(characterizationSettings) + + # target combinations + targetCombinations <- do.call(what = 'rbind', + args = + lapply( + 1:length(characterizationSettings), + function(i){ + result <- data.frame( + targetIds = c(characterizationSettings[[i]]$targetIds, + characterizationSettings[[i]]$outcomeIds), + minPriorObservation = characterizationSettings[[i]]$minPriorObservation, + covariateSettingsJson = as.character(ParallelLogger::convertSettingsToJson(characterizationSettings[[i]]$covariateSettings)) + ) + } + ) + ) + + threadCols <- c("targetIds") + settingCols <- c("minPriorObservation") + + # thread split + threadSettings <- targetCombinations %>% + dplyr::select(dplyr::all_of(threadCols)) %>% + dplyr::distinct() + threadSettings$thread <- rep(1:threads, ceiling(nrow(threadSettings)/threads))[1:nrow(threadSettings)] + targetCombinations <- merge(targetCombinations, threadSettings, by = threadCols ) + + executionSettings <- data.frame( + minPriorObservation = unique(targetCombinations$minPriorObservation) + ) + executionSettings$settingId <- createExecutionIds(nrow(executionSettings)) + targetCombinations <- merge(targetCombinations, executionSettings, by = settingCols ) + + # recreate settings + settings <- c() + for(settingId in unique(executionSettings$settingId)){ + settingVal <- executionSettings %>% + dplyr::filter(.data$settingId == !!settingId) %>% + dplyr::select(dplyr::all_of(settingCols)) + + restrictedData <- targetCombinations %>% + dplyr::inner_join(settingVal, by = settingCols) + + for(i in unique(restrictedData$thread)){ + ind <- restrictedData$thread == i + settings <- rbind(settings, + data.frame( + functionName = 'computeTargetAggregateCovariateAnalyses', + settings = as.character(ParallelLogger::convertSettingsToJson( + list( + targetIds = unique(restrictedData$targetId[ind]), + minPriorObservation = unique(restrictedData$minPriorObservation[ind]), + covariateSettingsJson = 
combineCovariateSettingsJsons(as.list(restrictedData$covariateSettingsJson[ind])), + settingId = settingId + ) + )), + executionFolder = paste('tac', i ,paste(settingVal, collapse = '_'), sep = '_'), + jobId = paste('tac', i ,paste(settingVal, collapse = '_'), sep = '_') + ) + ) + } + } + + # get all combinations of TnOs, then split by treads + caseCombinations <- do.call(what = 'rbind', + args = + lapply( + 1:length(characterizationSettings), + function(i){ + result <- expand.grid( + targetId = characterizationSettings[[i]]$targetIds, + outcomeId = characterizationSettings[[i]]$outcomeIds + ) + result$minPriorObservation = characterizationSettings[[i]]$minPriorObservation + result$outcomeWashoutDays = characterizationSettings[[i]]$outcomeWashoutDays + + result$riskWindowStart = characterizationSettings[[i]]$riskWindowStart + result$startAnchor = characterizationSettings[[i]]$startAnchor + result$riskWindowEnd = characterizationSettings[[i]]$riskWindowEnd + result$endAnchor = characterizationSettings[[i]]$endAnchor + + result$casePreTargetDuration = characterizationSettings[[i]]$casePreTargetDuration + result$casePostOutcomeDuration = characterizationSettings[[i]]$casePostOutcomeDuration + + result$covariateSettingsJson = as.character(ParallelLogger::convertSettingsToJson(characterizationSettings[[i]]$covariateSettings)) + result$caseCovariateSettingsJson = as.character(ParallelLogger::convertSettingsToJson(characterizationSettings[[i]]$caseCovariateSettings)) + return(result) + } + ) + ) + + # create executionIds + settingCols <- c('minPriorObservation', 'outcomeWashoutDays', + 'casePreTargetDuration', 'casePostOutcomeDuration', + 'riskWindowStart', 'startAnchor', + 'riskWindowEnd', 'endAnchor') + executionSettings <- unique(caseCombinations[,settingCols]) + executionSettings$settingId <- createExecutionIds(nrow(executionSettings)) + caseCombinations <- merge(caseCombinations, executionSettings, by = settingCols) + + # create thread split + threadCombinations <- caseCombinations %>% + dplyr::select("targetId", + "minPriorObservation", + "outcomeWashoutDays", + "casePreTargetDuration", + "casePostOutcomeDuration" + ) %>% + dplyr::distinct() + threadCombinations$thread <- rep(1:threads, ceiling(nrow(threadCombinations)/threads))[1:nrow(threadCombinations)] + caseCombinations <- merge(caseCombinations, threadCombinations, by = c("targetId", + "minPriorObservation", + "outcomeWashoutDays", + "casePreTargetDuration", + "casePostOutcomeDuration" + )) + + executionCols <- c('minPriorObservation', 'outcomeWashoutDays', + 'casePreTargetDuration', 'casePostOutcomeDuration') + executions <- unique(caseCombinations[,executionCols]) + + # now create the settings + for(j in 1:nrow(executions)){ + settingVal <- executions[j,] + + restrictedData <- caseCombinations %>% + dplyr::inner_join(settingVal, by = executionCols) + + for(i in unique(restrictedData$thread)){ + ind <- restrictedData$thread == i + settings <- rbind(settings, + data.frame( + functionName = 'computeCaseAggregateCovariateAnalyses', + settings = as.character(ParallelLogger::convertSettingsToJson( + list( + targetIds = unique(restrictedData$targetId[ind]), + outcomeIds = unique(restrictedData$outcomeId[ind]), + minPriorObservation = unique(restrictedData$minPriorObservation[ind]), + outcomeWashoutDays = unique(restrictedData$outcomeWashoutDays[ind]), + tar = unique(data.frame( + riskWindowStart = restrictedData$riskWindowStart[ind], + startAnchor = restrictedData$startAnchor[ind], + riskWindowEnd = restrictedData$riskWindowEnd[ind], + 
endAnchor = restrictedData$endAnchor[ind] + )), + casePreTargetDuration = unique(restrictedData$casePreTargetDuration[ind]), + casePostOutcomeDuration = unique(restrictedData$casePostOutcomeDuration[ind]), + covariateSettingsJson = combineCovariateSettingsJsons(as.list(restrictedData$covariateSettingsJson[ind])), + caseCovariateSettingsJson = combineCovariateSettingsJsons(as.list(restrictedData$caseCovariateSettingsJson[ind])), + settingIds = unique(restrictedData$settingId[ind]) + ) + )), + executionFolder = paste('cac', i, paste0(settingVal, collapse = '_'), sep = '_'), + jobId = paste('cac', i, paste0(settingVal, collapse = '_'), sep = '_') + ) + ) + } + } + + return(settings) +} diff --git a/R/AggregateCovariatesHelpers.R b/R/AggregateCovariatesHelpers.R deleted file mode 100644 index bc91df8..0000000 --- a/R/AggregateCovariatesHelpers.R +++ /dev/null @@ -1,1049 +0,0 @@ - -hash <- function(x, seed=0){ - result <- digest::digest( - object = x, - algo='xxhash32', - seed=seed - ) - return(result) -} - -extractCovariateList <- function(settings){ - result <- lapply( - X = settings, function(x){ - x$covariateSettings - }) - result <- unique(result) - return(result) -} -extractCaseCovariateList <- function(settings){ - result <- lapply( - X = settings, function(x){ - x$caseCovariateSettings - }) - result <- unique(result) - return(result) -} - -extractCombinationSettings <- function( - x, - covariateSettingsList, - caseCovariateSettingsList - ){ - - types <- data.frame( - cohortType = c('TnO', 'OnT', 'TnObetween', 'TnOprior') - ) - - tars <- data.frame( - riskWindowStart = x$riskWindowStart, - startAnchor = x$startAnchor, - riskWindowEnd = x$riskWindowEnd, - endAnchor = x$endAnchor - ) - - # create hash of covariateSettings - covariateSettingsHash <- hash(x$covariateSettings) - covariateSettingsId <- which(unlist(lapply(covariateSettingsList, function(csl){ - identical(csl,x$covariateSettings) - }))) - caseCovariateSettingsHash <- hash(x$caseCovariateSettings) - caseCovariateSettingsId <- which(unlist(lapply(caseCovariateSettingsList, function(csl){ - identical(csl,x$caseCovariateSettings) - }))) - - # create T/O settings agnostic to TAR - TODO split T/O to prevent dup Ts - cohortDetails <- data.frame( - targetCohortId = c(rep(x$targetIds, 2), rep(x$outcomeIds, 2)), - #targetCohortId = c(rep(x$targetIds, 2), rep(0,2*length(x$outcomeIds))), - outcomeCohortId = 0,#c(rep(0,2*length(x$targetIds)), rep(x$outcomeIds, 2)), - cohortType = c( - rep('Tall', length(x$targetIds)), - rep('Target', length(x$targetIds)), - rep('Tall', length(x$outcomeIds)), - rep('Target', length(x$outcomeIds)) - #rep('Oall', length(x$outcomeIds)), - #rep('Outcome', length(x$outcomeIds)) - ), - riskWindowStart = NA, - startAnchor = NA, - riskWindowEnd = NA, - endAnchor = NA, - minPriorObservation = x$minPriorObservation, - outcomeWashoutDays = NA, #x$outcomeWashoutDays, - covariateSettingsHash = covariateSettingsHash, - covariateSettingsId = covariateSettingsId, - caseCovariateSettingsHash = NA, - caseCovariateSettingsId = NA, - casePreTargetDuration = NA, - casePostOutcomeDuration = NA - ) - - # do we want Oall with washout done seperately? 
- - # full join T and O - temp <- merge( - x = x$targetIds, - y = x$outcomeIds - ) - - #TnOprior <- data.frame( - # targetCohortId = temp$x, - # outcomeCohortId = temp$y, - # cohortType = 'TnOprior', - # riskWindowStart = NA, - # startAnchor = NA, - # riskWindowEnd = NA, - # endAnchor = NA, - # minPriorObservation = x$minPriorObservation, - # outcomeWashoutDays = x$outcomeWashoutDays, - # covariateSettingsHash = covariateSettingsHash, - # covariateSettingsId = covariateSettingsId, - # caseCovariateSettingsHash = NA, - # caseCovariateSettingsId = NA, - # casePreTargetDuration = NA, - # casePostOutcomeDuration = NA - #) - - TnOs <- data.frame( - targetCohortId = temp$x, - outcomeCohortId = temp$y, - minPriorObservation = x$minPriorObservation, - outcomeWashoutDays = x$outcomeWashoutDays, - covariateSettingsHash = covariateSettingsHash, - covariateSettingsId = covariateSettingsId, - caseCovariateSettingsHash = caseCovariateSettingsHash, - caseCovariateSettingsId = caseCovariateSettingsId, - casePreTargetDuration = x$casePreTargetDuration, - casePostOutcomeDuration = x$casePostOutcomeDuration - ) - - TnOs <- TnOs %>% dplyr::cross_join( - y = types - ) %>% dplyr::cross_join( - y = tars - ) - - combinations <- rbind( - cohortDetails, - #TnOprior, - TnOs[,colnames(cohortDetails)] - ) - - return(combinations) -} - -# specify the columns that impact O cohorts -getOSettingColumns <- function(){ - return(c('minPriorObservation','outcomeWashoutDays')) -} - -# specify the columns that impact T cohorts -getTSettingColumns <- function(){ - return(c('runId','minPriorObservation')) -} - -# specify the columns that impact case cohorts -getCaseSettingColumns <- function(){ - return(c('runId','minPriorObservation', 'outcomeWashoutDays', - 'casePreTargetDuration', 'casePostOutcomeDuration'#, - #'riskWindowStart','startAnchor', - #'riskWindowEnd','endAnchor' - )) -} - -createFolderName <- function( - typeName, - values -){ - - if('startAnchor' %in% colnames(values)){ - values$startAnchor <- gsub(' ','_',values$startAnchor) - } - if('endAnchor' %in% colnames(values)){ - values$endAnchor <- gsub(' ','_',values$endAnchor) - } - - folderNames <- unlist( - lapply( - 1:nrow(values), - function(i){ - paste( - typeName, - paste0(values[i,], collapse = '_' ), sep = '_' - ) - } - ) - ) - - return(folderNames) -} - -addFolderId <- function( - cohortDetails, - outputFolder, - threads -){ - - cohortDetails$folderId <- rep('', nrow(cohortDetails)) - - # partition Ts in thread groups - targetId <- unique(cohortDetails$targetCohortId) - # add runId that splits Ts into thread runs - cohortDetails <- merge( - x = cohortDetails, - y = data.frame( - targetCohortId = targetId, - runId = rep( - 1:threads, - ceiling(length(targetId)/threads) - )[1:length(targetId)] - ) - ) - - ind <- cohortDetails$cohortType %in% c('Target','Tall') - cohortDetails$folderId[ind] <- createFolderName( - typeName = 'T', - values = cohortDetails[ind,getTSettingColumns()] - ) - - ind <- cohortDetails$cohortType %in% c('Outcome', 'Oall') - cohortDetails$folderId[ind] <- createFolderName( - typeName = 'O', - values = cohortDetails[ind,getOSettingColumns()] - ) - - ind <- !cohortDetails$cohortType %in% c('Target','Tall', 'Outcome', 'Oall') - cohortDetails$folderId[ind] <- createFolderName( - typeName = 'cases', - values = cohortDetails[ind,getCaseSettingColumns()] - ) - - # add the main output path - cohortDetails$folderId <- sapply(cohortDetails$folderId, function(x){file.path(outputFolder, x)}) - - return(cohortDetails) -} - -incrementalColumns <- 
function(){ -colNames <- c( - 'targetCohortId', 'outcomeCohortId', 'cohortType', - 'riskWindowStart','startAnchor', - 'riskWindowEnd','endAnchor', - 'minPriorObservation', 'outcomeWashoutDays', - 'covariateSettingsHash', 'caseCovariateSettingsHash', - 'covariateSettingsId', 'caseCovariateSettingsId', - 'casePreTargetDuration', 'casePostOutcomeDuration' -) -return(colNames ) -} - -removeExecuted <- function( - cohortDetails, - executedDetails -){ - message('Finding completed analyses') - colNames <- incrementalColumns() - - for(colName in colNames){ - class(executedDetails[,colName]) <- class(cohortDetails[,colName]) - } - - cdIds <- apply(cohortDetails, 1, function(x) paste0(x[colNames], collapse = '_')) - edIds <- apply(executedDetails, 1, function(x) paste0(x[colNames], collapse = '_')) - - resultExists <- cdIds %in% edIds - - message(paste0(sum(resultExists), ' analyses already generated')) - message(paste0(sum(!resultExists), ' analyses left')) - return(cohortDetails[!resultExists,]) -} - -saveIncremental <- function( - cohortDetails, - incrementalFile -){ - - colNames <- incrementalColumns() - - if(file.exists(incrementalFile)){ - append <- T - } else{ - append <- F - } - readr::write_csv( - x = cohortDetails[,colNames], - file = incrementalFile, - append = append - ) - - return(invisible(NULL)) -} - -loadIncremental <- function( - incrementalFile -){ - # check columns? -result <- utils::read.csv(incrementalFile) -class(result$startAnchor) <- 'character' -class(result$endAnchor) <- 'character' -class(result$covariateSettingsHash) <- 'character' -class(result$caseCovariateSettingsHash) <- 'character' - -return(result) -} - -extractTargetFeatures <- function( - inputList - ){ - - connectionDetails <- inputList$connectionDetails - cdmDatabaseSchema <- inputList$cdmDatabaseSchema - cohortDetails <- inputList$cohortDetails - covariateSettingsList <- inputList$covariateSettingsList - targetDatabaseSchema <- inputList$targetDatabaseSchema - targetTable <- inputList$targetTable - tempEmulationSchema <- inputList$tempEmulationSchema - minCharacterizationMean <- inputList$minCharacterizationMean - cdmVersion <- inputList$cdmVersion - databaseId <- inputList$databaseId - incrementalFile <- inputList$incrementalFile - - outputFolder <- cohortDetails$folderId[1] - targetIds <- unique(cohortDetails$targetCohortId) - minPriorObservation <- cohortDetails$minPriorObservation[1] - covariateSettingsId <- cohortDetails$covariateSettingsId[1] - covariateSettings <- covariateSettingsList[[covariateSettingsId]] - - connection <- DatabaseConnector::connect( - connectionDetails = connectionDetails - ) - on.exit( - DatabaseConnector::disconnect(connection) - ) - - # create the temp table with cohort_details - DatabaseConnector::insertTable( - data = cohortDetails[,c('targetCohortId','outcomeCohortId','cohortType','cohortDefinitionId')], - camelCaseToSnakeCase = T, - connection = connection, - tableName = '#cohort_details', - tempTable = T, - dropTableIfExists = T, - createTable = T, - progressBar = F - ) - - message("Computing aggregate target cohorts") - start <- Sys.time() - - sql <- SqlRender::loadRenderTranslateSql( - sqlFilename = "TargetCohorts.sql", - packageName = "Characterization", - dbms = connectionDetails$dbms, - cdm_database_schema = cdmDatabaseSchema, - tempEmulationSchema = tempEmulationSchema, - target_database_schema = targetDatabaseSchema, - target_table = targetTable, - target_ids = paste(targetIds, collapse = ",", sep = ","), - min_prior_observation = minPriorObservation - ) - - 
DatabaseConnector::executeSql( - connection = connection, - sql = sql, - progressBar = FALSE, - reportOverallTime = FALSE - ) - completionTime <- Sys.time() - start - - message(paste0('Computing target cohorts took ',round(completionTime,digits = 1), ' ', units(completionTime))) - ## get counts - message("Extracting target cohort counts") - sql <- "select - cohort_definition_id, - count(*) row_count, - count(distinct subject_id) person_count, - min(datediff(day, cohort_start_date, cohort_end_date)) min_exposure_time, - avg(datediff(day, cohort_start_date, cohort_end_date)) mean_exposure_time, - max(datediff(day, cohort_start_date, cohort_end_date)) max_exposure_time - from - (select * from #agg_cohorts_before union select * from #agg_cohorts_extras) temp - group by cohort_definition_id;" - sql <- SqlRender::translate( - sql = sql, - targetDialect = connectionDetails$dbms - ) - counts <- DatabaseConnector::querySql( - connection = connection, - sql = sql, - snakeCaseToCamelCase = T, - ) - - message("Computing aggregate target covariate results") - - result <- FeatureExtraction::getDbCovariateData( - connection = connection, - oracleTempSchema = tempEmulationSchema, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortTable = "#agg_cohorts_before", - cohortTableIsTemp = T, - cohortIds = -1, - covariateSettings = covariateSettings, - cdmVersion = cdmVersion, - aggregated = T, - minCharacterizationMean = minCharacterizationMean - ) - - # drop temp tables - message("Dropping temp tables") - sql <- SqlRender::loadRenderTranslateSql( - sqlFilename = "DropTargetCovariate.sql", - packageName = "Characterization", - dbms = connectionDetails$dbms, - tempEmulationSchema = tempEmulationSchema - ) - DatabaseConnector::executeSql( - connection = connection, - sql = sql, progressBar = FALSE, - reportOverallTime = FALSE - ) - - # export all results to csv files - exportAndromedaToCsv( - andromeda = result, - outputFolder = outputFolder, - cohortDetails = cohortDetails, - counts = counts, - databaseId = databaseId, - minCharacterizationMean = minCharacterizationMean - ) - - if(!is.null(incrementalFile)){ - saveIncremental( - cohortDetails = cohortDetails, - incrementalFile = incrementalFile - ) - } - -} - -extractOutcomeFeatures <- function( - inputList -){ - - connectionDetails <- inputList$connectionDetails - cdmDatabaseSchema <- inputList$cdmDatabaseSchema - cohortDetails <- inputList$cohortDetails - covariateSettingsList <- inputList$covariateSettingsList - outcomeDatabaseSchema <- inputList$outcomeDatabaseSchema - outcomeTable <- inputList$outcomeTable - tempEmulationSchema <- inputList$tempEmulationSchema - minCharacterizationMean <- inputList$minCharacterizationMean - cdmVersion <- inputList$cdmVersion - databaseId <- inputList$databaseId - incrementalFile <- inputList$incrementalFile - - outputFolder <- cohortDetails$folderId[1] - outcomeIds <- unique(cohortDetails$outcomeCohortId) - minPriorObservation <- cohortDetails$minPriorObservation[1] - outcomeWashoutDays <- cohortDetails$outcomeWashoutDays[1] - covariateSettingsId <- cohortDetails$covariateSettingsId[1] - covariateSettings <- covariateSettingsList[[covariateSettingsId]] - - connection <- DatabaseConnector::connect( - connectionDetails = connectionDetails - ) - on.exit( - DatabaseConnector::disconnect(connection) - ) - - # create the temp table with cohort_details - DatabaseConnector::insertTable( - data = cohortDetails[,c('targetCohortId','outcomeCohortId','cohortType','cohortDefinitionId')], - camelCaseToSnakeCase = T, - connection = 
connection, - tableName = '#cohort_details', - tempTable = T, - dropTableIfExists = T, - createTable = T, - progressBar = F - ) - - message("Computing aggregate covariate outcome cohorts") - start <- Sys.time() - - sql <- SqlRender::loadRenderTranslateSql( - sqlFilename = "OutcomeCohorts.sql", - packageName = "Characterization", - dbms = connectionDetails$dbms, - cdm_database_schema = cdmDatabaseSchema, - tempEmulationSchema = tempEmulationSchema, - outcome_database_schema = outcomeDatabaseSchema, - outcome_table = outcomeTable, - outcome_ids = paste(outcomeIds, collapse = ",", sep = ","), - min_prior_observation = minPriorObservation, - outcome_washout_days = outcomeWashoutDays - ) - - DatabaseConnector::executeSql( - connection = connection, - sql = sql, - progressBar = FALSE, - reportOverallTime = FALSE - ) - completionTime <- Sys.time() - start - - message(paste0('Computing outcome cohorts took ',round(completionTime,digits = 1), ' ', units(completionTime))) - ## get counts - message("Extracting outcome cohort counts") - sql <- "select - cohort_definition_id, - count(*) row_count, - count(distinct subject_id) person_count, - min(datediff(day, cohort_start_date, cohort_end_date)) min_exposure_time, - avg(datediff(day, cohort_start_date, cohort_end_date)) mean_exposure_time, - max(datediff(day, cohort_start_date, cohort_end_date)) max_exposure_time - from - (select * from #agg_cohorts_before union select * from #agg_cohorts_extras) temp - group by cohort_definition_id;" - sql <- SqlRender::translate( - sql = sql, - targetDialect = connectionDetails$dbms - ) - counts <- DatabaseConnector::querySql( - connection = connection, - sql = sql, - snakeCaseToCamelCase = T, - ) - - message("Computing aggregate outcome covariate results") - - result <- FeatureExtraction::getDbCovariateData( - connection = connection, - oracleTempSchema = tempEmulationSchema, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortTable = "#agg_cohorts_before", - cohortTableIsTemp = T, - cohortIds = -1, - covariateSettings = covariateSettings, - cdmVersion = cdmVersion, - aggregated = T, - minCharacterizationMean = minCharacterizationMean - ) - - # drop temp tables - message("Dropping temp tables") - sql <- SqlRender::loadRenderTranslateSql( - sqlFilename = "DropOutcomeCovariate.sql", - packageName = "Characterization", - dbms = connectionDetails$dbms, - tempEmulationSchema = tempEmulationSchema - ) - DatabaseConnector::executeSql( - connection = connection, - sql = sql, progressBar = FALSE, - reportOverallTime = FALSE - ) - - # export all results to csv files - exportAndromedaToCsv( - andromeda = result, - outputFolder = outputFolder, - cohortDetails = cohortDetails, - counts = counts, - databaseId = databaseId, - minCharacterizationMean = minCharacterizationMean - ) - - # append to incremental file - if(!is.null(incrementalFile)){ - saveIncremental( - cohortDetails = cohortDetails, - incrementalFile = incrementalFile - ) - } - - return(invisible(NULL)) -} - - -extractCaseFeatures <- function( - inputList -){ - - connectionDetails <- inputList$connectionDetails - cdmDatabaseSchema <- inputList$cdmDatabaseSchema - cohortDetails <- inputList$cohortDetails - covariateSettingsList <- inputList$covariateSettingsList - caseCovariateSettingsList <- inputList$caseCovariateSettingsList - targetDatabaseSchema <- inputList$targetDatabaseSchema - targetTable <- inputList$targetTable - outcomeDatabaseSchema <- inputList$outcomeDatabaseSchema - outcomeTable <- inputList$outcomeTable - tempEmulationSchema <- 
inputList$tempEmulationSchema - minCharacterizationMean <- inputList$minCharacterizationMean - cdmVersion <- inputList$cdmVersion - databaseId <- inputList$databaseId - incrementalFile <- inputList$incrementalFile - - # get settings - outputFolder <- cohortDetails$folderId[1] - targetIds <- unique(cohortDetails$targetCohortId) - outcomeIds <- unique(cohortDetails$outcomeCohortId) - minPriorObservation <- cohortDetails$minPriorObservation[1] - outcomeWashoutDays <- cohortDetails$outcomeWashoutDays[1] - casePreTargetDuration <- cohortDetails$casePreTargetDuration[1] - casePostOutcomeDuration <- cohortDetails$casePostOutcomeDuration[1] - covariateSettingsId <- cohortDetails$covariateSettingsId[1] - covariateSettings <- covariateSettingsList[[covariateSettingsId]] - caseCovariateSettingsId <- cohortDetails$caseCovariateSettingsId[1] - caseCovariateSettings <- caseCovariateSettingsList[[caseCovariateSettingsId]] - - tars <- unique(cohortDetails[, c('settingId','riskWindowStart','startAnchor', 'riskWindowEnd','endAnchor')]) - - connection <- DatabaseConnector::connect( - connectionDetails = connectionDetails - ) - on.exit( - DatabaseConnector::disconnect(connection) - ) - - # create the temp table with cohort_details - DatabaseConnector::insertTable( - data = cohortDetails[,c('targetCohortId','outcomeCohortId','cohortType','cohortDefinitionId', 'settingId')], - camelCaseToSnakeCase = T, - connection = connection, - tableName = '#cohort_details', - tempTable = T, - dropTableIfExists = T, - createTable = T, - progressBar = F - ) - - message("Computing aggregate case covariate cohorts") - start <- Sys.time() - - # this is run for all tars - sql <- SqlRender::loadRenderTranslateSql( - sqlFilename = "CaseCohortsPart1.sql", - packageName = "Characterization", - dbms = connectionDetails$dbms, - cdm_database_schema = cdmDatabaseSchema, - tempEmulationSchema = tempEmulationSchema, - target_database_schema = targetDatabaseSchema, - target_table = targetTable, - target_ids = paste(targetIds, collapse = ",", sep = ","), - outcome_database_schema = outcomeDatabaseSchema, - outcome_table = outcomeTable, - outcome_ids = paste(outcomeIds, collapse = ",", sep = ","), - min_prior_observation = minPriorObservation, - outcome_washout_days = outcomeWashoutDays#, - #case_pre_target_duration = casePreTargetDuration, - #case_post_target_duration = casePostTargetDuration - ) - DatabaseConnector::executeSql( - connection = connection, - sql = sql, - progressBar = FALSE, - reportOverallTime = FALSE - ) - - # loop over settingId which contains tars: - for(i in 1:nrow(tars)){ - sql <- SqlRender::loadRenderTranslateSql( - sqlFilename = "CaseCohortsPart2.sql", - packageName = "Characterization", - dbms = connectionDetails$dbms, - #cdm_database_schema = cdmDatabaseSchema, - tempEmulationSchema = tempEmulationSchema, - #target_database_schema = targetDatabaseSchema, - #target_table = targetTable, - #target_ids = paste(targetIds, collapse = ",", sep = ","), - #outcome_database_schema = outcomeDatabaseSchema, - #outcome_table = outcomeTable, - #outcome_ids = paste(outcomeIds, collapse = ",", sep = ","), - #min_prior_observation = minPriorObservation, - #outcome_washout_days = outcomeWashoutDays, - first = i==1, - case_pre_target_duration = casePreTargetDuration, - case_post_outcome_duration = casePostOutcomeDuration, - setting_id = tars$settingId[i], - tar_start = tars$riskWindowStart[i], - tar_start_anchor = ifelse(tars$startAnchor[i] == 'cohort start','cohort_start_date','cohort_end_date'), ##TODO change? 
- tar_end = tars$riskWindowEnd[i], - tar_end_anchor = ifelse(tars$endAnchor[i] == 'cohort start','cohort_start_date','cohort_end_date') ##TODO change? - ) - DatabaseConnector::executeSql( - connection = connection, - sql = sql, - progressBar = FALSE, - reportOverallTime = FALSE - ) - } - completionTime <- Sys.time() - start - - message(paste0('Computing case cohorts took ',round(completionTime,digits = 1), ' ', units(completionTime))) - - ## get counts - message("Extracting case cohort counts") - sql <- "select - cohort_definition_id, - count(*) row_count, - count(distinct subject_id) person_count, - min(datediff(day, cohort_start_date, cohort_end_date)) min_exposure_time, - avg(datediff(day, cohort_start_date, cohort_end_date)) mean_exposure_time, - max(datediff(day, cohort_start_date, cohort_end_date)) max_exposure_time - from #agg_cohorts_before - group by cohort_definition_id;" - sql <- SqlRender::translate( - sql = sql, - targetDialect = connectionDetails$dbms - ) - counts <- DatabaseConnector::querySql( - connection = connection, - sql = sql, - snakeCaseToCamelCase = T, - ) - - message("Computing aggregate before case covariate results") - - result <- FeatureExtraction::getDbCovariateData( - connection = connection, - oracleTempSchema = tempEmulationSchema, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortTable = "#agg_cohorts_before", - cohortTableIsTemp = T, - cohortIds = -1, - covariateSettings = covariateSettings, - cdmVersion = cdmVersion, - aggregated = T, - minCharacterizationMean = minCharacterizationMean - ) - - message("Computing aggregate during case covariate results") - - result2 <- FeatureExtraction::getDbCovariateData( - connection = connection, - oracleTempSchema = tempEmulationSchema, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortTable = "#agg_cohorts_cases", - cohortTableIsTemp = T, - cohortIds = -1, - covariateSettings = caseCovariateSettings, - cdmVersion = cdmVersion, - aggregated = T, - minCharacterizationMean = minCharacterizationMean - ) - - # drop temp tables - message("Dropping temp tables") - sql <- SqlRender::loadRenderTranslateSql( - sqlFilename = "DropCaseCovariate.sql", - packageName = "Characterization", - dbms = connectionDetails$dbms, - tempEmulationSchema = tempEmulationSchema - ) - DatabaseConnector::executeSql( - connection = connection, - sql = sql, progressBar = FALSE, - reportOverallTime = FALSE - ) - - # export all results to csv files - message("Exporting results to csv") - exportAndromedaToCsv( # TODO combine export of result and result2 - andromeda = result, - outputFolder = outputFolder, - cohortDetails = cohortDetails, - counts = counts, - databaseId = databaseId, - minCharacterizationMean = minCharacterizationMean - ) - exportAndromedaToCsv( - andromeda = result2, - outputFolder = outputFolder, - cohortDetails = cohortDetails, - counts = NULL, # previously added - databaseId = databaseId, - minCharacterizationMean = minCharacterizationMean - ) - - # append to incremental file - if(!is.null(incrementalFile)){ - saveIncremental( - cohortDetails = cohortDetails, - incrementalFile = incrementalFile - ) - } - - return(invisible(NULL)) -} - - -exportAndromedaToCsv <- function( - andromeda, - outputFolder, - cohortDetails, - counts, - databaseId, - minCharacterizationMean, - batchSize = 100000 -){ - - saveLocation <- outputFolder - if(!dir.exists(saveLocation)){ - dir.create(saveLocation, recursive = T) - } - - ids <- data.frame( - settingId = unique(cohortDetails$settingId), - databaseId = databaseId - ) - - # analysis_ref and covariate_ref 
- # add database_id and setting_id - if(!is.null(andromeda$analysisRef)){ - Andromeda::batchApply( - tbl = andromeda$analysisRef, - fun = function(x){ - data <- merge(x, ids) - colnames(data) <- SqlRender::camelCaseToSnakeCase(colnames(data)) - if(file.exists(file.path(saveLocation, 'analysis_ref.csv'))){ - append <- T - } else{ - append = F - } - readr::write_csv(data, file = file.path(saveLocation, 'analysis_ref.csv'), append = append) - }, - batchSize = batchSize - ) - } - - if(!is.null(andromeda$covariateRef)){ - Andromeda::batchApply( - tbl = andromeda$covariateRef, - fun = function(x){ - data <- merge(x, ids) - colnames(data) <- SqlRender::camelCaseToSnakeCase(colnames(data)) - if(file.exists(file.path(saveLocation, 'covariate_ref.csv'))){ - append <- T - } else{ - append = F - } - readr::write_csv(data, file = file.path(saveLocation, 'covariate_ref.csv'), append = append) - }, - batchSize = batchSize - ) - } - - # covariates and covariate_continuous - extras <- cohortDetails[, c('cohortDefinitionId','settingId', 'targetCohortId', 'outcomeCohortId', 'cohortType')] - extras$databaseId <- databaseId - extras$minCharacterizationMean <- minCharacterizationMean - - # add database_id, setting_id, target_cohort_id, outcome_cohort_id and cohort_type - if(!is.null(andromeda$covariates)){ - Andromeda::batchApply( - tbl = andromeda$covariates, - fun = function(x){ - data <- merge(x, extras, by = 'cohortDefinitionId') - data <- data %>% dplyr::select(-"cohortDefinitionId") - colnames(data) <- SqlRender::camelCaseToSnakeCase(colnames(data)) - if(file.exists(file.path(saveLocation, 'covariates.csv'))){ - append <- T - } else{ - append = F - } - readr::write_csv(data, file = file.path(saveLocation, 'covariates.csv'), append = append) - }, - batchSize = batchSize - ) - } - - if(!is.null(andromeda$covariatesContinuous)){ - Andromeda::batchApply( - tbl = andromeda$covariatesContinuous, - fun = function(x){ - data <- merge(x, extras %>% dplyr::select(-"minCharacterizationMean"), by = 'cohortDefinitionId') - data <- data %>% dplyr::select(-"cohortDefinitionId") - colnames(data) <- SqlRender::camelCaseToSnakeCase(colnames(data)) - if(file.exists(file.path(saveLocation, 'covariates_continuous.csv'))){ - append <- T - } else{ - append = F - } - readr::write_csv(data, file = file.path(saveLocation, 'covariates_continuous.csv'), append = append) - }, - batchSize = batchSize - ) - } - - # cohort_counts: - if(!is.null(counts)){ - cohortCounts <- cohortDetails %>% dplyr::select( - 'targetCohortId', - 'outcomeCohortId', - 'cohortType', - 'cohortDefinitionId', - 'riskWindowStart', - 'riskWindowEnd', - 'startAnchor', - 'endAnchor', - 'minPriorObservation', - 'outcomeWashoutDays' - ) %>% - dplyr::mutate( - databaseId = !!databaseId - ) %>% - dplyr::inner_join(counts, by = 'cohortDefinitionId') %>% - dplyr::select(-"cohortDefinitionId") - cohortCounts <- unique(cohortCounts) - colnames(cohortCounts) <- SqlRender::camelCaseToSnakeCase(colnames(cohortCounts)) - if(file.exists(file.path(saveLocation, 'cohort_counts.csv'))){ - append <- T - } else{ - append = F - } - readr::write_csv(cohortCounts, file = file.path(saveLocation, 'cohort_counts.csv'), append = append) - } - - return(invisible(T)) -} - - - -aggregateCsvs <- function( - outputFolder -){ - - # this makes sure results are recreated - firstTracker <- data.frame( - table = c('covariates.csv','covariates_continuous.csv','covariate_ref.csv','analysis_ref.csv','cohort_counts.csv'), - first = rep(T, 5) - ) - - analysisRefTracker <- c() - covariateRefTracker <- c() 
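An illustrative, standalone sketch of the append-and-deduplicate pattern used when per-execution csv files are concatenated: rows whose setting_id has already been written are filtered out before appending, mirroring the tracker logic in aggregateCsvs(). The function and file names here are placeholders, not part of the package.

library(dplyr)
library(readr)

# append one execution folder's reference csv to the combined csv,
# skipping setting_ids that were already written in an earlier folder
appendDedupedCsv <- function(loadPath, savePath, seenSettingIds) {
  data <- read_csv(loadPath, show_col_types = FALSE)
  data <- data %>% dplyr::filter(!.data$setting_id %in% seenSettingIds)
  write_csv(data, savePath, append = file.exists(savePath))
  unique(c(seenSettingIds, data$setting_id))  # return the updated tracker
}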
- - mainFolder <- 'results' - if(!dir.exists(file.path(outputFolder, mainFolder))){ - dir.create(file.path(outputFolder, mainFolder), recursive = T) - } - - executionFolder <- file.path(outputFolder, 'execution') - folderNames <- dir(executionFolder) - - # for each folder load covariates, covariates_continuous, - # covariate_ref and analysis_ref - for(folderName in folderNames){ - for(csvType in c('covariates.csv','covariates_continuous.csv','covariate_ref.csv','analysis_ref.csv','cohort_counts.csv')){ - - loadPath <- file.path(executionFolder, folderName, csvType) - savePath <- file.path(outputFolder, mainFolder, csvType) - if(file.exists(loadPath)){ - - #TODO do this in batches - data <- readr::read_csv( - file = loadPath, - show_col_types = F - ) - - if(csvType == 'analysis_ref.csv'){ - data <- data %>% - dplyr::filter( - !.data$setting_id %in% analysisRefTracker - ) - analysisRefTracker <- c(analysisRefTracker, unique(data$setting_id)) - } - if(csvType == 'covariate_ref.csv'){ - data <- data %>% - dplyr::filter( - !.data$setting_id %in% covariateRefTracker - ) - covariateRefTracker <- c(covariateRefTracker, unique(data$setting_id)) - } - - append <- file.exists(savePath) - readr::write_csv( - x = data, - file = savePath, quote = 'all', - append = append & !firstTracker$first[firstTracker$table == csvType] - ) - firstTracker$first[firstTracker$table == csvType] <- F - } - - } - } -} - -saveSettings <- function( - outputFolder, - cohortDetails, - databaseId, - covariateSettingsList, - caseCovariateSettingsList - ){ - - saveLocation <- file.path(outputFolder, 'results') - if(!dir.exists(saveLocation)){ - dir.create(saveLocation, recursive = T) - } - - covariateSettingLookup <- data.frame( - covariateSettingsId = 1:length(covariateSettingsList), - covariateSettingJson = unlist(lapply(covariateSettingsList, function(x){ParallelLogger::convertSettingsToJson(x)}) - )) - - caseCovariateSettingLookup <- data.frame( - caseCovariateSettingsId = 1:length(caseCovariateSettingsList), - caseCovariateSettingJson = unlist(lapply(caseCovariateSettingsList, function(x){ParallelLogger::convertSettingsToJson(x)}) - )) - - - settings <- cohortDetails %>% - dplyr::select( - 'settingId', - 'minPriorObservation', - 'outcomeWashoutDays', - 'riskWindowStart', - 'riskWindowEnd', - 'startAnchor', - 'endAnchor', - 'casePreTargetDuration', - 'casePostOutcomeDuration', - 'covariateSettingsId', - 'caseCovariateSettingsId' - ) %>% - dplyr::left_join( - covariateSettingLookup, - by = 'covariateSettingsId' - ) %>% - dplyr::left_join( - caseCovariateSettingLookup, - by = 'caseCovariateSettingsId' - ) %>% - dplyr::mutate( - databaseId = !!databaseId - ) %>% - dplyr::select( - -'covariateSettingsId', -'caseCovariateSettingsId' - ) - colnames(settings) <- SqlRender::camelCaseToSnakeCase(colnames(settings)) - settings <- unique(settings) - readr::write_csv(settings, file = file.path(saveLocation, 'settings.csv')) - -# cohort details: database_id, setting_id, target_cohort_id, outcome_cohort_id and cohort_type - cd <- cohortDetails %>% dplyr::select( - 'settingId', - 'targetCohortId', - 'outcomeCohortId', - 'cohortType' - ) %>% - dplyr::mutate( - databaseId = !!databaseId - ) - cd <- unique(cd) - colnames(cd) <- SqlRender::camelCaseToSnakeCase(colnames(cd)) - readr::write_csv(cd, file = file.path(saveLocation, 'cohort_details.csv')) -} diff --git a/R/Database.R b/R/Database.R index 6b81402..1dbfa03 100644 --- a/R/Database.R +++ b/R/Database.R @@ -62,6 +62,7 @@ createSqliteDatabase <- function( #' @param schema The schema for 
the result database #' @param resultsFolder The folder containing the csv results #' @param tablePrefix A prefix to append to the result tables for the characterization results +#' @param csvTablePrefix The prefix added to the csv results - default is 'c_' #' #' @return #' Returns the connection to the sqlite database @@ -71,12 +72,14 @@ insertResultsToDatabase <- function( connectionDetails, schema, resultsFolder, - tablePrefix = 'c_' + tablePrefix = '', + csvTablePrefix = 'c_' ){ specLoc <- system.file('settings', 'resultsDataModelSpecification.csv', package = 'Characterization') specs <- utils::read.csv(specLoc) colnames(specs) <- SqlRender::snakeCaseToCamelCase(colnames(specs)) + specs$tableName <- paste0(csvTablePrefix, specs$tableName) ResultModelManager::uploadResults( connectionDetails = connectionDetails, schema = schema, diff --git a/R/DechallengeRechallenge.R b/R/DechallengeRechallenge.R index f35ac60..4cd7520 100644 --- a/R/DechallengeRechallenge.R +++ b/R/DechallengeRechallenge.R @@ -85,10 +85,11 @@ createDechallengeRechallengeSettings <- function( #' @template ConnectionDetails #' @template TargetOutcomeTables #' @template TempEmulationSchema -#' @param dechallengeRechallengeSettings The settings for the timeToEvent study +#' @param settings The settings for the dechallenge rechallenge study #' @param databaseId An identifier for the database (string) #' @param outputFolder A directory to save the results as csv files #' @param minCellCount The minimum cell value to display, values less than this will be replaced by -1 +#' @param ... extra inputs #' #' @return #' An \code{Andromeda::andromeda()} object containing the dechallenge rechallenge results @@ -101,10 +102,11 @@ computeDechallengeRechallengeAnalyses <- function( outcomeDatabaseSchema = targetDatabaseSchema, outcomeTable = targetTable, tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), - dechallengeRechallengeSettings, + settings, databaseId = "database 1", outputFolder = file.path(getwd(),'results'), - minCellCount = 0 + minCellCount = 0, + ...
) { # check inputs errorMessages <- checkmate::makeAssertCollection() @@ -126,7 +128,7 @@ computeDechallengeRechallengeAnalyses <- function( errorMessages = errorMessages ) .checkDechallengeRechallengeSettings( - settings = dechallengeRechallengeSettings, + settings = settings, errorMessages = errorMessages ) @@ -158,10 +160,10 @@ computeDechallengeRechallengeAnalyses <- function( target_table = targetTable, outcome_database_schema = outcomeDatabaseSchema, outcome_table = outcomeTable, - target_ids = paste(dechallengeRechallengeSettings$targetCohortDefinitionIds, sep = "", collapse = ","), - outcome_ids = paste(dechallengeRechallengeSettings$outcomeCohortDefinitionIds, sep = "", collapse = ","), - dechallenge_stop_interval = dechallengeRechallengeSettings$dechallengeStopInterval, - dechallenge_evaluation_window = dechallengeRechallengeSettings$dechallengeEvaluationWindow + target_ids = paste(settings$targetCohortDefinitionIds, sep = "", collapse = ","), + outcome_ids = paste(settings$outcomeCohortDefinitionIds, sep = "", collapse = ","), + dechallenge_stop_interval = settings$dechallengeStopInterval, + dechallenge_evaluation_window = settings$dechallengeEvaluationWindow ) DatabaseConnector::executeSql( connection = connection, @@ -200,8 +202,8 @@ computeDechallengeRechallengeAnalyses <- function( message( paste0( "Computing dechallenge rechallenge for ", - length(dechallengeRechallengeSettings$targetCohortDefinitionIds), " target ids and ", - length(dechallengeRechallengeSettings$outcomeCohortDefinitionIds), "outcome ids took ", + length(settings$targetCohortDefinitionIds), " target ids and ", + length(settings$outcomeCohortDefinitionIds), " outcome ids took ", signif(delta, 3), " ", attr(delta, "units") ) @@ -225,11 +227,12 @@ computeDechallengeRechallengeAnalyses <- function( #' @template ConnectionDetails #' @template TargetOutcomeTables #' @template TempEmulationSchema -#' @param dechallengeRechallengeSettings The settings for the timeToEvent study +#' @param settings The settings for the rechallenge fail case series study #' @param databaseId An identifier for the database (string) #' @param showSubjectId if F then subject_ids are hidden (recommended if sharing results) #' @param outputFolder A directory to save the results as csv files #' @param minCellCount The minimum cell value to display, values less than this will be replaced by -1 +#' @param ... extra inputs #' #' @return #' An \code{Andromeda::andromeda()} object with the case series details of the failed rechallenge @@ -242,11 +245,12 @@ computeRechallengeFailCaseSeriesAnalyses <- function( outcomeDatabaseSchema = targetDatabaseSchema, outcomeTable = targetTable, tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), - dechallengeRechallengeSettings, + settings, databaseId = "database 1", showSubjectId = F, outputFolder = file.path(getwd(),'results'), - minCellCount = 0 + minCellCount = 0, + ...
){ # check inputs errorMessages <- checkmate::makeAssertCollection() @@ -268,7 +272,7 @@ computeRechallengeFailCaseSeriesAnalyses <- function( errorMessages = errorMessages ) .checkDechallengeRechallengeSettings( - settings = dechallengeRechallengeSettings, + settings = settings, errorMessages = errorMessages ) @@ -298,10 +302,10 @@ computeRechallengeFailCaseSeriesAnalyses <- function( target_table = targetTable, outcome_database_schema = outcomeDatabaseSchema, outcome_table = outcomeTable, - target_ids = paste(dechallengeRechallengeSettings$targetCohortDefinitionIds, sep = "", collapse = ","), - outcome_ids = paste(dechallengeRechallengeSettings$outcomeCohortDefinitionIds, sep = "", collapse = ","), - dechallenge_stop_interval = dechallengeRechallengeSettings$dechallengeStopInterval, - dechallenge_evaluation_window = dechallengeRechallengeSettings$dechallengeEvaluationWindow, + target_ids = paste(settings$targetCohortDefinitionIds, sep = "", collapse = ","), + outcome_ids = paste(settings$outcomeCohortDefinitionIds, sep = "", collapse = ","), + dechallenge_stop_interval = settings$dechallengeStopInterval, + dechallenge_evaluation_window = settings$dechallengeEvaluationWindow, show_subject_id = showSubjectId ) DatabaseConnector::executeSql( @@ -341,8 +345,8 @@ computeRechallengeFailCaseSeriesAnalyses <- function( message( paste0( "Computing dechallenge failed case series for ", - length(dechallengeRechallengeSettings$targetCohortDefinitionIds), " target IDs and ", - length(dechallengeRechallengeSettings$outcomeCohortDefinitionIds), " outcome IDs took ", + length(settings$targetCohortDefinitionIds), " target IDs and ", + length(settings$outcomeCohortDefinitionIds), " outcome IDs took ", signif(delta, 3), " ", attr(delta, "units") ) @@ -358,3 +362,121 @@ computeRechallengeFailCaseSeriesAnalyses <- function( return(invisible(TRUE)) } } + +getDechallengeRechallengeJobs <- function( + characterizationSettings, + threads +){ + + characterizationSettings <- characterizationSettings$dechallengeRechallengeSettings + if(length(characterizationSettings) == 0){ + return(NULL) + } + ind <- 1:length(characterizationSettings) + targetIds <- lapply(ind, function(i){characterizationSettings[[i]]$targetCohortDefinitionIds}) + outcomeIds <- lapply(ind, function(i){characterizationSettings[[i]]$outcomeCohortDefinitionIds}) + dechallengeStopIntervals <- lapply(ind, function(i){characterizationSettings[[i]]$dechallengeStopInterval}) + dechallengeEvaluationWindows <- lapply(ind, function(i){characterizationSettings[[i]]$dechallengeEvaluationWindow}) + + # get all combinations of TnOs, then split by threads + + combinations <- do.call(what = 'rbind', + args = + lapply( + 1:length(targetIds), + function(i){ + result <- expand.grid( + targetId = targetIds[[i]], + outcomeId = outcomeIds[[i]] + ) + result$dechallengeStopInterval <- dechallengeStopIntervals[[i]] + result$dechallengeEvaluationWindow <- dechallengeEvaluationWindows[[i]] + return(result) + } + ) + ) + # find out whether more Ts or more Os + tcount <- nrow( + combinations %>% + dplyr::count( + .data$targetId, + .data$dechallengeStopInterval, + .data$dechallengeEvaluationWindow + ) + ) + + ocount <- nrow( + combinations %>% + dplyr::count( + .data$outcomeId, + .data$dechallengeStopInterval, + .data$dechallengeEvaluationWindow + ) + ) + + if(threads > max(tcount, ocount)){ + message('Input parameter threads greater than number of targets and outcomes') + message(paste0('Only using ', max(tcount, ocount) ,' threads for DechallengeRechallenge')) + } + + if(tcount >= 
ocount){ + threadDf <- combinations %>% + dplyr::count( + .data$targetId, + .data$dechallengeStopInterval, + .data$dechallengeEvaluationWindow + ) + threadDf$thread = rep(1:threads, ceiling(tcount/threads))[1:tcount] + mergeColumn <- c('targetId','dechallengeStopInterval', 'dechallengeEvaluationWindow') + } else{ + threadDf <- combinations %>% + dplyr::count( + .data$outcomeId, + .data$dechallengeStopInterval, + .data$dechallengeEvaluationWindow + ) + threadDf$thread = rep(1:threads, ceiling(ocount/threads))[1:ocount] + mergeColumn <- c('outcomeId','dechallengeStopInterval', 'dechallengeEvaluationWindow') + } + + combinations <- merge(combinations, threadDf, by = mergeColumn) + sets <- lapply( + X = 1:max(threadDf$thread), + FUN = function(i){ + createDechallengeRechallengeSettings( + targetIds = unique(combinations$targetId[combinations$thread == i]), + outcomeIds = unique(combinations$outcomeId[combinations$thread == i]), + dechallengeStopInterval = unique(combinations$dechallengeStopInterval[combinations$thread == i]), + dechallengeEvaluationWindow = unique(combinations$dechallengeEvaluationWindow[combinations$thread == i]) + ) + } + ) + + # recreate settings + settings <- c() + for(i in 1:length(sets)){ + settings <- rbind(settings, + data.frame( + functionName = 'computeDechallengeRechallengeAnalyses', + settings = as.character(ParallelLogger::convertSettingsToJson( + sets[[i]] + )), + executionFolder = paste0('dr_', i), + jobId = paste0('dr_', i) + ) + ) + settings <- rbind(settings, + data.frame( + functionName = 'computeRechallengeFailCaseSeriesAnalyses', + settings = as.character(ParallelLogger::convertSettingsToJson( + sets[[i]] + )), + executionFolder = paste0('rfcs_', i), + jobId = paste0('rfcs_', i) + ) + ) + + } + + return(settings) +} diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index d9db90f..c784a97 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -239,3 +239,26 @@ add = errorMessages ) } + + + +checkNoCsv <- function( + csvFiles, + errorMessage + ){ + + csvExists <- sapply(csvFiles, function(x){file.exists(x)}) + + if(sum(csvExists) > 0){ + stop(errorMessage) + } + + return(invisible(TRUE)) +} + +cleanCsv <- function( + resultFolder, + fileName = 'time_to_event.csv' + ){ + file.remove(file.path(resultFolder, fileName )) +} diff --git a/R/Incremental.R b/R/Incremental.R new file mode 100644 index 0000000..0e458d8 --- /dev/null +++ b/R/Incremental.R @@ -0,0 +1,196 @@ +createIncrementalLog <- function( + executionFolder, + logname = 'execution.csv' + ){ + + if(!dir.exists(executionFolder)){ + dir.create(executionFolder, recursive = T) + } + + if(!file.exists(file.path(executionFolder, logname))){ + x <- data.frame( + run_date_time = Sys.time(), + job_id = 0, + start_time = Sys.time(), + end_time = Sys.time() + ) + readr::write_csv( + x = x, + file = file.path(executionFolder, logname) + ) + } + +} + +loadIncrementalFiles <- function(executionFolder){ + if(file.exists(file.path(executionFolder, 'execution.csv'))){ + executed <- utils::read.csv(file.path(executionFolder, 'execution.csv')) + } else{ + stop('execution.csv missing') + } + + if(file.exists(file.path(executionFolder, 'completed.csv'))){ + completed <- utils::read.csv(file.path(executionFolder, 'completed.csv')) + } else{ + stop('completed.csv missing') + } + return(list( + executed = executed, + completed = completed + )) +} + +getExecutionJobIssues <- function( + executed, + completed +){ + + executedJobs <- unique(executed$job_id) + completedJobs <- unique(completed$job_id) + + issues <- 
executedJobs[!executedJobs %in% completedJobs] + return(issues) +} + +#' Removes csv files from folders that have not been marked as completed +#' and removes the record of the execution file +#' +#' @param executionFolder The folder that has the execution files +#' +#' @return +#' NULL; called for its side effect of removing incomplete csv results +#' +#' @export +cleanIncremental <- function( + executionFolder +){ + + incrementalFiles <- loadIncrementalFiles( + executionFolder + ) + + issues <- getExecutionJobIssues( + executed = incrementalFiles$executed, + completed = incrementalFiles$completed + ) + + if(length(issues) > 0 ){ + # delete contents inside folder + for(i in 1:length(issues)){ + files <- dir(file.path(executionFolder,issues[i]), full.names = T) + for(file in files){ + message(paste0('Deleting incomplete result file ', file)) + file.remove(file) + } + } + } + + # now update the execution to remove the issue rows + executionFile <- utils::read.csv( + file = file.path(executionFolder, 'execution.csv') + ) + fixedExecution <- executionFile[!executionFile$job_id %in% issues,] + utils::write.csv( + x = fixedExecution, + file = file.path(executionFolder, 'execution.csv') + ) + +return(invisible(NULL)) +} + +checkResultFilesIncremental <- function( + executionFolder + ){ + + incrementalFiles <- loadIncrementalFiles( + executionFolder + ) + + issues <- getExecutionJobIssues( + executed = incrementalFiles$executed, + completed = incrementalFiles$completed + ) + + if(length(issues) > 0 ){ + stop(paste0('jobIds: ', paste0(issues, collapse = ','), ' executed but not completed. Please run cleanIncremental() to remove incomplete results.')) + } + + return(invisible(NULL)) +} + +findCompletedJobs <- function(executionFolder){ + incrementalFiles <- loadIncrementalFiles(executionFolder) + return(unique(incrementalFiles$completed$job_id)) +} + + +recordIncremental <- function( + executionFolder, + runDateTime, + jobId, + startTime, + endTime, + logname = 'execution.csv' +){ + if(file.exists(file.path(executionFolder, logname))){ + x <- data.frame( + run_date_time = runDateTime, + job_id = jobId, + start_time = startTime, + end_time = endTime + ) + readr::write_csv( + x = x, + file = file.path(executionFolder, logname), + append = T + ) + } else{ + warning(paste0(logname, ' file missing so no logging possible')) + } + +} + +#' Removes csv files from the execution folder as there should be no csv files +#' when running in non-incremental mode +#' +#' @param executionFolder The folder that has the execution files +#' +#' @return +#' NULL; called for its side effect of deleting csv files from the execution folder +#' +#' @export +cleanNonIncremental <- function( + executionFolder + ){ + # remove all files from the executionFolder + files <- dir( + path = executionFolder, + recursive = T, + full.names = T, + pattern = '.csv' + ) + if(length(files) > 0 ){ + for(file in files){ + message(paste0('Deleting file ', file)) + file.remove(file) + } + } +} + +checkResultFilesNonIncremental <- function( + executionFolder +){ + files <- dir( + path = executionFolder, + recursive = T, + full.names = T, + pattern = '.csv' + ) + if(length(files) > 0 ){ + errorMessage <- paste0('Running in non-incremental mode but csv files exist in the execution folder.', + ' Please delete them manually or by using cleanNonIncremental()') + stop(errorMessage) + } + + return(invisible(NULL)) +} diff --git a/R/RunCharacterization.R b/R/RunCharacterization.R index fb6c170..f182664 100644 --- a/R/RunCharacterization.R +++ b/R/RunCharacterization.R @@ -16,7 +16,8 @@ createCharacterizationSettings <- function( timeToEventSettings = NULL, 
dechallengeRechallengeSettings = NULL, - aggregateCovariateSettings = NULL) { + aggregateCovariateSettings = NULL +) { errorMessages <- checkmate::makeAssertCollection() .checkTimeToEventSettingsList( settings = timeToEventSettings, @@ -54,6 +55,7 @@ createCharacterizationSettings <- function( return(settings) } + #' Save the characterization settings as a json #' @description #' This function converts the settings into a json object and saves it @@ -115,8 +117,9 @@ loadCharacterizationSettings <- function( #' @template TempEmulationSchema #' @param cdmDatabaseSchema The schema with the OMOP CDM data #' @param characterizationSettings The study settings created using \code{createCharacterizationSettings} -#' @param saveDirectory The location to save the results to -#' @param tablePrefix A string to append the tables in the results +#' @param outputDirectory The location to save the final csv files to +#' @param executionPath The location where intermediate results are saved to +#' @param csvFilePrefix A string to append the csv files in the outputDirectory #' @param databaseId The unique identifier for the cdm database #' @param showSubjectId Whether to include subjectId of failed rechallenge case series or hide #' @param minCellCount The minimum count value that is calculated @@ -125,8 +128,7 @@ loadCharacterizationSettings <- function( #' @param minCharacterizationMean The minimum mean threshold to extract when running aggregate covariates #' #' @return -#' An sqlite database with the results is saved into the saveDirectory and a csv file named tacker.csv -#' details which analyses have run to completion. +#' Multiple csv files in the outputDirectory. #' #' @export runCharacterizationAnalyses <- function( @@ -138,15 +140,16 @@ runCharacterizationAnalyses <- function( tempEmulationSchema = NULL, cdmDatabaseSchema, characterizationSettings, - saveDirectory, - tablePrefix = "c_", + outputDirectory, + executionPath = file.path(outputDirectory, 'execution'), + csvFilePrefix = "c_", databaseId = "1", showSubjectId = F, minCellCount = 0, incremental = T, threads = 1, minCharacterizationMean = 0.01 - ) { +) { # inputs checks errorMessages <- checkmate::makeAssertCollection() .checkCharacterizationSettings( @@ -154,266 +157,312 @@ runCharacterizationAnalyses <- function( errorMessages = errorMessages ) .checkTablePrefix( - tablePrefix = tablePrefix, + tablePrefix = csvFilePrefix, errorMessages = errorMessages ) checkmate::reportAssertions( errorMessages ) - # load completed runs if file is present - completedRuns <- tryCatch( - {readr::read_csv(file.path(saveDirectory, "tracker.csv"), show_col_types = FALSE)}, - error = function(e){return(data.frame( - completedRuns = -1, - analysis_type = 'None', - run_id = 1 - ))} + runDateTime <- Sys.time() + + createDirectory(outputDirectory) + createDirectory(executionPath) + + #logger <- createLogger( + # logPath = file.path(outputDirectory), + # logName = 'log.txt' + #) + #ParallelLogger::registerLogger(logger) + #on.exit(ParallelLogger::unregisterLogger(logger)) + + jobs <- createJobs( + characterizationSettings = characterizationSettings, + threads = threads ) - if (!is.null(characterizationSettings$timeToEventSettings)) { - for (i in 1:length(characterizationSettings$timeToEventSettings)) { - message("Running time to event analysis ", i) - - runId <- i - - if(incremental){ - # check whether result already run - do we want to keep loaded or have this as a saved object? 
- done <- sum(runId %in% as.vector(completedRuns %>% - dplyr::filter( - .data$analysis_type == 'timeToEvent' - ) %>% - dplyr::select("run_id")) - ) > 0 - } else{ - done <- FALSE - } + # save settings + if(!file.exists(file.path(executionPath, 'settings.rds'))){ + saveRDS( + object = list( + characterizationSettings = characterizationSettings, + threads = threads + ), + file = file.path(executionPath, 'settings.rds') + ) + } - if(!done){ - - result <- tryCatch( - { - computeTimeToEventAnalyses( - connectionDetails = connectionDetails, - targetDatabaseSchema = targetDatabaseSchema, - targetTable = targetTable, - outcomeDatabaseSchema = outcomeDatabaseSchema, - outcomeTable = outcomeTable, - tempEmulationSchema = tempEmulationSchema, - cdmDatabaseSchema = cdmDatabaseSchema, - timeToEventSettings = characterizationSettings$timeToEventSettings[[i]], - databaseId = databaseId, - outputFolder = file.path(saveDirectory,'results'), - minCellCount = minCellCount - ) - }, - error = function(e) { - message(paste0("ERROR in time-to-event analysis: ", e$message)) - return(NULL) - } + if(incremental){ + # check for any issues with current incremental + oldSettings <- readRDS( + file = file.path(executionPath, 'settings.rds') ) + if(!identical(characterizationSettings,oldSettings$characterizationSettings)){ + stop('Settings have changed - please turn off incremental') + } + if(!identical(threads,oldSettings$threads)){ + stop('Cannot change number of threads in incremental model') + } - if (!is.null(result)) { - # log that run was sucessful - readr::write_csv( - x = data.frame( - analysis_type = "timeToEvent", - run_id = runId, - database_id = databaseId, - date_time = as.character(Sys.time()) - ), - file = file.path(saveDirectory, "tracker.csv"), - append = file.exists(file.path(saveDirectory, "tracker.csv")) - ) - } - } else{ - message('Results exist for setting - Skipping') - } + # create logs if not exists + createIncrementalLog( + executionFolder = executionPath, + logname = 'execution.csv' + ) + createIncrementalLog( + executionFolder = executionPath, + logname = 'completed.csv' + ) + + checkResultFilesIncremental( + executionFolder = executionPath + ) + + # remove any previously completed jobs + completedJobIds <- findCompletedJobs(executionFolder = executionPath) + + completedJobIndex <- jobs$jobId %in% completedJobIds + if(sum(completedJobIndex) > 0){ + message(paste0('Removing ', sum(completedJobIndex), ' previously completed jobs')) + jobs <- jobs[!completedJobIndex,] } + + if(nrow(jobs) == 0){ + message('No jobs left') + return(invisible(T)) + } + + } else{ + # check for any csv files in folder + checkResultFilesNonIncremental( + executionFolder = executionPath + ) } - if (!is.null(characterizationSettings$dechallengeRechallengeSettings)) { - for (i in 1:length(characterizationSettings$dechallengeRechallengeSettings)) { - ParallelLogger::logInfo(paste0("Running dechallenge rechallenge analysis ", i)) - - runId <- i - # check whether result already run - do we want to keep loaded or have this as a saved object? - if(incremental){ - # check whether result already run - do we want to keep loaded or have this as a saved object? 
- done <- sum(runId %in% as.vector(completedRuns %>% - dplyr::filter( - .data$analysis_type == 'dechallengeRechallenge' - ) %>% - dplyr::select("run_id")) - ) > 0 - } else{ - done <- FALSE - } - if(!done){ - - result <- tryCatch( - { - computeDechallengeRechallengeAnalyses( - connectionDetails = connectionDetails, - targetDatabaseSchema = targetDatabaseSchema, - targetTable = targetTable, - outcomeDatabaseSchema = outcomeDatabaseSchema, - outcomeTable = outcomeTable, - tempEmulationSchema = tempEmulationSchema, - dechallengeRechallengeSettings = characterizationSettings$dechallengeRechallengeSettings[[i]], - databaseId = databaseId, - outputFolder = file.path(saveDirectory,'results'), - minCellCount = minCellCount - ) - }, - error = function(e) { - message(paste0("ERROR in dechallenge rechallenge analysis: ", e$message)) - return(NULL) - } - ) - if (!is.null(result)) { - # log that run was sucessful - readr::write_csv( - x = data.frame( - analysis_type = "dechallengeRechallenge", - run_id = runId, - database_id = databaseId, - date_time = as.character(Sys.time()) - ), - file = file.path(saveDirectory, "tracker.csv"), - append = file.exists(file.path(saveDirectory, "tracker.csv")) - ) - } - } else{ - message('Results exist - Skipping') - } + # Now loop over the jobs + inputSettings <- list( + connectionDetails = connectionDetails, + targetDatabaseSchema = targetDatabaseSchema, + targetTable = targetTable, + outcomeDatabaseSchema = outcomeDatabaseSchema, + outcomeTable = outcomeTable, + tempEmulationSchema = tempEmulationSchema, + cdmDatabaseSchema = cdmDatabaseSchema, + databaseId = databaseId, + showSubjectId = showSubjectId, + minCellCount = minCellCount, + minCharacterizationMean = minCharacterizationMean, + executionPath = executionPath, + incremental = incremental + ) - # run failed analysis - message("Running rechallenge failed case analysis ", i) - - if(incremental){ - # check whether result already run - do we want to keep loaded or have this as a saved object? 
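For reference, an illustrative sketch of what a single row of the jobs data.frame produced by createJobs() looks like and how it relates to the shared inputSettings above; the target and outcome ids are placeholders.

# one job: the analysis function to call, its settings serialised to json,
# and the folder / job id used for incremental tracking
exampleJob <- data.frame(
  functionName = 'computeTimeToEventAnalyses',
  settings = as.character(ParallelLogger::convertSettingsToJson(
    createTimeToEventSettings(targetIds = 1, outcomeIds = 3)
  )),
  executionFolder = 'tte_1',
  jobId = 'tte_1'
)
# runCharacterizationsInParallel() merges such a row with inputSettings, restores the
# settings via ParallelLogger::convertJsonToSettings() and dispatches with do.call().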
- done <- sum(runId %in% as.vector(completedRuns %>% - dplyr::filter( - .data$analysis_type == 'rechallengeFailCaseSeries' - ) %>% - dplyr::select("run_id")) - ) > 0 - } else{ - done <- FALSE - } + # convert jobList to list with extra inputs + jobList <- lapply( + X = 1:nrow(jobs), + FUN = function(ind){ + inputs <- inputSettings + inputs$settings <- jobs$settings[ind] + inputs$functionName <- jobs$functionName[ind] + inputs$executionFolder<- jobs$executionFolder[ind] + inputs$jobId <-jobs$jobId[ind] + inputs$runDateTime <- runDateTime + return(inputs) + }) + + message('Creating new cluster') + cluster <- ParallelLogger::makeCluster( + numberOfThreads = threads, + singleThreadToMain = T, + setAndromedaTempFolder = T + ) - if(!done){ - - result <- tryCatch( - { - computeRechallengeFailCaseSeriesAnalyses( - connectionDetails = connectionDetails, - targetDatabaseSchema = targetDatabaseSchema, - targetTable = targetTable, - outcomeDatabaseSchema = outcomeDatabaseSchema, - outcomeTable = outcomeTable, - tempEmulationSchema = tempEmulationSchema, - dechallengeRechallengeSettings = characterizationSettings$dechallengeRechallengeSettings[[i]], - databaseId = databaseId, - showSubjectId = showSubjectId, - outputFolder = file.path(saveDirectory,'results'), - minCellCount = minCellCount - ) - }, - error = function(e) { - message(paste0("ERROR in rechallenge failed case analysis: ", e$message)) - return(NULL) - } + ParallelLogger::clusterApply( + cluster = cluster, + x = jobList, + fun = runCharacterizationsInParallel + ) + + # code to export all csvs into one file + aggregateCsvs( + outputFolder = outputDirectory, + executionPath = executionPath, + executionFolders = jobs$executionFolder, + csvFilePrefix = csvFilePrefix + ) + + invisible(outputDirectory) +} + +createDirectory <- function(x){ + if(!dir.exists(x)){ + message(paste0('Creating directory ', x)) + dir.create(x, recursive = T) + } +} + +createLogger <- function(logPath, logName){ + createDirectory(logPath) + ParallelLogger::createLogger( + appenders = ParallelLogger::createFileAppender( + fileName = file.path(logPath, logName), + layout = ParallelLogger::layoutParallel + ) + ) +} + +runCharacterizationsInParallel <- function(x){ + + startTime <- Sys.time() + + functionName <- x$functionName + inputSettings <- x + inputSettings$functionName <- NULL + inputSettings$settings <- ParallelLogger::convertJsonToSettings(inputSettings$settings) + inputSettings$outputFolder <- file.path(x$executionPath, x$executionFolder) + + if(x$incremental){ + recordIncremental( + executionFolder = x$executionPath, + runDateTime = x$runDateTime, + jobId = x$jobId, + startTime = startTime, + endTime = startTime, + logname = 'execution.csv' + ) + } + + completed <- tryCatch( + { + do.call( + what = eval(parse(text = functionName)), + args = inputSettings ) + }, error = function(e){print(e); return(FALSE)} - if (!is.null(result)) { - # log that run was successful - readr::write_csv( - x = data.frame( - analysis_type = "rechallengeFailCaseSeries", - run_id = runId, - database_id = databaseId, - date_time = as.character(Sys.time()) - ), - file = file.path(saveDirectory, "tracker.csv"), - append = file.exists(file.path(saveDirectory, "tracker.csv")) - ) - } - } else{ - message('Results exist - Skipping') - } + ) + + endTime <- Sys.time() + + # if it completed without issues save it + if(x$incremental & completed){ + recordIncremental( + executionFolder = x$executionPath, + runDateTime = x$runDateTime, + jobId = x$jobId, + startTime = startTime, + endTime = endTime, + 
logname = 'completed.csv' + ) } - } +} +createJobs <- function( + characterizationSettings, + threads +){ + + jobDf <- rbind( + getTimeToEventJobs( + characterizationSettings, + threads + ), + getDechallengeRechallengeJobs( + characterizationSettings, + threads + ), + getAggregateCovariatesJobs( + characterizationSettings, + threads + ) + ) - if (!is.null(characterizationSettings$aggregateCovariateSettings)) { - ParallelLogger::logInfo("Running aggregate covariate analyses") + #data.frame( + # functionName, + # settings # json, + # executionFolder, + # jobId + #) - runId <- 1 + return(jobDf) +} - # check whether result already run - do we want to keep loaded or have this as a saved object? - if(incremental){ - # check whether result already run - do we want to keep loaded or have this as a saved object? - done <- sum(runId %in% as.vector(completedRuns %>% - dplyr::filter( - .data$analysis_type == 'aggregateCovariates' - ) %>% - dplyr::select("run_id")) - ) > 0 - } else{ - done <- FALSE - } - if(!done){ - result <- tryCatch( - { - computeAggregateCovariateAnalyses( - connectionDetails = connectionDetails, - cdmDatabaseSchema = cdmDatabaseSchema, - targetDatabaseSchema = targetDatabaseSchema, - targetTable = targetTable, - outcomeDatabaseSchema = outcomeDatabaseSchema, - outcomeTable = outcomeTable, - tempEmulationSchema = tempEmulationSchema, - aggregateCovariateSettings = characterizationSettings$aggregateCovariateSettings, - databaseId = databaseId, - runId = runId, - threads = threads, - minCharacterizationMean = minCharacterizationMean, - outputFolder = saveDirectory, - incrementalFile = ifelse(incremental, file.path(saveDirectory, 'executed.csv'), NULL) - ) - }, - error = function(e) { - message(paste0("ERROR in aggregate covariate analyses: ", e$message)) - message(e) - return(NULL) - } - ) - if(!is.null(result)) { - # log that run was successful - readr::write_csv( - x = data.frame( - analysis_type = "aggregateCovariates", - run_id = runId, - database_id = databaseId, - date_time = as.character(Sys.time()) - ), - file = file.path(saveDirectory, "tracker.csv"), - append = file.exists(file.path(saveDirectory, "tracker.csv")) - ) +aggregateCsvs <- function( + executionPath, + outputFolder, + executionFolders, # needed? 
+ csvFilePrefix +){ - } + tables <- c('cohort_details.csv', 'settings.csv','covariates.csv', + 'covariates_continuous.csv','covariate_ref.csv', + 'analysis_ref.csv','cohort_counts.csv', + 'time_to_event.csv', + 'rechallenge_fail_case_series.csv', 'dechallenge_rechallenge.csv') - } else{ - message('Results Exist - Skipping run') - } # skip if done - } + # this makes sure results are recreated + firstTracker <- data.frame( + table = tables, + first = rep(T, length(tables)) + ) + + analysisRefTracker <- c() + covariateRefTracker <- c() + settingsTracker <- c() + + # create outputFolder + + folderNames <- dir(executionPath) + + # for each folder load covariates, covariates_continuous, + # covariate_ref and analysis_ref + for(folderName in folderNames){ + for(csvType in tables){ + + loadPath <- file.path(executionPath, folderName, csvType) + savePath <- file.path(outputFolder, paste0(csvFilePrefix,csvType)) + if(file.exists(loadPath)){ + + #TODO do this in batches + data <- readr::read_csv( + file = loadPath, + show_col_types = F + ) + + if(csvType == 'analysis_ref.csv'){ + data <- data %>% + dplyr::filter( # need to filter analysis_id and covariate_setting_id + !.data$setting_id %in% analysisRefTracker + ) + analysisRefTracker <- c(analysisRefTracker, unique(data$setting_id)) + } + if(csvType == 'covariate_ref.csv'){ + data <- data %>% + dplyr::filter( + !.data$setting_id %in% covariateRefTracker + ) + covariateRefTracker <- c(covariateRefTracker, unique(data$setting_id)) + } + if(csvType == 'settings.csv'){ + data <- data %>% + dplyr::filter( + !.data$setting_id %in% settingsTracker + ) + settingsTracker <- c(settingsTracker, unique(data$setting_id)) + } + append <- file.exists(savePath) + readr::write_csv( + x = data, + file = savePath, quote = 'all', + append = append & !firstTracker$first[firstTracker$table == csvType] + ) + firstTracker$first[firstTracker$table == csvType] <- F + } - invisible(saveDirectory) + } + } } diff --git a/R/SaveLoad.R b/R/SaveLoad.R index 4348366..0bc025f 100644 --- a/R/SaveLoad.R +++ b/R/SaveLoad.R @@ -14,13 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
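To make the renamed arguments concrete, an illustrative end-to-end sketch; the connection details, schema names, cohort ids and paths are placeholders, and 'Eunomia' is only used as an example database id.

settings <- createCharacterizationSettings(
  timeToEventSettings = list(createTimeToEventSettings(targetIds = 1, outcomeIds = 3))
)

runCharacterizationAnalyses(
  connectionDetails = connectionDetails,  # assumed DatabaseConnector::createConnectionDetails() output
  cdmDatabaseSchema = 'main',
  targetDatabaseSchema = 'main',
  targetTable = 'cohort',
  characterizationSettings = settings,
  outputDirectory = file.path(tempdir(), 'characterization'),
  executionPath = file.path(tempdir(), 'characterization', 'execution'),
  csvFilePrefix = 'c_',
  databaseId = 'Eunomia',
  threads = 1,
  incremental = TRUE
)

# if an incremental run is interrupted, clear the incomplete jobs before re-running:
# cleanIncremental(file.path(tempdir(), 'characterization', 'execution'))

insertResultsToDatabase(
  connectionDetails = resultsConnectionDetails,  # assumed result-database connection, e.g. an sqlite database
  schema = 'main',
  resultsFolder = file.path(tempdir(), 'characterization'),
  csvTablePrefix = 'c_'
)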
-colnamesLower <- function(data) { - colnames(data) <- tolower( - x = colnames(data) - ) - return(data) -} - #' export the TimeToEvent results as csv #' #' @param result The output of running \code{computeTimeToEventAnalyses()} @@ -61,13 +54,13 @@ exportTimeToEventToCsv <- function( string = colnames(dat) ) - if (sum(dat$NUM_EVENTS < minCellCount) > 0) { - ParallelLogger::logInfo(paste0("Removing NUM_EVENTS less than ", minCellCount)) - dat$NUM_EVENTS[dat$NUM_EVENTS < minCellCount] <- -1 + if (sum(dat$num_events < minCellCount) > 0) { + ParallelLogger::logInfo(paste0("Removing num_events less than ", minCellCount)) + dat$num_events[dat$num_events < minCellCount] <- -minCellCount } readr::write_csv( - x = dat, + x = formatDouble(x = dat), file = file.path( saveDirectory, "time_to_event.csv" @@ -105,6 +98,10 @@ exportDechallengeRechallengeToCsv <- function( ) message("Writing ", countN, " rows to csv") + if(!dir.exists(saveDirectory)){ + dir.create(saveDirectory, recursive = T) + } + Andromeda::batchApply( tbl = result$dechallengeRechallenge, fun = function(x) { @@ -123,50 +120,58 @@ exportDechallengeRechallengeToCsv <- function( string = colnames(dat) ) - removeInd <- dat$NUM_EVENTS < minCellCount + removeInd <- dat$num_persons_exposed < minCellCount + if (sum(removeInd) > 0) { + ParallelLogger::logInfo(paste0("Removing num_persons_exposed counts less than ", minCellCount)) + if (sum(removeInd) > 0) { + dat$num_persons_exposed[removeInd] <- -minCellCount + } + } + + removeInd <- dat$num_cases < minCellCount if (sum(removeInd) > 0) { - ParallelLogger::logInfo(paste0("Removing NUM_EVENTS counts less than ", minCellCount)) + ParallelLogger::logInfo(paste0("Removing num_cases counts less than ", minCellCount)) if (sum(removeInd) > 0) { - dat$NUM_CASES[removeInd] <- -1 + dat$num_cases[removeInd] <- -minCellCount } } - removeInd <- dat$DECHALLENGE_ATTEMPT < minCellCount + removeInd <- dat$dechallenge_attempt < minCellCount if (sum(removeInd) > 0) { - ParallelLogger::logInfo(paste0("Removing DECHALLENGE_ATTEMPT counts less than ", minCellCount)) + ParallelLogger::logInfo(paste0("Removing dechallenge_attempt counts less than ", minCellCount)) if (sum(removeInd) > 0) { - dat$DECHALLENGE_ATTEMPT[removeInd] <- -1 + dat$dechallenge_attempt[removeInd] <- -minCellCount } } - removeInd <- dat$DECHALLENGE_FAIL < minCellCount | dat$DECHALLENGE_SUCCESS < minCellCount + removeInd <- dat$dechallenge_fail < minCellCount | dat$dechallenge_success < minCellCount if (sum(removeInd) > 0) { ParallelLogger::logInfo(paste0("Removing DECHALLENGE FAIL or SUCCESS counts less than ", minCellCount)) if (sum(removeInd) > 0) { - dat$DECHALLENGE_FAIL[removeInd] <- -1 - dat$DECHALLENGE_SUCCESS[removeInd] <- -1 + dat$dechallenge_fail[removeInd] <- -minCellCount + dat$dechallenge_success[removeInd] <- -minCellCount } } - removeInd <- dat$RECHALLENGE_ATTEMPT < minCellCount + removeInd <- dat$rechallenge_attempt < minCellCount if (sum(removeInd) > 0) { - ParallelLogger::logInfo(paste0("Removing RECHALLENGE_ATTEMPT counts less than ", minCellCount)) + ParallelLogger::logInfo(paste0("Removing rechallenge_attempt counts less than ", minCellCount)) if (sum(removeInd) > 0) { - dat$RECHALLENGE_ATTEMPT[removeInd] <- -1 + dat$rechallenge_attempt[removeInd] <- -minCellCount } } - removeInd <- dat$RECHALLENGE_FAIL < minCellCount | dat$RECHALLENGE_SUCCESS < minCellCount + removeInd <- dat$rechallenge_fail < minCellCount | dat$rechallenge_success < minCellCount if (sum(removeInd) > 0) { - ParallelLogger::logInfo(paste0("Removing RECHALLENGE 
FAIL or SUCCESS counts less than ", minCellCount)) + ParallelLogger::logInfo(paste0("Removing rechallenge_fail or rechallenge_success counts less than ", minCellCount)) if (sum(removeInd) > 0) { - dat$RECHALLENGE_FAIL[removeInd] <- -1 - dat$RECHALLENGE_SUCCESS[removeInd] <- -1 + dat$rechallenge_fail[removeInd] <- -minCellCount + dat$rechallenge_success[removeInd] <- -minCellCount } } readr::write_csv( - x = dat, + x = formatDouble(x = dat), file = file.path( saveDirectory, "dechallenge_rechallenge.csv" @@ -195,7 +200,8 @@ exportDechallengeRechallengeToCsv <- function( #' @export exportRechallengeFailCaseSeriesToCsv <- function( result, - saveDirectory) { + saveDirectory + ) { if (!dir.exists(saveDirectory)) { dir.create( path = saveDirectory, @@ -229,7 +235,7 @@ exportRechallengeFailCaseSeriesToCsv <- function( ) readr::write_csv( - x = dat, + x = formatDouble(x = dat), file = file.path( saveDirectory, "rechallenge_fail_case_series.csv" diff --git a/R/TimeToEvent.R b/R/TimeToEvent.R index 5c2ed06..7de912c 100644 --- a/R/TimeToEvent.R +++ b/R/TimeToEvent.R @@ -25,7 +25,8 @@ #' @export createTimeToEventSettings <- function( targetIds, - outcomeIds) { + outcomeIds + ) { # check indicationIds errorMessages <- checkmate::makeAssertCollection() # check targetIds is a vector of int/double @@ -59,10 +60,11 @@ createTimeToEventSettings <- function( #' @template TargetOutcomeTables #' @template TempEmulationSchema #' @param cdmDatabaseSchema The database schema containing the OMOP CDM data -#' @param timeToEventSettings The settings for the timeToEvent study +#' @param settings The settings for the timeToEvent study #' @param databaseId An identifier for the database (string) #' @param outputFolder A directory to save the results as csv files #' @param minCellCount The minimum cell value to display, values less than this will be replaced by -1 +#' @param ... extra inputs #' #' @return #' An \code{Andromeda::andromeda()} object containing the time to event results. @@ -76,10 +78,11 @@ computeTimeToEventAnalyses <- function( outcomeTable = targetTable, tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), cdmDatabaseSchema, - timeToEventSettings, + settings, databaseId = "database 1", outputFolder = file.path(getwd(),'results'), - minCellCount = 0 + minCellCount = 0, + ... 
) { # check inputs errorMessages <- checkmate::makeAssertCollection() @@ -101,7 +104,7 @@ computeTimeToEventAnalyses <- function( errorMessages = errorMessages ) .checkTimeToEventSettings( - settings = timeToEventSettings, + settings = settings, errorMessages = errorMessages ) @@ -121,8 +124,8 @@ computeTimeToEventAnalyses <- function( message("Uploading #cohort_settings") pairs <- expand.grid( - targetCohortDefinitionId = timeToEventSettings$targetIds, - outcomeCohortDefinitionId = timeToEventSettings$outcomeIds + targetCohortDefinitionId = settings$targetIds, + outcomeCohortDefinitionId = settings$outcomeIds ) DatabaseConnector::insertTable( @@ -189,8 +192,8 @@ computeTimeToEventAnalyses <- function( message( paste0( "Computing time-to-event for ", - nrow(timeToEventSettings), - "T-O pairs took ", + nrow(pairs), + " T-O pairs took ", signif(delta, 3), " ", attr(delta, "units") ) @@ -207,3 +210,84 @@ computeTimeToEventAnalyses <- function( return(invisible(TRUE)) } } + +# code that takes a characterizationSettings list, extracts +# timeToEvent settings and then converts into distinct jobs +# based on the number of threads +getTimeToEventJobs <- function( + characterizationSettings, + threads +){ + + + characterizationSettings <- characterizationSettings$timeToEventSettings + if(length(characterizationSettings) == 0){ + return(NULL) + } + ind <- 1:length(characterizationSettings) + targetIds <- lapply(ind, function(i){characterizationSettings[[i]]$targetIds}) + outcomeIds <- lapply(ind, function(i){characterizationSettings[[i]]$outcomeIds}) + + # get all combinations of TnOs, then split by threads + + tnos <- do.call(what = 'rbind', + args = + lapply( + 1:length(targetIds), + function(i){expand.grid( + targetId = targetIds[[i]], + outcomeId = outcomeIds[[i]] + )} + ) + ) + # find out whether more Ts or more Os + tcount <- length(unique(tnos$targetId)) + ocount <- length(unique(tnos$outcomeId)) + + if(threads > max(tcount, ocount)){ + message('Input parameter threads greater than number of targets and outcomes') + message(paste0('Only using ', max(tcount, ocount) ,' threads for TimeToEvent')) + } + + if(tcount >= ocount){ + threadDf <- data.frame( + targetId = unique(tnos$targetId), + thread = rep(1:threads, ceiling(tcount/threads))[1:tcount] + ) + mergeColumn <- 'targetId' + } else{ + threadDf <- data.frame( + outcomeId = unique(tnos$outcomeId), + thread = rep(1:threads, ceiling(ocount/threads))[1:ocount] + ) + mergeColumn <- 'outcomeId' + } + + tnos <- merge(tnos, threadDf, by = mergeColumn) + sets <- lapply( + X = 1:max(threadDf$thread), + FUN = function(i){ + createTimeToEventSettings( + targetIds = unique(tnos$targetId[tnos$thread == i]), + outcomeIds = unique(tnos$outcomeId[tnos$thread == i]) + ) + } + ) + + # recreate settings + settings <- c() + for(i in 1:length(sets)){ + settings <- rbind(settings, + data.frame( + functionName = 'computeTimeToEventAnalyses', + settings = as.character(ParallelLogger::convertSettingsToJson( + sets[[i]] + )), + executionFolder = paste0('tte_', i), + jobId = paste0('tte_', i) + ) + ) + } + + return(settings) +} diff --git a/R/ViewShiny.R b/R/ViewShiny.R index 2bbb0dd..ef0b9bc 100644 --- a/R/ViewShiny.R +++ b/R/ViewShiny.R @@ -25,7 +25,9 @@ viewCharacterization <- function( prepareCharacterizationShiny <- function( resultFolder, cohortDefinitionSet, - sqliteLocation = file.path(tempdir(), 'results.sqlite') + sqliteLocation = file.path(tempdir(), 'results.sqlite'), + tablePrefix = '', + csvTablePrefix = 'c_' ) { if(!dir.exists(dirname(sqliteLocation))){ 
@@ -46,7 +48,7 @@ prepareCharacterizationShiny <- function( targetDialect = "sqlite", deleteExistingTables = T, createTables = T, - tablePrefix = 'c_' + tablePrefix = paste0(tablePrefix, csvTablePrefix) ) # upload the results @@ -54,7 +56,8 @@ prepareCharacterizationShiny <- function( connectionDetails = connectionDetails, schema = 'main', resultsFolder = resultFolder, - tablePrefix = 'c_' + tablePrefix = tablePrefix, + csvTablePrefix = csvTablePrefix ) # add extra tables (cohorts and databases) @@ -63,15 +66,16 @@ prepareCharacterizationShiny <- function( tables <- tolower(DatabaseConnector::getTableNames(con, "main")) + # this now works for different prefixes if (!"cg_cohort_definition" %in% tables) { cohortIds <- unique( c( - DatabaseConnector::querySql(con, "select distinct TARGET_COHORT_ID from c_cohort_details where TARGET_COHORT_ID != 0;")$TARGET_COHORT_ID, - DatabaseConnector::querySql(con, "select distinct OUTCOME_COHORT_ID from c_cohort_details where OUTCOME_COHORT_ID != 0;")$OUTCOME_COHORT_ID, - DatabaseConnector::querySql(con, "select distinct TARGET_COHORT_DEFINITION_ID from c_time_to_event;")$TARGET_COHORT_DEFINITION_ID, - DatabaseConnector::querySql(con, "select distinct OUTCOME_COHORT_DEFINITION_ID from c_time_to_event;")$OUTCOME_COHORT_DEFINITION_ID, - DatabaseConnector::querySql(con, "select distinct TARGET_COHORT_DEFINITION_ID from c_rechallenge_fail_case_series;")$TARGET_COHORT_DEFINITION_ID, - DatabaseConnector::querySql(con, "select distinct OUTCOME_COHORT_DEFINITION_ID from c_rechallenge_fail_case_series;")$OUTCOME_COHORT_DEFINITION_ID + DatabaseConnector::querySql(con, paste0("select distinct TARGET_COHORT_ID from ",tablePrefix,csvTablePrefix,"cohort_details where COHORT_TYPE = 'Target';"))$TARGET_COHORT_ID, + DatabaseConnector::querySql(con, paste0("select distinct OUTCOME_COHORT_ID from ",tablePrefix,csvTablePrefix,"cohort_details where COHORT_TYPE = 'TnO';"))$OUTCOME_COHORT_ID, + DatabaseConnector::querySql(con, paste0("select distinct TARGET_COHORT_DEFINITION_ID from ",tablePrefix,csvTablePrefix,"time_to_event;"))$TARGET_COHORT_DEFINITION_ID, + DatabaseConnector::querySql(con, paste0("select distinct OUTCOME_COHORT_DEFINITION_ID from ",tablePrefix,csvTablePrefix,"time_to_event;"))$OUTCOME_COHORT_DEFINITION_ID, + DatabaseConnector::querySql(con, paste0("select distinct TARGET_COHORT_DEFINITION_ID from ",tablePrefix,csvTablePrefix,"rechallenge_fail_case_series;"))$TARGET_COHORT_DEFINITION_ID, + DatabaseConnector::querySql(con, paste0("select distinct OUTCOME_COHORT_DEFINITION_ID from ",tablePrefix,csvTablePrefix,"rechallenge_fail_case_series;"))$OUTCOME_COHORT_DEFINITION_ID ) ) @@ -90,9 +94,9 @@ prepareCharacterizationShiny <- function( if (!"database_meta_data" %in% tables) { dbIds <- unique( c( - DatabaseConnector::querySql(con, "select distinct DATABASE_ID from c_analysis_ref;")$DATABASE_ID, - DatabaseConnector::querySql(con, "select distinct DATABASE_ID from c_dechallenge_rechallenge;")$DATABASE_ID, - DatabaseConnector::querySql(con, "select distinct DATABASE_ID from c_time_to_event;")$DATABASE_ID + DatabaseConnector::querySql(con, paste0("select distinct DATABASE_ID from ",tablePrefix,csvTablePrefix,"analysis_ref;"))$DATABASE_ID, + DatabaseConnector::querySql(con, paste0("select distinct DATABASE_ID from ",tablePrefix,csvTablePrefix,"dechallenge_rechallenge;"))$DATABASE_ID, + DatabaseConnector::querySql(con, paste0("select distinct DATABASE_ID from ",tablePrefix,csvTablePrefix,"time_to_event;"))$DATABASE_ID ) ) @@ -115,7 +119,7 @@ prepareCharacterizationShiny 
<- function( server = server ), schema = "main", - tablePrefix = "c_", + tablePrefix = paste0(tablePrefix,csvTablePrefix), cohortTablePrefix = "cg_", databaseTable = "DATABASE_META_DATA" ) diff --git a/inst/sql/sql_server/CaseCohortsPart1.sql b/inst/sql/sql_server/CaseCohortsPart1.sql index db31087..4275113 100644 --- a/inst/sql/sql_server/CaseCohortsPart1.sql +++ b/inst/sql/sql_server/CaseCohortsPart1.sql @@ -47,7 +47,7 @@ and datediff(day, op.observation_period_start_date, o.cohort_start_date) >= @min -- 2) get all the people with the outcome during washout -drop table if exists #target_outcome_prior; +drop table if exists #case_exclude; -- people with outcome prior select @@ -56,7 +56,7 @@ t.cohort_start_date, t.cohort_end_date, t.cohort_definition_id as target_cohort_id, o.cohort_definition_id as outcome_cohort_id -into #target_outcome_prior +into #case_exclude from #targets_inclusions t inner join #outcomes_washout o on t.subject_id = o.subject_id where @@ -68,8 +68,8 @@ o.cohort_start_date <= dateadd(day, -1, t.cohort_start_date); ---- Create TAR agnostic cohorts -drop table if exists #agg_cohorts_before; -select * into #agg_cohorts_before +drop table if exists #cases; +select * into #cases from ( @@ -80,13 +80,13 @@ tno.subject_id, tno.cohort_start_date, tno.cohort_end_date, cd.cohort_definition_id -from #target_outcome_prior tno +from #case_exclude tno INNER JOIN #cohort_details cd on cd.target_cohort_id = tno.target_cohort_id and cd.outcome_cohort_id = tno.outcome_cohort_id -and cd.cohort_type = 'TnOprior' +and cd.cohort_type = 'Exclude' -- changed from TnOprior ) temp_ts2; --- drop the table needed by the TnO cohorts -drop table if exists #agg_cohorts_cases; +-- drop the table needed by the case series cohorts +drop table if exists #case_series; diff --git a/inst/sql/sql_server/CaseCohortsPart2.sql b/inst/sql/sql_server/CaseCohortsPart2.sql index 8a7b5fc..50ea456 100644 --- a/inst/sql/sql_server/CaseCohortsPart2.sql +++ b/inst/sql/sql_server/CaseCohortsPart2.sql @@ -1,7 +1,7 @@ -- PER TAR RUN TO GET TnO cohorts -- 1) get all the people with the outcome in TAR -drop table if exists #target_outcome_tar; +drop table if exists #cases_tar; -- cases select @@ -12,7 +12,7 @@ o.cohort_start_date as outcome_start_date, o.cohort_end_date as outcome_end_date, t.cohort_definition_id as target_cohort_id, o.cohort_definition_id as outcome_cohort_id -into #target_outcome_tar +into #cases_tar from #targets_inclusions t inner join #outcomes_washout o on t.subject_id = o.subject_id where @@ -22,8 +22,8 @@ and -- outcome starts (ends?) 
after TAR start o.cohort_start_date >= dateadd(day, @tar_start, t.@tar_start_anchor); --- add the TnO for specific TAR -insert into #agg_cohorts_before +-- add the cases for specific TAR +insert into #cases select * from ( @@ -34,16 +34,16 @@ tno.subject_id, tno.cohort_start_date, tno.cohort_end_date, cd.cohort_definition_id -from #target_outcome_tar tno +from #cases_tar tno INNER JOIN #cohort_details cd on cd.target_cohort_id = tno.target_cohort_id and cd.outcome_cohort_id = tno.outcome_cohort_id -and cd.cohort_type = 'TnO' -and cd.setting_id = @setting_id +and cd.cohort_type = 'Cases' +and cd.setting_id = '@setting_id' ) temp_ts2; -{@first}?{select * into #agg_cohorts_cases}:{insert into #agg_cohorts_cases select *} +{@first}?{select * into #case_series}:{insert into #case_series select *} from ( @@ -53,12 +53,12 @@ tno.subject_id, dateadd(day, 1, tno.cohort_start_date) as cohort_start_date, tno.outcome_start_date as cohort_end_date, cd.cohort_definition_id -from #target_outcome_tar tno +from #cases_tar tno INNER JOIN #cohort_details cd on cd.target_cohort_id = tno.target_cohort_id and cd.outcome_cohort_id = tno.outcome_cohort_id -and cd.cohort_type = 'TnObetween' -and cd.setting_id = @setting_id +and cd.cohort_type = 'CasesBetween' +and cd.setting_id = '@setting_id' union @@ -68,12 +68,12 @@ tno.subject_id, dateadd(day, 1, tno.outcome_start_date) as cohort_start_date, dateadd(day, @case_post_outcome_duration, tno.outcome_start_date) as cohort_end_date, cd.cohort_definition_id -from #target_outcome_tar tno +from #cases_tar tno INNER JOIN #cohort_details cd on cd.target_cohort_id = tno.target_cohort_id and cd.outcome_cohort_id = tno.outcome_cohort_id -and cd.cohort_type = 'OnT' -and cd.setting_id = @setting_id +and cd.cohort_type = 'CasesAfter' +and cd.setting_id = '@setting_id' union @@ -82,12 +82,12 @@ tno.subject_id, dateadd(day, -@case_pre_target_duration, tno.cohort_start_date) as cohort_start_date, tno.cohort_start_date as cohort_end_date, cd.cohort_definition_id -from #target_outcome_tar tno +from #cases_tar tno INNER JOIN #cohort_details cd on cd.target_cohort_id = tno.target_cohort_id and cd.outcome_cohort_id = tno.outcome_cohort_id -and cd.cohort_type = 'TnO' -and cd.setting_id = @setting_id +and cd.cohort_type = 'CasesBefore' +and cd.setting_id = '@setting_id' ) temp_ts2; diff --git a/inst/sql/sql_server/DechallengeRechallenge.sql b/inst/sql/sql_server/DechallengeRechallenge.sql index 370a777..51d67cc 100644 --- a/inst/sql/sql_server/DechallengeRechallenge.sql +++ b/inst/sql/sql_server/DechallengeRechallenge.sql @@ -51,13 +51,38 @@ from group by cohort_definition_id ) exposures inner join + + ( + select + target_cohort_definition_id, + outcome_cohort_definition_id, + sum(num_cases) as num_cases + + from ( - select dc1.cohort_definition_id as target_cohort_definition_id, io1.cohort_definition_id as outcome_cohort_definition_id, count(dc1.subject_id) as num_cases + select dc1.cohort_definition_id as target_cohort_definition_id, + io1.cohort_definition_id as outcome_cohort_definition_id, + count(dc1.subject_id) as num_cases from #target_cohort dc1 inner join #outcome_cohort io1 on dc1.subject_id = io1.subject_id and io1.cohort_start_date > dc1.cohort_start_date and io1.cohort_start_date <= dc1.cohort_end_date group by dc1.cohort_definition_id, io1.cohort_definition_id + + -- added this code to return 0s when there are no outcomes + -- so we can tell whether the dechal has been run or not + union + select distinct + dc1_temp.cohort_definition_id as target_cohort_definition_id, + 
io1_temp.cohort_definition_id as outcome_cohort_definition_id, + 0 as num_cases + from #target_cohort dc1_temp + join #outcome_cohort io1_temp + ) temp_cases + group by + target_cohort_definition_id, + outcome_cohort_definition_id + ) cases on exposures.cohort_definition_id = cases.target_cohort_definition_id left join diff --git a/inst/sql/sql_server/DropAggregateCovariate.sql b/inst/sql/sql_server/DropAggregateCovariate.sql deleted file mode 100644 index 8c445f9..0000000 --- a/inst/sql/sql_server/DropAggregateCovariate.sql +++ /dev/null @@ -1,39 +0,0 @@ --- clean up by removing the temp tables - -TRUNCATE TABLE #targets_all; -DROP TABLE #targets_all; - -TRUNCATE TABLE #targets_inclusions; -DROP TABLE #targets_inclusions; - -TRUNCATE TABLE #outcomes_all; -DROP TABLE #outcomes_all; - -TRUNCATE TABLE #outcomes_washout; -DROP TABLE #outcomes_washout; - -TRUNCATE TABLE #cohort_details; -DROP TABLE #cohort_details; - -TRUNCATE TABLE #target_outcome_tar; -DROP TABLE #target_outcome_tar; - -TRUNCATE TABLE #target_outcome_prior; -DROP TABLE #target_outcome_prior; - -TRUNCATE TABLE #agg_cohorts_before; -DROP TABLE #agg_cohorts_before; - -TRUNCATE TABLE #agg_cohorts_cases; -DROP TABLE #agg_cohorts_cases; - -TRUNCATE TABLE #agg_cohorts_extras; -DROP TABLE #agg_cohorts_extras; - - - - - - - - diff --git a/inst/sql/sql_server/DropCaseCovariate.sql b/inst/sql/sql_server/DropCaseCovariate.sql index 59531f0..b29b044 100644 --- a/inst/sql/sql_server/DropCaseCovariate.sql +++ b/inst/sql/sql_server/DropCaseCovariate.sql @@ -15,17 +15,17 @@ DROP TABLE #outcomes_washout; TRUNCATE TABLE #cohort_details; DROP TABLE #cohort_details; -TRUNCATE TABLE #target_outcome_tar; -DROP TABLE #target_outcome_tar; +TRUNCATE TABLE #cases_tar; +DROP TABLE #cases_tar; -TRUNCATE TABLE #target_outcome_prior; -DROP TABLE #target_outcome_prior; +TRUNCATE TABLE #case_exclude; +DROP TABLE #case_exclude; -TRUNCATE TABLE #agg_cohorts_before; -DROP TABLE #agg_cohorts_before; +TRUNCATE TABLE #cases; +DROP TABLE #cases; -TRUNCATE TABLE #agg_cohorts_cases; -DROP TABLE #agg_cohorts_cases; +TRUNCATE TABLE #case_series; +DROP TABLE #case_series; diff --git a/inst/sql/sql_server/DropOutcomeCovariate.sql b/inst/sql/sql_server/DropOutcomeCovariate.sql deleted file mode 100644 index 0147bf1..0000000 --- a/inst/sql/sql_server/DropOutcomeCovariate.sql +++ /dev/null @@ -1,24 +0,0 @@ --- clean up by removing the temp tables - -TRUNCATE TABLE #outcomes_all; -DROP TABLE #outcomes_all; - -TRUNCATE TABLE #outcomes_washout; -DROP TABLE #outcomes_washout; - -TRUNCATE TABLE #cohort_details; -DROP TABLE #cohort_details; - -TRUNCATE TABLE #agg_cohorts_before; -DROP TABLE #agg_cohorts_before; - -TRUNCATE TABLE #agg_cohorts_extras; -DROP TABLE #agg_cohorts_extras; - - - - - - - - diff --git a/inst/sql/sql_server/OutcomeCohorts.sql b/inst/sql/sql_server/OutcomeCohorts.sql deleted file mode 100644 index 31830e4..0000000 --- a/inst/sql/sql_server/OutcomeCohorts.sql +++ /dev/null @@ -1,61 +0,0 @@ --- all outcomes -drop table if exists #outcomes_all; -select * into #outcomes_all -from @outcome_database_schema.@outcome_table -where cohort_definition_id in -(@outcome_ids); - --- first outcomes in washout days and min prior obs -drop table if exists #outcomes_washout; -select o.* into #outcomes_washout -from (select *, - ISNULL(datediff(day, LAG(cohort_start_date) OVER(partition by subject_id, cohort_definition_id order by cohort_start_date asc), cohort_start_date ), 100000) as time_between - from #outcomes_all -) as o -inner join 
@cdm_database_schema.observation_period op -on op.person_id = o.subject_id -and o.cohort_start_date >= op.observation_period_start_date -and o.cohort_start_date <= op.observation_period_end_date -where o.time_between >= @outcome_washout_days -and datediff(day, op.observation_period_start_date, o.cohort_start_date) >= @min_prior_observation; - - ----- Create TAR agnostic cohorts -drop table if exists #agg_cohorts_before; -select * into #agg_cohorts_before - -from -( --- outcomes with restrictions -select distinct -o.subject_id, -o.cohort_start_date, -o.cohort_end_date, -cd.cohort_definition_id -from #outcomes_washout as o -INNER JOIN #cohort_details cd -on cd.outcome_cohort_id = o.cohort_definition_id -and cd.cohort_type = 'Outcome' - -) temp_ts2; - - - --- add extra cohorts -drop table if exists #agg_cohorts_extras; -select * into #agg_cohorts_extras - -from -( --- outcomes with restrictions -select distinct -o.subject_id, -o.cohort_start_date, -o.cohort_end_date, -cd.cohort_definition_id -from #outcomes_washout as o -INNER JOIN #cohort_details cd -on cd.outcome_cohort_id = o.cohort_definition_id -and cd.cohort_type = 'Oall' - -) temp_ts2; diff --git a/man/cleanIncremental.Rd b/man/cleanIncremental.Rd new file mode 100644 index 0000000..2e53b1e --- /dev/null +++ b/man/cleanIncremental.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Incremental.R +\name{cleanIncremental} +\alias{cleanIncremental} +\title{Removes csv files from folders that have not been marked as completed +and removes the record of the execution file} +\usage{ +cleanIncremental(executionFolder) +} +\arguments{ +\item{executionFolder}{The folder that has the execution files} +} +\value{ +A list with the settings +} +\description{ +Removes csv files from folders that have not been marked as completed +and removes the record of the execution file +} diff --git a/man/cleanNonIncremental.Rd b/man/cleanNonIncremental.Rd new file mode 100644 index 0000000..5c666ab --- /dev/null +++ b/man/cleanNonIncremental.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Incremental.R +\name{cleanNonIncremental} +\alias{cleanNonIncremental} +\title{Removes csv files from the execution folder as there should be no csv files +when running in non-incremental model} +\usage{ +cleanNonIncremental(executionFolder) +} +\arguments{ +\item{executionFolder}{The folder that has the execution files} +} +\value{ +A list with the settings +} +\description{ +Removes csv files from the execution folder as there should be no csv files +when running in non-incremental model +} diff --git a/man/computeAggregateCovariateAnalyses.Rd b/man/computeAggregateCovariateAnalyses.Rd deleted file mode 100644 index 5f95833..0000000 --- a/man/computeAggregateCovariateAnalyses.Rd +++ /dev/null @@ -1,82 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/AggregateCovariates.R -\name{computeAggregateCovariateAnalyses} -\alias{computeAggregateCovariateAnalyses} -\title{Compute aggregate covariate study} -\usage{ -computeAggregateCovariateAnalyses( - connectionDetails = NULL, - cdmDatabaseSchema, - cdmVersion = 5, - targetDatabaseSchema, - targetTable, - outcomeDatabaseSchema = targetDatabaseSchema, - outcomeTable = targetTable, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), - aggregateCovariateSettings, - databaseId = "database 1", - outputFolder = file.path(getwd(), "characterization_results"), - runId = 1, - threads = 1, - 
incrementalFile = NULL, - minCharacterizationMean = 0, - runExtractTfeatures = T, - runExtractOfeatures = T, - runExtractCaseFeatures = T, - settingIds = NULL -) -} -\arguments{ -\item{connectionDetails}{An object of type `connectionDetails` as created using the -[DatabaseConnector::createConnectionDetails()] function.} - -\item{cdmDatabaseSchema}{The schema with the OMOP CDM data} - -\item{cdmVersion}{The version of the OMOP CDM} - -\item{targetDatabaseSchema}{Schema name where your target cohort table resides. Note that for SQL Server, -this should include both the database and schema name, for example -'scratch.dbo'.} - -\item{targetTable}{Name of the target cohort table.} - -\item{outcomeDatabaseSchema}{Schema name where your outcome cohort table resides. Note that for SQL Server, -this should include both the database and schema name, for example -'scratch.dbo'.} - -\item{outcomeTable}{Name of the outcome cohort table.} - -\item{tempEmulationSchema}{Some database platforms like Oracle and Impala do not truly support temp tables. -To emulate temp tables, provide a schema with write privileges where temp tables -can be created} - -\item{aggregateCovariateSettings}{The settings for the AggregateCovariate study} - -\item{databaseId}{Unique identifier for the database (string)} - -\item{outputFolder}{The location to save the results as csv files} - -\item{runId}{(depreciated) Unique identifier for the covariate setting} - -\item{threads}{The number of threads to run in parallel} - -\item{incrementalFile}{(optional) A file that tracks completed studies} - -\item{minCharacterizationMean}{The minimum mean value for characterization output. Values below this will be cut off from output. This -will help reduce the file size of the characterization output, but will remove information -on covariates that have very low values. The default is 0.} - -\item{runExtractTfeatures}{Whether to extract the target cohort features} - -\item{runExtractOfeatures}{Whether to extract the outcome cohort features} - -\item{runExtractCaseFeatures}{Whether to extract the case cohort features} - -\item{settingIds}{(not recommended to use) User can specify the lookup ids for the settings} -} -\value{ -The descriptive results for each target cohort in the settings. -} -\description{ -Compute aggregate covariate study -} diff --git a/man/computeDechallengeRechallengeAnalyses.Rd b/man/computeDechallengeRechallengeAnalyses.Rd index 91e4e09..4924d09 100644 --- a/man/computeDechallengeRechallengeAnalyses.Rd +++ b/man/computeDechallengeRechallengeAnalyses.Rd @@ -11,10 +11,11 @@ computeDechallengeRechallengeAnalyses( outcomeDatabaseSchema = targetDatabaseSchema, outcomeTable = targetTable, tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), - dechallengeRechallengeSettings, + settings, databaseId = "database 1", outputFolder = file.path(getwd(), "results"), - minCellCount = 0 + minCellCount = 0, + ... 
) } \arguments{ @@ -37,13 +38,15 @@ this should include both the database and schema name, for example To emulate temp tables, provide a schema with write privileges where temp tables can be created} -\item{dechallengeRechallengeSettings}{The settings for the timeToEvent study} +\item{settings}{The settings for the timeToEvent study} \item{databaseId}{An identifier for the database (string)} \item{outputFolder}{A directory to save the results as csv files} \item{minCellCount}{The minimum cell value to display, values less than this will be replaced by -1} + +\item{...}{extra inputs} } \value{ An \code{Andromeda::andromeda()} object containing the dechallenge rechallenge results diff --git a/man/computeRechallengeFailCaseSeriesAnalyses.Rd b/man/computeRechallengeFailCaseSeriesAnalyses.Rd index e24dff6..b48d660 100644 --- a/man/computeRechallengeFailCaseSeriesAnalyses.Rd +++ b/man/computeRechallengeFailCaseSeriesAnalyses.Rd @@ -11,11 +11,12 @@ computeRechallengeFailCaseSeriesAnalyses( outcomeDatabaseSchema = targetDatabaseSchema, outcomeTable = targetTable, tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), - dechallengeRechallengeSettings, + settings, databaseId = "database 1", showSubjectId = F, outputFolder = file.path(getwd(), "results"), - minCellCount = 0 + minCellCount = 0, + ... ) } \arguments{ @@ -38,7 +39,7 @@ this should include both the database and schema name, for example To emulate temp tables, provide a schema with write privileges where temp tables can be created} -\item{dechallengeRechallengeSettings}{The settings for the timeToEvent study} +\item{settings}{The settings for the timeToEvent study} \item{databaseId}{An identifier for the database (string)} @@ -47,6 +48,8 @@ can be created} \item{outputFolder}{A directory to save the results as csv files} \item{minCellCount}{The minimum cell value to display, values less than this will be replaced by -1} + +\item{...}{extra inputs} } \value{ An \code{Andromeda::andromeda()} object with the case series details of the failed rechallenge diff --git a/man/computeTimeToEventAnalyses.Rd b/man/computeTimeToEventAnalyses.Rd index ef91674..e453b37 100644 --- a/man/computeTimeToEventAnalyses.Rd +++ b/man/computeTimeToEventAnalyses.Rd @@ -12,10 +12,11 @@ computeTimeToEventAnalyses( outcomeTable = targetTable, tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), cdmDatabaseSchema, - timeToEventSettings, + settings, databaseId = "database 1", outputFolder = file.path(getwd(), "results"), - minCellCount = 0 + minCellCount = 0, + ... ) } \arguments{ @@ -40,13 +41,15 @@ can be created} \item{cdmDatabaseSchema}{The database schema containing the OMOP CDM data} -\item{timeToEventSettings}{The settings for the timeToEvent study} +\item{settings}{The settings for the timeToEvent study} \item{databaseId}{An identifier for the database (string)} \item{outputFolder}{A directory to save the results as csv files} \item{minCellCount}{The minimum cell value to display, values less than this will be replaced by -1} + +\item{...}{extra inputs} } \value{ An \code{Andromeda::andromeda()} object containing the time to event results. 
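For orientation, a minimal usage sketch of the renamed settings argument on the compute functions documented above (computeDechallengeRechallengeAnalyses, computeRechallengeFailCaseSeriesAnalyses, computeTimeToEventAnalyses). This is not part of the patch; it assumes the Eunomia test database and the package's existing createTimeToEventSettings() helper:

    library(Characterization)

    # Eunomia provides a small test CDM; createCohorts() writes example cohorts to main.cohort
    connectionDetails <- Eunomia::getEunomiaConnectionDetails()
    Eunomia::createCohorts(connectionDetails = connectionDetails)

    # settings object for the time-to-event analysis (helper assumed from the package)
    tteSettings <- createTimeToEventSettings(
      targetIds = c(1, 2),
      outcomeIds = c(3)
    )

    computeTimeToEventAnalyses(
      connectionDetails = connectionDetails,
      cdmDatabaseSchema = "main",
      targetDatabaseSchema = "main",
      targetTable = "cohort",
      settings = tteSettings,  # argument was previously called timeToEventSettings
      databaseId = "database 1",
      outputFolder = file.path(tempdir(), "results"),
      minCellCount = 0
    )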
diff --git a/man/createAggregateCovariateSettings.Rd b/man/createAggregateCovariateSettings.Rd index 1f74fc0..44e09dd 100644 --- a/man/createAggregateCovariateSettings.Rd +++ b/man/createAggregateCovariateSettings.Rd @@ -32,7 +32,8 @@ createAggregateCovariateSettings( = T, useMeasurementDuring = T, useObservationDuring = T, useVisitConceptCountDuring = T), casePreTargetDuration = 365, - casePostOutcomeDuration = 365 + casePostOutcomeDuration = 365, + extractNonCaseCovariates = T ) } \arguments{ @@ -61,6 +62,8 @@ or `"cohort end"`.} \item{casePreTargetDuration}{The number of days prior to case index we use for FeatureExtraction} \item{casePostOutcomeDuration}{The number of days prior to case index we use for FeatureExtraction} + +\item{extractNonCaseCovariates}{Whether to extract aggregate covariates and counts for patients in the targets and outcomes in addition to the cases} } \value{ A list with the settings diff --git a/man/getCaseSettingColumns.Rd b/man/getCaseSettingColumns.Rd deleted file mode 100644 index 5a5f280..0000000 --- a/man/getCaseSettingColumns.Rd +++ /dev/null @@ -1,13 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/AggregateCovariatesHelpers.R -\name{getCaseSettingColumns} -\alias{getCaseSettingColumns} -\title{riskWindowStart','startAnchor', -riskWindowEnd','endAnchor'} -\usage{ -getCaseSettingColumns() -} -\description{ -riskWindowStart','startAnchor', -riskWindowEnd','endAnchor' -} diff --git a/man/insertResultsToDatabase.Rd b/man/insertResultsToDatabase.Rd index 8c569b9..f1ae176 100644 --- a/man/insertResultsToDatabase.Rd +++ b/man/insertResultsToDatabase.Rd @@ -8,7 +8,8 @@ insertResultsToDatabase( connectionDetails, schema, resultsFolder, - tablePrefix = "c_" + tablePrefix = "", + csvTablePrefix = "c_" ) } \arguments{ @@ -19,6 +20,8 @@ insertResultsToDatabase( \item{resultsFolder}{The folder containing the csv results} \item{tablePrefix}{A prefix to append to the result tables for the characterization results} + +\item{csvTablePrefix}{The prefix added to the csv results - default is 'c_'} } \value{ Returns the connection to the sqlite database diff --git a/man/runCharacterizationAnalyses.Rd b/man/runCharacterizationAnalyses.Rd index 9bb6925..80db9b5 100644 --- a/man/runCharacterizationAnalyses.Rd +++ b/man/runCharacterizationAnalyses.Rd @@ -13,8 +13,9 @@ runCharacterizationAnalyses( tempEmulationSchema = NULL, cdmDatabaseSchema, characterizationSettings, - saveDirectory, - tablePrefix = "c_", + outputDirectory, + executionPath = file.path(outputDirectory, "execution"), + csvFilePrefix = "c_", databaseId = "1", showSubjectId = F, minCellCount = 0, @@ -46,9 +47,11 @@ can be created} \item{characterizationSettings}{The study settings created using \code{createCharacterizationSettings}} -\item{saveDirectory}{The location to save the results to} +\item{outputDirectory}{The location to save the final csv files to} -\item{tablePrefix}{A string to append the tables in the results} +\item{executionPath}{The location where intermediate results are saved to} + +\item{csvFilePrefix}{A string to append the csv files in the outputDirectory} \item{databaseId}{The unique identifier for the cdm database} @@ -63,8 +66,7 @@ can be created} \item{minCharacterizationMean}{The minimum mean threshold to extract when running aggregate covariates} } \value{ -An sqlite database with the results is saved into the saveDirectory and a csv file named tacker.csv -details which analyses have run to completion. +Multiple csv files in the outputDirectory.
} \description{ Specify the database connection containing the CDM data, the cohort database schemas/tables, diff --git a/tests/testthat/test-Incremental.R b/tests/testthat/test-Incremental.R new file mode 100644 index 0000000..428af80 --- /dev/null +++ b/tests/testthat/test-Incremental.R @@ -0,0 +1,322 @@ +context("Incremental") + +logFolder <- file.path(tempdir(),'log1') +on.exit(unlink(logFolder)) +logFolder2 <- file.path(tempdir(),'log2') +on.exit(unlink(logFolder2)) +logFolder3 <- file.path(tempdir(),'log3') +on.exit(unlink(logFolder3)) +logFolder4 <- file.path(tempdir(),'log4') +on.exit(unlink(logFolder4)) +logFolder5 <- file.path(tempdir(),'log5') +on.exit(unlink(logFolder5)) +logFolder6 <- file.path(tempdir(),'log6') +on.exit(unlink(logFolder6)) + +for(folder in c(logFolder, logFolder2, logFolder3, + logFolder4, logFolder5, logFolder6)){ + if(!dir.exists(folder)){ + dir.create(folder) + } +} + +test_that("createIncrementalLog", { + + Characterization:::createIncrementalLog( + executionFolder = logFolder, + logname = 'execution.csv' + ) + testthat::expect_true("execution.csv" %in% dir(logFolder)) + executionLog <- read.csv(file.path(logFolder, "execution.csv")) + testthat::expect_true(nrow(executionLog) == 1) + testthat::expect_true(executionLog$job_id == 0) + + Characterization:::createIncrementalLog( + executionFolder = logFolder, + logname = 'madeup.csv' + ) + testthat::expect_true("madeup.csv" %in% dir(logFolder)) + +}) + + +test_that("loadIncrementalFiles", { + + # should error as not completed.csv + testthat::expect_error( + Characterization:::loadIncrementalFiles( + executionFolder = logFolder + ) + ) + + # now create the completed.csv + Characterization:::createIncrementalLog( + executionFolder = logFolder, + logname = 'completed.csv' + ) + + result <- Characterization:::loadIncrementalFiles( + executionFolder = logFolder + ) + testthat::expect_true(sum(c('executed','completed') %in% names(result)) == 2) + testthat::expect_true(nrow(result$executed) == 1) + testthat::expect_true(nrow(result$completed) == 1) + +}) + +test_that("getExecutionJobIssues", { + + result <- Characterization:::loadIncrementalFiles( + executionFolder = logFolder + ) + # should error as not completed.csv + issues <- Characterization:::getExecutionJobIssues( + executed = result$executed, + completed = result$completed + ) + testthat::expect_true(length(issues) == 0) + + # now add some executed but not completed results + issues <- Characterization:::getExecutionJobIssues( + executed = data.frame( + run_date_time = c(1,1), + job_id = c(1,2), + start_time = c(1,2), + end_time = c(1,2) + ), + completed = data.frame( + run_date_time = c(1), + job_id = c(1), + start_time = c(1), + end_time = c(1) + ) + ) + testthat::expect_true(issues == 2) + + issues <- Characterization:::getExecutionJobIssues( + executed = data.frame( + run_date_time = c(1,1), + job_id = c(1,20), + start_time = c(1,2), + end_time = c(1,2) + ), + completed = data.frame( + run_date_time = c(1), + job_id = c(1), + start_time = c(1), + end_time = c(1) + ) + ) + testthat::expect_true(issues == 20) + +}) + +test_that("cleanIncremental", { + + # create folder with issues + Characterization:::createIncrementalLog( + executionFolder = logFolder2, + logname = 'execution.csv' + ) + Characterization:::createIncrementalLog( + executionFolder = logFolder2, + logname = 'completed.csv' + ) + + # add a job into executed that is not in completed + readr::write_csv( + x = data.frame( + run_date_time = c(10), + job_id = c(1), + start_time = c(1), + end_time = 
c(1) + ), + file = file.path(logFolder2, 'execution.csv'), + append = T + ) + + incrementalFiles <- Characterization:::loadIncrementalFiles( + executionFolder = logFolder2 + ) + issues <- Characterization:::getExecutionJobIssues( + executed = incrementalFiles$executed, + completed = incrementalFiles$completed + ) + + testthat::expect_true(nrow(incrementalFiles$executed) == 2) + testthat::expect_true(nrow(incrementalFiles$completed) == 1) + testthat::expect_true(length(issues) == 1) + + dir.create(file.path(logFolder2, '1')) + write.csv( + x = data.frame(a=1), + file = file.path(logFolder2, '1', 'madeup.csv') + ) + testthat::expect_true(file.exists(file.path(logFolder2, '1', 'madeup.csv'))) + + # run clean to fix issues + Characterization:::cleanIncremental( + executionFolder = logFolder2 + ) + + # check issues are fixed + testthat::expect_true(!file.exists(file.path(logFolder2, '1', 'madeup.csv'))) + incrementalFiles <- Characterization:::loadIncrementalFiles( + executionFolder = logFolder2 + ) + issues <- Characterization:::getExecutionJobIssues( + executed = incrementalFiles$executed, + completed = incrementalFiles$completed + ) + + testthat::expect_true(nrow(incrementalFiles$executed) == 1) + testthat::expect_true(nrow(incrementalFiles$completed) == 1) + testthat::expect_true(length(issues) == 0) + +}) + +test_that("checkResultFilesIncremental ", { + + # create folder with issues + Characterization:::createIncrementalLog( + executionFolder = logFolder3, + logname = 'execution.csv' + ) + Characterization:::createIncrementalLog( + executionFolder = logFolder3, + logname = 'completed.csv' + ) + + result <- Characterization:::checkResultFilesIncremental( + executionFolder = logFolder3 + ) + testthat::expect_true(is.null(result)) + + # add a job into executed that is not in completed + readr::write_csv( + x = data.frame( + run_date_time = c(10), + job_id = c(1), + start_time = c(1), + end_time = c(1) + ), + file = file.path(logFolder3, 'execution.csv'), + append = T + ) + + testthat::expect_error(Characterization:::checkResultFilesIncremental( + executionFolder = logFolder3 + )) + +}) + +test_that("checkResultFilesIncremental ", { + + # create folder with issues + Characterization:::createIncrementalLog( + executionFolder = logFolder4, + logname = 'execution.csv' + ) + Characterization:::createIncrementalLog( + executionFolder = logFolder4, + logname = 'completed.csv' + ) + + # add a job into executed and completed + readr::write_csv( + x = data.frame( + run_date_time = c(10), + job_id = c(1), + start_time = c(1), + end_time = c(1) + ), + file = file.path(logFolder4, 'execution.csv'), + append = T + ) + readr::write_csv( + x = data.frame( + run_date_time = c(10), + job_id = c(1), + start_time = c(1), + end_time = c(1) + ), + file = file.path(logFolder4, 'completed.csv'), + append = T + ) + +jobs <- Characterization:::findCompletedJobs(logFolder4) +testthat::expect_true(1 %in% jobs) + +}) + +test_that("recordIncremental ", { + Characterization:::createIncrementalLog( + executionFolder = logFolder6, + logname = 'execution.csv' + ) + execution <- read.csv( + file = file.path(logFolder6, 'execution.csv') + ) + testthat::expect_true(!'example100' %in% execution$job_id) + +Characterization:::recordIncremental( + executionFolder = logFolder6, + runDateTime = Sys.time(), + jobId = 'example100', + startTime = Sys.time(), + endTime = Sys.time(), + logname = 'execution.csv' +) + executionJobs <- read.csv( + file = file.path(logFolder6, 'execution.csv') + ) + testthat::expect_true('example100' %in% 
executionJobs$job_id) + + # test warning if no file + testthat::expect_warning( + Characterization:::recordIncremental( + executionFolder = logFolder6, + runDateTime = 1, + jobId = 'example100', + startTime = 1, + endTime = 1, + logname = 'execution2.csv' + ) + ) + +}) + + +test_that("No Incremental works", { + + result <- Characterization:::checkResultFilesNonIncremental( + executionFolder = logFolder5 + ) + testthat::expect_true(is.null(result)) + + dir.create( + path = file.path(logFolder5, 'job_1'), + recursive = T + ) + on.exit(unlink(file.path(logFolder5, 'job_1'))) + + write.csv( + x = data.frame(a=1), + file = file.path(logFolder5, 'job_1', 'anyCsvFile.csv') + ) + + # now there is a csv file it should error + testthat::expect_error( + Characterization:::checkResultFilesNonIncremental( + executionFolder = logFolder5 + ) + ) + + # this should clean the folder of any csv files +Characterization:::cleanNonIncremental( + executionFolder = logFolder5 + ) +# previously created csv should have been deleted +testthat::expect_true(length(dir(file.path(logFolder5, 'job_1'))) == 0) + +}) diff --git a/tests/testthat/test-aggregateCovariate.R b/tests/testthat/test-aggregateCovariate.R index db2b75f..c2d4386 100644 --- a/tests/testthat/test-aggregateCovariate.R +++ b/tests/testthat/test-aggregateCovariate.R @@ -3,6 +3,12 @@ context("AggregateCovariate") +tempFolder1 <- tempfile("runAggregate1") +on.exit(unlink(tempFolder1, recursive = TRUE), add = TRUE) +tempFolder2 <- tempfile("runAggregate2") +on.exit(unlink(tempFolder1, recursive = TRUE), add = TRUE) + + test_that("createAggregateCovariateSettings", { targetIds <- sample(x = 100, size = sample(10, 1)) outcomeIds <- sample(x = 100, size = sample(10, 1)) @@ -148,7 +154,174 @@ test_that("createAggregateCovariateSettingsList", { ) }) -test_that("computeAggregateCovariateAnalyses", { +test_that("createExecutionIds", { + testIds <- createExecutionIds(10) + testthat::expect_true(length(testIds) == 10) + testthat::expect_true(length(unique(testIds)) == 10) + + testId1 <- createExecutionIds(1) + testId2 <- createExecutionIds(1) + testthat::expect_true(testId1 != testId2) +}) + + +test_that("getAggregateCovariatesJobs", { + targetIds <- c(1, 2, 4) + outcomeIds <- c(3) + covariateSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + useDemographicsAge = T, + useCharlsonIndex = T + ) + caseCovariateSettings <- createDuringCovariateSettings( + useConditionOccurrenceDuring = T + ) + + minPriorObservation <- sample(30,1) + + res <- createAggregateCovariateSettings( + targetIds = targetIds, + outcomeIds = outcomeIds, + minPriorObservation = minPriorObservation, + outcomeWashoutDays = 1, + riskWindowStart = 1, startAnchor = "cohort start", + riskWindowEnd = 5 * 365, endAnchor = "cohort start", + covariateSettings = covariateSettings, + caseCovariateSettings = caseCovariateSettings + ) + + jobDf <- getAggregateCovariatesJobs( + characterizationSettings = Characterization::createCharacterizationSettings( + aggregateCovariateSettings = res + ), + threads = 1 + ) + + testthat::expect_true( + sum(c('computeTargetAggregateCovariateAnalyses', + 'computeCaseAggregateCovariateAnalyses') %in% + jobDf$functionName) == 2 + ) + testthat::expect_true(nrow(jobDf) == 2) + + testthat::expect_true( + paste0('tac_1_',minPriorObservation) %in% jobDf$executionFolder + ) + testthat::expect_true( + paste0('cac_1_',minPriorObservation, '_1_365_365') %in% jobDf$executionFolder + ) + + settings <- ParallelLogger::convertJsonToSettings(jobDf$settings[1]) 
+ covSettings <- ParallelLogger::convertJsonToSettings(settings$covariateSettingsJson) + testthat::expect_true( + covSettings[[1]]$DemographicsGender == T + ) + testthat::expect_true( + covSettings[[1]]$CharlsonIndex == T + ) + testthat::expect_true( + covSettings[[1]]$DemographicsAge == T + ) + + + # now check threads = 2 + jobDf <- getAggregateCovariatesJobs( + characterizationSettings = Characterization::createCharacterizationSettings( + aggregateCovariateSettings = res + ), + threads = 2 + ) + testthat::expect_true(nrow(jobDf) == 4) + + testthat::expect_true( + sum(c(paste0('tac_1_',minPriorObservation), + paste0('tac_2_',minPriorObservation)) + %in% jobDf$executionFolder ) == 2 + ) + testthat::expect_true( + sum(c(paste0('cac_1_',minPriorObservation, '_1_365_365'), + paste0('cac_2_',minPriorObservation, '_1_365_365')) + %in% jobDf$executionFolder) == 2 + ) + + # now check threads = 3 + jobDf <- getAggregateCovariatesJobs( + characterizationSettings = Characterization::createCharacterizationSettings( + aggregateCovariateSettings = res + ), + threads = 3 + ) + testthat::expect_true(nrow(jobDf) == 2*3) + + # now check threads = 4 + jobDf <- getAggregateCovariatesJobs( + characterizationSettings = createCharacterizationSettings( + aggregateCovariateSettings = res + ), + threads = 4 + ) + testthat::expect_true(nrow(jobDf) == 7) + + # now check threads = 5 + jobDf <- getAggregateCovariatesJobs( + characterizationSettings = createCharacterizationSettings( + aggregateCovariateSettings = res + ), + threads = 5 + ) + testthat::expect_true(nrow(jobDf) == 7) + + testthat::expect_true( + length(unique(unlist(lapply(1:nrow(jobDf), + function(i){ + ParallelLogger::convertJsonToSettings(jobDf$settings[i])$settingId + } + )))) == 2) + + + # add more settings + res2 <- createAggregateCovariateSettings( + targetIds = targetIds, + outcomeIds = outcomeIds, + minPriorObservation = minPriorObservation + 1, + outcomeWashoutDays = 100, + riskWindowStart = 1, startAnchor = "cohort start", + riskWindowEnd = 5 * 365, endAnchor = "cohort start", + covariateSettings = covariateSettings, + caseCovariateSettings = caseCovariateSettings + ) + + jobDf <- getAggregateCovariatesJobs( + characterizationSettings = createCharacterizationSettings( + aggregateCovariateSettings = list(res, res2) + ), + threads = 1 + ) + testthat::expect_true(nrow(jobDf) == 4) + testthat::expect_true( + length(unique(unlist(lapply(1:nrow(jobDf), + function(i){ + ParallelLogger::convertJsonToSettings(jobDf$settings[i])$settingId + } + )))) == 4) + + jobDf <- getAggregateCovariatesJobs( + characterizationSettings = createCharacterizationSettings( + aggregateCovariateSettings = list(res, res2) + ), + threads = 3 + ) + testthat::expect_true(nrow(jobDf) == 12) + testthat::expect_true( + length(unique(unlist(lapply(1:nrow(jobDf), + function(i){ + ParallelLogger::convertJsonToSettings(jobDf$settings[i])$settingId + } + )))) == 4) + +}) + +test_that("computeTargetAggregateCovariateAnalyses", { targetIds <- c(1, 2, 4) outcomeIds <- c(3) covariateSettings <- FeatureExtraction::createCovariateSettings( @@ -171,110 +344,136 @@ test_that("computeAggregateCovariateAnalyses", { caseCovariateSettings = caseCovariateSettings ) - tempFolder1 <- tempfile("runAggregate1") - on.exit(unlink(tempFolder1, recursive = TRUE), add = TRUE) + jobDf <- getAggregateCovariatesJobs( + characterizationSettings = createCharacterizationSettings( + aggregateCovariateSettings = res + ), + threads = 1 + ) - computeAggregateCovariateAnalyses( + 
computeTargetAggregateCovariateAnalyses( connectionDetails = connectionDetails, cdmDatabaseSchema = "main", cdmVersion = 5, targetDatabaseSchema = "main", targetTable = "cohort", - aggregateCovariateSettings = res, + settings = ParallelLogger::convertJsonToSettings(jobDf$settings[1]), minCharacterizationMean = 0.01, databaseId = 'madeup', outputFolder = tempFolder1 ) # check incremental does not run - testthat::expect_true(sum(c('results','execution') %in% dir(tempFolder1)) == length(dir(tempFolder1))) + testthat::expect_true( + sum(c('cohort_details.csv', + 'settings.csv', + 'covariates.csv', + 'covariates_continuous.csv', + 'cohort_counts.csv', + 'covariate_ref.csv', + 'analysis_ref.csv' + ) %in% dir(tempFolder1) + ) == length(dir(tempFolder1)) + ) + + # check cohortCounts is done for all + cohortDetails <- readr::read_csv( + file.path(tempFolder1,'cohort_details.csv'), + show_col_types = F + ) + testthat::expect_true( + nrow(unique(cohortDetails)) == nrow(cohortDetails) + ) + testthat::expect_true( + nrow(cohortDetails) == 8 + ) + + aggCovs <- readr::read_csv( + file = file.path(tempFolder1, 'covariates.csv'), + show_col_types = F + ) + # check covariates is unique + testthat::expect_true( + nrow(aggCovs) == nrow(unique(aggCovs)) + ) + + # check databaseId is added + testthat::expect_true( + aggCovs$database_id[1] == 'madeup' + ) + +}) + - tempFolder2 <- tempfile("runAggregate2") - on.exit(unlink(tempFolder2, recursive = TRUE), add = TRUE) +test_that("computeCaseAggregateCovariateAnalyses", { + targetIds <- c(1, 2, 4) + outcomeIds <- c(3) + covariateSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + useDemographicsAge = T, + useCharlsonIndex = T + ) + caseCovariateSettings <- createDuringCovariateSettings( + useConditionOccurrenceDuring = T + ) - computeAggregateCovariateAnalyses( + res <- createAggregateCovariateSettings( + targetIds = targetIds, + outcomeIds = outcomeIds, + minPriorObservation = 30, + outcomeWashoutDays = 1, + riskWindowStart = 1, startAnchor = "cohort start", + riskWindowEnd = 5 * 365, endAnchor = "cohort start", + covariateSettings = covariateSettings, + caseCovariateSettings = caseCovariateSettings + ) + + jobDf <- getAggregateCovariatesJobs( + characterizationSettings = createCharacterizationSettings( + aggregateCovariateSettings = res + ), + threads = 1 + ) + + computeCaseAggregateCovariateAnalyses( connectionDetails = connectionDetails, cdmDatabaseSchema = "main", cdmVersion = 5, targetDatabaseSchema = "main", targetTable = "cohort", - aggregateCovariateSettings = res, + settings = ParallelLogger::convertJsonToSettings(jobDf$settings[2]), minCharacterizationMean = 0.01, databaseId = 'madeup', - outputFolder = tempFolder2, - incrementalFile = file.path(tempFolder2,'executed.csv') - ) - # check incremental does run - testthat::expect_true(sum(c('executed.csv','results','execution') %in% dir(tempFolder2)) == 3) - - # make sure the execution logs all the completed runs - executed <- readr::read_csv(file.path(tempFolder2,'executed.csv'), show_col_types = F) - nrowExpected <- length(targetIds)*2 + length(outcomeIds)*2 + 4*length(targetIds)*length(outcomeIds) - testthat::expect_true(nrow(executed) == nrowExpected) - - # check one execution folder where we have all results - resultFiles <- dir(file.path(tempFolder2, 'execution', 'T_1_30')) - testthat::expect_true( - sum(resultFiles %in% c( - "analysis_ref.csv", - "covariate_ref.csv", - "covariates.csv", - "covariates_continuous.csv" - )) == 4 + outputFolder = tempFolder2 ) - - # make 
sure the files are written - resultFiles <- dir(file.path(tempFolder2, 'results')) + # check incremental does not run testthat::expect_true( - sum(resultFiles %in% c( - "analysis_ref.csv", - "covariate_ref.csv", - "covariates.csv", - "covariates_continuous.csv", - "settings.csv", - "cohort_details.csv" - )) == 6 + sum(c('cohort_details.csv', + 'settings.csv', + 'covariates.csv', + 'covariates_continuous.csv', + 'cohort_counts.csv', + 'covariate_ref.csv', + 'analysis_ref.csv' + ) %in% dir(tempFolder2) + ) == length(dir(tempFolder2)) ) # check cohortCounts is done for all cohortDetails <- readr::read_csv( - file.path(tempFolder2, 'results', 'cohort_details.csv'), + file.path(tempFolder2,'cohort_details.csv'), show_col_types = F ) testthat::expect_true( nrow(unique(cohortDetails)) == nrow(cohortDetails) ) testthat::expect_true( - nrow(executed) == nrow(cohortDetails) + nrow(cohortDetails) == 3*5 ) - testthat::expect_true( - nrow(as.data.frame(cohortDetails)) == 20 # 8 T/Os, 3 TnO, 0 TnOc, 3 OnT, 3 TnOprior, 3 TnObetween - ) - - # test results combine execution files correctly - executionFolders <- dir(file.path(tempFolder2, 'execution')) - allcovs <- c() - for(executionFolder in executionFolders){ - covstemp <- readr::read_csv( - file = file.path( - tempFolder2, - 'execution', - executionFolder, - 'covariates.csv' - ), - show_col_types = F - ) - allcovs <- rbind(covstemp, allcovs) - } - aggCovs <- readr::read_csv( - file = file.path(tempFolder2, 'results', 'covariates.csv'), + file = file.path(tempFolder2, 'covariates.csv'), show_col_types = F ) - testthat::expect_true( - nrow(aggCovs) == nrow(allcovs) - ) - # check covariates is unique testthat::expect_true( nrow(aggCovs) == nrow(unique(aggCovs)) diff --git a/tests/testthat/test-aggregateCovariateHelpers.R b/tests/testthat/test-aggregateCovariateHelpers.R deleted file mode 100644 index 2a50389..0000000 --- a/tests/testthat/test-aggregateCovariateHelpers.R +++ /dev/null @@ -1,267 +0,0 @@ -# library(Characterization) -# library(testthat) - -context("AggregateCovariateHelpers") - -test_that("hash seed works", { - - hash1 <- hash( - x = "{a:'fake json'}", - seed = 0 - ) - hash2 <- hash( - x = "{a:'fake json'}", - seed = 1 - ) - - testthat::expect_false(hash1 == hash2) - testthat::expect_true(inherits(x = hash1, 'character')) - -}) - -test_that("extractCovariateList works", { - - covList <- list( - list( - covariateSettings = FeatureExtraction::createDefaultCovariateSettings() - ), - list( - covariateSettings = FeatureExtraction::createDefaultCovariateSettings() - ), - list( - covariateSettings = FeatureExtraction::createCovariateSettings(useDemographicsAge = T) - ) - ) - - result <- extractCovariateList(settings = covList) - testthat::expect_true(length(result) == 2) - testthat::expect_true(identical(result[[1]], FeatureExtraction::createDefaultCovariateSettings())) - testthat::expect_true(identical(result[[2]], FeatureExtraction::createCovariateSettings(useDemographicsAge = T))) - -}) - -test_that("extractCaseCovariateList works", { - - covList <- list( - list( - covariateSettings = createDuringCovariateSettings( - useConditionOccurrenceDuring = T - ) - ), - list( - covariateSettings = createDuringCovariateSettings( - useConditionOccurrenceDuring = T - ) - ), - list( - covariateSettings = createDuringCovariateSettings( - useConditionEraDuring = T - ) - ) - ) - - result <- Characterization:::extractCovariateList(settings = covList) - testthat::expect_true(length(result) == 2) - testthat::expect_true(identical(result[[1]], 
createDuringCovariateSettings( - useConditionOccurrenceDuring = T - ))) - testthat::expect_true(identical(result[[2]], createDuringCovariateSettings( - useConditionEraDuring = T - ))) - -}) - -test_that("extractCombinationSettings works", { - - covSet <- FeatureExtraction::createCovariateSettings( - useDemographicsAge = T - ) - targetIds <- c(1,2,4) - riskWindowStart <- 1 - settings <- createAggregateCovariateSettings( - targetIds = targetIds, - outcomeIds = 3, - minPriorObservation = 10, - outcomeWashoutDays = 100, - riskWindowStart = riskWindowStart, - startAnchor = rep("cohort start", length(riskWindowStart)), - riskWindowEnd = rep(365,length(riskWindowStart)), - endAnchor = rep("cohort start",length(riskWindowStart)), - covariateSettings = covSet, - caseCovariateSettings = createDuringCovariateSettings( - useConditionEraDuring = T - ), - casePreTargetDuration = 300, - casePostOutcomeDuration = 300 - ) - - - combo <- extractCombinationSettings( - x = settings, - covariateSettingsList = list(list(covSet)), - caseCovariateSettingsList = list(createDuringCovariateSettings( - useConditionEraDuring = T - )) - ) - - testthat::expect_equal( - nrow(combo), - length(targetIds)*2+2+length(targetIds)*1*length(riskWindowStart)*4 - ) - - - targetIds <- c(1,2) - riskWindowStart <- 1:5 - settings <- createAggregateCovariateSettings( - targetIds = targetIds, - outcomeIds = 3, - minPriorObservation = 10, - outcomeWashoutDays = 100, - riskWindowStart = riskWindowStart, - startAnchor = rep("cohort start", length(riskWindowStart)), - riskWindowEnd = rep(365,length(riskWindowStart)), - endAnchor = rep("cohort start",length(riskWindowStart)), - covariateSettings = covSet, - caseCovariateSettings = createDuringCovariateSettings( - useConditionEraDuring = T - ), - casePreTargetDuration = 300, - casePostOutcomeDuration = 300 - ) - - - combo <- extractCombinationSettings( - x = settings, - covariateSettingsList = list(list(covSet)), - caseCovariateSettingsList = list(createDuringCovariateSettings( - useConditionEraDuring = T - )) - ) - - testthat::expect_equal( - nrow(combo), - length(targetIds)*2+2+length(targetIds)*1*length(riskWindowStart)*4 - ) - - -}) - -test_that("createFolderName works", { - -names <- createFolderName( - typeName = 'T', - values = data.frame( - a = 1:5, - b = rep(1,5) - ) -) - -testthat::expect_equal( - names, - c("T_1_1","T_2_1","T_3_1","T_4_1","T_5_1") -) - -}) - -test_that("addFolderId works", { - - covSet <- FeatureExtraction::createCovariateSettings( - useDemographicsAge = T - ) - targetIds <- c(1,2,10,11) - riskWindowStart <- 1:5 - settings <- createAggregateCovariateSettings( - targetIds = targetIds, - outcomeIds = 3, - minPriorObservation = 10, - outcomeWashoutDays = 100, - riskWindowStart = riskWindowStart, - startAnchor = rep("cohort start", length(riskWindowStart)), - riskWindowEnd = rep(365,length(riskWindowStart)), - endAnchor = rep("cohort start",length(riskWindowStart)), - covariateSettings = covSet, - caseCovariateSettings = createDuringCovariateSettings( - useConditionEraDuring = T - ), - casePreTargetDuration = 300, - casePostOutcomeDuration = 300 - ) - - - cohortDetails <- Characterization:::extractCombinationSettings( - x = settings, - covariateSettingsList = list(list(covSet)), - caseCovariateSettingsList = list(createDuringCovariateSettings( - useConditionEraDuring = T - )) - ) - - cohortDetailsWithFolder <- Characterization:::addFolderId( - cohortDetails = cohortDetails, - outputFolder = 'test', - threads = 4 - ) - - 
testthat::expect_true(max(cohortDetailsWithFolder$runId) <= 4) - - testthat::expect_true(nrow(cohortDetailsWithFolder) == nrow(cohortDetails)) - testthat::expect_true(ncol(cohortDetailsWithFolder) == (2+ncol(cohortDetails))) - - testthat::expect_true( - sum(unique(cohortDetails$targetCohortId) %in% - cohortDetailsWithFolder$targetCohortId) == - length(unique(cohortDetails$targetCohortId)) - ) - - testthat::expect_true( - sum(unique(cohortDetails$outcomeCohortId) %in% - cohortDetailsWithFolder$outcomeCohortId) == - length(unique(cohortDetails$outcomeCohortId) - )) - -}) - -test_that("incremental files", { - - # removeExecuted( - # cohortDetails, - # executedDetails - #) - -# saveIncremental <- function( -# cohortDetails, -# incrementalFile -#) - - #loadIncremental <- function( - # incrementalFile - #) - -}) - -# extractTargetFeatures -# extractOutcomeFeatures -# extractCaseFeatures - -#exportAndromedaToCsv <- function( -# andromeda, -# outputFolder, -# cohortDetails, -# counts, -# databaseId, -# minCharacterizationMean, -# batchSize = 100000 -# ) - -#aggregateCsvs <- function( -# outputFolder -#) - -#saveSettings <- function( -# outputFolder, -# cohortDetails, -# databaseId, -# covariateSettingsList, -# caseCovariateSettingsList -#) - diff --git a/tests/testthat/test-dechallengeRechallenge.R b/tests/testthat/test-dechallengeRechallenge.R index 7373080..f01ea8d 100644 --- a/tests/testthat/test-dechallengeRechallenge.R +++ b/tests/testthat/test-dechallengeRechallenge.R @@ -55,7 +55,7 @@ test_that("computeDechallengeRechallengeAnalyses", { connectionDetails = connectionDetails, targetDatabaseSchema = "main", targetTable = "cohort", - dechallengeRechallengeSettings = res, + settings = res, databaseId = "testing", outputFolder = dcLoc, minCellCount = 0 @@ -63,8 +63,9 @@ test_that("computeDechallengeRechallengeAnalyses", { testthat::expect_true(dc) # No results with Andromeda - bug? 
- #dc <- readr::read_csv(file.path(dcLoc,'dechallenge_rechallenge.csv'), show_col_types = F) - + dc <- readr::read_csv(file.path(dcLoc,'dechallenge_rechallenge.csv'), show_col_types = F) + # one T and 2 Os, so should have 2 rows + testthat::expect_true(nrow(dc) == 2) }) @@ -141,7 +142,7 @@ test_that("computeRechallengeFailCaseSeriesAnalyses with known data", { connectionDetails = connectionDetails, targetDatabaseSchema = "main", targetTable = "cohort", - dechallengeRechallengeSettings = res, + settings = res, outcomeDatabaseSchema = "main", outcomeTable = "cohort", databaseId = "testing", @@ -160,7 +161,7 @@ test_that("computeRechallengeFailCaseSeriesAnalyses with known data", { connectionDetails = connectionDetails, targetDatabaseSchema = "main", targetTable = "cohort", - dechallengeRechallengeSettings = res, + settings = res, outcomeDatabaseSchema = "main", outcomeTable = "cohort", databaseId = "testing", @@ -173,4 +174,144 @@ test_that("computeRechallengeFailCaseSeriesAnalyses with known data", { dc <- readr::read_csv(file.path(dcLoc,'rechallenge_fail_case_series.csv'), show_col_types = F) testthat::expect_equal(nrow(dc), 1) testthat::expect_equal(dc$subject_id, 2) + + + # check minCellCount + dcLoc <- tempfile("runADechal4") + dr <- computeDechallengeRechallengeAnalyses( + connectionDetails = connectionDetails, + targetDatabaseSchema = "main", + targetTable = "cohort", + settings = res, + outcomeDatabaseSchema = "main", + outcomeTable = "cohort", + databaseId = "testing", + outputFolder = dcLoc, + minCellCount = 9999 + ) + + # checking minCellCount + # person 2 should be in results but all min cell count + # values should be censored + dr <- readr::read_csv(file.path(dcLoc,'dechallenge_rechallenge.csv'), show_col_types = F) + testthat::expect_true(nrow(dr) > 0 ) + testthat::expect_equal(max(dr$num_persons_exposed), -9999) + testthat::expect_equal(max(dr$num_cases), -9999) + testthat::expect_equal(max(dr$dechallenge_attempt), -9999) + testthat::expect_equal(max(dr$dechallenge_fail), -9999) + testthat::expect_equal(max(dr$dechallenge_success), -9999) + testthat::expect_equal(max(dr$rechallenge_attempt), -9999) + testthat::expect_equal(max(dr$rechallenge_fail), -9999) + testthat::expect_equal(max(dr$rechallenge_success), -9999) + }) + + +# add test for job creation code +test_that("computeDechallengeRechallengeAnalyses", { + targetIds <- c(2,5,6,7,8) + outcomeIds <- c(3, 4, 9, 10) + + res <- createDechallengeRechallengeSettings( + targetIds = targetIds, + outcomeIds = outcomeIds, + dechallengeStopInterval = 30, + dechallengeEvaluationWindow = 30 + ) +jobs <- Characterization:::getDechallengeRechallengeJobs( + characterizationSettings = createCharacterizationSettings( + dechallengeRechallengeSettings = res + ), + threads = 1 + ) + +# as 1 thread should be 2 rows for two analyses +testthat::expect_true(nrow(jobs) == 2) + +# check all target ids are in there +targetIdFromSettings <- do.call( + what = unique, + args = lapply(1:nrow(jobs), function(i){ + ParallelLogger::convertJsonToSettings(jobs$settings[i])$targetCohortDefinitionIds} + ) +) +testthat::expect_true(sum(targetIds %in% targetIdFromSettings) == + length(targetIds)) + +# check all outcome ids are in there +outcomeIdFromSettings <- do.call( + what = unique, + args = lapply(1:nrow(jobs), function(i){ + ParallelLogger::convertJsonToSettings(jobs$settings[i])$outcomeCohortDefinitionIds} + ) +) +testthat::expect_true(sum(outcomeIds %in% outcomeIdFromSettings) == + length(outcomeIds)) + + +# checking more threads 3 +jobs <- 
Characterization:::getDechallengeRechallengeJobs( + characterizationSettings = createCharacterizationSettings( + dechallengeRechallengeSettings = res + ), + threads = 3 +) + +# as 3 thread should be 2*3 rows for two analyses +testthat::expect_true(nrow(jobs) == 2*3) + +# check all target ids are in there +targetIdFromSettings <- do.call( + what = c, + args = lapply(1:nrow(jobs), function(i){ + ParallelLogger::convertJsonToSettings(jobs$settings[i])$targetCohortDefinitionIds} + ) +) +testthat::expect_true(sum(targetIds %in% targetIdFromSettings) == + length(targetIds)) + +# check all outcome ids are in there +outcomeIdFromSettings <- do.call( + what =c, + args = lapply(1:nrow(jobs), function(i){ + ParallelLogger::convertJsonToSettings(jobs$settings[i])$outcomeCohortDefinitionIds} + ) +) +testthat::expect_true(sum(outcomeIds %in% outcomeIdFromSettings) == + length(outcomeIds)) + + + +# checking more threads than needed 20 +jobs <- Characterization:::getDechallengeRechallengeJobs( + characterizationSettings = createCharacterizationSettings( + dechallengeRechallengeSettings = res + ), + threads = 20 +) + +# as 3 thread should be 2*5 rows for two analyses +testthat::expect_true(nrow(jobs) == 2*5) + +# check all target ids are in there +targetIdFromSettings <- do.call( + what = c, + args = lapply(1:nrow(jobs), function(i){ + ParallelLogger::convertJsonToSettings(jobs$settings[i])$targetCohortDefinitionIds} + ) +) +testthat::expect_true(sum(targetIds %in% targetIdFromSettings) == + length(targetIds)) + +# check all outcome ids are in there +outcomeIdFromSettings <- do.call( + what =c, + args = lapply(1:nrow(jobs), function(i){ + ParallelLogger::convertJsonToSettings(jobs$settings[i])$outcomeCohortDefinitionIds} + ) +) +testthat::expect_true(sum(outcomeIds %in% outcomeIdFromSettings) == + length(outcomeIds)) + +}) + diff --git a/tests/testthat/test-runCharacterization.R b/tests/testthat/test-runCharacterization.R index 0e51b81..0fa0b83 100644 --- a/tests/testthat/test-runCharacterization.R +++ b/tests/testthat/test-runCharacterization.R @@ -119,8 +119,9 @@ test_that("runCharacterizationAnalyses", { outcomeDatabaseSchema = "main", outcomeTable = "cohort", characterizationSettings = characterizationSettings, - saveDirectory = tempFolder, - tablePrefix = "c_", + outputDirectory = file.path(tempFolder,'result'), + executionPath = file.path(tempFolder,'execution'), + csvFilePrefix = "c_", databaseId = "1", incremental = T, minCharacterizationMean = 0.01, @@ -128,53 +129,48 @@ test_that("runCharacterizationAnalyses", { ) testthat::expect_true( - file.exists(file.path(tempFolder, "tracker.csv")) + dir.exists(file.path(tempFolder, "result")) ) - tracker <- readr::read_csv( - file = file.path(tempFolder, "tracker.csv"), - show_col_types = FALSE - ) - testthat::expect_equal(nrow(tracker), 5) # check csv files testthat::expect_true( - length(dir(file.path(tempFolder, "results"))) > 0 + length(dir(file.path(tempFolder, "result"))) > 0 ) # check cohort details is saved testthat::expect_true( - file.exists(file.path(tempFolder, "results", "cohort_details.csv")) + file.exists(file.path(tempFolder, "result", "c_cohort_details.csv")) ) testthat::expect_true( - file.exists(file.path(tempFolder, "results", "settings.csv")) + file.exists(file.path(tempFolder, "result", "c_settings.csv")) ) testthat::expect_true( - file.exists(file.path(tempFolder, "results", "analysis_ref.csv")) + file.exists(file.path(tempFolder, "result", "c_analysis_ref.csv")) ) testthat::expect_true( - file.exists(file.path(tempFolder, "results", 
"covariate_ref.csv")) + file.exists(file.path(tempFolder, "result", "c_covariate_ref.csv")) ) testthat::expect_true( - file.exists(file.path(tempFolder, "results", "covariates.csv")) + file.exists(file.path(tempFolder, "result", "c_covariates.csv")) ) testthat::expect_true( - file.exists(file.path(tempFolder, "results", "covariates_continuous.csv")) + file.exists(file.path(tempFolder, "result", "c_covariates_continuous.csv")) ) # no results for dechal due to Eunomia - how to test? + testthat::expect_true( + file.exists(file.path(tempFolder, "result", "c_dechallenge_rechallenge.csv")) + ) #testthat::expect_true( - # file.exists(file.path(tempFolder, "results", "dechallenge_rechallenge.csv")) - #) - #testthat::expect_true( - # file.exists(file.path(tempFolder, "results", "rechallenge_fail_case_series.csv")) + # file.exists(file.path(tempFolder, "result", "rechallenge_fail_case_series.csv")) #) testthat::expect_true( - file.exists(file.path(tempFolder, "results", "time_to_event.csv")) + file.exists(file.path(tempFolder, "result", "c_time_to_event.csv")) ) # make sure both tte runs are in the csv tte <- readr::read_csv( - file = file.path(tempFolder,'results' ,"time_to_event.csv"), + file = file.path(tempFolder,'result' ,"c_time_to_event.csv"), show_col_types = FALSE ) testthat::expect_equivalent( diff --git a/tests/testthat/test-timeToEvent.R b/tests/testthat/test-timeToEvent.R index 19c6197..4a82c5b 100644 --- a/tests/testthat/test-timeToEvent.R +++ b/tests/testthat/test-timeToEvent.R @@ -37,7 +37,7 @@ test_that("computeTimeToEventSettings", { cdmDatabaseSchema = "main", targetDatabaseSchema = "main", targetTable = "cohort", - timeToEventSettings = res, + settings = res, outputFolder = tteFolder, databaseId = 'tte_test' ) @@ -83,4 +83,25 @@ test_that("computeTimeToEventSettings", { length(unique(tte$outcome_cohort_definition_id)) ) + + # test minCellCount + tteFolder <- tempfile("tte2") + computeTimeToEventAnalyses( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + targetDatabaseSchema = "main", + targetTable = "cohort", + settings = res, + outputFolder = tteFolder, + databaseId = 'tte_test', + minCellCount = 9999 + ) + + tte <- readr::read_csv( + file = file.path(tteFolder,'time_to_event.csv'), + show_col_types = F + ) + + testthat::expect_true(max(tte$num_events) == -9999) + }) diff --git a/tests/testthat/test-viewShiny.R b/tests/testthat/test-viewShiny.R index e74a1df..ddba78a 100644 --- a/tests/testthat/test-viewShiny.R +++ b/tests/testthat/test-viewShiny.R @@ -85,8 +85,9 @@ test_that("prepareCharacterizationShiny works", { outcomeDatabaseSchema = "main", outcomeTable = "cohort", characterizationSettings = characterizationSettings, - saveDirectory = resultLocation, - tablePrefix = "c_", + outputDirectory = file.path(resultLocation, 'result'), + executionPath = file.path(resultLocation, 'execution'), + csvFilePrefix = "c_", databaseId = "1", threads = 1, incremental = T, @@ -94,8 +95,8 @@ test_that("prepareCharacterizationShiny works", { minCharacterizationMean = 0.01 ) - settings <- prepareCharacterizationShiny( - resultFolder = file.path(resultLocation,'results'), + settings <- Characterization:::prepareCharacterizationShiny( + resultFolder = file.path(resultLocation,'result'), cohortDefinitionSet = NULL, sqliteLocation = file.path(resultLocation, "sqliteCharacterization", "sqlite.sqlite") ) @@ -127,7 +128,7 @@ test_that("prepareCharacterizationShiny works", { test_that("shiny app works", { settings <- prepareCharacterizationShiny( - resultFolder = 
file.path(resultLocation,'results'), + resultFolder = file.path(resultLocation,'result'), cohortDefinitionSet = NULL, sqliteLocation = file.path(resultLocation, "sqliteCharacterization", "sqlite.sqlite") )
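The new tests above exercise a job-based execution model for the aggregate covariate analyses. A rough sketch of how settings are expanded into jobs, based on the expectations in test-aggregateCovariate.R; getAggregateCovariatesJobs() is an internal helper, so it is accessed with ::: here, and the exact folder names are taken from the test assertions:

    acSettings <- createAggregateCovariateSettings(
      targetIds = c(1, 2, 4),
      outcomeIds = c(3),
      minPriorObservation = 30,
      outcomeWashoutDays = 1,
      riskWindowStart = 1, startAnchor = "cohort start",
      riskWindowEnd = 5 * 365, endAnchor = "cohort start",
      covariateSettings = FeatureExtraction::createCovariateSettings(
        useDemographicsGender = TRUE,
        useDemographicsAge = TRUE,
        useCharlsonIndex = TRUE
      ),
      caseCovariateSettings = createDuringCovariateSettings(
        useConditionOccurrenceDuring = TRUE
      )
    )

    jobDf <- Characterization:::getAggregateCovariatesJobs(
      characterizationSettings = createCharacterizationSettings(
        aggregateCovariateSettings = acSettings
      ),
      threads = 2
    )

    # Per the tests, threads = 2 splits the target and case extractions into 4
    # jobs with execution folders such as tac_1_30, tac_2_30, cac_1_30_1_365_365
    # and cac_2_30_1_365_365; each row's settings column is JSON that is read
    # back with ParallelLogger::convertJsonToSettings().
    jobDf$functionName
    jobDf$executionFolder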
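The incremental tests rely on two log files, execution.csv and completed.csv, written into the execution folder: a job that appears in execution.csv but not in completed.csv is treated as a failed run. A short sketch of the two exported cleanup helpers (the path here is a placeholder):

    executionPath <- file.path(tempdir(), "execution")

    # Incremental mode: delete the csv output and the execution record of any
    # job that was started but never marked completed, so it re-runs next time.
    cleanIncremental(executionFolder = executionPath)

    # Non-incremental mode: the execution folder must contain no csv files from
    # earlier runs, so remove them all before executing.
    cleanNonIncremental(executionFolder = executionPath)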
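Finally, a hedged end-to-end sketch of the reworked result flow shown in test-runCharacterization.R and test-viewShiny.R: csv files prefixed with csvFilePrefix are written to outputDirectory, intermediate per-job files go to executionPath, and the upload and shiny-preparation steps read the same prefix back. The connectionDetails and characterizationSettings objects are assumed to have been created earlier:

    resultLocation <- tempdir()

    runCharacterizationAnalyses(
      connectionDetails = connectionDetails,            # assumed, as created earlier
      cdmDatabaseSchema = "main",
      targetDatabaseSchema = "main",
      targetTable = "cohort",
      outcomeDatabaseSchema = "main",
      outcomeTable = "cohort",
      characterizationSettings = characterizationSettings,  # assumed, from createCharacterizationSettings()
      outputDirectory = file.path(resultLocation, "result"),     # final csv files, e.g. c_covariates.csv
      executionPath = file.path(resultLocation, "execution"),    # intermediate per-job files and logs
      csvFilePrefix = "c_",
      databaseId = "1",
      incremental = TRUE,
      threads = 1,
      minCharacterizationMean = 0.01,
      minCellCount = 0
    )

    # Upload the csv files into a sqlite results database; tablePrefix is an
    # optional extra prefix for the result tables, while csvTablePrefix must
    # match the csvFilePrefix used when the csv files were written.
    insertResultsToDatabase(
      connectionDetails = DatabaseConnector::createConnectionDetails(
        dbms = "sqlite",
        server = file.path(resultLocation, "sqliteCharacterization", "sqlite.sqlite")
      ),
      schema = "main",
      resultsFolder = file.path(resultLocation, "result"),
      tablePrefix = "",
      csvTablePrefix = "c_"
    )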