diff --git a/.Rbuildignore b/.Rbuildignore index a0bd289..9faa4a9 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,3 +20,4 @@ ^man/figures/pipeline\.png$ ^example_data$ ^data-raw$ +^vignettes/articles/iceqream\.R$ diff --git a/.gitignore b/.gitignore index 1a5f1bb..ca075f0 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ example_data vignettes/articles/iceqream_cache/ data-raw/*.ipynb data-raw/*.png +vignettes/articles/iceqream.R diff --git a/DESCRIPTION b/DESCRIPTION index 8c5a6cc..2c342c7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,6 +8,9 @@ Authors@R: c( ) Description: iceqream is a package for regressing accessibility from sequences using physical models of TF binding, It models TF effective concentrations as latent variables that activate or repress regulatory elements in a nonlinear fashion, with possible contribution from pairwise interactions and synergistic chromosomal domain effects. iceqream allows inference and synthesis of models explaining accessibility dynamics over an entire single cell manifold. License: MIT + file LICENSE +Depends: + R (>= 2.10), + misha (>= 4.2.0) Imports: cli, dplyr, @@ -50,11 +53,9 @@ Remotes: tanaylab/prego Config/testthat/edition: 3 Encoding: UTF-8 -Language: es +Language: en-US Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 Config/Needs/website: rmarkdown -Depends: - R (>= 2.10) LazyData: true URL: https://tanaylab.github.io/iceqream/ diff --git a/NAMESPACE b/NAMESPACE index 3e35698..011f01f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export("%>%") export(add_features_r2) +export(add_interactions) export(compute_motif_directional_hits) export(compute_motif_energies) export(compute_tracks_q) @@ -65,6 +66,7 @@ export(rescale) export(split_traj_model_to_train_test) export(traj_model_to_iq_feature_list) export(traj_model_to_pbm_list) +export(traj_model_variable_response) exportClasses(IQFeature) exportClasses(IQFeatureGroup) exportClasses(IQSeqFeature) diff --git a/R/TrajectoryModel.R b/R/TrajectoryModel.R index 4ea4e42..2f1e307 100644 --- a/R/TrajectoryModel.R +++ b/R/TrajectoryModel.R @@ -46,6 +46,9 @@ #' @slot normalization_intervals data.frame #' A data frame containing the intervals used for energy normalization. #' +#' @slot interactions matrix +#' A matrix of the interaction features. +#' #' #' #' @exportClass TrajectoryModel @@ -65,6 +68,7 @@ TrajectoryModel <- setClass( normalization_intervals = "data.frame", additional_features = "data.frame", features_r2 = "numeric", + interactions = "matrix", params = "list" ) ) @@ -74,7 +78,12 @@ TrajectoryModel <- setClass( #' @exportMethod show setMethod("show", signature = "TrajectoryModel", definition = function(object) { cli::cli({ - cli::cli_text("{.cls TrajectoryModel} with {.val {length(object@motif_models)}} motifs and {.val {length(object@additional_features)}} additional features\n") + if (has_interactions(object)) { + cli::cli_text("{.cls TrajectoryModel} with {.val {length(object@motif_models)}} motifs, {.val {length(object@additional_features)}} additional features and {.val {n_interactions(object)}} interaction terms\n") + } else { + cli::cli_text("{.cls TrajectoryModel} with {.val {length(object@motif_models)}} motifs and {.val {length(object@additional_features)}} additional features\n") + } + cli::cli_text("\n") cli::cli_text("Slots include:") cli_ul(c("{.field @model}: A GLM model object. Number of non-zero coefficients: {.val {sum(object@model$beta[, 1] != 0)}}")) @@ -88,8 +97,9 @@ setMethod("show", signature = "TrajectoryModel", definition = function(object) { cli_ul(c("{.field @predicted_diff_score}: A numeric value representing the predicted difference score")) cli_ul(c("{.field @initial_prego_models}: A list of prego models used in the initial phase of the algorithm ({.val {length(object@initial_prego_models)}} models)")) cli_ul(c("{.field @peak_intervals}: A data frame containing the peak intervals ({.val {nrow(object@peak_intervals)}} elements)")) - if ("normalization_intervals" %in% slotNames(object)) { # here for backwards compatibility - cli_ul(c("{.field @normalization_intervals}: A data frame containing the intervals used for energy normalization ({.val {nrow(object@normalization_intervals)}} elements)")) + cli_ul(c("{.field @normalization_intervals}: A data frame containing the intervals used for energy normalization ({.val {nrow(object@normalization_intervals)}} elements)")) + if (has_interactions(object)) { + cli_ul(c("{.field @interactions}: A matrix of the interaction features ({.val {nrow(object@interactions)}}x{.val {ncol(object@interactions)}})")) } if (length(object@features_r2) > 0) { cli_ul(c("{.field @features_r2}: A numeric vector of the added R^2 values for each feature ({.val {length(object@features_r2)}} elements)")) diff --git a/R/distill-motifs.R b/R/distill-motifs.R index fc6633c..4e0b741 100644 --- a/R/distill-motifs.R +++ b/R/distill-motifs.R @@ -178,10 +178,10 @@ distill_motifs <- function(features, target_number, glm_model, y, seqs, norm_seq select(pos, A, C, G, T) optimize_pwm <- FALSE } else { - cli::cli_alert_warning("No current model found for {.val {x$feat}}. Distilling on a single motif") + cli::cli_alert_warning("No current model found for {.val {x$feat}}.") } } else { - cli_alert_info("Running {.field prego} on cluster {.val {x$feat}} (distilling {.val {n_feats}} motifs)") + cli_alert_info("Running {.field prego} on cluster {.val {x$feat}} (fusing {.val {n_feats}} motifs)") } res <- run_prego_on_clust_residuals( diff --git a/R/distill-multi-traj.R b/R/distill-multi-traj.R index 8d0b4b4..959ecf2 100644 --- a/R/distill-multi-traj.R +++ b/R/distill-multi-traj.R @@ -171,7 +171,7 @@ distill_traj_model_multi <- function(traj_models, max_motif_num = NULL, min_diff optimize_pwm <- FALSE motif <- motif_models[[x$feat[1]]]$pssm } else { - cli_alert_info("Running {.field prego} on cluster {.val {clust_name}}, distilling {.val {n_feats}} features") + cli_alert_info("Running {.field prego} on cluster {.val {clust_name}}, fusing {.val {n_feats}} features") } @@ -205,7 +205,7 @@ distill_traj_model_multi <- function(traj_models, max_motif_num = NULL, min_diff ) %>% cli::cli_fmt() - cli::cli_alert_success("Finished distilling cluster {.val {clust_name}}") + cli::cli_alert_success("Finished fusing cluster {.val {clust_name}}") return(prego::export_regression_model(prego_model)) }, .parallel = TRUE) @@ -247,7 +247,7 @@ distill_traj_model_multi <- function(traj_models, max_motif_num = NULL, min_diff compute_traj_list_stats(traj_models_full) %>% mutate(type = "full") ) - cli_alert_success("Finished distilling trajectory models") + cli_alert_success("Finished fusing trajectory models") purrr::walk(names(traj_models), ~ { cli_alert_info("Model {.field {.x}}: R^2: {.val {traj_models_new[[.x]]@params$stats$r2_all}} ({.val {length(traj_models_new[[.x]]@motif_models)}} motifs), before distillation: {.val {traj_models[[.x]]@params$stats$r2_all}} ({.val {length(traj_models[[.x]]@motif_models)}} motifs)") }) diff --git a/R/filter-model.R b/R/filter-model.R index e81e518..4fad211 100644 --- a/R/filter-model.R +++ b/R/filter-model.R @@ -95,46 +95,3 @@ filter_traj_model <- function(traj_model, r2_threshold = 0.0005, bits_threshold return(traj_model_new) } - - -filter_model_using_coefs <- function(X, coefs, diff_score, alpha, lambda, seed, full_model, n_motifs, ignore_variables = NULL) { - y <- norm01(diff_score) - variables <- coefs$variable - if (!is.null(ignore_variables)) { - variables <- variables[!(variables %in% ignore_variables)] - } - coefs_max <- coefs %>% - tibble::column_to_rownames("variable") %>% - .[variables, ] %>% - apply(1, max) %>% - sort(decreasing = TRUE) - - vars_f <- names(coefs_max)[1:n_motifs] - - X_f <- X[, grep(paste0("(", paste(c(vars_f, ignore_variables), collapse = "|"), ").+"), colnames(X))] - cli_alert_info("Number of features left: {.val {length(vars_f)}}") - - model_f <- glmnet::glmnet(X_f, y, binomial(link = "logit"), alpha = alpha, lambda = lambda, parallel = FALSE, seed = seed) - pred_f <- logist(glmnet::predict.glmnet(model_f, newx = X_f, type = "link", s = lambda))[, 1] - pred_f <- norm01(pred_f) - pred_f <- rescale(pred_f, diff_score) - r2_f <- cor(pred_f, y)^2 - cli_alert_info("R^2 after filtering: {.val {r2_f}}") - - return(list(model = model_f, pred = pred_f, X = X_f, r2 = r2_f, vars = vars_f)) -} - - -filter_traj_model_using_coefs <- function(traj_model, n_motifs) { - res <- filter_model_using_coefs(traj_model@model_features, traj_model@coefs, traj_model@diff_score, traj_model@params$alpha, traj_model@params$lambda, traj_model@params$seed, traj_model@model, ignore_variables = colnames(traj_model@additional_features), n_motifs = n_motifs) - - traj_model@model <- res$model - traj_model@predicted_diff_score <- res$pred - traj_model@model_features <- res$X - traj_model@coefs <- get_model_coefs(res$model) - traj_model@normalized_energies <- traj_model@normalized_energies[, res$vars, drop = FALSE] - - cli_alert_success("After filtering: Number of non-zero coefficients: {.val {sum(traj_model@model$beta != 0)}} (out of {.val {ncol(traj_model@model_features)}}). R^2: {.val {cor(traj_model@predicted_diff_score, norm01(traj_model@diff_score))^2}}") - - return(traj_model) -} diff --git a/R/inference.R b/R/inference.R index 24c6754..2dff54c 100644 --- a/R/inference.R +++ b/R/inference.R @@ -47,9 +47,20 @@ infer_trajectory_motifs <- function(traj_model, peak_intervals, atac_scores = NU e_test <- cbind(e_test, additional_features) } + if (has_interactions(traj_model)) { + ftv_inter <- feat_to_variable(traj_model, add_type = TRUE) %>% + filter(type == "interaction") %>% + distinct(variable, term1, term2) + cli::cli_alert_info("Computing {.val {nrow(ftv_inter)}} interaction terms") + interactions <- create_specifc_terms(e_test, ftv_inter) + interactions <- interactions[, colnames(traj_model@interactions), drop = FALSE] + e_test <- cbind(e_test, interactions) + } + e_test_logist <- create_logist_features(e_test) e_test_logist <- e_test_logist[, colnames(traj_model@model_features), drop = FALSE] + cli::cli_alert_info("Inferring the model on {.val {nrow(e_test_logist)}} intervals") pred <- predict_traj_model(traj_model, e_test_logist) traj_model@predicted_diff_score <- c(traj_model@predicted_diff_score, pred) @@ -70,6 +81,9 @@ infer_trajectory_motifs <- function(traj_model, peak_intervals, atac_scores = NU traj_model@additional_features <- bind_rows(traj_model@additional_features, as.data.frame(additional_features)) } + if (has_interactions(traj_model)) { + traj_model@interactions <- rbind(traj_model@interactions, interactions) + } return(traj_model) } diff --git a/R/interactions.R b/R/interactions.R new file mode 100644 index 0000000..244fa5d --- /dev/null +++ b/R/interactions.R @@ -0,0 +1,143 @@ +has_interactions <- function(traj_model) { + n_interactions(traj_model) > 0 +} + +n_interactions <- function(traj_model) { + ncol(traj_model@interactions) +} + +create_specifc_terms <- function(energies, terms) { + term1_matrix <- energies[, terms$term1] + term2_matrix <- energies[, terms$term2] + inter <- term1_matrix * term2_matrix + inter <- t(t(inter) / apply(inter, 2, max, na.rm = TRUE)) + inter <- apply(inter, 2, norm01) * 1 + colnames(inter) <- terms$variable + return(inter) +} + + +create_interaction_terms <- function(energies, motif_feats = NULL, add_feats = NULL, additional_features = NULL, max_motif_n = NULL, max_add_n = NULL) { + create_interactions <- function(features, data, max_n) { + if (is.null(features) || is.null(data)) { + return(NULL) + } + + features <- head(features, n = max_n %||% length(features)) + + interactions <- purrr::map_dfc(features, ~ { + inter <- energies[, setdiff(colnames(energies), .x)] * data[, .x] + inter <- t(t(inter) / apply(inter, 2, max, na.rm = TRUE)) + colnames(inter) <- paste0(.x, ":", colnames(inter)) + inter + }) + + interactions <- apply(interactions, 2, norm01) * 1 + interactions + } + + + add_inter <- create_interactions(add_feats, additional_features, max_add_n) + + if (!is.null(add_inter)) { + cli::cli_alert_info("Created {.val {ncol(add_inter)}} interactions between additional features and motif features.") + } + + + motif_inter <- create_interactions(motif_feats, energies, max_motif_n) + if (!is.null(motif_inter)) { + cli::cli_alert_info("Created {.val {ncol(motif_inter)}} interactions between motif features.") + } + + interactions <- cbind(motif_inter, add_inter) + if (!is.null(interactions)) { + if (!is.null(rownames(energies))) { + rownames(interactions) <- rownames(energies) + } + cli::cli_alert_info("Created {.val {ncol(interactions)}} interactions in total.") + } + + return(interactions) +} + +get_significant_interactions <- function( + energies, y, interaction_threshold, max_motif_n = NULL, max_add_n = NULL, + additional_features = NULL, lambda = 1e-5, alpha = 1, seed = 60427, + ignore_feats = c("TT", "CT", "GT", "AT", "TC", "CC", "GC", "AC", "TG", "CG", "GG", "AG", "TA", "CA", "GA", "AA")) { + glm_model_lin <- glmnet::glmnet(as.matrix(energies), y, binomial(link = "logit"), alpha = alpha, lambda = lambda, seed = seed) + + feats_all <- abs(stats::coef(glm_model_lin)[-1]) + names(feats_all) <- rownames(stats::coef(glm_model_lin))[-1] + sig_feats <- names(feats_all)[feats_all > interaction_threshold] + sig_feats <- setdiff(sig_feats, ignore_feats) + + if (length(sig_feats) == 0) { + cli::cli_alert_warning("No significant features to consider for interactions.") + return(NULL) + } + + add_feats <- intersect(sig_feats, colnames(additional_features)) + motif_feats <- setdiff(sig_feats, add_feats) + + cli::cli_alert_info("# of significant features to consider for interactions: {.val {length(sig_feats)}} (out of {.val {ncol(energies)}}) above the threshold of {.val {interaction_threshold}}. Of these, {.val {length(motif_feats)}} are motif features and {.val {length(add_feats)}} are additional features.") + + if (!is.null(additional_features)) { + # remove the features from energies + energies <- energies[, setdiff(colnames(energies), colnames(additional_features))] + + # remove the ignored features + energies <- energies[, setdiff(colnames(energies), ignore_feats)] + } + + create_interaction_terms(energies, + motif_feats = motif_feats, add_feats = add_feats, + additional_features = additional_features, max_motif_n = max_motif_n, max_add_n = max_add_n + ) +} + +#' Add interactions to a trajectory model +#' +#' This function adds significant interactions to a given trajectory model if they do not already exist. +#' It identifies significant interactions based on the provided threshold and updates the model features +#' with logistic features derived from these interactions. The trajectory model is then re-learned with +#' the new features. +#' +#' @inheritParams regress_trajectory_motifs +#' +#' @return The updated trajectory model with added interactions. +#' @export +add_interactions <- function(traj_model, interaction_threshold = 0.001, max_motif_n = NULL, max_add_n = NULL, lambda = 1e-5, alpha = 1, seed = 60427) { + if (!has_interactions(traj_model)) { + cli::cli_alert("Adding interactions") + interactions <- get_significant_interactions( + cbind(traj_model@normalized_energies, traj_model@additional_features), norm01(traj_model@diff_score), interaction_threshold, + max_motif_n = max_motif_n, max_add_n = max_add_n, + additional_features = traj_model@additional_features, lambda = lambda, alpha = alpha, seed = seed + ) + + if (!is.null(interactions)) { + traj_model@interactions <- interactions + } + + logist_inter <- create_logist_features(interactions) + traj_model@model_features <- cbind(traj_model@model_features, logist_inter) + + cli::cli_alert_info("Re-learning the model with the new interactions. Number of features: {.val {ncol(traj_model@model_features)}}") + cli::cli_alert_info("R^2 all before learning: {.val {cor(traj_model@diff_score, traj_model@predicted_diff_score)^2}}") + if (traj_model_has_test(traj_model)) { + cli::cli_alert_info("R^2 train before learning: {.val {cor(traj_model@diff_score[traj_model@type == 'train'], traj_model@predicted_diff_score[traj_model@type == 'train'])^2}}") + cli::cli_alert_info("R^2 test before learning: {.val {cor(traj_model@diff_score[traj_model@type == 'test'], traj_model@predicted_diff_score[traj_model@type == 'test'])^2}}") + } + + traj_model <- relearn_traj_model(traj_model, new_energies = FALSE, new_logist = FALSE, use_additional_features = TRUE, use_motifs = TRUE, verbose = FALSE) + cli::cli_alert_info("R^2 all after learning: {.val {cor(traj_model@diff_score, traj_model@predicted_diff_score)^2}}") + if (traj_model_has_test(traj_model)) { + cli::cli_alert_info("R^2 train after learning: {.val {cor(traj_model@diff_score[traj_model@type == 'train'], traj_model@predicted_diff_score[traj_model@type == 'train'])^2}}") + cli::cli_alert_info("R^2 test after learning: {.val {cor(traj_model@diff_score[traj_model@type == 'test'], traj_model@predicted_diff_score[traj_model@type == 'test'])^2}}") + } + } else { + cli::cli_alert_warning("Interactions already exist.") + } + + return(traj_model) +} diff --git a/R/partial-response.R b/R/partial-response.R index 2e53183..a113e25 100644 --- a/R/partial-response.R +++ b/R/partial-response.R @@ -1,8 +1,34 @@ -feat_to_variable <- function(traj_model) { - tibble::tibble( +classify_var <- function(var, traj_model) { + case_when( + var %in% names(traj_model@motif_models) ~ "motif", + var %in% colnames(traj_model@additional_features) ~ "additional", + var %in% colnames(traj_model@interactions) ~ "interaction" + ) +} + +feat_to_variable <- function(traj_model, add_types = FALSE) { + ftv <- tibble::tibble( feature = colnames(traj_model@model_features), variable = sub("_(low-energy|high-energy|higher-energy|sigmoid)$", "", feature) ) + if (add_types) { + ftv <- ftv %>% + mutate(type = classify_var(variable, traj_model)) + + if (has_interactions(traj_model)) { + ftv_nointer <- ftv %>% + filter(type != "interaction") + ftv_inter <- ftv %>% + filter(type == "interaction") %>% + separate(variable, c("term1", "term2"), sep = ":", remove = FALSE) %>% + mutate( + term1_type = classify_var(term1, traj_model), + term2_type = classify_var(term2, traj_model) + ) + ftv <- bind_rows(ftv_nointer, ftv_inter) + } + } + return(ftv) } variable_to_feat <- function(glm_model, var) { @@ -33,3 +59,21 @@ compute_partial_response <- function(traj_model, vars = NULL, lambda = 1e-5, rev return(as.data.frame(pr)) } + +#' Compute Partial Response for Trajectory Model +#' +#' This function computes the partial response for a given trajectory model. +#' +#' @param traj_model A trajectory model object. +#' @param vars A vector of variables to compute the partial response for. Default is NULL (all variables). +#' @param lambda A regularization parameter. Default is 1e-5. +#' @param reverse If TRUE, the partial response is computed for all variables except the ones in `vars`. Default is FALSE. +#' +#' @return A data frame with the computed partial responses, where the column names are the variables. +#' +#' +#' @export +traj_model_variable_response <- function(traj_model, vars = NULL, lambda = 1e-5, reverse = FALSE) { + pr <- compute_partial_response(traj_model, vars = vars, lambda = lambda, reverse = reverse) + return(pr) +} diff --git a/R/prego.R b/R/prego.R index fd8aa39..67a87c8 100644 --- a/R/prego.R +++ b/R/prego.R @@ -1,6 +1,6 @@ #' Learn 'prego' models for ATAC difference of a trajectory #' -#' @param peak_intervals A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak, with an additional column named "const" indicating whether the peak is constitutive. Optionally, a column named "cluster" can be added with indication of the cluster of each peak. +#' @param peak_intervals A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak, with an additional column named "const" indicating whether the peak is constitutive and therefore shouldn't be used in the regression. Optionally, a column named "cluster" can be added with indication of the cluster of each peak. #' @param atac_diff A numeric vector, indicating the ATAC difference of each peak #' @param n_motifs Number of motifs to learn. Should be at least 2 #' @param min_diff Minimum ATAC difference to include a peak in the training diff --git a/R/regression-pipeline.R b/R/regression-pipeline.R index 5963c91..e163c9f 100644 --- a/R/regression-pipeline.R +++ b/R/regression-pipeline.R @@ -26,7 +26,12 @@ iq_regression <- function( seed = 60427, frac_train = 0.8, filter_model = TRUE, + include_interactions = FALSE, + interaction_threshold = 0.001, + max_motif_interaction_n = NULL, + max_add_interaction_n = NULL, ...) { + set.seed(seed) n_intervals <- nrow(peak_intervals) train_idxs <- sample(1:n_intervals, frac_train * n_intervals) test_idxs <- setdiff(1:n_intervals, train_idxs) @@ -46,6 +51,11 @@ iq_regression <- function( bin_start = bin_start, bin_end = bin_end, max_motif_num = max_motif_num, + seed = seed, + include_interactions = include_interactions, + interaction_threshold = interaction_threshold, + max_motif_interaction_n = max_motif_interaction_n, + max_add_interaction_n = max_add_interaction_n, ... ) diff --git a/R/traj-model-utils.R b/R/traj-model-utils.R index 6aabdd9..b41f415 100644 --- a/R/traj-model-utils.R +++ b/R/traj-model-utils.R @@ -95,12 +95,28 @@ remove_motif_models_from_traj <- function(traj_model, motif_models, verbose = TR } vars_f <- vars[!(vars %in% motif_models)] - X_f <- X[, grep(paste0("(", paste(motif_models, collapse = "|"), ")(_low-energy|_high-energy|_higher-energy|_sigmoid)"), colnames(X), invert = TRUE)] + ftv <- feat_to_variable(traj_model, add_types = TRUE) + interactions <- traj_model@interactions + if (has_interactions(traj_model)) { + ftv <- ftv %>% filter( + !(variable %in% motif_models), !(term1 %in% motif_models), !(term2 %in% motif_models) + ) + + feats <- ftv %>% pull(feature) + interactions <- interactions[, intersect(colnames(interactions), ftv$variable)] + } else { + feats <- ftv %>% + filter(!(variable %in% motif_models)) %>% + pull(feature) + } + + X_f <- X[, feats] traj_model@model_features <- X_f traj_model@motif_models <- traj_model@motif_models[vars_f] traj_model@features_r2 <- traj_model@features_r2[vars_f[vars_f %in% names(traj_model@features_r2)]] traj_model@normalized_energies <- traj_model@normalized_energies[, vars_f, drop = FALSE] + traj_model@interactions <- interactions traj_model <- relearn_traj_model(traj_model, verbose = FALSE) r2_f <- cor(traj_model@predicted_diff_score, norm01(traj_model@diff_score))^2 @@ -118,11 +134,27 @@ remove_additional_feature_from_traj <- function(traj_model, feature_name, verbos if (!feature_name %in% vars) { cli_abort("Feature {.val {feature_name}} not found in the trajectory model.") } + ftv <- feat_to_variable(traj_model, add_types = TRUE) + + interactions <- traj_model@interactions + if (has_interactions(traj_model)) { + ftv <- ftv %>% filter( + variable != feature_name, term1 != feature_name, term2 != feature_name + ) + + feats <- ftv %>% pull(feature) + interactions <- interactions[, intersect(colnames(interactions), ftv$variable)] + } else { + feats <- ftv %>% + filter(variable != feature_name) %>% + pull(feature) + } - X_f <- X[, grep(paste0("(", feature_name, ")"), colnames(X), invert = TRUE)] + X_f <- X[, feats] traj_model@model_features <- X_f traj_model@additional_features <- traj_model@additional_features[, vars[vars != feature_name], drop = FALSE] + traj_model@interactions <- interactions traj_model <- relearn_traj_model(traj_model, verbose = FALSE) r2_f <- cor(traj_model@predicted_diff_score, norm01(traj_model@diff_score))^2 diff --git a/R/trajectory-regression.R b/R/trajectory-regression.R index 293d824..fdaf08c 100644 --- a/R/trajectory-regression.R +++ b/R/trajectory-regression.R @@ -5,8 +5,8 @@ #' The basic inputs for regression are the genomic positions of the peaks, two vectors of ATAC scores (as an \code{atac_scores} matrix) or differential accessibility (\code{atac_diff}), and database energies computed for all the genomics positions. #' Optionally, additional features such as epigenomic features can be provided. #' -#' @param peak_intervals A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak, with an additional column named "const" indicating whether the peak is constitutive. Optionally, a column named "cluster" can be added with indication of the cluster of each peak. -#' @param atac_scores Optional. A numeric matrix, representing mean ATAC score per bin per peak. Rows: peaks, columns: bins. By default iceqream would regress the last column minus the first column. If you want to regress something else, please either set bin_start or bin_end, or provide \code{atac_diff} instead. If \code{normalize_bins} is TRUE, the scores will be normalized to be between 0 and 1. +#' @param peak_intervals A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak. +#' @param atac_scores Optional. A numeric matrix, representing mean ATAC score per bin per peak. Rows: peaks, columns: bins. By default iceqream would regress the last column minus the first column. If you want to regress something else, please either change bin_start or bin_end, or provide \code{atac_diff} instead. If \code{normalize_bins} is TRUE, the scores will be normalized to be between 0 and 1. #' @param atac_diff Optional. A numeric vector representing the differential accessibility between the start and end of the trajectory. Either this or \code{atac_scores} must be provided. #' @param normalize_bins whether to normalize the ATAC scores to be between 0 and 1. Default: TRUE #' @param norm_intervals A data frame, indicating the genomic positions ('chrom', 'start', 'end') of peaks used for energy normalization. If NULL, the function will use \code{peak_intervals} for normalization. @@ -14,7 +14,7 @@ #' @param n_clust_factor factor to divide the number of to keep after clustering. e.g. if n_clust_factor > 1 the number of motifs to keep will be reduced by a factor of n_clust_factor. Default: 1 #' @param motif_energies A numeric matrix, representing the energy of each motif in each peak. If NULL, the function will use \code{pssm_db} to calculate the motif energies. Note that this might take a while. #' @param norm_motif_energies A numeric matrix, representing the normalized energy of each motif in each interval of \code{norm_intervals}. If NULL, the function will use \code{pssm_db} to calculate the motif energies. Note that this might take a while. -#' @param pssm_db a data frame with PSSMs ('A', 'C', 'G' and 'T' columns), with an additional column 'motif' containing the motif name. All the motifs in \code{motif_energies} (column names) should be present in the 'motif' column. Default: all motifs in the prego package. +#' @param pssm_db a data frame with PSSMs ('A', 'C', 'G' and 'T' columns), with an additional column 'motif' containing the motif name. All the motifs in \code{motif_energies} (column names) should be present in the 'motif' column. Default: all motifs. #' @param additional_features A data frame, representing additional genomic features (e.g. CpG content, distance to TSS, etc.) for each peak. Note that NA values would be replaced with 0. #' @param min_tss_distance distance from Transcription Start Site (TSS) to classify a peak as an enhancer. Default: 5000. If NULL, no filtering will be performed - use this option if your peaks are already filtered. \cr #' Note that in order to filter peaks that are too close to TSS, the current \code{misha} genome must have an intervals set called \code{intervs.global.tss}. @@ -39,6 +39,10 @@ #' @param spat_num_bins number of spatial bins to use. #' @param spat_bin_size size of each spatial bin. #' @param kmer_sequence_length length of the kmer sequence to use for kmer screening. By default the full sequence is used. +#' @param include_interactions whether to include interactions between motifs / additional fetures as model features. IQ will create interactions between significant additional features and all motifs, and between significant motifs. Default: FALSE +#' @param interaction_threshold threshold for the selecting features to create interactions. IQ learns a linear model on the features and selects the features with coefficients above this threshold. Default: 0.001 +#' @param max_motif_interaction_n maximum number of motifs to consider for interactions. If NULL, all motifs above the interaction_threshold will be considered. Default: NULL +#' @param max_add_interaction_n maximum number of additional features to consider for interactions. If NULL, all additional features above the interaction_threshold will be considered. Default: NULL #' #' @return An instance of `TrajectoryModel` containing model information and results: #' \itemize{ @@ -51,6 +55,7 @@ #' \item initial_prego_models: List, inferred prego models at the initial step of the algorithm. #' \item peak_intervals: data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak used for training. #' } +#' Print the model to see more details. #' #' @inheritParams glmnet::glmnet #' @export @@ -88,7 +93,11 @@ regress_trajectory_motifs <- function(peak_intervals, peaks_size = 500, spat_num_bins = NULL, spat_bin_size = 2, - kmer_sequence_length = 300) { + kmer_sequence_length = 300, + include_interactions = FALSE, + interaction_threshold = 0.001, + max_motif_interaction_n = NULL, + max_add_interaction_n = NULL) { withr::local_options(list(gmax.data.size = 1e9)) if (is.null(atac_scores) && is.null(atac_diff)) { @@ -129,10 +138,20 @@ regress_trajectory_motifs <- function(peak_intervals, min_energy <- -7 + # check if misha package is loaded + if (!requireNamespace("misha", quietly = TRUE)) { + cli_abort("Please install the misha package to use the 'min_tss_distance' parameter") + } + # check that an environment called .misha exists + if (!exists(".misha", envir = .GlobalEnv)) { + cli_abort("Please load the misha package {.code library(misha)} and set the genome using {.code misha::gsetroot()}") + } + # filter peaks that are too close to TSS if (!is.null(min_tss_distance)) { if (!misha::gintervals.exists("intervs.global.tss")) { - cli_abort("Please make sure the current genome ({.field {GROOT}}) has an intervals set called {.val intervs.global.tss}") + misha_root <- .misha$GROOT + cli_abort("Please make sure the current genome ({.field {misha_root}}) has an intervals set called {.val intervs.global.tss}") } tss_dist <- abs(misha::gintervals.neighbors(as.data.frame(peak_intervals), "intervs.global.tss", na.if.notfound = TRUE)$dist) @@ -236,23 +255,63 @@ regress_trajectory_motifs <- function(peak_intervals, } else { diff_filter <- rep(TRUE, nrow(peak_intervals)) } - distilled <- distill_motifs(features, max_motif_num, glm_model2, y = atac_diff_n, seqs = all_seqs[enhancers_filter], norm_seqs = norm_seqs, diff_filter, additional_features = additional_features, pssm_db = pssm_db, lambda = lambda, alpha = alpha, energy_norm_quantile = energy_norm_quantile, norm_energy_max = norm_energy_max, min_energy = min_energy, seed = seed, spat_num_bins = spat_num_bins, spat_bin_size = spat_bin_size, kmer_sequence_length = kmer_sequence_length, n_clust_factor = n_clust_factor, distill_single = FALSE) + + distilled <- distill_motifs( + features, + max_motif_num, + glm_model2, + y = atac_diff_n, + seqs = all_seqs[enhancers_filter], + norm_seqs = norm_seqs, + diff_filter, + additional_features = additional_features, + pssm_db = pssm_db, + lambda = lambda, + alpha = alpha, + energy_norm_quantile = energy_norm_quantile, + norm_energy_max = norm_energy_max, + min_energy = min_energy, + seed = seed, + spat_num_bins = spat_num_bins, + spat_bin_size = spat_bin_size, + kmer_sequence_length = kmer_sequence_length, + n_clust_factor = n_clust_factor, + distill_single = FALSE + ) clust_energies <- distilled$energies + if (include_interactions) { + interactions <- get_significant_interactions( + energies = clust_energies, + y = atac_diff_n, + interaction_threshold = interaction_threshold, additional_features = additional_features, + max_motif_n = max_motif_interaction_n, + max_add_n = max_add_interaction_n, + lambda = lambda, alpha = alpha, seed = seed + ) + clust_energies <- cbind(clust_energies, interactions) + } else { + interactions <- matrix(0, nrow = nrow(clust_energies), ncol = 0) + } + clust_energies_logist <- create_logist_features(clust_energies) + cli_alert_info("Running final regression, number of features: {.val {ncol(clust_energies_logist)}}") model <- glmnet::glmnet(clust_energies_logist, atac_diff_n, binomial(link = "logit"), alpha = alpha, lambda = lambda, parallel = parallel, seed = seed) predicted_diff_score <- logist(glmnet::predict.glmnet(model, newx = clust_energies_logist, type = "link", s = lambda))[, 1] predicted_diff_score <- norm01(predicted_diff_score) predicted_diff_score <- rescale(predicted_diff_score, atac_diff) - cli_alert_success("Finished running model. Number of non-zero coefficients: {.val {sum(model$beta != 0)}} (out of {.val {ncol(clust_energies_logist)}}). R^2: {.val {cor(predicted_diff_score, atac_diff_n)^2}}") # remove additional features from clust_energies normalized_energies <- clust_energies[, setdiff(colnames(clust_energies), colnames(additional_features))] + if (include_interactions) { + normalized_energies <- normalized_energies[, setdiff(colnames(normalized_energies), colnames(interactions))] + } + traj_model <- TrajectoryModel( model = model, motif_models = homogenize_pssm_models(distilled$motifs), @@ -266,6 +325,7 @@ regress_trajectory_motifs <- function(peak_intervals, initial_prego_models = prego_models, peak_intervals = peak_intervals, normalization_intervals = norm_intervals, + interactions = interactions, params = list( energy_norm_quantile = energy_norm_quantile, norm_energy_max = norm_energy_max, @@ -277,6 +337,8 @@ regress_trajectory_motifs <- function(peak_intervals, spat_bin_size = spat_bin_size, distilled_features = distilled$features, n_clust_factor = n_clust_factor, + include_interactions = include_interactions, + interaction_threshold = interaction_threshold, seed = seed ) ) @@ -303,8 +365,8 @@ validate_atac_scores <- function(atac_scores, bin_start, bin_end) { } } -validate_peak_intervals <- function(peak_intervals, columns = c("chrom", "start", "end", "const")) { - # make sure that peak intervals have 'chrom' 'start' 'end' and 'const' columns +validate_peak_intervals <- function(peak_intervals, columns = c("chrom", "start", "end")) { + # make sure that peak intervals have 'chrom' 'start' 'end' columns if (!all(columns %in% colnames(peak_intervals))) { cli_abort("{.field 'peak_intervals'} must have '{.val {columns}} columns", call = parent.frame(1)) } diff --git a/man/TrajectoryModel-class.Rd b/man/TrajectoryModel-class.Rd index 34a75cd..6725efd 100644 --- a/man/TrajectoryModel-class.Rd +++ b/man/TrajectoryModel-class.Rd @@ -59,5 +59,8 @@ A numeric vector of R^2 values for each feature.} \item{\code{normalization_intervals}}{data.frame A data frame containing the intervals used for energy normalization.} + +\item{\code{interactions}}{matrix +A matrix of the interaction features.} }} diff --git a/man/add_interactions.Rd b/man/add_interactions.Rd new file mode 100644 index 0000000..2abd467 --- /dev/null +++ b/man/add_interactions.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interactions.R +\name{add_interactions} +\alias{add_interactions} +\title{Add interactions to a trajectory model} +\usage{ +add_interactions( + traj_model, + interaction_threshold = 0.001, + max_motif_n = NULL, + max_add_n = NULL, + lambda = 0.00001, + alpha = 1, + seed = 60427 +) +} +\arguments{ +\item{interaction_threshold}{threshold for the selecting features to create interactions. IQ learns a linear model on the features and selects the features with coefficients above this threshold. Default: 0.001} + +\item{lambda}{A user supplied \code{lambda} sequence. Typical usage is to +have the program compute its own \code{lambda} sequence based on +\code{nlambda} and \code{lambda.min.ratio}. Supplying a value of +\code{lambda} overrides this. WARNING: use with care. Avoid supplying a +single value for \code{lambda} (for predictions after CV use +\code{predict()} instead). Supply instead a decreasing sequence of +\code{lambda} values. \code{glmnet} relies on its warms starts for speed, +and its often faster to fit a whole path than compute a single fit.} + +\item{alpha}{The elasticnet mixing parameter, with \eqn{0\le\alpha\le 1}. +The penalty is defined as +\deqn{(1-\alpha)/2||\beta||_2^2+\alpha||\beta||_1.} \code{alpha=1} is the +lasso penalty, and \code{alpha=0} the ridge penalty.} + +\item{seed}{random seed for reproducibility.} +} +\value{ +The updated trajectory model with added interactions. +} +\description{ +This function adds significant interactions to a given trajectory model if they do not already exist. +It identifies significant interactions based on the provided threshold and updates the model features +with logistic features derived from these interactions. The trajectory model is then re-learned with +the new features. +} diff --git a/man/infer_trajectory_motifs.Rd b/man/infer_trajectory_motifs.Rd index 28f5902..7aac47b 100644 --- a/man/infer_trajectory_motifs.Rd +++ b/man/infer_trajectory_motifs.Rd @@ -20,9 +20,9 @@ infer_trajectory_motifs( \arguments{ \item{traj_model}{A trajectory model object, as returned by \code{regress_trajectory_motifs}} -\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak, with an additional column named "const" indicating whether the peak is constitutive. Optionally, a column named "cluster" can be added with indication of the cluster of each peak.} +\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak.} -\item{atac_scores}{Optional. A numeric matrix, representing mean ATAC score per bin per peak. Rows: peaks, columns: bins. By default iceqream would regress the last column minus the first column. If you want to regress something else, please either set bin_start or bin_end, or provide \code{atac_diff} instead. If \code{normalize_bins} is TRUE, the scores will be normalized to be between 0 and 1.} +\item{atac_scores}{Optional. A numeric matrix, representing mean ATAC score per bin per peak. Rows: peaks, columns: bins. By default iceqream would regress the last column minus the first column. If you want to regress something else, please either change bin_start or bin_end, or provide \code{atac_diff} instead. If \code{normalize_bins} is TRUE, the scores will be normalized to be between 0 and 1.} \item{bin_start}{the start of the trajectory. Default: 1} diff --git a/man/infer_trajectory_motifs_multi.Rd b/man/infer_trajectory_motifs_multi.Rd index 861bf6e..198439d 100644 --- a/man/infer_trajectory_motifs_multi.Rd +++ b/man/infer_trajectory_motifs_multi.Rd @@ -16,7 +16,7 @@ infer_trajectory_motifs_multi( \arguments{ \item{traj_multi}{A TrajectoryModelMulti object.} -\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak, with an additional column named "const" indicating whether the peak is constitutive. Optionally, a column named "cluster" can be added with indication of the cluster of each peak.} +\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak.} \item{atac_scores}{A list of data frames with ATAC-seq scores for each individual trajectory.} diff --git a/man/iq_regression.Rd b/man/iq_regression.Rd index 39e0c0f..b29ff8a 100644 --- a/man/iq_regression.Rd +++ b/man/iq_regression.Rd @@ -20,13 +20,17 @@ iq_regression( seed = 60427, frac_train = 0.8, filter_model = TRUE, + include_interactions = FALSE, + interaction_threshold = 0.001, + max_motif_interaction_n = NULL, + max_add_interaction_n = NULL, ... ) } \arguments{ -\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak, with an additional column named "const" indicating whether the peak is constitutive. Optionally, a column named "cluster" can be added with indication of the cluster of each peak.} +\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak.} -\item{atac_scores}{Optional. A numeric matrix, representing mean ATAC score per bin per peak. Rows: peaks, columns: bins. By default iceqream would regress the last column minus the first column. If you want to regress something else, please either set bin_start or bin_end, or provide \code{atac_diff} instead. If \code{normalize_bins} is TRUE, the scores will be normalized to be between 0 and 1.} +\item{atac_scores}{Optional. A numeric matrix, representing mean ATAC score per bin per peak. Rows: peaks, columns: bins. By default iceqream would regress the last column minus the first column. If you want to regress something else, please either change bin_start or bin_end, or provide \code{atac_diff} instead. If \code{normalize_bins} is TRUE, the scores will be normalized to be between 0 and 1.} \item{atac_diff}{Optional. A numeric vector representing the differential accessibility between the start and end of the trajectory. Either this or \code{atac_scores} must be provided.} @@ -54,12 +58,20 @@ iq_regression( \item{filter_model}{A logical value indicating whether to filter the model (default is TRUE).} +\item{include_interactions}{whether to include interactions between motifs / additional fetures as model features. IQ will create interactions between significant additional features and all motifs, and between significant motifs. Default: FALSE} + +\item{interaction_threshold}{threshold for the selecting features to create interactions. IQ learns a linear model on the features and selects the features with coefficients above this threshold. Default: 0.001} + +\item{max_motif_interaction_n}{maximum number of motifs to consider for interactions. If NULL, all motifs above the interaction_threshold will be considered. Default: NULL} + +\item{max_add_interaction_n}{maximum number of additional features to consider for interactions. If NULL, all additional features above the interaction_threshold will be considered. Default: NULL} + \item{...}{ Arguments passed on to \code{\link[=regress_trajectory_motifs]{regress_trajectory_motifs}} \describe{ \item{\code{n_clust_factor}}{factor to divide the number of to keep after clustering. e.g. if n_clust_factor > 1 the number of motifs to keep will be reduced by a factor of n_clust_factor. Default: 1} \item{\code{norm_motif_energies}}{A numeric matrix, representing the normalized energy of each motif in each interval of \code{norm_intervals}. If NULL, the function will use \code{pssm_db} to calculate the motif energies. Note that this might take a while.} - \item{\code{pssm_db}}{a data frame with PSSMs ('A', 'C', 'G' and 'T' columns), with an additional column 'motif' containing the motif name. All the motifs in \code{motif_energies} (column names) should be present in the 'motif' column. Default: all motifs in the prego package.} + \item{\code{pssm_db}}{a data frame with PSSMs ('A', 'C', 'G' and 'T' columns), with an additional column 'motif' containing the motif name. All the motifs in \code{motif_energies} (column names) should be present in the 'motif' column. Default: all motifs.} \item{\code{min_tss_distance}}{distance from Transcription Start Site (TSS) to classify a peak as an enhancer. Default: 5000. If NULL, no filtering will be performed - use this option if your peaks are already filtered. \cr Note that in order to filter peaks that are too close to TSS, the current \code{misha} genome must have an intervals set called \code{intervs.global.tss}.} \item{\code{normalize_energies}}{whether to normalize the motif energies. Set this to FALSE if the motif energies are already normalized.} @@ -104,6 +116,7 @@ An instance of \code{TrajectoryModel} containing model information and results: \item initial_prego_models: List, inferred prego models at the initial step of the algorithm. \item peak_intervals: data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak used for training. } +Print the model to see more details. } \description{ Perform IQ regression on peak intervals using the provided ATAC-seq scores, ATAC-seq score differences, normalized intervals, motif energies, and additional features, after dividing the intervals into training and testing sets. diff --git a/man/learn_traj_prego.Rd b/man/learn_traj_prego.Rd index 7064655..8520995 100644 --- a/man/learn_traj_prego.Rd +++ b/man/learn_traj_prego.Rd @@ -23,7 +23,7 @@ learn_traj_prego( ) } \arguments{ -\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak, with an additional column named "const" indicating whether the peak is constitutive. Optionally, a column named "cluster" can be added with indication of the cluster of each peak.} +\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak, with an additional column named "const" indicating whether the peak is constitutive and therefiore shouldn't be used in the regression. Optionally, a column named "cluster" can be added with indication of the cluster of each peak.} \item{atac_diff}{A numeric vector, indicating the ATAC difference of each peak} diff --git a/man/regress_trajectory_motifs.Rd b/man/regress_trajectory_motifs.Rd index 6e0d53b..420afb2 100644 --- a/man/regress_trajectory_motifs.Rd +++ b/man/regress_trajectory_motifs.Rd @@ -39,13 +39,17 @@ regress_trajectory_motifs( peaks_size = 500, spat_num_bins = NULL, spat_bin_size = 2, - kmer_sequence_length = 300 + kmer_sequence_length = 300, + include_interactions = FALSE, + interaction_threshold = 0.001, + max_motif_interaction_n = NULL, + max_add_interaction_n = NULL ) } \arguments{ -\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak, with an additional column named "const" indicating whether the peak is constitutive. Optionally, a column named "cluster" can be added with indication of the cluster of each peak.} +\item{peak_intervals}{A data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak.} -\item{atac_scores}{Optional. A numeric matrix, representing mean ATAC score per bin per peak. Rows: peaks, columns: bins. By default iceqream would regress the last column minus the first column. If you want to regress something else, please either set bin_start or bin_end, or provide \code{atac_diff} instead. If \code{normalize_bins} is TRUE, the scores will be normalized to be between 0 and 1.} +\item{atac_scores}{Optional. A numeric matrix, representing mean ATAC score per bin per peak. Rows: peaks, columns: bins. By default iceqream would regress the last column minus the first column. If you want to regress something else, please either change bin_start or bin_end, or provide \code{atac_diff} instead. If \code{normalize_bins} is TRUE, the scores will be normalized to be between 0 and 1.} \item{atac_diff}{Optional. A numeric vector representing the differential accessibility between the start and end of the trajectory. Either this or \code{atac_scores} must be provided.} @@ -61,7 +65,7 @@ regress_trajectory_motifs( \item{norm_motif_energies}{A numeric matrix, representing the normalized energy of each motif in each interval of \code{norm_intervals}. If NULL, the function will use \code{pssm_db} to calculate the motif energies. Note that this might take a while.} -\item{pssm_db}{a data frame with PSSMs ('A', 'C', 'G' and 'T' columns), with an additional column 'motif' containing the motif name. All the motifs in \code{motif_energies} (column names) should be present in the 'motif' column. Default: all motifs in the prego package.} +\item{pssm_db}{a data frame with PSSMs ('A', 'C', 'G' and 'T' columns), with an additional column 'motif' containing the motif name. All the motifs in \code{motif_energies} (column names) should be present in the 'motif' column. Default: all motifs.} \item{additional_features}{A data frame, representing additional genomic features (e.g. CpG content, distance to TSS, etc.) for each peak. Note that NA values would be replaced with 0.} @@ -123,6 +127,14 @@ lasso penalty, and \code{alpha=0} the ridge penalty.} \item{spat_bin_size}{size of each spatial bin.} \item{kmer_sequence_length}{length of the kmer sequence to use for kmer screening. By default the full sequence is used.} + +\item{include_interactions}{whether to include interactions between motifs / additional fetures as model features. IQ will create interactions between significant additional features and all motifs, and between significant motifs. Default: FALSE} + +\item{interaction_threshold}{threshold for the selecting features to create interactions. IQ learns a linear model on the features and selects the features with coefficients above this threshold. Default: 0.001} + +\item{max_motif_interaction_n}{maximum number of motifs to consider for interactions. If NULL, all motifs above the interaction_threshold will be considered. Default: NULL} + +\item{max_add_interaction_n}{maximum number of additional features to consider for interactions. If NULL, all additional features above the interaction_threshold will be considered. Default: NULL} } \value{ An instance of \code{TrajectoryModel} containing model information and results: @@ -136,6 +148,7 @@ An instance of \code{TrajectoryModel} containing model information and results: \item initial_prego_models: List, inferred prego models at the initial step of the algorithm. \item peak_intervals: data frame, indicating the genomic positions ('chrom', 'start', 'end') of each peak used for training. } +Print the model to see more details. } \description{ This function performs motif regression on ATAC trajectories. It can work with either ATAC scores on trajectory bins or directly with differential accessibility. diff --git a/man/traj_model_variable_response.Rd b/man/traj_model_variable_response.Rd new file mode 100644 index 0000000..b7ce6f0 --- /dev/null +++ b/man/traj_model_variable_response.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/partial-response.R +\name{traj_model_variable_response} +\alias{traj_model_variable_response} +\title{Compute Partial Response for Trajectory Model} +\usage{ +traj_model_variable_response( + traj_model, + vars = NULL, + lambda = 0.00001, + reverse = FALSE +) +} +\arguments{ +\item{traj_model}{A trajectory model object.} + +\item{vars}{A vector of variables to compute the partial response for. Default is NULL (all variables).} + +\item{lambda}{A regularization parameter. Default is 1e-5.} + +\item{reverse}{If TRUE, the partial response is computed for all variables except the ones in \code{vars}. Default is FALSE.} +} +\value{ +A data frame with the computed partial responses, where the column names are the variables. +} +\description{ +This function computes the partial response for a given trajectory model. +} diff --git a/vignettes/articles/iceqream.Rmd b/vignettes/articles/iceqream.Rmd index 42c684c..22b0bcb 100644 --- a/vignettes/articles/iceqream.Rmd +++ b/vignettes/articles/iceqream.Rmd @@ -69,18 +69,15 @@ Let's examine the structure of our input data: ```{r examine_data} # Peak intervals -print("Peak Intervals:") head(peak_intervals) print(paste("Number of peaks:", nrow(peak_intervals))) # ATAC scores -print("\nATAC Scores:") head(atac_scores) print(paste("Number of peaks:", nrow(atac_scores))) print(paste("Number of bins:", ncol(atac_scores))) # Additional features -print("\nAdditional Features:") head(additional_features) print(paste("Number of peaks:", nrow(additional_features))) print(paste("Number of features:", ncol(additional_features))) @@ -127,6 +124,7 @@ traj_model <- iq_regression( seed = 60427, frac_train = 0.8, max_motif_num = 30 + # include_interactions = TRUE # uncomment to include pairwise interactions ) ``` @@ -169,7 +167,7 @@ The model report provides several key pieces of information (from left to right) 2. Response curves show how the accessibility changes as a function of binding energy for each TF. 3. Barplots show the coefficient of each non-linear term of every motif in the model. 4. Spatial model curves show the parameters of the spatial model for each TF. The R² values indicate the predictive power each TF adds to the model. -5. Spatial curves show the frequencey of each TF binding site along the peaks from the bottom 10% (blue) and top 10% (red) of the differential accessibility (dAP) distribution. +5. Spatial curves show the frequency of each TF binding site along the peaks from the bottom 10% (blue) and top 10% (red) of the differential accessibility (dAP) distribution. 6. Boxplots show the distribution of ATAC differences (dAP, y-axis) for bins of binding energy (x-axis) for each TF. ## Renaming the motif models