flatten history

fraunhofer-izi · Nov 23, 2020 · e4d970b · e4d970b
commit e4d970b
Show file tree

Hide file tree

Showing 26 changed files with 7,538 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,50 @@
+.DS_Store
+*.texe
+*.pyc
+*.pdf
+.ipynb_checkpoints
+__pycache__/
+slurmlogs/
+
+# python pickel
+*.pkl
+
+# theano compile dirs
+.theano*
+
+.slurmlogs
+_rslurm*/
+
+# R
+*.tmp.R
+
+# LaTeX:
+tex/build/
+Thesis/build/
+*.synctex
+*.aux
+*.glo
+*.idx
+*.log
+*.toc
+*.ist
+*.acn
+*.acr
+*.alg
+*.bbl
+*.blg
+*.dvi
+*.glg
+*.gls
+*.ilg
+*.ind
+*.lof
+*.lot
+*.maf
+*.mtc
+*.mtc1
+*.out
+*.synctex.gz
+*.fdb_latexmk
+*.fls
+!Thesis/figures/
diff --git a/1-d-moments.ipynb b/1-d-moments.ipynb
diff --git a/DR_DESeq2.R b/DR_DESeq2.R
@@ -0,0 +1,121 @@
+#' ---
+#' jupyter:
+#'   jupytext:
+#'     text_representation:
+#'       extension: .R
+#'       format_name: spin
+#'       format_version: '1.0'
+#'       jupytext_version: 0.8.5
+#'   kernelspec:
+#'     display_name: R
+#'     language: R
+#'     name: ir
+#'   language_info:
+#'     codemirror_mode: r
+#'     file_extension: .r
+#'     mimetype: text/x-r-source
+#'     name: R
+#'     pygments_lexer: r
+#'     version: 3.4.1
+#' ---
+
+#SBATCH --job-name="DESeq on DR"
+#SBATCH --out=.slurmlogs/slurm.dr_deseq2.%j_%N.out
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=7000
+#SBATCH --time=1-00:00:00
+#SBATCH --mail-type=FAIL
+#SBATCH [email protected]
+
+# This script uses DESeq2 and fdrtool in dimensional reduced
+# data to estimate the share of differntially expressed features.
+
+if (!exists("ncomps")){
+    args = commandArgs(trailingOnly = TRUE)
+    if (length(args) < 1){
+        stop(paste0("No numer of components specified.\n",
+            "You can specify the number by providing the it behind ",
+            "--args when calling the script from the command line ",
+            "or setting the 'ncomps' variable in the R enviorment."))
+    }else{
+        ncomps = as.numeric(args[1])
+    }
+}
+ncomps = ceiling(ncomps)
+if (!is.finite(ncomps) || ncomps < 1) {
+    stop("The number of components musst be an integer larger than 0.")
+}
+
+if (!exists("cohort") && length(args) > 1) {
+    cohort = args[2]
+} else if (!exists("cohort")) {
+    cohort = NULL
+}
+
+message(paste("This is ncomps:", ncomps, "; for cohort: ", cohort))
+
+suppressPackageStartupMessages(library(DESeq2))
+suppressPackageStartupMessages(library(fdrtool))
+
+inPath = "data/deg_counts_prior"
+outPath = "data/DR_DESeq2_prior"
+dir.create(outPath, showWarnings = FALSE, recursive = TRUE)
+
+pheno = readRDS(paste0(inPath, "/pheno.RDS"))
+counts = readRDS(paste0(inPath, "/counts.RDS"))
+
+dg_path = paste0(inPath, "/", ncomps - 1, "-comp.RDS")
+if (!file.exists(dg_path)) {
+    message(paste('The file', dg_path, 'does not exist. Stopping...'))
+    quit()
+}
+raw_deg_counts = readRDS(dg_path)
+deg_counts = data.frame(t(raw_deg_counts))
+colnames(deg_counts) = paste0(colnames(counts), "-DR")
+rownames(deg_counts) = rownames(counts)
+
+deg_pheno = pheno
+rownames(deg_pheno) = paste0(rownames(deg_pheno), "-DR")
+pheno$DR = TRUE
+deg_pheno$DR = FALSE
+
+a_counts = cbind(counts, deg_counts)
+a_pheno = rbind(pheno, deg_pheno)
+
+if (!is.null(cohort)) {
+    ind = a_pheno$Cohort == cohort
+    a_counts = a_counts[, ind]
+    a_pheno = a_pheno[ind, ]
+}
+
+broken = apply(is.na(a_counts), 1, any)
+if (any(broken)) {
+    message(paste0("There are ", sum(broken),
+                   " features with NA values that are removed."))
+    a_counts = a_counts[!broken, ]
+}
+
+dds_dat = DESeqDataSetFromMatrix(countData = round(a_counts),
+                                 colData = a_pheno,
+                                 design = ~ DR)
+
+dds = DESeq(dds_dat)
+
+res = results(dds)
+
+
+ind = is.finite(res$pvalue)
+pdf(paste0(outPath, "/fdrtool_plot_", cohort, "_", ncomps - 1, "-comp.pdf"))
+fdr_result = fdrtool(res$pvalue[ind], statistic = "pvalue", verbose = FALSE)
+dev.off()
+
+share_diff_expr = max(fdr_result$qval, na.rm = TRUE)
+
+write(share_diff_expr, paste0(outPath, "/share_diff_expr_",
+                              cohort, "_", ncomps - 1, "-comp.txt"))
+
+data = list(dds_data = dds_dat, dds = dds, dds_result = res,
+            fdr_result = fdr_result)
+saveRDS(data, paste0(outPath, "/DESeq2_results_", cohort, "_",
+                     ncomps - 1, "-comp.RDS"))
diff --git a/DR_inv_digamma.py b/DR_inv_digamma.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+#SBATCH --job-name="invert digamma"
+#SBATCH --out=.slurmlogs/slurm.dr_inv_digamma.%j_%N.out
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=240G
+#SBATCH --time=10-00:00:00
+#SBATCH --mail-type=ALL
+#SBATCH [email protected]
+
+import os
+os.environ['OMP_NUM_THREADS'] = '1' # single thread openMP and BLAS
+if 'SLURM_SUBMIT_DIR' in os.environ:
+    os.chdir(os.environ['SLURM_SUBMIT_DIR'])
+import numpy as np
+import pickle
+from models import marker_model as my_model
+from tqdm import tqdm
+from multiprocessing import Pool
+from DR_plots import *
+from rpy2.robjects import r, globalenv, numpy2ri
+numpy2ri.activate()
+
+dg, sub = dgamma(counts)
+def pool_f(n):
+    red = inv_phi(phi(dg, n * 10))
+    deg_counts = inv_dg(red, sub, parallel = False)
+    with open('data/deg_counts_sd/digamma_deg_' + str(n * 10) + '-comp.pkl', 'wb') as buff:
+        pickle.dump(deg_counts, buff)
+    rPath = 'data/deg_counts_sd/digamma_deg_' + str(n * 10) + '-comp.RDS'
+    nr,nc = deg_counts.shape
+    dg_r = r.matrix(deg_counts, nrow=nr, ncol=nc)
+    r.saveRDS(dg_r, rPath)
+
+tasks = set(range(np.floor(counts.shape[0] / 10).astype(int)))
+tasks = tasks.union(range(10))
+
+if __name__ == "__main__":
+    with Pool() as pool:
+        _ = list(tqdm(pool.imap_unordered(pool_f, tasks), total=len(tasks)))
diff --git a/DR_plots.py b/DR_plots.py
@@ -0,0 +1,171 @@
+import argparse
+import matplotlib
+if __name__ == "__main__":
+    matplotlib.use('Agg')
+
+    parser = argparse.ArgumentParser(description = 'Correlation plot of before and after DR to specified dimension.')
+    parser.add_argument('integers', metavar = 'N', type=int, nargs = '+',
+                       help = 'The number of dimensions to project to. (multiple possible)')
+    parser.add_argument("--res", type = int, default = 200,
+                        help = 'Corrleation plot resolution.')
+    args = parser.parse_args()
+
+import os
+os.environ['OMP_NUM_THREADS'] = '1' # single thread openMP and BLAS
+from matplotlib import pyplot as plt
+import numpy as np
+import pickle
+from tqdm import tqdm_notebook, tqdm
+from multiprocessing import Pool
+from scipy.stats import gaussian_kde, spearmanr, pearsonr
+from scipy.special import digamma
+from scipy.optimize import minimize_scalar
+
+with open(os.environ['degOutPath'] + 'other_data.pkl', 'rb') as f:
+    data = pickle.load(f)
+
+pca = data['pca']
+counts = data['counts']
+n = counts.shape[1]
+
+def beta(counts):
+    alpha = counts + 1
+    sub = np.log(np.sum(alpha, axis = 1, keepdims = True))
+    nom = np.log(alpha)
+    result = nom - sub
+    return result
+
+def inv_beta(log_alpha, seq_depth = 1e8, prior = 0):
+    #bound = np.percentile(log_alpha, 20, axis = 1, keepdims = True)
+    n = log_alpha.shape[1]
+    alpha = np.exp(log_alpha)
+    result = alpha*(seq_depth + n) - 1 - prior
+    result = np.clip(result, 0, None)
+    return result
+
+def re_counts(log_p, seq_depth = 1e8):
+    return np.exp(log_p) * seq_depth
+
+def phi(X, ncomp):
+    ind = range(ncomp)
+    if pca.mean_ is not None:
+        X = X - pca.mean_
+    X_transformed = np.dot(X, pca.components_[ind, :].T)
+    return X_transformed
+
+def inv_phi(X):
+    ind = range(X.shape[1])
+    return np.dot(X, pca.components_[ind, :]) + pca.mean_
+
+def dgamma(counts):
+    seq_depth = np.sum(counts, axis = 1, keepdims = True)
+    alpha = counts + 1
+    sub = digamma(seq_depth + counts.shape[1])
+    nom = digamma(alpha)
+    result = nom - sub
+    return result, seq_depth
+
+def proj_O(log_counts):
+    n = log_counts.shape[1]
+    diff = log_counts.sum(axis = 1, keepdims = True) / n
+    return log_counts - diff
+
+def rev_O(o_counts):
+    level = np.exp(o_counts).sum(axis = 1, keepdims = True)
+    t = np.log(level)
+    return o_counts - t
+
+def _objective(l_count, dg, sub):
+    alpha = np.exp(l_count) + 1
+    nom = digamma(alpha)
+    result = nom - sub
+    return (result - dg)**2
+
+def _single(dat):
+    lalpha, sub = dat
+    res = minimize_scalar(_objective, args = (lalpha, sub), tol = 1e-20)
+    return np.round(np.exp(res.x))
+
+def inv_dg(log_alpha, sub = 18, parallel = True):
+    results = []
+    N = np.repeat(1, 10).shape
+    if len(sub) != N:
+        sub = np.repeat(sub, N)
+    iterator = range(log_alpha.shape[0])
+    if parallel:
+        iterator = tqdm_notebook(iterator)
+    for samp in iterator:
+        tasks = [(lalpha, sub[samp]) for lalpha in log_alpha[samp, :]]
+        if parallel:
+            with Pool() as pool:
+                samp_res = pool.map(_single, tasks)
+        else:
+            samp_res = [_single(t) for t in tasks]
+        results.append(np.array(samp_res))
+    return np.stack(results, axis = 0)
+
+def degenerate(X, ncomp, m=1e8):
+    return inv_beta(rev_O(inv_phi(phi(beta(X), ncomp))), m)
+
+def fast_deg(beta_counts, ncomp, m=1e8, prior=0):
+    return inv_beta(rev_O(inv_phi(phi(beta_counts, ncomp))), m, prior)
+
+def lin_deg(beta_counts, ncomp):
+    return rev_O(inv_phi(phi(beta_counts, ncomp)))
+
+def deplot(x, y, legend = True, ax = None, fig = None, res = 200):
+    # fit an array of size [Ndim, Nsamples]
+    data = np.vstack([x, y])
+    kde = gaussian_kde(data)
+
+    # evaluate on a regular grid
+    minx = min(x)
+    maxx = max(x)
+    miny = min(y)
+    maxy = max(y)
+    xgrid = np.linspace(minx, maxx, res)
+    ygrid = np.linspace(miny, maxy, res)
+    Xgrid, Ygrid = np.meshgrid(xgrid, ygrid)
+    Z = kde.evaluate(np.vstack([Xgrid.ravel(), Ygrid.ravel()]))
+
+    # Plot the result as an image
+    if ax is None:
+        ax = plt
+    if fig is None:
+        fig = plt
+    img = ax.imshow(Z.reshape(Xgrid.shape),
+               origin='lower', aspect='auto',
+               extent=[minx, maxx, miny, maxy],
+               cmap='Blues')
+    if legend:
+        cb = fig.colorbar(img)
+        cb.set_label("density")
+
+    # draw diagonal
+    vals = [max(minx, miny), min(maxx, maxy)]
+    ax.plot(vals, vals, ls = '--', c = '.3')
+
+    # add text
+    spearman = spearmanr(x, y)[0]
+    person = pearsonr(x, y)[0]
+    text = ('Spearman Correlation = {:f}' +
+            '\nPearson Correlation = {:f}').format(spearman, person)
+    xpos = (maxx - minx) * .05 + minx
+    ypos = - (maxy - miny) * .05 + maxy
+    ax.text(xpos, ypos, text,
+            verticalalignment = 'top')
+
+def save_cor_plot(n):
+    deg_counts = lin_deg(beta_counts, n + 1)
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    deplot(beta_counts.flatten(), deg_counts.flatten(), ax = ax, res = args.res)
+    plt.savefig('data/correlations/' + str(n) + '-comp.png')
+
+if __name__ == "__main__":
+    tasks = args.integers
+    if len(tasks) == 1:
+        save_cor_plot(tasks[0])
+    elif len(tasks) > 1:
+        with Pool() as pool:
+            _ = list(tqdm(pool.imap(save_cor_plot, tasks), total=len(tasks)))