Skip to content

Commit

Permalink
flatten history
Browse files Browse the repository at this point in the history
  • Loading branch information
katosh committed Nov 23, 2020
0 parents commit e4d970b
Show file tree
Hide file tree
Showing 26 changed files with 7,538 additions and 0 deletions.
50 changes: 50 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
.DS_Store
*.texe
*.pyc
*.pdf
.ipynb_checkpoints
__pycache__/
slurmlogs/

# python pickel
*.pkl

# theano compile dirs
.theano*

.slurmlogs
_rslurm*/

# R
*.tmp.R

# LaTeX:
tex/build/
Thesis/build/
*.synctex
*.aux
*.glo
*.idx
*.log
*.toc
*.ist
*.acn
*.acr
*.alg
*.bbl
*.blg
*.dvi
*.glg
*.gls
*.ilg
*.ind
*.lof
*.lot
*.maf
*.mtc
*.mtc1
*.out
*.synctex.gz
*.fdb_latexmk
*.fls
!Thesis/figures/
1,395 changes: 1,395 additions & 0 deletions 1-d-moments.ipynb

Large diffs are not rendered by default.

121 changes: 121 additions & 0 deletions DR_DESeq2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#' ---
#' jupyter:
#' jupytext:
#' text_representation:
#' extension: .R
#' format_name: spin
#' format_version: '1.0'
#' jupytext_version: 0.8.5
#' kernelspec:
#' display_name: R
#' language: R
#' name: ir
#' language_info:
#' codemirror_mode: r
#' file_extension: .r
#' mimetype: text/x-r-source
#' name: R
#' pygments_lexer: r
#' version: 3.4.1
#' ---

#SBATCH --job-name="DESeq on DR"
#SBATCH --out=.slurmlogs/slurm.dr_deseq2.%j_%N.out
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=7000
#SBATCH --time=1-00:00:00
#SBATCH --mail-type=FAIL
#SBATCH [email protected]

# This script uses DESeq2 and fdrtool in dimensional reduced
# data to estimate the share of differntially expressed features.

if (!exists("ncomps")){
args = commandArgs(trailingOnly = TRUE)
if (length(args) < 1){
stop(paste0("No numer of components specified.\n",
"You can specify the number by providing the it behind ",
"--args when calling the script from the command line ",
"or setting the 'ncomps' variable in the R enviorment."))
}else{
ncomps = as.numeric(args[1])
}
}
ncomps = ceiling(ncomps)
if (!is.finite(ncomps) || ncomps < 1) {
stop("The number of components musst be an integer larger than 0.")
}

if (!exists("cohort") && length(args) > 1) {
cohort = args[2]
} else if (!exists("cohort")) {
cohort = NULL
}

message(paste("This is ncomps:", ncomps, "; for cohort: ", cohort))

suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(fdrtool))

inPath = "data/deg_counts_prior"
outPath = "data/DR_DESeq2_prior"
dir.create(outPath, showWarnings = FALSE, recursive = TRUE)

pheno = readRDS(paste0(inPath, "/pheno.RDS"))
counts = readRDS(paste0(inPath, "/counts.RDS"))

dg_path = paste0(inPath, "/", ncomps - 1, "-comp.RDS")
if (!file.exists(dg_path)) {
message(paste('The file', dg_path, 'does not exist. Stopping...'))
quit()
}
raw_deg_counts = readRDS(dg_path)
deg_counts = data.frame(t(raw_deg_counts))
colnames(deg_counts) = paste0(colnames(counts), "-DR")
rownames(deg_counts) = rownames(counts)

deg_pheno = pheno
rownames(deg_pheno) = paste0(rownames(deg_pheno), "-DR")
pheno$DR = TRUE
deg_pheno$DR = FALSE

a_counts = cbind(counts, deg_counts)
a_pheno = rbind(pheno, deg_pheno)

if (!is.null(cohort)) {
ind = a_pheno$Cohort == cohort
a_counts = a_counts[, ind]
a_pheno = a_pheno[ind, ]
}

broken = apply(is.na(a_counts), 1, any)
if (any(broken)) {
message(paste0("There are ", sum(broken),
" features with NA values that are removed."))
a_counts = a_counts[!broken, ]
}

dds_dat = DESeqDataSetFromMatrix(countData = round(a_counts),
colData = a_pheno,
design = ~ DR)

dds = DESeq(dds_dat)

res = results(dds)


ind = is.finite(res$pvalue)
pdf(paste0(outPath, "/fdrtool_plot_", cohort, "_", ncomps - 1, "-comp.pdf"))
fdr_result = fdrtool(res$pvalue[ind], statistic = "pvalue", verbose = FALSE)
dev.off()

share_diff_expr = max(fdr_result$qval, na.rm = TRUE)

write(share_diff_expr, paste0(outPath, "/share_diff_expr_",
cohort, "_", ncomps - 1, "-comp.txt"))

data = list(dds_data = dds_dat, dds = dds, dds_result = res,
fdr_result = fdr_result)
saveRDS(data, paste0(outPath, "/DESeq2_results_", cohort, "_",
ncomps - 1, "-comp.RDS"))
40 changes: 40 additions & 0 deletions DR_inv_digamma.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python
#SBATCH --job-name="invert digamma"
#SBATCH --out=.slurmlogs/slurm.dr_inv_digamma.%j_%N.out
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=240G
#SBATCH --time=10-00:00:00
#SBATCH --mail-type=ALL
#SBATCH [email protected]

import os
os.environ['OMP_NUM_THREADS'] = '1' # single thread openMP and BLAS
if 'SLURM_SUBMIT_DIR' in os.environ:
os.chdir(os.environ['SLURM_SUBMIT_DIR'])
import numpy as np
import pickle
from models import marker_model as my_model
from tqdm import tqdm
from multiprocessing import Pool
from DR_plots import *
from rpy2.robjects import r, globalenv, numpy2ri
numpy2ri.activate()

dg, sub = dgamma(counts)
def pool_f(n):
red = inv_phi(phi(dg, n * 10))
deg_counts = inv_dg(red, sub, parallel = False)
with open('data/deg_counts_sd/digamma_deg_' + str(n * 10) + '-comp.pkl', 'wb') as buff:
pickle.dump(deg_counts, buff)
rPath = 'data/deg_counts_sd/digamma_deg_' + str(n * 10) + '-comp.RDS'
nr,nc = deg_counts.shape
dg_r = r.matrix(deg_counts, nrow=nr, ncol=nc)
r.saveRDS(dg_r, rPath)

tasks = set(range(np.floor(counts.shape[0] / 10).astype(int)))
tasks = tasks.union(range(10))

if __name__ == "__main__":
with Pool() as pool:
_ = list(tqdm(pool.imap_unordered(pool_f, tasks), total=len(tasks)))
171 changes: 171 additions & 0 deletions DR_plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import argparse
import matplotlib
if __name__ == "__main__":
matplotlib.use('Agg')

parser = argparse.ArgumentParser(description = 'Correlation plot of before and after DR to specified dimension.')
parser.add_argument('integers', metavar = 'N', type=int, nargs = '+',
help = 'The number of dimensions to project to. (multiple possible)')
parser.add_argument("--res", type = int, default = 200,
help = 'Corrleation plot resolution.')
args = parser.parse_args()

import os
os.environ['OMP_NUM_THREADS'] = '1' # single thread openMP and BLAS
from matplotlib import pyplot as plt
import numpy as np
import pickle
from tqdm import tqdm_notebook, tqdm
from multiprocessing import Pool
from scipy.stats import gaussian_kde, spearmanr, pearsonr
from scipy.special import digamma
from scipy.optimize import minimize_scalar

with open(os.environ['degOutPath'] + 'other_data.pkl', 'rb') as f:
data = pickle.load(f)

pca = data['pca']
counts = data['counts']
n = counts.shape[1]

def beta(counts):
alpha = counts + 1
sub = np.log(np.sum(alpha, axis = 1, keepdims = True))
nom = np.log(alpha)
result = nom - sub
return result

def inv_beta(log_alpha, seq_depth = 1e8, prior = 0):
#bound = np.percentile(log_alpha, 20, axis = 1, keepdims = True)
n = log_alpha.shape[1]
alpha = np.exp(log_alpha)
result = alpha*(seq_depth + n) - 1 - prior
result = np.clip(result, 0, None)
return result

def re_counts(log_p, seq_depth = 1e8):
return np.exp(log_p) * seq_depth

def phi(X, ncomp):
ind = range(ncomp)
if pca.mean_ is not None:
X = X - pca.mean_
X_transformed = np.dot(X, pca.components_[ind, :].T)
return X_transformed

def inv_phi(X):
ind = range(X.shape[1])
return np.dot(X, pca.components_[ind, :]) + pca.mean_

def dgamma(counts):
seq_depth = np.sum(counts, axis = 1, keepdims = True)
alpha = counts + 1
sub = digamma(seq_depth + counts.shape[1])
nom = digamma(alpha)
result = nom - sub
return result, seq_depth

def proj_O(log_counts):
n = log_counts.shape[1]
diff = log_counts.sum(axis = 1, keepdims = True) / n
return log_counts - diff

def rev_O(o_counts):
level = np.exp(o_counts).sum(axis = 1, keepdims = True)
t = np.log(level)
return o_counts - t

def _objective(l_count, dg, sub):
alpha = np.exp(l_count) + 1
nom = digamma(alpha)
result = nom - sub
return (result - dg)**2

def _single(dat):
lalpha, sub = dat
res = minimize_scalar(_objective, args = (lalpha, sub), tol = 1e-20)
return np.round(np.exp(res.x))

def inv_dg(log_alpha, sub = 18, parallel = True):
results = []
N = np.repeat(1, 10).shape
if len(sub) != N:
sub = np.repeat(sub, N)
iterator = range(log_alpha.shape[0])
if parallel:
iterator = tqdm_notebook(iterator)
for samp in iterator:
tasks = [(lalpha, sub[samp]) for lalpha in log_alpha[samp, :]]
if parallel:
with Pool() as pool:
samp_res = pool.map(_single, tasks)
else:
samp_res = [_single(t) for t in tasks]
results.append(np.array(samp_res))
return np.stack(results, axis = 0)

def degenerate(X, ncomp, m=1e8):
return inv_beta(rev_O(inv_phi(phi(beta(X), ncomp))), m)

def fast_deg(beta_counts, ncomp, m=1e8, prior=0):
return inv_beta(rev_O(inv_phi(phi(beta_counts, ncomp))), m, prior)

def lin_deg(beta_counts, ncomp):
return rev_O(inv_phi(phi(beta_counts, ncomp)))

def deplot(x, y, legend = True, ax = None, fig = None, res = 200):
# fit an array of size [Ndim, Nsamples]
data = np.vstack([x, y])
kde = gaussian_kde(data)

# evaluate on a regular grid
minx = min(x)
maxx = max(x)
miny = min(y)
maxy = max(y)
xgrid = np.linspace(minx, maxx, res)
ygrid = np.linspace(miny, maxy, res)
Xgrid, Ygrid = np.meshgrid(xgrid, ygrid)
Z = kde.evaluate(np.vstack([Xgrid.ravel(), Ygrid.ravel()]))

# Plot the result as an image
if ax is None:
ax = plt
if fig is None:
fig = plt
img = ax.imshow(Z.reshape(Xgrid.shape),
origin='lower', aspect='auto',
extent=[minx, maxx, miny, maxy],
cmap='Blues')
if legend:
cb = fig.colorbar(img)
cb.set_label("density")

# draw diagonal
vals = [max(minx, miny), min(maxx, maxy)]
ax.plot(vals, vals, ls = '--', c = '.3')

# add text
spearman = spearmanr(x, y)[0]
person = pearsonr(x, y)[0]
text = ('Spearman Correlation = {:f}' +
'\nPearson Correlation = {:f}').format(spearman, person)
xpos = (maxx - minx) * .05 + minx
ypos = - (maxy - miny) * .05 + maxy
ax.text(xpos, ypos, text,
verticalalignment = 'top')

def save_cor_plot(n):
deg_counts = lin_deg(beta_counts, n + 1)
fig = plt.figure()
ax = fig.add_subplot(111)
deplot(beta_counts.flatten(), deg_counts.flatten(), ax = ax, res = args.res)
plt.savefig('data/correlations/' + str(n) + '-comp.png')

if __name__ == "__main__":
tasks = args.integers
if len(tasks) == 1:
save_cor_plot(tasks[0])
elif len(tasks) > 1:
with Pool() as pool:
_ = list(tqdm(pool.imap(save_cor_plot, tasks), total=len(tasks)))
Loading

0 comments on commit e4d970b

Please sign in to comment.