Merge pull request #1 from lli-rice/master
add end to end file
Showing 6 changed files with 248 additions and 15 deletions.
@@ -1,6 +1,17 @@
-name: template
+name: ANDES
 channels:
 - defaults
+- conda-forge
 dependencies:
-- python=3.8.2
+- pip=23.3=py38h06a4308_0
+- python=3.8.18=h955ad1f_0
+- pip:
+  - joblib==1.3.2
+  - numpy==1.24.4
+  - pandas==2.0.3
+  - python-dateutil==2.8.2
+  - pytz==2023.3.post1
+  - scikit-learn==1.3.2
+  - scipy==1.10.1
+  - six==1.16.0
+  - threadpoolctl==3.2.0
+  - tzdata==2023.3
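For reference, the updated spec above can be instantiated with conda. The file name environment.yml is an assumption here; this diff does not show what the spec file is actually called:

conda env create -f environment.yml
conda activate ANDES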
This file was deleted.
This file was deleted.
@@ -0,0 +1,120 @@
import argparse
import numpy as np
import pandas as pd
import sys
import random
from functools import partial
from collections import defaultdict
from sklearn import metrics
from multiprocessing import Pool
import load_data as ld
import set_analysis_func as func


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='calculate gene set similarity in an embedding space')
    parser.add_argument('--emb', dest='emb_f', type=str,
                        help='input file path and file name for embedding')
    parser.add_argument('--genelist', dest='genelist_f', type=str,
                        help='input file path and file name for embedding genes')
    parser.add_argument('--geneset1', dest='geneset1_f', type=str,
                        help='input file path and file name for the first gene set database')
    parser.add_argument('--geneset2', dest='geneset2_f', type=str,
                        help='input file path and file name for the second gene set database')
    parser.add_argument('--out', dest='out_f', type=str,
                        help='output file path and name')
    parser.add_argument('-n', '--n_processor', dest='n_process', type=int, default=10,
                        help='number of processors')
    parser.add_argument('--min', dest='min_size', type=int, default=10,
                        help='the minimum number of genes in a set')
    parser.add_argument('--max', dest='max_size', type=int, default=300,
                        help='the maximum number of genes in a set')

    args = parser.parse_args()

    # load embedding: one row per gene, plus the matching list of gene ids
    node_vectors = np.loadtxt(args.emb_f, delimiter=',')
    node_list = []
    with open(args.genelist_f, 'r') as f:
        for line in f:
            node_list.append(line.strip())

    print('finished loading embedding')
    if len(node_list) != node_vectors.shape[0]:
        print('number of embedding rows must match the number of gene ids')
        sys.exit()
    print(str(len(node_list)), 'genes')

    # pairwise cosine similarity between all gene embeddings
    S = metrics.pairwise.cosine_similarity(node_vectors, node_vectors)

    # create gene to embedding id mapping; unknown genes map to -1
    g_node2index = {j: i for i, j in enumerate(node_list)}
    g_index2node = {i: j for i, j in enumerate(node_list)}
    g_node2index = defaultdict(lambda: -1, g_node2index)

    # load the two gene set databases and keep terms within the size bounds
    geneset1 = ld.load_gmt(args.geneset1_f)
    geneset1_indices = ld.term2indexes(
        geneset1, g_node2index, upper=args.max_size, lower=args.min_size)

    geneset2 = ld.load_gmt(args.geneset2_f)
    geneset2_indices = ld.term2indexes(
        geneset2, g_node2index, upper=args.max_size, lower=args.min_size)

    # background genes: every annotated gene that is also in the embedding
    geneset1_all_genes = set()
    for x in geneset1:
        geneset1_all_genes = geneset1_all_genes.union(geneset1[x])

    geneset1_all_genes = geneset1_all_genes.intersection(node_list)
    geneset1_all_indices = [g_node2index[x] for x in geneset1_all_genes]
    geneset1_terms = list(geneset1_indices)

    geneset2_all_genes = set()
    for x in geneset2:
        geneset2_all_genes = geneset2_all_genes.union(geneset2[x])

    geneset2_all_genes = geneset2_all_genes.intersection(node_list)
    geneset2_all_indices = [g_node2index[x] for x in geneset2_all_genes]
    geneset2_terms = list(geneset2_indices)

    print('database 1:', str(len(geneset1_terms)), 'terms,',
          str(len(geneset1_all_indices)), 'background genes')
    print('database 2:', str(len(geneset2_terms)), 'terms,',
          str(len(geneset2_all_indices)), 'background genes')

    # fix every argument except the term pair, so workers receive only (term1, term2)
    f = partial(func.andes, matrix=S, g1_term2index=geneset1_indices,
                g2_term2index=geneset2_indices,
                g1_population=geneset1_all_indices,
                g2_population=geneset2_all_indices)

    # enumerate every term pair and shuffle the evaluation order
    all_terms = [(x, y) for x in geneset1_terms for y in geneset2_terms]
    shuffled_terms = random.sample(all_terms, len(all_terms))

    geneset1_term2index = {j: i for i, j in enumerate(geneset1_terms)}
    geneset2_term2index = {j: i for i, j in enumerate(geneset2_terms)}

    with Pool(args.n_process) as p:
        rets = p.map(f, shuffled_terms)

    # assemble the z-score matrix, undoing the shuffle
    zscores = np.zeros((len(geneset1_terms), len(geneset2_terms)))
    for i, (x, y) in enumerate(shuffled_terms):
        idx = geneset1_term2index[x]
        idy = geneset2_term2index[y]
        zscores[idx, idy] = rets[i][1]

    zscores = pd.DataFrame(zscores, index=geneset1_terms, columns=geneset2_terms)
    zscores.to_csv(args.out_f, sep=',')
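Both scripts lean on helpers from the repository's load_data and set_analysis_func modules, whose implementations are not part of this diff. Judging only from the call sites, load_gmt plausibly parses a GMT file into a term-to-gene-set dict, term2indexes maps each term's genes to embedding row indices while enforcing the size bounds, and func.andes / func.gsea_andes appear to return tuples whose second element is a z-score. A minimal sketch of the two load_data helpers under those assumptions (hypothetical, not the committed code):

# Hypothetical sketch of the load_data helpers, inferred from their call sites.
def load_gmt(path):
    # GMT format: term <tab> description <tab> gene1 <tab> gene2 ...
    term2genes = {}
    with open(path) as f:
        for line in f:
            fields = line.strip().split('\t')
            term2genes[fields[0]] = set(fields[2:])
    return term2genes

def term2indexes(term2genes, node2index, upper=300, lower=10):
    # Map each term's genes to embedding row indices; genes absent from the
    # embedding map to -1 and are dropped. Keep only terms whose mapped size
    # falls within [lower, upper].
    out = {}
    for term, genes in term2genes.items():
        idx = [node2index[g] for g in genes if node2index[g] != -1]
        if lower <= len(idx) <= upper:
            out[term] = idx
    return out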
@@ -0,0 +1,93 @@
import argparse
import numpy as np
import pandas as pd
import sys
from functools import partial
from collections import defaultdict
from sklearn import metrics
from multiprocessing import Pool
import load_data as ld
import set_analysis_func as func


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='calculate gene set similarity in an embedding space')
    parser.add_argument('--emb', dest='emb_f', type=str,
                        help='input file path and file name for embedding')
    parser.add_argument('--genelist', dest='genelist_f', type=str,
                        help='input file path and file name for embedding genes')
    parser.add_argument('--geneset', dest='geneset_f', type=str,
                        help='input file path and file name for the gene set database')
    parser.add_argument('--rankedlist', dest='rankedlist_f', type=str,
                        help='input file path and file name for the ranked gene list')
    parser.add_argument('--out', dest='out_f', type=str,
                        help='output file path and name')
    parser.add_argument('-n', '--n_processor', dest='n_process', type=int, default=10,
                        help='number of processors')
    parser.add_argument('--min', dest='min_size', type=int, default=10,
                        help='the minimum number of genes in a set')
    parser.add_argument('--max', dest='max_size', type=int, default=300,
                        help='the maximum number of genes in a set')

    args = parser.parse_args()

    # load embedding: one row per gene, plus the matching list of gene ids
    node_vectors = np.loadtxt(args.emb_f, delimiter=',')
    node_list = []
    with open(args.genelist_f, 'r') as f:
        for line in f:
            node_list.append(line.strip())

    # pairwise cosine similarity between all gene embeddings
    S = metrics.pairwise.cosine_similarity(node_vectors, node_vectors)

    print('finished loading embedding')
    if len(node_list) != node_vectors.shape[0]:
        print('number of embedding rows must match the number of gene ids')
        sys.exit()
    print(str(len(node_list)), 'genes')

    # create gene to embedding id mapping; unknown genes map to -1
    g_node2index = {j: i for i, j in enumerate(node_list)}
    g_index2node = {i: j for i, j in enumerate(node_list)}
    g_node2index = defaultdict(lambda: -1, g_node2index)

    # load the gene set database and keep terms within the size bounds
    geneset = ld.load_gmt(args.geneset_f)
    geneset_indices = ld.term2indexes(
        geneset, g_node2index, upper=args.max_size, lower=args.min_size)

    # background genes: every annotated gene that is also in the embedding
    geneset_all_genes = set()
    for x in geneset:
        geneset_all_genes = geneset_all_genes.union(geneset[x])

    geneset_all_genes = geneset_all_genes.intersection(node_list)
    geneset_all_indices = [g_node2index[x] for x in geneset_all_genes]
    geneset_terms = list(geneset_indices.keys())

    print('finished loading gene set database')
    print(str(len(geneset_terms)), 'terms,', str(len(geneset_all_indices)), 'background genes')

    # load the ranked gene list and map it to embedding indices
    ranked_list = pd.read_csv(args.rankedlist_f, sep='\t',
                              index_col=0, header=None)
    ranked_list = [str(y) for y in ranked_list.index]
    ranked_list = [g_node2index[y] for y in ranked_list if y in node_list]

    print('finished loading ranked list')
    print(str(len(ranked_list)), 'genes')

    # fix every argument except the term, so workers receive one term each
    f = partial(func.gsea_andes, ranked_list=ranked_list, matrix=S,
                term2indices=geneset_indices,
                annotated_indices=geneset_all_indices)

    with Pool(args.n_process) as p:
        rets = p.map(f, geneset_terms)

    # one z-score per term, written as a single-column table
    zscores = pd.DataFrame([x[1] for x in rets], index=geneset_terms, columns=['z-score'])
    zscores.to_csv(args.out_f, sep=',')
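With either script, the file written to --out is a comma-separated z-score table that can be inspected with pandas; 'results.csv' below stands in for whatever path was passed to --out:

import pandas as pd

zscores = pd.read_csv('results.csv', index_col=0)
print(zscores.head())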