diff --git a/README.md b/README.md index 1ee6ee6..ebc6b7c 100644 --- a/README.md +++ b/README.md @@ -12,3 +12,24 @@ ANDES's method has two main functions: These functions are implemented in `src/set_analysis_fun.py`. The demo jupyter notebook (`demo.ipynb`) illustrates how to use these two funcions + + +## Usage +This project uses conda to manage the required packages and set up a virtual environment. Once conda is installed on your machine, get started by setting up the virtual environment. + +```sh +conda env create -f env.yml +conda activate ANDES +``` + +ANDES can be run through the command line as follows: + +```sh +python src/andes.py --emb embedding_file.csv --genelist embedding_gene_ids.txt --geneset1 first_gene_set_database.gmt --geneset2 second_gene_set_database.gmt --out output_file.csv -n num_processor +``` + +ANDES with GSEA can be run through the command line as follows: + +```sh +python src/andes_gsea.py --emb embedding_file.csv --genelist embedding_gene_ids.txt --geneset gene_set_database.gmt --rankedlist ranked_genes.txt --out output_file.csv -n num_processor +``` diff --git a/env.yml b/env.yml index 7d71ed3..9038558 100644 --- a/env.yml +++ b/env.yml @@ -1,6 +1,17 @@ -name: template +name: ANDES channels: - defaults - - conda-forge dependencies: - - python=3.8.2 + - pip=23.3=py38h06a4308_0 + - python=3.8.18=h955ad1f_0 + - pip: + - joblib==1.3.2 + - numpy==1.24.4 + - pandas==2.0.3 + - python-dateutil==2.8.2 + - pytz==2023.3.post1 + - scikit-learn==1.3.2 + - scipy==1.10.1 + - six==1.16.0 + - threadpoolctl==3.2.0 + - tzdata==2023.3 diff --git a/ref/README.md b/ref/README.md deleted file mode 100644 index 428b078..0000000 --- a/ref/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# References - -This optional directory is a place where you can dump reference material, including papers, slides, or any other useful background materials. 
diff --git a/results/README.md b/results/README.md deleted file mode 100644 index 2a3e86b..0000000 --- a/results/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# Project results - -## Subdirectory structure - -Add documentation to explain any subdirectories. For example, they can be arranged by date / version number. - -## Files - -Add a simple bullet point to explain the source and contents of each file, including column descriptors or captions for figures. diff --git a/src/andes.py b/src/andes.py new file mode 100644 index 0000000..6ed536b --- /dev/null +++ b/src/andes.py @@ -0,0 +1,120 @@ +import argparse +import numpy as np +import pandas as pd +import sys +import random +from functools import partial +from collections import defaultdict +from sklearn import metrics +from multiprocessing import Pool +import load_data as ld +import set_analysis_func as func + + + +if __name__=='__main__': + parser = argparse.ArgumentParser( + description='calculate gene sets similarity in a embedding space') + parser.add_argument('--emb', dest='emb_f', type=str, + help='input file path and file name for embedding') + parser.add_argument('--genelist', dest='genelist_f', type=str, + help='input file path and file name for embedding genes') + parser.add_argument('--geneset1', dest='geneset1_f', type=str, + help='input file path and file name for the first gene set database') + parser.add_argument('--geneset2', dest='geneset2_f', type=str, + help='input file path and file name for the second gene set database') + parser.add_argument('--out', dest='out_f', type=str, + help='output file path and name') + parser.add_argument('-n', '--n_processor', dest='n_process', type=int, default=10, + help='number of processors') + parser.add_argument('--min', dest='min_size', type=int, default=10, + help='the minimum number of genes in a set') + parser.add_argument('--max', dest='max_size', type=int, default=300, + help='the maximum number of genes in a set') + + args = parser.parse_args() + + # load 
embedding + node_vectors = np.loadtxt(args.emb_f, delimiter=',') + node_list = [] + with open(args.genelist_f, 'r') as f: + for line in f: + node_list.append(line.strip()) + + print('finish load embedding') + if len(node_list)!=node_vectors.shape[0]: + print('embedding dimension must match the number of gene ids') + sys.exit() + print(str(len(node_list)), 'genes') + + S = metrics.pairwise.cosine_similarity(node_vectors, node_vectors) + + # create gene to embedding id mapping + g_node2index = {j:i for i,j in enumerate(node_list)} + g_index2node = {i:j for i,j in enumerate(node_list)} + g_node2index = defaultdict(lambda:-1, g_node2index) + + # load gene set data + geneset1 = ld.load_gmt(args.geneset1_f) + + geneset1_indices = ld.term2indexes( + geneset1, g_node2index, upper=args.max_size, lower=args.min_size) + + geneset2 = ld.load_gmt(args.geneset2_f) + + geneset2_indices = ld.term2indexes( + geneset2, g_node2index, upper=args.max_size, lower=args.min_size) + + # get background genes + geneset1_all_genes = set() + for x in geneset1: + geneset1_all_genes = geneset1_all_genes.union(geneset1[x]) + + geneset1_all_genes = geneset1_all_genes.intersection(node_list) + geneset1_all_indices = [g_node2index[x] for x in geneset1_all_genes] + geneset1_terms = list(geneset1_indices) + + geneset2_all_genes = set() + for x in geneset2: + geneset2_all_genes = geneset2_all_genes.union(geneset2[x]) + + geneset2_all_genes = geneset2_all_genes.intersection(node_list) + geneset2_all_indices = [g_node2index[x] for x in geneset2_all_genes] + geneset2_terms = list(geneset2_indices) + + print('database 1:', str(len(geneset1_terms)), 'terms,', str(len(geneset1_all_indices)), 'background genes') + print('database 2:', str(len(geneset2_terms)), 'terms,', str(len(geneset2_all_indices)), 'background genes') + + + # define andes function + f = partial(func.andes, matrix=S, g1_term2index=geneset1_indices, + g2_term2index=geneset2_indices, + g1_population=geneset1_all_indices, + 
g2_population=geneset2_all_indices) + + all_terms = [(x,y) for x in geneset1_terms for y in geneset2_terms] + shuffled_terms = random.sample(all_terms, len(all_terms)) + + + + geneset1_term2index = {j:i for i,j in enumerate(geneset1_terms)} + geneset2_term2index = {j:i for i,j in enumerate(geneset2_terms)} + + with Pool(args.n_process) as p: + rets = p.map(f, shuffled_terms) + + + zscores = np.zeros((len(geneset1_terms), len(geneset2_terms))) + for i, (x,y) in enumerate(shuffled_terms): + idx = geneset1_term2index[x] + idy = geneset2_term2index[y] + zscores[idx, idy] = rets[i][1] + + + zscores = pd.DataFrame(zscores, index=geneset1_terms, columns=geneset2_terms) + zscores.to_csv(args.out_f, sep=',') + + + + + diff --git a/src/andes_gsea.py b/src/andes_gsea.py new file mode 100644 index 0000000..b1ca496 --- /dev/null +++ b/src/andes_gsea.py @@ -0,0 +1,93 @@ +import argparse +import numpy as np +import pandas as pd +import sys +from functools import partial +from collections import defaultdict +from sklearn import metrics +from multiprocessing import Pool +import load_data as ld +import set_analysis_func as func + + +if __name__=='__main__': + parser = argparse.ArgumentParser( + description='calculate gene sets similarity in a embedding space') + parser.add_argument('--emb', dest='emb_f', type=str, + help='input file path and file name for embedding') + parser.add_argument('--genelist', dest='genelist_f', type=str, + help='input file path and file name for embedding genes') + parser.add_argument('--geneset', dest='geneset_f', type=str, + help='input file path and file name for the gene set database') + parser.add_argument('--rankedlist', dest='rankedlist_f', type=str, + help='input file path and file name for the ranked gene list') + parser.add_argument('--out', dest='out_f', type=str, + help='output file path and name') + parser.add_argument('-n', '--n_processor', dest='n_process', type=int, default=10, + help='number of processors') + parser.add_argument('--min', 
dest='min_size', type=int, default=10, + help='the minimum number of genes in a set') + parser.add_argument('--max', dest='max_size', type=int, default=300, + help='the maximum number of genes in a set') + + args = parser.parse_args() + + # load embedding + node_vectors = np.loadtxt(args.emb_f, delimiter=',') + node_list = [] + with open(args.genelist_f, 'r') as f: + for line in f: + node_list.append(line.strip()) + + S = metrics.pairwise.cosine_similarity(node_vectors, node_vectors) + + print('finish load embedding') + if len(node_list)!=node_vectors.shape[0]: + print('embedding dimension must match the number of gene ids') + sys.exit() + print(str(len(node_list)), 'genes') + + + # create gene to embedding id mapping + g_node2index = {j:i for i,j in enumerate(node_list)} + g_index2node = {i:j for i,j in enumerate(node_list)} + g_node2index = defaultdict(lambda:-1, g_node2index) + + + # load gene set data + geneset = ld.load_gmt(args.geneset_f) + + geneset_indices = ld.term2indexes( + geneset, g_node2index, upper=args.max_size, lower=args.min_size) + + + # get background genes + geneset_all_genes = set() + for x in geneset: + geneset_all_genes = geneset_all_genes.union(geneset[x]) + + geneset_all_genes = geneset_all_genes.intersection(node_list) + geneset_all_indices = [g_node2index[x] for x in geneset_all_genes] + geneset_terms = list(geneset_indices.keys()) + + print('finish load gene set database') + print(str(len(geneset_terms)), 'terms,', str(len(geneset_all_indices)), 'background genes') + + #load ranked list + ranked_list = pd.read_csv(args.rankedlist_f, sep='\t', + index_col=0, header=None) + ranked_list = [str(y) for y in ranked_list.index] + ranked_list = [g_node2index[y] for y in ranked_list if y in node_list] + + print('finish load ranked list') + print(str(len(ranked_list)), 'genes') + + f = partial(func.gsea_andes, ranked_list=ranked_list, matrix=S, + term2indices=geneset_indices, + annotated_indices=geneset_all_indices) + + with Pool(args.n_process) 
as p: + rets = p.map(f, geneset_terms) + + zscores = pd.DataFrame([x[1] for x in rets], index=geneset_terms, columns=['z-score']) + zscores.to_csv(args.out_f, sep=',')