Merge pull request #1 from lli-rice/master
add end to end file
lli-rice authored Nov 12, 2023
2 parents 9fb7f4b + b8a23cd commit f983132
Showing 6 changed files with 248 additions and 15 deletions.
21 changes: 21 additions & 0 deletions README.md
@@ -12,3 +12,24 @@ ANDES's method has two main functions:

These functions are implemented in `src/set_analysis_func.py`. The demo
Jupyter notebook (`demo.ipynb`) illustrates how to use these two functions.


## Usage
This project uses conda to manage the required packages and set up a virtual environment. Once conda is installed on your machine, get started by setting up the virtual environment:

```sh
conda env create -f env.yml
conda activate ANDES
```

ANDES can be run through the command line as follows:

```sh
python src/andes.py --emb embedding_file.csv --genelist embedding_gene_ids.txt --geneset1 first_gene_set_database.gmt --geneset2 second_gene_set_database.gmt --out output_file.csv -n num_processor
```
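
The flags above expect plain-text inputs. As a hedged sketch of the expected file shapes, inferred from how `src/andes.py` reads each file (all file names, gene ids, and values below are illustrative, not shipped with the repo):

```python
# Illustrative only: writes tiny, hypothetical inputs in the shapes andes.py expects.
import numpy as np

# --emb: comma-separated matrix with one embedding vector per row
np.savetxt('embedding_file.csv', np.random.rand(3, 4), delimiter=',')

# --genelist: one gene id per line, in the same order as the embedding rows
with open('embedding_gene_ids.txt', 'w') as f:
    f.write('GENE_A\nGENE_B\nGENE_C\n')

# --geneset1/--geneset2: GMT format, tab-separated:
# <term> <description> <gene1> <gene2> ...
with open('first_gene_set_database.gmt', 'w') as f:
    f.write('TERM_1\tan example term\tGENE_A\tGENE_B\n')
```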

ANDES-based GSEA (gene set enrichment analysis against a ranked gene list) can be run through the command line as follows:

```sh
python src/andes_gsea.py --emb embedding_file.csv --genelist embedding_gene_ids.txt --geneset gene_set_database.gmt --rankedlist ranked_genes.txt --out output_file.csv -n num_processor
```
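
`src/andes_gsea.py` reads `--rankedlist` with `pd.read_csv(..., sep='\t', index_col=0, header=None)` and keeps only the index, so the file is headerless and tab-separated, with gene ids in the first column ordered by rank. A hypothetical example that writes such a file (ids and scores made up):

```python
# Illustrative only: a hypothetical ranked gene list for andes_gsea.py.
rows = [('GENE_B', 2.7), ('GENE_A', 1.3), ('GENE_C', -0.4)]
with open('ranked_genes.txt', 'w') as f:
    for gene, score in rows:  # ordered from strongest to weakest signal
        f.write(f'{gene}\t{score}\n')
```
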
17 changes: 14 additions & 3 deletions env.yml
```diff
@@ -1,6 +1,17 @@
-name: template
+name: ANDES
 channels:
   - defaults
   - conda-forge
 dependencies:
-  - python=3.8.2
+  - pip=23.3=py38h06a4308_0
+  - python=3.8.18=h955ad1f_0
+  - pip:
+    - joblib==1.3.2
+    - numpy==1.24.4
+    - pandas==2.0.3
+    - python-dateutil==2.8.2
+    - pytz==2023.3.post1
+    - scikit-learn==1.3.2
+    - scipy==1.10.1
+    - six==1.16.0
+    - threadpoolctl==3.2.0
+    - tzdata==2023.3
```
3 changes: 0 additions & 3 deletions ref/README.md

This file was deleted.

9 changes: 0 additions & 9 deletions results/README.md

This file was deleted.

120 changes: 120 additions & 0 deletions src/andes.py
@@ -0,0 +1,120 @@

```python
import argparse
import random
import sys
from collections import defaultdict
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
from sklearn import metrics

import load_data as ld
import set_analysis_func as func


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='calculate gene set similarity in an embedding space')
    parser.add_argument('--emb', dest='emb_f', type=str,
                        help='input file path and file name for the embedding')
    parser.add_argument('--genelist', dest='genelist_f', type=str,
                        help='input file path and file name for the embedding genes')
    parser.add_argument('--geneset1', dest='geneset1_f', type=str,
                        help='input file path and file name for the first gene set database')
    parser.add_argument('--geneset2', dest='geneset2_f', type=str,
                        help='input file path and file name for the second gene set database')
    parser.add_argument('--out', dest='out_f', type=str,
                        help='output file path and name')
    parser.add_argument('-n', '--n_processor', dest='n_process', type=int, default=10,
                        help='number of processors')
    parser.add_argument('--min', dest='min_size', type=int, default=10,
                        help='the minimum number of genes in a set')
    parser.add_argument('--max', dest='max_size', type=int, default=300,
                        help='the maximum number of genes in a set')

    args = parser.parse_args()

    # load the embedding matrix and the gene id for each row
    node_vectors = np.loadtxt(args.emb_f, delimiter=',')
    node_list = []
    with open(args.genelist_f, 'r') as f:
        for line in f:
            node_list.append(line.strip())

    print('finished loading embedding')
    if len(node_list) != node_vectors.shape[0]:
        print('the number of embedding rows must match the number of gene ids')
        sys.exit(1)
    print(len(node_list), 'genes')

    # pairwise cosine similarity between all embedded genes
    S = metrics.pairwise.cosine_similarity(node_vectors, node_vectors)

    # gene <-> embedding index mappings; genes missing from the embedding map to -1
    g_node2index = {j: i for i, j in enumerate(node_list)}
    g_index2node = {i: j for i, j in enumerate(node_list)}
    g_node2index = defaultdict(lambda: -1, g_node2index)

    # load both gene set databases and map each term to embedding indices,
    # keeping only terms with min_size..max_size genes
    geneset1 = ld.load_gmt(args.geneset1_f)
    geneset1_indices = ld.term2indexes(
        geneset1, g_node2index, upper=args.max_size, lower=args.min_size)

    geneset2 = ld.load_gmt(args.geneset2_f)
    geneset2_indices = ld.term2indexes(
        geneset2, g_node2index, upper=args.max_size, lower=args.min_size)

    # background genes: all annotated genes that are also in the embedding
    geneset1_all_genes = set()
    for x in geneset1:
        geneset1_all_genes = geneset1_all_genes.union(geneset1[x])
    geneset1_all_genes = geneset1_all_genes.intersection(node_list)
    geneset1_all_indices = [g_node2index[x] for x in geneset1_all_genes]
    geneset1_terms = list(geneset1_indices)

    geneset2_all_genes = set()
    for x in geneset2:
        geneset2_all_genes = geneset2_all_genes.union(geneset2[x])
    geneset2_all_genes = geneset2_all_genes.intersection(node_list)
    geneset2_all_indices = [g_node2index[x] for x in geneset2_all_genes]
    geneset2_terms = list(geneset2_indices)

    print('database 1:', len(geneset1_terms), 'terms,',
          len(geneset1_all_indices), 'background genes')
    print('database 2:', len(geneset2_terms), 'terms,',
          len(geneset2_all_indices), 'background genes')

    # bind the fixed arguments so each worker receives only a term pair
    f = partial(func.andes, matrix=S, g1_term2index=geneset1_indices,
                g2_term2index=geneset2_indices,
                g1_population=geneset1_all_indices,
                g2_population=geneset2_all_indices)

    # shuffle the term pairs so expensive pairs are spread across workers
    all_terms = [(x, y) for x in geneset1_terms for y in geneset2_terms]
    shuffled_terms = random.sample(all_terms, len(all_terms))

    geneset1_term2index = {j: i for i, j in enumerate(geneset1_terms)}
    geneset2_term2index = {j: i for i, j in enumerate(geneset2_terms)}

    with Pool(args.n_process) as p:
        rets = p.map(f, shuffled_terms)

    # reassemble the shuffled results into a term-by-term z-score matrix
    zscores = np.zeros((len(geneset1_terms), len(geneset2_terms)))
    for i, (x, y) in enumerate(shuffled_terms):
        idx = geneset1_term2index[x]
        idy = geneset2_term2index[y]
        zscores[idx, idy] = rets[i][1]

    zscores = pd.DataFrame(zscores, index=geneset1_terms, columns=geneset2_terms)
    zscores.to_csv(args.out_f, sep=',')
```
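
A note on the worker's return value: the assembly loop above only consumes `rets[i][1]`, so `func.andes` (defined in `src/set_analysis_func.py`, which is not part of this diff) evidently returns a tuple whose second element is the z-score for a term pair. For orientation only, here is a minimal, hypothetical sketch of that kind of statistic: a best-match similarity between two index sets, z-scored against size-matched random sets drawn from each background population. It illustrates the interface, not the actual implementation.

```python
import numpy as np

def best_match_score(matrix, idx1, idx2):
    # mean of row-wise and column-wise best matches in the similarity
    # submatrix restricted to the two gene sets
    sub = matrix[np.ix_(idx1, idx2)]
    return (sub.max(axis=1).mean() + sub.max(axis=0).mean()) / 2

def zscore_for_pair(matrix, idx1, idx2, pop1, pop2, n_null=100, seed=0):
    # hypothetical sketch: compare the observed best-match score against
    # scores of random, size-matched gene sets from each background
    rng = np.random.default_rng(seed)
    observed = best_match_score(matrix, idx1, idx2)
    null = [best_match_score(matrix,
                             rng.choice(pop1, size=len(idx1), replace=False),
                             rng.choice(pop2, size=len(idx2), replace=False))
            for _ in range(n_null)]
    return observed, (observed - np.mean(null)) / np.std(null)
```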





93 changes: 93 additions & 0 deletions src/andes_gsea.py
@@ -0,0 +1,93 @@

```python
import argparse
import sys
from collections import defaultdict
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
from sklearn import metrics

import load_data as ld
import set_analysis_func as func


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='rank-based gene set analysis (GSEA) in an embedding space')
    parser.add_argument('--emb', dest='emb_f', type=str,
                        help='input file path and file name for the embedding')
    parser.add_argument('--genelist', dest='genelist_f', type=str,
                        help='input file path and file name for the embedding genes')
    parser.add_argument('--geneset', dest='geneset_f', type=str,
                        help='input file path and file name for the gene set database')
    parser.add_argument('--rankedlist', dest='rankedlist_f', type=str,
                        help='input file path and file name for the ranked gene list')
    parser.add_argument('--out', dest='out_f', type=str,
                        help='output file path and name')
    parser.add_argument('-n', '--n_processor', dest='n_process', type=int, default=10,
                        help='number of processors')
    parser.add_argument('--min', dest='min_size', type=int, default=10,
                        help='the minimum number of genes in a set')
    parser.add_argument('--max', dest='max_size', type=int, default=300,
                        help='the maximum number of genes in a set')

    args = parser.parse_args()

    # load the embedding matrix and the gene id for each row
    node_vectors = np.loadtxt(args.emb_f, delimiter=',')
    node_list = []
    with open(args.genelist_f, 'r') as f:
        for line in f:
            node_list.append(line.strip())

    print('finished loading embedding')
    if len(node_list) != node_vectors.shape[0]:
        print('the number of embedding rows must match the number of gene ids')
        sys.exit(1)
    print(len(node_list), 'genes')

    # pairwise cosine similarity between all embedded genes
    # (computed after the sanity check so a bad input fails fast)
    S = metrics.pairwise.cosine_similarity(node_vectors, node_vectors)

    # gene <-> embedding index mappings; genes missing from the embedding map to -1
    g_node2index = {j: i for i, j in enumerate(node_list)}
    g_index2node = {i: j for i, j in enumerate(node_list)}
    g_node2index = defaultdict(lambda: -1, g_node2index)

    # load the gene set database and map each term to embedding indices,
    # keeping only terms with min_size..max_size genes
    geneset = ld.load_gmt(args.geneset_f)
    geneset_indices = ld.term2indexes(
        geneset, g_node2index, upper=args.max_size, lower=args.min_size)

    # background genes: all annotated genes that are also in the embedding
    geneset_all_genes = set()
    for x in geneset:
        geneset_all_genes = geneset_all_genes.union(geneset[x])
    geneset_all_genes = geneset_all_genes.intersection(node_list)
    geneset_all_indices = [g_node2index[x] for x in geneset_all_genes]
    geneset_terms = list(geneset_indices.keys())

    print('finished loading gene set database')
    print(len(geneset_terms), 'terms,', len(geneset_all_indices), 'background genes')

    # load the ranked list (tab-separated, no header, gene id in the first
    # column) and keep only genes that are present in the embedding
    ranked_list = pd.read_csv(args.rankedlist_f, sep='\t',
                              index_col=0, header=None)
    ranked_list = [str(y) for y in ranked_list.index]
    node_set = set(node_list)  # set lookup instead of repeated list scans
    ranked_list = [g_node2index[y] for y in ranked_list if y in node_set]

    print('finished loading ranked list')
    print(len(ranked_list), 'genes')

    # bind the fixed arguments so each worker receives only one term
    f = partial(func.gsea_andes, ranked_list=ranked_list, matrix=S,
                term2indices=geneset_indices,
                annotated_indices=geneset_all_indices)

    with Pool(args.n_process) as p:
        rets = p.map(f, geneset_terms)

    # each worker returns a tuple whose second element is the term's z-score
    zscores = pd.DataFrame([x[1] for x in rets],
                           index=geneset_terms, columns=['z-score'])
    zscores.to_csv(args.out_f, sep=',')
```
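
Both scripts write the z-scores as a comma-separated table (`andes.py`: terms by terms; `andes_gsea.py`: one `z-score` column per term), so downstream inspection is straightforward. A small sketch, reusing the `output_file.csv` placeholder from the usage section:

```python
import pandas as pd

# z-score table produced by andes.py or andes_gsea.py via --out
zscores = pd.read_csv('output_file.csv', index_col=0)
# e.g. the ten highest-scoring term pairs / terms
print(zscores.stack().sort_values(ascending=False).head(10))
```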
