forked from DEIB-GECO/GeneNetFusion
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfusion_networks.py
130 lines (108 loc) · 6.38 KB
/
fusion_networks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import pandas as pd
import numpy as np
import warnings
from scipy.stats import binom
import snf
import os
from creation_matrices import adj_to_list
os.environ["MKL_NUM_THREADS"] = "10"
os.environ["NUMEXPR_NUM_THREADS"] = "10"
os.environ["OMP_NUM_THREADS"] = "10"
os.environ["OPENBLAS_NUM_THREADS"] = "10"
os.environ["VECLIB_MAXIMUM_THREADS"] = "10"
warnings.filterwarnings('ignore')
#It allows to read a file (as a Dataframe) that has 3 or more columns (sep: tab) nd one line header with column names
# ('src' and 'trg' that indicates the source and destination, the column of interest indicate the weights)
#It return the parsed network data, the number of nodes in the network and the number of edges.
def read(filename, column_of_interest, consider_self_loops = True, drop_zeroes = True):
table = pd.read_csv(filename, sep = '\t')
table = table[["src", "trg", column_of_interest]]
table.rename(columns = {column_of_interest: "nij"}, inplace = True)
if drop_zeroes:
table = table[table["nij"] > 0]
if not consider_self_loops:
table = table[table["src"] != table["trg"]]
original_nodes = len(set(table["src"]) | set(table["trg"]))
original_edges = table.shape[0]
return table, original_nodes, original_edges
#Reads a preprocessed edge table and returns only the edges supassing a significance threshold.
def thresholding(table, threshold):
table = table.copy()
if "sdev_cij" in table:
return table[(table["score"] - (threshold * table["sdev_cij"])) > 0][["src", "trg", "nij", "score"]]
else:
return table[table["score"] > threshold][["src", "trg", "nij", "score"]]
#It allows to correct the noise of the table
def noise_corrected(table, return_self_loops = False, calculate_p_value = False):
table = table.copy()
src_sum = table.groupby(by = "src").sum()[["nij"]]
table = table.merge(src_sum, left_on = "src", right_index = True, suffixes = ("", "_src_sum"))
trg_sum = table.groupby(by = "trg").sum()[["nij"]]
table = table.merge(trg_sum, left_on = "trg", right_index = True, suffixes = ("", "_trg_sum"))
table.rename(columns = {"nij_src_sum": "ni.", "nij_trg_sum": "n.j"}, inplace = True)
table["n.."] = table["nij"].sum()
table["mean_prior_probability"] = ((table["ni."] * table["n.j"]) / table["n.."]) * (1 / table["n.."])
if calculate_p_value:
table["score"] = binom.cdf(table["nij"], table["n.."], table["mean_prior_probability"])
return table[["src", "trg", "nij", "score"]]
table["kappa"] = table["n.."] / (table["ni."] * table["n.j"])
table["score"] = ((table["kappa"] * table["nij"]) - 1) / ((table["kappa"] * table["nij"]) + 1)
table["var_prior_probability"] = (1 / (table["n.."] ** 2)) * (table["ni."] * table["n.j"] * (table["n.."] - table["ni."]) * (table["n.."] - table["n.j"])) / ((table["n.."] ** 2) * ((table["n.."] - 1)))
table["alpha_prior"] = (((table["mean_prior_probability"] ** 2) / table["var_prior_probability"]) * (1 - table["mean_prior_probability"])) - table["mean_prior_probability"]
table["beta_prior"] = (table["mean_prior_probability"] / table["var_prior_probability"]) * (1 - (table["mean_prior_probability"] ** 2)) - (1 - table["mean_prior_probability"])
table["alpha_post"] = table["alpha_prior"] + table["nij"]
table["beta_post"] = table["n.."] - table["nij"] + table["beta_prior"]
table["expected_pij"] = table["alpha_post"] / (table["alpha_post"] + table["beta_post"])
table["variance_nij"] = table["expected_pij"] * (1 - table["expected_pij"]) * table["n.."]
table["d"] = (1.0 / (table["ni."] * table["n.j"])) - (table["n.."] * ((table["ni."] + table["n.j"]) / ((table["ni."] * table["n.j"]) ** 2)))
table["variance_cij"] = table["variance_nij"] * (((2 * (table["kappa"] + (table["nij"] * table["d"]))) / (((table["kappa"] * table["nij"]) + 1) ** 2)) ** 2)
table["sdev_cij"] = table["variance_cij"] ** .5
if not return_self_loops:
table = table[table["src"] != table["trg"]]
return table[["src", "trg", "nij", "score", "sdev_cij"]]
#It computes the disparity filter for a matrix
def disparity_filter(table, return_self_loops = False):
table = table.copy()
table_sum = table.groupby(table["src"]).sum().reset_index()
table_deg = table.groupby(table["src"]).count()["trg"].reset_index()
table = table.merge(table_sum, on = "src", how = "left", suffixes = ("", "_sum"))
table = table.merge(table_deg, on = "src", how = "left", suffixes = ("", "_count"))
table["score"] = 1.0 - ((1.0 - (table["nij"] / table["nij_sum"])) ** (table["trg_count"] - 1))
table["variance"] = (table["trg_count"] ** 2) * (((20 + (4.0 * table["trg_count"])) / ((table["trg_count"] + 1.0) * (table["trg_count"] + 2) * (table["trg_count"] + 3))) - ((4.0) / ((table["trg_count"] + 1.0) ** 2)))
if not return_self_loops:
table = table[table["src"] != table["trg"]]
return table[["src", "trg", "nij", "score", "variance"]]
#To pass from edgelist to adjacency matrix
def from_edgelist_to_matrix(A, length):
List=np.zeros((length,length))
for source in A.index:
List[A['src'][source],A['trg'][source]]=A['nij'][source]
List[A['trg'][source],A['src'][source]]=A['nij'][source]
return List
class fusion_snf:
def __init__(self, normal_sum, cancer_sum, tumor):
self.tumor=tumor
self.normal=normal_sum
self.cancer=cancer_sum
self.both=[self.normal,self.cancer]
self.fused=pd.DataFrame(snf.snf(self.both, K=20))
self.fused_edgelist=adj_to_list(self.fused)
self.fused_edgelist.columns=['src', 'trg', 'weight']
self.fused_edgelist.to_csv('./Networks/fused_edgelist_'+ str(self.tumor)+'.csv', sep = '\t')
self.fused_thr=self.threshold_snf()
np.save('./Networks/fused_'+ str(self.tumor)+'.npy',self.fused_thr)
#Threshold the fused matrix
def threshold_snf(self):
[fused, nodes, edges]=read('./Networks/fused_edgelist_'+ str(self.tumor)+'.csv', 'weight')
fused=noise_corrected(fused)
fused_df=disparity_filter(fused)
quantiles = [99,99.5,99.8]
for q in quantiles:
thr=np.percentile(fused_df['score'].values, q)
fused_dfThr=thresholding(fused_df, thr)
#200000 is the threshold for Gephi to represent relevant edges
if str(len(fused_dfThr))< 200000:
break
print('fused: ' + str(len(fused_dfThr)))
fused=from_edgelist_to_matrix(fused_dfThr, len(self.normal))
return fused