Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
hevalatas authored Dec 21, 2022
1 parent 161942b commit 40aa208
Showing 1 changed file with 259 additions and 0 deletions.
259 changes: 259 additions & 0 deletions train_test_splitting/network_based_train_test_splitting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
import pandas as pd
import numpy as np
import networkx as nx
from operator import itemgetter
from datetime import datetime
import matplotlib.pyplot as plt

print(datetime.now())


#--generating heterogeneous networks for train-test splitting--

#-Louvain method-
import community #Package name is community but refer to python-louvain on pypi
# -> to install via conda: conda install -c conda-forge python-louvain
# -> to install via pip: pip install python-louvain

def component(G):
    """Return the connected components of ``G`` ordered from largest to smallest.

    Parameters
    ----------
    G : networkx graph
        Graph whose connected components are requested.

    Returns
    -------
    members : list of set
        Node sets of the connected components, sorted by size in descending order.
    size : list of int
        Number of nodes of each component, in the same order as ``members``.
    """
    #compute and sort the components once; sizes are derived from the same sorted list
    members = sorted(nx.connected_components(G), key=len, reverse=True)
    size = [len(nodes) for nodes in members]
    return members, size


def partition(family, init_dtp, G, partition_params):
    """Split ``G`` into two node groups and drop the datapoints/compounds linking them.

    The graph is clustered with the Louvain method, a subset of the clusters
    (chosen by ``partition_params``) forms group 1 and all remaining nodes form
    group 2. Every edge of ``G`` that joins the two groups is then inspected:

    * compound-compound edge  -> both compounds are removed from the datapoints
    * compound-target edge    -> only that (compound, target) datapoint is removed
    * target-target edge      -> left untouched (no branch handles it)

    Nodes whose identifier starts with ``"CHEMBL"`` are treated as compounds,
    every other node as a target.

    Parameters
    ----------
    family : str
        Protein family name. Only referenced by the optional (commented-out)
        cluster dump below; kept for interface compatibility.
    init_dtp : pandas.DataFrame
        Bioactivity datapoints with at least ``compound_id`` and ``target_id`` columns.
    G : networkx graph
        Graph to be partitioned; node identifiers are expected to be strings.
    partition_params : sequence
        ``(start, fraction, step)``. Clusters are sorted by size (descending) and
        the clusters at positions ``range(start, round(n_clusters * fraction), step)``
        are merged into group 1.

    Returns
    -------
    new_dtp2 : pandas.DataFrame
        ``init_dtp`` without the removed datapoints and without any row of a removed compound.
    rm_cmp : list
        Compounds removed because of a cross-group compound-compound edge
        (a compound may be listed more than once).
    """
    #split the input graph G into partitions using Louvain community detection method
    parts = community.best_partition(G, random_state=42)

    #obtain node clusters of initial partitions
    df = pd.DataFrame({
        "cluster_id": ["C{}".format(str(parts.get(node))) for node in G.nodes()],
        "cluster_members": [node for node in G.nodes()],
    })
    df = df.groupby(by="cluster_id")["cluster_members"].apply(list).reset_index()
    df.insert(loc=1, column="cluster_size", value=[len(i) for i in df["cluster_members"]])
    df = df.sort_values(by="cluster_size", ascending=False).reset_index(drop=True)
    #df.to_csv(fr"fully_dissimilar_split\{family}\{family}_node_clusters.tsv", sep="\t", index=None)

    #df = pd.read_csv(fr"fully_dissimilar_split\{family}\{family}_node_clusters.tsv", sep="\t", converters={"cluster_members":lambda x:x.strip("[]").replace("'","").split(", ")})

    #group node clusters with a reasonable size and merge their members to obtain acceptable ratios of train and test samples
    start, fraction, step = partition_params[0], partition_params[1], partition_params[2]
    group1 = [node for ind in range(start, round(len(df) * fraction), step) for node in df["cluster_members"][ind]]
    #membership is tested against a set so that building group2 stays linear in the number of nodes
    group1_lookup = set(group1)
    group2 = [node for node in G.nodes() if node not in group1_lookup]

    #obtain subgraphs of grouped nodes based on edges in the input graph G
    group1_G = nx.subgraph(G, group1).copy()
    group2_G = nx.subgraph(G, group2).copy()

    #obtain the list of edges to be removed to disconnect similar train and test samples
    #(edges of G that are in neither induced subgraph, i.e. edges that join the two groups)
    gr1_gr2_mergeG = nx.compose_all([group1_G, group2_G])
    diffG = nx.difference(G, gr1_gr2_mergeG)
    edge_list = list(diffG.edges())

    #select compounds and datapoints to be removed
    rm_cmp = []
    rm_dtp_c = []
    rm_dtp_p = []
    for node_a, node_b in edge_list:
        a_is_cmp = node_a.startswith("CHEMBL")
        b_is_cmp = node_b.startswith("CHEMBL")
        if a_is_cmp and b_is_cmp:
            rm_cmp.extend([node_a, node_b])
        elif a_is_cmp:
            rm_dtp_c.append(node_a)
            rm_dtp_p.append(node_b)
        elif b_is_cmp:
            rm_dtp_c.append(node_b)
            rm_dtp_p.append(node_a)

    #if train and test samples are connected via bioactivity datapoints, keep their nodes and remove only bioactivity edges between these nodes
    #(keep=False drops every (compound, target) pair that occurs in both frames)
    rm_dtp = pd.DataFrame({"compound_id": rm_dtp_c, "target_id": rm_dtp_p})
    new_dtp = pd.concat([init_dtp, rm_dtp])
    new_dtp.drop_duplicates(subset=["compound_id", "target_id"], keep=False, inplace=True)

    #if train and test samples are connected via two similar compounds, remove both compound nodes
    new_dtp2 = new_dtp.loc[~new_dtp["compound_id"].isin(rm_cmp)]

    return new_dtp2, rm_cmp


def fully_dissimilar_split(family, partition_params, tr_ts):
    """Create the fully-dissimilar train/test bioactivity files of a protein family.

    Steps:
    1. read the compound-compound similarity, protein-protein similarity and
       bioactivity files of ``family`` from ``initial_files/{family}/`` and merge
       them into a single graph;
    2. run ``partition`` on the largest connected component of that graph to
       obtain the datapoints and compounds that survive the split;
    3. rebuild the merged graph from the surviving data and take its connected
       components (largest first);
    4. assign components to train/test and write the datapoints of each set to
       ``final_files/fully_dissimilar_split/{family}_{train|test}.tsv``.

    Parameters
    ----------
    family : str
        Protein family name used in the input/output file names.
    partition_params : sequence
        Passed unchanged to ``partition``.
    tr_ts : "default" or int
        ``"default"``: the largest component is train, all others are test.
        Otherwise an index: train is the largest component plus the components
        from position ``tr_ts`` onwards, test is the components at positions
        ``1`` up to (not including) ``tr_ts``.

    Returns
    -------
    None
        Results are written to disk; start/end timestamps are printed by the script.
    """
    #compound-compound similarity graph
    cmp_sims = pd.read_csv(fr"initial_files/{family}/{family}_compound_tanimoto_sim_0.5thr.tsv", sep="\t")
    G_cmp = nx.from_pandas_edgelist(cmp_sims, source="compound1", target="compound2", edge_attr="similarity")

    #protein-protein similarity graph
    prot_sims = pd.read_csv(fr"initial_files/{family}/{family}_protein-pairwise-sims.tsv", sep="\t")
    G_prot = nx.from_pandas_edgelist(prot_sims, source="target1", target="target2")

    #protein-compound bioactivity graph
    dtp = pd.read_csv(fr"initial_files/{family}/{family}_dataset.tsv", sep="\t")
    G_dtp = nx.from_pandas_edgelist(dtp, source="compound_id", target="target_id")

    #union of the three graphs and its components (largest component comes first)
    G_merged = nx.compose_all([G_cmp, G_prot, G_dtp])
    merged_members, _ = component(G_merged)

    #community detection is applied only to the largest component
    largest_G = G_merged.subgraph(list(merged_members[0])).copy()
    kept_dtp, removed_cmps = partition(family, dtp, largest_G, partition_params)

    #rebuild the merged graph from the surviving datapoints and compounds
    newG_dtp = nx.from_pandas_edgelist(kept_dtp, source="compound_id", target="target_id")
    surviving_cmps = list(set(G_cmp.nodes()) - set(removed_cmps))
    newG_cmp = G_cmp.subgraph(surviving_cmps).copy()
    newG_merged = nx.compose_all([newG_cmp, G_prot, newG_dtp])

    #components of the rebuilt graph, largest first
    new_members, _ = component(newG_merged)

    #select train/test nodes
    if tr_ts == "default":
        train_final_nodes = new_members[0]
        test_final_nodes = []
        for node_set in new_members[1:]:
            test_final_nodes.extend(node_set)
    else:
        train_final_nodes = list(new_members[0])
        for node_set in new_members[tr_ts:]:
            train_final_nodes.extend(node_set)
        test_final_nodes = []
        for node_set in new_members[1:tr_ts]:
            test_final_nodes.extend(node_set)

    #obtain final bioactivity files based on selected train/test nodes
    for dataset, nodes in (("train", train_final_nodes), ("test", test_final_nodes)):
        member_G = newG_merged.subgraph(nodes).copy()

        #keep only compound-target edges, always oriented as (compound, target)
        member_cmps = []
        member_prots = []
        for node_a, node_b in member_G.edges():
            a_is_cmp = node_a.startswith("CHEMBL")
            b_is_cmp = node_b.startswith("CHEMBL")
            if a_is_cmp == b_is_cmp:
                #compound-compound or protein-protein edge: not a bioactivity datapoint
                continue
            if a_is_cmp:
                member_cmps.append(node_a)
                member_prots.append(node_b)
            else:
                member_cmps.append(node_b)
                member_prots.append(node_a)

        member_pairs = pd.DataFrame({"compound_id": member_cmps, "target_id": member_prots})
        df_member_dtps = dtp.merge(member_pairs, on=["compound_id", "target_id"])

        #this part is only for hydrolases, some of datapoints were removed to balance train-test bioactivity distributions
        if family == "hydrolases" and dataset == "train":
            df_member_dtps = df_member_dtps.sample(frac=1, random_state=42)
            pchembl = df_member_dtps["pchembl_value"]
            in_range = df_member_dtps.loc[(pchembl > 4.2) & (pchembl < 5.0)]
            #appending the selected rows and dropping all duplicated rows removes them from the set
            df_member_dtps = pd.concat([df_member_dtps, in_range.iloc[:2400]])
            df_member_dtps = df_member_dtps.drop_duplicates(keep=False)
        elif family == "hydrolases" and dataset == "test":
            df_member_dtps = df_member_dtps.sample(frac=1, random_state=42)
            pchembl = df_member_dtps["pchembl_value"]
            in_range1 = df_member_dtps.loc[(pchembl > 6.2) & (pchembl < 6.6)]
            in_range2 = df_member_dtps.loc[(pchembl > 7.6) & (pchembl < 10)]
            df_member_dtps = pd.concat([df_member_dtps, in_range1.iloc[:300], in_range2.iloc[:300]])
            df_member_dtps = df_member_dtps.drop_duplicates(keep=False)

        df_member_dtps.to_csv(fr"final_files/fully_dissimilar_split/{family}_{dataset}.tsv", sep="\t", index=None)

    print(datetime.now())


# fds_epigenetic_regulators = fully_dissimilar_split("epigenetic-regulators", [6,0.2,2], "default")
# fds_hydrolases = fully_dissimilar_split("hydrolases", [0,0.38,1], -40)
# fds_ion_channels = fully_dissimilar_split("ion-channels", [4,0.14,2], "default")
# fds_membrane_receptors = fully_dissimilar_split("membrane-receptors", [0,0.44,1], "default")
# fds_other_enzymes = fully_dissimilar_split("other-enzymes", [5,0.24,2], 43)
# fds_oxidoreductases = fully_dissimilar_split("oxidoreductases", [0,0.44,1], "default")
# fds_proteases = fully_dissimilar_split("proteases", [0,0.39,1], "default")
# fds_transcription_factors = fully_dissimilar_split("transcription-factors", [4,0.1,2], "default")
# fds_transferases = fully_dissimilar_split("transferases", [0,0.36,1], "default")
# fds_transporters = fully_dissimilar_split("transporters", [10,0.28,2], -17)




"To obtain dissimilar_compound and random_splits, you need to obtain fully_dissimilar_splits first!"


def dissimilar_compound_split(family, params):
    """Create the dissimilar-compound train/test bioactivity files of a protein family.

    The train and test files of the fully-dissimilar split are pooled, the
    compound-compound similarity graph is restricted to the pooled compounds,
    and whole connected components of that graph are assigned to the test set.

    Parameters
    ----------
    family : str
        Protein family name used in the input/output file names.
    params : sequence
        ``(start, step)``. With components sorted by size (descending), the
        components at positions ``range(start, n_components, step)`` become test.

    Returns
    -------
    None
        ``final_files/dissimilar_compound_split/{family}_train.tsv`` and
        ``..._test.tsv`` are written and the current time is printed.
    """
    #initial bioactivity data is based on train/test samples of fully_dissimilar_split sets
    fds_train = pd.read_csv(fr"../datasets/large_scale/fully_dissimilar_split/{family}_train.tsv", sep="\t")
    fds_test = pd.read_csv(fr"../datasets/large_scale/fully_dissimilar_split/{family}_test.tsv", sep="\t")
    pooled = pd.concat([fds_train, fds_test])

    #keep only similarity pairs whose two compounds both occur in the pooled data
    sims = pd.read_csv(fr"initial_files/{family}/{family}_compound_tanimoto_sim_0.5thr.tsv", sep="\t")
    pooled_cmps = pooled["compound_id"].unique()
    both_known = sims["compound1"].isin(pooled_cmps) & sims["compound2"].isin(pooled_cmps)
    sims_flt = sims.loc[both_known]

    #obtain components of compounds from compound-compound similarity graph
    G_cmp = nx.from_pandas_edgelist(sims_flt, source="compound1", target="compound2", edge_attr="similarity")
    members, sizes = component(G_cmp)

    #select test compounds: every step-th component starting at position start
    start, step = params[0], params[1]
    new_ts_cmp = []
    for ind in range(start, len(sizes), step):
        new_ts_cmp.extend(members[ind])

    #obtain final bioactivity files based on selected test compounds
    is_test = pooled["compound_id"].isin(new_ts_cmp)
    new_ts = pooled.loc[is_test]
    new_tr = pooled.loc[~is_test]

    new_tr.to_csv(fr"final_files/dissimilar_compound_split/{family}_train.tsv", sep="\t", index=None)
    new_ts.to_csv(fr"final_files/dissimilar_compound_split/{family}_test.tsv", sep="\t", index=None)

    print(datetime.now())


# dcs_epigenetic_regulators = dissimilar_compound_split("epigenetic-regulators", [4,5])
# dcs_hydrolases = dissimilar_compound_split("hydrolases", [7,7])
# dcs_ion_channels = dissimilar_compound_split("ion-channels", [6,7])
# dcs_membrane_receptors = dissimilar_compound_split("membrane-receptors", [9,4])
# dcs_other_enzymes = dissimilar_compound_split("other-enzymes", [2,4])
# dcs_oxidoreductases = dissimilar_compound_split("oxidoreductases", [4,9])
# dcs_proteases = dissimilar_compound_split("proteases", [3,5])
# dcs_transcription_factors = dissimilar_compound_split("transcription-factors", [9,9])
# dcs_transferases = dissimilar_compound_split("transferases", [6,5])
# dcs_transporters = dissimilar_compound_split("transporters", [8,5])






def random_split(family, rand_st):
    """Create the random train/test bioactivity files of a protein family.

    The train and test files of the fully-dissimilar split are pooled and
    ``len(test)`` rows are sampled at random as the new test set; all remaining
    rows form the new train set.

    Parameters
    ----------
    family : str
        Protein family name used in the input/output file names.
    rand_st : int
        Seed passed to ``DataFrame.sample`` as ``random_state``.

    Returns
    -------
    None
        ``final_files/random_split/{family}_train.tsv`` and ``..._test.tsv`` are
        written; the input sizes and the current time are printed.
    """
    #initial bioactivity data is based on train/test samples of fully_dissimilar_split sets
    tr = pd.read_csv(fr"../datasets/large_scale/fully_dissimilar_split/{family}_train.tsv", sep="\t")
    ts = pd.read_csv(fr"../datasets/large_scale/fully_dissimilar_split/{family}_test.tsv", sep="\t")
    #ignore_index gives every pooled row a unique label so sampled rows can be removed by label below
    tr_ts = pd.concat([tr,ts], ignore_index=True)
    print(len(tr),len(ts),len(tr_ts),len(tr)+len(ts))

    #obtain final bioactivity files using random splitting
    new_ts = tr_ts.sample(n=len(ts), random_state=rand_st)
    #remove exactly the sampled rows; a concat + drop_duplicates(keep=False) anti-join
    #would additionally discard any rows that are genuinely duplicated in the pooled data
    new_tr = tr_ts.drop(index=new_ts.index)

    new_tr.to_csv(fr"final_files/random_split/{family}_train.tsv", sep="\t", index=None)
    new_ts.to_csv(fr"final_files/random_split/{family}_test.tsv", sep="\t", index=None)

    print(datetime.now())


# rs_epigenetic_regulators = random_split("epigenetic-regulators", 1)
# rs_hydrolases = random_split("hydrolases", 5)
# rs_ion_channels = random_split("ion-channels", 1)
# rs_membrane_receptors = random_split("membrane-receptors", 2)
# rs_other_enzymes = random_split("other-enzymes", 2)
# rs_oxidoreductases = random_split("oxidoreductases", 1)
# rs_proteases = random_split("proteases", 1)
# rs_transcription_factors = random_split("transcription-factors", 3)
# rs_transferases = random_split("transferases", 1)
# rs_transporters = random_split("transporters", 5)























0 comments on commit 40aa208

Please sign in to comment.