Skip to content

Commit

Permalink
Merge pull request #18 from Reed-CompBio/regulatory
Browse files Browse the repository at this point in the history
Regulatory
  • Loading branch information
ctrlaltaf authored Sep 9, 2024
2 parents 746386f + 457fe84 commit f6ddffc
Show file tree
Hide file tree
Showing 78 changed files with 2,927,321 additions and 275,644 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,12 @@ A python program that aims to predict protein function prediction using protein-
- Now you have a conda environment that has all the necessary packages for this project
- To test that everything is working, you can run `python main.py`

# Files
- main.py: File used to run the algorithms
- neighbor_accuracy.py: Computes how accurately the neighbors of a GO term can be predicted using random walk
- accuracy_stats.py: Computes stats on neighbor_accuracy
- difference.py: Takes two matched outputs from main and prints a table comparing them
- distribution.py: Visualizes the distribution of GO term neighbor counts
- small_graph.py: Visualizes the impact of pagerank on a directed and undirected graph using a test dataset
- subgraph.py: Visualizes a subgraph of the one built in main based on pagerank node ranks
- paired_sample.py: An additional way to generate pos/neg samples keeping specific aspects constant (then run main with new_random_lists set to False)
68 changes: 68 additions & 0 deletions accuracy_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import matplotlib.pyplot as plt
import statistics as stat
from pathlib import Path
from tools.helper import read_specific_columns

'''
Takes the dataframe output by neighbor_accuracy.py and generates a graph (initially used to change alpha value without regenerating all the data). Additionally, the frequency of the number of neighbors and associated score is calculated and printed.
'''

# Load columns 1 and 2 of the tab-separated neighbor-rank table; judging by how
# the values are used below, each row is (number of neighbors, score).
ranked = read_specific_columns(
    "./output/data/go_neighbor_tests/neighbor_rank/all_neighbor_rank_under_100.csv",
    [1, 2],
    "\t",
)

# Drop rows that have no score, then convert the remaining columns to numbers
scored_rows = [row for row in ranked if row[1] != "N/A"]
all_go_neighbor_num = [int(row[0]) for row in scored_rows]
all_go_neighbor_rank = [float(row[1]) for row in scored_rows]

# Scatter plot of score (x) against number of neighbors (y). The very low alpha
# lets overlapping points accumulate, so dense regions appear darker.
fig, ax = plt.subplots()
ax.scatter(all_go_neighbor_rank, all_go_neighbor_num, alpha=0.05)
ax.set_xlabel("% Go Neighbors Accurately Predicted")
ax.set_ylabel("Number of Neighbors")
ax.set_xlim([-5, 105])  # pad the 0-100 range so edge points are not clipped
fig.savefig("./output/data/go_neighbor_tests/neighbor_rank/under_100_rank.png")
plt.show()

# Summary statistics over the scored rows, rounded to two decimal places
mean_rank = round(stat.mean(all_go_neighbor_rank), 2)
mean_num = round(stat.mean(all_go_neighbor_num), 2)
print(f"Mean Rank: {mean_rank}")
print(f"Mean Number of Neighbors: {mean_num}")

# Counts how often each (number of neighbors, score) pair occurs, keyed as
# "<neighbors>_<score>". Swap in the commented condition below to also skip
# rows whose score is zero.
freq_dict = {}
for row in ranked:
    if row[1] == "N/A":  # or float(row[1]) == 0.0:
        continue
    key = "_".join((row[0], row[1]))
    freq_dict[key] = freq_dict.get(key, 0) + 1

# Reports the frequency table from most to least frequent and returns it as a list
def order(freq_dict):
    """Print and return the entries of ``freq_dict`` in descending frequency.

    Args:
        freq_dict: Mapping of ``"<neighbors>_<score>"`` keys to the number of
            times that pair occurs.

    Returns:
        A list of ``[key, frequency]`` pairs, most frequent first. Entries with
        equal frequency keep the order in which they appear in ``freq_dict``.
        An empty mapping yields an empty list.

    Unlike the earlier recursive version, this covers every entry (the least
    frequent one used to be dropped), works for empty and single-entry
    mappings, is not bounded by the recursion limit, and leaves ``freq_dict``
    unmodified.
    """
    # sorted() is stable even with reverse=True, so ties keep insertion order,
    # matching the "first strictly greater wins" scan it replaces.
    by_frequency = sorted(
        freq_dict.items(), key=lambda item: item[1], reverse=True
    )

    ordered = []
    for key, freq in by_frequency:
        # Keys have the form "<neighbors>_<score>"
        neighbors, _, accuracy = key.partition("_")
        print(
            f"{neighbors} Neighbors with a percent accuracy of {accuracy}"
            f" occurs {freq} times"
        )
        ordered.append([key, freq])
    return ordered

# order() prints one line per entry it ranks and returns them as a list of
# [key, frequency] pairs, which is then printed as a whole
lst = order(freq_dict)
print(lst)

# NOTE: the ordered list is only printed, never written to disk; it could be saved if needed



55 changes: 32 additions & 23 deletions classes/hypergeometric_distribution_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def predict(
data = {
"protein": [],
"go_term": [],
"pro_pro_neighbor": [],
"protein_neighbor": [],
"go_neighbor": [],
"go_annotated_pro_pro_neighbors": [],
"go_annotated_protein_neighbors": [],
"score": [],
"norm_score": [],
"true_label": [],
Expand All @@ -68,36 +68,45 @@ def predict(
):

# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
positive_protein_neighbor = get_neighbors(
G, positive_protein, ["protein_protein", "regulatory"]
)
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
positive_go_annotated_pro_pro_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
positive_go_annotated_protein_neighbor_count = (
get_go_annotated_protein_neighbor_count(
G, positive_protein_neighbor, positive_go
)
)

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
pos_n = len(positive_protein_neighbor) #Number of protein neighbors the protein of interest has
K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest

pos_k = positive_go_annotated_protein_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest


# if K == -1:
# K = 1

# print("N: ", N)
# print("pos_n: ", pos_n)
# print("K: ", K)
# print("pos_k: ", pos_k)

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))

# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
negative_go_annotated_protein_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, negative_pro_pro_neighbor, negative_go
get_go_annotated_protein_neighbor_count(
G, negative_protein_neighbor, negative_go
)
)

neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
neg_n = len(negative_protein_neighbor) #Negative protein of interest neighbors
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
Expand All @@ -106,19 +115,19 @@ def predict(
# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
positive_go_annotated_pro_pro_neighbor_count
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
data["protein_neighbor"].append(len(negative_protein_neighbor))
data["go_neighbor"].append(len(negative_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
data["go_annotated_protein_neighbors"].append(
negative_go_annotated_protein_neighbor_count
)
data["score"].append(negative_score)
Expand Down Expand Up @@ -146,18 +155,18 @@ def predict(
return y_score, y_true


def get_neighbors(G: "nx.DiGraph", node, edgeTypes):
    """Return the neighbors of ``node`` reached through edges of the given types.

    Args:
        G: Graph whose edges carry a ``"type"`` attribute.
        node: Node whose incident edges are inspected via
            ``G.edges(node, data=True)``.
        edgeTypes: Edge type name(s) to keep. Either a collection of names
            (e.g. ``["protein_protein", "regulatory"]``) or a single name
            passed as a plain string.

    Returns:
        A list of ``[neighbor_node, edge_attributes]`` pairs, in the order the
        graph yields the edges.

    Raises:
        KeyError: If an inspected edge has no ``"type"`` attribute.
    """
    # A bare string would make the `in` test below a substring match (an edge
    # typed "protein" would match "protein_protein"), so wrap it to force
    # exact type matching for callers that still pass a single name.
    if isinstance(edgeTypes, str):
        edgeTypes = (edgeTypes,)

    return [
        [neighbor, attrs]
        for _, neighbor, attrs in G.edges(node, data=True)
        if attrs["type"] in edgeTypes
    ]


def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
count = 0
for element in nodeList:
if G.has_edge(element[0], goTerm):
Expand Down
42 changes: 21 additions & 21 deletions classes/hypergeometric_distribution_class_V2.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def predict(
data = {
"protein": [],
"go_term": [],
"pro_pro_neighbor": [],
"protein_neighbor": [],
"go_neighbor": [],
"go_annotated_pro_pro_neighbors": [],
"go_annotated_protein_neighbors": [],
"score": [],
"norm_score": [],
"true_label": [],
Expand All @@ -68,56 +68,56 @@ def predict(
):

# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
positive_protein_neighbor = get_neighbors(
G, positive_protein, ["protein_protein", "regulatory"]
)
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
positive_go_annotated_pro_pro_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
positive_go_annotated_protein_neighbor_count = (
get_go_annotated_protein_neighbor_count(
G, positive_protein_neighbor, positive_go
)
)

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
pos_n = len(positive_protein_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
pos_k = positive_go_annotated_protein_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))

# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
negative_go_annotated_protein_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, negative_pro_pro_neighbor, negative_go
get_go_annotated_protein_neighbor_count(
G, negative_protein_neighbor, negative_go
)
)

neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
neg_n = len(negative_protein_neighbor) + 1 #Negative protein of interest neighbors (includes self)
neg_k = negative_go_annotated_protein_neighbor_count #Overlap betweesn go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))

# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
positive_go_annotated_pro_pro_neighbor_count
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
data["protein_neighbor"].append(len(negative_protein_neighbor))
data["go_neighbor"].append(len(negative_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
data["go_annotated_protein_neighbors"].append(
negative_go_annotated_protein_neighbor_count
)
data["score"].append(negative_score)
Expand Down Expand Up @@ -149,14 +149,14 @@ def get_neighbors(G: nx.Graph, node, edgeType):
res = G.edges(node, data=True)
neighbors = []
for edge in res:
if edge[2]["type"] == edgeType:
if edge[2]["type"] in edgeType:
neighborNode = [edge[1], edge[2]]
neighbors.append(neighborNode)

return neighbors


def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
count = 0
for element in nodeList:
if G.has_edge(element[0], goTerm):
Expand Down
Loading

0 comments on commit f6ddffc

Please sign in to comment.