Merge pull request #18 from Reed-CompBio/regulatory

Regulatory
Reed-CompBio · Sep 9, 2024 · f6ddffc · f6ddffc
2 parents 746386f + 457fe84
commit f6ddffc
Show file tree

Hide file tree

Showing 78 changed files with 2,927,321 additions and 275,644 deletions.
diff --git a/README.md b/README.md
@@ -10,3 +10,12 @@ A python program that aims to predict protein function prediction using protein-
 - Now you have a conda environment that has all the necessary packages for this project
 - To test that everything is working, you can run `python main.py`
 
+# Files
+- main.py: File used to run the algorithms
+- neighbor_accuracy.py: Computes how accurately the neighbors of a GO term can be predicted using random walk
+- accuracy_stats.py: Computes stats on neighbor_accuracy
+- difference.py: Takes two matched outputs from main and prints a table comparing them
+- distribution.py: Visualizes the distribution of GO term neighbor counts
+- small_graph.py: Visualizes the impact of pagerank on a directed and undirected graph using a test dataset
+- subgraph.py: Visualizes a subgraph of the one built in main based on pagerank node ranks
+- paired_sample.py: An additional way to generate pos/neg samples keeping specific aspects constant (then run main with new_random_lists set to False)
diff --git a/accuracy_stats.py b/accuracy_stats.py
@@ -0,0 +1,68 @@
+import matplotlib.pyplot as plt
+import statistics as stat
+from pathlib import Path
+from tools.helper import read_specific_columns
+
+'''
+Takes the dataframe output by neighbor_accuracy.py and generates a graph (initially used to change alpha value without regenerating all the data). Additionally, the frequency of the number of neighbors and associated score is calculated and printed. 
+'''
+
+ranked = read_specific_columns("./output/data/go_neighbor_tests/neighbor_rank/all_neighbor_rank_under_100.csv", [1,2], "\t")
+
+# Sort through the data, dropping rows whose score is the "N/A" placeholder
+scored_rows = [row for row in ranked if row[1] != "N/A"]
+
+# Neighbor counts (first entry of each row), parsed as integers
+all_go_neighbor_num = [int(row[0]) for row in scored_rows]
+
+# Scores (second entry of each row), parsed as floats
+all_go_neighbor_rank = [float(row[1]) for row in scored_rows]
+
+fig, ax = plt.subplots()
+plt.scatter(all_go_neighbor_rank, all_go_neighbor_num, alpha = .05)  # x = score, y = neighbor count; markers are nearly transparent
+plt.xlabel("% Go Neighbors Accurately Predicted")
+ax.set_xlim([-5, 105])  # assumes scores are percentages in 0-100 (5 units of padding each side) - confirm
+plt.ylabel("Number of Neighbors")
+plt.savefig("./output/data/go_neighbor_tests/neighbor_rank/under_100_rank.png")  # written to disk before the window is shown
+plt.show()
+
+mean_rank = round(stat.mean(all_go_neighbor_rank),2)  # rounded to 2 decimal places
+mean_num = round(stat.mean(all_go_neighbor_num),2)
+print("Mean Rank: " + str(mean_rank))
+print("Mean Number of Neighbors: " + str(mean_num))
+
+#Counts how often each (number of neighbors, score) pair occurs; the note in the loop shows how to also skip scores of zero
+freq_dict = {}
+for row in ranked:
+    # To leave out zero scores as well, add to the test: or float(row[1]) == 0.0
+    if row[1] == "N/A":
+        continue
+    pair_label = row[0] + "_" + row[1]
+    # .get() supplies 0 the first time a label is seen
+    freq_dict[pair_label] = freq_dict.get(pair_label, 0) + 1
+
+# Ranks the frequency output from most to least frequent; returns a list of [key, count] pairs
+def order(freq_dict):
+    """Print and return every frequency entry, most frequent first.
+
+    freq_dict maps "<neighbor count>_<score>" keys to how often they occur; it
+    is left unmodified. Returns a list of [key, count] pairs, with ties kept in
+    insertion order. An empty dict prints nothing and returns [].
+    """
+    # sorted() is stable (also with reverse=True), so equal counts keep their
+    # insertion order. A single pass reports the last entry too and, unlike one
+    # recursive call per key, cannot exceed the recursion limit.
+    ranked_pairs = sorted(freq_dict.items(), key=lambda pair: pair[1], reverse=True)
+
+    for key, count in ranked_pairs:
+        s = key.split("_")
+        print(s[0] + " Neighbors with a percent accuracy of " + s[1] + " occurs " + str(count) + " times")
+    return [[key, count] for key, count in ranked_pairs]
+
+lst = order(freq_dict)
+print(lst)
+
+#The ordered list is only printed, not saved anywhere, but could be
+
+
+
diff --git a/classes/hypergeometric_distribution_class.py b/classes/hypergeometric_distribution_class.py
@@ -48,9 +48,9 @@ def predict(
         data = {
             "protein": [],
             "go_term": [],
-            "pro_pro_neighbor": [],
+            "protein_neighbor": [],
             "go_neighbor": [],
-            "go_annotated_pro_pro_neighbors": [],
+            "go_annotated_protein_neighbors": [],
             "score": [],
             "norm_score": [],
             "true_label": [],
@@ -68,36 +68,45 @@ def predict(
         ):
 
             # calculate the score for the positive set
-            positive_pro_pro_neighbor = get_neighbors(
-                G, positive_protein, "protein_protein"
+            positive_protein_neighbor = get_neighbors(
+                G, positive_protein, ["protein_protein", "regulatory"]
             )
-            positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
-            positive_go_annotated_pro_pro_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, positive_pro_pro_neighbor, positive_go
+            positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
+            positive_go_annotated_protein_neighbor_count = (
+                get_go_annotated_protein_neighbor_count(
+                    G, positive_protein_neighbor, positive_go
                 )
             )
 
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
+            pos_n = len(positive_protein_neighbor) #Number of protein neighbors the protein of interest has
             K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
-            pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
-
+            pos_k = positive_go_annotated_protein_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
+
+
+            # if K == -1:
+            #     K = 1
+
+            # print("N: ", N)
+            # print("pos_n: ", pos_n)
+            # print("K: ", K)
+            # print("pos_k: ", pos_k)
+
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
 
             # calculate the score for the negative set
-            negative_pro_pro_neighbor = get_neighbors(
+            negative_protein_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
             )
             negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
             negative_go_annotated_protein_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, negative_pro_pro_neighbor, negative_go
+                get_go_annotated_protein_neighbor_count(
+                    G, negative_protein_neighbor, negative_go
                 )
             )
 
-            neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
+            neg_n = len(negative_protein_neighbor) #Negative protein of interest neighbors
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
@@ -106,19 +115,19 @@ def predict(
             # input positive and negative score to data
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
-            data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(positive_protein_neighbor))
             data["go_neighbor"].append(len(positive_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
-                positive_go_annotated_pro_pro_neighbor_count
+            data["go_annotated_protein_neighbors"].append(
+                positive_go_annotated_protein_neighbor_count
             )
             data["score"].append(positive_score)
             data["true_label"].append(1)
 
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
-            data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(negative_protein_neighbor))
             data["go_neighbor"].append(len(negative_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
+            data["go_annotated_protein_neighbors"].append(
                 negative_go_annotated_protein_neighbor_count
             )
             data["score"].append(negative_score)
@@ -146,18 +155,18 @@ def predict(
         return y_score, y_true
 
 
-def get_neighbors(G: nx.Graph, node, edgeType):
+def get_neighbors(G: nx.DiGraph, node, edgeTypes):
     res = G.edges(node, data=True)
     neighbors = []
     for edge in res:
-        if edge[2]["type"] == edgeType:
+        if edge[2]["type"] in edgeTypes:
             neighborNode = [edge[1], edge[2]]
             neighbors.append(neighborNode)
 
     return neighbors
 
 
-def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
+def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
     count = 0
     for element in nodeList:
         if G.has_edge(element[0], goTerm):

diff --git a/classes/hypergeometric_distribution_class_V2.py b/classes/hypergeometric_distribution_class_V2.py
@@ -48,9 +48,9 @@ def predict(
         data = {
             "protein": [],
             "go_term": [],
-            "pro_pro_neighbor": [],
+            "protein_neighbor": [],
             "go_neighbor": [],
-            "go_annotated_pro_pro_neighbors": [],
+            "go_annotated_protein_neighbors": [],
             "score": [],
             "norm_score": [],
             "true_label": [],
@@ -68,56 +68,56 @@ def predict(
         ):
 
             # calculate the score for the positive set
-            positive_pro_pro_neighbor = get_neighbors(
-                G, positive_protein, "protein_protein"
+            positive_protein_neighbor = get_neighbors(
+                G, positive_protein, ["protein_protein", "regulatory"]
             )
-            positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
-            positive_go_annotated_pro_pro_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, positive_pro_pro_neighbor, positive_go
+            positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
+            positive_go_annotated_protein_neighbor_count = (
+                get_go_annotated_protein_neighbor_count(
+                    G, positive_protein_neighbor, positive_go
                 )
             )
 
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
+            pos_n = len(positive_protein_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
             K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
-            pos_k = positive_go_annotated_pro_pro_neighbor_count  + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
+            pos_k = positive_go_annotated_protein_neighbor_count  + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
 
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
 
             # calculate the score for the negative set
-            negative_pro_pro_neighbor = get_neighbors(
+            negative_protein_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
             )
             negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
             negative_go_annotated_protein_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, negative_pro_pro_neighbor, negative_go
+                get_go_annotated_protein_neighbor_count(
+                    G, negative_protein_neighbor, negative_go
                 )
             )
 
-            neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
+            neg_n = len(negative_protein_neighbor) + 1 #Negative protein of interest neighbors (includes self)
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap betweesn go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
 
             # input positive and negative score to data
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
-            data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(positive_protein_neighbor))
             data["go_neighbor"].append(len(positive_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
-                positive_go_annotated_pro_pro_neighbor_count
+            data["go_annotated_protein_neighbors"].append(
+                positive_go_annotated_protein_neighbor_count
             )
             data["score"].append(positive_score)
             data["true_label"].append(1)
 
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
-            data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(negative_protein_neighbor))
             data["go_neighbor"].append(len(negative_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
+            data["go_annotated_protein_neighbors"].append(
                 negative_go_annotated_protein_neighbor_count
             )
             data["score"].append(negative_score)
@@ -149,14 +149,14 @@ def get_neighbors(G: nx.Graph, node, edgeType):
     res = G.edges(node, data=True)
     neighbors = []
     for edge in res:
-        if edge[2]["type"] == edgeType:
+        if edge[2]["type"] in edgeType:
             neighborNode = [edge[1], edge[2]]
             neighbors.append(neighborNode)
 
     return neighbors
 
 
-def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
+def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
     count = 0
     for element in nodeList:
         if G.has_edge(element[0], goTerm):