Merge pull request #15 from Reed-CompBio/self-edge

updated data w/ no self edges
Reed-CompBio · Jul 9, 2024 · 2a7bfe6 · 2a7bfe6
2 parents a2d1de0 + 433bd84
commit 2a7bfe6
Show file tree

Hide file tree

Showing 14 changed files with 726,277 additions and 987,029 deletions.
diff --git a/classes/hypergeometric_distribution_class.py b/classes/hypergeometric_distribution_class.py
@@ -77,15 +77,11 @@ def predict(
                     G, positive_pro_pro_neighbor, positive_go
                 )
             )
-
-            c = 0
-            if G.has_edge(positive_protein, positive_protein):
-                c = 1 #Removes extra node if there is an edge to self 
 
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) - c #Number of protein neighbors the protein of interest has
+            pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
             K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
-            pos_k = positive_go_annotated_pro_pro_neighbor_count - c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
+            pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
 
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
@@ -101,11 +97,7 @@ def predict(
                 )
             )
 
-            c = 0
-            if G.has_edge(negative_protein, negative_protein):
-                c = 1
-
-            neg_n = len(negative_pro_pro_neighbor) - c #Negative protein of interest neighbors
+            neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))

diff --git a/classes/hypergeometric_distribution_class_V2.py b/classes/hypergeometric_distribution_class_V2.py
@@ -77,15 +77,11 @@ def predict(
                     G, positive_pro_pro_neighbor, positive_go
                 )
             )
-
-            c = 1
-            if G.has_edge(positive_protein, positive_protein):
-                c = 0
 
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) + c #Number of protein neighbors the protein of interest has (includes the protein of interest)
+            pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
             K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
-            pos_k = positive_go_annotated_pro_pro_neighbor_count + c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
+            pos_k = positive_go_annotated_pro_pro_neighbor_count  + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
 
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
@@ -100,12 +96,8 @@ def predict(
                     G, negative_pro_pro_neighbor, negative_go
                 )
             )
-
-            c = 1
-            if G.has_edge(negative_protein, negative_protein):
-                c = 0
 
-            neg_n = len(negative_pro_pro_neighbor) + c #Negative protein of interest neighbors (includes self)
+            neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))

diff --git a/classes/overlapping_neighbors_class.py b/classes/overlapping_neighbors_class.py
@@ -64,9 +64,6 @@ def predict(
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-            c = 0
-            if G.has_edge(positive_protein, positive_protein):
-                c = 1
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
@@ -78,19 +75,16 @@ def predict(
                 get_go_annotated_pro_pro_neighbor_count(
                     G, positive_pro_pro_neighbor, positive_go
                 )
-            ) - c
+            )
 
             if len(positive_pro_pro_neighbor) == 0:
                 positive_score = 0
             else:
                 positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
-                    len(positive_pro_pro_neighbor) -c + len(positive_go_neighbor)
+                    len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
                 )
 
             # calculate the score for the negative set
-            c = 0
-            if G.has_edge(negative_protein, negative_protein):
-                c = 1
             negative_pro_pro_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
             )
@@ -105,7 +99,7 @@ def predict(
                 negative_score = 0
             else:
                 negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
-                    len(negative_pro_pro_neighbor) - c + len(negative_go_neighbor)
+                    len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
                 )
 
             # input positive and negative score to data

diff --git a/classes/overlapping_neighbors_v2_class.py b/classes/overlapping_neighbors_v2_class.py
@@ -62,9 +62,6 @@ def predict(
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-            c = 0
-            if G.has_edge(positive_protein, positive_protein):
-                c = 1
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
@@ -74,17 +71,17 @@ def predict(
                 get_go_annotated_pro_pro_neighbor_count(
                     G, positive_pro_pro_neighbor, positive_go
                 )
-            ) - c
+            )
 
-            positive_score = positive_go_annotated_pro_pro_neighbor_count + (
-                1
-                + (len(positive_pro_pro_neighbor) - c)
-                * positive_go_annotated_pro_pro_neighbor_count
-            ) / (len(positive_go_neighbor) / 2)
+            if len(positive_go_neighbor) == 0:
+                positive_score = 0
+            else:
+                positive_score = positive_go_annotated_pro_pro_neighbor_count + (
+                    1
+                    + (len(positive_pro_pro_neighbor))
+                    * positive_go_annotated_pro_pro_neighbor_count
+                ) / (len(positive_go_neighbor) / 2)
 
-            c = 0 
-            if G.has_edge(negative_protein, negative_protein):
-                c = 1
             # calculate the score for the negative set
             negative_pro_pro_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
@@ -97,12 +94,14 @@ def predict(
             )
 
 
-
-            negative_score = negative_go_annotated_pro_pro_neighbor_count + (
-                1
-                + (len(negative_pro_pro_neighbor) - c)
-                * negative_go_annotated_pro_pro_neighbor_count
-            ) / (len(negative_go_neighbor) / 2)
+            if len(negative_go_neighbor) == 0:
+                negative_score = 0
+            else:
+                negative_score = negative_go_annotated_pro_pro_neighbor_count + (
+                    1
+                    + (len(negative_pro_pro_neighbor))
+                    * negative_go_annotated_pro_pro_neighbor_count
+                ) / (len(negative_go_neighbor) / 2)
 
             # input positive and negative score to data
             data["protein"].append(positive_protein)

diff --git a/classes/overlapping_neighbors_v3_class.py b/classes/overlapping_neighbors_v3_class.py
@@ -62,9 +62,6 @@ def predict(
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-            c = 0
-            if G.has_edge(positive_protein, positive_protein):
-                c = 1
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
@@ -74,10 +71,13 @@ def predict(
                 get_go_annotated_pro_pro_neighbor_count(
                     G, positive_pro_pro_neighbor, positive_go
                 )
-            ) - c
-            positive_score = positive_go_annotated_pro_pro_neighbor_count + (
-                1 + positive_go_annotated_pro_pro_neighbor_count
-            ) / (len(positive_go_neighbor))
+            )
+            if len(positive_go_neighbor) == 0:
+                positive_score = 0
+            else:
+                positive_score = positive_go_annotated_pro_pro_neighbor_count + (
+                    1 + positive_go_annotated_pro_pro_neighbor_count
+                ) / (len(positive_go_neighbor))
 
             # calculate the score for the negative set
             negative_pro_pro_neighbor = get_neighbors(
@@ -89,9 +89,12 @@ def predict(
                     G, negative_pro_pro_neighbor, negative_go
                 )
             ) 
-            negative_score = negative_go_annotated_pro_pro_neighbor_count + (
-                1 + negative_go_annotated_pro_pro_neighbor_count
-            ) / (len(negative_go_neighbor))
+            if len(negative_go_neighbor) == 0:
+                negative_score = 0
+            else:
+                negative_score = negative_go_annotated_pro_pro_neighbor_count + (
+                    1 + negative_go_annotated_pro_pro_neighbor_count
+                ) / (len(negative_go_neighbor))
 
             # input positive and negative score to data
             data["protein"].append(positive_protein)

diff --git a/classes/protein_degree_class.py b/classes/protein_degree_class.py
@@ -53,21 +53,14 @@ def predict(
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-
-            c = 0 
-            if G.has_edge(positive_protein, positive_protein):
-                c = 1
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
-            data["degree"].append(G.degree(positive_protein) - c)
+            data["degree"].append(G.degree(positive_protein))
             data["true_label"].append(1)
 
-            c = 0
-            if G.has_edge(negative_protein, negative_protein):
-                c = 1
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
-            data["degree"].append(G.degree(negative_protein) - c)
+            data["degree"].append(G.degree(negative_protein))
             data["true_label"].append(0)
             print_progress(i, len(positive_dataset["protein"]))
             i += 1

diff --git a/classes/protein_degree_v2_class.py b/classes/protein_degree_v2_class.py
@@ -56,23 +56,17 @@ def predict(
             negative_dataset["go"],
         ):
 
-            c = 0
-            if G.has_edge(positive_protein, positive_protein):
-                c = 1
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
             data["degree"].append(
-                len(get_neighbors(G, positive_protein, "protein_protein")) - c
+                len(get_neighbors(G, positive_protein, "protein_protein"))
             )
             data["true_label"].append(1)
 
-            c = 0
-            if G.has_edge(negative_protein, negative_protein):
-                c = 1
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
             data["degree"].append(
-                len(get_neighbors(G, negative_protein, "protein_protein")) - c
+                len(get_neighbors(G, negative_protein, "protein_protein")) 
             )
             data["true_label"].append(0)
             print_progress(i, len(positive_dataset["protein"]))