Skip to content

Commit

Permalink
Merge pull request #15 from Reed-CompBio/self-edge
Browse files Browse the repository at this point in the history
updated data w/ no self edges
  • Loading branch information
amnorman authored Jul 9, 2024
2 parents a2d1de0 + 433bd84 commit 2a7bfe6
Show file tree
Hide file tree
Showing 14 changed files with 726,277 additions and 987,029 deletions.
14 changes: 3 additions & 11 deletions classes/hypergeometric_distribution_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,11 @@ def predict(
G, positive_pro_pro_neighbor, positive_go
)
)

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1 #Removes extra node if there is an edge to self

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) - c #Number of protein neighbors the protein of interest has
pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
pos_k = positive_go_annotated_pro_pro_neighbor_count - c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
Expand All @@ -101,11 +97,7 @@ def predict(
)
)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1

neg_n = len(negative_pro_pro_neighbor) - c #Negative protein of interest neighbors
neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
Expand Down
14 changes: 3 additions & 11 deletions classes/hypergeometric_distribution_class_V2.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,11 @@ def predict(
G, positive_pro_pro_neighbor, positive_go
)
)

c = 1
if G.has_edge(positive_protein, positive_protein):
c = 0

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) + c #Number of protein neighbors the protein of interest has (includes the protein of interest)
pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
pos_k = positive_go_annotated_pro_pro_neighbor_count + c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
Expand All @@ -100,12 +96,8 @@ def predict(
G, negative_pro_pro_neighbor, negative_go
)
)

c = 1
if G.has_edge(negative_protein, negative_protein):
c = 0

neg_n = len(negative_pro_pro_neighbor) + c #Negative protein of interest neighbors (includes self)
neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
Expand Down
12 changes: 3 additions & 9 deletions classes/overlapping_neighbors_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,6 @@ def predict(
negative_dataset["protein"],
negative_dataset["go"],
):
c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
Expand All @@ -78,19 +75,16 @@ def predict(
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
)
) - c
)

if len(positive_pro_pro_neighbor) == 0:
positive_score = 0
else:
positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
len(positive_pro_pro_neighbor) -c + len(positive_go_neighbor)
len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
)

# calculate the score for the negative set
c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
negative_pro_pro_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
Expand All @@ -105,7 +99,7 @@ def predict(
negative_score = 0
else:
negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
len(negative_pro_pro_neighbor) - c + len(negative_go_neighbor)
len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
)

# input positive and negative score to data
Expand Down
35 changes: 17 additions & 18 deletions classes/overlapping_neighbors_v2_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,6 @@ def predict(
negative_dataset["protein"],
negative_dataset["go"],
):
c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
Expand All @@ -74,17 +71,17 @@ def predict(
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
)
) - c
)

positive_score = positive_go_annotated_pro_pro_neighbor_count + (
1
+ (len(positive_pro_pro_neighbor) - c)
* positive_go_annotated_pro_pro_neighbor_count
) / (len(positive_go_neighbor) / 2)
if len(positive_go_neighbor) == 0:
positive_score = 0
else:
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
1
+ (len(positive_pro_pro_neighbor))
* positive_go_annotated_pro_pro_neighbor_count
) / (len(positive_go_neighbor) / 2)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
Expand All @@ -97,12 +94,14 @@ def predict(
)



negative_score = negative_go_annotated_pro_pro_neighbor_count + (
1
+ (len(negative_pro_pro_neighbor) - c)
* negative_go_annotated_pro_pro_neighbor_count
) / (len(negative_go_neighbor) / 2)
if len(negative_go_neighbor) == 0:
negative_score = 0
else:
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
1
+ (len(negative_pro_pro_neighbor))
* negative_go_annotated_pro_pro_neighbor_count
) / (len(negative_go_neighbor) / 2)

# input positive and negative score to data
data["protein"].append(positive_protein)
Expand Down
23 changes: 13 additions & 10 deletions classes/overlapping_neighbors_v3_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,6 @@ def predict(
negative_dataset["protein"],
negative_dataset["go"],
):
c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
Expand All @@ -74,10 +71,13 @@ def predict(
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
)
) - c
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
1 + positive_go_annotated_pro_pro_neighbor_count
) / (len(positive_go_neighbor))
)
if len(positive_go_neighbor) == 0:
positive_score = 0
else:
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
1 + positive_go_annotated_pro_pro_neighbor_count
) / (len(positive_go_neighbor))

# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
Expand All @@ -89,9 +89,12 @@ def predict(
G, negative_pro_pro_neighbor, negative_go
)
)
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
1 + negative_go_annotated_pro_pro_neighbor_count
) / (len(negative_go_neighbor))
if len(negative_go_neighbor) == 0:
negative_score = 0
else:
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
1 + negative_go_annotated_pro_pro_neighbor_count
) / (len(negative_go_neighbor))

# input positive and negative score to data
data["protein"].append(positive_protein)
Expand Down
11 changes: 2 additions & 9 deletions classes/protein_degree_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,21 +53,14 @@ def predict(
negative_dataset["protein"],
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["degree"].append(G.degree(positive_protein) - c)
data["degree"].append(G.degree(positive_protein))
data["true_label"].append(1)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["degree"].append(G.degree(negative_protein) - c)
data["degree"].append(G.degree(negative_protein))
data["true_label"].append(0)
print_progress(i, len(positive_dataset["protein"]))
i += 1
Expand Down
10 changes: 2 additions & 8 deletions classes/protein_degree_v2_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,23 +56,17 @@ def predict(
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["degree"].append(
len(get_neighbors(G, positive_protein, "protein_protein")) - c
len(get_neighbors(G, positive_protein, "protein_protein"))
)
data["true_label"].append(1)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["degree"].append(
len(get_neighbors(G, negative_protein, "protein_protein")) - c
len(get_neighbors(G, negative_protein, "protein_protein"))
)
data["true_label"].append(0)
print_progress(i, len(positive_dataset["protein"]))
Expand Down
Loading

0 comments on commit 2a7bfe6

Please sign in to comment.