Skip to content

Commit

Permalink
optimized sampling method
Browse files Browse the repository at this point in the history
  • Loading branch information
ctrlaltaf committed Aug 30, 2024
1 parent d617b81 commit 5a6b4cb
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 17 deletions.
14 changes: 7 additions & 7 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def main():
protein_file_path = Path(dataset_directory_path, "protein.pickle")

namespace = ["molecular_function", "biological_process", "cellular_component"]
sample_size = 10
sample_size = 1000
repeats = 1
new_random_lists = True
print_graphs = True
Expand Down Expand Up @@ -147,13 +147,13 @@ def main():
protein_list = []

# Generate a standard graph using the protein-protein, regulatory, and protein-GO term interactions
# G, protein_list = create_mixed_network(
# interactome, regulatory_interactome, go_protein_pairs, go_depth_dict
# )
# export_graph_to_pickle(G, graph_file_path)
G, protein_list = create_mixed_network(
interactome, regulatory_interactome, go_protein_pairs, go_depth_dict
)
export_graph_to_pickle(G, graph_file_path)
# Creates a graph with only protein-protein edges (used for RandomWalkV4)
P, protein_list = create_only_protein_network(interactome,regulatory_interactome, go_protein_pairs, go_depth_dict)
export_graph_to_pickle(P, "./output/dataset/protein.pickle")
# P, protein_list = create_only_protein_network(interactome,regulatory_interactome, go_protein_pairs, go_depth_dict)
# export_graph_to_pickle(P, "./output/dataset/protein.pickle")
# Creates a graph with only protein-GO term edges (used for RandomWalkV5)
# D, protein_list = create_go_protein_only_network(interactome,regulatory_interactome, go_protein_pairs, go_depth_dict)
# export_graph_to_pickle(D, "./output/dataset/go_protein.pickle")
Expand Down
31 changes: 21 additions & 10 deletions tools/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def run_workflow(
name,
)

sys.exit()

for i in range(
x
): # Creates a pos/neg list each replicate then runs workflow like normal
Expand Down Expand Up @@ -735,7 +737,7 @@ def sample_data_neighbor_degree_ratio(
positive_dataset = {"protein": [], "go": []}
temp_positive_dataset = {"protein": [], "go": []}
negative_dataset = {"protein": [], "go": []}
ratio = 2
ratio = 10

# sample the data
for edge in sample(list(go_protein_pairs), sample_size):
Expand All @@ -747,13 +749,13 @@ def sample_data_neighbor_degree_ratio(
for protein, go in zip(temp_positive_dataset["protein"], temp_positive_dataset["go"]):
positive_dataset["protein"].append(protein)
positive_dataset["go"].append(go)
for k in range(ratio):
closest_neighbor = find_closest_neighbor_without_edge_degree(G, protein, go)
if closest_neighbor != None:
negative_dataset["protein"].append(closest_neighbor)
closest_neighbors = find_closest_neighbor_without_edge_degree(G, protein, go, ratio)
if closest_neighbors != None:
for k in range(ratio):
negative_dataset["protein"].append(closest_neighbors[k])
negative_dataset["go"].append(go)
print_progress(i, sample_size * ratio)
i += 1
print_progress(i, sample_size * ratio)
i += 1
positive_df = pd.DataFrame(positive_dataset)
negative_df = pd.DataFrame(negative_dataset)

Expand Down Expand Up @@ -1077,8 +1079,15 @@ def find_closest_neighbor_without_edge(G, protein, go_term):
return None


def find_closest_neighbor_without_edge_degree(G, protein, go_term):
def find_closest_neighbor_without_edge_degree(G, protein, go_term, num_samples = 1):
# print("inside closest neighbor", protein, go_term)
delta = 5
# neighbors_list = []
# for neighbor in nx.bfs_tree(G,protein):
# neighbors_list.append(neighbor)
# print(neighbor)
neighbor_list = []
i = 0
for neighbor in nx.bfs_tree(G, protein):
if (
neighbor != protein
Expand All @@ -1087,7 +1096,9 @@ def find_closest_neighbor_without_edge_degree(G, protein, go_term):
and G.degree(neighbor) <= (G.degree(protein) + delta)
and G.degree(neighbor) >= (G.degree(protein) - delta)
):
return neighbor

neighbor_list.append(neighbor)
if (len(neighbor_list) == num_samples):
return neighbor_list

# Fewer than num_samples qualifying neighbors were found (e.g. all neighbors have an edge to go_term)
return None

0 comments on commit 5a6b4cb

Please sign in to comment.