diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index ed0d5b56..19a8bd14 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -148,14 +148,20 @@ def parse_output(raw_pathway_file, standardized_pathway_file): """ # Omicsintegrator2 returns a single line file if no network is found num_lines = sum(1 for line in open(raw_pathway_file)) + # OmicsIntegrator2 can produce corrupted output; these are the expected column names in sorted order + sorted_correct_column_names = ['cost', 'in_solution', 'protein1', 'protein2'] # the order of edge attributes in the NetworkX graph is not guaranteed. + if num_lines < 2: df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction']) else: df = pd.read_csv(raw_pathway_file, sep='\t', header=0) - df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line - df = df.take([0, 1], axis=1) - df = add_rank_column(df) - df = reinsert_direction_col_undirected(df) - df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + if sorted(df.columns) == sorted_correct_column_names: # if column header names are all correct + df = df[df['in_solution'] == True] # the 'in_solution' column exists when the forest is not empty. + df = df.take([0, 1], axis=1) # the first two columns in the df will be 'protein1' and 'protein2', followed by the edge attributes. 
+ df = add_rank_column(df) + df = reinsert_direction_col_undirected(df) + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + else: # corrupted data + df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction']) df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/test/parse-outputs/input/omicsintegrator-edge-cases/omicsintegrator2-miss-insolution-raw-pathway.txt b/test/parse-outputs/input/omicsintegrator-edge-cases/omicsintegrator2-miss-insolution-raw-pathway.txt new file mode 100644 index 00000000..6ed53c89 --- /dev/null +++ b/test/parse-outputs/input/omicsintegrator-edge-cases/omicsintegrator2-miss-insolution-raw-pathway.txt @@ -0,0 +1,3 @@ +protein1 protein2 cost +B A 0.52 +B C 0.73 \ No newline at end of file diff --git a/test/parse-outputs/input/omicsintegrator-edge-cases/omicsintegrator2-wrong-order-raw-pathway.txt b/test/parse-outputs/input/omicsintegrator-edge-cases/omicsintegrator2-wrong-order-raw-pathway.txt new file mode 100644 index 00000000..bde8f08c --- /dev/null +++ b/test/parse-outputs/input/omicsintegrator-edge-cases/omicsintegrator2-wrong-order-raw-pathway.txt @@ -0,0 +1,3 @@ +protein1 protein2 in_solution cost +B A True 0.52 +B C True 0.73 \ No newline at end of file diff --git a/test/parse-outputs/test_parse_outputs.py b/test/parse-outputs/test_parse_outputs.py index 60763d13..7dfe270e 100644 --- a/test/parse-outputs/test_parse_outputs.py +++ b/test/parse-outputs/test_parse_outputs.py @@ -6,6 +6,7 @@ INDIR = "test/parse-outputs/input/" OUTDIR = "test/parse-outputs/output/" EXPDIR = "test/parse-outputs/expected/" +OI2_EDGE_CASES_INDIR = 'test/parse-outputs/input/omicsintegrator-edge-cases/' # DOMINO input is the concatenated module_0.html and module_1.html file from # the DOMINO output of the network dip.sif and the nodes tnfa_active_genes_file.txt @@ -37,3 +38,17 @@ def test_empty_file(self): runner.parse_output(algo, test_file, out_file) assert filecmp.cmp(OUTDIR + f"{algo}-empty-pathway.txt", EXPDIR + 
f"empty-pathway-expected.txt", shallow=False) + + def test_oi2_miss_insolution(self): + test_file = OI2_EDGE_CASES_INDIR + f"omicsintegrator2-miss-insolution-raw-pathway.txt" + out_file = OUTDIR + f"omicsintegrator2-miss-insolution-pathway.txt" + + runner.parse_output('omicsintegrator2', test_file, out_file) + assert filecmp.cmp(out_file, EXPDIR + f"empty-pathway-expected.txt", shallow=False) + + def test_oi2_wrong_order(self): + test_file = OI2_EDGE_CASES_INDIR + f"omicsintegrator2-wrong-order-raw-pathway.txt" + out_file = OUTDIR + f"omicsintegrator2-wrong-order-pathway.txt" + + runner.parse_output('omicsintegrator2', test_file, out_file) + assert filecmp.cmp(out_file, EXPDIR + f"omicsintegrator2-pathway-expected.txt", shallow=False)