Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Evaluation Implementation #170

Merged
merged 20 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ from spras import runner
import shutil
import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, graphspace, cytoscape
import spras.config as _config

Expand All @@ -27,13 +28,14 @@ hac_params = _config.config.hac_params
FRAMEWORK = _config.config.container_framework
print(f"Running {FRAMEWORK} containers")

# Return the dataset dictionary from the config file given the label
# Return the dataset or gold_standard dictionary from the config file given the label
def get_dataset(_datasets, label):
    """Fetch one dataset (or gold standard) dictionary by its label key."""
    entry = _datasets[label]
    return entry

algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())
dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['datasets']]

# Get algorithms that are running multiple parameter combinations
def algo_has_mult_param_combos(algo):
Expand Down Expand Up @@ -102,6 +104,9 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_evalution:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
# (if analysis is specified, these should be implicitly run).
Expand Down Expand Up @@ -153,6 +158,21 @@ rule merge_input:
dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset)
runner.merge_input(dataset_dict, output.dataset_file)

# Return all files used in the gold standard
def get_gold_standard_dependencies(wildcards):
    """Return every input file path belonging to the wildcard's gold standard."""
    gs = _config.config.gold_standards[wildcards.gold_standard]
    # Prefix each node file name with the gold standard's data directory
    return [gs["data_dir"] + SEP + node_file for node_file in gs["node_files"]]

# Merge all node files for a gold_standard into a single node table
# and serialize the resulting Evaluation object to a pickle file that
# downstream evaluation rules load with Evaluation.from_file
rule merge_gs_input:
    input: get_gold_standard_dependencies
    output: gold_standard_file = SEP.join([out_dir, '{gold_standard}-merged.pickle'])
    run:
        # Look up this gold standard's config dictionary by its label wildcard
        gold_standard_dict = get_dataset(_config.config.gold_standards, wildcards.gold_standard)
        Evaluation.merge_gold_standard_input(gold_standard_dict, output.gold_standard_file)

# The checkpoint is like a rule but can be used in dynamic workflows
# The workflow directed acyclic graph is re-evaluated after the checkpoint job runs
# If the checkpoint has not executed for the provided wildcard values, it will be run and then the rest of the
Expand Down Expand Up @@ -303,6 +323,7 @@ def collect_pathways_per_algo(wildcards):
filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params)

# Cluster the output pathways per algorithm for each dataset
rule ml_analysis_aggregate_algo:
input:
pathways = collect_pathways_per_algo
Expand All @@ -322,6 +343,28 @@ rule ml_analysis_aggregate_algo:
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Return the gold standard pickle file for a specific gold standard
def get_gold_standard_pickle_file(wildcards):
    """Return the merged pickle path for the gold standard in a dataset-gold standard pair."""
    # Pair labels have the form '<dataset>-<gold_standard>'; both labels are
    # restricted to \w+ in the config, so '-' only appears as the separator
    gold_standard_label = wildcards.dataset_gold_standard_pairs.split('-')[1]
    return SEP.join([out_dir, f'{gold_standard_label}-merged.pickle'])

# Returns the dataset corresponding to the gold standard pair
def get_dataset_label(wildcards):
    """Extract the dataset label from a '<dataset>-<gold_standard>' pair wildcard."""
    pair = wildcards.dataset_gold_standard_pairs
    return pair.split('-')[0]

# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard
# NOTE(review): expand() receives the function object get_dataset_label as the
# dataset_label value; expand substitutes concrete values, not input functions,
# so this may interpolate the function's repr instead of the dataset label —
# confirm, and consider an input function (e.g. a lambda over wildcards) instead.
rule evaluation:
    input:
        gold_standard_file = get_gold_standard_pickle_file,
        # One pathway file per algorithm-parameter combination for the paired dataset
        pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
    output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"])
    run:
        # Load the merged gold standard nodes and score every pathway against them
        node_table = Evaluation.from_file(input.gold_standard_file).node_table
        Evaluation.precision(input.pathways, node_table, output.eval_file)

# Remove the output directory and all reconstruction/analysis results in it
rule clean:
    shell: f'rm -rf {out_dir}'
16 changes: 16 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,20 @@ datasets:
# Relative path from the spras directory
data_dir: "input"

gold_standard:
-
label: gs0
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
node_files: ["gs_nodes0.txt"]
# edge_files: [] TODO: later iteration
data_dir: "input"
# Set of datasets (dataset labels) to compare with the specific gold standard dataset
datasets: ["data0"]
-
label: gs1
node_files: ["gs_nodes1.txt"]
data_dir: "input"
datasets: ["data1", "data0"]
agitter marked this conversation as resolved.
Show resolved Hide resolved

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
# is this the behavior we want?
Expand Down Expand Up @@ -157,3 +171,5 @@ analysis:
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
evaluation:
include: true
2 changes: 2 additions & 0 deletions config/egfr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,5 @@ analysis:
include: true
ml:
include: false
evaluation:
include: false
2 changes: 2 additions & 0 deletions input/gs_nodes0.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A
B
1 change: 1 addition & 0 deletions input/gs_nodes1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
C
22 changes: 22 additions & 0 deletions spras/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def __init__(self, raw_config):
self.unpack_singularity = False
# A dictionary to store configured datasets against which SPRAS will be run
self.datasets = None
# A dictionary to store configured gold standard data to compare against the output of SPRAS runs
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
self.gold_standards = None
# The hash length SPRAS will use to identify parameter combinations. Default is 7
self.hash_length = DEFAULT_HASH_LENGTH
# The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key.
Expand All @@ -95,6 +97,9 @@ def __init__(self, raw_config):
self.analysis_include_cytoscape = None
# A Boolean specifying whether to run the ML analysis
self.analysis_include_ml = None
# A Boolean specifying whether to run the Evaluation analysis
self.analysis_include_evalution = None


_raw_config = copy.deepcopy(raw_config)
self.process_config(_raw_config)
Expand Down Expand Up @@ -146,6 +151,16 @@ def process_config(self, raw_config):
if not bool(re.match(pattern, key)):
raise ValueError(f"Dataset label \'{key}\' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.")

try:
self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standard"]}
except:
self.gold_standards = {}

for key in self.gold_standards:
pattern = r'^\w+$'
if not bool(re.match(pattern, key)):
raise ValueError(f"Gold standard label \'{key}\' contains invalid values. Gold standard labels can only contain letters, numbers, or underscores.")

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
# Maps from the dataset label to the dataset list index
Expand Down Expand Up @@ -225,6 +240,13 @@ def process_config(self, raw_config):
self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"]
self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"]
self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"]

# COMMENT: the code will run correctly without this section below due to empty dict in try except above
# TODO: decide if this part is needed
if self.gold_standards == {} and self.analysis_include_evalution == True:
print("Gold standard data not provided. Evaluation analysis cannot run.")
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
self.analysis_include_evalution = False

ntalluri marked this conversation as resolved.
Show resolved Hide resolved
if 'aggregate_per_algorithm' not in self.ml_params:
self.analysis_include_ml_aggregate_algo = False
Expand Down
98 changes: 98 additions & 0 deletions spras/evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import os
import pickle as pkl
import warnings
from pathlib import Path
from typing import Iterable

import pandas as pd
from sklearn.metrics import precision_score


class Evaluation:
    """
    A gold standard dataset of known nodes used to evaluate pathway
    reconstruction outputs, plus the evaluation metrics themselves.
    """

    # Column name for the node identifier in node_table
    NODE_ID = "NODEID"

    def __init__(self, gold_standard_dict):
        """
        Build a gold standard dataset from one gold standard dictionary
        (one entry of the 'gold_standard' list in the config file).
        @param gold_standard_dict: config dictionary with 'label', 'datasets',
        'node_files', and 'data_dir' keys
        """
        self.label = None
        self.datasets = None
        self.node_table = None
        # self.edge_table = None TODO: later iteration
        self.load_files_from_dict(gold_standard_dict)

    @staticmethod
    def merge_gold_standard_input(gs_dict, gs_file):
        """
        Merge files listed for this gold standard dataset and write the dataset to disk
        @param gs_dict: gold standard dataset to process
        @param gs_file: output filename
        """
        gs_dataset = Evaluation(gs_dict)
        gs_dataset.to_file(gs_file)

    def to_file(self, file_name):
        """
        Saves gold standard object to pickle file
        @param file_name: the output pickle filename
        """
        with open(file_name, "wb") as f:
            pkl.dump(self, f)

    @classmethod
    def from_file(cls, file_name):
        """
        Loads gold standard object from a pickle file.
        Usage: gold_standard = Evaluation.from_file(pickle_file)
        NOTE: unpickling executes arbitrary code; only load files this
        workflow wrote itself.
        @param file_name: the input pickle filename
        """
        with open(file_name, "rb") as f:
            return pkl.load(f)

    def load_files_from_dict(self, gold_standard_dict):
        """
        Loads gold standard files from gold_standard_dict, which is one gold standard dataset
        dictionary from the list in the config file with the fields in the config file.
        Populates node_table, a single-column (NODEID) pandas table of nodes.

        returns: none
        """
        # COMMENT: cannot be empty, will break with a NoneType exception
        self.label = gold_standard_dict["label"]
        # COMMENT: can be empty, snakemake will not run evaluation due to
        # dataset_gold_standard_pairs in snakemake file
        self.datasets = gold_standard_dict["datasets"]

        # COMMENT: cannot be empty, snakemake will run evaluation even if empty
        node_data_files = gold_standard_dict["node_files"][0]  # TODO: single file for now

        data_loc = gold_standard_dict["data_dir"]

        # The node file has no header; assign the canonical column name
        single_node_table = pd.read_table(os.path.join(data_loc, node_data_files), header=None)
        single_node_table.columns = [self.NODE_ID]
        self.node_table = single_node_table

        # TODO: are we allowing multiple node files or single in node_files for gs
        # if yes, a for loop is needed

        # TODO: later iteration - chose between node and edge file, or allow both

    @staticmethod
    def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
        """
        Takes in file paths for a specific dataset and an associated gold standard node table.
        Calculates precision for each pathway file.
        Writes the per-pathway results to output_file as a tab-separated table.
        @param file_paths: file paths of pathway reconstruction algorithm outputs
        @param node_table: the gold standard nodes
        @param output_file: the filename to save the precision of each pathway
        """
        # Set membership gives O(1) lookups per node instead of scanning a list
        gold_nodes = set(node_table[Evaluation.NODE_ID])
        results = []

        for file in file_paths:
            df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
            # All unique nodes appearing in the reconstructed pathway
            pred_nodes = set(df["Node1"]).union(df["Node2"])
            # precision = TP / (TP + FP); equivalent to binarizing the node sets
            # and calling sklearn.metrics.precision_score(..., zero_division=0.0),
            # which returns 0.0 when the pathway predicts no nodes
            precision = len(pred_nodes & gold_nodes) / len(pred_nodes) if pred_nodes else 0.0

            results.append({"Pathway": file, "Precision": precision})

        precision_df = pd.DataFrame(results)
        precision_df.to_csv(output_file, sep="\t", index=False)
1 change: 0 additions & 1 deletion spras/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def merge_input(dataset_dict, dataset_file):
dataset = Dataset(dataset_dict)
dataset.to_file(dataset_file)


ntalluri marked this conversation as resolved.
Show resolved Hide resolved
def prepare_inputs(algorithm, data_file, filename_map):
"""
Prepare general dataset files for this algorithm
Expand Down
4 changes: 4 additions & 0 deletions test/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def get_test_config():
}
},
"datasets": [{"label":"alg1"}, {"label":"alg2"}],
"gold_standard": [{"label":"gs1"}],
"algorithms": [{"params": ["param2", "param2"]}],
"analysis": {
"summary": {
Expand All @@ -34,6 +35,9 @@ def get_test_config():
"cytoscape": {
"include": False
},
"evaluation": {
"include": False
},
},
}

Expand Down
Loading