Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Evaluation Implementation #170

Merged
merged 20 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ from spras import runner
import shutil
import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, graphspace, cytoscape
import spras.config as _config

Expand All @@ -27,13 +28,14 @@ hac_params = _config.config.hac_params
FRAMEWORK = _config.config.container_framework
print(f"Running {FRAMEWORK} containers")

# Return the dataset dictionary from the config file given the label
# Return the dataset or gold_standard dictionary from the config file given the label
def get_dataset(_datasets, label):
    """Fetch one dataset (or gold standard) dictionary by its label key."""
    entry = _datasets[label]
    return entry

algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())
dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['datasets']]

# Get algorithms that are running multiple parameter combinations
def algo_has_mult_param_combos(algo):
Expand Down Expand Up @@ -102,6 +104,9 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_evalution:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
# (if analysis is specified, these should be implicitly run).
Expand Down Expand Up @@ -153,6 +158,21 @@ rule merge_input:
dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset)
runner.merge_input(dataset_dict, output.dataset_file)

# Return all files used in the gold standard
def get_gold_standard_dependencies(wildcards):
    """Return every input file path belonging to the wildcard's gold standard."""
    gs = _config.config.gold_standards[wildcards.gold_standard]
    # Prefix each node file name with the gold standard's data directory
    return [gs["data_dir"] + SEP + node_file for node_file in gs["node_files"]]

# Merge all node files for a gold_standard into a single node table
# and serialize the resulting Evaluation object to a pickle file that
# downstream evaluation rules load with Evaluation.from_file
rule merge_gs_input:
    input: get_gold_standard_dependencies
    output: gold_standard_file = SEP.join([out_dir, '{gold_standard}-merged.pickle'])
    run:
        # Look up this gold standard's config dictionary by its label wildcard
        gold_standard_dict = get_dataset(_config.config.gold_standards, wildcards.gold_standard)
        Evaluation.merge_gold_standard_input(gold_standard_dict, output.gold_standard_file)

# The checkpoint is like a rule but can be used in dynamic workflows
# The workflow directed acyclic graph is re-evaluated after the checkpoint job runs
# If the checkpoint has not executed for the provided wildcard values, it will be run and then the rest of the
Expand Down Expand Up @@ -303,6 +323,7 @@ def collect_pathways_per_algo(wildcards):
filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params)

# Cluster the output pathways per algorithm for each dataset
rule ml_analysis_aggregate_algo:
input:
pathways = collect_pathways_per_algo
Expand All @@ -322,6 +343,28 @@ rule ml_analysis_aggregate_algo:
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Return the gold standard pickle file for a specific gold standard
def get_gold_standard_pickle_file(wildcards):
    """Return the merged pickle path for the gold standard in a dataset-gold standard pair."""
    # Pair labels have the form '<dataset>-<gold_standard>'; both labels are
    # restricted to \w+ in the config, so '-' only appears as the separator
    gold_standard_label = wildcards.dataset_gold_standard_pairs.split('-')[1]
    return SEP.join([out_dir, f'{gold_standard_label}-merged.pickle'])

# Returns the dataset corresponding to the gold standard pair
def get_dataset_label(wildcards):
    """Extract the dataset label from a '<dataset>-<gold_standard>' pair wildcard."""
    pair = wildcards.dataset_gold_standard_pairs
    return pair.split('-')[0]

# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard
# NOTE(review): expand() receives the function object get_dataset_label as the
# dataset_label value; expand substitutes concrete values, not input functions,
# so this may interpolate the function's repr instead of the dataset label —
# confirm, and consider an input function (e.g. a lambda over wildcards) instead.
rule evaluation:
    input:
        gold_standard_file = get_gold_standard_pickle_file,
        # One pathway file per algorithm-parameter combination for the paired dataset
        pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
    output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"])
    run:
        # Load the merged gold standard nodes and score every pathway against them
        node_table = Evaluation.from_file(input.gold_standard_file).node_table
        Evaluation.precision(input.pathways, node_table, output.eval_file)

# Remove the output directory and all reconstruction/analysis results in it
rule clean:
    shell: f'rm -rf {out_dir}'
16 changes: 16 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,20 @@ datasets:
# Relative path from the spras directory
data_dir: "input"

gold_standard:
-
label: gs0
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
node_files: ["gs_nodes0.txt"]
# edge_files: [] TODO: later iteration
data_dir: "input"
# Set of datasets (dataset labels) to compare with the specific gold standard dataset
datasets: ["data0"]
-
label: gs1
node_files: ["gs_nodes1.txt"]
data_dir: "input"
datasets: ["data1", "data0"]
agitter marked this conversation as resolved.
Show resolved Hide resolved

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
# is this the behavior we want?
Expand Down Expand Up @@ -157,3 +171,5 @@ analysis:
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
evaluation:
include: true
2 changes: 2 additions & 0 deletions config/egfr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,5 @@ analysis:
include: true
ml:
include: false
evaluation:
include: false
2 changes: 2 additions & 0 deletions input/gs_nodes0.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A
B
1 change: 1 addition & 0 deletions input/gs_nodes1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
C
22 changes: 22 additions & 0 deletions spras/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def __init__(self, raw_config):
self.unpack_singularity = False
# A dictionary to store configured datasets against which SPRAS will be run
self.datasets = None
# A dictionary to store configured gold standard data to compare against the output of SPRAS runs
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
self.gold_standards = None
# The hash length SPRAS will use to identify parameter combinations. Default is 7
self.hash_length = DEFAULT_HASH_LENGTH
# The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key.
Expand All @@ -95,6 +97,9 @@ def __init__(self, raw_config):
self.analysis_include_cytoscape = None
# A Boolean specifying whether to run the ML analysis
self.analysis_include_ml = None
# A Boolean specifying whether to run the Evaluation analysis
self.analysis_include_evalution = None


_raw_config = copy.deepcopy(raw_config)
self.process_config(_raw_config)
Expand Down Expand Up @@ -146,6 +151,16 @@ def process_config(self, raw_config):
if not bool(re.match(pattern, key)):
raise ValueError(f"Dataset label \'{key}\' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.")

try:
self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standard"]}
except:
self.gold_standards = {}

for key in self.gold_standards:
pattern = r'^\w+$'
if not bool(re.match(pattern, key)):
raise ValueError(f"Gold standard label \'{key}\' contains invalid values. Gold standard labels can only contain letters, numbers, or underscores.")

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
# Maps from the dataset label to the dataset list index
Expand Down Expand Up @@ -225,6 +240,13 @@ def process_config(self, raw_config):
self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"]
self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"]
self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"]

# COMMENT: the code will run correctly without this section below due to empty dict in try except above
# TODO: decide if this part is needed
if self.gold_standards == {} and self.analysis_include_evalution == True:
print("Gold standard data not provided. Evaluation analysis cannot run.")
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
self.analysis_include_evalution = False

ntalluri marked this conversation as resolved.
Show resolved Hide resolved
if 'aggregate_per_algorithm' not in self.ml_params:
self.analysis_include_ml_aggregate_algo = False
Expand Down
98 changes: 98 additions & 0 deletions spras/evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import os
import pickle as pkl
import warnings
from pathlib import Path
from typing import Iterable

import pandas as pd
from sklearn.metrics import precision_score


class Evaluation:
    """
    A gold standard dataset of known nodes used to evaluate pathway
    reconstruction outputs, plus the evaluation metrics themselves.
    """

    # Column name for the node identifier in node_table
    NODE_ID = "NODEID"

    def __init__(self, gold_standard_dict):
        """
        Build a gold standard dataset from one gold standard dictionary
        (one entry of the 'gold_standard' list in the config file).
        @param gold_standard_dict: config dictionary with 'label', 'datasets',
        'node_files', and 'data_dir' keys
        """
        self.label = None
        self.datasets = None
        self.node_table = None
        # self.edge_table = None TODO: later iteration
        self.load_files_from_dict(gold_standard_dict)

    @staticmethod
    def merge_gold_standard_input(gs_dict, gs_file):
        """
        Merge files listed for this gold standard dataset and write the dataset to disk
        @param gs_dict: gold standard dataset to process
        @param gs_file: output filename
        """
        gs_dataset = Evaluation(gs_dict)
        gs_dataset.to_file(gs_file)

    def to_file(self, file_name):
        """
        Saves gold standard object to pickle file
        @param file_name: the output pickle filename
        """
        with open(file_name, "wb") as f:
            pkl.dump(self, f)

    @classmethod
    def from_file(cls, file_name):
        """
        Loads gold standard object from a pickle file.
        Usage: gold_standard = Evaluation.from_file(pickle_file)
        NOTE: unpickling executes arbitrary code; only load files this
        workflow wrote itself.
        @param file_name: the input pickle filename
        """
        with open(file_name, "rb") as f:
            return pkl.load(f)

    def load_files_from_dict(self, gold_standard_dict):
        """
        Loads gold standard files from gold_standard_dict, which is one gold standard dataset
        dictionary from the list in the config file with the fields in the config file.
        Populates node_table, a single-column (NODEID) pandas table of nodes.

        returns: none
        """
        # COMMENT: cannot be empty, will break with a NoneType exception
        self.label = gold_standard_dict["label"]
        # COMMENT: can be empty, snakemake will not run evaluation due to
        # dataset_gold_standard_pairs in snakemake file
        self.datasets = gold_standard_dict["datasets"]

        # COMMENT: cannot be empty, snakemake will run evaluation even if empty
        node_data_files = gold_standard_dict["node_files"][0]  # TODO: single file for now

        data_loc = gold_standard_dict["data_dir"]

        # The node file has no header; assign the canonical column name
        single_node_table = pd.read_table(os.path.join(data_loc, node_data_files), header=None)
        single_node_table.columns = [self.NODE_ID]
        self.node_table = single_node_table

        # TODO: are we allowing multiple node files or single in node_files for gs
        # if yes, a for loop is needed

        # TODO: later iteration - chose between node and edge file, or allow both

    @staticmethod
    def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
        """
        Takes in file paths for a specific dataset and an associated gold standard node table.
        Calculates precision for each pathway file.
        Writes the per-pathway results to output_file as a tab-separated table.
        @param file_paths: file paths of pathway reconstruction algorithm outputs
        @param node_table: the gold standard nodes
        @param output_file: the filename to save the precision of each pathway
        """
        # Set membership gives O(1) lookups per node instead of scanning a list
        gold_nodes = set(node_table[Evaluation.NODE_ID])
        results = []

        for file in file_paths:
            df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
            # All unique nodes appearing in the reconstructed pathway
            pred_nodes = set(df["Node1"]).union(df["Node2"])
            # precision = TP / (TP + FP); equivalent to binarizing the node sets
            # and calling sklearn.metrics.precision_score(..., zero_division=0.0),
            # which returns 0.0 when the pathway predicts no nodes
            precision = len(pred_nodes & gold_nodes) / len(pred_nodes) if pred_nodes else 0.0

            results.append({"Pathway": file, "Precision": precision})

        precision_df = pd.DataFrame(results)
        precision_df.to_csv(output_file, sep="\t", index=False)
1 change: 0 additions & 1 deletion spras/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def merge_input(dataset_dict, dataset_file):
dataset = Dataset(dataset_dict)
dataset.to_file(dataset_file)


ntalluri marked this conversation as resolved.
Show resolved Hide resolved
def prepare_inputs(algorithm, data_file, filename_map):
"""
Prepare general dataset files for this algorithm
Expand Down
4 changes: 4 additions & 0 deletions test/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def get_test_config():
}
},
"datasets": [{"label":"alg1"}, {"label":"alg2"}],
"gold_standard": [{"label":"gs1"}],
"algorithms": [{"params": ["param2", "param2"]}],
"analysis": {
"summary": {
Expand All @@ -34,6 +35,9 @@ def get_test_config():
"cytoscape": {
"include": False
},
"evaluation": {
"include": False
},
},
}

Expand Down
Loading