Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Evaluation Implementation #170

Merged
merged 20 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 48 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ from spras import runner
import shutil
import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, graphspace, cytoscape
import spras.config as _config

Expand All @@ -27,13 +28,15 @@ hac_params = _config.config.hac_params
FRAMEWORK = _config.config.container_framework
print(f"Running {FRAMEWORK} containers")

# Return the dataset dictionary from the config file given the label
def get_dataset(_datasets, label):
# Return the dataset or goldstandard dictionary from the config file given the label
def get_dict(_datasets, label):
    """
    Look up a dataset or gold standard entry by its label.
    @param _datasets: dict mapping labels to dataset/gold standard config dictionaries
    @param label: the label to look up
    @return: the dictionary stored under label (raises KeyError if the label is unknown)
    """
    # Removed leftover debug print that dumped the full dataset dict on every lookup
    return _datasets[label]

algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())
dataset_goldstandard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']]

# Get algorithms that are running multiple parameter combinations
def algo_has_mult_param_combos(algo):
Expand All @@ -56,7 +59,7 @@ def write_parameter_log(algorithm, param_label, logfile):

# Log the dataset contents specified in the config file in a yaml file
def write_dataset_log(dataset, logfile):
dataset_contents = get_dataset(_config.config.datasets,dataset)
dataset_contents = get_dict(_config.config.datasets,dataset)

# safe_dump gives RepresenterError for an OrderedDict
# config file has to convert the dataset from OrderedDict to dict to avoid this
Expand Down Expand Up @@ -102,6 +105,9 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_evalution:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_goldstandard_pairs,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
# (if analysis is specified, these should be implicitly run).
Expand Down Expand Up @@ -150,9 +156,24 @@ rule merge_input:
output: dataset_file = SEP.join([out_dir, '{dataset}-merged.pickle'])
run:
# Pass the dataset to PRRunner where the files will be merged and written to disk (i.e. pickled)
dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset)
dataset_dict = get_dict(_config.config.datasets, wildcards.dataset)
runner.merge_input(dataset_dict, output.dataset_file)

# Return all files used in the gold standard
def get_goldstandard_dependencies(wildcards):
    """Collect the full path of every node file this gold standard depends on."""
    gold_standard = _config.config.gold_standard[wildcards.gold_standard]
    return [gold_standard["data_dir"] + SEP + node_file for node_file in gold_standard["node_files"]]

# Merge all node files for a goldstandard into a single node table
rule merge_gs_input:
    # Listing the individual node files as inputs makes the rule rerun when any of them change
    input: get_goldstandard_dependencies
    output: goldstandard_file = SEP.join([out_dir, '{gold_standard}-merged.pickle'])
    run:
        # Pass the gold standard config entry to the runner, which loads the node files
        # and writes the resulting Evaluation object to disk (i.e. pickled)
        goldstandard_dict = get_dict(_config.config.gold_standard, wildcards.gold_standard)
        runner.merge_gold_standard_input(goldstandard_dict, output.goldstandard_file)

# The checkpoint is like a rule but can be used in dynamic workflows
# The workflow directed acyclic graph is re-evaluated after the checkpoint job runs
# If the checkpoint has not executed for the provided wildcard values, it will be run and then the rest of the
Expand Down Expand Up @@ -303,6 +324,7 @@ def collect_pathways_per_algo(wildcards):
filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params)

# Cluster the output pathways per algorithm for each dataset
rule ml_analysis_aggregate_algo:
input:
pathways = collect_pathways_per_algo
Expand All @@ -322,6 +344,28 @@ rule ml_analysis_aggregate_algo:
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Return the gold standard pickle file for a specific gold standard
def get_goldstandard_pickle_file(wildcards):
    """Map a '{dataset}-{gold_standard}' pair wildcard to the gold standard's merged pickle path."""
    gold_standard_label = wildcards.dataset_goldstandard_pairs.split('-')[1]
    return SEP.join([out_dir, f'{gold_standard_label}-merged.pickle'])

# Returns the dataset corresponding to the gold standard pair
def get_dataset_label(wildcards):
    """Extract the dataset label (the part before the first '-') from a '{dataset}-{gold_standard}' pair."""
    return wildcards.dataset_goldstandard_pairs.partition('-')[0]

# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard
rule evaluation:
    input:
        goldstandard_file = get_goldstandard_pickle_file,
        # NOTE(review): expand() does not resolve Snakemake input functions, so passing
        # get_dataset_label as dataset_label may format the function object into the path
        # rather than the dataset name — confirm this expands correctly or replace with a
        # lambda-based input function
        pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
    output: eval_file = SEP.join([out_dir, "{dataset_goldstandard_pairs}-evaluation.txt"])
    run:
        # Load the pickled gold standard node table and score every pathway output against it
        node_table = Evaluation.from_file(input.goldstandard_file).node_table
        Evaluation.precision(input.pathways, node_table, output.eval_file)

# Remove the output directory
# NOTE: deletes all reconstruction and analysis results; they must be regenerated afterward
rule clean:
    shell: f'rm -rf {out_dir}'
16 changes: 16 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,20 @@ datasets:
# Relative path from the spras directory
data_dir: "input"

gold_standard:
-
label: gs0
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
node_files: ["gs_nodes0.txt"]
# edge_files: [] TODO: later iteration
data_dir: "input"
# Set of datasets to compare with the specific gold standard dataset
datasets: ["data0"]
-
label: gs1
node_files: ["gs_nodes1.txt"]
data_dir: "input"
datasets: ["data1", "data0"]
agitter marked this conversation as resolved.
Show resolved Hide resolved

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
# is this the behavior we want?
Expand Down Expand Up @@ -156,3 +170,5 @@ analysis:
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
evaluation:
include: true
2 changes: 2 additions & 0 deletions input/gs_nodes0.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A
B
1 change: 1 addition & 0 deletions input/gs_nodes1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
C
14 changes: 14 additions & 0 deletions spras/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import copy as copy
import itertools as it
import os
import re

import numpy as np
import yaml
Expand Down Expand Up @@ -69,6 +70,8 @@ def __init__(self, raw_config):
self.unpack_singularity = False
# A dictionary to store configured datasets against which SPRAS will be run
self.datasets = None
# A dictionary to store configured gold standard data to compare against the output of SPRAS runs
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
self.gold_standard = None
ntalluri marked this conversation as resolved.
Show resolved Hide resolved
# The hash length SPRAS will use to identify parameter combinations. Default is 7
self.hash_length = DEFAULT_HASH_LENGTH
# The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key.
Expand All @@ -94,6 +97,9 @@ def __init__(self, raw_config):
self.analysis_include_cytoscape = None
# A Boolean specifying whether to run the ML analysis
self.analysis_include_ml = None
# A Boolean specifying whether to run the Evaluation analysis
self.analysis_include_evalution = None


_raw_config = copy.deepcopy(raw_config)
self.process_config(_raw_config)
Expand Down Expand Up @@ -140,6 +146,13 @@ def process_config(self, raw_config):
# Convert to dicts to simplify the yaml logging
self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]}

# TODO: turn into try except
self.gold_standard = {goldstandard["label"]: dict(goldstandard) for goldstandard in raw_config["gold_standard"]}
for key in self.gold_standard:
pattern = r'^\w+$'
if not bool(re.match(pattern, key)):
raise ValueError(f"Gold Standard label \'{key}\' contains invalid values. Gold Standard labels can only contain letters, numbers, or underscores.")
ntalluri marked this conversation as resolved.
Show resolved Hide resolved

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
# Maps from the dataset label to the dataset list index
Expand Down Expand Up @@ -219,6 +232,7 @@ def process_config(self, raw_config):
self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"]
self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"]
self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"]

if 'aggregate_per_algorithm' not in self.ml_params:
self.analysis_include_ml_aggregate_algo = False
Expand Down
87 changes: 87 additions & 0 deletions spras/evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import os
import pickle as pkl
import warnings
from pathlib import Path
from typing import Iterable

import pandas as pd
from sklearn.metrics import precision_score


class Evaluation:
    """
    A gold standard dataset of nodes used to evaluate pathway reconstruction outputs.

    Loads the node files listed in one gold standard entry of the config file and
    provides precision scoring of reconstructed pathways against those nodes.
    """

    # Column name used for the single-column gold standard node table
    NODE_ID = "NODEID"

    def __init__(self, gold_standard_dict):
        """
        Initialize from one gold standard dictionary from the config file.
        @param gold_standard_dict: dict with 'label', 'node_files', 'data_dir', and 'datasets' keys
        """
        self.label = None
        self.node_table = None
        self.datasets = None
        # self.edge_table = None TODO: later iteration
        # load_files_from_dict populates label, datasets, and node_table, so the attribute
        # defaults above must be assigned BEFORE this call. The original code assigned
        # self.datasets = None after the call, which clobbered the loaded dataset list.
        self.load_files_from_dict(gold_standard_dict)

    def to_file(self, file_name):
        """
        Saves gold standard object to pickle file
        @param file_name: the output pickle filename
        """
        with open(file_name, "wb") as f:
            pkl.dump(self, f)

    @classmethod
    def from_file(cls, file_name):
        """
        Loads gold standard object from a pickle file.
        Usage: gold_standard = Evaluation.from_file(pickle_file)
        @param file_name: the pickle file to load
        """
        with open(file_name, "rb") as f:
            return pkl.load(f)

    def load_files_from_dict(self, gold_standard_dict):
        """
        Loads gold standard files from gold_standard_dict, which is one gold standard dataset
        dictionary from the list in the config file with the fields in the config file.
        Populates node_table.

        node_table is a single column of nodes pandas table.

        returns: none
        """
        self.label = gold_standard_dict["label"]
        self.datasets = gold_standard_dict["datasets"]

        node_data_files = gold_standard_dict["node_files"][0]  # TODO: single file for now
        data_loc = gold_standard_dict["data_dir"]

        # Node files are headerless single-column text files of node identifiers
        single_node_table = pd.read_table(os.path.join(data_loc, node_data_files), header=None)
        single_node_table.columns = [self.NODE_ID]
        self.node_table = single_node_table

        # TODO: are we allowing multiple node files or single in node_files for gs
        # if yes, a for loop is needed

        # TODO: later iteration - choose between node and edge file, or allow both

    @staticmethod
    def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
        """
        Takes in file paths for a specific dataset and an associated gold standard node table.
        Calculates precision for each pathway file.
        Writes the results to output_file.
        @param file_paths: file paths of pathway reconstruction algorithm outputs
        @param node_table: the gold standard nodes
        @param output_file: the filename to save the precision of each pathway
        """
        # Sets give O(1) membership tests below; the original list versions made the
        # binary-vector construction quadratic in the number of nodes
        y_true = set(node_table[Evaluation.NODE_ID])
        results = []

        for file in file_paths:
            # Pathway outputs are tab-separated edge lists with Node1/Node2 columns
            df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
            y_pred = set(df['Node1']).union(df['Node2'])
            all_nodes = y_true.union(y_pred)
            y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
            y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]

            # zero_division=0.0 avoids a warning/NaN when a pathway predicts no nodes
            precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)

            results.append({"Pathway": file, "Precision": precision})

        precision_df = pd.DataFrame(results)
        precision_df.to_csv(output_file, sep="\t", index=False)
10 changes: 10 additions & 0 deletions spras/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from spras.allpairs import AllPairs as allpairs
from spras.dataset import Dataset
from spras.domino import DOMINO as domino
from spras.evaluation import Evaluation
from spras.meo import MEO as meo
from spras.mincostflow import MinCostFlow as mincostflow
from spras.omicsintegrator1 import OmicsIntegrator1 as omicsintegrator1
Expand Down Expand Up @@ -42,6 +43,15 @@ def merge_input(dataset_dict, dataset_file):
dataset = Dataset(dataset_dict)
dataset.to_file(dataset_file)

def merge_gold_standard_input(gs_dict, gs_file):
    """
    Merge files listed for this gold standard dataset and write the dataset to disk
    @param gs_dict: gold standard dataset to process
    @param gs_file: output filename
    """
    Evaluation(gs_dict).to_file(gs_file)


ntalluri marked this conversation as resolved.
Show resolved Hide resolved
def prepare_inputs(algorithm, data_file, filename_map):
"""
Expand Down
4 changes: 4 additions & 0 deletions test/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def get_test_config():
}
},
"datasets": [{"label":"alg1"}, {"label":"alg2"}],
"gold_standard": [{"label":"gs1"}],
"algorithms": [{"params": ["param2", "param2"]}],
"analysis": {
"summary": {
Expand All @@ -34,6 +35,9 @@ def get_test_config():
"cytoscape": {
"include": False
},
"evaluation": {
"include": False
},
},
}

Expand Down
Loading