From f5c9882a3227a631894a065bcf06c2eacbbbc449 Mon Sep 17 00:00:00 2001
From: David Merrell
Date: Mon, 4 May 2020 21:43:37 -0500
Subject: [PATCH] Network predictions in GENIE3 format. Other superficial changes.

---
 run_ssps/Snakefile                | 18 +++++++++--
 run_ssps/ssps_config.yaml         |  4 ++-
 scripts/json_to_genie.py          | 53 +++++++++++++++++++++++++++++++
 scripts/preprocess_dream_prior.py | 10 ++++--
 4 files changed, 79 insertions(+), 6 deletions(-)
 create mode 100644 scripts/json_to_genie.py

diff --git a/run_ssps/Snakefile b/run_ssps/Snakefile
index ae9e89c..04980cf 100644
--- a/run_ssps/Snakefile
+++ b/run_ssps/Snakefile
@@ -20,9 +20,12 @@ configfile: "ssps_config.yaml"
 # directories
 TIMESERIES = config["ts_file"]
 PRIOR = config["prior_file"]
+NODE_NAME_FILE = config["node_name_file"]
 OUTFILE = config["prediction_file"]
 TEMPDIR = config["temp_dir"]
+PRED_JSON = os.path.join(TEMPDIR, "predictions.json")
 JULIA_PROJ_DIR = os.path.join(os.path.pardir, "julia-project")
+SCRIPT_DIR = os.path.join(os.path.pardir, "scripts")
 
 # MCMC hyperparameters
 MCMC_PARAMS = config["mcmc_hyperparams"]
@@ -38,16 +41,27 @@ rule all:
     input:
         OUTFILE
 
+
+rule to_genie:
+    input:
+        pred=PRED_JSON,
+        nn=NODE_NAME_FILE
+    output:
+        OUTFILE
+    shell:
+        "python {SCRIPT_DIR}/json_to_genie.py {input.pred} {OUTFILE} {input.nn}"
+
+
 rule postprocess_mcmc:
     input:
         pp=JULIA_PROJ_DIR+"/postprocess_samples.jl",
         raw=expand(TEMPDIR+"/{chain}.json", chain=CHAINS)
     output:
-        out=OUTFILE
+        out=PRED_JSON
     resources:
         runtime=3600,
         threads=1,
-        mem_mb=6000
+        mem_mb=3000
     shell:
         "julia --project={JULIA_PROJ_DIR} {input.pp} --chain-samples {input.raw} --output-file {output.out}"
 
diff --git a/run_ssps/ssps_config.yaml b/run_ssps/ssps_config.yaml
index 26fd582..4fed205 100644
--- a/run_ssps/ssps_config.yaml
+++ b/run_ssps/ssps_config.yaml
@@ -3,6 +3,8 @@
 # Input and output files
 ts_file: "my_timeseries.csv" # Must be TAB SEPARATED (for now)
 prior_file: "my_prior.csv" # Must be COMMA SEPARATED (for now)
+node_name_file: "node_names.json" # Optional. If included, must be a JSON list.
+                                  # Otherwise, set to empty string: "".
 prediction_file: "my_predictions.json"
 temp_dir: "temp"
 
@@ -15,5 +17,5 @@ mcmc_hyperparams:
     lambda_prop_std: 3.0
     large_indeg: 20
     n_chains: 4
-    timeout: 60
+    timeout: 300
 
diff --git a/scripts/json_to_genie.py b/scripts/json_to_genie.py
new file mode 100644
index 0000000..a7793ff
--- /dev/null
+++ b/scripts/json_to_genie.py
@@ -0,0 +1,53 @@
+import json
+import pandas as pd
+import sys
+
+
+def psets_to_edgedf(parent_sets, node_names=None):
+    """
+    Converts lists of weighted parent sets into
+    a dataframe of weighted edges.
+
+    Entry i of parent_sets[j] becomes the row
+    (node_names[i], node_names[j], weight) in columns 0, 1, 2;
+    rows are sorted by descending weight. When node_names is
+    None, nodes are named "node_0", "node_1", ...
+    """
+    if node_names is None:
+        node_names = ["node_{}".format(i) for i in range(len(parent_sets))]
+
+    # Build the frame in one shot: DataFrame.append copied the whole
+    # frame on every call and was removed in pandas 2.0.
+    rows = [(node_names[i], node_names[j], p_prob)
+            for j, ps in enumerate(parent_sets)
+            for i, p_prob in enumerate(ps)]
+    edge_df = pd.DataFrame(rows, columns=[0, 1, 2])
+    edge_df.sort_values(2, ascending=False, inplace=True)
+
+    return edge_df
+
+
+if __name__=="__main__":
+
+    # get arguments
+    pred_file = sys.argv[1]
+    out_file = sys.argv[2]
+    node_names = None
+
+    # optionally: read in a JSON file of node names
+    if len(sys.argv) > 3:
+        name_file = sys.argv[3]
+        with open(name_file, "r") as f:
+            node_names = json.load(f)
+
+    # load prediction file
+    with open(pred_file) as f:
+        preds = json.load(f)
+
+    # convert parent sets
+    # to edge-list dataframe
+    parent_sets = preds[preds["edge_conf_key"]]
+    edge_df = psets_to_edgedf(parent_sets, node_names=node_names)
+
+    # output GENIE3
+    edge_df.to_csv(out_file, sep="\t", header=False, index=False)
diff --git a/scripts/preprocess_dream_prior.py b/scripts/preprocess_dream_prior.py
index c411004..a67c3ac 100644
--- a/scripts/preprocess_dream_prior.py
+++ b/scripts/preprocess_dream_prior.py
@@ -26,7 +26,11 @@ def build_weighted_adj(eda_filename):
 
     print(adj)
 
-    return adj, antibody_map
+    antibody_ls = [0 for i in antibody_map]
+    for (name, idx) in antibody_map.items():
+        antibody_ls[idx] = name
+
+    return adj, antibody_ls
 
 
 
@@ -38,11 +42,11 @@ def build_weighted_adj(eda_filename):
     parser.add_argument("antibody_file", help="path to output JSON file containing the indices of antibodies")
     args = parser.parse_args()
 
-    adj_mat, antibody_map = build_weighted_adj(args.eda_file)
+    adj_mat, antibody_ls = build_weighted_adj(args.eda_file)
 
     df = pd.DataFrame(adj_mat)
     df.to_csv(args.output_file, sep=",", index=False, header=False)
 
-    json.dump(antibody_map, open(args.antibody_file, "w"))
+    json.dump(antibody_ls, open(args.antibody_file, "w"))
 
 