From f5c9882a3227a631894a065bcf06c2eacbbbc449 Mon Sep 17 00:00:00 2001
From: David Merrell <dmerrell@cs.wisc.edu>
Date: Mon, 4 May 2020 21:43:37 -0500
Subject: [PATCH] Network predictions in GENIE3 format. Other superficial
 changes.

---
 run_ssps/Snakefile                | 18 +++++++++--
 run_ssps/ssps_config.yaml         |  4 ++-
 scripts/json_to_genie.py          | 53 +++++++++++++++++++++++++++++++
 scripts/preprocess_dream_prior.py | 10 ++++--
 4 files changed, 79 insertions(+), 6 deletions(-)
 create mode 100644 scripts/json_to_genie.py

diff --git a/run_ssps/Snakefile b/run_ssps/Snakefile
index ae9e89c..04980cf 100644
--- a/run_ssps/Snakefile
+++ b/run_ssps/Snakefile
@@ -20,9 +20,12 @@ configfile: "ssps_config.yaml"
 # directories
 TIMESERIES = config["ts_file"]
 PRIOR = config["prior_file"]
+NODE_NAME_FILE = config["node_name_file"]
 OUTFILE = config["prediction_file"]
 TEMPDIR = config["temp_dir"]
+PRED_JSON = os.path.join(TEMPDIR, "predictions.json")
 JULIA_PROJ_DIR = os.path.join(os.path.pardir, "julia-project")
+SCRIPT_DIR = os.path.join(os.path.pardir, "scripts")
 
 # MCMC hyperparameters
 MCMC_PARAMS = config["mcmc_hyperparams"]
@@ -38,16 +41,27 @@ rule all:
     input:
         OUTFILE
 
+
+# Run json_to_genie.py on PRED_JSON (plus the node-name file) to write OUTFILE.
+rule to_genie:
+    input: 
+        pred=PRED_JSON,
+        nn=NODE_NAME_FILE
+    output:
+        OUTFILE
+    shell:
+        "python {SCRIPT_DIR}/json_to_genie.py {input.pred} {OUTFILE} {input.nn}"
+
 rule postprocess_mcmc:
     input:
         pp=JULIA_PROJ_DIR+"/postprocess_samples.jl",
         raw=expand(TEMPDIR+"/{chain}.json", chain=CHAINS)
     output:
-        out=OUTFILE
+        out=PRED_JSON
     resources:
         runtime=3600,
         threads=1,
-        mem_mb=6000
+        mem_mb=3000
     shell:
         "julia --project={JULIA_PROJ_DIR} {input.pp} --chain-samples {input.raw}  --output-file {output.out}"
 
diff --git a/run_ssps/ssps_config.yaml b/run_ssps/ssps_config.yaml
index 26fd582..4fed205 100644
--- a/run_ssps/ssps_config.yaml
+++ b/run_ssps/ssps_config.yaml
@@ -3,6 +3,8 @@
 # Input and output files
 ts_file: "my_timeseries.csv" # Must be TAB SEPARATED (for now)
 prior_file: "my_prior.csv"   # Must be COMMA SEPARATED (for now)
+node_name_file: "node_names.json" # Optional. If given, must be a JSON list of node names.
+                                  # To omit it, set this to the empty string: "".
 prediction_file: "my_predictions.json" 
 
 temp_dir: "temp"
@@ -15,5 +17,5 @@ mcmc_hyperparams:
     lambda_prop_std: 3.0 
     large_indeg: 20
     n_chains: 4
-    timeout: 60 
+    timeout: 300 
 
diff --git a/scripts/json_to_genie.py b/scripts/json_to_genie.py
new file mode 100644
index 0000000..a7793ff
--- /dev/null
+++ b/scripts/json_to_genie.py
@@ -0,0 +1,53 @@
+import json
+import pandas as pd
+import sys
+
+"""
+Converts lists of weighted parent sets into 
+a dataframe of weighted edges
+"""
+def psets_to_edgedf(parent_sets, node_names=None):
+    # parent_sets: list whose j-th entry holds one weight per inner index i.
+    # node_names: optional labels for the indices; default is "node_<index>".
+    # Returns a DataFrame with columns 0 = node_names[i], 1 = node_names[j],
+    # 2 = parent_sets[j][i], sorted by descending weight.
+    if node_names is None:
+        node_names = ["node_{}".format(i) for i in range(len(parent_sets))]
+
+    # Collect every row first: DataFrame.append copied the whole frame on each
+    # call (quadratic) and was removed in pandas 2.0.
+    rows = [(node_names[i], node_names[j], p_prob)
+            for j, ps in enumerate(parent_sets)
+            for i, p_prob in enumerate(ps)]
+
+    # Explicit column labels keep the sort valid even when there are no rows.
+    edge_df = pd.DataFrame(rows, columns=[0, 1, 2])
+    return edge_df.sort_values(2, ascending=False)
+
+
+if __name__=="__main__":
+
+    # usage: json_to_genie.py PRED_JSON OUT_FILE [NODE_NAME_JSON]
+    cli_args = sys.argv[1:]
+    pred_file, out_file = cli_args[0], cli_args[1]
+
+    # a third argument, when present, is a JSON file of node names
+    node_names = None
+    if len(cli_args) > 2:
+        with open(cli_args[2], "r") as names_handle:
+            node_names = json.load(names_handle)
+
+    # read the JSON predictions
+    with open(pred_file) as pred_handle:
+        preds = json.load(pred_handle)
+
+    # the value stored under "edge_conf_key" is itself the key
+    # of the entry holding the weighted parent sets
+    conf_key = preds["edge_conf_key"]
+    parent_sets = preds[conf_key]
+    edge_df = psets_to_edgedf(parent_sets, node_names=node_names)
+
+    # write the edges tab-separated, with no header row or index column
+    edge_df.to_csv(out_file, sep="\t", header=False, index=False)
+
+
diff --git a/scripts/preprocess_dream_prior.py b/scripts/preprocess_dream_prior.py
index c411004..a67c3ac 100644
--- a/scripts/preprocess_dream_prior.py
+++ b/scripts/preprocess_dream_prior.py
@@ -26,7 +26,11 @@ def build_weighted_adj(eda_filename):
 
     print(adj)
 
-    return adj, antibody_map
+    antibody_ls = [0 for i in antibody_map]
+    for (name, idx) in antibody_map.items():
+        antibody_ls[idx] = name
+
+    return adj, antibody_ls
 
 
 
@@ -38,11 +42,11 @@ def build_weighted_adj(eda_filename):
     parser.add_argument("antibody_file", help="path to output JSON file containing the indices of antibodies")
     args = parser.parse_args()
 
-    adj_mat, antibody_map = build_weighted_adj(args.eda_file)
+    adj_mat, antibody_ls = build_weighted_adj(args.eda_file)
 
     df = pd.DataFrame(adj_mat)
     df.to_csv(args.output_file, sep=",", index=False, header=False) 
 
-    json.dump(antibody_map, open(args.antibody_file, "w"))
+    json.dump(antibody_ls, open(args.antibody_file, "w"))