choderalab · dotsdl · Jun 15, 2021 · Dec 8, 2020 · Dec 14, 2020 · Dec 14, 2020
diff --git a/environment.yml b/environment.yml
@@ -16,6 +16,7 @@ dependencies:
   - openmmtools
   - pandas >= 1.1
   - perses
+  - plotly
   - pydantic
   - pymbar
   - python >= 3.6

diff --git a/fah_xchem/analysis/__init__.py b/fah_xchem/analysis/__init__.py
@@ -5,25 +5,31 @@
 import multiprocessing
 import os
 from typing import List, Optional
+import networkx as nx
+import numpy as np
 
 from ..fah_utils import list_results
 from ..schema import (
     AnalysisConfig,
     CompoundSeries,
     CompoundSeriesAnalysis,
+    CompoundMicrostate,
     FahConfig,
     GenAnalysis,
     PhaseAnalysis,
+    PointEstimate,
     ProjectPair,
     Transformation,
     TransformationAnalysis,
     WorkPair,
     FragalysisConfig,
+    RunStatus
 )
-from .diffnet import combine_free_energies
+from .constants import KT_KCALMOL
+from .diffnet import combine_free_energies, pIC50_to_DG
 from .exceptions import AnalysisError, DataValidationError
 from .extract_work import extract_work_pair
-from .free_energy import compute_relative_free_energy
+from .free_energy import compute_relative_free_energy, InsufficientDataError
 from .plots import generate_plots
 from .report import generate_report, gens_are_consistent
 from .structures import generate_representative_snapshots
@@ -33,7 +39,7 @@
 def analyze_phase(server: FahConfig, run: int, project: int, config: AnalysisConfig):
 
     paths = list_results(config=server, run=run, project=project)
-
+    
     if not paths:
         raise AnalysisError(f"No data found for project {project}, RUN {run}")
 
@@ -62,14 +68,24 @@ def get_gen_analysis(gen: int, works: List[WorkPair]) -> GenAnalysis:
         # TODO: round raw work output?
         return GenAnalysis(gen=gen, works=filtered_works, free_energy=free_energy)
 
+    # Analyze gens, omitting incomplete gens
+    gens = list()
+    for gen, works in works_by_gen.items():
+        try:
+            gens.append( get_gen_analysis(gen, works) )
+        except InsufficientDataError as e:
+            # It's OK if we don't have sufficient data here
+            pass
+
     return PhaseAnalysis(
         free_energy=free_energy,
-        gens=[get_gen_analysis(gen, works) for gen, works in works_by_gen.items()],
+        gens=gens,
     )
 
 
 def analyze_transformation(
     transformation: Transformation,
+    compounds: CompoundSeries,
     projects: ProjectPair,
     server: FahConfig,
     config: AnalysisConfig,
@@ -86,11 +102,37 @@ def analyze_transformation(
         complex_phase.free_energy.delta_f - solvent_phase.free_energy.delta_f
     )
 
+    # get associated DDGs between compounds, if experimentally known
+    exp_ddg = calc_exp_ddg(transformation=transformation, compounds=compounds)
+    absolute_error = (
+        abs(binding_free_energy - exp_ddg) if (exp_ddg.point is not None) else None
+    )
+
     # Check for consistency across GENS, if requested
     consistent_bool = None
     if filter_gen_consistency:
         consistent_bool = gens_are_consistent(
-            complex_phase=complex_phase, solvent_phase=solvent_phase, nsigma=3
+            complex_phase=complex_phase, solvent_phase=solvent_phase, nsigma=1
+        )
+
+        return TransformationAnalysis(
+            transformation=transformation,
+            reliable_transformation=consistent_bool,
+            binding_free_energy=binding_free_energy,
+            complex_phase=complex_phase,
+            solvent_phase=solvent_phase,
+            exp_ddg=exp_ddg,
+            absolute_error=absolute_error,
+        )
+
+    else:
+
+        return TransformationAnalysis(
+            transformation=transformation,
+            binding_free_energy=binding_free_energy,
+            complex_phase=complex_phase,
+            solvent_phase=solvent_phase,
+            exp_ddg=exp_ddg,
         )
 
     return TransformationAnalysis(
@@ -101,45 +143,212 @@ def analyze_transformation(
         solvent_phase=solvent_phase,
     )
 
+
+def calc_exp_ddg_DEPRECATED(
+    transformation: TransformationAnalysis, compounds: CompoundSeries
+):
+    """
+    Compute experimental free energy difference between two compounds, if available.
+
+    Parameters
+    ----------
+    transformation : TransformationAnalysis
+        The transformation of interest
+    compounds : CompoundSeries
+       Data for the compound series.
+
+    """
+    graph = nx.DiGraph()
+
+    # make a simple two node graph
+    # NOTE there may be a faster way of doing this
+    graph.add_edge(
+        transformation.initial_microstate,
+        transformation.final_microstate,
+    )
+
+    for compound in compounds:
+        for microstate in compound.microstates:
+            node = CompoundMicrostate(
+                compound_id=compound.metadata.compound_id,
+                microstate_id=microstate.microstate_id,
+            )
+            if node in graph:
+                graph.nodes[node]["compound"] = compound
+                graph.nodes[node]["microstate"] = microstate
+
+    for node_1, node_2, edge in graph.edges(data=True):
+        # if both nodes contain exp pIC50 calculate the free energy difference between them
+        # NOTE assume star map (node 1 is our reference)
+        try:
+            node_1_pic50 = graph.nodes[node_1]["compound"].metadata.experimental_data[
+                "pIC50"
+            ]  # ref molecule
+            node_2_pic50 = graph.nodes[node_2]["compound"].metadata.experimental_data[
+                "pIC50"
+            ]  # new molecule
+
+            n_microstates_node_1 = len(graph.nodes[node_1]["compound"].microstates)
+            n_microstates_node_2 = len(graph.nodes[node_2]["compound"].microstates)
+
+            # Get experimental DeltaDeltaG by subtracting from experimental inspiration fragment (ref)
+
+            node_1_DG = (
+                pIC50_to_DG(node_1_pic50)
+                + (0.6 * np.log(n_microstates_node_1)) / KT_KCALMOL
+            )  # TODO check this is correct
+            node_2_DG = (
+                pIC50_to_DG(node_2_pic50)
+                + (0.6 * np.log(n_microstates_node_2)) / KT_KCALMOL
+            )  # TODO check this is correct
+
+            exp_ddg_ij = node_1_DG - node_2_DG
+
+            exp_ddg_ij_err = 0.1  # TODO check this is correct
+
+        except KeyError:
+            logging.info("Failed to get experimental pIC50 value")
+            exp_ddg_ij = None
+            exp_ddg_ij_err = None
+
+    return PointEstimate(point=exp_ddg_ij, stderr=exp_ddg_ij_err)
+
+
+def calc_exp_ddg(transformation: TransformationAnalysis, compounds: CompoundSeries):
+    """
+    Compute experimental free energy difference between two compounds, if available.
+
+    NOTE: This method makes the approximation that each microstate has the same affinity as the parent compound.
+
+    TODO: Instead, solve DiffNet without experimental data and use derived DDGs between compounds (not transformations).
+
+    Parameters
+    ----------
+    transformation : TransformationAnalysis
+        The transformation of interest
+    compounds : CompoundSeries
+       Data for the compound series.
+
+    Returns
+    -------
+    ddg : PointEstimate
+        Point estimate of free energy difference for this transformation,
+        or PointEstimate(None, None) if not available.
+
+    """
+    compounds_by_microstate = {
+        microstate.microstate_id: compound
+        for compound in compounds
+        for microstate in compound.microstates
+    }
+
+    initial_experimental_data = compounds_by_microstate[
+        transformation.initial_microstate.microstate_id
+    ].metadata.experimental_data
+    final_experimental_data = compounds_by_microstate[
+        transformation.final_microstate.microstate_id
+    ].metadata.experimental_data
+
+    if ("pIC50" in initial_experimental_data) and ("pIC50" in final_experimental_data):
+        exp_ddg_ij_err = 0.2  # TODO check this is correct
+        initial_dg = PointEstimate(
+            point=pIC50_to_DG(initial_experimental_data["pIC50"]), stderr=exp_ddg_ij_err
+        )
+        final_dg = PointEstimate(
+            point=pIC50_to_DG(final_experimental_data["pIC50"]), stderr=exp_ddg_ij_err
+        )
+        error = final_dg - initial_dg
+        return error
+    else:
+        return PointEstimate(point=None, stderr=None)
+
+
 def analyze_transformation_or_warn(
     transformation: Transformation, **kwargs
 ) -> Optional[TransformationAnalysis]:
+
     try:
         return analyze_transformation(transformation, **kwargs)
     except AnalysisError as exc:
         logging.warning("Failed to analyze RUN%d: %s", transformation.run_id, exc)
         return None
 
-
-def analyze_compound_series(
+def analyze_compound_series(    
     series: CompoundSeries,
     config: AnalysisConfig,
     server: FahConfig,
     num_procs: Optional[int] = None,
 ) -> CompoundSeriesAnalysis:
+    """
+    Analyze a compound series to generate JSON.
 
+    """    
     from rich.progress import track
 
+    # TODO: Cache results and only update RUNs for which we have received new data
+
+    # Pre-filter based on which transformations have any work data
+    logging.info(f'Pre-filtering {len(series.transformations)} transformations to identify those with work data...')
+    available_transformations = [
+        transformation for transformation in series.transformations
+        if len(list_results(config=server, run=transformation.run_id, project=series.metadata.fah_projects.complex_phase)) > 0
+        and len(list_results(config=server, run=transformation.run_id, project=series.metadata.fah_projects.solvent_phase)) > 0
+    ]
+    #available_transformations = series.transformations[:50]
+
+    # Process compound series in parallel
+    logging.info(f'Processing {len(available_transformations)} / {len(series.transformations)} available transformations in parallel...')
     with multiprocessing.Pool(num_procs) as pool:
         results_iter = pool.imap_unordered(
             partial(
                 analyze_transformation_or_warn,
                 projects=series.metadata.fah_projects,
                 server=server,
                 config=config,
+                compounds=series.compounds,
             ),
-            series.transformations,
+            available_transformations,            
         )
         transformations = [
             result
             for result in track(
                 results_iter,
-                total=len(series.transformations),
+                total=len(available_transformations),
                 description="Computing transformation free energies",
             )
             if result is not None
         ]
 
+    # Reprocess transformation experimental errors to only include most favorable transformation
+    # NOTE: This is a hack, and should be replaced by a more robust method for accounting for racemic mixtures
+    # Compile list of all microstate transformations for each compound
+    compound_ddgs = dict()
+    for transformation in transformations:
+        compound_id = transformation.transformation.final_microstate.compound_id
+        if compound_id in compound_ddgs:
+            compound_ddgs[compound_id].append(transformation.binding_free_energy.point)
+        else:
+            compound_ddgs[compound_id] = [transformation.binding_free_energy.point]
+    # Collapse to a single estimate
+    from scipy.special import logsumexp
+    for compound_id, ddgs in compound_ddgs.items():
+        compound_ddgs[compound_id] = -logsumexp(-np.array(ddgs)) + np.log(len(ddgs))
+    # Regenerate list of transformations
+    for index, t in enumerate(transformations):
+        if (t.exp_ddg is None) or (t.exp_ddg.point is None):
+            continue
+        compound_id = t.transformation.final_microstate.compound_id
+        absolute_error_point = abs(t.exp_ddg.point - compound_ddgs[compound_id])
+        transformations[index] = TransformationAnalysis(
+            transformation=t.transformation,
+            reliable_transformation=t.reliable_transformation,
+            binding_free_energy=t.binding_free_energy,
+            complex_phase=t.complex_phase,
+            solvent_phase=t.solvent_phase,
+            exp_ddg=t.exp_ddg,            
+            absolute_error=PointEstimate(point=absolute_error_point, stderr=t.absolute_error.stderr),
+        )
+
     # Sort transformations by RUN
     # transformations.sort(key=lambda transformation_analysis : transformation_analysis.transformation.run_id)
     # Sort transformations by free energy difference
@@ -189,10 +398,17 @@ def generate_artifacts(
         data_dir, f"PROJ{series.metadata.fah_projects.complex_phase}"
     )
 
+    # Pre-filter based on which transformations have any data
+    available_transformations = [
+        transformation for transformation in series.transformations
+        if transformation.binding_free_energy is not None
+        and transformation.binding_free_energy.point is not None
+    ]
+
     if snapshots:
         logging.info("Generating representative snapshots")
         generate_representative_snapshots(
-            transformations=series.transformations,
+            transformations=available_transformations,
             project_dir=complex_project_dir,
             project_data_dir=complex_data_dir,
             output_dir=os.path.join(output_dir, "transformations"),

diff --git a/fah_xchem/analysis/free_energy.py b/fah_xchem/analysis/free_energy.py
@@ -172,7 +172,7 @@ def compute_relative_free_energy(
 
     # TODO: Flag problematic RUN/CLONE/GEN trajectories for further analysis and debugging
     works = _filter_work_values(all_works)
-
+    
     if len(works) < (min_num_work_values or 1):
         raise InsufficientDataError(
             f"Need at least {min_num_work_values} good work values for analysis, "