Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add "Retrospective" tab to dashboard #121

Merged
merged 34 commits into from
Jun 15, 2021
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
eac01b3
create basic outline for new tab
Dec 8, 2020
eb15091
create basic retrospective analysis tab
glass-w Dec 14, 2020
7be597e
change index skeleton for retrospective analysis
glass-w Dec 14, 2020
34b305d
set new column skeleton and use compound ID only
glass-w Dec 15, 2020
89f50a0
add skeleton function for extracting relatives
glass-w Dec 16, 2020
d740ae0
move function, update schema, test html, tidy up
glass-w Dec 17, 2020
ba32fae
add simple arsenic plots WIP
glass-w Dec 17, 2020
46d8dbc
change func name, add microstate check, tidy plots
glass-w Dec 17, 2020
4ccdc0f
wrangle exp values to fewer decimal places
glass-w Dec 18, 2020
94eed6f
fix plot, minor fixes
glass-w Dec 18, 2020
02a4f15
fix analysis.json formatting
glass-w Dec 18, 2020
addd3e4
format with black, update tab and plots
glass-w Dec 18, 2020
c31f0b8
fix formatting
glass-w Dec 18, 2020
45b2961
fix column format
glass-w Dec 18, 2020
c438652
use 0.1 error
glass-w Jan 11, 2021
54f1a8f
Merge branch 'sprint-5' into retro-tab
glass-w Jan 11, 2021
63d03fd
add plotly
glass-w Jan 12, 2021
fe76b7d
Bugfixes galore!
jchodera Jan 13, 2021
11b6724
Merge branch 'retro-tab' of https://github.com/choderalab/covid-moons…
jchodera Jan 13, 2021
2513495
Add reliable transformations plot to retrospective tab.
jchodera Jan 13, 2021
f2ea714
format with Black
glass-w Jan 13, 2021
b52279d
Handle InsufficientDataError
jchodera Jan 24, 2021
ba64518
Merge branch 'retro-tab' of https://github.com/choderalab/covid-moons…
jchodera Jan 24, 2021
585b87d
Accelerate analysis of incomplete transformations by skipping RUNs
jchodera Jan 24, 2021
2a72392
Accept _ in PostEra compound IDs
jchodera Jan 27, 2021
d3e3a88
Suppress nonpolar hydrogens by default
jchodera Jan 29, 2021
db1474d
Bugfix for sorting on webpages
jchodera Jan 30, 2021
1a4d290
Fix regex when multiple microstate suffixes are in use
jchodera Jan 30, 2021
cfb2604
Some cleanup
jchodera Jan 31, 2021
c30b862
Compress ZIP files
jchodera Feb 6, 2021
d44a60c
Correctly strip hydrogens from PDB files
jchodera Feb 6, 2021
18f6ba8
Small fixes and notes
dotsdl Jun 15, 2021
cea6944
Modifications in light of feedback from @glass-w, @jchodera
dotsdl Jun 15, 2021
8c0d75f
Merge branch 'master' into retro-tab
dotsdl Jun 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies:
- openmmtools
- pandas >= 1.1
- perses
- plotly
- pydantic
- pymbar
- python >= 3.6
Expand Down
236 changes: 226 additions & 10 deletions fah_xchem/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,31 @@
import multiprocessing
import os
from typing import List, Optional
import networkx as nx
import numpy as np

from ..fah_utils import list_results
from ..schema import (
AnalysisConfig,
CompoundSeries,
CompoundSeriesAnalysis,
CompoundMicrostate,
FahConfig,
GenAnalysis,
PhaseAnalysis,
PointEstimate,
ProjectPair,
Transformation,
TransformationAnalysis,
WorkPair,
FragalysisConfig,
RunStatus
)
from .diffnet import combine_free_energies
from .constants import KT_KCALMOL
from .diffnet import combine_free_energies, pIC50_to_DG
from .exceptions import AnalysisError, DataValidationError
from .extract_work import extract_work_pair
from .free_energy import compute_relative_free_energy
from .free_energy import compute_relative_free_energy, InsufficientDataError
from .plots import generate_plots
from .report import generate_report, gens_are_consistent
from .structures import generate_representative_snapshots
Expand All @@ -33,7 +39,7 @@
def analyze_phase(server: FahConfig, run: int, project: int, config: AnalysisConfig):

paths = list_results(config=server, run=run, project=project)

if not paths:
raise AnalysisError(f"No data found for project {project}, RUN {run}")

Expand Down Expand Up @@ -62,14 +68,24 @@ def get_gen_analysis(gen: int, works: List[WorkPair]) -> GenAnalysis:
# TODO: round raw work output?
return GenAnalysis(gen=gen, works=filtered_works, free_energy=free_energy)

# Analyze gens, omitting incomplete gens
gens = list()
for gen, works in works_by_gen.items():
try:
gens.append( get_gen_analysis(gen, works) )
except InsufficientDataError as e:
# It's OK if we don't have sufficient data here
pass

return PhaseAnalysis(
free_energy=free_energy,
gens=[get_gen_analysis(gen, works) for gen, works in works_by_gen.items()],
gens=gens,
)


def analyze_transformation(
transformation: Transformation,
compounds: CompoundSeries,
projects: ProjectPair,
server: FahConfig,
config: AnalysisConfig,
Expand All @@ -86,11 +102,37 @@ def analyze_transformation(
complex_phase.free_energy.delta_f - solvent_phase.free_energy.delta_f
)

# get associated DDGs between compounds, if experimentally known
exp_ddg = calc_exp_ddg(transformation=transformation, compounds=compounds)
absolute_error = (
abs(binding_free_energy - exp_ddg) if (exp_ddg.point is not None) else None
)

# Check for consistency across GENS, if requested
consistent_bool = None
if filter_gen_consistency:
consistent_bool = gens_are_consistent(
complex_phase=complex_phase, solvent_phase=solvent_phase, nsigma=3
complex_phase=complex_phase, solvent_phase=solvent_phase, nsigma=1
)

return TransformationAnalysis(
transformation=transformation,
reliable_transformation=consistent_bool,
binding_free_energy=binding_free_energy,
complex_phase=complex_phase,
solvent_phase=solvent_phase,
exp_ddg=exp_ddg,
absolute_error=absolute_error,
)

else:

return TransformationAnalysis(
transformation=transformation,
binding_free_energy=binding_free_energy,
complex_phase=complex_phase,
solvent_phase=solvent_phase,
exp_ddg=exp_ddg,
)

return TransformationAnalysis(
Expand All @@ -101,45 +143,212 @@ def analyze_transformation(
solvent_phase=solvent_phase,
)


def calc_exp_ddg_DEPRECATED(
dotsdl marked this conversation as resolved.
Show resolved Hide resolved
transformation: TransformationAnalysis, compounds: CompoundSeries
):
"""
Compute experimental free energy difference between two compounds, if available.

Parameters
----------
transformation : TransformationAnalysis
The transformation of interest
compounds : CompoundSeries
Data for the compound series.

"""
graph = nx.DiGraph()

# make a simple two node graph
# NOTE there may be a faster way of doing this
graph.add_edge(
transformation.initial_microstate,
transformation.final_microstate,
)

for compound in compounds:
for microstate in compound.microstates:
node = CompoundMicrostate(
compound_id=compound.metadata.compound_id,
microstate_id=microstate.microstate_id,
)
if node in graph:
graph.nodes[node]["compound"] = compound
graph.nodes[node]["microstate"] = microstate

for node_1, node_2, edge in graph.edges(data=True):
# if both nodes contain exp pIC50 calculate the free energy difference between them
# NOTE assume star map (node 1 is our reference)
try:
node_1_pic50 = graph.nodes[node_1]["compound"].metadata.experimental_data[
"pIC50"
] # ref molecule
node_2_pic50 = graph.nodes[node_2]["compound"].metadata.experimental_data[
"pIC50"
] # new molecule

n_microstates_node_1 = len(graph.nodes[node_1]["compound"].microstates)
n_microstates_node_2 = len(graph.nodes[node_2]["compound"].microstates)

# Get experimental DeltaDeltaG by subtracting from experimental inspiration fragment (ref)

node_1_DG = (
pIC50_to_DG(node_1_pic50)
+ (0.6 * np.log(n_microstates_node_1)) / KT_KCALMOL
) # TODO check this is correct
node_2_DG = (
pIC50_to_DG(node_2_pic50)
+ (0.6 * np.log(n_microstates_node_2)) / KT_KCALMOL
) # TODO check this is correct

exp_ddg_ij = node_1_DG - node_2_DG

exp_ddg_ij_err = 0.1 # TODO check this is correct

except KeyError:
logging.info("Failed to get experimental pIC50 value")
exp_ddg_ij = None
exp_ddg_ij_err = None

return PointEstimate(point=exp_ddg_ij, stderr=exp_ddg_ij_err)


def calc_exp_ddg(transformation: TransformationAnalysis, compounds: CompoundSeries):
"""
Compute experimental free energy difference between two compounds, if available.

NOTE: This method makes the approximation that each microstate has the same affinity as the parent compound.

TODO: Instead, solve DiffNet without experimental data and use derived DDGs between compounds (not transformations).

Parameters
----------
transformation : TransformationAnalysis
The transformation of interest
compounds : CompoundSeries
Data for the compound series.

Returns
-------
ddg : PointEstimate
Point estimate of free energy difference for this transformation,
or PointEstimate(None, None) if not available.

"""
compounds_by_microstate = {
microstate.microstate_id: compound
for compound in compounds
for microstate in compound.microstates
}

initial_experimental_data = compounds_by_microstate[
transformation.initial_microstate.microstate_id
].metadata.experimental_data
final_experimental_data = compounds_by_microstate[
transformation.final_microstate.microstate_id
].metadata.experimental_data

if ("pIC50" in initial_experimental_data) and ("pIC50" in final_experimental_data):
exp_ddg_ij_err = 0.2 # TODO check this is correct
dotsdl marked this conversation as resolved.
Show resolved Hide resolved
initial_dg = PointEstimate(
point=pIC50_to_DG(initial_experimental_data["pIC50"]), stderr=exp_ddg_ij_err
)
final_dg = PointEstimate(
point=pIC50_to_DG(final_experimental_data["pIC50"]), stderr=exp_ddg_ij_err
)
error = final_dg - initial_dg
return error
else:
return PointEstimate(point=None, stderr=None)


def analyze_transformation_or_warn(
transformation: Transformation, **kwargs
) -> Optional[TransformationAnalysis]:

try:
return analyze_transformation(transformation, **kwargs)
except AnalysisError as exc:
logging.warning("Failed to analyze RUN%d: %s", transformation.run_id, exc)
return None


def analyze_compound_series(
def analyze_compound_series(
series: CompoundSeries,
config: AnalysisConfig,
server: FahConfig,
num_procs: Optional[int] = None,
) -> CompoundSeriesAnalysis:
"""
Analyze a compound series to generate JSON.

"""
from rich.progress import track

# TODO: Cache results and only update RUNs for which we have received new data

# Pre-filter based on which transformations have any work data
logging.info(f'Pre-filtering {len(series.transformations)} transformations to identify those with work data...')
available_transformations = [
transformation for transformation in series.transformations
if len(list_results(config=server, run=transformation.run_id, project=series.metadata.fah_projects.complex_phase)) > 0
and len(list_results(config=server, run=transformation.run_id, project=series.metadata.fah_projects.solvent_phase)) > 0
]
#available_transformations = series.transformations[:50]

# Process compound series in parallel
logging.info(f'Processing {len(available_transformations)} / {len(series.transformations)} available transformations in parallel...')
with multiprocessing.Pool(num_procs) as pool:
results_iter = pool.imap_unordered(
partial(
analyze_transformation_or_warn,
projects=series.metadata.fah_projects,
server=server,
config=config,
compounds=series.compounds,
),
series.transformations,
available_transformations,
)
transformations = [
result
for result in track(
results_iter,
total=len(series.transformations),
total=len(available_transformations),
description="Computing transformation free energies",
)
if result is not None
]

# Reprocess transformation experimental errors to only include most favorable transformation
# NOTE: This is a hack, and should be replaced by a more robust method for accounting for racemic mixtures
dotsdl marked this conversation as resolved.
Show resolved Hide resolved
# Compile list of all microstate transformations for each compound
compound_ddgs = dict()
for transformation in transformations:
compound_id = transformation.transformation.final_microstate.compound_id
if compound_id in compound_ddgs:
compound_ddgs[compound_id].append(transformation.binding_free_energy.point)
else:
compound_ddgs[compound_id] = [transformation.binding_free_energy.point]
# Collapse to a single estimate
from scipy.special import logsumexp
for compound_id, ddgs in compound_ddgs.items():
compound_ddgs[compound_id] = -logsumexp(-np.array(ddgs)) + np.log(len(ddgs))
# Regenerate list of transformations
for index, t in enumerate(transformations):
if (t.exp_ddg is None) or (t.exp_ddg.point is None):
continue
compound_id = t.transformation.final_microstate.compound_id
absolute_error_point = abs(t.exp_ddg.point - compound_ddgs[compound_id])
transformations[index] = TransformationAnalysis(
transformation=t.transformation,
reliable_transformation=t.reliable_transformation,
binding_free_energy=t.binding_free_energy,
complex_phase=t.complex_phase,
solvent_phase=t.solvent_phase,
exp_ddg=t.exp_ddg,
absolute_error=PointEstimate(point=absolute_error_point, stderr=t.absolute_error.stderr),
)

# Sort transformations by RUN
# transformations.sort(key=lambda transformation_analysis : transformation_analysis.transformation.run_id)
# Sort transformations by free energy difference
Expand Down Expand Up @@ -189,10 +398,17 @@ def generate_artifacts(
data_dir, f"PROJ{series.metadata.fah_projects.complex_phase}"
)

# Pre-filter based on which transformations have any data
available_transformations = [
transformation for transformation in series.transformations
if transformation.binding_free_energy is not None
and transformation.binding_free_energy.point is not None
]

if snapshots:
logging.info("Generating representative snapshots")
generate_representative_snapshots(
transformations=series.transformations,
transformations=available_transformations,
project_dir=complex_project_dir,
project_data_dir=complex_data_dir,
output_dir=os.path.join(output_dir, "transformations"),
Expand Down
2 changes: 1 addition & 1 deletion fah_xchem/analysis/free_energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def compute_relative_free_energy(

# TODO: Flag problematic RUN/CLONE/GEN trajectories for further analysis and debugging
works = _filter_work_values(all_works)

if len(works) < (min_num_work_values or 1):
raise InsufficientDataError(
f"Need at least {min_num_work_values} good work values for analysis, "
Expand Down
Loading