Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve inconsistency between bar_with_bootstrapped_uncertainty and df_and_err_from_u_kln #1324

Merged
merged 18 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added tests/data/zero_overlap_ukln.npz
Binary file not shown.
21 changes: 20 additions & 1 deletion tests/test_bar.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from functools import partial
from pathlib import Path
from typing import Tuple

import numpy as np
Expand Down Expand Up @@ -83,9 +84,15 @@ def test_bootstrap_bar(sigma):

# estimate 3 times
df_ref, df_err_ref = df_and_err_from_u_kln(u_kln)
df_0, bootstrap_samples = bootstrap_bar(u_kln, n_bootstrap=n_bootstrap)
df_0, ddf_0, bootstrap_samples = bootstrap_bar(u_kln, n_bootstrap=n_bootstrap)
df_1, bootstrap_sigma = bar_with_bootstrapped_uncertainty(u_kln)

# Full errors should match exactly
assert df_err_ref == ddf_0

# The bootstrapped error should be as large or larger than the full error
assert bootstrap_sigma >= df_err_ref

# assert estimates identical, uncertainties comparable
print(f"bootstrap uncertainty = {bootstrap_sigma}, pymbar.MBAR uncertainty = {df_err_ref}")
assert df_0 == df_ref
Expand Down Expand Up @@ -216,3 +223,15 @@ def test_compute_fwd_and_reverse_df_over_time(frames_per_step):
# The values at the end should be nearly identical since they contain all the samples
assert np.allclose(fwd[-1], rev[-1])
assert np.allclose(fwd_err[-1], rev_err[-1])


def test_bootstrap_bar_and_regular_bar_match():
"""In cases where the u_kln has effectively no overlap, that bootstrapping returns 0.0 as the error
since the BAR estimate is always zero.
maxentile marked this conversation as resolved.
Show resolved Hide resolved
badisa marked this conversation as resolved.
Show resolved Hide resolved
"""
test_ukln = Path(__file__).parent / "data" / "zero_overlap_ukln.npz"
u_kln = np.load(open(test_ukln, "rb"))["u_kln"]
boot_df, boot_df_err = bar_with_bootstrapped_uncertainty(u_kln)
df, df_err = df_and_err_from_u_kln(u_kln)
assert boot_df == df
np.testing.assert_allclose(boot_df_err, df_err)
mcwitt marked this conversation as resolved.
Show resolved Hide resolved
22 changes: 14 additions & 8 deletions timemachine/fe/bar.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def df_from_u_kln(

def bootstrap_bar(
u_kln: NDArray, n_bootstrap: int = 100, maximum_iterations: int = DEFAULT_MAXIMUM_ITERATIONS
) -> Tuple[float, NDArray]:
) -> Tuple[float, float, NDArray]:
"""Given a 2-state u_kln matrix, subsample u_kln with replacement and re-run df_from_u_kln many times

Parameters
Expand All @@ -166,8 +166,12 @@ def bootstrap_bar(
Returns
-------
best_estimate : float
BAR(w_F, w_R, computeUncertainty=False)
bootstrap_samples: array
BAR(w_F, w_R)

best_estimate_err : float
Full BAR(w_F, w_R) error estimate
badisa marked this conversation as resolved.
Show resolved Hide resolved

bootstrap_samples : array
shape (n_bootstrap,)

Notes
Expand All @@ -178,7 +182,9 @@ def bootstrap_bar(
u_kn, N_k = ukln_to_ukn(u_kln)
mbar = pymbar.MBAR(u_kn, N_k, maximum_iterations=maximum_iterations)

full_bar_result = mbar.getFreeEnergyDifferences(compute_uncertainty=False)[0][0, 1]
df, ddf = mbar.getFreeEnergyDifferences()
full_bar_result = df[0, 1]
full_bar_err = ddf[0, 1]

_, _, n = u_kln.shape

Expand All @@ -196,24 +202,24 @@ def bootstrap_bar(
)
bootstrap_samples.append(bar_result)

return full_bar_result, np.array(bootstrap_samples)
return full_bar_result, full_bar_err, np.array(bootstrap_samples)


def bar_with_bootstrapped_uncertainty(
mcwitt marked this conversation as resolved.
Show resolved Hide resolved
u_kln: NDArray, n_bootstrap=100, maximum_iterations: int = DEFAULT_MAXIMUM_ITERATIONS
) -> Tuple[float, float]:
"""Given 2-state u_kln, returns free energy difference and uncertainty computed by bootstrapping."""

df, bootstrap_dfs = bootstrap_bar(u_kln, n_bootstrap=n_bootstrap, maximum_iterations=maximum_iterations)
df, ddf, bootstrap_dfs = bootstrap_bar(u_kln, n_bootstrap=n_bootstrap, maximum_iterations=maximum_iterations)

# warn if bootstrap distribution deviates significantly from normality
normaltest_result = normaltest(bootstrap_dfs)
pvalue_threshold = 1e-3 # arbitrary, small
if normaltest_result.pvalue < pvalue_threshold:
logger.warning(f"bootstrapped errors non-normal: {normaltest_result}")

# regardless, summarize as if normal
ddf = np.std(bootstrap_dfs)
# Take the max of the full error estimate and the bootstrapped error. Summarize as if normal regardless
badisa marked this conversation as resolved.
Show resolved Hide resolved
ddf = max(ddf, np.std(bootstrap_dfs))
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@maxentile suggested taking the max of the full bar error vs the bootstrapped error

Copy link
Collaborator

@mcwitt mcwitt Jun 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just double-checking, is the intent to return NaN here if the all-sample BAR calculation returns NaN for the uncertainty?

(Also might want to use np.maximum instead of max here since the latter has more consistent behavior with NaN inputs: max(1.0, nan) == 1.0 vs. np.maximum(1.0, nan) == nan, though both return NaN if the NaN argument is in the first position)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed in 5fecf57 to do np.maximum. Also per offline changed to call np.nan_to_num(ddf) so that values are set to zero beecb39.

return df, ddf


Expand Down