Skip to content

Commit

Permalink
Use a separate `_is_saving_in_progress` bool to track whether a save is ongoing, as this prevents conflicts when concurrent `wait_until_finished` calls cause the finalize thread lock to block.
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 719000412
  • Loading branch information
cpgaffney1 authored and Orbax Authors committed Jan 23, 2025
1 parent d7c1cfd commit 359a0ed
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions checkpoint/orbax/checkpoint/checkpoint_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,10 @@ def __init__(
with self._finalize_thread_lock:
self._finalize_thread = None

self._is_saving_in_progress_lock = threading.Lock()
with self._is_saving_in_progress_lock:
self._is_saving_in_progress = False

self._checkpoint_deleter: deleter.CheckpointDeleter = (
deleter.create_checkpoint_deleter(
self._multiprocessing_options.primary_host,
Expand Down Expand Up @@ -1317,6 +1321,8 @@ def save(

assert self._finalize_thread is None
if is_async_checkpointer(self._checkpointer):
with self._is_saving_in_progress_lock:
self._is_saving_in_progress = True
with self._finalize_thread_lock:
finalize_thread_name = 'save_finalize'
logging.info(
Expand Down Expand Up @@ -1822,10 +1828,8 @@ def wait_until_finished(self):

def is_saving_in_progress(self) -> bool:
"""Returns whether a checkpoint save is in progress."""
with self._finalize_thread_lock:
return (
self._finalize_thread is not None and self._finalize_thread.is_alive()
)
with self._is_saving_in_progress_lock:
return self._is_saving_in_progress

def check_for_errors(self):
"""See superclass documentation."""
Expand Down Expand Up @@ -1905,6 +1909,8 @@ def _finalize(self, step: int, steps_to_remove: List[int]):
threading.current_thread().name,
step,
)
with self._is_saving_in_progress_lock:
self._is_saving_in_progress = False

def close(self):
"""See superclass documentation."""
Expand Down

0 comments on commit 359a0ed

Please sign in to comment.