From 85733e9623cec3526b8f4870aadefdbc696931a8 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Fri, 15 Nov 2024 09:48:07 -0800 Subject: [PATCH] Fix NCCL_ASYNC_ERROR_HANDLING deprecation warning The upstream change in https://github.com/pytorch/pytorch/pull/114077, which appears to have landed in torch 2.2.0, makes setting NCCL_ASYNC_ERROR_HANDLING emit a deprecation warning. Set TORCH_NCCL_ASYNC_ERROR_HANDLING on torch >= 2.2 and keep the old name on earlier versions. Fixes #568. --- modulus/distributed/manager.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modulus/distributed/manager.py b/modulus/distributed/manager.py index 61bc2687e9..a266fdd0c5 100644 --- a/modulus/distributed/manager.py +++ b/modulus/distributed/manager.py @@ -332,7 +332,11 @@ def initialize(): addr = os.getenv("MASTER_ADDR", "localhost") port = os.getenv("MASTER_PORT", "12355") # https://pytorch.org/docs/master/notes/cuda.html#id5 - os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + # was changed in version 2.2 + if torch.__version__ < (2, 2): + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + else: + os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0" initialization_method = os.getenv("MODULUS_DISTRIBUTED_INITIALIZATION_METHOD") if initialization_method is None: try: