diff --git a/CHANGELOG.md b/CHANGELOG.md
index 93824e625a..c25e069eaa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed NCCL_ASYNC_ERROR_HANDLING deprecation warning
+
 ### Security
 
 ### Dependencies
diff --git a/modulus/distributed/manager.py b/modulus/distributed/manager.py
index 61bc2687e9..a266fdd0c5 100644
--- a/modulus/distributed/manager.py
+++ b/modulus/distributed/manager.py
@@ -332,7 +332,11 @@ def initialize():
         addr = os.getenv("MASTER_ADDR", "localhost")
         port = os.getenv("MASTER_PORT", "12355")
         # https://pytorch.org/docs/master/notes/cuda.html#id5
-        os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
+        # PyTorch 2.2 renamed the variable to TORCH_NCCL_ASYNC_ERROR_HANDLING
+        if torch.__version__ < (2, 2):
+            os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
+        else:
+            os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"
         initialization_method = os.getenv("MODULUS_DISTRIBUTED_INITIALIZATION_METHOD")
         if initialization_method is None:
             try: