From 40048d732f0d9bfb975edda0cb24e51464d18ae0 Mon Sep 17 00:00:00 2001
From: Wenqi Li <831580+wyli@users.noreply.github.com>
Date: Mon, 28 Aug 2023 12:29:56 +0100
Subject: [PATCH] 6907 TORCH_ALLOW_TF32_CUBLAS_OVERRIDE warning and update
 print (#6909)

Fixes #6907

### Description
- removes the default warning on `TORCH_ALLOW_TF32_CUBLAS_OVERRIDE`
- adds debug print info and documentation

### Types of changes
- [x] Non-breaking change (fix or new feature that would not break existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
- [x] In-line docstrings updated.
- [x] Documentation updated, tested `make html` command in the `docs/` folder.

---------

Signed-off-by: Wenqi Li
---
 docs/source/precision_accelerating.md |  3 +++
 monai/config/deviceconfig.py          | 14 ++++++++------
 monai/utils/tf32.py                   |  4 ++--
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/docs/source/precision_accelerating.md b/docs/source/precision_accelerating.md
index e7e0ddb2b4..897a6f1652 100644
--- a/docs/source/precision_accelerating.md
+++ b/docs/source/precision_accelerating.md
@@ -33,6 +33,9 @@ Please note that there are environment variables that can override the flags above
 If you are using an [NGC PyTorch container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch), the container includes a layer `ENV TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1`.
 The default value `torch.backends.cuda.matmul.allow_tf32` will be overridden to `True`.
 
+To restore the upstream default value, please run `unset TORCH_ALLOW_TF32_CUBLAS_OVERRIDE` in the container,
+and use the PyTorch APIs `torch.set_float32_matmul_precision` and `torch.backends.cudnn.allow_tf32 = False` accordingly.
+
 We recommend that users print out these two flags for confirmation when unsure.
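As a companion to the docs change above, here is a minimal sketch (not part of the patch; `torch.set_float32_matmul_precision` and the two `allow_tf32` flags are standard PyTorch APIs) of restoring and confirming the upstream TF32 defaults once the container override has been unset:

```python
# Minimal sketch (not part of this patch): restore upstream TF32 defaults
# after `unset TORCH_ALLOW_TF32_CUBLAS_OVERRIDE`, then print the two flags
# for confirmation as the docs recommend.
import torch

torch.set_float32_matmul_precision("highest")  # float32 matmuls: no TF32
torch.backends.cudnn.allow_tf32 = False        # cuDNN convolutions: no TF32

print(torch.backends.cuda.matmul.allow_tf32)   # expected: False
print(torch.backends.cudnn.allow_tf32)         # expected: False
```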
diff --git a/monai/config/deviceconfig.py b/monai/config/deviceconfig.py
index 5d7aee6c75..854a4274c4 100644
--- a/monai/config/deviceconfig.py
+++ b/monai/config/deviceconfig.py
@@ -205,6 +205,8 @@ def get_gpu_info() -> OrderedDict:
     _dict_append(output, "CUDA version", lambda: torch.version.cuda)
     cudnn_ver = torch.backends.cudnn.version()
     _dict_append(output, "cuDNN enabled", lambda: bool(cudnn_ver))
+    _dict_append(output, "NVIDIA_TF32_OVERRIDE", os.environ.get("NVIDIA_TF32_OVERRIDE"))
+    _dict_append(output, "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE", os.environ.get("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"))
 
     if cudnn_ver:
         _dict_append(output, "cuDNN version", lambda: cudnn_ver)
@@ -215,12 +217,12 @@ def get_gpu_info() -> OrderedDict:
 
     for gpu in range(num_gpus):
         gpu_info = torch.cuda.get_device_properties(gpu)
-        _dict_append(output, f"GPU {gpu} Name", lambda: gpu_info.name)
-        _dict_append(output, f"GPU {gpu} Is integrated", lambda: bool(gpu_info.is_integrated))
-        _dict_append(output, f"GPU {gpu} Is multi GPU board", lambda: bool(gpu_info.is_multi_gpu_board))
-        _dict_append(output, f"GPU {gpu} Multi processor count", lambda: gpu_info.multi_processor_count)
-        _dict_append(output, f"GPU {gpu} Total memory (GB)", lambda: round(gpu_info.total_memory / 1024**3, 1))
-        _dict_append(output, f"GPU {gpu} CUDA capability (maj.min)", lambda: f"{gpu_info.major}.{gpu_info.minor}")
+        _dict_append(output, f"GPU {gpu} Name", gpu_info.name)
+        _dict_append(output, f"GPU {gpu} Is integrated", bool(gpu_info.is_integrated))
+        _dict_append(output, f"GPU {gpu} Is multi GPU board", bool(gpu_info.is_multi_gpu_board))
+        _dict_append(output, f"GPU {gpu} Multi processor count", gpu_info.multi_processor_count)
+        _dict_append(output, f"GPU {gpu} Total memory (GB)", round(gpu_info.total_memory / 1024**3, 1))
+        _dict_append(output, f"GPU {gpu} CUDA capability (maj.min)", f"{gpu_info.major}.{gpu_info.minor}")
 
     return output
 
diff --git a/monai/utils/tf32.py b/monai/utils/tf32.py
index 9ef425ab8b..cfb023bdeb 100644
--- a/monai/utils/tf32.py
+++ b/monai/utils/tf32.py
@@ -52,7 +52,7 @@ def has_ampere_or_later() -> bool:
 @functools.lru_cache(None)
 def detect_default_tf32() -> bool:
     """
-    Dectect if there is anything that may enable TF32 mode by default.
+    Detect if there is anything that may enable TF32 mode by default.
     If any, show a warning message.
     """
     may_enable_tf32 = False
@@ -70,7 +70,7 @@ def detect_default_tf32() -> bool:
         )
         may_enable_tf32 = True
 
-    override_tf32_env_vars = {"NVIDIA_TF32_OVERRIDE": "1", "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE": "1"}
+    override_tf32_env_vars = {"NVIDIA_TF32_OVERRIDE": "1"}  # TORCH_ALLOW_TF32_CUBLAS_OVERRIDE not checked #6907
     for name, override_val in override_tf32_env_vars.items():
         if os.environ.get(name) == override_val:
             warnings.warn(
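A note on the `deviceconfig.py` hunk: besides being simpler, passing plain values instead of `lambda`s sidesteps Python's late-binding closure pitfall (an assumed motivation, not stated in the PR). The sketch below is illustrative, not MONAI code: if the deferred callables were only evaluated after the loop, every one would report the last GPU.

```python
# Illustrative sketch (not MONAI code): closures created in a loop capture the
# variable `info` itself, not its value, so deferred calls made after the loop
# all see the final iteration.
infos = ["GPU 0", "GPU 1", "GPU 2"]

deferred = []
for info in infos:
    deferred.append(lambda: info)   # late binding: `info` is resolved at call time
print([f() for f in deferred])      # ['GPU 2', 'GPU 2', 'GPU 2']

eager = []
for info in infos:
    eager.append(info)              # the value is pinned at append time, as in the hunk
print(eager)                        # ['GPU 0', 'GPU 1', 'GPU 2']
```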
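And a condensed sketch of the behavior after the `tf32.py` hunk (an approximation: the warning text is shortened and the real `detect_default_tf32` performs additional checks not shown here): only `NVIDIA_TF32_OVERRIDE=1` still triggers a warning, while `TORCH_ALLOW_TF32_CUBLAS_OVERRIDE` is now only surfaced by the debug print added in `deviceconfig.py`.

```python
# Condensed sketch of the post-patch check (approximation of
# detect_default_tf32(); warning text shortened, other checks omitted).
import os
import warnings

def warns_on_tf32_override() -> bool:
    may_enable_tf32 = False
    override_tf32_env_vars = {"NVIDIA_TF32_OVERRIDE": "1"}  # TORCH_ALLOW_TF32_CUBLAS_OVERRIDE not checked
    for name, override_val in override_tf32_env_vars.items():
        if os.environ.get(name) == override_val:
            warnings.warn(f"Environment variable `{name}={override_val}` is set.")
            may_enable_tf32 = True
    return may_enable_tf32

os.environ["TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"] = "1"
assert warns_on_tf32_override() is False  # this variable no longer warns after #6907
```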