6907 TORCH_ALLOW_TF32_CUBLAS_OVERRIDE warning and update print (#6909)
Fixes #6907

### Description
- removes the default warning on `TORCH_ALLOW_TF32_CUBLAS_OVERRIDE`
- adds debug print info and documentation (see the usage sketch below)
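
For a quick look at the new output, the report can be printed via MONAI's config helpers (a minimal sketch; the exact fields depend on the environment):

```python
# Print MONAI's environment report; with this change it includes the values of
# NVIDIA_TF32_OVERRIDE and TORCH_ALLOW_TF32_CUBLAS_OVERRIDE in the GPU section.
import monai

monai.config.print_debug_info()
```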


### Types of changes
<!--- Put an `x` in all the boxes that apply, and remove the not
applicable items -->
- [x] Non-breaking change (fix or new feature that would not break
existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing
functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u
--net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick
--unittests --disttests`.
- [x] In-line docstrings updated.
- [x] Documentation updated, tested `make html` command in the `docs/`
folder.

---------

Signed-off-by: Wenqi Li <[email protected]>
wyli authored Aug 28, 2023
1 parent 2daabf9 commit 40048d7
Showing 3 changed files with 13 additions and 8 deletions.
3 changes: 3 additions & 0 deletions docs/source/precision_accelerating.md
@@ -33,6 +33,9 @@ Please note that there are environment variables that can override the flags above
 
 If you are using an [NGC PyTorch container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch), the container includes a layer `ENV TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1`.
 The default value `torch.backends.cuda.matmul.allow_tf32` will be overridden to `True`.
+To restore the upstream default value, please run `unset TORCH_ALLOW_TF32_CUBLAS_OVERRIDE` in the container,
+and use the PyTorch APIs `torch.set_float32_matmul_precision` and `torch.backends.cudnn.allow_tf32 = False` accordingly.
+
 
 We recommend that users print out these two flags for confirmation when unsure.
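
As a companion to the documentation above, a minimal sketch of printing the two flags and restoring full-precision behaviour (assumes a CUDA build of PyTorch; `torch.set_float32_matmul_precision` requires PyTorch 1.12+):

```python
import os

import torch

# The environment overrides, if present ("1" forces TF32 for cuBLAS matmuls).
print("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE =", os.environ.get("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"))
print("NVIDIA_TF32_OVERRIDE =", os.environ.get("NVIDIA_TF32_OVERRIDE"))

# The two flags the documentation recommends checking.
print("torch.backends.cuda.matmul.allow_tf32 =", torch.backends.cuda.matmul.allow_tf32)
print("torch.backends.cudnn.allow_tf32 =", torch.backends.cudnn.allow_tf32)

# Disable TF32 explicitly, as the documentation suggests.
torch.set_float32_matmul_precision("highest")  # cuBLAS matmuls use full float32
torch.backends.cudnn.allow_tf32 = False        # cuDNN convolutions use full float32
```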
14 changes: 8 additions & 6 deletions monai/config/deviceconfig.py
@@ -205,6 +205,8 @@ def get_gpu_info() -> OrderedDict:
     _dict_append(output, "CUDA version", lambda: torch.version.cuda)
     cudnn_ver = torch.backends.cudnn.version()
     _dict_append(output, "cuDNN enabled", lambda: bool(cudnn_ver))
+    _dict_append(output, "NVIDIA_TF32_OVERRIDE", os.environ.get("NVIDIA_TF32_OVERRIDE"))
+    _dict_append(output, "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE", os.environ.get("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"))
 
     if cudnn_ver:
         _dict_append(output, "cuDNN version", lambda: cudnn_ver)
@@ -215,12 +217,12 @@ def get_gpu_info() -> OrderedDict:
 
     for gpu in range(num_gpus):
         gpu_info = torch.cuda.get_device_properties(gpu)
-        _dict_append(output, f"GPU {gpu} Name", lambda: gpu_info.name)
-        _dict_append(output, f"GPU {gpu} Is integrated", lambda: bool(gpu_info.is_integrated))
-        _dict_append(output, f"GPU {gpu} Is multi GPU board", lambda: bool(gpu_info.is_multi_gpu_board))
-        _dict_append(output, f"GPU {gpu} Multi processor count", lambda: gpu_info.multi_processor_count)
-        _dict_append(output, f"GPU {gpu} Total memory (GB)", lambda: round(gpu_info.total_memory / 1024**3, 1))
-        _dict_append(output, f"GPU {gpu} CUDA capability (maj.min)", lambda: f"{gpu_info.major}.{gpu_info.minor}")
+        _dict_append(output, f"GPU {gpu} Name", gpu_info.name)
+        _dict_append(output, f"GPU {gpu} Is integrated", bool(gpu_info.is_integrated))
+        _dict_append(output, f"GPU {gpu} Is multi GPU board", bool(gpu_info.is_multi_gpu_board))
+        _dict_append(output, f"GPU {gpu} Multi processor count", gpu_info.multi_processor_count)
+        _dict_append(output, f"GPU {gpu} Total memory (GB)", round(gpu_info.total_memory / 1024**3, 1))
+        _dict_append(output, f"GPU {gpu} CUDA capability (maj.min)", f"{gpu_info.major}.{gpu_info.minor}")
 
     return output
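
The per-GPU entries above now pass plain values rather than `lambda` thunks. A minimal sketch of a helper with both behaviours — hypothetical, the actual MONAI `_dict_append` may differ — shows why both call styles work:

```python
from collections import OrderedDict
from typing import Any


def _dict_append(in_dict: OrderedDict, key: str, value: Any) -> None:
    # Evaluate callables lazily so a failing query (e.g. torch.version.cuda on a
    # CPU-only build) degrades to a placeholder instead of raising.
    try:
        in_dict[key] = value() if callable(value) else value
    except BaseException:
        in_dict[key] = "UNKNOWN for given environment"


output: OrderedDict = OrderedDict()
_dict_append(output, "plain", 42)            # stored directly
_dict_append(output, "lazy", lambda: 6 * 7)  # called, result stored
print(output)  # OrderedDict([('plain', 42), ('lazy', 42)])
```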
4 changes: 2 additions & 2 deletions monai/utils/tf32.py
@@ -52,7 +52,7 @@ def has_ampere_or_later() -> bool:
 @functools.lru_cache(None)
 def detect_default_tf32() -> bool:
     """
-    Dectect if there is anything that may enable TF32 mode by default.
+    Detect if there is anything that may enable TF32 mode by default.
     If any, show a warning message.
     """
     may_enable_tf32 = False
@@ -70,7 +70,7 @@ def detect_default_tf32() -> bool:
         )
         may_enable_tf32 = True
 
-    override_tf32_env_vars = {"NVIDIA_TF32_OVERRIDE": "1", "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE": "1"}
+    override_tf32_env_vars = {"NVIDIA_TF32_OVERRIDE": "1"}  # TORCH_ALLOW_TF32_CUBLAS_OVERRIDE not checked #6907
     for name, override_val in override_tf32_env_vars.items():
         if os.environ.get(name) == override_val:
             warnings.warn(
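
For context, after this change the loop reduces to a single environment check; a minimal standalone sketch of the pattern (the warning text here is illustrative, not MONAI's verbatim message):

```python
import os
import warnings

# Only NVIDIA_TF32_OVERRIDE is flagged now; TORCH_ALLOW_TF32_CUBLAS_OVERRIDE is
# reported via the debug print instead of a warning (see #6907).
override_tf32_env_vars = {"NVIDIA_TF32_OVERRIDE": "1"}
may_enable_tf32 = False
for name, override_val in override_tf32_env_vars.items():
    if os.environ.get(name) == override_val:
        warnings.warn(f"Env var `{name}={override_val}` is set: TF32 may be enabled implicitly.")
        may_enable_tf32 = True
print("may_enable_tf32 =", may_enable_tf32)
```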
