From 179d2e47d1ddb4e26789c80b4b3c7690e55f9a54 Mon Sep 17 00:00:00 2001
From: Markus Goetz
Date: Thu, 15 Nov 2018 16:51:46 +0100
Subject: [PATCH 1/2] Added CUDA-aware MPI check and implemented it for the
 communication stack

---
 heat/core/communication.py                          | 13 +++++++++++--
 .../{test_communicator.py => test_communication.py} |  3 +++
 2 files changed, 14 insertions(+), 2 deletions(-)
 rename heat/core/tests/{test_communicator.py => test_communication.py} (94%)

diff --git a/heat/core/communication.py b/heat/core/communication.py
index 44785b31d4..7d5c008382 100644
--- a/heat/core/communication.py
+++ b/heat/core/communication.py
@@ -1,9 +1,17 @@
 from mpi4py import MPI
 import abc
+import subprocess
 import torch
 
 from .stride_tricks import sanitize_axis
 
+# check whether OpenMPI supports CUDA-aware MPI
+try:
+    buffer = subprocess.check_output(['ompi_info', '--parsable', '--all'])
+    CUDA_AWARE_MPI = b'mpi_built_with_cuda_support:value:true' in buffer
+except FileNotFoundError:
+    CUDA_AWARE_MPI = False
+
 
 class Communication(metaclass=abc.ABCMeta):
     @staticmethod
@@ -93,11 +101,12 @@ def chunk(self, shape, split):
     def as_buffer(obj):
         if isinstance(obj, tensor.tensor):
             obj = obj._tensor__array
-
         if not isinstance(obj, torch.Tensor):
             return obj
 
-        return MPI.memory.fromaddress(obj.cpu().data_ptr(), obj.element_size() * torch.numel(obj))
+        pointer = obj.data_ptr() if CUDA_AWARE_MPI else obj.cpu().data_ptr()
+
+        return MPI.memory.fromaddress(pointer, obj.element_size() * torch.numel(obj))
 
     def convert_tensors(self, a_callable):
         def wrapped(*args, **kwargs):
diff --git a/heat/core/tests/test_communicator.py b/heat/core/tests/test_communication.py
similarity index 94%
rename from heat/core/tests/test_communicator.py
rename to heat/core/tests/test_communication.py
index e959b2e309..e26988a427 100644
--- a/heat/core/tests/test_communicator.py
+++ b/heat/core/tests/test_communication.py
@@ -54,3 +54,6 @@ def test_mpi_communicator(self):
 
         self.assertIsInstance(chunks, tuple)
         self.assertEqual(len(chunks), len(self.data.shape))
+
+    def test_cuda_aware_mpi(self):
+        self.assertTrue(hasattr(ht.communication, 'CUDA_AWARE_MPI'))

From 08feb59689728679e99f3b15514451adb0262ec5 Mon Sep 17 00:00:00 2001
From: Markus Goetz
Date: Sun, 18 Nov 2018 03:21:02 +0100
Subject: [PATCH 2/2] Added code that tries to determine the MPI installation
 in use

---
 heat/core/communication.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/heat/core/communication.py b/heat/core/communication.py
index 7d5c008382..4d3292f117 100644
--- a/heat/core/communication.py
+++ b/heat/core/communication.py
@@ -7,8 +7,14 @@
 
 # check whether OpenMPI supports CUDA-aware MPI
 try:
-    buffer = subprocess.check_output(['ompi_info', '--parsable', '--all'])
-    CUDA_AWARE_MPI = b'mpi_built_with_cuda_support:value:true' in buffer
+    buffer = subprocess.check_output(['mpirun', '--help'])
+
+    # OpenMPI
+    if buffer.startswith(b'mpirun (Open MPI)'):
+        buffer = subprocess.check_output(['ompi_info', '--parsable', '--all'])
+        CUDA_AWARE_MPI = b'mpi_built_with_cuda_support:value:true' in buffer
+    else:
+        CUDA_AWARE_MPI = False
 except FileNotFoundError:
     CUDA_AWARE_MPI = False
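Below is a minimal standalone sketch, assuming only mpi4py and PyTorch, that mirrors the detection logic from the second commit and shows how a flag like `CUDA_AWARE_MPI` selects between a device pointer and a host staging copy, much as `as_buffer` does in the first commit. The helper name `detect_cuda_aware_open_mpi`, the example tensor, and the closing broadcast are illustrative assumptions, not heat API.

```python
# Sketch only: mirrors the Open MPI / ompi_info probe from the patch above
# and shows how the resulting flag would choose the buffer handed to MPI.
import subprocess

import torch
from mpi4py import MPI


def detect_cuda_aware_open_mpi():
    """Return True only when the local MPI is Open MPI built with CUDA support."""
    try:
        help_text = subprocess.check_output(['mpirun', '--help'])
        if not help_text.startswith(b'mpirun (Open MPI)'):
            return False
        info = subprocess.check_output(['ompi_info', '--parsable', '--all'])
        return b'mpi_built_with_cuda_support:value:true' in info
    except (FileNotFoundError, subprocess.CalledProcessError):
        return False


if __name__ == '__main__':
    cuda_aware = detect_cuda_aware_open_mpi()

    data = torch.arange(4, dtype=torch.float32)
    if torch.cuda.is_available():
        data = data.cuda()

    # With CUDA-aware MPI the device buffer can be passed to MPI directly;
    # otherwise the tensor is staged through host memory first. Binding the
    # host copy to a name keeps the pointer valid for the duration of the call.
    send = data if cuda_aware else data.cpu()
    buffer = MPI.memory.fromaddress(send.data_ptr(), send.element_size() * torch.numel(send))

    MPI.COMM_WORLD.Bcast([buffer, MPI.FLOAT], root=0)
    print(MPI.COMM_WORLD.Get_rank(), send)
```

The host staging path matters because an MPI library built without CUDA support would treat a raw device pointer as host memory and crash; falling back to `cpu()` trades an extra copy for portability across MPI installations.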