diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1282560b2..ced51bb43 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,8 +1,12 @@ repos: - - repo: https://github.com/timothycrosley/isort - rev: 5.0.7 + - repo: https://github.com/pycqa/isort + rev: 5.6.4 hooks: - id: isort + args: ["--settings-path=setup.cfg"] + exclude: __init__.py$ + types: [text] + types_or: [python, cython, pyi] - repo: https://github.com/ambv/black rev: 19.10b0 hooks: diff --git a/benchmarks/send-recv-core.py b/benchmarks/send-recv-core.py index 6af9f938c..6ab5c1125 100644 --- a/benchmarks/send-recv-core.py +++ b/benchmarks/send-recv-core.py @@ -44,7 +44,7 @@ from threading import Lock from time import perf_counter as clock -from distributed.utils import format_bytes, parse_bytes +from dask.utils import format_bytes, parse_bytes import ucp from ucp._libs import ucx_api diff --git a/benchmarks/send-recv.py b/benchmarks/send-recv.py index 9cd176021..2131e5bc9 100644 --- a/benchmarks/send-recv.py +++ b/benchmarks/send-recv.py @@ -44,7 +44,7 @@ import os from time import perf_counter as clock -from distributed.utils import format_bytes, parse_bytes +from dask.utils import format_bytes, parse_bytes import ucp diff --git a/ci/checks/style.sh b/ci/checks/style.sh index fde017651..81a18b226 100644 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -15,7 +15,7 @@ LANG=C.UTF-8 conda activate rapids # Run isort and get results/return code -ISORT=`isort --recursive --check-only .` +ISORT=`isort --check-only . --settings-path=setup.cfg` ISORT_RETVAL=$? 
# Run black and get results/return code diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 0d720bd3e..ad297a802 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -26,7 +26,7 @@ export HOME=$WORKSPACE cd $WORKSPACE export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` -export RAPIDS_VERSION="21.06" +export RAPIDS_VERSION="21.08" export UCX_PATH=$CONDA_PREFIX ################################################################################ @@ -60,22 +60,12 @@ conda info conda config --show-sources conda list --show-channel-urls -################################################################################ -# BUILD - Build ucx-py -################################################################################ - -gpuci_logger "Build ucx-py" -cd $WORKSPACE -python setup.py build_ext --inplace -python -m pip install -e . - ################################################################################ # TEST - Run py.tests for ucx-py ################################################################################ +function run_tests() { + UCX111=$1 -if hasArg --skip-tests; then - gpuci_logger "Skipping Tests" -else gpuci_logger "Check GPU usage" nvidia-smi @@ -83,6 +73,9 @@ else awk 'END{print $1}' /etc/hosts cat /etc/hosts + gpuci_logger "UCX Version and Build Configuration" + ucx_info -v + gpuci_logger "Python py.test for ucx-py" cd $WORKSPACE @@ -90,30 +83,84 @@ else ls tests/ # Setting UCX options - export UCXPY_IFNAME=eth0 - export UCX_MEMTYPE_CACHE=n - export UCX_TLS=tcp,cuda_copy,sockcm - export UCX_SOCKADDR_TLS_PRIORITY=sockcm + if [ "$UCX111" == "1" ]; then + export UCX_TLS=tcp,cuda_copy + else + export UCX_TLS=tcp,cuda_copy,sockcm + fi # Test with TCP/Sockets gpuci_logger "TEST WITH TCP ONLY" - py.test --cache-clear -vs --ignore-glob tests/test_send_recv_two_workers.py tests/ + py.test --cache-clear -vs tests/ py.test --cache-clear -vs ucp/_libs/tests # Test downstream packages, 
which requires Python v3.7 if [ $(python -c "import sys; print(sys.version_info[1])") -ge "7" ]; then - gpuci_logger "TEST OF DASK/UCX" - py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_cupy as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_numba as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_rmm as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_collection_cuda as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.tests.test_nanny as m;print(m.__file__)"` - py.test --cache-clear -m "slow" -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"` + # Clone Distributed to avoid pytest cleanup fixture errors + # See https://github.com/dask/distributed/issues/4902 + gpuci_logger "Clone Distributed" + git clone https://github.com/dask/distributed + + gpuci_logger "Run Distributed Tests" + py.test --cache-clear -vs distributed/distributed/protocol/tests/test_cupy.py + py.test --cache-clear -vs distributed/distributed/protocol/tests/test_numba.py + py.test --cache-clear -vs distributed/distributed/protocol/tests/test_rmm.py + py.test --cache-clear -vs distributed/distributed/protocol/tests/test_collection_cuda.py + py.test --cache-clear -vs distributed/distributed/tests/test_nanny.py + py.test --cache-clear -vs --runslow distributed/distributed/comm/tests/test_ucx.py fi gpuci_logger "Run local benchmark" python benchmarks/send-recv.py -o cupy --server-dev 0 --client-dev 0 --reuse-alloc python benchmarks/send-recv-core.py -o cupy --server-dev 0 --client-dev 0 --reuse-alloc python benchmarks/cudf-merge.py --chunks-per-dev 4 --chunk-size 10000 --rmm-init-pool-size 2097152 +} + 
+################################################################################ +# BUILD - Build UCX-Py and run tests +################################################################################ + +gpuci_logger "UCX Version and Build Information" +ucx_info -v + +gpuci_logger "Build UCX-Py" +cd $WORKSPACE +python setup.py build_ext --inplace +python -m pip install -e . + +if hasArg --skip-tests; then + gpuci_logger "Skipping Tests" +else + run_tests 0 +fi + + +################################################################################ +# BUILD - Build UCX master, UCX-Py and run tests +################################################################################ + +gpuci_logger "Build UCX master" +cd $WORKSPACE +git clone https://github.com/openucx/ucx +cd ucx +git checkout v1.11.x +./autogen.sh +mkdir build +cd build +../contrib/configure-release --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt +make -j install + +gpuci_logger "UCX Version and Build Information" +ucx_info -v + +gpuci_logger "Build UCX-Py" +cd $WORKSPACE +git clean -ffdx +python setup.py build_ext --inplace +python -m pip install -e . 
+ +if hasArg --skip-tests; then + gpuci_logger "Skipping Tests" +else + run_tests 1 fi diff --git a/debug-tests/client.py b/debug-tests/client.py index d3eb16b28..1e09c8164 100644 --- a/debug-tests/client.py +++ b/debug-tests/client.py @@ -82,7 +82,7 @@ async def read(): # cuda_obj_generator = cloudpickle.loads(func) # pure_cuda_obj = cuda_obj_generator() - # from cudf.tests.utils import assert_eq + # from cudf.testing._utils import assert_eq # import cupy as cp # if isinstance(rx_cuda_obj, cp.ndarray): diff --git a/debug-tests/debug_utils.py b/debug-tests/debug_utils.py index c9dd49cc3..32bc6884b 100644 --- a/debug-tests/debug_utils.py +++ b/debug-tests/debug_utils.py @@ -5,7 +5,7 @@ import cupy from utils import get_num_gpus -from distributed.utils import parse_bytes +from dask.utils import parse_bytes import rmm diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 1ca673887..e8e40acfe 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -46,21 +46,35 @@ UCX_MEMTYPE_CACHE ````````````````` This is a UCX Memory optimization which toggles whether UCX library intercepts cu*alloc* calls. UCX-Py defaults this value to ``n``. There `known issues `_ when using this feature. -Values: n/y +Values: ``n``/``y`` UCX_CUDA_IPC_CACHE `````````````````` This is a UCX CUDA Memory optimization which enables/disables a remote endpoint IPC memhandle mapping cache. UCX/UCX-Py defaults this value to ``y`` -Values: n/y +Values: ``n``/``y`` + +UCX_MAX_RNDV_RAILS +`````````````````` + +Limiting the number of rails (network devices) to ``1`` allows UCX to use only the closest device according to NUMA locality and system topology. Particularly useful with InfiniBand and CUDA GPUs, ensuring all transfers from/to the GPU will use the closest InfiniBand device and thus implicitly enable GPUDirectRDMA.
+ +Values: Int (UCX default: ``2``) + +UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES +````````````````````````````````` + +By defining ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda``, UCX enables registration cache based on a buffer's base address, thus preventing multiple time-consuming registrations for the same buffer. This is particularly useful when using a CUDA memory pool, thus requiring a single registration between two ends for the entire pool, providing considerable performance gains, especially when using InfiniBand. + +Requires UCX 1.11 and above. UCX_RNDV_THRESH ``````````````` This is a configurable parameter used by UCX to help determine which transport method should be used. For example, on machines with multiple GPUs, and with NVLink enabled, UCX can deliver messages either through TCP or NVLink. Sending GPU buffers over TCP is costly as it triggers a device-to-host on the sender side, and then host-to-device transfer on the receiver side -- we want to avoid these kinds of transfers when NVLink is available. If a buffer is below the threshold, `Rendezvous-Protocol `_ is triggered and for UCX-Py users, this will typically mean messages will be delivered through TCP. 
Depending on the application, messages can be quite small, therefore, we recommend setting a small value if the application uses NVLink or InfiniBand: ``UCX_RNDV_THRESH=8192`` -Values: Int (UCX-Py default : 8192) +Values: Int (UCX-Py default: ``8192``) UCX_RNDV_SCHEME @@ -100,7 +114,7 @@ Transport Methods (Simplified): - ``rc`` -> InfiniBand (ibv_post_send, ibv_post_recv, ibv_poll_cq) uses rc_v and rc_x (preferably if available) - ``cuda_copy`` -> cuMemHostRegister, cuMemcpyAsync - ``cuda_ipc`` -> CUDA Interprocess Communication (cuIpcCloseMemHandle, cuIpcOpenMemHandle, cuMemcpyAsync) -- ``sockcm`` -> connection management over sockets +- ``sockcm`` -> connection management over sockets (Only applies to UCX 1.9 and older) - ``sm/shm`` -> all shared memory transports (mm, cma, knem) - ``mm`` -> shared memory transports - only memory mappers - ``ugni`` -> ugni_smsg and ugni_rdma (uses ugni_udt for bootstrap) @@ -145,25 +159,49 @@ InfiniBand -- No NVLink :: - UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=rc,tcp,sockcm,cuda_copy UCX_SOCKADDR_TLS_PRIORITY=sockcm