Merge pull request #771 from rapidsai/branch-0.21

[RELEASE] ucx-py v0.21

raydouglass authored Aug 4, 2021
2 parents 19440ef + bfa0450 commit 1cf4619
Showing 27 changed files with 887 additions and 154 deletions.
8 changes: 6 additions & 2 deletions .pre-commit-config.yaml
@@ -1,8 +1,12 @@
 repos:
-  - repo: https://github.com/timothycrosley/isort
-    rev: 5.0.7
+  - repo: https://github.com/pycqa/isort
+    rev: 5.6.4
     hooks:
       - id: isort
+        args: ["--settings-path=setup.cfg"]
+        exclude: __init__.py$
+        types: [text]
+        types_or: [python, cython, pyi]
   - repo: https://github.com/ambv/black
     rev: 19.10b0
     hooks:
2 changes: 1 addition & 1 deletion benchmarks/send-recv-core.py
@@ -44,7 +44,7 @@
from threading import Lock
from time import perf_counter as clock

-from distributed.utils import format_bytes, parse_bytes
+from dask.utils import format_bytes, parse_bytes

import ucp
from ucp._libs import ucx_api
2 changes: 1 addition & 1 deletion benchmarks/send-recv.py
@@ -44,7 +44,7 @@
import os
from time import perf_counter as clock

-from distributed.utils import format_bytes, parse_bytes
+from dask.utils import format_bytes, parse_bytes

import ucp

2 changes: 1 addition & 1 deletion ci/checks/style.sh
@@ -15,7 +15,7 @@ LANG=C.UTF-8
conda activate rapids

# Run isort and get results/return code
-ISORT=`isort --recursive --check-only .`
+ISORT=`isort --check-only . --settings-path=setup.cfg`
ISORT_RETVAL=$?

# Run black and get results/return code
99 changes: 73 additions & 26 deletions ci/gpu/build.sh
@@ -26,7 +26,7 @@ export HOME=$WORKSPACE
cd $WORKSPACE
export GIT_DESCRIBE_TAG=`git describe --tags`
export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
-export RAPIDS_VERSION="21.06"
+export RAPIDS_VERSION="21.08"
export UCX_PATH=$CONDA_PREFIX

################################################################################
@@ -60,60 +60,107 @@ conda info
conda config --show-sources
conda list --show-channel-urls

################################################################################
# BUILD - Build ucx-py
################################################################################

gpuci_logger "Build ucx-py"
cd $WORKSPACE
python setup.py build_ext --inplace
python -m pip install -e .

################################################################################
# TEST - Run py.tests for ucx-py
################################################################################
function run_tests() {
UCX111=$1

if hasArg --skip-tests; then
gpuci_logger "Skipping Tests"
else
gpuci_logger "Check GPU usage"
nvidia-smi

gpuci_logger "Check NICs"
awk 'END{print $1}' /etc/hosts
cat /etc/hosts

gpuci_logger "UCX Version and Build Configuration"
ucx_info -v

gpuci_logger "Python py.test for ucx-py"
cd $WORKSPACE

# list test directory
ls tests/

# Setting UCX options
export UCXPY_IFNAME=eth0
export UCX_MEMTYPE_CACHE=n
export UCX_TLS=tcp,cuda_copy,sockcm
export UCX_SOCKADDR_TLS_PRIORITY=sockcm
if [ "$UCX111" == "1" ]; then
export UCX_TLS=tcp,cuda_copy
else
export UCX_TLS=tcp,cuda_copy,sockcm
fi

# Test with TCP/Sockets
gpuci_logger "TEST WITH TCP ONLY"
py.test --cache-clear -vs --ignore-glob tests/test_send_recv_two_workers.py tests/
py.test --cache-clear -vs tests/
py.test --cache-clear -vs ucp/_libs/tests

# Test downstream packages, which requires Python v3.7
if [ $(python -c "import sys; print(sys.version_info[1])") -ge "7" ]; then
gpuci_logger "TEST OF DASK/UCX"
py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_cupy as m;print(m.__file__)"`
py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_numba as m;print(m.__file__)"`
py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_rmm as m;print(m.__file__)"`
py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_collection_cuda as m;print(m.__file__)"`
py.test --cache-clear -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"`
py.test --cache-clear -vs `python -c "import distributed.tests.test_nanny as m;print(m.__file__)"`
py.test --cache-clear -m "slow" -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"`
# Clone Distributed to avoid pytest cleanup fixture errors
# See https://github.com/dask/distributed/issues/4902
gpuci_logger "Clone Distributed"
git clone https://github.com/dask/distributed

gpuci_logger "Run Distributed Tests"
py.test --cache-clear -vs distributed/distributed/protocol/tests/test_cupy.py
py.test --cache-clear -vs distributed/distributed/protocol/tests/test_numba.py
py.test --cache-clear -vs distributed/distributed/protocol/tests/test_rmm.py
py.test --cache-clear -vs distributed/distributed/protocol/tests/test_collection_cuda.py
py.test --cache-clear -vs distributed/distributed/tests/test_nanny.py
py.test --cache-clear -vs --runslow distributed/distributed/comm/tests/test_ucx.py
fi

gpuci_logger "Run local benchmark"
python benchmarks/send-recv.py -o cupy --server-dev 0 --client-dev 0 --reuse-alloc
python benchmarks/send-recv-core.py -o cupy --server-dev 0 --client-dev 0 --reuse-alloc
python benchmarks/cudf-merge.py --chunks-per-dev 4 --chunk-size 10000 --rmm-init-pool-size 2097152
}

################################################################################
# BUILD - Build UCX-Py and run tests
################################################################################

gpuci_logger "UCX Version and Build Information"
ucx_info -v

gpuci_logger "Build UCX-Py"
cd $WORKSPACE
python setup.py build_ext --inplace
python -m pip install -e .

if hasArg --skip-tests; then
gpuci_logger "Skipping Tests"
else
run_tests 0
fi


################################################################################
# BUILD - Build UCX master, UCX-Py and run tests
################################################################################

gpuci_logger "Build UCX master"
cd $WORKSPACE
git clone https://github.com/openucx/ucx
cd ucx
git checkout v1.11.x
./autogen.sh
mkdir build
cd build
../contrib/configure-release --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt
make -j install

gpuci_logger "UCX Version and Build Information"
ucx_info -v

gpuci_logger "Build UCX-Py"
cd $WORKSPACE
git clean -ffdx
python setup.py build_ext --inplace
python -m pip install -e .

if hasArg --skip-tests; then
gpuci_logger "Skipping Tests"
else
run_tests 1
fi
2 changes: 1 addition & 1 deletion debug-tests/client.py
@@ -82,7 +82,7 @@ async def read():
# cuda_obj_generator = cloudpickle.loads(func)
# pure_cuda_obj = cuda_obj_generator()

-# from cudf.tests.utils import assert_eq
+# from cudf.testing._utils import assert_eq
# import cupy as cp

# if isinstance(rx_cuda_obj, cp.ndarray):
2 changes: 1 addition & 1 deletion debug-tests/debug_utils.py
@@ -5,7 +5,7 @@
import cupy
from utils import get_num_gpus

-from distributed.utils import parse_bytes
+from dask.utils import parse_bytes

import rmm

54 changes: 46 additions & 8 deletions docs/source/configuration.rst
@@ -46,21 +46,35 @@ UCX_MEMTYPE_CACHE

This is a UCX memory optimization which toggles whether the UCX library intercepts cu*alloc* calls. UCX-Py defaults this value to ``n``. There are `known issues <https://github.com/openucx/ucx/wiki/NVIDIA-GPU-Support#known-issues>`_ when using this feature.

-Values: n/y
+Values: ``n``/``y``
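
As a quick editorial illustration (not part of this commit): UCX options such as this one are ordinary environment variables, so they can also be set from Python before UCX-Py initializes. The same pattern applies to the other ``UCX_*`` options on this page; ``ucp.init`` accepting an ``options`` dict whose keys drop the ``UCX_`` prefix is an assumption based on UCX-Py's public API.

.. code-block:: python

    import os

    # UCX reads its configuration from the environment when the context is
    # created, so UCX_* variables must be set before ucp.init() runs.
    os.environ["UCX_MEMTYPE_CACHE"] = "n"

    import ucp

    # Alternatively, pass options directly; keys omit the "UCX_" prefix.
    ucp.init(options={"MEMTYPE_CACHE": "n"})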

UCX_CUDA_IPC_CACHE
``````````````````

This is a UCX CUDA memory optimization which enables/disables a remote endpoint IPC memhandle mapping cache. UCX/UCX-Py defaults this value to ``y``.

-Values: n/y
+Values: ``n``/``y``

+UCX_MAX_RNDV_RAILS
+``````````````````
+
+Limiting the number of rails (network devices) to ``1`` allows UCX to use only the closest device according to NUMA locality and system topology. This is particularly useful with InfiniBand and CUDA GPUs, ensuring all transfers from/to the GPU use the closest InfiniBand device, thus implicitly enabling GPUDirectRDMA.
+
+Values: Int (UCX default: ``2``)
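
A small editorial sketch (not part of this commit) of pinning transfers to the closest device and checking that the setting took effect; ``ucp.get_config()`` returning the active configuration as a dict of strings is an assumption based on UCX-Py's public API.

.. code-block:: python

    import ucp

    # Restrict rendezvous transfers to the single closest network device.
    ucp.init(options={"MAX_RNDV_RAILS": "1"})

    # Inspect the configuration UCX actually picked up (keys drop the UCX_ prefix).
    assert ucp.get_config()["MAX_RNDV_RAILS"] == "1"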

+UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES
+`````````````````````````````````
+
+Defining ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` enables a registration cache keyed on a buffer's base address, preventing repeated, time-consuming registrations of the same buffer. This is particularly useful when using a CUDA memory pool: a single registration between the two ends covers the entire pool, providing considerable performance gains, especially over InfiniBand.
+
+Requires UCX 1.11 and above.
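
For instance (an editorial sketch assuming ``rmm`` and ``cupy`` are installed, not part of this commit), combining this option with an RMM memory pool means the pool is registered once and every buffer carved from it reuses that registration:

.. code-block:: python

    import os

    # Must be set before UCX initializes; requires UCX 1.11+.
    os.environ["UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES"] = "cuda"

    import cupy
    import rmm

    # Allocate all GPU buffers from a single RMM pool so UCX can register
    # the whole pool once instead of registering each buffer separately.
    rmm.reinitialize(pool_allocator=True, initial_pool_size=2**30)
    cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)

    buf = cupy.zeros(10_000_000, dtype="u1")  # carved from the registered pool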

UCX_RNDV_THRESH
```````````````

This is a configurable parameter used by UCX to help determine which transport method should be used. For example, on machines with multiple GPUs and NVLink enabled, UCX can deliver messages either through TCP or NVLink. Sending GPU buffers over TCP is costly, as it triggers a device-to-host transfer on the sender side and then a host-to-device transfer on the receiver side -- we want to avoid these transfers when NVLink is available. Buffers smaller than the threshold are sent with the eager protocol, which for UCX-Py users typically means delivery through TCP; buffers at or above the threshold use the `Rendezvous-Protocol <https://github.com/openucx/ucx/wiki/Rendezvous-Protocol>`_ and can take the accelerated path. Depending on the application, messages can be quite small; therefore, we recommend setting a small value if the application uses NVLink or InfiniBand: ``UCX_RNDV_THRESH=8192``

-Values: Int (UCX-Py default : 8192)
+Values: Int (UCX-Py default: ``8192``)
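
To make the threshold concrete (an editorial sketch, not part of this commit; it assumes the value reads back as a plain byte count once set explicitly):

.. code-block:: python

    import ucp

    ucp.init(options={"RNDV_THRESH": "8192"})
    threshold = int(ucp.get_config()["RNDV_THRESH"])

    for size in (1024, 8192, 2**20):
        # Messages at or above the threshold use rendezvous; smaller ones are eager.
        protocol = "rendezvous" if size >= threshold else "eager"
        print(f"{size:>8} bytes -> {protocol}")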


UCX_RNDV_SCHEME
@@ -100,7 +114,7 @@ Transport Methods (Simplified):
- ``rc`` -> InfiniBand (ibv_post_send, ibv_post_recv, ibv_poll_cq) uses rc_v and rc_x (preferably if available)
- ``cuda_copy`` -> cuMemHostRegister, cuMemcpyAsync
- ``cuda_ipc`` -> CUDA Interprocess Communication (cuIpcCloseMemHandle, cuIpcOpenMemHandle, cuMemcpyAsync)
-- ``sockcm`` -> connection management over sockets
+- ``sockcm`` -> connection management over sockets (Only applies to UCX 1.9 and older)
- ``sm/shm`` -> all shared memory transports (mm, cma, knem)
- ``mm`` -> shared memory transports - only memory mappers
- ``ugni`` -> ugni_smsg and ugni_rdma (uses ugni_udt for bootstrap)
@@ -145,25 +159,49 @@ InfiniBand -- No NVLink

::

-   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=rc,tcp,sockcm,cuda_copy UCX_SOCKADDR_TLS_PRIORITY=sockcm <SCRIPT>
+   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=rc,tcp,sockcm,cuda_copy <SCRIPT>

+Starting in UCX 1.10, ``sockcm`` has been removed and should no longer be added to ``UCX_TLS``. For UCX 1.10, the command above becomes:
+
+::
+
+   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=rc,tcp,cuda_copy <SCRIPT>

InfiniBand -- With NVLink
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

::

-   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=rc,tcp,sockcm,cuda_copy,cuda_ipc UCX_SOCKADDR_TLS_PRIORITY=sockcm <SCRIPT>
+   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=rc,tcp,sockcm,cuda_copy,cuda_ipc <SCRIPT>

+Starting in UCX 1.10, ``sockcm`` has been removed and should no longer be added to ``UCX_TLS``. For UCX 1.10, the command above becomes:
+
+::
+
+   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=rc,tcp,cuda_copy,cuda_ipc <SCRIPT>

TLS/Socket -- No NVLink
~~~~~~~~~~~~~~~~~~~~~~~

::

-   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy UCX_SOCKADDR_TLS_PRIORITY=sockcm <SCRIPT>
+   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy <SCRIPT>

+Starting in UCX 1.10, ``sockcm`` has been removed and should no longer be added to ``UCX_TLS``. For UCX 1.10, the command above becomes:
+
+::
+
+   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy <SCRIPT>

TLS/Socket -- With NVLink
~~~~~~~~~~~~~~~~~~~~~~~~~

::

-   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc UCX_SOCKADDR_TLS_PRIORITY=sockcm <SCRIPT>
+   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc <SCRIPT>

+Starting in UCX 1.10, ``sockcm`` has been removed and should no longer be added to ``UCX_TLS``. For UCX 1.10, the command above becomes:
+
+::
+
+   UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc <SCRIPT>
50 changes: 42 additions & 8 deletions docs/source/dask.rst
@@ -52,16 +52,34 @@ Dask-cuda can also be used when manually starting a cluster:
   # server
   # Note: --interface is an Ethernet interface
   UCX_CUDA_IPC_CACHE=n UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc \
-  UCX_SOCKADDR_TLS_PRIORITY=sockcm python -m distributed.cli.dask_scheduler --interface enp1s0f0 --protocol ucx
+  python -m distributed.cli.dask_scheduler --interface enp1s0f0 --protocol ucx

   # worker
   UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc \
-  UCX_SOCKADDR_TLS_PRIORITY=sockcm dask-cuda-worker ucx://{SCHEDULER_ADDR}:8786
+  dask-cuda-worker ucx://{SCHEDULER_ADDR}:8786

   # client
-  UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc,sockcm \
-  UCX_SOCKADDR_TLS_PRIORITY=sockcm python <python file>
+  UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc \
+  python <python file>

+Starting in UCX 1.10, ``sockcm`` has been removed and should no longer be added to ``UCX_TLS``. For UCX 1.10, the commands above become:
+
+.. code-block:: bash
+
+   # server
+   # Note: --interface is an Ethernet interface
+   UCX_CUDA_IPC_CACHE=n UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc \
+   python -m distributed.cli.dask_scheduler --interface enp1s0f0 --protocol ucx
+
+   # worker
+   UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc \
+   dask-cuda-worker ucx://{SCHEDULER_ADDR}:8786
+
+   # client
+   UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc \
+   python <python file>
The benefit of using ``dask-cuda-worker`` is that it launches one worker per GPU and automatically pairs each worker with its GPU.
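
Equivalently (an editorial sketch assuming ``dask-cuda`` is installed, not part of this commit), ``LocalCUDACluster`` stands up the same UCX-enabled, one-worker-per-GPU arrangement from Python:

.. code-block:: python

    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    if __name__ == "__main__":
        # One worker per visible GPU, communicating over UCX; enable_nvlink
        # has dask-cuda configure UCX_TLS with cuda_ipc for NVLink transfers.
        cluster = LocalCUDACluster(
            protocol="ucx", enable_tcp_over_ucx=True, enable_nvlink=True
        )
        client = Client(cluster)
        print(client)
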
@@ -76,15 +94,31 @@ Lastly, we can also manually start each worker individually (this is typically o
   # server
   UCX_CUDA_IPC_CACHE=n UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc \
-  UCX_SOCKADDR_TLS_PRIORITY=sockcm python -m distributed.cli.dask_scheduler --interface enp1s0f0 --protocol ucx
+  python -m distributed.cli.dask_scheduler --interface enp1s0f0 --protocol ucx

   # worker
   CUDA_VISIBLE_DEVICES=0 UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc \
-  UCX_SOCKADDR_TLS_PRIORITY=sockcm dask-worker ucx://{SCHEDULER_ADDR}:8786
+  dask-worker ucx://{SCHEDULER_ADDR}:8786

   # client
-  UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc,sockcm \
-  UCX_SOCKADDR_TLS_PRIORITY=sockcm python <python file>
+  UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc \
+  python <python file>

+Starting in UCX 1.10, ``sockcm`` has been removed and should no longer be added to ``UCX_TLS``. For UCX 1.10, the commands above become:
+
+.. code-block:: bash
+
+   # server
+   UCX_CUDA_IPC_CACHE=n UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc \
+   python -m distributed.cli.dask_scheduler --interface enp1s0f0 --protocol ucx
+
+   # worker
+   CUDA_VISIBLE_DEVICES=0 UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc \
+   dask-worker ucx://{SCHEDULER_ADDR}:8786
+
+   # client
+   UCX_CUDA_IPC_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc \
+   python <python file>

Note: ``CUDA_VISIBLE_DEVICES`` controls which GPU(s) the worker has access to, and ``--interface`` is an Ethernet interface.
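
The ``<python file>`` run by the client is an ordinary Dask script; a minimal hypothetical example (an editorial sketch, with a placeholder scheduler address) might look like:

.. code-block:: python

    import dask.array as da
    from dask.distributed import Client

    # Connect to the UCX-protocol scheduler started above (placeholder address).
    client = Client("ucx://10.0.0.1:8786")

    x = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))
    print(x.sum().compute())

    client.close()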

