diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1282560b2..ced51bb43 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,8 +1,12 @@ repos: - - repo: https://github.com/timothycrosley/isort - rev: 5.0.7 + - repo: https://github.com/pycqa/isort + rev: 5.6.4 hooks: - id: isort + args: ["--settings-path=setup.cfg"] + exclude: __init__.py$ + types: [text] + types_or: [python, cython, pyi] - repo: https://github.com/ambv/black rev: 19.10b0 hooks: diff --git a/benchmarks/send-recv-core.py b/benchmarks/send-recv-core.py index 6af9f938c..6ab5c1125 100644 --- a/benchmarks/send-recv-core.py +++ b/benchmarks/send-recv-core.py @@ -44,7 +44,7 @@ from threading import Lock from time import perf_counter as clock -from distributed.utils import format_bytes, parse_bytes +from dask.utils import format_bytes, parse_bytes import ucp from ucp._libs import ucx_api diff --git a/benchmarks/send-recv.py b/benchmarks/send-recv.py index 9cd176021..2131e5bc9 100644 --- a/benchmarks/send-recv.py +++ b/benchmarks/send-recv.py @@ -44,7 +44,7 @@ import os from time import perf_counter as clock -from distributed.utils import format_bytes, parse_bytes +from dask.utils import format_bytes, parse_bytes import ucp diff --git a/ci/checks/style.sh b/ci/checks/style.sh index fde017651..81a18b226 100644 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -15,7 +15,7 @@ LANG=C.UTF-8 conda activate rapids # Run isort and get results/return code -ISORT=`isort --recursive --check-only .` +ISORT=`isort --check-only . --settings-path=setup.cfg` ISORT_RETVAL=$? 
# Run black and get results/return code diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 0d720bd3e..ad297a802 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -26,7 +26,7 @@ export HOME=$WORKSPACE cd $WORKSPACE export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` -export RAPIDS_VERSION="21.06" +export RAPIDS_VERSION="21.08" export UCX_PATH=$CONDA_PREFIX ################################################################################ @@ -60,22 +60,12 @@ conda info conda config --show-sources conda list --show-channel-urls -################################################################################ -# BUILD - Build ucx-py -################################################################################ - -gpuci_logger "Build ucx-py" -cd $WORKSPACE -python setup.py build_ext --inplace -python -m pip install -e . - ################################################################################ # TEST - Run py.tests for ucx-py ################################################################################ +function run_tests() { + UCX111=$1 -if hasArg --skip-tests; then - gpuci_logger "Skipping Tests" -else gpuci_logger "Check GPU usage" nvidia-smi @@ -83,6 +73,9 @@ else awk 'END{print $1}' /etc/hosts cat /etc/hosts + gpuci_logger "UCX Version and Build Configuration" + ucx_info -v + gpuci_logger "Python py.test for ucx-py" cd $WORKSPACE @@ -90,30 +83,84 @@ else ls tests/ # Setting UCX options - export UCXPY_IFNAME=eth0 - export UCX_MEMTYPE_CACHE=n - export UCX_TLS=tcp,cuda_copy,sockcm - export UCX_SOCKADDR_TLS_PRIORITY=sockcm + if [ "$UCX111" == "1" ]; then + export UCX_TLS=tcp,cuda_copy + else + export UCX_TLS=tcp,cuda_copy,sockcm + fi # Test with TCP/Sockets gpuci_logger "TEST WITH TCP ONLY" - py.test --cache-clear -vs --ignore-glob tests/test_send_recv_two_workers.py tests/ + py.test --cache-clear -vs tests/ py.test --cache-clear -vs ucp/_libs/tests # Test downstream packages, 
which requires Python v3.7 if [ $(python -c "import sys; print(sys.version_info[1])") -ge "7" ]; then - gpuci_logger "TEST OF DASK/UCX" - py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_cupy as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_numba as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_rmm as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_collection_cuda as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.tests.test_nanny as m;print(m.__file__)"` - py.test --cache-clear -m "slow" -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"` + # Clone Distributed to avoid pytest cleanup fixture errors + # See https://github.com/dask/distributed/issues/4902 + gpuci_logger "Clone Distributed" + git clone https://github.com/dask/distributed + + gpuci_logger "Run Distributed Tests" + py.test --cache-clear -vs distributed/distributed/protocol/tests/test_cupy.py + py.test --cache-clear -vs distributed/distributed/protocol/tests/test_numba.py + py.test --cache-clear -vs distributed/distributed/protocol/tests/test_rmm.py + py.test --cache-clear -vs distributed/distributed/protocol/tests/test_collection_cuda.py + py.test --cache-clear -vs distributed/distributed/tests/test_nanny.py + py.test --cache-clear -vs --runslow distributed/distributed/comm/tests/test_ucx.py fi gpuci_logger "Run local benchmark" python benchmarks/send-recv.py -o cupy --server-dev 0 --client-dev 0 --reuse-alloc python benchmarks/send-recv-core.py -o cupy --server-dev 0 --client-dev 0 --reuse-alloc python benchmarks/cudf-merge.py --chunks-per-dev 4 --chunk-size 10000 --rmm-init-pool-size 2097152 +} + 
+################################################################################ +# BUILD - Build UCX-Py and run tests +################################################################################ + +gpuci_logger "UCX Version and Build Information" +ucx_info -v + +gpuci_logger "Build UCX-Py" +cd $WORKSPACE +python setup.py build_ext --inplace +python -m pip install -e . + +if hasArg --skip-tests; then + gpuci_logger "Skipping Tests" +else + run_tests 0 +fi + + +################################################################################ +# BUILD - Build UCX master, UCX-Py and run tests +################################################################################ + +gpuci_logger "Build UCX master" +cd $WORKSPACE +git clone https://github.com/openucx/ucx +cd ucx +git checkout v1.11.x +./autogen.sh +mkdir build +cd build +../contrib/configure-release --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt +make -j install + +gpuci_logger "UCX Version and Build Information" +ucx_info -v + +gpuci_logger "Build UCX-Py" +cd $WORKSPACE +git clean -ffdx +python setup.py build_ext --inplace +python -m pip install -e . 
+ +if hasArg --skip-tests; then + gpuci_logger "Skipping Tests" +else + run_tests 1 fi diff --git a/debug-tests/client.py b/debug-tests/client.py index d3eb16b28..1e09c8164 100644 --- a/debug-tests/client.py +++ b/debug-tests/client.py @@ -82,7 +82,7 @@ async def read(): # cuda_obj_generator = cloudpickle.loads(func) # pure_cuda_obj = cuda_obj_generator() - # from cudf.tests.utils import assert_eq + # from cudf.testing._utils import assert_eq # import cupy as cp # if isinstance(rx_cuda_obj, cp.ndarray): diff --git a/debug-tests/debug_utils.py b/debug-tests/debug_utils.py index c9dd49cc3..32bc6884b 100644 --- a/debug-tests/debug_utils.py +++ b/debug-tests/debug_utils.py @@ -5,7 +5,7 @@ import cupy from utils import get_num_gpus -from distributed.utils import parse_bytes +from dask.utils import parse_bytes import rmm diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 1ca673887..e8e40acfe 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -46,21 +46,35 @@ UCX_MEMTYPE_CACHE ````````````````` This is a UCX Memory optimization which toggles whether UCX library intercepts cu*alloc* calls. UCX-Py defaults this value to ``n``. There `known issues `_ when using this feature. -Values: n/y +Values: ``n``/``y`` UCX_CUDA_IPC_CACHE `````````````````` This is a UCX CUDA Memory optimization which enables/disables a remote endpoint IPC memhandle mapping cache. UCX/UCX-Py defaults this value to ``y`` -Values: n/y +Values: ``n``/``y`` + +UCX_MAX_RNDV_RAILS +`````````````````` + +Limiting the number of rails (network devices) to ``1`` allows UCX to use only the closest device according to NUMA locality and system topology. Particularly useful with InfiniBand and CUDA GPUs, ensuring all transfers from/to the GPU will use the closest InfiniBand device and thus implicitly enable GPUDirectRDMA.
+ +Values: Int (UCX default: ``2``) + +UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES +````````````````````````````````` + +By defining ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda``, UCX enables registration cache based on a buffer's base address, thus preventing multiple time-consuming registrations for the same buffer. This is particularly useful when using a CUDA memory pool, thus requiring a single registration between two ends for the entire pool, providing considerable performance gains, especially when using InfiniBand. + +Requires UCX 1.11 and above. UCX_RNDV_THRESH ``````````````` This is a configurable parameter used by UCX to help determine which transport method should be used. For example, on machines with multiple GPUs, and with NVLink enabled, UCX can deliver messages either through TCP or NVLink. Sending GPU buffers over TCP is costly as it triggers a device-to-host on the sender side, and then host-to-device transfer on the receiver side -- we want to avoid these kinds of transfers when NVLink is available. If a buffer is below the threshold, `Rendezvous-Protocol `_ is triggered and for UCX-Py users, this will typically mean messages will be delivered through TCP. 
Depending on the application, messages can be quite small, therefore, we recommend setting a small value if the application uses NVLink or InfiniBand: ``UCX_RNDV_THRESH=8192`` -Values: Int (UCX-Py default : 8192) +Values: Int (UCX-Py default: ``8192``) UCX_RNDV_SCHEME @@ -100,7 +114,7 @@ Transport Methods (Simplified): - ``rc`` -> InfiniBand (ibv_post_send, ibv_post_recv, ibv_poll_cq) uses rc_v and rc_x (preferably if available) - ``cuda_copy`` -> cuMemHostRegister, cuMemcpyAsync - ``cuda_ipc`` -> CUDA Interprocess Communication (cuIpcCloseMemHandle, cuIpcOpenMemHandle, cuMemcpyAsync) -- ``sockcm`` -> connection management over sockets +- ``sockcm`` -> connection management over sockets (Only applies to UCX 1.9 and older) - ``sm/shm`` -> all shared memory transports (mm, cma, knem) - ``mm`` -> shared memory transports - only memory mappers - ``ugni`` -> ugni_smsg and ugni_rdma (uses ugni_udt for bootstrap) @@ -145,25 +159,49 @@ InfiniBand -- No NVLink :: - UCX_RNDV_SCHEME=get_zcopy UCX_MEMTYPE_CACHE=n UCX_TLS=rc,tcp,sockcm,cuda_copy UCX_SOCKADDR_TLS_PRIORITY=sockcm