diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000000..9d35e3f97f
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,30 @@
+# syntax=docker/dockerfile:1.5
+
+ARG BASE
+ARG PYTHON_PACKAGE_MANAGER=conda
+
+FROM ${BASE} as pip-base
+
+ENV DEFAULT_VIRTUAL_ENV=rapids
+
+FROM ${BASE} as conda-base
+
+ENV DEFAULT_CONDA_ENV=rapids
+
+FROM ${PYTHON_PACKAGE_MANAGER}-base
+
+ARG CUDA
+ENV CUDAARCHS="RAPIDS"
+ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}"
+
+ARG PYTHON_PACKAGE_MANAGER
+ENV PYTHON_PACKAGE_MANAGER="${PYTHON_PACKAGE_MANAGER}"
+
+ENV PYTHONSAFEPATH="1"
+ENV PYTHONUNBUFFERED="1"
+ENV PYTHONDONTWRITEBYTECODE="1"
+
+ENV SCCACHE_REGION="us-east-2"
+ENV SCCACHE_BUCKET="rapids-sccache-devs"
+ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
+ENV HISTFILE="/home/coder/.cache/._bash_history"
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
new file mode 100644
index 0000000000..00c77a58fe
--- /dev/null
+++ b/.devcontainer/README.md
@@ -0,0 +1,35 @@
+# KvikIO Development Containers
+
+This directory contains [devcontainer configurations](https://containers.dev/implementors/json_reference/) for using VSCode to [develop in a container](https://code.visualstudio.com/docs/devcontainers/containers) via the `Remote Containers` [extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or [GitHub Codespaces](https://github.com/codespaces).
+
+This container is a turnkey development environment for building and testing the KvikIO C++ and Python libraries.
+
+## Table of Contents
+
+* [Prerequisites](#prerequisites)
+* [Host bind mounts](#host-bind-mounts)
+* [Launch a Dev Container](#launch-a-dev-container)
+
+## Prerequisites
+
+* [VSCode](https://code.visualstudio.com/download)
+* [VSCode Remote Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+
+## Host bind mounts
+
+By default, the following directories are bind-mounted into the devcontainer:
+
+* `${repo}:/home/coder/kvikio`
+* `${repo}/../.aws:/home/coder/.aws`
+* `${repo}/../.local:/home/coder/.local`
+* `${repo}/../.cache:/home/coder/.cache`
+* `${repo}/../.conda:/home/coder/.conda`
+* `${repo}/../.config:/home/coder/.config`
+
+This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs.
+
+## Launch a Dev Container
+
+To launch a devcontainer from VSCode, open the KvikIO repo and select the "Reopen in Container" button in the bottom right.
+
+Alternatively, open the VSCode command palette (typically `cmd/ctrl + shift + P`) and run the "Rebuild and Reopen in Container" command.
diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
new file mode 100644
index 0000000000..9d0eaa07ff
--- /dev/null
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -0,0 +1,37 @@
+{
+ "build": {
+ "context": "${localWorkspaceFolder}/.devcontainer",
+ "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+ "args": {
+ "CUDA": "11.8",
+ "PYTHON_PACKAGE_MANAGER": "conda",
+ "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
+ }
+ },
+ "hostRequirements": {"gpu": "optional"},
+ "features": {
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+ },
+ "overrideFeatureInstallOrder": [
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+ ],
+ "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"],
+ "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+ "workspaceFolder": "/home/coder",
+ "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent",
+ "mounts": [
+ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+ ],
+ "customizations": {
+ "vscode": {
+ "extensions": [
+ "ms-python.flake8",
+ "nvidia.nsight-vscode-edition"
+ ]
+ }
+ }
+}
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
new file mode 100644
index 0000000000..2f466c8a3f
--- /dev/null
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -0,0 +1,36 @@
+{
+ "build": {
+ "context": "${localWorkspaceFolder}/.devcontainer",
+ "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+ "args": {
+ "CUDA": "11.8",
+ "PYTHON_PACKAGE_MANAGER": "pip",
+ "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-ubuntu22.04"
+ }
+ },
+ "hostRequirements": {"gpu": "optional"},
+ "features": {
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+ },
+ "overrideFeatureInstallOrder": [
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+ ],
+ "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"],
+ "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+ "workspaceFolder": "/home/coder",
+ "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent",
+ "mounts": [
+ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+ ],
+ "customizations": {
+ "vscode": {
+ "extensions": [
+ "ms-python.flake8",
+ "nvidia.nsight-vscode-edition"
+ ]
+ }
+ }
+}
diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json
new file mode 100644
index 0000000000..0c9378cb38
--- /dev/null
+++ b/.devcontainer/cuda12.0-conda/devcontainer.json
@@ -0,0 +1,37 @@
+{
+ "build": {
+ "context": "${localWorkspaceFolder}/.devcontainer",
+ "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+ "args": {
+ "CUDA": "12.0",
+ "PYTHON_PACKAGE_MANAGER": "conda",
+ "BASE": "rapidsai/devcontainers:23.10-cpp-mambaforge-ubuntu22.04"
+ }
+ },
+ "hostRequirements": {"gpu": "optional"},
+ "features": {
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+ },
+ "overrideFeatureInstallOrder": [
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+ ],
+ "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"],
+ "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+ "workspaceFolder": "/home/coder",
+ "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent",
+ "mounts": [
+ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+ ],
+ "customizations": {
+ "vscode": {
+ "extensions": [
+ "ms-python.flake8",
+ "nvidia.nsight-vscode-edition"
+ ]
+ }
+ }
+}
diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json
new file mode 100644
index 0000000000..f6d6c8fb4b
--- /dev/null
+++ b/.devcontainer/cuda12.0-pip/devcontainer.json
@@ -0,0 +1,36 @@
+{
+ "build": {
+ "context": "${localWorkspaceFolder}/.devcontainer",
+ "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+ "args": {
+ "CUDA": "12.0",
+ "PYTHON_PACKAGE_MANAGER": "pip",
+ "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda12.0-ubuntu22.04"
+ }
+ },
+ "hostRequirements": {"gpu": "optional"},
+ "features": {
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+ },
+ "overrideFeatureInstallOrder": [
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+ ],
+ "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"],
+ "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+ "workspaceFolder": "/home/coder",
+ "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent",
+ "mounts": [
+ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+ "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+ ],
+ "customizations": {
+ "vscode": {
+ "extensions": [
+ "ms-python.flake8",
+ "nvidia.nsight-vscode-edition"
+ ]
+ }
+ }
+}
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
new file mode 100644
index 0000000000..895ba83ee5
--- /dev/null
+++ b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml
index 2d1444c595..9a0b415503 100644
--- a/.github/ops-bot.yaml
+++ b/.github/ops-bot.yaml
@@ -5,5 +5,4 @@ auto_merger: true
branch_checker: true
label_checker: true
release_drafter: true
-copy_prs: true
recently_updated: true
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 576759d7cb..bf9819f58c 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
jobs:
cpp-build:
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
python-build:
needs: [cpp-build]
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
upload-conda:
needs: [cpp-build, python-build]
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -56,12 +56,12 @@ jobs:
if: github.ref_type == 'branch'
needs: python-build
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
with:
arch: "amd64"
branch: ${{ inputs.branch }}
build_type: ${{ inputs.build_type || 'branch' }}
- container_image: "rapidsai/ci:latest"
+ container_image: "rapidsai/ci-conda:latest"
date: ${{ inputs.date }}
node_type: "gpu-v100-latest-1"
run_script: "ci/build_docs.sh"
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 7610b96449..472be6d62f 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,42 +18,51 @@ jobs:
- conda-python-build
- conda-python-tests
- docs-build
+ - devcontainer
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10
checks:
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10
conda-cpp-build:
needs: checks
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
with:
build_type: pull-request
conda-cpp-tests:
needs: conda-cpp-build
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10
with:
build_type: pull-request
conda-python-build:
needs: conda-cpp-build
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
with:
build_type: pull-request
conda-python-tests:
needs: conda-python-build
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
with:
build_type: pull-request
docs-build:
needs: conda-python-build
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
with:
build_type: pull-request
node_type: "gpu-v100-latest-1"
arch: "amd64"
- container_image: "rapidsai/ci:latest"
+ container_image: "rapidsai/ci-conda:latest"
run_script: "ci/build_docs.sh"
+ devcontainer:
+ secrets: inherit
+ uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.10
+ with:
+ build_command: |
+ sccache -z;
+ build-all;
+ sccache -s;
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 390e1ac263..505b30df6c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
jobs:
cpp-tests:
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10
with:
build_type: nightly
branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@ jobs:
sha: ${{ inputs.sha }}
python-tests:
secrets: inherit
- uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08
+ uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
with:
build_type: nightly
branch: ${{ inputs.branch }}
diff --git a/.gitignore b/.gitignore
index 5ebc04bb31..1b23a132dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,7 @@ docs/build/
cpp/doxygen/html/
.mypy_cache
.hypothesis
+.ipynb_checkpoints
+
+# clang tooling
+compile_commands.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0b7759fcf8..8707fa639c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -48,7 +48,7 @@ repos:
"python/benchmarks"]
pass_filenames: false
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v16.0.4
+ rev: v16.0.6
hooks:
- id: clang-format
types_or: [c, c++, cuda]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 299253ab4d..b06abe885d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,41 @@
+# kvikio 23.10.00 (11 Oct 2023)
+
+## π¨ Breaking Changes
+
+- Update to Cython 3.0.0 ([#258](https://github.com/rapidsai/kvikio/pull/258)) [@vyasr](https://github.com/vyasr)
+
+## π Bug Fixes
+
+- Add numcodecs pin ([#300](https://github.com/rapidsai/kvikio/pull/300)) [@vyasr](https://github.com/vyasr)
+- Add missed filename to sed_runner call ([#286](https://github.com/rapidsai/kvikio/pull/286)) [@raydouglass](https://github.com/raydouglass)
+- Use `conda mambabuild` not `mamba mambabuild` ([#278](https://github.com/rapidsai/kvikio/pull/278)) [@bdice](https://github.com/bdice)
+- fixes #254 ([#262](https://github.com/rapidsai/kvikio/pull/262)) [@madsbk](https://github.com/madsbk)
+
+## π Documentation
+
+- minor doc fixes ([#279](https://github.com/rapidsai/kvikio/pull/279)) [@madsbk](https://github.com/madsbk)
+- Docs ([#268](https://github.com/rapidsai/kvikio/pull/268)) [@madsbk](https://github.com/madsbk)
+- Zarr notebook ([#261](https://github.com/rapidsai/kvikio/pull/261)) [@madsbk](https://github.com/madsbk)
+
+## π οΈ Improvements
+
+- Use branch-23.10 for devcontainers workflow. ([#289](https://github.com/rapidsai/kvikio/pull/289)) [@bdice](https://github.com/bdice)
+- Update image names ([#284](https://github.com/rapidsai/kvikio/pull/284)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Update to clang 16.0.6. ([#280](https://github.com/rapidsai/kvikio/pull/280)) [@bdice](https://github.com/bdice)
+- Update doxygen to 1.9.1 ([#277](https://github.com/rapidsai/kvikio/pull/277)) [@vyasr](https://github.com/vyasr)
+- Async I/O using by-value arguments ([#275](https://github.com/rapidsai/kvikio/pull/275)) [@madsbk](https://github.com/madsbk)
+- Zarr-IO Benchmark ([#274](https://github.com/rapidsai/kvikio/pull/274)) [@madsbk](https://github.com/madsbk)
+- Add KvikIO devcontainers ([#273](https://github.com/rapidsai/kvikio/pull/273)) [@trxcllnt](https://github.com/trxcllnt)
+- async: fall back to blocking ([#272](https://github.com/rapidsai/kvikio/pull/272)) [@madsbk](https://github.com/madsbk)
+- Unify batch and stream API check ([#271](https://github.com/rapidsai/kvikio/pull/271)) [@madsbk](https://github.com/madsbk)
+- Use `copy-pr-bot` ([#269](https://github.com/rapidsai/kvikio/pull/269)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Zarr+CuPy+GDS+nvCOMP made easy ([#267](https://github.com/rapidsai/kvikio/pull/267)) [@madsbk](https://github.com/madsbk)
+- Remove sphinx pinning ([#260](https://github.com/rapidsai/kvikio/pull/260)) [@vyasr](https://github.com/vyasr)
+- Initial changes to support cufile stream I/O. ([#259](https://github.com/rapidsai/kvikio/pull/259)) [@tell-rebanta](https://github.com/tell-rebanta)
+- Update to Cython 3.0.0 ([#258](https://github.com/rapidsai/kvikio/pull/258)) [@vyasr](https://github.com/vyasr)
+- Modernize Python build ([#257](https://github.com/rapidsai/kvikio/pull/257)) [@vyasr](https://github.com/vyasr)
+- Enable roundtrip for nvCOMP batch codecs. ([#253](https://github.com/rapidsai/kvikio/pull/253)) [@Alexey-Kamenev](https://github.com/Alexey-Kamenev)
+
# kvikio 23.08.00 (9 Aug 2023)
## π Bug Fixes
diff --git a/README.md b/README.md
index 84d94f3c4a..4df538aa2d 100644
--- a/README.md
+++ b/README.md
@@ -1,176 +1,23 @@
-# KvikIO: C++ and Python bindings to cuFile
+# KvikIO: High Performance File IO
## Summary
-This provides C++ and Python bindings to cuFile, which enables GPUDirect Storage (GDS).
-KvikIO also works efficiently when GDS isn't available and can read/write both host and
-device data seamlessly.
+KvikIO is a Python and C++ library for high performance file IO. It provides C++ and Python
+bindings to [cuFile](https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html),
+which enables [GPUDirect Storage (GDS)](https://developer.nvidia.com/blog/gpudirect-storage/).
+KvikIO also works efficiently when GDS isn't available and can read/write both host and device data seamlessly.
+The C++ library is header-only, making it easy to include in [existing projects](https://github.com/rapidsai/kvikio/blob/HEAD/cpp/examples/downstream/).
+
### Features
-* Object Oriented API.
-* Exception handling.
+* Object-oriented API of [cuFile](https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html) with C++/Python exception handling.
+* A Python [Zarr](https://zarr.readthedocs.io/en/stable/) backend for reading and writing GPU data to file seamlessly.
* Concurrent reads and writes using an internal thread pool.
* Non-blocking API.
-* Python Zarr reader.
* Handle both host and device IO seamlessly.
* Provides Python bindings to [nvCOMP](https://github.com/NVIDIA/nvcomp).
-## Requirements
-
-To install users should have a working Linux machine with CUDA Toolkit
-installed (v11.4+) and a working compiler toolchain (C++17 and cmake).
-
-### C++
-
-The C++ bindings are header-only and depends on the CUDA Driver API.
-In order to build and run the example code, CMake and the CUDA Runtime
-API is required.
-
-### Python
-
-The Python package depends on the following packages:
-
-* cython
-* pip
-* setuptools
-* scikit-build
-
-For nvCOMP, benchmarks, examples, and tests:
-
-* pytest
-* numpy
-* cupy
-
-## Install
-
-### Conda
-
-Install the stable release from the `rapidsai` channel like:
-
-```
-conda create -n kvikio_env -c rapidsai -c conda-forge kvikio
-```
-
-Install the `kvikio` conda package from the `rapidsai-nightly` channel like:
-
-```
-conda create -n kvikio_env -c rapidsai-nightly -c conda-forge python=3.10 cuda-version=11.8 kvikio
-```
-
-If the nightly install doesn't work, set `channel_priority: flexible` in your `.condarc`.
-
-In order to setup a development environment run:
-```
-conda env create --name kvikio-dev --file conda/environments/all_cuda-118_arch-x86_64.yaml
-```
-
-### C++ (build from source)
-
-To build the C++ example, go to the `cpp` subdiretory and run:
-
-```
-./build.sh libkvikio
-```
-
-Then run the example:
-
-```
-./examples/basic_io
-```
-
-### Python (build from source)
-
-To build and install the extension, go to the `python` subdirectory and run:
-
-```
-./build.sh kvikio
-```
-
-One might have to define `CUDA_HOME` to the path to the CUDA installation.
-
-In order to test the installation, run the following:
-
-```
-pytest tests/
-```
-
-And to test performance, run the following:
-
-```
-python benchmarks/single-node-io.py
-```
-
-## Examples
-
-### C++
-
-```c++
-#include
-#include
-#include
-using namespace std;
-
-int main()
-{
- // Create two arrays `a` and `b`
- constexpr std::size_t size = 100;
- void *a = nullptr;
- void *b = nullptr;
- cudaMalloc(&a, size);
- cudaMalloc(&b, size);
-
- // Write `a` to file
- kvikio::FileHandle fw("test-file", "w");
- size_t written = fw.write(a, size);
- fw.close();
-
- // Read file into `b`
- kvikio::FileHandle fr("test-file", "r");
- size_t read = fr.read(b, size);
- fr.close();
-
- // Read file into `b` in parallel using 16 threads
- kvikio::default_thread_pool::reset(16);
- {
- kvikio::FileHandle f("test-file", "r");
- future future = f.pread(b_dev, sizeof(a), 0); // Non-blocking
- size_t read = future.get(); // Blocking
- // Notice, `f` closes automatically on destruction.
- }
-}
-```
-
-### Python
-
-```python
-import cupy
-import kvikio
-
-a = cupy.arange(100)
-f = kvikio.CuFile("test-file", "w")
-# Write whole array to file
-f.write(a)
-f.close()
-
-b = cupy.empty_like(a)
-f = kvikio.CuFile("test-file", "r")
-# Read whole array from file
-f.read(b)
-assert all(a == b)
-
-# Use contexmanager
-c = cupy.empty_like(a)
-with kvikio.CuFile(path, "r") as f:
- f.read(c)
-assert all(a == c)
-
-# Non-blocking read
-d = cupy.empty_like(a)
-with kvikio.CuFile(path, "r") as f:
- future1 = f.pread(d[:50])
- future2 = f.pread(d[50:], file_offset=d[:50].nbytes)
- future1.get() # Wait for first read
- future2.get() # Wait for second read
-assert all(a == d)
-```
+### Documentation
+ * Python: <https://docs.rapids.ai/api/kvikio/nightly/>
+ * C++: <https://docs.rapids.ai/api/libkvikio/nightly/>
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 0fde172bb5..c01f20f377 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -11,6 +11,6 @@ rapids-print-env
rapids-logger "Begin cpp build"
-rapids-mamba-retry mambabuild conda/recipes/libkvikio
+rapids-conda-retry mambabuild conda/recipes/libkvikio
rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 4f82c49925..ab2b6336c4 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -25,7 +25,7 @@ rapids-mamba-retry install \
--channel "${PYTHON_CHANNEL}" \
kvikio libkvikio
-export RAPIDS_VERSION_NUMBER="23.08"
+export RAPIDS_VERSION_NUMBER="23.10"
export RAPIDS_DOCS_DIR="$(mktemp -d)"
rapids-logger "Build CPP docs"
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 0ae600cbec..023c53a0b4 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -13,7 +13,7 @@ rapids-logger "Begin py build"
CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
-rapids-mamba-retry mambabuild \
+rapids-conda-retry mambabuild \
--channel "${CPP_CHANNEL}" \
conda/recipes/kvikio
diff --git a/ci/checks/style.sh b/ci/checks/style.sh
index 8d2a2c2e6b..8de0c72238 100755
--- a/ci/checks/style.sh
+++ b/ci/checks/style.sh
@@ -10,7 +10,7 @@ PATH=/conda/bin:$PATH
. /opt/conda/etc/profile.d/conda.sh
conda activate rapids
-FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/cmake-format-rapids-cmake.json
+FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/cmake-format-rapids-cmake.json
export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json
mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index c9845112bc..3edcd6939c 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -23,6 +23,9 @@ NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*"
+# Need to distutils-normalize the original version
+NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
+
echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
# Inplace sed replace; workaround for Linux and Mac
@@ -57,9 +60,6 @@ sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_
sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py
-# Need to distutils-normalize the original version
-NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
-
DEPENDENCIES=(
cudf
)
@@ -74,3 +74,9 @@ for FILE in .github/workflows/*.yaml; do
sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
done
sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh
+
+# .devcontainer files
+find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
+ sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_FULL_TAG}@g" "${filename}"
+ sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+done
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 88a96a13b7..c81dd4a6df 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -11,28 +11,31 @@ dependencies:
- cuda-python>=11.7.1,<12.0a0
- cuda-version=11.8
- cudatoolkit
-- cudf==23.8.*
+- cudf==23.10.*
- cupy>=12.0.0
- cxx-compiler
-- cython>=0.29,<0.30
+- cython>=3.0.0
- dask>=2022.05.2
- distributed>=2022.05.2
-- doxygen=1.8.20
+- doxygen=1.9.1
- gcc_linux-64=11.*
- libcufile-dev=1.4.0.31
- libcufile=1.4.0.31
- ninja
+- numcodecs <0.12.0
- numpy>=1.21
+- numpydoc
- nvcc_linux-64=11.8
- nvcomp==2.6.1
- packaging
- pre-commit
-- pydata-sphinx-theme
- pytest
- pytest-cov
- python>=3.9,<3.11
- scikit-build>=0.13.1
-- sphinx<6
+- sphinx
+- sphinx-click
+- sphinx_rtd_theme
- sysroot_linux-64=2.17
- zarr
name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 2a5ee53ec9..26f804356b 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -11,27 +11,30 @@ dependencies:
- cuda-nvcc
- cuda-python>=12.0,<13.0a0
- cuda-version=12.0
-- cudf==23.8.*
+- cudf==23.10.*
- cupy>=12.0.0
- cxx-compiler
-- cython>=0.29,<0.30
+- cython>=3.0.0
- dask>=2022.05.2
- distributed>=2022.05.2
-- doxygen=1.8.20
+- doxygen=1.9.1
- gcc_linux-64=11.*
- libcufile
- libcufile-dev
- ninja
+- numcodecs <0.12.0
- numpy>=1.21
+- numpydoc
- nvcomp==2.6.1
- packaging
- pre-commit
-- pydata-sphinx-theme
- pytest
- pytest-cov
- python>=3.9,<3.11
- scikit-build>=0.13.1
-- sphinx<6
+- sphinx
+- sphinx-click
+- sphinx_rtd_theme
- sysroot_linux-64=2.17
- zarr
name: all_cuda-120_arch-x86_64
diff --git a/conda/recipes/kvikio/meta.yaml b/conda/recipes/kvikio/meta.yaml
index 2f32ed7326..52d81936ba 100644
--- a/conda/recipes/kvikio/meta.yaml
+++ b/conda/recipes/kvikio/meta.yaml
@@ -57,7 +57,7 @@ requirements:
- python
- setuptools
- pip
- - cython >=0.29,<0.30
+ - cython >=3.0.0
{% if cuda_major == "11" %}
- cudatoolkit
{% endif %}
@@ -70,6 +70,8 @@ requirements:
- numpy >=1.20
- cupy >=12.0.0
- zarr
+ # See https://github.com/zarr-developers/numcodecs/pull/475
+ - numcodecs <0.12.0
- packaging
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
{% if cuda_major == "11" %}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9e288fc1a4..5b218d2717 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -22,7 +22,7 @@ include(rapids-find)
project(
KvikIO
- VERSION 23.08.00
+ VERSION 23.10.00
LANGUAGES CXX
)
@@ -71,6 +71,13 @@ else()
set(cuFile_BATCH_API_FOUND TRUE)
endif()
message(STATUS "Found cuFile's Batch API: ${cuFile_BATCH_API_FOUND}")
+ string(FIND "${CUFILE_H_STR}" "cuFileReadAsync" cuFileReadAsync_location)
+ if(cuFileReadAsync_location EQUAL "-1")
+ set(cuFile_STREAM_API_FOUND FALSE)
+ else()
+ set(cuFile_STREAM_API_FOUND TRUE)
+ endif()
+ message(STATUS "Found cuFile's Stream API: ${cuFile_STREAM_API_FOUND}")
endif()
# library targets
@@ -87,10 +94,12 @@ target_link_libraries(kvikio INTERFACE CUDA::toolkit)
if(cuFile_FOUND)
target_link_libraries(kvikio INTERFACE cufile::cuFile_interface)
target_compile_definitions(kvikio INTERFACE KVIKIO_CUFILE_FOUND)
-
if(cuFile_BATCH_API_FOUND)
target_compile_definitions(kvikio INTERFACE KVIKIO_CUFILE_BATCH_API_FOUND)
endif()
+ if(cuFile_STREAM_API_FOUND)
+ target_compile_definitions(kvikio INTERFACE KVIKIO_CUFILE_STREAM_API_FOUND)
+ endif()
endif()
target_link_libraries(kvikio INTERFACE ${CMAKE_DL_LIBS})
target_compile_features(kvikio INTERFACE cxx_std_17)
diff --git a/cpp/cmake/fetch_rapids.cmake b/cpp/cmake/fetch_rapids.cmake
index 56cc1c2f17..d19e5f8a60 100644
--- a/cpp/cmake/fetch_rapids.cmake
+++ b/cpp/cmake/fetch_rapids.cmake
@@ -12,7 +12,7 @@
# the License.
# =============================================================================
if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/KVIKIO_RAPIDS.cmake)
- file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/RAPIDS.cmake
+ file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake
${CMAKE_CURRENT_BINARY_DIR}/KVIKIO_RAPIDS.cmake
)
endif()
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index a19d5a2f24..3fc08c8645 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "libkvikio"
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 23.08.00
+PROJECT_NUMBER = 23.10.00
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md
index 5494d5c580..e03c87b5d9 100644
--- a/cpp/doxygen/main_page.md
+++ b/cpp/doxygen/main_page.md
@@ -1,4 +1,136 @@
-# libkvikio
+# Welcome to KvikIO's C++ documentation!
-libkvikio is a C++ header-only library providing bindings to
-cuFile, which enables GPUDirectStorage (GDS).
+KvikIO is a Python and C++ library for high performance file IO. It provides C++ and Python
+bindings to [cuFile](https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html),
+which enables [GPUDirect Storage (GDS)](https://developer.nvidia.com/blog/gpudirect-storage/).
+KvikIO also works efficiently when GDS isn't available and can read/write both host and device data seamlessly.
+
+KvikIO C++ is a header-only library that is part of the [RAPIDS](https://rapids.ai/) suite of open-source software libraries for GPU-accelerated data science.
+
+---
+**Notice** this is the documentation for the C++ library. For the Python documentation, see [kvikio](https://docs.rapids.ai/api/kvikio/nightly/).
+
+
+---
+
+## Features
+
+* Object Oriented API.
+* Exception handling.
+* Concurrent reads and writes using an internal thread pool.
+* Non-blocking API.
+* Handle both host and device IO seamlessly.
+
+## Installation
+
+KvikIO is a header-only library and as such doesn't need installation.
+However, for convenience we release Conda packages that make it easy
+to include KvikIO in your CMake projects.
+
+### Conda/Mamba
+
+We strongly recommend using [mamba](https://github.com/mamba-org/mamba) in place of conda, and we do so throughout this documentation.
+
+Install the **stable release** from the ``rapidsai`` channel with the following:
+```sh
+# Install in existing environment
+mamba install -c rapidsai -c conda-forge libkvikio
+# Create new environment (CUDA 11.8)
+mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=11.8 libkvikio
+# Create new environment (CUDA 12.0)
+mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=12.0 libkvikio
+```
+
+Install the **nightly release** from the ``rapidsai-nightly`` channel with the following:
+
+```sh
+# Install in existing environment
+mamba install -c rapidsai-nightly -c conda-forge libkvikio
+# Create new environment (CUDA 11.8)
+mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.10 cuda-version=11.8 libkvikio
+# Create new environment (CUDA 12.0)
+mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.10 cuda-version=12.0 libkvikio
+```
+
+---
+**Notice** if the nightly install doesn't work, set ``channel_priority: flexible`` in your ``.condarc``.
+
+---
+
+### Include KvikIO in a CMake project
+An example of how to include KvikIO in an existing CMake project can be found here: <https://github.com/rapidsai/kvikio/blob/HEAD/cpp/examples/downstream/>.
+
+
+### Build from source
+
+To build the C++ example run:
+
+```
+./build.sh libkvikio
+```
+
+Then run the example:
+
+```
+./examples/basic_io
+```
+
+## Runtime Settings
+
+#### Compatibility Mode (KVIKIO_COMPAT_MODE)
+When KvikIO is running in compatibility mode, it doesn't load `libcufile.so`. Instead, reads and writes are done using POSIX. Note that this is not the same as cuFile's own compatibility mode: cuFile can run in compatibility mode while KvikIO does not.
+
+Set the environment variable `KVIKIO_COMPAT_MODE` to enable/disable compatibility mode. By default, compatibility mode is enabled:
+ - when `libcufile.so` cannot be found.
+ - when running in Windows Subsystem for Linux (WSL).
+ - when `/run/udev` isn't readable, which typically happens when running inside a Docker container not launched with `--volume /run/udev:/run/udev:ro`.
+
+#### Thread Pool (KVIKIO_NTHREADS)
+KvikIO can use multiple threads for IO automatically. Set the environment variable `KVIKIO_NTHREADS` to the number of threads in the thread pool. If not set, the default value is 1.
+
+#### Task Size (KVIKIO_TASK_SIZE)
+KvikIO splits parallel IO operations into multiple tasks. Set the environment variable `KVIKIO_TASK_SIZE` to the maximum task size (in bytes). If not set, the default value is 4194304 (4 MiB).
+
+#### GDS Threshold (KVIKIO_GDS_THRESHOLD)
+In order to improve performance of small IO, `.pread()` and `.pwrite()` implement a shortcut that circumvents the thread pool and uses the POSIX backend directly. Set the environment variable `KVIKIO_GDS_THRESHOLD` to the minimum size (in bytes) to use GDS. If not set, the default value is 1048576 (1 MiB).
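+
+These settings can also be queried programmatically through `kvikio::defaults`. A minimal sketch (the accessor names for task size and GDS threshold are assumed here to mirror the reset functions in `<kvikio/defaults.hpp>`):
+
+```cpp
+#include <iostream>
+#include <kvikio/defaults.hpp>
+
+int main()
+{
+  // Each value reflects the corresponding KVIKIO_* environment variable
+  // (or the built-in default) read at process start.
+  std::cout << "compat mode:   " << kvikio::defaults::compat_mode() << std::endl;
+  std::cout << "nthreads:      " << kvikio::defaults::thread_pool_nthreads() << std::endl;
+  std::cout << "task size:     " << kvikio::defaults::task_size() << std::endl;        // assumed accessor name
+  std::cout << "gds threshold: " << kvikio::defaults::gds_threshold() << std::endl;    // assumed accessor name
+}
+```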
+
+
+## Example
+
+```cpp
+#include <cstddef>
+#include <cuda_runtime.h>
+#include <kvikio/file_handle.hpp>
+using namespace std;
+
+int main()
+{
+ // Create two arrays `a` and `b`
+ constexpr std::size_t size = 100;
+ void *a = nullptr;
+ void *b = nullptr;
+ cudaMalloc(&a, size);
+ cudaMalloc(&b, size);
+
+ // Write `a` to file
+ kvikio::FileHandle fw("test-file", "w");
+ size_t written = fw.write(a, size);
+ fw.close();
+
+ // Read file into `b`
+ kvikio::FileHandle fr("test-file", "r");
+ size_t read = fr.read(b, size);
+ fr.close();
+
+ // Read file into `b` in parallel using 16 threads
+ kvikio::default_thread_pool::reset(16);
+ {
+ kvikio::FileHandle f("test-file", "r");
+ future<size_t> future = f.pread(b, size, 0); // Non-blocking
+ size_t read = future.get(); // Blocking
+ // Notice, `f` closes automatically on destruction.
+ }
+}
+```
+
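+KvikIO also provides an asynchronous (stream) API via `FileHandle::read_async` and `FileHandle::write_async`. Below is a minimal sketch of the by-value overloads, mirroring `cpp/examples/basic_io.cpp`; they return a `kvikio::StreamFuture` and fall back to blocking IO when the cuFile stream API isn't available (error checking omitted for brevity):
+
+```cpp
+#include <cstddef>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <kvikio/file_handle.hpp>
+
+int main()
+{
+  constexpr std::size_t size = 1024;
+  void* a = nullptr;
+  cudaMalloc(&a, size);
+
+  CUstream stream{};
+  cudaStreamCreate(&stream);
+
+  kvikio::FileHandle f("test-file", "w+");
+
+  // Enqueue an async write; the returned StreamFuture must be kept alive until
+  // the operation completes (it synchronizes `stream` on destruction).
+  kvikio::StreamFuture fut = f.write_async(a, size, 0, 0, stream);
+  // `check_bytes_done()` synchronizes `stream` and returns the number of bytes written.
+  std::cout << "async write: " << fut.check_bytes_done() << std::endl;
+
+  // Read the data back into the same device buffer on the same stream.
+  fut = f.read_async(a, size, 0, 0, stream);
+  std::cout << "async read:  " << fut.check_bytes_done() << std::endl;
+}
+```
+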
+For a full runnable example, see `cpp/examples/basic_io.cpp`.
diff --git a/cpp/examples/basic_io.cpp b/cpp/examples/basic_io.cpp
index bbae854ddb..abc5fc5110 100644
--- a/cpp/examples/basic_io.cpp
+++ b/cpp/examples/basic_io.cpp
@@ -22,6 +22,7 @@
#include
#include
#include
+#include
#include
using namespace std;
@@ -34,11 +35,13 @@ void check(bool condition)
}
}
-constexpr int NELEM = 1000; // Number of elements used throughout the test
-constexpr int SIZE = NELEM * sizeof(int); // Size of the memory allocations (in bytes)
+constexpr int NELEM = 1024; // Number of elements used throughout the test
+constexpr int SIZE = NELEM * sizeof(int); // Size of the memory allocations (in bytes)
+constexpr int LARGE_SIZE = 8 * SIZE; // LARGE SIZE to test partial submit (in bytes)
int main()
{
+ std::size_t io_size = SIZE;
check(cudaSetDevice(0) == cudaSuccess);
cout << "KvikIO defaults: " << endl;
@@ -119,12 +122,12 @@ int main()
{
kvikio::FileHandle f("/tmp/test-file", "r+", kvikio::FileHandle::m644);
kvikio::buffer_register(c_dev, SIZE);
- size_t read = f.pread(b_dev, SIZE).get();
+ size_t read = f.pread(c_dev, SIZE).get();
check(read == SIZE);
check(read == f.nbytes());
kvikio::buffer_deregister(c_dev);
+ cout << "Read buffer registered data: " << read << endl;
}
-
{
kvikio::FileHandle f("/tmp/test-file", "w");
size_t written = f.pwrite(a, SIZE).get();
@@ -144,52 +147,108 @@ int main()
cout << "Parallel POSIX read (" << kvikio::defaults::thread_pool_nthreads()
<< " threads): " << read << endl;
}
-
- if (kvikio::is_batch_available() && !kvikio::defaults::compat_mode()) {
+ if (kvikio::is_batch_and_stream_available() && !kvikio::defaults::compat_mode()) {
// Here we use the batch API to read "/tmp/test-file" into `b_dev` by
// submitting 4 batch operations.
constexpr int num_ops_in_batch = 4;
constexpr int batchsize = SIZE / num_ops_in_batch;
kvikio::DriverProperties props;
check(num_ops_in_batch < props.get_max_batch_io_size());
-
- // We open the file as usual.
- kvikio::FileHandle f("/tmp/test-file", "r");
-
- // Then we create a batch
- auto batch = kvikio::BatchHandle(num_ops_in_batch);
-
- // And submit 4 operations each with its own offset
- std::vector ops;
- for (int i = 0; i < num_ops_in_batch; ++i) {
- ops.push_back(kvikio::BatchOp{.file_handle = f,
- .devPtr_base = b_dev,
- .file_offset = i * batchsize,
- .devPtr_offset = i * batchsize,
- .size = batchsize,
- .opcode = CUFILE_READ});
- }
- batch.submit(ops);
-
- // Finally, we wait on all 4 operations to be finished and check the result
- auto statuses = batch.status(num_ops_in_batch, num_ops_in_batch);
- check(statuses.size() == num_ops_in_batch);
- size_t total_read = 0;
- for (auto status : statuses) {
- check(status.status == CUFILE_COMPLETE);
- check(status.ret == batchsize);
- total_read += status.ret;
- }
- check(cudaMemcpy(b, b_dev, SIZE, cudaMemcpyDeviceToHost) == cudaSuccess);
- for (int i = 0; i < NELEM; ++i) {
- check(a[i] == b[i]);
+ {
+ // We open the file as usual.
+ kvikio::FileHandle f("/tmp/test-file", "r");
+
+ // Then we create a batch
+ auto batch = kvikio::BatchHandle(num_ops_in_batch);
+
+ // And submit 4 operations each with its own offset
+ std::vector ops;
+ for (int i = 0; i < num_ops_in_batch; ++i) {
+ ops.push_back(kvikio::BatchOp{.file_handle = f,
+ .devPtr_base = b_dev,
+ .file_offset = i * batchsize,
+ .devPtr_offset = i * batchsize,
+ .size = batchsize,
+ .opcode = CUFILE_READ});
+ }
+ batch.submit(ops);
+
+ // Finally, we wait on all 4 operations to be finished and check the result
+ auto statuses = batch.status(num_ops_in_batch, num_ops_in_batch);
+ check(statuses.size() == num_ops_in_batch);
+ size_t total_read = 0;
+ for (auto status : statuses) {
+ check(status.status == CUFILE_COMPLETE);
+ check(status.ret == batchsize);
+ total_read += status.ret;
+ }
+ check(cudaMemcpy(b, b_dev, SIZE, cudaMemcpyDeviceToHost) == cudaSuccess);
+ for (int i = 0; i < NELEM; ++i) {
+ check(a[i] == b[i]);
+ }
+ cout << "Batch read using 4 operations: " << total_read << endl;
+
+ batch.submit(ops);
+ batch.cancel();
+ statuses = batch.status(num_ops_in_batch, num_ops_in_batch);
+ check(statuses.empty());
+ cout << "Batch canceling of all 4 operations" << endl;
}
- cout << "Batch read using 4 operations: " << total_read << endl;
-
- batch.submit(ops);
- batch.cancel();
- statuses = batch.status(num_ops_in_batch, num_ops_in_batch);
- check(statuses.empty());
- cout << "Batch canceling of all 4 operations" << endl;
+ } else {
+ cout << "The batch API isn't available, requires CUDA 12.2+" << endl;
+ }
+ {
+ cout << "Performing async I/O using by-reference arguments" << endl;
+ off_t f_off{0};
+ off_t d_off{0};
+ // Notice, we have to allocate the `bytes_done_p` argument on the heap and set it to 0.
+ ssize_t* bytes_done_p{};
+ check(cudaHostAlloc((void**)&bytes_done_p, SIZE, cudaHostAllocDefault) == cudaSuccess);
+ *bytes_done_p = 0;
+
+ // Let's create a new stream and submit an async write
+ CUstream stream{};
+ check(cudaStreamCreate(&stream) == cudaSuccess);
+ kvikio::FileHandle f_handle("/tmp/test-file", "w+");
+ check(cudaMemcpyAsync(a_dev, a, SIZE, cudaMemcpyHostToDevice, stream) == cudaSuccess);
+ f_handle.write_async(a_dev, &io_size, &f_off, &d_off, bytes_done_p, stream);
+
+ // After synchronizing `stream`, we can read the number of bytes written
+ check(cudaStreamSynchronize(stream) == cudaSuccess);
+ // Note, `*bytes_done_p` might be negative, which indicates an IO error, thus we
+ // use `CUFILE_CHECK_STREAM_IO` to check for errors.
+ CUFILE_CHECK_STREAM_IO(bytes_done_p);
+ check(*bytes_done_p == SIZE);
+ cout << "File async write: " << *bytes_done_p << endl;
+
+ // Let's async read the data back into device memory
+ *bytes_done_p = 0;
+ f_handle.read_async(c_dev, &io_size, &f_off, &d_off, bytes_done_p, stream);
+ check(cudaStreamSynchronize(stream) == cudaSuccess);
+ CUFILE_CHECK_STREAM_IO(bytes_done_p);
+ check(*bytes_done_p == SIZE);
+ cout << "File async read: " << *bytes_done_p << endl;
+ check(cudaFreeHost((void*)bytes_done_p) == cudaSuccess);
+ }
+ {
+ cout << "Performing async I/O using by-value arguments" << endl;
+
+ // Let's create a new stream and submit an async write
+ CUstream stream{};
+ check(cudaStreamCreate(&stream) == cudaSuccess);
+ kvikio::FileHandle f_handle("/tmp/test-file", "w+");
+ check(cudaMemcpyAsync(a_dev, a, SIZE, cudaMemcpyHostToDevice, stream) == cudaSuccess);
+
+ // Notice, we get a handle `res`, which will synchronize the CUDA stream on destruction
+ kvikio::StreamFuture res = f_handle.write_async(a_dev, SIZE, 0, 0, stream);
+ // But we can also trigger the synchronization and get the bytes written by calling
+ // `check_bytes_done()`.
+ check(res.check_bytes_done() == SIZE);
+ cout << "File async write: " << res.check_bytes_done() << endl;
+
+ // Let's async read the data back into device memory
+ res = f_handle.read_async(c_dev, SIZE, 0, 0, stream);
+ check(res.check_bytes_done() == SIZE);
+ cout << "File async read: " << res.check_bytes_done() << endl;
}
}
diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp
index e515297408..d2ee6b8d91 100644
--- a/cpp/include/kvikio/defaults.hpp
+++ b/cpp/include/kvikio/defaults.hpp
@@ -218,8 +218,8 @@ class defaults {
* In order to improve performance of small IO, `.pread()` and `.pwrite()` implement a shortcut
* that circumvent the threadpool and use the POSIX backend directly.
*
- * Set the default value using `kvikio::default::task_size_reset()` or by setting the
- * `KVIKIO_TASK_SIZE` environment variable. If not set, the default value is 1 MiB.
+ * Set the default value using `kvikio::defaults::gds_threshold_reset()` or by setting the
+ * `KVIKIO_GDS_THRESHOLD` environment variable. If not set, the default value is 1 MiB.
*
* @return The default GDS threshold size in bytes.
*/
diff --git a/cpp/include/kvikio/error.hpp b/cpp/include/kvikio/error.hpp
index ca809c63b5..748e704f60 100644
--- a/cpp/include/kvikio/error.hpp
+++ b/cpp/include/kvikio/error.hpp
@@ -53,7 +53,7 @@ struct CUfileException : public std::runtime_error {
std::string(err_str) + ")"}; \
} \
} while (0)
-#define CUDA_DRIVER_TRY_1(_call) CUDA_DRIVER_TRY_2(_call, CUfileException)
+#define CUDA_DRIVER_TRY_1(_call) CUDA_DRIVER_TRY_2(_call, kvikio::CUfileException)
#endif
#ifdef KVIKIO_CUFILE_FOUND
@@ -75,8 +75,32 @@ struct CUfileException : public std::runtime_error {
cufileop_status_error(error.err)}; \
} \
} while (0)
-#define CUFILE_TRY_1(_call) CUFILE_TRY_2(_call, CUfileException)
+#define CUFILE_TRY_1(_call) CUFILE_TRY_2(_call, kvikio::CUfileException)
#endif
#endif
+#ifndef CUFILE_CHECK_STREAM_IO
+#define CUFILE_CHECK_STREAM_IO(...) \
+ GET_CUFILE_CHECK_STREAM_IO_MACRO( \
+ __VA_ARGS__, CUFILE_CHECK_STREAM_IO_2, CUFILE_CHECK_STREAM_IO_1) \
+ (__VA_ARGS__)
+#define GET_CUFILE_CHECK_STREAM_IO_MACRO(_1, _2, NAME, ...) NAME
+#ifdef KVIKIO_CUFILE_FOUND
+#define CUFILE_CHECK_STREAM_IO_2(_nbytes_done, _exception_type) \
+ do { \
+ auto const _nbytes = *(_nbytes_done); \
+ if (_nbytes < 0) { \
+ throw(_exception_type){std::string{"cuFile error at: "} + __FILE__ + ":" + \
+ KVIKIO_STRINGIFY(__LINE__) + ": " + std::to_string(_nbytes)}; \
+ } \
+ } while (0)
+#else
+// if cufile isn't available, we don't do anything in the body
+#define CUFILE_CHECK_STREAM_IO_2(_nbytes_done, _exception_type) \
+ do { \
+ } while (0)
+#endif
+#define CUFILE_CHECK_STREAM_IO_1(_call) CUFILE_CHECK_STREAM_IO_2(_call, kvikio::CUfileException)
+#endif
+
} // namespace kvikio
diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp
index b63d12c5e7..9cd74b2319 100644
--- a/cpp/include/kvikio/file_handle.hpp
+++ b/cpp/include/kvikio/file_handle.hpp
@@ -17,6 +17,7 @@
#include
#include
+#include
#include
#include
@@ -33,6 +34,7 @@
#include
#include
#include
+#include
#include
namespace kvikio {
@@ -344,7 +346,7 @@ class FileHandle {
* `devPtr_base` must remain set to the base address used in the `buffer_register` call.
* @param size Size in bytes to write.
* @param file_offset Offset in the file to write from.
- * @param devPtr_offset Offset relative to the `devPtr_base` pointer to write into.
+ * @param devPtr_offset Offset relative to the `devPtr_base` pointer to write from.
* This parameter should be used only with registered buffers.
* @return Size of bytes that were successfully written.
*/
@@ -496,6 +498,191 @@ class FileHandle {
return parallel_io(op, devPtr_base, size, file_offset, task_size, devPtr_offset);
}
+ /**
+ * @brief Reads specified bytes from the file into the device memory asynchronously.
+ *
+ * This is an asynchronous version of `.read()`, which will be executed in sequence
+ * for the specified stream.
+ *
+ * When running CUDA v12.1 or older, this function falls back to use `.read()` after
+ * `stream` has been synchronized.
+ *
+ * The arguments have the same meaning as in `.read()` but some of them are deferred.
+ * That is, the values pointed to by `size_p`, `file_offset_p` and `devPtr_offset_p`
+ * will not be evaluated until execution time. Notice, this behavior can be changed
+ * using cuFile's cuFileStreamRegister API.
+ *
+ * @param devPtr_base Base address of buffer in device memory. For registered buffers,
+ * `devPtr_base` must remain set to the base address used in the `buffer_register` call.
+ * @param size_p Pointer to size in bytes to read. If the exact size is not known at the time of
+ * I/O submission, then you must set it to the maximum possible I/O size for that stream I/O.
+ * Later the actual size can be set prior to the stream I/O execution.
+ * @param file_offset_p Pointer to offset in the file from which to read. Unless otherwise set
+ * using cuFileStreamRegister API, this value will not be evaluated until execution time.
+ * @param devPtr_offset_p Pointer to the offset relative to the bufPtr_base from which to write.
+ * Unless otherwise set using cuFileStreamRegister API, this value will not be evaluated until
+ * execution time.
+ * @param bytes_read_p Pointer to the bytes read from file. This pointer should be a non-NULL
+ * value and *bytes_read_p set to 0. The bytes_read_p memory should be allocated with
+ * cuMemHostAlloc/malloc/mmap or registered with cuMemHostRegister. After successful execution of
+ * the operation in the stream, the value *bytes_read_p will contain either:
+ * - The number of bytes successfully read.
+ * - -1 on IO errors.
+ * - All other errors return a negative integer value of the CUfileOpError enum value.
+ * @param stream CUDA stream in which to enqueue the operation. If NULL, make this operation
+ * synchronous.
+ */
+ void read_async(void* devPtr_base,
+ std::size_t* size_p,
+ off_t* file_offset_p,
+ off_t* devPtr_offset_p,
+ ssize_t* bytes_read_p,
+ CUstream stream)
+ {
+#ifdef KVIKIO_CUFILE_STREAM_API_FOUND
+ if (kvikio::is_batch_and_stream_available() && !_compat_mode) {
+ CUFILE_TRY(cuFileAPI::instance().ReadAsync(
+ _handle, devPtr_base, size_p, file_offset_p, devPtr_offset_p, bytes_read_p, stream));
+ return;
+ }
+#endif
+
+ CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream));
+ *bytes_read_p =
+ static_cast(read(devPtr_base, *size_p, *file_offset_p, *devPtr_offset_p));
+ }
+
+ /**
+ * @brief Reads specified bytes from the file into the device memory asynchronously.
+ *
+ * This is an asynchronous version of `.read()`, which will be executed in sequence
+ * for the specified stream.
+ *
+ * When running CUDA v12.1 or older, this function falls back to use `.read()` after
+ * `stream` has been synchronized.
+ *
+ * The arguments have the same meaning as in `.read()`, but this overload returns a `StreamFuture` object
+ * that the caller must keep alive until all data has been read from disk. One way to do this,
+ * is by calling `StreamFuture.check_bytes_done()`, which will synchronize the associated stream
+ * and return the number of bytes read.
+ *
+ * @param devPtr_base Base address of buffer in device memory. For registered buffers,
+ * `devPtr_base` must remain set to the base address used in the `buffer_register` call.
+ * @param size Size in bytes to read.
+ * @param file_offset Offset in the file to read from.
+ * @param devPtr_offset Offset relative to the `devPtr_base` pointer to read into.
+ * This parameter should be used only with registered buffers.
+ * @param stream CUDA stream in which to enqueue the operation. If NULL, make this operation
+ * synchronous.
+ * @return A future object that must be kept alive until all data has been read from the file, e.g.
+ * by synchronizing `stream`.
+ */
+ [[nodiscard]] StreamFuture read_async(void* devPtr_base,
+ std::size_t size,
+ off_t file_offset = 0,
+ off_t devPtr_offset = 0,
+ CUstream stream = nullptr)
+ {
+ StreamFuture ret(devPtr_base, size, file_offset, devPtr_offset, stream);
+ auto [devPtr_base_, size_p, file_offset_p, devPtr_offset_p, bytes_read_p, stream_] =
+ ret.get_args();
+ read_async(devPtr_base_, size_p, file_offset_p, devPtr_offset_p, bytes_read_p, stream_);
+ return ret;
+ }
+
+ /**
+ * @brief Writes specified bytes from the device memory into the file asynchronously.
+ *
+ * This is an asynchronous version of `.write()`, which will be executed in sequence
+ * for the specified stream.
+ *
+ * When running CUDA v12.1 or older, this function falls back to use `.write()` after
+ * `stream` has been synchronized.
+ *
+ * The arguments have the same meaning as in `.write()` but some of them are deferred.
+ * That is, the values pointed to by `size_p`, `file_offset_p` and `devPtr_offset_p`
+ * will not be evaluated until execution time. Notice, this behavior can be changed
+ * using cuFile's cuFileStreamRegister API.
+ *
+ * @param devPtr_base Base address of buffer in device memory. For registered buffers,
+ * `devPtr_base` must remain set to the base address used in the `buffer_register` call.
+ * @param size_p Pointer to size in bytes to write. If the exact size is not known at the time of
+ * I/O submission, then you must set it to the maximum possible I/O size for that stream I/O.
+ * Later the actual size can be set prior to the stream I/O execution.
+ * @param file_offset_p Pointer to offset in the file to which to write. Unless otherwise set
+ * using cuFileStreamRegister API, this value will not be evaluated until execution time.
+ * @param devPtr_offset_p Pointer to the offset relative to the bufPtr_base from which to read.
+ * Unless otherwise set using cuFileStreamRegister API, this value will not be evaluated until
+ * execution time.
+ * @param bytes_written_p Pointer to the bytes written to the file. This pointer should be a non-NULL
+ * value and *bytes_written_p set to 0. The bytes_written_p memory should be allocated with
+ * cuMemHostAlloc/malloc/mmap or registered with cuMemHostRegister.
+ * After successful execution of the operation in the stream, the value *bytes_written_p will
+ * contain either:
+ * - The number of bytes successfully written.
+ * - -1 on IO errors.
+ * - All other errors return a negative integer value of the CUfileOpError enum value.
+ * @param stream CUDA stream in which to enqueue the operation. If NULL, make this operation
+ * synchronous.
+ */
+ void write_async(void* devPtr_base,
+ std::size_t* size_p,
+ off_t* file_offset_p,
+ off_t* devPtr_offset_p,
+ ssize_t* bytes_written_p,
+ CUstream stream)
+ {
+#ifdef KVIKIO_CUFILE_STREAM_API_FOUND
+ if (kvikio::is_batch_and_stream_available() && !_compat_mode) {
+ CUFILE_TRY(cuFileAPI::instance().WriteAsync(
+ _handle, devPtr_base, size_p, file_offset_p, devPtr_offset_p, bytes_written_p, stream));
+ return;
+ }
+#endif
+
+ CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream));
+ *bytes_written_p =
+ static_cast(write(devPtr_base, *size_p, *file_offset_p, *devPtr_offset_p));
+ }
+
+ /**
+ * @brief Writes specified bytes from the device memory into the file asynchronously.
+ *
+ * This is an asynchronous version of `.write()`, which will be executed in sequence
+ * for the specified stream.
+ *
+ * When running CUDA v12.1 or older, this function falls back to use `.write()` after
+ * `stream` has been synchronized.
+ *
+ * The arguments have the same meaning as in `.write()`, but this overload returns a `StreamFuture` object
+ * that the caller must keep alive until all data has been written to disk. One way to do this,
+ * is by calling `StreamFuture.check_bytes_done()`, which will synchronize the associated stream
+ * and return the number of bytes written.
+ *
+ * @param devPtr_base Base address of buffer in device memory. For registered buffers,
+ * `devPtr_base` must remain set to the base address used in the `buffer_register` call.
+ * @param size Size in bytes to write.
+ * @param file_offset Offset in the file to write from.
+ * @param devPtr_offset Offset relative to the `devPtr_base` pointer to write from.
+ * This parameter should be used only with registered buffers.
+ * @param stream CUDA stream in which to enqueue the operation. If NULL, make this operation
+ * synchronous.
+ * @return A future object that must be kept alive until all data has been written to disk e.g.
+ * by synchronizing `stream`.
+ */
+ [[nodiscard]] StreamFuture write_async(void* devPtr_base,
+ std::size_t size,
+ off_t file_offset = 0,
+ off_t devPtr_offset = 0,
+ CUstream stream = nullptr)
+ {
+ StreamFuture ret(devPtr_base, size, file_offset, devPtr_offset, stream);
+ auto [devPtr_base_, size_p, file_offset_p, devPtr_offset_p, bytes_written_p, stream_] =
+ ret.get_args();
+ write_async(devPtr_base_, size_p, file_offset_p, devPtr_offset_p, bytes_written_p, stream_);
+ return ret;
+ }
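+
+  // Minimal usage sketch (not part of the API; `fh`, `buf`, `nbytes` and `s` are hypothetical):
+  // assuming `fh` is a FileHandle opened for writing, `buf` is a device buffer of `nbytes` bytes,
+  // and `s` is a CUDA stream, the StreamFuture overload can be used like:
+  //
+  //   auto fut = fh.write_async(buf, nbytes, 0, 0, s);
+  //   // ...enqueue more work on `s`...
+  //   std::size_t nbytes_written = fut.check_bytes_done();  // synchronizes `s`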
+
/**
* @brief Returns `true` if the compatibility mode has been enabled for this file.
*
diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp
index dbca8c8430..7d4b08d9d8 100644
--- a/cpp/include/kvikio/shim/cuda.hpp
+++ b/cpp/include/kvikio/shim/cuda.hpp
@@ -46,6 +46,7 @@ class cudaAPI {
decltype(cuDeviceGet)* DeviceGet{nullptr};
decltype(cuDevicePrimaryCtxRetain)* DevicePrimaryCtxRetain{nullptr};
decltype(cuDevicePrimaryCtxRelease)* DevicePrimaryCtxRelease{nullptr};
+ decltype(cuStreamSynchronize)* StreamSynchronize{nullptr};
private:
cudaAPI()
@@ -70,6 +71,7 @@ class cudaAPI {
get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet));
get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain));
get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease));
+ get_symbol(StreamSynchronize, lib, KVIKIO_STRINGIFY(cuStreamSynchronize));
}
public:
diff --git a/cpp/include/kvikio/shim/cufile.hpp b/cpp/include/kvikio/shim/cufile.hpp
index 4791b39cd4..84db69b72e 100644
--- a/cpp/include/kvikio/shim/cufile.hpp
+++ b/cpp/include/kvikio/shim/cufile.hpp
@@ -54,7 +54,14 @@ class cuFileAPI {
decltype(cuFileBatchIOCancel)* BatchIOCancel{nullptr};
decltype(cuFileBatchIODestroy)* BatchIODestroy{nullptr};
#endif
- bool batch_available = false;
+
+#ifdef KVIKIO_CUFILE_STREAM_API_FOUND
+ decltype(cuFileReadAsync)* ReadAsync{nullptr};
+ decltype(cuFileWriteAsync)* WriteAsync{nullptr};
+ decltype(cuFileStreamRegister)* StreamRegister{nullptr};
+ decltype(cuFileStreamDeregister)* StreamDeregister{nullptr};
+#endif
+ bool stream_available = false;
private:
cuFileAPI()
@@ -89,16 +96,17 @@ class cuFileAPI {
get_symbol(BatchIOGetStatus, lib, KVIKIO_STRINGIFY(cuFileBatchIOGetStatus));
get_symbol(BatchIOCancel, lib, KVIKIO_STRINGIFY(cuFileBatchIOCancel));
get_symbol(BatchIODestroy, lib, KVIKIO_STRINGIFY(cuFileBatchIODestroy));
+#endif
- // HACK: we use the mangled name of the `CUfileOpError` to determine if cuFile's
- // batch API is available (v12.0.1+). Notice, the symbols of `cuFileBatchIOSetUp` & co.
- // exist all the way back to CUDA v11.5 but calling them is undefined behavior.
- // TODO: when CUDA v12.2 is released, use `cuFileReadAsync` to determine the availability
- // of both the batch and async API.
+#ifdef KVIKIO_CUFILE_STREAM_API_FOUND
+ get_symbol(ReadAsync, lib, KVIKIO_STRINGIFY(cuFileReadAsync));
+ get_symbol(WriteAsync, lib, KVIKIO_STRINGIFY(cuFileWriteAsync));
+ get_symbol(StreamRegister, lib, KVIKIO_STRINGIFY(cuFileStreamRegister));
+ get_symbol(StreamDeregister, lib, KVIKIO_STRINGIFY(cuFileStreamDeregister));
try {
void* s{};
- get_symbol(s, lib, "_ZTS13CUfileOpError");
- batch_available = true;
+ get_symbol(s, lib, "cuFileReadAsync");
+ stream_available = true;
} catch (const std::runtime_error&) {
}
#endif
@@ -169,21 +177,25 @@ inline bool is_cufile_available()
}
/**
- * @brief Check if cuFile's batch API is available
+ * @brief Check if cuFile's batch and stream APIs are available
+ *
+ * Technically, the batch API is available in CUDA 12.1 but since there is no good
+ * way to check the CUDA version using the driver API, we check for the existence of the
+ * `cuFileReadAsync` symbol, which is defined in CUDA 12.2+.
*
* @return The boolean answer
*/
-#ifdef KVIKIO_CUFILE_BATCH_API_FOUND
-inline bool is_batch_available()
+#if defined(KVIKIO_CUFILE_BATCH_API_FOUND) && defined(KVIKIO_CUFILE_STREAM_API_FOUND)
+inline bool is_batch_and_stream_available()
{
try {
- return is_cufile_available() && cuFileAPI::instance().batch_available;
+ return is_cufile_available() && cuFileAPI::instance().stream_available;
} catch (const std::runtime_error&) {
return false;
}
}
#else
-constexpr bool is_batch_available() { return false; }
+constexpr bool is_batch_and_stream_available() { return false; }
#endif
} // namespace kvikio
diff --git a/cpp/include/kvikio/stream.hpp b/cpp/include/kvikio/stream.hpp
new file mode 100644
index 0000000000..6b573fcb06
--- /dev/null
+++ b/cpp/include/kvikio/stream.hpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <tuple>
+#include <utility>
+
+#include <kvikio/error.hpp>
+#include <kvikio/shim/cuda.hpp>
+
+namespace kvikio {
+
+/**
+ * @brief Future of an asynchronous IO operation
+ *
+ * This class shouldn't be used directly; instead, stream operations such as
+ * `FileHandle.read_async` and `FileHandle.write_async` return an instance of this class. Use
+ * `.check_bytes_done()` to synchronize the associated CUDA stream and return the number of bytes
+ * read or written by the operation.
+ *
+ * The goal of this class is twofold:
+ * - Have `read_async` and `write_async` return an object that clearly associates the function
+ * arguments with the CUDA stream used. This is useful because the current validity of the
+ * arguments depends on the stream.
+ * - Support by-value arguments. In many cases, a user will use `read_async` and `write_async`
+ * like most other asynchronous CUDA functions that take by-value arguments.
+ *
+ * To support by-value arguments, we allocate the arguments on the heap (malloc `ArgByVal`) and have
+ * the by-reference arguments point into `ArgByVal`. This way, `read_async` and `write_async`
+ * can call `.get_args()` to get the by-reference arguments required by cuFile's stream API.
+ */
+class StreamFuture {
+ private:
+ struct ArgByVal {
+ std::size_t size;
+ off_t file_offset;
+ off_t devPtr_offset;
+ ssize_t bytes_done;
+ };
+
+ void* _devPtr_base{nullptr};
+ CUstream _stream{nullptr};
+ ArgByVal* _val{nullptr};
+ bool _stream_synchronized{false};
+
+ public:
+ StreamFuture() noexcept = default;
+
+ StreamFuture(
+ void* devPtr_base, std::size_t size, off_t file_offset, off_t devPtr_offset, CUstream stream)
+ : _devPtr_base{devPtr_base}, _stream{stream}
+ {
+ // Notice, we allocate the arguments using malloc() as specified in the cuFile docs.
+ if ((_val = static_cast<ArgByVal*>(std::malloc(sizeof(ArgByVal)))) == nullptr) {
+ throw std::bad_alloc{};
+ }
+ *_val = {
+ .size = size, .file_offset = file_offset, .devPtr_offset = devPtr_offset, .bytes_done = 0};
+ }
+
+ /**
+ * @brief StreamFuture supports move semantics but isn't copyable
+ */
+ StreamFuture(const StreamFuture&) = delete;
+ StreamFuture& operator=(StreamFuture& o) = delete;
+ StreamFuture(StreamFuture&& o) noexcept
+ : _devPtr_base{std::exchange(o._devPtr_base, nullptr)},
+ _stream{std::exchange(o._stream, nullptr)},
+ _val{std::exchange(o._val, nullptr)},
+ _stream_synchronized{o._stream_synchronized}
+ {
+ }
+ StreamFuture& operator=(StreamFuture&& o) noexcept
+ {
+ _devPtr_base = std::exchange(o._devPtr_base, nullptr);
+ _stream = std::exchange(o._stream, nullptr);
+ _val = std::exchange(o._val, nullptr);
+ _stream_synchronized = o._stream_synchronized;
+ return *this;
+ }
+
+ /**
+ * @brief Return the arguments of the future call
+ *
+ * @return Tuple of the arguments in the order matching `FileHandle.read()` and
+ * `FileHandle.write()`
+ */
+ std::tuple<void*, std::size_t*, off_t*, off_t*, ssize_t*, CUstream> get_args() const
+ {
+ if (_val == nullptr) {
+ throw kvikio::CUfileException("cannot get arguments from an uninitialized StreamFuture");
+ }
+ return {_devPtr_base,
+ &_val->size,
+ &_val->file_offset,
+ &_val->devPtr_offset,
+ &_val->bytes_done,
+ _stream};
+ }
+
+ /**
+ * @brief Return the number of bytes read or written by the future operation.
+ *
+ * Synchronize the associated CUDA stream.
+ *
+ * @return Number of bytes read or written by the future operation.
+ */
+ std::size_t check_bytes_done()
+ {
+ if (_val == nullptr) {
+ throw kvikio::CUfileException("cannot check bytes done on an uninitialized StreamFuture");
+ }
+
+ if (!_stream_synchronized) {
+ _stream_synchronized = true;
+ CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream));
+ }
+
+ CUFILE_CHECK_STREAM_IO(&_val->bytes_done);
+ // At this point, we know `_val->bytes_done` is a non-negative value; otherwise
+ // CUFILE_CHECK_STREAM_IO() would have raised an exception.
+ return static_cast<std::size_t>(_val->bytes_done);
+ }
+
+ /**
+ * @brief Free the by-value arguments and make sure the associated CUDA stream has been
+ * synchronized.
+ */
+ ~StreamFuture() noexcept
+ {
+ if (_val != nullptr) {
+ try {
+ check_bytes_done();
+ } catch (const kvikio::CUfileException& e) {
+ std::cerr << e.what() << std::endl;
+ }
+ std::free(_val);
+ }
+ }
+};
+
+} // namespace kvikio
diff --git a/dependencies.yaml b/dependencies.yaml
index 270a228aae..4ed3bbb14b 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -92,7 +92,7 @@ dependencies:
- output_types: [conda, requirements, pyproject]
packages:
- cmake>=3.26.4
- - cython>=0.29,<0.30
+ - cython>=3.0.0
- ninja
- scikit-build>=0.13.1
- output_types: conda
@@ -236,11 +236,13 @@ dependencies:
common:
- output_types: [conda, requirements]
packages:
- - pydata-sphinx-theme
- - sphinx<6
+ - numpydoc
+ - sphinx
+ - sphinx-click
+ - sphinx_rtd_theme
- output_types: conda
packages:
- - doxygen=1.8.20 # pre-commit hook needs a specific version.
+ - doxygen=1.9.1 # pre-commit hook needs a specific version.
py_version:
specific:
- output_types: conda
@@ -262,6 +264,8 @@ dependencies:
packages:
- numpy>=1.21
- zarr
+ # See https://github.com/zarr-developers/numcodecs/pull/475
+ - numcodecs <0.12.0
- packaging
- output_types: conda
packages:
@@ -269,14 +273,6 @@ dependencies:
- output_types: [requirements, pyproject]
packages:
- cupy-cuda11x>=12.0.0
- specific:
- - output_types: requirements
- matrices:
- - matrix:
- arch: x86_64
- packages:
- - libcufile
- - libcufile-dev
test_python:
common:
- output_types: [conda, requirements, pyproject]
@@ -304,4 +300,4 @@ dependencies:
common:
- output_types: conda
packages:
- - cudf==23.8.*
+ - cudf==23.10.*
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 5973ac8f29..4d19c09bbb 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -18,7 +18,6 @@ Zarr
.. autoclass:: GDSStore
:members:
-
Defaults
--------
.. currentmodule:: kvikio.defaults
diff --git a/docs/source/conf.py b/docs/source/conf.py
index b0aa9a5513..d36282c096 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -1,5 +1,5 @@
-#!/usr/bin/env python3
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# See file LICENSE for terms.
#
# Configuration file for the Sphinx documentation builder.
#
@@ -17,17 +17,16 @@
# import sys
# sys.path.insert(0, os.path.abspath('.'))
-
# -- Project information -----------------------------------------------------
project = "kvikio"
-copyright = "2022, NVIDIA"
+copyright = "2023, NVIDIA"
author = "NVIDIA"
# The short X.Y version.
-version = '23.08'
+version = '23.10'
# The full version, including alpha/beta/rc tags
-release = '23.08.00'
+release = '23.10.00'
# -- General configuration ---------------------------------------------------
@@ -37,40 +36,152 @@
# ones.
extensions = [
"sphinx.ext.autodoc",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.viewcode",
+ "sphinx.ext.githubpages",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.extlinks",
+ "numpydoc",
+ "sphinx_click",
+ "sphinx_rtd_theme",
]
+numpydoc_show_class_members = False
+
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = ".rst"
+
+# The master toctree document.
+master_doc = "index"
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = "en"
+
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
-html_theme = "pydata_sphinx_theme"
-html_logo = "_static/RAPIDS-logo-purple.png"
-
-html_theme_options = {
- "external_links": [],
- # https://github.com/pydata/pydata-sphinx-theme/issues/1220
- "icon_links": [],
- "github_url": "https://github.com/rapidsai/kvikio",
- "twitter_url": "https://twitter.com/rapidsai",
- "show_toc_level": 1,
- "navbar_align": "right",
-}
+html_theme = "sphinx_rtd_theme"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself. Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = "kvikiodoc"
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ (master_doc, "kvikio.tex", "kvikio Documentation", "NVIDIA", "manual")
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, "kvikio", "kvikio Documentation", [author], 1)]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (
+ master_doc,
+ "kvikio",
+ "kvikio Documentation",
+ author,
+ "kvikio",
+ "One line description of project.",
+ "Miscellaneous",
+ )
+]
+
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ["search.html"]
+
+
+# -- Extension configuration -------------------------------------------------
+
def setup(app):
app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 31754db736..4dd491fd96 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,18 +1,28 @@
-Welcome to KvikIO's documentation!
-==================================
+Welcome to KvikIO's Python documentation!
+=========================================
-KvikIO is a Python library providing bindings to `cuFile `_, which enables `GPUDirectStorage `_ (GDS).
+KvikIO is a Python and C++ library for high performance file IO. It provides C++ and Python
+bindings to `cuFile `_,
+which enables `GPUDirect Storage `_ (GDS).
+KvikIO also works efficiently when GDS isn't available and can read/write both host and device data seamlessly.
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
+KvikIO is a part of the `RAPIDS `_ suite of open-source software libraries for GPU-accelerated data science.
- api
+.. note::
+ This is the documentation for the Python library. For the C++ documentation, see `libkvikio `_.
-Indices and tables
-==================
+Contents
+--------
-* :ref:`genindex`
-* :ref:`search`
+.. toctree::
+ :maxdepth: 1
+ :caption: Getting Started
+
+ install
+ quickstart
+ zarr
+ runtime_settings
+ api
+ genindex
diff --git a/docs/source/install.rst b/docs/source/install.rst
new file mode 100644
index 0000000000..c6f11a7a93
--- /dev/null
+++ b/docs/source/install.rst
@@ -0,0 +1,71 @@
+Installation
+============
+
+KvikIO can be installed using Conda/Mamba or from source.
+
+
+Conda/Mamba
+-----------
+
+We strongly recommend using `mamba `_ in place of conda, which we will do throughout the documentation.
+
+Install the **stable release** from the ``rapidsai`` channel like:
+
+.. code-block::
+
+ # Install in existing environment
+ mamba install -c rapidsai -c conda-forge kvikio
+ # Create new environment (CUDA 11.8)
+ mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.10 cuda-version=11.8 kvikio
+ # Create new environment (CUDA 12.0)
+ mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.10 cuda-version=12.0 kvikio
+
+Install the **nightly release** from the ``rapidsai-nightly`` channel like:
+
+.. code-block::
+
+ # Install in existing environment
+ mamba install -c rapidsai-nightly -c conda-forge kvikio
+ # Create new environment (CUDA 11.8)
+ mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.10 cuda-version=11.8 kvikio
+ # Create new environment (CUDA 12.0)
+ mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.10 cuda-version=12.0 kvikio
+
+
+.. note::
+
+ If the nightly install doesn't work, set ``channel_priority: flexible`` in your ``.condarc``.
+
+Build from source
+-----------------
+
+In order to set up a development environment, run:
+
+.. code-block::
+
+ # CUDA 11.8
+ mamba env create --name kvikio-dev --file conda/environments/all_cuda-118_arch-x86_64.yaml
+ # CUDA 12.0
+ mamba env create --name kvikio-dev --file conda/environments/all_cuda-120_arch-x86_64.yaml
+
+To build and install the extension run:
+
+.. code-block::
+
+ ./build.sh kvikio
+
+
+One might have to set ``CUDA_HOME`` to the path of the CUDA installation.
+
+In order to test the installation, run the following:
+
+.. code-block::
+
+ pytest tests/
+
+
+And to test performance, run the following:
+
+.. code-block::
+
+ python benchmarks/single-node-io.py
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
new file mode 100644
index 0000000000..448b3101f5
--- /dev/null
+++ b/docs/source/quickstart.rst
@@ -0,0 +1,37 @@
+Quickstart
+==========
+
+KvikIO can be used in place of Python's built-in `open() `_ function with the caveat that a file is always opened in binary (``"b"``) mode.
+In order to open a file, use KvikIO's file handle :py:meth:`kvikio.cufile.CuFile`.
+
+.. code-block:: python
+
+ import cupy
+ import kvikio
+
+ a = cupy.arange(100)
+ f = kvikio.CuFile("test-file", "w")
+ # Write whole array to file
+ f.write(a)
+ f.close()
+
+ b = cupy.empty_like(a)
+ f = kvikio.CuFile("test-file", "r")
+ # Read whole array from file
+ f.read(b)
+ assert all(a == b)
+
+    # Use context manager
+ c = cupy.empty_like(a)
+    with kvikio.CuFile("test-file", "r") as f:
+ f.read(c)
+ assert all(a == c)
+
+ # Non-blocking read
+ d = cupy.empty_like(a)
+    with kvikio.CuFile("test-file", "r") as f:
+ future1 = f.pread(d[:50])
+ future2 = f.pread(d[50:], file_offset=d[:50].nbytes)
+ future1.get() # Wait for first read
+ future2.get() # Wait for second read
+ assert all(a == d)
diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst
new file mode 100644
index 0000000000..2d03eb2f87
--- /dev/null
+++ b/docs/source/runtime_settings.rst
@@ -0,0 +1,26 @@
+Runtime Settings
+================
+
+Compatibility Mode ``KVIKIO_COMPAT_MODE``
+-----------------------------------------
+When KvikIO is running in compatibility mode, it doesn't load ``libcufile.so``. Instead, reads and writes are done using POSIX. Notice, this is not the same as the compatibility mode in cuFile. That is, cuFile can run in compatibility mode while KvikIO does not.
+Set the environment variable ``KVIKIO_COMPAT_MODE`` to enable/disable compatibility mode. By default, compatibility mode is enabled:
+
+ * when ``libcufile.so`` cannot be found.
+ * when running in Windows Subsystem for Linux (WSL).
+ * when ``/run/udev`` isn't readable, which typically happens when running inside a docker image not launched with ``--volume /run/udev:/run/udev:ro``.
+
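+Whether KvikIO ended up in compatibility mode can also be checked at runtime. A minimal sketch
+(note that ``KVIKIO_COMPAT_MODE`` itself must be set in the environment before KvikIO loads
+cuFile, e.g. before starting the process):
+
+.. code-block:: python
+
+    import kvikio.defaults
+
+    # True when KvikIO uses the POSIX backend instead of libcufile.so
+    print(kvikio.defaults.compat_mode())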
+
+Thread Pool ``KVIKIO_NTHREADS``
+-------------------------------
+KvikIO can use multiple threads for IO automatically. Set the environment variable ``KVIKIO_NTHREADS`` to the number of threads in the thread pool. If not set, the default value is 1.
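+
+The thread pool size can also be changed from Python at runtime. A minimal sketch using
+``kvikio.defaults.num_threads_reset`` (the same helper the benchmarks in this repository use):
+
+.. code-block:: python
+
+    import kvikio.defaults
+
+    kvikio.defaults.num_threads_reset(8)  # use 8 threads for parallel IO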
+
+
+Task Size ``KVIKIO_TASK_SIZE``
+------------------------------
+KvikIO splits parallel IO operations into multiple tasks. Set the environment variable ``KVIKIO_TASK_SIZE`` to the maximum task size (in bytes). If not set, the default value is 4194304 (4 MiB).
+
+
+GDS Threshold ``KVIKIO_GDS_THRESHOLD``
+--------------------------------------
+In order to improve the performance of small IO, ``.pread()`` and ``.pwrite()`` implement a shortcut that circumvents the thread pool and uses the POSIX backend directly. Set the environment variable ``KVIKIO_GDS_THRESHOLD`` to the minimum size (in bytes) to use GDS. If not set, the default value is 1048576 (1 MiB).
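+
+Both the task size and the GDS threshold are read from the environment, so a minimal sketch is to
+set the variables (values in bytes) before KvikIO is imported and used:
+
+.. code-block:: python
+
+    import os
+
+    os.environ["KVIKIO_TASK_SIZE"] = str(16 * 1024 * 1024)  # 16 MiB tasks
+    os.environ["KVIKIO_GDS_THRESHOLD"] = str(1024 * 1024)   # use GDS for IO of 1 MiB or larger
+
+    import kvikio  # settings take effect when KvikIO reads its defaults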
diff --git a/docs/source/zarr.rst b/docs/source/zarr.rst
new file mode 100644
index 0000000000..f2a697d525
--- /dev/null
+++ b/docs/source/zarr.rst
@@ -0,0 +1,15 @@
+Zarr
+====
+
+`Zarr `_ is a binary file format for chunked, compressed, N-dimensional arrays. It is used throughout the PyData ecosystem, especially for climate and biological science applications.
+
+
+`Zarr-Python `_ is the official Python package for reading and writing Zarr arrays. Its main feature is a NumPy-like array that translates array operations into file IO seamlessly.
+KvikIO provides a GPU backend to Zarr-Python that enables `GPUDirect Storage (GDS) `_ seamlessly.
+
+The following is an example of how to use the convenience function :py:meth:`kvikio.zarr.open_cupy_array`
+to create a new Zarr array and how to open an existing Zarr array.
+
+
+.. literalinclude:: ../../python/examples/zarr_cupy_nvcomp.py
+ :language: python
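+
+As a complementary sketch (assuming a Zarr array already exists at the hypothetical path
+``my-zarr-file.zarr``), an array can also be opened directly with Zarr-Python by combining
+KvikIO's GDS store with a CuPy ``meta_array``, so that reads return device memory:
+
+.. code-block:: python
+
+    import cupy
+    import zarr
+
+    import kvikio.zarr
+
+    # Read directly into GPU memory via the GDS store
+    z = zarr.open_array(
+        kvikio.zarr.GDSStore("my-zarr-file.zarr"),
+        meta_array=cupy.empty(()),
+    )
+    assert isinstance(z[:], cupy.ndarray)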
diff --git a/legate/legate_kvikio/__init__.py b/legate/legate_kvikio/__init__.py
index a83712726d..612e15e0aa 100644
--- a/legate/legate_kvikio/__init__.py
+++ b/legate/legate_kvikio/__init__.py
@@ -3,4 +3,4 @@
from .cufile import CuFile # noqa: F401
-__version__ = "23.08.00"
+__version__ = "23.10.00"
diff --git a/legate/pyproject.toml b/legate/pyproject.toml
index 6f6b440b2b..3e211f9703 100644
--- a/legate/pyproject.toml
+++ b/legate/pyproject.toml
@@ -5,7 +5,7 @@
build-backend = "setuptools.build_meta"
requires = [
"cmake>=3.26.4",
- "cython>=0.29,<0.30",
+ "cython>=3.0.0",
"ninja",
"scikit-build>=0.13.1",
"setuptools",
@@ -14,7 +14,7 @@ requires = [
[project]
name = "legate_kvikio"
-version = "23.08.00"
+version = "23.10.00"
description = "KvikIO - GPUDirect Storage"
readme = { file = "README.md", content-type = "text/markdown" }
authors = [
@@ -24,6 +24,7 @@ license = { text = "Apache 2.0" }
requires-python = ">=3.9"
dependencies = [
"cupy-cuda11x>=12.0.0",
+ "numcodecs <0.12.0",
"numpy>=1.21",
"packaging",
"zarr",
diff --git a/notebooks/zarr.ipynb b/notebooks/zarr.ipynb
new file mode 100644
index 0000000000..33a981ebf5
--- /dev/null
+++ b/notebooks/zarr.ipynb
@@ -0,0 +1,364 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "7a060f7d-9a0c-4763-98df-7dc82409c6ba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\"\"\"\n",
+ "In this tutorial, we will show how to use KvikIO to read and write GPU memory directly to/from Zarr files.\n",
+ "\"\"\"\n",
+ "import json\n",
+ "import shutil\n",
+ "import numpy\n",
+ "import cupy\n",
+ "import zarr\n",
+ "import kvikio\n",
+ "import kvikio.zarr\n",
+ "from kvikio.nvcomp_codec import NvCompBatchCodec\n",
+ "from numcodecs import LZ4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "99f4d25b-2006-4026-8629-1accafb338ef",
+ "metadata": {},
+ "source": [
+ "We need to set three Zarr arguments: \n",
+ " - `meta_array`: in order to make Zarr read into GPU memory (instead of CPU memory), we set the `meta_array` argument to an empty CuPy array. \n",
+ " - `store`: we need to use a GPU compatible Zarr Store, which will be KvikIO's GDS store in our case. \n",
+ " - `compressor`: finally, we need to use a GPU compatible compressor (or `None`). KvikIO provides a nvCOMP compressor `kvikio.nvcomp_codec.NvCompBatchCodec` that we will use."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "c179c24a-766e-4e09-83c5-349868042576",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(,\n",
+ " NvCompBatchCodec(algorithm='lz4', options={}),\n",
+ " )"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Let's create a new Zarr array using KvikIO's GDS store and LZ4 compression\n",
+ "z = zarr.array(\n",
+ " cupy.arange(10), \n",
+ " chunks=2, \n",
+ " store=kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"), \n",
+ " meta_array=cupy.empty(()),\n",
+ " compressor=NvCompBatchCodec(\"lz4\"),\n",
+ " overwrite=True,\n",
+ ")\n",
+ "z, z.compressor, z.store"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "cupy.ndarray"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# And because we set the `meta_array` argument, reading the Zarr array returns a CuPy array\n",
+ "type(z[:])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "549ded39-1053-4f82-a8a7-5a2ee999a4a1",
+ "metadata": {},
+ "source": [
+ "From this point onwards, `z` can be used just like any other Zarr array."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "8221742d-f15c-450a-9701-dc8c05326126",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([1, 2, 3, 4, 5, 6, 7, 8])"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "z[1:9]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "f0c451c1-a240-4b26-a5ef-6e70a5bbeb55",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([42, 43, 44, 45, 46, 47, 48, 49, 50, 51])"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "z[:] + 42"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7797155f-40f4-4c50-b704-2356ca64cba3",
+ "metadata": {},
+ "source": [
+ "### GPU compression / CPU decompression"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a0029deb-19b9-4dbb-baf0-ce4b199605a5",
+ "metadata": {},
+ "source": [
+ "In order to read a GPU-written Zarr file into a NumPy array, we simply open that file **without** setting the `meta_array` argument:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "399f23f7-4475-496a-a537-a7163a35c888",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(numpy.ndarray,\n",
+ " kvikio.nvcomp_codec.NvCompBatchCodec,\n",
+ " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "z = zarr.open_array(kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"))\n",
+ "type(z[:]), type(z.compressor), z[:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e9f31d5",
+ "metadata": {},
+ "source": [
+ "And we don't need to use `kvikio.zarr.GDSStore` either:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "4b1f46b2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(numpy.ndarray,\n",
+ " kvikio.nvcomp_codec.NvCompBatchCodec,\n",
+ " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "z = zarr.open_array(\"my-zarr-file.zarr\")\n",
+ "type(z[:]), type(z.compressor), z[:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f10fd704-35f7-46b7-aabe-ea68fb2bf88d",
+ "metadata": {},
+ "source": [
+ "However, the above uses `NvCompBatchCodec(\"lz4\")` for decompression. In the following, we will show how to read, on the CPU, a Zarr file that was written and compressed using a GPU.\n",
+ "\n",
+ "Some algorithms, such as LZ4, can be used interchangeably on CPU and GPU but Zarr will always use the compressor used to write the Zarr file. We are working with the Zarr team to fix this shortcoming but for now, we will use a workaround where we _patch_ the metadata manually."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "d980361a-e132-4f29-ab13-cbceec5bbbb5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(numpy.ndarray, numcodecs.lz4.LZ4, array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read the Zarr metadata and replace the compressor with a CPU implementation of LZ4\n",
+ "store = zarr.DirectoryStore(\"my-zarr-file.zarr\") # We could also have used kvikio.zarr.GDSStore\n",
+ "meta = json.loads(store[\".zarray\"])\n",
+ "meta[\"compressor\"] = LZ4().get_config()\n",
+ "store[\".zarray\"] = json.dumps(meta).encode() # NB: this changes the Zarr metadata on disk\n",
+ "\n",
+ "# And then open the file as usual\n",
+ "z = zarr.open_array(store)\n",
+ "type(z[:]), type(z.compressor), z[:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8ea73705",
+ "metadata": {},
+ "source": [
+ "### CPU compression / GPU decompression\n",
+ "\n",
+ "Now, let's try the other way around."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "c9b2d56a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(,\n",
+ " LZ4(acceleration=1),\n",
+ " )"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import numcodecs\n",
+ "# Let's create a new Zarr array using the default compression.\n",
+ "z = zarr.array(\n",
+ " numpy.arange(10), \n",
+ " chunks=2, \n",
+ " store=\"my-zarr-file.zarr\", \n",
+ " overwrite=True,\n",
+ " # The default (CPU) implementation of LZ4 codec.\n",
+ " compressor=numcodecs.registry.get_codec({\"id\": \"lz4\"})\n",
+ ")\n",
+ "z, z.compressor, z.store"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dedd4623",
+ "metadata": {},
+ "source": [
+ "Again, we will use a workaround where we _patch_ the metadata manually."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "ac3f30b1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(cupy.ndarray,\n",
+ " kvikio.nvcomp_codec.NvCompBatchCodec,\n",
+ " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read the Zarr metadata and replace the compressor with a GPU implementation of LZ4\n",
+ "store = kvikio.zarr.GDSStore(\"my-zarr-file.zarr\") # We could also have used zarr.DirectoryStore\n",
+ "meta = json.loads(store[\".zarray\"])\n",
+ "meta[\"compressor\"] = NvCompBatchCodec(\"lz4\").get_config()\n",
+ "store[\".zarray\"] = json.dumps(meta).encode() # NB: this changes the Zarr metadata on disk\n",
+ "\n",
+ "# And then open the file as usual\n",
+ "z = zarr.open_array(store, meta_array=cupy.empty(()))\n",
+ "type(z[:]), type(z.compressor), z[:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "80682922-b7b0-4b08-b595-228c2b446a78",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Clean up\n",
+ "shutil.rmtree(\"my-zarr-file.zarr\", ignore_errors=True)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 71ce53d16f..4813801439 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -14,12 +14,13 @@
cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
-set(kvikio_version 23.08.00)
+set(kvikio_version 23.10.00)
-set(CYTHON_FLAGS
- "--directive binding=True,embedsignature=True,always_allow_keywords=True"
- CACHE STRING "The directives for Cython compilation."
-)
+include(../cpp/cmake/fetch_rapids.cmake)
+include(rapids-cpm)
+rapids_cpm_init()
+include(rapids-cuda)
+rapids_cuda_init_architectures(kvikio-python)
project(
kvikio-python
@@ -35,26 +36,25 @@ option(FIND_KVIKIO_CPP
"Search for existing KVIKIO C++ installations before defaulting to local files" OFF
)
-find_package(PythonExtensions REQUIRED)
-find_package(Cython REQUIRED)
-
# TODO: Should we symlink FindcuFile.cmake into python/cmake? find cuFile
include(../cpp/cmake/Modules/FindcuFile.cmake)
-# Ignore unused variable warning.
-set(ignored_variable "${SKBUILD}")
-
if(FIND_KVIKIO_CPP)
find_package(KvikIO ${kvikio_version})
else()
- set(KVIKIO_FOUND OFF)
+ set(KvikIO_FOUND OFF)
endif()
find_package(CUDAToolkit REQUIRED)
-if(NOT KVIKIO_FOUND)
+if(NOT KvikIO_FOUND)
add_subdirectory(../cpp kvikio-cpp)
+ set(cython_lib_dir kvikio)
+ install(TARGETS kvikio DESTINATION ${cython_lib_dir})
endif()
+include(rapids-cython)
+rapids_cython_init()
+
add_subdirectory(cmake)
add_subdirectory(kvikio/_lib)
diff --git a/python/benchmarks/zarr-io.py b/python/benchmarks/zarr-io.py
new file mode 100644
index 0000000000..983c735364
--- /dev/null
+++ b/python/benchmarks/zarr-io.py
@@ -0,0 +1,324 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# See file LICENSE for terms.
+
+import argparse
+import contextlib
+import os
+import os.path
+import pathlib
+import shutil
+import statistics
+import subprocess
+import tempfile
+from time import perf_counter as clock
+from typing import ContextManager, Union
+
+import cupy
+import numcodecs.blosc
+import numpy
+import zarr
+from dask.utils import format_bytes, parse_bytes
+
+import kvikio
+import kvikio.defaults
+import kvikio.zarr
+
+if not kvikio.zarr.supported:
+ raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}")
+
+compressors = {
+ "none": (None, None),
+ "lz4": (numcodecs.blosc.Blosc(cname="lz4"), kvikio.zarr.LZ4()),
+}
+
+
+def drop_vm_cache(args):
+ if args.drop_vm_cache:
+ subprocess.check_output(["sudo /sbin/sysctl vm.drop_caches=3"], shell=True)
+
+
+def create_src_data(args):
+ return cupy.random.random(args.nelem, dtype=args.dtype)
+
+
+def run_kvikio(args):
+ dir_path = args.dir / "kvikio"
+ shutil.rmtree(str(dir_path), ignore_errors=True)
+
+ # Get the GPU compressor
+ compressor = compressors[args.compressor][1]
+
+ src = create_src_data(args)
+
+ # Write
+ drop_vm_cache(args)
+ t0 = clock()
+ z = zarr.create(
+ shape=(args.nelem,),
+ chunks=(args.chunksize,),
+ dtype=args.dtype,
+ compressor=compressor,
+ store=kvikio.zarr.GDSStore(dir_path),
+ meta_array=cupy.empty(()),
+ )
+ z[:] = src
+ os.sync()
+ write_time = clock() - t0
+
+ # Read
+ drop_vm_cache(args)
+ t0 = clock()
+ res = z[:]
+ read_time = clock() - t0
+ assert res.nbytes == args.nbytes
+
+ return read_time, write_time
+
+
+def run_posix(args):
+ dir_path = args.dir / "posix"
+ shutil.rmtree(str(dir_path), ignore_errors=True)
+
+ # Get the CPU compressor
+ compressor = compressors[args.compressor][0]
+
+ src = create_src_data(args)
+
+ # Write
+ drop_vm_cache(args)
+ t0 = clock()
+ z = zarr.create(
+ shape=(args.nelem,),
+ chunks=(args.chunksize,),
+ dtype=args.dtype,
+ compressor=compressor,
+ store=zarr.DirectoryStore(dir_path),
+ meta_array=numpy.empty(()),
+ )
+ z[:] = src.get()
+ os.sync()
+ write_time = clock() - t0
+
+ # Read
+ drop_vm_cache(args)
+ t0 = clock()
+ res = cupy.asarray(z[:])
+ read_time = clock() - t0
+ assert res.nbytes == args.nbytes
+
+ return read_time, write_time
+
+
+API = {
+ "kvikio": run_kvikio,
+ "posix": run_posix,
+}
+
+
+def main(args):
+ cupy.cuda.set_allocator(None) # Disable CuPy's default memory pool
+ cupy.arange(10) # Make sure CUDA is initialized
+
+ kvikio.defaults.num_threads_reset(args.nthreads)
+ props = kvikio.DriverProperties()
+ try:
+ import pynvml.smi
+
+ nvsmi = pynvml.smi.nvidia_smi.getInstance()
+ except ImportError:
+ gpu_name = "Unknown (install pynvml)"
+ mem_total = gpu_name
+ bar1_total = gpu_name
+ else:
+ info = nvsmi.DeviceQuery()["gpu"][0]
+ gpu_name = f"{info['product_name']} (dev #0)"
+ mem_total = format_bytes(
+ parse_bytes(
+ str(info["fb_memory_usage"]["total"]) + info["fb_memory_usage"]["unit"]
+ )
+ )
+ bar1_total = format_bytes(
+ parse_bytes(
+ str(info["bar1_memory_usage"]["total"])
+ + info["bar1_memory_usage"]["unit"]
+ )
+ )
+ gds_version = "N/A (Compatibility Mode)"
+ if props.is_gds_available:
+ gds_version = f"v{props.major_version}.{props.minor_version}"
+ gds_config_json_path = os.path.realpath(
+ os.getenv("CUFILE_ENV_PATH_JSON", "/etc/cufile.json")
+ )
+ drop_vm_cache_msg = str(args.drop_vm_cache)
+ if not args.drop_vm_cache:
+ drop_vm_cache_msg += " (use --drop-vm-cache for better accuracy!)"
+ chunksize = args.chunksize * args.dtype.itemsize
+
+ print("Roundtrip benchmark")
+ print("----------------------------------")
+ if kvikio.defaults.compat_mode():
+ print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+ print(" WARNING - KvikIO compat mode ")
+ print(" libcufile.so not used ")
+ print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+ elif not props.is_gds_available:
+ print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+ print(" WARNING - cuFile compat mode ")
+ print(" GDS not enabled ")
+ print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+ print(f"GPU | {gpu_name}")
+ print(f"GPU Memory Total | {mem_total}")
+ print(f"BAR1 Memory Total | {bar1_total}")
+ print(f"GDS driver | {gds_version}")
+ print(f"GDS config.json | {gds_config_json_path}")
+ print("----------------------------------")
+ print(f"nbytes | {args.nbytes} bytes ({format_bytes(args.nbytes)})")
+ print(f"chunksize | {chunksize} bytes ({format_bytes(chunksize)})")
+ print(f"4K aligned | {args.nbytes % 4096 == 0}")
+ print(f"drop-vm-cache | {drop_vm_cache_msg}")
+ print(f"directory | {args.dir}")
+ print(f"nthreads | {args.nthreads}")
+ print(f"nruns | {args.nruns}")
+ print(f"compressor | {args.compressor}")
+ print("==================================")
+
+ # Run each benchmark using the requested APIs
+ for api in args.api:
+ rs = []
+ ws = []
+ for _ in range(args.n_warmup_runs):
+ read, write = API[api](args)
+ for _ in range(args.nruns):
+ read, write = API[api](args)
+ rs.append(args.nbytes / read)
+ ws.append(args.nbytes / write)
+
+ def pprint_api_res(name, samples):
+ mean = statistics.mean(samples) if len(samples) > 1 else samples[0]
+ ret = f"{api} {name}".ljust(18)
+ ret += f"| {format_bytes(mean).rjust(10)}/s".ljust(14)
+ if len(samples) > 1:
+ stdev = statistics.stdev(samples) / mean * 100
+ ret += " Β± %5.2f %%" % stdev
+ ret += " ("
+ for sample in samples:
+ ret += f"{format_bytes(sample)}/s, "
+ ret = ret[:-2] + ")" # Replace trailing comma
+ return ret
+
+ print(pprint_api_res("read", rs))
+ print(pprint_api_res("write", ws))
+
+
+if __name__ == "__main__":
+
+ def parse_directory(x):
+ if x is None:
+ return x
+ else:
+ p = pathlib.Path(x)
+ if not p.is_dir():
+ raise argparse.ArgumentTypeError("Must be a directory")
+ return p
+
+ parser = argparse.ArgumentParser(description="Roundtrip benchmark")
+ parser.add_argument(
+ "-n",
+ "--nbytes",
+ metavar="BYTES",
+ default="10 MiB",
+ type=parse_bytes,
+ help="Message size, which must be a multiple of 8 (default: %(default)s).",
+ )
+ parser.add_argument(
+ "--chunksize",
+ metavar="BYTES",
+ default="10 MiB",
+ type=parse_bytes,
+ help="Chunk size (default: %(default)s).",
+ )
+ parser.add_argument(
+ "--dtype",
+ default="float32",
+ type=numpy.dtype,
+ help="NumPy datatype to use (default: '%(default)s')",
+ )
+ parser.add_argument(
+ "-d",
+ "--dir",
+ metavar="PATH",
+ default=None,
+ type=parse_directory,
+ help="Path to the directory to r/w from (default: tempfile.TemporaryDirectory)",
+ )
+ parser.add_argument(
+ "--nruns",
+ metavar="RUNS",
+ default=1,
+ type=int,
+ help="Number of runs per API (default: %(default)s).",
+ )
+ parser.add_argument(
+ "--n-warmup-runs",
+ default=0,
+ type=int,
+ help="Number of warmup runs (default: %(default)s).",
+ )
+ parser.add_argument(
+ "-t",
+ "--nthreads",
+ metavar="THREADS",
+ default=1,
+ type=int,
+ help="Number of threads to use (default: %(default)s).",
+ )
+ parser.add_argument(
+ "--api",
+ metavar="API",
+ default=("kvikio", "posix"),
+ nargs="+",
+ choices=tuple(API.keys()) + ("all",),
+ help="List of APIs to use {%(choices)s}",
+ )
+ parser.add_argument(
+ "--compressor",
+ metavar="COMPRESSOR",
+ default="none",
+ choices=tuple(compressors.keys()),
+ help=(
+ "Set a nvCOMP compressor to use with Zarr "
+ "{%(choices)s} (default: %(default)s)"
+ ),
+ )
+ parser.add_argument(
+ "--drop-vm-cache",
+ action="store_true",
+ default=False,
+ help=(
+ "Drop the VM cache between writes and reads, "
+ "requires sudo access to /sbin/sysctl"
+ ),
+ )
+
+ args = parser.parse_args()
+ if "all" in args.api:
+ args.api = tuple(API.keys())
+
+ # Check if size is divisible by size of datatype
+ assert args.nbytes % args.dtype.itemsize == 0
+ assert args.chunksize % args.dtype.itemsize == 0
+
+ # Compute/convert to number of elements
+ args.nelem = args.nbytes // args.dtype.itemsize
+ args.chunksize = args.chunksize // args.dtype.itemsize
+
+ # Create a temporary directory if user didn't specify a directory
+ temp_dir: Union[tempfile.TemporaryDirectory, ContextManager]
+ if args.dir is None:
+ temp_dir = tempfile.TemporaryDirectory()
+ args.dir = pathlib.Path(temp_dir.name)
+ else:
+ temp_dir = contextlib.nullcontext()
+
+ with temp_dir:
+ main(args)
diff --git a/python/cmake/CMakeLists.txt b/python/cmake/CMakeLists.txt
index c200e3026a..4249773480 100644
--- a/python/cmake/CMakeLists.txt
+++ b/python/cmake/CMakeLists.txt
@@ -1,5 +1,5 @@
# =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
@@ -13,4 +13,3 @@
# =============================================================================
include(thirdparty/get_nvcomp.cmake)
-include(kvikio_python_helpers.cmake)
diff --git a/python/cmake/kvikio_python_helpers.cmake b/python/cmake/kvikio_python_helpers.cmake
deleted file mode 100644
index 837039ecc9..0000000000
--- a/python/cmake/kvikio_python_helpers.cmake
+++ /dev/null
@@ -1,53 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-#[=======================================================================[.rst:
-add_cython_modules
-------------------
-
-Generate C(++) from Cython and create Python modules.
-
-.. code-block:: cmake
-
- add_cython_modules()
-
-Creates a Cython target for a module, then adds a corresponding Python
-extension module.
-
-``ModuleName``
- The list of modules to build.
-
-#]=======================================================================]
-function(add_cython_modules cython_modules)
- foreach(cython_module ${cython_modules})
- add_cython_target(${cython_module} CXX PY3)
- add_library(${cython_module} MODULE ${cython_module})
- python_extension_module(${cython_module})
-
- # To avoid libraries being prefixed with "lib".
- set_target_properties(${cython_module} PROPERTIES PREFIX "" CXX_STANDARD 17)
- # Link to the C++ library.
- target_link_libraries(${cython_module} kvikio)
- # Treat warnings as errors when compiling.
- target_compile_options(${cython_module} PRIVATE -Werror)
-
- # Compute the install directory relative to the source and rely on installs being relative to
- # the CMAKE_PREFIX_PATH for e.g. editable installs.
- cmake_path(
- RELATIVE_PATH CMAKE_CURRENT_SOURCE_DIR BASE_DIRECTORY ${kvikio-python_SOURCE_DIR}
- OUTPUT_VARIABLE install_dst
- )
- install(TARGETS ${cython_module} DESTINATION ${install_dst})
- endforeach(cython_module ${cython_sources})
-endfunction()
diff --git a/python/examples/zarr_cupy_nvcomp.py b/python/examples/zarr_cupy_nvcomp.py
new file mode 100644
index 0000000000..766139b442
--- /dev/null
+++ b/python/examples/zarr_cupy_nvcomp.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# See file LICENSE for terms.
+
+import cupy
+import numpy
+import zarr
+
+import kvikio
+import kvikio.zarr
+
+
+def main(path):
+ a = cupy.arange(20)
+
+ # Let's use KvikIO's convenience function `open_cupy_array()` to create
+ # a new Zarr file on disk. Its semantics are the same as `zarr.open_array()`
+ # but uses a GDS file store, nvCOMP compression, and CuPy arrays.
+ z = kvikio.zarr.open_cupy_array(store=path, mode="w", shape=(20,), chunks=(5,))
+
+ # `z` is a regular Zarr Array that we can write to as usual
+ z[0:10] = numpy.arange(0, 10)
+ # but it also supports direct reads and writes of CuPy arrays
+ z[10:20] = cupy.arange(10, 20)
+
+ # Reading `z` returns a CuPy array
+ assert isinstance(z[:], cupy.ndarray)
+ assert (a == z[:]).all()
+
+ # Normally, we cannot assume that GPU and CPU compressors are compatible.
+ # E.g., `open_cupy_array()` uses nvCOMP's Snappy GPU compression by default,
+ # which, as far as we know, isn't compatible with any CPU compressor. Thus,
+ # let's re-write our Zarr array using a CPU and GPU compatible compressor.
+ z = kvikio.zarr.open_cupy_array(
+ store=path,
+ mode="w",
+ shape=(20,),
+ chunks=(5,),
+ compressor=kvikio.zarr.CompatCompressor.lz4(),
+ )
+ z[:] = a
+
+ # Because we are using a CompatCompressor, it is now possible to open the file
+ # using Zarr's built-in LZ4 decompressor that uses the CPU.
+ z = zarr.open_array(path)
+ # `z` is now read as a regular NumPy array
+ assert isinstance(z[:], numpy.ndarray)
+ assert (a.get() == z[:]).all()
+ # and we can write to it as usual
+ z[:] = numpy.arange(20, 40)
+
+ # And we can read the Zarr file back into a CuPy array.
+ z = kvikio.zarr.open_cupy_array(store=path, mode="r")
+ assert isinstance(z[:], cupy.ndarray)
+ assert (cupy.arange(20, 40) == z[:]).all()
+
+
+if __name__ == "__main__":
+ main("/tmp/zarr-cupy-nvcomp")
diff --git a/python/kvikio/__init__.py b/python/kvikio/__init__.py
index 34006c5096..0599660428 100644
--- a/python/kvikio/__init__.py
+++ b/python/kvikio/__init__.py
@@ -4,7 +4,7 @@
from ._lib import libkvikio # type: ignore
from .cufile import CuFile # noqa: F401
-__version__ = "23.08.00"
+__version__ = "23.10.00"
def memory_register(buf) -> None:
diff --git a/python/kvikio/_lib/CMakeLists.txt b/python/kvikio/_lib/CMakeLists.txt
index f88c0126b0..04cff87c08 100644
--- a/python/kvikio/_lib/CMakeLists.txt
+++ b/python/kvikio/_lib/CMakeLists.txt
@@ -13,10 +13,10 @@
# =============================================================================
# Set the list of Cython files to build
-set(cython_modules arr libnvcomp libnvcomp_ll libkvikio)
+set(cython_modules arr.pyx libnvcomp.pyx libnvcomp_ll.pyx libkvikio.pyx)
-# Build all of the Cython targets
-add_cython_modules("${cython_modules}")
-
-target_link_libraries(libnvcomp nvcomp::nvcomp)
-target_link_libraries(libnvcomp_ll nvcomp::nvcomp)
+rapids_cython_create_modules(
+ CXX
+ SOURCE_FILES "${cython_modules}"
+ LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp
+)
diff --git a/python/kvikio/_lib/libkvikio.pyx b/python/kvikio/_lib/libkvikio.pyx
index 5be21bf6e9..aeeb2e3fc8 100644
--- a/python/kvikio/_lib/libkvikio.pyx
+++ b/python/kvikio/_lib/libkvikio.pyx
@@ -5,6 +5,7 @@
# cython: language_level=3
import pathlib
+from typing import Optional
from libc.stdint cimport uintptr_t
from libcpp.utility cimport move, pair
@@ -49,7 +50,7 @@ def memory_deregister(buf) -> None:
kvikio_cxx_api.memory_deregister(arr.ptr)
-def compat_mode() -> int:
+def compat_mode() -> bool:
return kvikio_cxx_api.compat_mode()
@@ -124,7 +125,7 @@ cdef class CuFile:
def open_flags(self) -> int:
return self._handle.fd_open_flags()
- def pread(self, buf, size: int, file_offset: int, task_size) -> IOFuture:
+ def pread(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture:
cdef pair[uintptr_t, size_t] info = _parse_buffer(buf, size, True)
return _wrap_io_future(
self._handle.pread(
@@ -135,7 +136,7 @@ cdef class CuFile:
)
)
- def pwrite(self, buf, size: int, file_offset: int, task_size) -> IOFuture:
+ def pwrite(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture:
cdef pair[uintptr_t, size_t] info = _parse_buffer(buf, size, True)
return _wrap_io_future(
self._handle.pwrite(
@@ -146,7 +147,7 @@ cdef class CuFile:
)
)
- def read(self, buf, size: int, file_offset: int, dev_offset: int) -> int:
+ def read(self, buf, size: Optional[int], file_offset: int, dev_offset: int) -> int:
cdef pair[uintptr_t, size_t] info = _parse_buffer(buf, size, False)
return self._handle.read(
info.first,
@@ -155,7 +156,7 @@ cdef class CuFile:
dev_offset,
)
- def write(self, buf, size: int, file_offset: int, dev_offset: int) -> int:
+ def write(self, buf, size: Optional[int], file_offset: int, dev_offset: int) -> int:
cdef pair[uintptr_t, size_t] info = _parse_buffer(buf, size, False)
return self._handle.write(
info.first,
diff --git a/python/kvikio/_lib/libnvcomp_ll.pyx b/python/kvikio/_lib/libnvcomp_ll.pyx
index 1794d29819..da6cc82bc0 100644
--- a/python/kvikio/_lib/libnvcomp_ll.pyx
+++ b/python/kvikio/_lib/libnvcomp_ll.pyx
@@ -1,12 +1,22 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# See file LICENSE for terms.
+from __future__ import annotations
+
from abc import ABC, abstractmethod
from enum import IntEnum
-from libc.stdint cimport uintptr_t
+from libc.stdint cimport uint32_t, uint64_t, uintptr_t
+
+from kvikio._lib.nvcomp_ll_cxx_api cimport (
+ cudaMemcpyKind,
+ cudaStream_t,
+ nvcompStatus_t,
+ nvcompType_t,
+)
-from kvikio._lib.nvcomp_ll_cxx_api cimport cudaStream_t, nvcompStatus_t, nvcompType_t
+import cupy
+from cupy.cuda.runtime import memcpyAsync
class nvCompStatus(IntEnum):
@@ -76,7 +86,7 @@ class nvCompBatchAlgorithm(ABC):
self,
size_t batch_size,
size_t max_uncompressed_chunk_bytes,
- ) -> (nvcompStatus_t, size_t):
+ ) -> tuple[nvcompStatus_t, size_t]:
"""Algorithm-specific implementation."""
...
@@ -123,9 +133,9 @@ class nvCompBatchAlgorithm(ABC):
Parameters
----------
- uncomp_chunks: cp.ndarray
+ uncomp_chunks: cp.ndarray[uintp]
The pointers on the GPU, to uncompressed batched items.
- uncomp_chunk_sizes: cp.ndarray
+ uncomp_chunk_sizes: cp.ndarray[uint64]
The size in bytes of each uncompressed batch item on the GPU.
max_uncomp_chunk_bytes: int
The maximum size in bytes of the largest chunk in the batch.
@@ -133,26 +143,34 @@ class nvCompBatchAlgorithm(ABC):
The number of chunks to compress.
temp_buf: cp.ndarray
The temporary GPU workspace.
- comp_chunks: cp.ndarray
- (output) The pointers on the GPU, to the output location for each
+ comp_chunks: np.ndarray[uintp]
+ (output) The list of pointers on the GPU, to the output location for each
compressed batch item.
- comp_chunk_sizes: cp.ndarray
- (output) The compressed size in bytes of each chunk on the GPU.
+ comp_chunk_sizes: np.ndarray[uint64]
+ (output) The compressed size in bytes of each chunk.
stream: cp.cuda.Stream
CUDA stream.
"""
+
+ # nvCOMP requires comp_chunks pointers container and
+ # comp_chunk_sizes to be in GPU memory.
+ comp_chunks_d = cupy.array(comp_chunks, dtype=cupy.uintp)
+ comp_chunk_sizes_d = cupy.empty_like(comp_chunk_sizes)
+
err = self._compress(
uncomp_chunks,
uncomp_chunk_sizes,
max_uncomp_chunk_bytes,
batch_size,
temp_buf,
- comp_chunks,
- comp_chunk_sizes,
+ comp_chunks_d,
+ comp_chunk_sizes_d,
stream,
)
if err != nvcompStatus_t.nvcompSuccess:
raise RuntimeError(f"Compression failed, error: {nvCompStatus(err)!r}.")
+ # Copy resulting compressed chunk sizes back to the host buffer.
+ comp_chunk_sizes[:] = comp_chunk_sizes_d.get()
@abstractmethod
def _compress(
@@ -210,6 +228,64 @@ class nvCompBatchAlgorithm(ABC):
"""Algorithm-specific implementation."""
...
+ def get_decompress_size(
+ self,
+ comp_chunks,
+ comp_chunk_sizes,
+ stream,
+ ):
+ """Get the amount of space required on the GPU for decompression.
+
+ Parameters
+ ----------
+ comp_chunks: np.ndarray[uintp]
+ The pointers on the GPU, to compressed batched items.
+ comp_chunk_sizes: np.ndarray[uint64]
+ The size in bytes of each compressed batch item.
+ stream: cp.cuda.Stream
+ CUDA stream.
+
+ Returns
+ -------
+ cp.ndarray[uint64]
+ The amount of GPU space in bytes that will be required
+ to decompress each chunk.
+ """
+
+ assert len(comp_chunks) == len(comp_chunk_sizes)
+ batch_size = len(comp_chunks)
+
+ # nvCOMP requires all buffers to be in GPU memory.
+ comp_chunks_d = cupy.array(comp_chunks, dtype=cupy.uintp)
+ comp_chunk_sizes_d = cupy.array(comp_chunk_sizes, dtype=cupy.uint64)
+ uncomp_chunk_sizes_d = cupy.empty_like(comp_chunk_sizes_d)
+
+ err = self._get_decomp_size(
+ comp_chunks_d,
+ comp_chunk_sizes_d,
+ batch_size,
+ uncomp_chunk_sizes_d,
+ stream,
+ )
+ if err != nvcompStatus_t.nvcompSuccess:
+ raise RuntimeError(
+ f"Could not get decompress buffer size, error: {nvCompStatus(err)!r}."
+ )
+
+ return uncomp_chunk_sizes_d
+
+ @abstractmethod
+ def _get_decomp_size(
+ self,
+ comp_chunks,
+ comp_chunk_sizes,
+ size_t batch_size,
+ uncomp_chunk_sizes,
+ stream,
+ ):
+ """Algorithm-specific implementation."""
+ ...
+
def decompress(
self,
comp_chunks,
@@ -226,29 +302,35 @@ class nvCompBatchAlgorithm(ABC):
Parameters
----------
- comp_chunks: cp.ndarray
+ comp_chunks: np.ndarray[uintp]
The pointers on the GPU, to compressed batched items.
- comp_chunk_sizes: cp.ndarray
- The size in bytes of each compressed batch item on the GPU.
+ comp_chunk_sizes: np.ndarray[uint64]
+ The size in bytes of each compressed batch item.
batch_size: int
The number of chunks to decompress.
temp_buf: cp.ndarray
The temporary GPU workspace.
- uncomp_chunks: cp.ndarray
+ uncomp_chunks: cp.ndarray[uintp]
(output) The pointers on the GPU, to the output location for each
decompressed batch item.
- uncomp_chunk_sizes: cp.ndarray
+ uncomp_chunk_sizes: cp.ndarray[uint64]
The size in bytes of each decompress chunk location on the GPU.
- actual_uncomp_chunk_sizes: cp.ndarray
+ actual_uncomp_chunk_sizes: cp.ndarray[uint64]
(output) The actual decompressed size in bytes of each chunk on the GPU.
statuses: cp.ndarray
(output) The status for each chunk of whether it was decompressed or not.
stream: cp.cuda.Stream
CUDA stream.
"""
+
+ # nvCOMP requires the comp_chunks pointer array and
+ # comp_chunk_sizes to be in GPU memory.
+ comp_chunks_d = cupy.array(comp_chunks, dtype=cupy.uintp)
+ comp_chunk_sizes_d = cupy.array(comp_chunk_sizes, dtype=cupy.uint64)
+
err = self._decompress(
- comp_chunks,
- comp_chunk_sizes,
+ comp_chunks_d,
+ comp_chunk_sizes_d,
batch_size,
temp_buf,
uncomp_chunks,
@@ -288,12 +370,14 @@ cdef cudaStream_t to_stream(stream):
#
# LZ4 algorithm.
#
+
from kvikio._lib.nvcomp_ll_cxx_api cimport (
nvcompBatchedLZ4CompressAsync,
nvcompBatchedLZ4CompressGetMaxOutputChunkSize,
nvcompBatchedLZ4CompressGetTempSize,
nvcompBatchedLZ4DecompressAsync,
nvcompBatchedLZ4DecompressGetTempSize,
+ nvcompBatchedLZ4GetDecompressSizeAsync,
nvcompBatchedLZ4Opts_t,
)
@@ -305,14 +389,29 @@ class nvCompBatchAlgorithmLZ4(nvCompBatchAlgorithm):
options: nvcompBatchedLZ4Opts_t
- def __init__(self, data_type: int = 0):
+ HEADER_SIZE_BYTES: size_t = sizeof(uint32_t)
+
+ def __init__(self, data_type: int = 0, has_header: bool = True):
+ """Initialize the codec.
+
+ Parameters
+ ----------
+ data_type: int
+ Source data type.
+ has_header: bool
+ Whether the compressed data has a header.
+ The header enables data compatibility between the numcodecs LZ4
+ codec, which requires the header, and the nvCOMP LZ4 codec, which
+ does not.
+ """
self.options = nvcompBatchedLZ4Opts_t(data_type)
+ self.has_header = has_header
def _get_comp_temp_size(
self,
size_t batch_size,
size_t max_uncompressed_chunk_bytes,
- ) -> (nvcompStatus_t, size_t):
+ ) -> tuple[nvcompStatus_t, size_t]:
cdef size_t temp_bytes = 0
err = nvcompBatchedLZ4CompressGetTempSize(
@@ -333,8 +432,61 @@ class nvCompBatchAlgorithmLZ4(nvCompBatchAlgorithm):
&max_compressed_bytes
)
+ # Add header size, if needed.
+ if err == nvcompStatus_t.nvcompSuccess and self.has_header:
+ max_compressed_bytes += self.HEADER_SIZE_BYTES
+
return (err, max_compressed_bytes)
+ def compress(
+ self,
+ uncomp_chunks,
+ uncomp_chunk_sizes,
+ size_t max_uncomp_chunk_bytes,
+ size_t batch_size,
+ temp_buf,
+ comp_chunks,
+ comp_chunk_sizes,
+ stream,
+ ):
+ if self.has_header:
+ # If there is a header, we need to:
+ # 1. Copy the uncompressed chunk size to the compressed chunk header.
+ # 2. Update target pointers in comp_chunks to skip the header portion,
+ # which is not compressed.
+ #
+ # Get the base pointer to the sizes array.
+ psize = to_ptr(uncomp_chunk_sizes)
+ for i in range(batch_size):
+ # Copy the original data size to the header.
+ memcpyAsync(
+ comp_chunks[i],
+ psize,
+ self.HEADER_SIZE_BYTES,
+ cudaMemcpyKind.cudaMemcpyDeviceToDevice,
+ stream.ptr
+ )
+ psize += sizeof(uint64_t)
+ # Update chunk pointer to skip the header.
+ comp_chunks[i] += self.HEADER_SIZE_BYTES
+
+ super().compress(
+ uncomp_chunks,
+ uncomp_chunk_sizes,
+ max_uncomp_chunk_bytes,
+ batch_size,
+ temp_buf,
+ comp_chunks,
+ comp_chunk_sizes,
+ stream,
+ )
+
+ if self.has_header:
+ for i in range(batch_size):
+ # Update chunk pointer and size to include the header.
+ comp_chunks[i] -= self.HEADER_SIZE_BYTES
+ comp_chunk_sizes[i] += self.HEADER_SIZE_BYTES
+
def _compress(
self,
uncomp_chunks,
@@ -376,6 +528,92 @@ class nvCompBatchAlgorithmLZ4(nvCompBatchAlgorithm):
return (err, temp_bytes)
+ def get_decompress_size(
+ self,
+ comp_chunks,
+ comp_chunk_sizes,
+ stream,
+ ):
+ if not self.has_header:
+ return super().get_decompress_size(
+ comp_chunks,
+ comp_chunk_sizes,
+ stream,
+ )
+
+ assert comp_chunks.shape == comp_chunk_sizes.shape
+ batch_size = len(comp_chunks)
+
+ # uncomp_chunk_sizes is a uint32 array to match the size type in the LZ4 header.
+ uncomp_chunk_sizes = cupy.empty(batch_size, dtype=cupy.uint32)
+
+ psize = to_ptr(uncomp_chunk_sizes)
+ for i in range(batch_size):
+ # Copy the uncompressed size from the chunk header.
+ memcpyAsync(
+ psize,
+ comp_chunks[i],
+ sizeof(uint32_t),
+ cudaMemcpyKind.cudaMemcpyDeviceToDevice,
+ stream.ptr
+ )
+ psize += sizeof(uint32_t)
+ stream.synchronize()
+
+ return uncomp_chunk_sizes.astype(cupy.uint64)
+
+ def _get_decomp_size(
+ self,
+ comp_chunks,
+ comp_chunk_sizes,
+ size_t batch_size,
+ uncomp_chunk_sizes,
+ stream,
+ ):
+ return nvcompBatchedLZ4GetDecompressSizeAsync(
+ to_ptr(comp_chunks),
+ to_ptr(comp_chunk_sizes),
+ to_ptr(uncomp_chunk_sizes),
+ batch_size,
+ to_stream(stream),
+ )
+
+ def decompress(
+ self,
+ comp_chunks,
+ comp_chunk_sizes,
+ size_t batch_size,
+ temp_buf,
+ uncomp_chunks,
+ uncomp_chunk_sizes,
+ actual_uncomp_chunk_sizes,
+ statuses,
+ stream,
+ ):
+ if self.has_header:
+ for i in range(batch_size):
+ # Update chunk pointer and size to exclude the header.
+ comp_chunks[i] += self.HEADER_SIZE_BYTES
+ comp_chunk_sizes[i] -= self.HEADER_SIZE_BYTES
+
+ super().decompress(
+ comp_chunks,
+ comp_chunk_sizes,
+ batch_size,
+ temp_buf,
+ uncomp_chunks,
+ uncomp_chunk_sizes,
+ actual_uncomp_chunk_sizes,
+ statuses,
+ stream,
+ )
+
+ if self.has_header:
+ for i in range(batch_size):
+ # Update chunk pointer and size to include the header.
+ comp_chunks[i] -= self.HEADER_SIZE_BYTES
+ comp_chunk_sizes[i] += self.HEADER_SIZE_BYTES
+
def _decompress(
self,
comp_chunks,
@@ -416,6 +654,7 @@ from kvikio._lib.nvcomp_ll_cxx_api cimport (
nvcompBatchedGdeflateCompressGetTempSize,
nvcompBatchedGdeflateDecompressAsync,
nvcompBatchedGdeflateDecompressGetTempSize,
+ nvcompBatchedGdeflateGetDecompressSizeAsync,
nvcompBatchedGdeflateOpts_t,
)
@@ -434,7 +673,7 @@ class nvCompBatchAlgorithmGdeflate(nvCompBatchAlgorithm):
self,
size_t batch_size,
size_t max_uncompressed_chunk_bytes,
- ) -> (nvcompStatus_t, size_t):
+ ) -> tuple[nvcompStatus_t, size_t]:
cdef size_t temp_bytes = 0
err = nvcompBatchedGdeflateCompressGetTempSize(
@@ -496,6 +735,22 @@ class nvCompBatchAlgorithmGdeflate(nvCompBatchAlgorithm):
return (err, temp_bytes)
+ def _get_decomp_size(
+ self,
+ comp_chunks,
+ comp_chunk_sizes,
+ size_t batch_size,
+ uncomp_chunk_sizes,
+ stream,
+ ):
+ return nvcompBatchedGdeflateGetDecompressSizeAsync(
+ to_ptr(comp_chunks),
+ to_ptr(comp_chunk_sizes),
+ to_ptr(uncomp_chunk_sizes),
+ batch_size,
+ to_stream(stream),
+ )
+
def _decompress(
self,
comp_chunks,
@@ -534,6 +789,7 @@ from kvikio._lib.nvcomp_ll_cxx_api cimport (
nvcompBatchedZstdCompressGetTempSize,
nvcompBatchedZstdDecompressAsync,
nvcompBatchedZstdDecompressGetTempSize,
+ nvcompBatchedZstdGetDecompressSizeAsync,
nvcompBatchedZstdOpts_t,
)
@@ -552,7 +808,7 @@ class nvCompBatchAlgorithmZstd(nvCompBatchAlgorithm):
self,
size_t batch_size,
size_t max_uncompressed_chunk_bytes,
- ) -> (nvcompStatus_t, size_t):
+ ) -> tuple[nvcompStatus_t, size_t]:
cdef size_t temp_bytes = 0
err = nvcompBatchedZstdCompressGetTempSize(
@@ -614,6 +870,22 @@ class nvCompBatchAlgorithmZstd(nvCompBatchAlgorithm):
return (err, temp_bytes)
+ def _get_decomp_size(
+ self,
+ comp_chunks,
+ comp_chunk_sizes,
+ size_t batch_size,
+ uncomp_chunk_sizes,
+ stream,
+ ):
+ return nvcompBatchedZstdGetDecompressSizeAsync(
+ to_ptr(comp_chunks),
+ to_ptr(comp_chunk_sizes),
+ to_ptr(uncomp_chunk_sizes),
+ batch_size,
+ to_stream(stream),
+ )
+
def _decompress(
self,
comp_chunks,
@@ -652,6 +924,7 @@ from kvikio._lib.nvcomp_ll_cxx_api cimport (
nvcompBatchedSnappyCompressGetTempSize,
nvcompBatchedSnappyDecompressAsync,
nvcompBatchedSnappyDecompressGetTempSize,
+ nvcompBatchedSnappyGetDecompressSizeAsync,
nvcompBatchedSnappyOpts_t,
)
@@ -670,7 +943,7 @@ class nvCompBatchAlgorithmSnappy(nvCompBatchAlgorithm):
self,
size_t batch_size,
size_t max_uncompressed_chunk_bytes,
- ) -> (nvcompStatus_t, size_t):
+ ) -> tuple[nvcompStatus_t, size_t]:
cdef size_t temp_bytes = 0
err = nvcompBatchedSnappyCompressGetTempSize(
@@ -732,6 +1005,22 @@ class nvCompBatchAlgorithmSnappy(nvCompBatchAlgorithm):
return (err, temp_bytes)
+ def _get_decomp_size(
+ self,
+ comp_chunks,
+ comp_chunk_sizes,
+ size_t batch_size,
+ uncomp_chunk_sizes,
+ stream,
+ ):
+ return nvcompBatchedSnappyGetDecompressSizeAsync(
+ to_ptr(comp_chunks),
+ to_ptr(comp_chunk_sizes),
+ to_ptr(uncomp_chunk_sizes),
+ batch_size,
+ to_stream(stream),
+ )
+
def _decompress(
self,
comp_chunks,
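For reference, the `has_header` handling in the LZ4 codec above assumes the same framing as the numcodecs LZ4 codec: a 4-byte little-endian uint32 holding the uncompressed size, followed by the raw LZ4 block. A minimal host-side sketch of that layout (illustrative only, not part of the patch; the helper name is made up):

import struct

def split_lz4_frame(chunk: bytes) -> tuple[int, bytes]:
    # The first 4 bytes (HEADER_SIZE_BYTES) store the uncompressed size.
    (uncomp_size,) = struct.unpack("<I", chunk[:4])
    return uncomp_size, chunk[4:]
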
diff --git a/python/kvikio/_lib/nvcomp_cxx_api.pxd b/python/kvikio/_lib/nvcomp_cxx_api.pxd
index 5f8389bc11..e5b464d5c2 100644
--- a/python/kvikio/_lib/nvcomp_cxx_api.pxd
+++ b/python/kvikio/_lib/nvcomp_cxx_api.pxd
@@ -73,8 +73,11 @@ cdef extern from "nvcomp/nvcompManager.hpp" namespace 'nvcomp':
size_t uncompressed_buffer_size) except +
nvcompStatus_t* get_status() const
CompressionConfig(CompressionConfig& other)
- CompressionConfig& operator=(CompressionConfig&& other) except +
CompressionConfig& operator=(const CompressionConfig& other) except +
+ # Commented out because Cython doesn't support rvalues, but a user can
+ # call `move` with the existing operator and generate correct C++ code.
+ # xref: https://github.com/cython/cython/issues/1445
+ # CompressionConfig& operator=(CompressionConfig&& other) except +
cdef cppclass DecompressionConfig "nvcomp::DecompressionConfig":
size_t decomp_data_size
@@ -82,8 +85,11 @@ cdef extern from "nvcomp/nvcompManager.hpp" namespace 'nvcomp':
DecompressionConfig(PinnedPtrPool[nvcompStatus_t]& pool) except +
nvcompStatus_t* get_status() const
DecompressionConfig(DecompressionConfig& other)
- DecompressionConfig& operator=(DecompressionConfig&& other) except +
DecompressionConfig& operator=(const DecompressionConfig& other) except +
+ # Commented out because Cython doesn't support rvalues, but a user can
+ # call `move` with the existing operator and generate correct C++ code.
+ # xref: https://github.com/cython/cython/issues/1445
+ # DecompressionConfig& operator=(DecompressionConfig&& other) except +
cdef cppclass nvcompManagerBase "nvcomp::nvcompManagerBase":
CompressionConfig configure_compression(
diff --git a/python/kvikio/_lib/nvcomp_ll_cxx_api.pxd b/python/kvikio/_lib/nvcomp_ll_cxx_api.pxd
index 2ceb3037d1..750eb542ea 100644
--- a/python/kvikio/_lib/nvcomp_ll_cxx_api.pxd
+++ b/python/kvikio/_lib/nvcomp_ll_cxx_api.pxd
@@ -7,6 +7,13 @@
cdef extern from "cuda_runtime.h":
ctypedef void* cudaStream_t
+ ctypedef enum cudaMemcpyKind:
+ cudaMemcpyHostToHost = 0,
+ cudaMemcpyHostToDevice = 1,
+ cudaMemcpyDeviceToHost = 2,
+ cudaMemcpyDeviceToDevice = 3,
+ cudaMemcpyDefault = 4
+
cdef extern from "nvcomp.h":
ctypedef enum nvcompType_t:
NVCOMP_TYPE_CHAR = 0, # 1B
@@ -74,6 +81,14 @@ cdef extern from "nvcomp/lz4.h" nogil:
size_t* temp_bytes
)
+ cdef nvcompStatus_t nvcompBatchedLZ4GetDecompressSizeAsync(
+ const void* const* device_compressed_ptrs,
+ const size_t* device_compressed_bytes,
+ size_t* device_uncompressed_bytes,
+ size_t batch_size,
+ cudaStream_t stream
+ )
+
nvcompStatus_t nvcompBatchedLZ4DecompressAsync(
const void* const* device_compressed_ptrs,
const size_t* device_compressed_bytes,
@@ -128,6 +143,14 @@ cdef extern from "nvcomp/gdeflate.h" nogil:
size_t* temp_bytes
)
+ nvcompStatus_t nvcompBatchedGdeflateGetDecompressSizeAsync(
+ const void* const* device_compressed_ptrs,
+ const size_t* device_compressed_bytes,
+ size_t* device_uncompressed_bytes,
+ size_t batch_size,
+ cudaStream_t stream
+ )
+
nvcompStatus_t nvcompBatchedGdeflateDecompressAsync(
const void* const* device_compressed_ptrs,
const size_t* device_compressed_bytes,
@@ -182,6 +205,14 @@ cdef extern from "nvcomp/zstd.h" nogil:
size_t* temp_bytes
)
+ nvcompStatus_t nvcompBatchedZstdGetDecompressSizeAsync(
+ const void* const* device_compressed_ptrs,
+ const size_t* device_compressed_bytes,
+ size_t* device_uncompressed_bytes,
+ size_t batch_size,
+ cudaStream_t stream
+ )
+
nvcompStatus_t nvcompBatchedZstdDecompressAsync(
const void* const* device_compressed_ptrs,
const size_t* device_compressed_bytes,
@@ -236,6 +267,14 @@ cdef extern from "nvcomp/snappy.h" nogil:
size_t* temp_bytes
)
+ nvcompStatus_t nvcompBatchedSnappyGetDecompressSizeAsync(
+ const void* const* device_compressed_ptrs,
+ const size_t* device_compressed_bytes,
+ size_t* device_uncompressed_bytes,
+ size_t batch_size,
+ cudaStream_t stream
+ )
+
nvcompStatus_t nvcompBatchedSnappyDecompressAsync(
const void* const* device_compressed_ptrs,
const size_t* device_compressed_bytes,
diff --git a/python/kvikio/nvcomp_codec.py b/python/kvikio/nvcomp_codec.py
index 8f3b73dd79..af36494bad 100644
--- a/python/kvikio/nvcomp_codec.py
+++ b/python/kvikio/nvcomp_codec.py
@@ -8,7 +8,7 @@
from numcodecs.abc import Codec
from numcodecs.compat import ensure_contiguous_ndarray_like
-import kvikio._lib.libnvcomp_ll as _ll
+from kvikio._lib.libnvcomp_ll import SUPPORTED_ALGORITHMS
class NvCompBatchCodec(Codec):
@@ -34,11 +34,11 @@ def __init__(
stream: Optional[cp.cuda.Stream] = None,
) -> None:
algo_id = algorithm.lower()
- algo_t = _ll.SUPPORTED_ALGORITHMS.get(algo_id, None)
+ algo_t = SUPPORTED_ALGORITHMS.get(algo_id, None)
if algo_t is None:
raise ValueError(
f"{algorithm} is not supported. "
- f"Must be one of: {list(_ll.SUPPORTED_ALGORITHMS.keys())}"
+ f"Must be one of: {list(SUPPORTED_ALGORITHMS.keys())}"
)
self.algorithm = algo_id
@@ -95,31 +95,18 @@ def encode_batch(self, bufs: List[Any]) -> List[Any]:
comp_chunk_size = self._algo.get_compress_chunk_size(max_chunk_size)
# Prepare data and size buffers.
- uncomp_chunks = cp.array(
- [b.data.ptr for b in bufs],
- dtype=cp.uint64,
- )
+ # uncomp_chunks is used as a container that stores pointers to actual chunks.
+ # nvCOMP requires this pointer array and the sizes buffer to be in GPU memory.
+ uncomp_chunks = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp)
uncomp_chunk_sizes = cp.array(buf_sizes, dtype=cp.uint64)
temp_buf = cp.empty(temp_size, dtype=cp.uint8)
- # Includes header with the original buffer size,
- # same as in numcodecs codec. This enables data compatibility between
- # numcodecs default codecs and this nvCOMP batch codec.
- # TODO(akamenev): probably should use contiguous buffer which stores all chunks?
- comp_chunks_header = [
- cp.empty(self.HEADER_SIZE_BYTES + comp_chunk_size, dtype=cp.uint8)
- for _ in range(num_chunks)
- ]
- # comp_chunks is used as a container that stores pointers to actual chunks.
- # nvCOMP requires this container to be in GPU memory.
- comp_chunks = cp.array(
- [c.data.ptr + self.HEADER_SIZE_BYTES for c in comp_chunks_header],
- dtype=cp.uint64,
- )
- # Similar to comp_chunks, comp_chunk_sizes is an array that contains
- # chunk sizes and is required by nvCOMP to be in GPU memory.
- comp_chunk_sizes = cp.empty(num_chunks, dtype=cp.uint64)
+ comp_chunks = cp.empty((num_chunks, comp_chunk_size), dtype=cp.uint8)
+ # Array of pointers to each compressed chunk.
+ comp_chunk_ptrs = np.array([c.data.ptr for c in comp_chunks], dtype=np.uintp)
+ # Resulting compressed chunk sizes.
+ comp_chunk_sizes = np.empty(num_chunks, dtype=np.uint64)
self._algo.compress(
uncomp_chunks,
@@ -127,22 +114,14 @@ def encode_batch(self, bufs: List[Any]) -> List[Any]:
max_chunk_size,
num_chunks,
temp_buf,
- comp_chunks,
+ comp_chunk_ptrs,
comp_chunk_sizes,
self._stream,
)
- # Write output buffers, each with the header.
res = []
for i in range(num_chunks):
- comp_chunk = comp_chunks_header[i]
- header = comp_chunk[:4].view(dtype=cp.uint32)
- header[:] = buf_sizes[i]
-
- res.append(
- comp_chunk[: self.HEADER_SIZE_BYTES + comp_chunk_sizes[0]].tobytes()
- )
-
+ res.append(comp_chunks[i, : comp_chunk_sizes[i]].tobytes())
return res
def decode(self, buf, out=None):
@@ -196,37 +175,36 @@ def decode_batch(
if is_host_buffer:
bufs = [cp.asarray(ensure_contiguous_ndarray_like(b)) for b in bufs]
- # Get uncompressed chunk sizes from the header.
- uncomp_chunk_sizes = [
- int(b[: self.HEADER_SIZE_BYTES].view(dtype=cp.uint32)[0]) for b in bufs
- ]
- max_chunk_size = max(uncomp_chunk_sizes)
+ # Prepare compressed chunks buffers.
+ comp_chunks = np.array([b.data.ptr for b in bufs], dtype=np.uintp)
+ comp_chunk_sizes = np.array([b.size for b in bufs], dtype=np.uint64)
+
+ # Get uncompressed chunk sizes.
+ uncomp_chunk_sizes = self._algo.get_decompress_size(
+ comp_chunks,
+ comp_chunk_sizes,
+ self._stream,
+ )
+ # Copy to host since we'll need it to properly allocate buffers.
+ uncomp_chunk_sizes_h = uncomp_chunk_sizes.get()
+
+ max_chunk_size = uncomp_chunk_sizes_h.max()
# Get temp buffer size.
temp_size = self._algo.get_decompress_temp_size(num_chunks, max_chunk_size)
- # Prepare compressed chunks buffers.
- comp_chunks = cp.array(
- [b.data.ptr + self.HEADER_SIZE_BYTES for b in bufs],
- dtype=cp.uint64,
- )
- comp_chunk_sizes = cp.array(
- [b.size - self.HEADER_SIZE_BYTES for b in bufs],
- dtype=cp.uint64,
- )
-
temp_buf = cp.empty(temp_size, dtype=cp.uint8)
# Prepare uncompressed chunks buffers.
# First, allocate chunks of appropriate sizes and then
# copy the pointers to a pointer array in GPU memory as required by nvCOMP.
# TODO(akamenev): probably can allocate single contiguous buffer.
- uncomp_chunks = [cp.empty(size, dtype=cp.uint8) for size in uncomp_chunk_sizes]
+ uncomp_chunks = [
+ cp.empty(size, dtype=cp.uint8) for size in uncomp_chunk_sizes_h
+ ]
uncomp_chunk_ptrs = cp.array(
- [c.data.ptr for c in uncomp_chunks], dtype=cp.uint64
+ [c.data.ptr for c in uncomp_chunks], dtype=cp.uintp
)
- # Sizes array must be in GPU memory.
- uncomp_chunk_sizes = cp.array(uncomp_chunk_sizes, dtype=cp.uint64)
# TODO(akamenev): currently we provide the following 2 buffers to decompress()
# but do not check/use them afterwards since some of the algos
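As context for the nvcomp_codec.py changes above, a minimal usage sketch of the batch codec through Zarr (mirroring the tests later in this patch; the array shape, chunking, and algorithm choice are illustrative):

import numpy as np
import zarr

from kvikio.nvcomp_codec import NvCompBatchCodec

codec = NvCompBatchCodec("lz4")  # GPU-backed LZ4 batch codec
data = np.random.randn(32, 32).astype(np.float32)
# Round-trips the data through the GPU codec, one chunk at a time.
z = zarr.array(data, chunks=(8, 8), compressor=codec)
np.testing.assert_array_equal(z[:], data)
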
diff --git a/python/kvikio/zarr.py b/python/kvikio/zarr.py
index 50a6756db8..1c030f96ae 100644
--- a/python/kvikio/zarr.py
+++ b/python/kvikio/zarr.py
@@ -1,13 +1,15 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
# See file LICENSE for terms.
+from __future__ import annotations
import contextlib
import os
import os.path
from abc import abstractmethod
-from typing import Any, Mapping, Sequence
+from typing import Any, Literal, Mapping, Optional, Sequence, Union
import cupy
+import numcodecs
import numpy
import numpy as np
import zarr
@@ -20,6 +22,9 @@
import kvikio
import kvikio.nvcomp
+import kvikio.nvcomp_codec
+import kvikio.zarr
+from kvikio.nvcomp_codec import NvCompBatchCodec
MINIMUM_ZARR_VERSION = "2.15"
@@ -37,22 +42,60 @@ class GDSStore(zarr.storage.DirectoryStore):
It uses KvikIO for reads and writes, which in turn will use GDS
when applicable.
+ Parameters
+ ----------
+ path : string
+ Location of directory to use as the root of the storage hierarchy.
+ normalize_keys : bool, optional
+ If True, all store keys will be normalized to use lower case characters
+ (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be
+ useful to avoid potential discrepancies between case-sensitive and
+ case-insensitive file systems. Default value is False.
+ dimension_separator : {'.', '/'}, optional
+ Separator placed between the dimensions of a chunk.
+ compressor_config_overwrite
+ If not None, use this `Mapping` to specify what is written to the Zarr metadata
+ file on disk (`.zarray`). Normally, Zarr writes the configuration[1] given by
+ the `compressor` argument to the `.zarray` file. Use this argument to overwrite
+ the normal configuration and use the specified `Mapping` instead.
+ decompressor_config_overwrite
+ If not None, use this `Mapping` to specify what compressor configuration[1] is
+ used for decompressing, regardless of the configuration found in the Zarr metadata
+ on disk (the `.zarray` file).
+
+ [1] https://github.com/zarr-developers/numcodecs/blob/cb155432/numcodecs/abc.py#L79
+
Notes
-----
- GDSStore doesn't implement `_fromfile()` thus non-array data such as
- meta data is always read into host memory.
- This is because only zarr.Array use getitems() to retrieve data.
+ Atomic writes are used, which means that data are first written to a
+ temporary file, then moved into place when the write is successfully
+ completed. Files are only held open while they are being read or written and are
+ closed immediately afterwards, so there is no need to manually close any files.
+
+ Safe to write in multiple threads or processes.
"""
# The default output array type used by getitems().
default_meta_array = numpy.empty(())
- def __init__(self, *args, **kwargs) -> None:
+ def __init__(
+ self,
+ path,
+ normalize_keys=False,
+ dimension_separator=None,
+ *,
+ compressor_config_overwrite: Optional[Mapping] = None,
+ decompressor_config_overwrite: Optional[Mapping] = None,
+ ) -> None:
if not kvikio.zarr.supported:
raise RuntimeError(
f"GDSStore requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}"
)
- super().__init__(*args, **kwargs)
+ super().__init__(
+ path, normalize_keys=normalize_keys, dimension_separator=dimension_separator
+ )
+ self.compressor_config_overwrite = compressor_config_overwrite
+ self.decompressor_config_overwrite = decompressor_config_overwrite
def __eq__(self, other):
return isinstance(other, GDSStore) and self.path == other.path
@@ -62,6 +105,23 @@ def _tofile(self, a, fn):
written = f.write(a)
assert written == a.nbytes
+ def __getitem__(self, key):
+ ret = super().__getitem__(key)
+ if self.decompressor_config_overwrite and key == ".zarray":
+ meta = self._metadata_class.decode_array_metadata(ret)
+ if meta["compressor"]:
+ meta["compressor"] = self.decompressor_config_overwrite
+ ret = self._metadata_class.encode_array_metadata(meta)
+ return ret
+
+ def __setitem__(self, key, value):
+ if self.compressor_config_overwrite and key == ".zarray":
+ meta = self._metadata_class.decode_array_metadata(value)
+ if meta["compressor"]:
+ meta["compressor"] = self.compressor_config_overwrite
+ value = self._metadata_class.encode_array_metadata(meta)
+ super().__setitem__(key, value)
+
def getitems(
self,
keys: Sequence[str],
@@ -237,3 +297,107 @@ def get_nvcomp_manager(self):
nvcomp_compressors = [ANS, Bitcomp, Cascaded, Gdeflate, LZ4, Snappy]
for c in nvcomp_compressors:
register_codec(c)
+
+
+class CompatCompressor:
+ """A pair of compatible compressors one using the CPU and one using the GPU"""
+
+ def __init__(self, cpu: Codec, gpu: Codec) -> None:
+ self.cpu = cpu
+ self.gpu = gpu
+
+ @classmethod
+ def lz4(cls) -> CompatCompressor:
+ """A compatible pair of LZ4 compressors"""
+ return cls(cpu=numcodecs.LZ4(), gpu=NvCompBatchCodec("lz4"))
+
+
+def open_cupy_array(
+ store: Union[os.PathLike, str],
+ mode: Literal["r", "r+", "a", "w", "w-"] = "a",
+ compressor: Codec | CompatCompressor = Snappy(device_ordinal=0),
+ meta_array=cupy.empty(()),
+ **kwargs,
+) -> zarr.Array:
+ """Open an Zarr array as a CuPy-like array using file-mode-like semantics.
+
+ This function is a CUDA friendly version of `zarr.open_array` that reads
+ and writes to CuPy arrays. Besides the arguments listed below, the arguments
+ have the same semantics as in `zarr.open_array`.
+
+ Parameters
+ ----------
+ store
+ Path to a directory in the file system. As opposed to `zarr.open_array`,
+ Store objects and paths to zip files aren't supported.
+ mode
+ Persistence mode: 'r' means read only (must exist); 'r+' means
+ read/write (must exist); 'a' means read/write (create if doesn't
+ exist); 'w' means create (overwrite if exists); 'w-' means create
+ (fail if exists).
+ compressor
+ The compressor used when creating a Zarr file or None if no compressor
+ is to be used. If a `CompatCompressor` is given, `CompatCompressor.gpu`
+ is used for compression and decompression, while `CompatCompressor.cpu`
+ is written as the compressor in the Zarr file metadata on disk.
+ This argument is ignored in "r" and "r+" mode. By default, the
+ nvCOMP Snappy compressor is used.
+ meta_array : array-like, optional
+ A CuPy-like array instance to use for determining arrays to create and
+ return to users. It must implement `__cuda_array_interface__`.
+ **kwargs
+ The rest of the arguments are forwarded to `zarr.open_array` as-is.
+
+ Returns
+ -------
+ Zarr array backed by a GDS file store, nvCOMP compression, and CuPy arrays.
+ """
+
+ if not isinstance(store, (str, os.PathLike)):
+ raise ValueError("store must be a path")
+ store = str(os.fspath(store))
+ if not hasattr(meta_array, "__cuda_array_interface__"):
+ raise ValueError("meta_array must implement __cuda_array_interface__")
+
+ if mode in ("r", "r+"):
+ ret = zarr.open_array(
+ store=kvikio.zarr.GDSStore(path=store),
+ mode=mode,
+ meta_array=meta_array,
+ **kwargs,
+ )
+ # If we are reading an LZ4-CPU compressed file, we overwrite the metadata
+ # on-the-fly to make Zarr use LZ4-GPU for both compression and decompression.
+ compat_lz4 = CompatCompressor.lz4()
+ if ret.compressor == compat_lz4.cpu:
+ ret = zarr.open_array(
+ store=kvikio.zarr.GDSStore(
+ path=store,
+ compressor_config_overwrite=compat_lz4.cpu.get_config(),
+ decompressor_config_overwrite=compat_lz4.gpu.get_config(),
+ ),
+ mode=mode,
+ meta_array=meta_array,
+ **kwargs,
+ )
+ return ret
+
+ if isinstance(compressor, CompatCompressor):
+ compressor_config_overwrite = compressor.cpu.get_config()
+ decompressor_config_overwrite = compressor.gpu.get_config()
+ compressor = compressor.gpu
+ else:
+ compressor_config_overwrite = None
+ decompressor_config_overwrite = None
+
+ return zarr.open_array(
+ store=kvikio.zarr.GDSStore(
+ path=store,
+ compressor_config_overwrite=compressor_config_overwrite,
+ decompressor_config_overwrite=decompressor_config_overwrite,
+ ),
+ mode=mode,
+ meta_array=meta_array,
+ compressor=compressor,
+ **kwargs,
+ )
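A minimal end-to-end sketch of `open_cupy_array` with `CompatCompressor.lz4()`, mirroring `test_open_cupy_array` later in this patch (the path is illustrative): data is compressed with the GPU codec while the CPU-compatible LZ4 config is written to the `.zarray` metadata, so plain Zarr can read the file back on the CPU.

import cupy
import zarr

import kvikio.zarr

z = kvikio.zarr.open_cupy_array(
    "/tmp/example.zarr",  # illustrative path
    mode="w",
    shape=(10,),
    dtype="int64",
    chunks=(2,),
    compressor=kvikio.zarr.CompatCompressor.lz4(),
)
z[:] = cupy.arange(10)

# The same file opens with vanilla Zarr on the CPU because the on-disk
# compressor config was overwritten with the numcodecs LZ4 config.
z_cpu = zarr.open_array("/tmp/example.zarr", mode="r")
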
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 41d5d64947..0be1e70992 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -5,7 +5,7 @@
build-backend = "setuptools.build_meta"
requires = [
"cmake>=3.26.4",
- "cython>=0.29,<0.30",
+ "cython>=3.0.0",
"ninja",
"scikit-build>=0.13.1",
"setuptools",
@@ -14,7 +14,7 @@ requires = [
[project]
name = "kvikio"
-version = "23.08.00"
+version = "23.10.00"
description = "KvikIO - GPUDirect Storage"
readme = { file = "README.md", content-type = "text/markdown" }
authors = [
@@ -24,6 +24,7 @@ license = { text = "Apache 2.0" }
requires-python = ">=3.9"
dependencies = [
"cupy-cuda11x>=12.0.0",
+ "numcodecs <0.12.0",
"numpy>=1.21",
"packaging",
"zarr",
diff --git a/python/tests/test_examples.py b/python/tests/test_examples.py
index 3343216c6f..e9e1f83d08 100644
--- a/python/tests/test_examples.py
+++ b/python/tests/test_examples.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
# See file LICENSE for terms.
import os
@@ -16,3 +16,13 @@ def test_hello_world(tmp_path, monkeypatch):
monkeypatch.syspath_prepend(str(examples_path))
import_module("hello_world").main(tmp_path / "test-file")
+
+
+def test_zarr_cupy_nvcomp(tmp_path, monkeypatch):
+ """Test examples/zarr_cupy_nvcomp.py"""
+
+ # `examples/zarr_cupy_nvcomp.py` requires the Zarr submodule
+ pytest.importorskip("kvikio.zarr")
+
+ monkeypatch.syspath_prepend(str(examples_path))
+ import_module("zarr_cupy_nvcomp").main(tmp_path / "test-file")
diff --git a/python/tests/test_nvcomp_codec.py b/python/tests/test_nvcomp_codec.py
index 99a90834c0..e4ce969c7a 100644
--- a/python/tests/test_nvcomp_codec.py
+++ b/python/tests/test_nvcomp_codec.py
@@ -22,12 +22,12 @@
SUPPORTED_CODECS = [LZ4_ALGO, GDEFLATE_ALGO, SNAPPY_ALGO, ZSTD_ALGO]
-def _get_codec(algo: str):
- codec_args = {"id": NVCOMP_CODEC_ID, "algorithm": algo}
+def _get_codec(algo: str, **kwargs):
+ codec_args = {"id": NVCOMP_CODEC_ID, "algorithm": algo, "options": kwargs}
return numcodecs.registry.get_codec(codec_args)
-@pytest.fixture(params=[(16,), (8, 16), (16, 16)])
+@pytest.fixture(params=[(32,), (8, 16), (16, 16)])
def shape(request):
return request.param
@@ -36,7 +36,7 @@ def shape(request):
# chunks array must have the same rank as data array.
@pytest.fixture(
params=it.chain(
- it.product([(32,)], [(16,), (32,), (40,)]),
+ it.product([(64,)], [(64,), (100,)]),
it.product([(16, 8), (16, 16)], [(8, 16), (16, 16), (40, 12)]),
)
)
@@ -156,9 +156,16 @@ def test_codec_invalid_options():
zarr.array(data, compressor=codec)
-def test_lz4_cpu_comp_gpu_decomp():
- cpu_codec = numcodecs.registry.get_codec({"id": "lz4"})
- gpu_codec = _get_codec(LZ4_ALGO)
+@pytest.mark.parametrize(
+ "cpu_algo, gpu_algo",
+ [
+ ("lz4", LZ4_ALGO),
+ ("zstd", ZSTD_ALGO),
+ ],
+)
+def test_cpu_comp_gpu_decomp(cpu_algo, gpu_algo):
+ cpu_codec = numcodecs.registry.get_codec({"id": cpu_algo})
+ gpu_codec = _get_codec(gpu_algo)
shape = (16, 16)
chunks = (8, 8)
@@ -170,9 +177,9 @@ def test_lz4_cpu_comp_gpu_decomp():
zarr.save_array(store, z1, compressor=cpu_codec)
meta = json.loads(store[".zarray"])
- assert meta["compressor"]["id"] == "lz4"
+ assert meta["compressor"]["id"] == cpu_algo
- meta["compressor"] = {"id": NVCOMP_CODEC_ID, "algorithm": LZ4_ALGO}
+ meta["compressor"] = {"id": NVCOMP_CODEC_ID, "algorithm": gpu_algo}
store[".zarray"] = json.dumps(meta).encode()
z2 = zarr.open_array(store, compressor=gpu_codec)
@@ -180,6 +187,24 @@ def test_lz4_cpu_comp_gpu_decomp():
assert_equal(z1[:], z2[:])
+def test_lz4_codec_header(shape_chunks):
+ shape, chunks = shape_chunks
+
+ # Test LZ4 nvCOMP codecs with and without the header.
+ codec_h = _get_codec(LZ4_ALGO, has_header=True)
+ codec_no_h = _get_codec(LZ4_ALGO, has_header=False)
+
+ np.random.seed(1)
+
+ data = np.random.randn(*shape).astype(np.float32)
+
+ z_h = zarr.array(data, chunks=chunks, compressor=codec_h)
+ z_no_h = zarr.array(data, chunks=chunks, compressor=codec_no_h)
+
+ # Result must be the same regardless of the header presence.
+ assert_equal(z_h[:], z_no_h[:])
+
+
def test_empty_batch():
codec = _get_codec(LZ4_ALGO)
diff --git a/python/tests/test_zarr.py b/python/tests/test_zarr.py
index 296c5f1ee6..f909559eea 100644
--- a/python/tests/test_zarr.py
+++ b/python/tests/test_zarr.py
@@ -4,12 +4,14 @@
import math
+import numpy
import pytest
cupy = pytest.importorskip("cupy")
zarr = pytest.importorskip("zarr")
kvikio_zarr = pytest.importorskip("kvikio.zarr")
-
+kvikio_nvcomp_codec = pytest.importorskip("kvikio.nvcomp_codec")
+numcodecs = pytest.importorskip("numcodecs")
if not kvikio_zarr.supported:
pytest.skip(
@@ -156,3 +158,90 @@ def test_compressor(store, xp_write, xp_read, compressor):
b = z[:]
assert isinstance(b, xp_read.ndarray)
cupy.testing.assert_array_equal(b, a)
+
+
+@pytest.mark.parametrize("algo", ["lz4", "zstd"])
+def test_decompressor_config_overwrite(tmp_path, xp, algo):
+ cpu_codec = numcodecs.registry.get_codec({"id": algo})
+ gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo)
+
+ # Write using Zarr's default file store and the `cpu_codec` compressor
+ z = zarr.open_array(tmp_path, mode="w", shape=(10,), compressor=cpu_codec)
+ z[:] = range(10)
+ assert z.compressor == cpu_codec
+
+ # Open file using GDSStore and use `gpu_codec` as decompressor.
+ z = zarr.open_array(
+ kvikio_zarr.GDSStore(
+ tmp_path,
+ decompressor_config_overwrite=gpu_codec.get_config(),
+ ),
+ mode="r",
+ meta_array=xp.empty(()),
+ )
+ assert z.compressor == gpu_codec
+ assert isinstance(z[:], xp.ndarray)
+ xp.testing.assert_array_equal(z[:], range(10))
+
+
+@pytest.mark.parametrize("algo", ["lz4"])
+def test_compressor_config_overwrite(tmp_path, xp, algo):
+ cpu_codec = numcodecs.registry.get_codec({"id": algo})
+ gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo)
+
+ # Write file using GDSStore and the `gpu_codec` compressor. In order
+ # to make the file compatible with Zarr's builtin CPU decompressor,
+ # we set `cpu_codec` as the compressor in the meta file on disk.
+ z = zarr.open_array(
+ kvikio_zarr.GDSStore(
+ tmp_path,
+ compressor_config_overwrite=cpu_codec.get_config(),
+ decompressor_config_overwrite=gpu_codec.get_config(),
+ ),
+ mode="w",
+ shape=10,
+ compressor=gpu_codec,
+ meta_array=xp.empty(()),
+ )
+ assert z.compressor == gpu_codec
+ z[:] = xp.arange(10)
+
+ # We can now open the file using Zarr's builtin CPU decompressor
+ z = zarr.open_array(tmp_path, mode="r")
+ assert isinstance(z[:], numpy.ndarray)
+ numpy.testing.assert_array_equal(z[:], range(10))
+
+
+def test_open_cupy_array(tmp_path):
+ a = cupy.arange(10)
+ z = kvikio_zarr.open_cupy_array(
+ tmp_path,
+ mode="w",
+ shape=a.shape,
+ dtype=a.dtype,
+ chunks=(2,),
+ compressor=kvikio_zarr.CompatCompressor.lz4(),
+ )
+ z[:] = a
+ assert a.shape == z.shape
+ assert a.dtype == z.dtype
+ assert isinstance(z[:], type(a))
+ assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4")
+ cupy.testing.assert_array_equal(a, z[:])
+
+ z = kvikio_zarr.open_cupy_array(
+ tmp_path,
+ mode="r",
+ )
+ assert a.shape == z.shape
+ assert a.dtype == z.dtype
+ assert isinstance(z[:], type(a))
+ assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4")
+ cupy.testing.assert_array_equal(a, z[:])
+
+ z = zarr.open_array(tmp_path, mode="r")
+ assert a.shape == z.shape
+ assert a.dtype == z.dtype
+ assert isinstance(z[:], numpy.ndarray)
+ assert z.compressor == kvikio_zarr.CompatCompressor.lz4().cpu
+ numpy.testing.assert_array_equal(a.get(), z[:])