diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.8-conda/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-conda/devcontainer.json rename to .devcontainer/cuda12.8-conda/devcontainer.json index 3ed6fa9c37b..ad36130c6a0 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.8-conda/devcontainer.json @@ -3,7 +3,7 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.5", + "CUDA": "12.8", "PYTHON_PACKAGE_MANAGER": "conda", "BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04" } @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cugraph,type=bind,consistency=consistent", @@ -29,7 +29,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.8-pip/devcontainer.json similarity index 88% rename from .devcontainer/cuda12.5-pip/devcontainer.json rename to .devcontainer/cuda12.8-pip/devcontainer.json index fe402024e29..a2955b81a60 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.8-pip/devcontainer.json @@ -3,20 +3,20 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.5", + "CUDA": "12.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.5-ucx1.18.0-openmpi-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.8-ucx1.18.0-openmpi-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": { - "version": "12.5", + "version": "12.8", 
"installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, @@ -28,7 +28,7 @@ "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cugraph,type=bind,consistency=consistent", @@ -36,7 +36,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e48f2e11acd..8e3134b896e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -195,7 +195,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02 with: arch: '["amd64"]' - cuda: '["12.5"]' + cuda: '["12.8"]' node_type: cpu32 build_command: | sccache -z; diff --git 
a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml similarity index 96% rename from conda/environments/all_cuda-125_arch-x86_64.yaml rename to conda/environments/all_cuda-128_arch-x86_64.yaml index 83126df9194..22c5f594a42 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.6.2,<13.0a0 -- cuda-version=12.5 +- cuda-version=12.8 - cudf==25.2.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler @@ -77,4 +77,4 @@ dependencies: - torchmetrics - ucx-py==0.42.*,>=0.0.0a0 - wheel -name: all_cuda-125_arch-x86_64 +name: all_cuda-128_arch-x86_64 diff --git a/cpp/include/cugraph/detail/decompress_edge_partition.cuh b/cpp/include/cugraph/detail/decompress_edge_partition.cuh index c94f456f215..b31624da840 100644 --- a/cpp/include/cugraph/detail/decompress_edge_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_edge_partition.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,7 +29,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 628c3cc10cc..b887309bf6f 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,10 +26,10 @@ #include #include +#include #include #include #include -#include #include #include #include @@ -43,7 +43,7 @@ namespace cugraph { namespace detail { template -__device__ thrust::optional major_hypersparse_idx_from_major_nocheck_impl( +__device__ cuda::std::optional major_hypersparse_idx_from_major_nocheck_impl( raft::device_span dcs_nzd_vertices, vertex_t major) { // we can avoid binary search (and potentially improve performance) if we add an auxiliary array @@ -51,10 +51,10 @@ __device__ thrust::optional major_hypersparse_idx_from_major_nocheck_i auto it = thrust::lower_bound(thrust::seq, dcs_nzd_vertices.begin(), dcs_nzd_vertices.end(), major); return it != dcs_nzd_vertices.end() - ? (*it == major ? thrust::optional{static_cast( + ? (*it == major ? cuda::std::optional{static_cast( thrust::distance(dcs_nzd_vertices.begin(), it))} - : thrust::nullopt) - : thrust::nullopt; + : cuda::std::nullopt) + : cuda::std::nullopt; } template @@ -490,7 +490,7 @@ class edge_partition_device_view_t major_hypersparse_first() const noexcept + __host__ __device__ cuda::std::optional major_hypersparse_first() const noexcept { return major_hypersparse_first_; } @@ -528,15 +528,16 @@ class edge_partition_device_view_t major_idx_from_major_nocheck(vertex_t major) const noexcept + __device__ cuda::std::optional major_idx_from_major_nocheck( + vertex_t major) const noexcept { if (major_hypersparse_first_ && (major >= *major_hypersparse_first_)) { auto major_hypersparse_idx = detail::major_hypersparse_idx_from_major_nocheck_impl(*dcs_nzd_vertices_, major); return major_hypersparse_idx - ? thrust::make_optional((*major_hypersparse_first_ - major_range_first_) + - *major_hypersparse_idx) - : thrust::nullopt; + ? 
cuda::std::make_optional((*major_hypersparse_first_ - major_range_first_) + + *major_hypersparse_idx) + : cuda::std::nullopt; } else { return major - major_range_first_; } @@ -554,23 +555,23 @@ class edge_partition_device_view_t major_hypersparse_idx_from_major_nocheck( + __device__ cuda::std::optional major_hypersparse_idx_from_major_nocheck( vertex_t major) const noexcept { if (dcs_nzd_vertices_) { return detail::major_hypersparse_idx_from_major_nocheck_impl(*dcs_nzd_vertices_, major); } else { - return thrust::nullopt; + return cuda::std::nullopt; } } // major_hypersparse_idx: index within the hypersparse segment - __device__ thrust::optional major_from_major_hypersparse_idx_nocheck( + __device__ cuda::std::optional major_from_major_hypersparse_idx_nocheck( vertex_t major_hypersparse_idx) const noexcept { return dcs_nzd_vertices_ - ? thrust::optional{(*dcs_nzd_vertices_)[major_hypersparse_idx]} - : thrust::nullopt; + ? cuda::std::optional{(*dcs_nzd_vertices_)[major_hypersparse_idx]} + : cuda::std::nullopt; } __host__ __device__ vertex_t minor_from_minor_offset_nocheck(vertex_t minor_offset) const noexcept @@ -578,36 +579,36 @@ class edge_partition_device_view_t> for consistency (see - // dcs_nzd_range_bitmap()) - __host__ __device__ thrust::optional dcs_nzd_vertices() const + // FIXME: better return cuda::std::optional> for consistency + // (see dcs_nzd_range_bitmap()) + __host__ __device__ cuda::std::optional dcs_nzd_vertices() const { - return dcs_nzd_vertices_ ? thrust::optional{(*dcs_nzd_vertices_).data()} - : thrust::nullopt; + return dcs_nzd_vertices_ ? cuda::std::optional{(*dcs_nzd_vertices_).data()} + : cuda::std::nullopt; } - __host__ __device__ thrust::optional dcs_nzd_vertex_count() const + __host__ __device__ cuda::std::optional dcs_nzd_vertex_count() const { return dcs_nzd_vertices_ - ? thrust::optional{static_cast((*dcs_nzd_vertices_).size())} - : thrust::nullopt; + ? 
cuda::std::optional{static_cast((*dcs_nzd_vertices_).size())} + : cuda::std::nullopt; } - __host__ __device__ thrust::optional> dcs_nzd_range_bitmap() + __host__ __device__ cuda::std::optional> dcs_nzd_range_bitmap() const { return dcs_nzd_range_bitmap_ - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*dcs_nzd_range_bitmap_).data(), (*dcs_nzd_range_bitmap_).size()) - : thrust::nullopt; + : cuda::std::nullopt; } private: // should be trivially copyable to device - thrust::optional> dcs_nzd_vertices_{thrust::nullopt}; - thrust::optional> dcs_nzd_range_bitmap_{thrust::nullopt}; - thrust::optional major_hypersparse_first_{thrust::nullopt}; + cuda::std::optional> dcs_nzd_vertices_{cuda::std::nullopt}; + cuda::std::optional> dcs_nzd_range_bitmap_{cuda::std::nullopt}; + cuda::std::optional major_hypersparse_first_{cuda::std::nullopt}; vertex_t major_range_first_{0}; vertex_t major_range_last_{0}; @@ -790,10 +791,10 @@ class edge_partition_device_view_t major_hypersparse_first() const noexcept + __host__ __device__ cuda::std::optional major_hypersparse_first() const noexcept { assert(false); - return thrust::nullopt; + return cuda::std::nullopt; } __host__ __device__ constexpr vertex_t major_range_first() const noexcept { return vertex_t{0}; } @@ -823,7 +824,8 @@ class edge_partition_device_view_t major_idx_from_major_nocheck(vertex_t major) const noexcept + __device__ cuda::std::optional major_idx_from_major_nocheck( + vertex_t major) const noexcept { return major_offset_from_major_nocheck(major); } @@ -834,19 +836,19 @@ class edge_partition_device_view_t major_hypersparse_idx_from_major_nocheck( + __device__ cuda::std::optional major_hypersparse_idx_from_major_nocheck( vertex_t major) const noexcept { assert(false); - return thrust::nullopt; + return cuda::std::nullopt; } // major_hypersparse_idx: index within the hypersparse segment - __device__ thrust::optional major_from_major_hypersparse_idx_nocheck( + __device__ cuda::std::optional 
major_from_major_hypersparse_idx_nocheck( vertex_t major_hypersparse_idx) const noexcept { assert(false); - return thrust::nullopt; + return cuda::std::nullopt; } __host__ __device__ vertex_t minor_from_minor_offset_nocheck(vertex_t minor_offset) const noexcept @@ -854,14 +856,14 @@ class edge_partition_device_view_t dcs_nzd_vertices() const + __host__ __device__ cuda::std::optional dcs_nzd_vertices() const { - return thrust::nullopt; + return cuda::std::nullopt; } - __host__ __device__ thrust::optional dcs_nzd_vertex_count() const + __host__ __device__ cuda::std::optional dcs_nzd_vertex_count() const { - return thrust::nullopt; + return cuda::std::nullopt; } private: diff --git a/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh b/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh index 4b324bcf348..061c4108f4d 100644 --- a/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh @@ -21,8 +21,8 @@ #include #include +#include #include -#include namespace cugraph { @@ -182,7 +182,7 @@ template class edge_partition_edge_dummy_property_device_view_t { public: using edge_type = edge_t; - using value_type = thrust::nullopt_t; + using value_type = cuda::std::nullopt_t; static constexpr bool is_packed_bool = false; static constexpr bool has_packed_bool_element = false; @@ -194,7 +194,7 @@ class edge_partition_edge_dummy_property_device_view_t { { } - __device__ auto get(edge_t offset) const { return thrust::nullopt; } + __device__ auto get(edge_t offset) const { return cuda::std::nullopt; } }; } // namespace detail diff --git a/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh b/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh index f86675e5572..e5600848db6 100644 --- a/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh @@ -23,12 +23,12 
@@ #include +#include #include #include #include #include #include -#include namespace cugraph { @@ -184,9 +184,10 @@ class edge_partition_endpoint_property_device_view_t { } private: - thrust::optional> keys_{thrust::nullopt}; - thrust::optional> key_chunk_start_offsets_{thrust::nullopt}; - thrust::optional key_chunk_size_{thrust::nullopt}; + cuda::std::optional> keys_{cuda::std::nullopt}; + cuda::std::optional> key_chunk_start_offsets_{ + cuda::std::nullopt}; + cuda::std::optional key_chunk_size_{cuda::std::nullopt}; ValueIterator value_first_{}; vertex_t range_first_{}; @@ -214,7 +215,7 @@ template class edge_partition_endpoint_dummy_property_device_view_t { public: using vertex_type = vertex_t; - using value_type = thrust::nullopt_t; + using value_type = cuda::std::nullopt_t; static constexpr bool is_packed_bool = false; static constexpr bool has_packed_bool_element = false; @@ -230,7 +231,7 @@ class edge_partition_endpoint_dummy_property_device_view_t { { } - __device__ auto get(vertex_t offset) const { return thrust::nullopt; } + __device__ auto get(vertex_t offset) const { return cuda::std::nullopt; } }; } // namespace detail diff --git a/cpp/include/cugraph/edge_property.hpp b/cpp/include/cugraph/edge_property.hpp index d46d4e52fd4..11041f504f3 100644 --- a/cpp/include/cugraph/edge_property.hpp +++ b/cpp/include/cugraph/edge_property.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,8 +22,8 @@ #include +#include #include -#include #include #include @@ -63,7 +63,7 @@ class edge_property_view_t { class edge_dummy_property_view_t { public: - using value_type = thrust::nullopt_t; + using value_type = cuda::std::nullopt_t; using value_iterator = void*; }; @@ -155,7 +155,7 @@ class edge_property_t { class edge_dummy_property_t { public: - using value_type = thrust::nullopt_t; + using value_type = cuda::std::nullopt_t; auto view() const { return edge_dummy_property_view_t{}; } }; diff --git a/cpp/include/cugraph/edge_src_dst_property.hpp b/cpp/include/cugraph/edge_src_dst_property.hpp index d27f6856428..f7096ce32fa 100644 --- a/cpp/include/cugraph/edge_src_dst_property.hpp +++ b/cpp/include/cugraph/edge_src_dst_property.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,9 +24,9 @@ #include #include +#include #include #include -#include #include #include @@ -365,7 +365,7 @@ class edge_minor_property_t { class edge_endpoint_dummy_property_view_t { public: - using value_type = thrust::nullopt_t; + using value_type = cuda::std::nullopt_t; using value_iterator = void*; }; @@ -557,14 +557,14 @@ class edge_dst_property_t { class edge_src_dummy_property_t { public: - using value_type = thrust::nullopt_t; + using value_type = cuda::std::nullopt_t; auto view() const { return detail::edge_endpoint_dummy_property_view_t{}; } }; class edge_dst_dummy_property_t { public: - using value_type = thrust::nullopt_t; + using value_type = cuda::std::nullopt_t; auto view() const { return detail::edge_endpoint_dummy_property_view_t{}; } }; diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp index 35c51c1ea6d..0ccf49ddfb6 100644 --- a/cpp/include/cugraph/sampling_functions.hpp +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -1152,7 +1152,10 @@ lookup_endpoints_from_edge_ids_and_types( * @param dst_biases Optional bias for randomly selecting destination vertices. If std::nullopt * vertices will be selected uniformly. In multi-GPU environment the biases should be partitioned * based on the vertex partitions. - * @param num_samples Number of negative samples to generate + * @param num_samples Number of negative samples to generate. In SG mode this represents the total + * number of samples to generate. In MG mode, each GPU will provide the number of samples desired + * on that GPU. The total number of samples in MG mode will be the aggregation of these values; the + * resulting samples will be randomly distributed across the ranks. 
* @param remove_duplicates If true, remove duplicate samples * @param remove_existing_edges If true, remove samples that are actually edges in the graph * @param exact_number_of_samples If true, repeat generation until we get the exact number of diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh index 91a349007da..b25dd9a41f4 100644 --- a/cpp/include/cugraph/utilities/misc_utils.cuh +++ b/cpp/include/cugraph/utilities/misc_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,11 +23,11 @@ #include #include +#include #include #include #include #include -#include #include #include @@ -87,15 +87,15 @@ std::tuple, std::vector> compute_offset_aligned_ } template -thrust::optional to_thrust_optional(std::optional val) +cuda::std::optional to_thrust_optional(std::optional val) { - thrust::optional ret{thrust::nullopt}; + cuda::std::optional ret{cuda::std::nullopt}; if (val) { ret = *val; } return ret; } template -std::optional to_std_optional(thrust::optional val) +std::optional to_std_optional(cuda::std::optional val) { std::optional ret{std::nullopt}; if (val) { ret = *val; } diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 98fa2cb1706..d173cc08a1c 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -145,21 +145,21 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, template struct key_group_id_less_t { - KeyToGroupIdOp key_to_group_id_op{}; + KeyToGroupIdOp key_to_group_id_op; int pivot{}; __device__ bool operator()(key_type k) const { return key_to_group_id_op(k) < pivot; } }; template struct value_group_id_less_t { - ValueToGroupIdOp value_to_group_id_op{}; + ValueToGroupIdOp value_to_group_id_op; int pivot{}; __device__ bool operator()(value_type v) const { return value_to_group_id_op(v) < pivot; } }; template struct kv_pair_group_id_less_t { - KeyToGroupIdOp key_to_group_id_op{}; + KeyToGroupIdOp key_to_group_id_op; int pivot{}; __device__ bool operator()(thrust::tuple t) const { @@ -169,14 +169,14 @@ struct kv_pair_group_id_less_t { template struct value_group_id_greater_equal_t { - ValueToGroupIdOp value_to_group_id_op{}; + ValueToGroupIdOp value_to_group_id_op; int pivot{}; __device__ bool operator()(value_type v) const { return value_to_group_id_op(v) >= pivot; } }; template struct kv_pair_group_id_greater_equal_t { - KeyToGroupIdOp key_to_group_id_op{}; + KeyToGroupIdOp key_to_group_id_op; int pivot{}; __device__ bool operator()(thrust::tuple t) const { diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 88ef3987a03..4eb57b621ea 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,8 +36,8 @@ #include +#include #include -#include #include // @@ -52,10 +52,11 @@ struct brandes_e_op_t { const vertex_t invalid_distance_{std::numeric_limits::max()}; template - __device__ thrust::optional operator()( + __device__ cuda::std::optional operator()( vertex_t, vertex_t, value_t src_sigma, vertex_t dst_distance, ignore_t) const { - return (dst_distance == invalid_distance_) ? thrust::make_optional(src_sigma) : thrust::nullopt; + return (dst_distance == invalid_distance_) ? cuda::std::make_optional(src_sigma) + : cuda::std::nullopt; } }; @@ -64,7 +65,7 @@ struct extract_edge_e_op_t { vertex_t d{}; template - __device__ thrust::optional> operator()( + __device__ cuda::std::optional> operator()( vertex_t src, vertex_t dst, thrust::tuple src_props, @@ -72,8 +73,8 @@ struct extract_edge_e_op_t { weight_t edge_centrality) const { return ((thrust::get<0>(dst_props) == d) && (thrust::get<0>(src_props) == (d - 1))) - ? thrust::optional>{thrust::make_tuple(src, dst)} - : thrust::nullopt; + ? cuda::std::optional>{thrust::make_tuple(src, dst)} + : cuda::std::nullopt; } }; @@ -153,8 +154,8 @@ std::tuple, rmm::device_uvector> brandes_b thrust::make_zip_iterator(distances.begin(), sigmas.begin()), [hop] __device__(auto v, auto old_values, auto v_sigma) { return thrust::make_tuple( - thrust::make_optional(bucket_idx_next), - thrust::make_optional(thrust::make_tuple(hop + 1, v_sigma))); + cuda::std::make_optional(bucket_idx_next), + cuda::std::make_optional(thrust::make_tuple(hop + 1, v_sigma))); }); vertex_frontier.bucket(bucket_idx_cur).clear(); diff --git a/cpp/src/community/approx_weighted_matching_impl.cuh b/cpp/src/community/approx_weighted_matching_impl.cuh index 869ed4e7ae6..57bf9d50b9a 100644 --- a/cpp/src/community/approx_weighted_matching_impl.cuh +++ b/cpp/src/community/approx_weighted_matching_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,8 @@ std::tuple, weight_t> approximate_weighted_matchin cugraph::edge_src_dummy_property_t{}.view(), cugraph::edge_dst_dummy_property_t{}.view(), cugraph::edge_dummy_property_t{}.view(), - [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) { + [] __device__( + auto src, auto dst, cuda::std::nullopt_t, cuda::std::nullopt_t, cuda::std::nullopt_t) { return !(src == dst); }, edge_masks_even.mutable_view()); @@ -130,7 +131,7 @@ std::tuple, weight_t> approximate_weighted_matchin graph_view_t::is_multi_gpu ? src_key_cache.view() : detail::edge_major_property_view_t(local_vertices.begin()), - [] __device__(auto, auto dst, thrust::nullopt_t, thrust::nullopt_t, auto wt) { + [] __device__(auto, auto dst, cuda::std::nullopt_t, cuda::std::nullopt_t, auto wt) { return thrust::make_tuple(wt, dst); }, thrust::make_tuple(weight_t{0.0}, invalid_partner), @@ -314,7 +315,7 @@ std::tuple, weight_t> approximate_weighted_matchin dst_match_flags.view(), cugraph::edge_dummy_property_t{}.view(), [] __device__( - auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) { + auto src, auto dst, auto is_src_matched, auto is_dst_matched, cuda::std::nullopt_t) { return !((is_src_matched == true) || (is_dst_matched == true)); }, edge_masks_odd.mutable_view()); @@ -327,7 +328,7 @@ std::tuple, weight_t> approximate_weighted_matchin vertex_t{0}), cugraph::edge_dummy_property_t{}.view(), [] __device__( - auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) { + auto src, auto dst, auto is_src_matched, auto is_dst_matched, cuda::std::nullopt_t) { return !((is_src_matched == true) || (is_dst_matched == true)); }, edge_masks_odd.mutable_view()); @@ -346,7 +347,7 @@ std::tuple, weight_t> approximate_weighted_matchin dst_match_flags.view(), 
cugraph::edge_dummy_property_t{}.view(), [] __device__( - auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) { + auto src, auto dst, auto is_src_matched, auto is_dst_matched, cuda::std::nullopt_t) { return !((is_src_matched == true) || (is_dst_matched == true)); }, edge_masks_even.mutable_view()); @@ -359,7 +360,7 @@ std::tuple, weight_t> approximate_weighted_matchin vertex_t{0}), cugraph::edge_dummy_property_t{}.view(), [] __device__( - auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) { + auto src, auto dst, auto is_src_matched, auto is_dst_matched, cuda::std::nullopt_t) { return !((is_src_matched == true) || (is_dst_matched == true)); }, edge_masks_even.mutable_view()); diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh index 18fb3fdb251..d37a8864e68 100644 --- a/cpp/src/community/detail/common_methods.cuh +++ b/cpp/src/community/detail/common_methods.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,11 +30,11 @@ #include #include +#include #include #include #include #include -#include #include #include #include @@ -140,7 +140,7 @@ struct cluster_update_op_t { template struct return_edge_weight_t { __device__ auto operator()( - vertex_t, vertex_t, thrust::nullopt_t, thrust::nullopt_t, weight_t w) const + vertex_t, vertex_t, cuda::std::nullopt_t, cuda::std::nullopt_t, weight_t w) const { return w; } @@ -150,7 +150,7 @@ struct return_edge_weight_t { template struct return_one_t { __device__ auto operator()( - vertex_t, vertex_t, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + vertex_t, vertex_t, cuda::std::nullopt_t, cuda::std::nullopt_t, cuda::std::nullopt_t) const { return 1.0; } diff --git a/cpp/src/community/detail/maximal_independent_moves.cuh b/cpp/src/community/detail/maximal_independent_moves.cuh index 85892f711ba..70b812d687a 100644 --- a/cpp/src/community/detail/maximal_independent_moves.cuh +++ b/cpp/src/community/detail/maximal_independent_moves.cuh @@ -1,6 +1,6 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,13 +32,13 @@ #include #include #include -#include #include #include #include #include #include +#include namespace cugraph { diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index 01a68a3a0d7..6038fcc6f27 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,8 @@ #include #include +#include + CUCO_DECLARE_BITWISE_COMPARABLE(float) CUCO_DECLARE_BITWISE_COMPARABLE(double) // FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched. 
diff --git a/cpp/src/community/ecg_impl.cuh b/cpp/src/community/ecg_impl.cuh index d01b13f0b35..100efdb025d 100644 --- a/cpp/src/community/ecg_impl.cuh +++ b/cpp/src/community/ecg_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,6 +31,8 @@ #include +#include + namespace cugraph { namespace detail { @@ -106,7 +108,7 @@ std::tuple, size_t, weight_t> ecg( edge_dst_dummy_property_t{}.view(), view_concat(*edge_weight_view, modified_edge_weights.view()), [min_weight, ensemble_size = static_cast(ensemble_size)] __device__( - auto, auto, thrust::nullopt_t, thrust::nullopt_t, auto edge_properties) { + auto, auto, cuda::std::nullopt_t, cuda::std::nullopt_t, auto edge_properties) { auto e_weight = thrust::get<0>(edge_properties); auto e_frequency = thrust::get<1>(edge_properties); return min_weight + (e_weight - min_weight) * e_frequency / ensemble_size; diff --git a/cpp/src/community/edge_triangle_count_impl.cuh b/cpp/src/community/edge_triangle_count_impl.cuh index fbf47615dbe..b210bcacf35 100644 --- a/cpp/src/community/edge_triangle_count_impl.cuh +++ b/cpp/src/community/edge_triangle_count_impl.cuh @@ -28,6 +28,7 @@ #include +#include #include #include #include @@ -353,9 +354,9 @@ edge_property_t, edge_t> edge_t num_edges = edgelist_srcs.size(), num_triangles = num_triangles.data()] __device__(auto src, auto dst, - thrust::nullopt_t, - thrust::nullopt_t, - thrust::nullopt_t) { + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) { auto pair = thrust::make_tuple(src, dst); // Find its position in 'edges' diff --git a/cpp/src/community/k_truss_impl.cuh b/cpp/src/community/k_truss_impl.cuh index 2b712a6de77..25a1cf63f4d 100644 --- a/cpp/src/community/k_truss_impl.cuh +++ b/cpp/src/community/k_truss_impl.cuh @@ -19,6 +19,7 @@ #include 
"prims/extract_transform_e.cuh" #include "prims/extract_transform_v_frontier_outgoing_e.cuh" #include "prims/fill_edge_property.cuh" +#include "prims/per_v_pair_dst_nbr_intersection.cuh" #include "prims/transform_e.cuh" #include "prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh" #include "prims/update_edge_src_dst_property.cuh" @@ -31,63 +32,131 @@ #include +#include +#include #include #include #include #include #include -#include #include #include #include namespace cugraph { -namespace { +template +struct extract_weak_edges { + edge_t k{}; + __device__ cuda::std::optional> operator()( + vertex_t src, vertex_t dst, cuda::std::nullopt_t, cuda::std::nullopt_t, edge_t count) const + { + // No need to process edges with count == 0 + return ((count < k - 2) && (count != 0)) + ? cuda::std::optional>{thrust::make_tuple(src, dst)} + : cuda::std::nullopt; + } +}; -template -struct exclude_self_loop_t { - __device__ thrust::optional> operator()( - vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const +template +struct is_k_or_greater_t { + edge_t k{}; + __device__ bool operator()(edge_t core_number) const { return core_number >= edge_t{k}; } +}; + +template +struct extract_triangles_endpoints { + size_t chunk_start{}; + raft::device_span intersection_offsets{}; + raft::device_span intersection_indices{}; + raft::device_span weak_srcs{}; + raft::device_span weak_dsts{}; + + __device__ thrust::tuple operator()(edge_t i) const { - return src != dst - ? 
thrust::optional>{thrust::make_tuple(src, dst)} - : thrust::nullopt; + auto itr = thrust::upper_bound( + thrust::seq, intersection_offsets.begin() + 1, intersection_offsets.end(), i); + auto idx = thrust::distance(intersection_offsets.begin() + 1, itr); + + auto endpoints = thrust::make_tuple(weak_srcs[chunk_start + idx], // p + weak_dsts[chunk_start + idx], // q + intersection_indices[i] // r + ); + + auto p = weak_srcs[chunk_start + idx]; + auto q = weak_dsts[chunk_start + idx]; + auto r = intersection_indices[i]; + // Re-order the endpoints such that p < q < r in order to identify duplicate triangles + // which will cause overcompensation. comparing the vertex IDs is cheaper than comparing the + // degrees (d(p) < d(q) < d(r)) which will be done once in the latter stage to retrieve the + // direction of the edges once the triplet dependency is broken. + if (p > q) cuda::std::swap(p, q); + if (p > r) cuda::std::swap(p, r); + if (q > r) cuda::std::swap(q, r); + + return thrust::make_tuple(p, q, r); } }; -template -struct extract_low_to_high_degree_weighted_edges_t { - __device__ thrust::optional> operator()( - vertex_t src, vertex_t dst, edge_t src_out_degree, edge_t dst_out_degree, weight_t wgt) const +namespace { + +template +struct exclude_self_loop_t { + __device__ cuda::std::optional> operator()( + vertex_t src, + vertex_t dst, + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) const { - return (src_out_degree < dst_out_degree) - ? thrust::optional>{thrust::make_tuple( - src, dst, wgt)} - : (((src_out_degree == dst_out_degree) && - (src < dst) /* tie-breaking using vertex ID */) - ? thrust::optional< - thrust::tuple>{thrust::make_tuple( - src, dst, wgt)} - : thrust::nullopt); + return src != dst + ? 
cuda::std::optional>{thrust::make_tuple(src, dst)} + : cuda::std::nullopt; } }; template -struct extract_low_to_high_degree_edges_t { - __device__ thrust::optional> operator()(vertex_t src, - vertex_t dst, - edge_t src_out_degree, - edge_t dst_out_degree, - thrust::nullopt_t) const +struct extract_low_to_high_degree_edges_from_endpoints_t { + raft::device_span srcs{}; + raft::device_span dsts{}; + raft::device_span count{}; + __device__ cuda::std::optional> operator()( + vertex_t src, + vertex_t dst, + edge_t src_out_degree, + edge_t dst_out_degree, + cuda::std::nullopt_t) const { - return (src_out_degree < dst_out_degree) - ? thrust::optional>{thrust::make_tuple(src, dst)} - : (((src_out_degree == dst_out_degree) && - (src < dst) /* tie-breaking using vertex ID */) - ? thrust::optional>{thrust::make_tuple(src, - dst)} - : thrust::nullopt); + // FIXME: Not the most efficient way because the entire edgelist is scan just to find + // the direction of the edges + auto itr = thrust::lower_bound(thrust::seq, + thrust::make_zip_iterator(srcs.begin(), dsts.begin()), + thrust::make_zip_iterator(srcs.end(), dsts.end()), + thrust::make_tuple(src, dst)); + + if ((itr != thrust::make_zip_iterator(srcs.end(), dsts.end())) && + (*itr == thrust::make_tuple(src, dst))) { + auto idx = thrust::distance(thrust::make_zip_iterator(srcs.begin(), dsts.begin()), itr); + + if (src_out_degree < dst_out_degree) { + return cuda::std::optional>{ + thrust::make_tuple(src, dst, count[idx])}; + } else if (dst_out_degree < src_out_degree) { + return cuda::std::optional>{ + thrust::make_tuple(dst, src, count[idx])}; + } else { + if ((src_out_degree == dst_out_degree) && (src < dst) /* tie-breaking using vertex ID */) { + return cuda::std::optional>{ + thrust::make_tuple(src, dst, count[idx])}; + } else if ((src_out_degree == dst_out_degree) && + (src > dst) /* tie-breaking using vertex ID */) { + return cuda::std::optional>{ + thrust::make_tuple(dst, src, count[idx])}; + } + } + } else { + return 
cuda::std::nullopt; + } } }; @@ -114,246 +183,543 @@ k_truss(raft::handle_t const& handle, // nothing to do } - std::optional> modified_graph{std::nullopt}; - std::optional> modified_graph_view{std::nullopt}; - std::optional> renumber_map{std::nullopt}; - std::optional, weight_t>> - edge_weight{std::nullopt}; - std::optional> wgts{std::nullopt}; - - if (graph_view.count_self_loops(handle) > edge_t{0}) { - auto [srcs, dsts] = extract_transform_e(handle, - graph_view, - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - exclude_self_loop_t{}); - - if constexpr (multi_gpu) { - std::tie( - srcs, dsts, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore) = - detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( - handle, - std::move(srcs), - std::move(dsts), - std::nullopt, - std::nullopt, - std::nullopt, - std::nullopt, - std::nullopt); - } + // 2. Exclude self-loops and edges that do not belong to (k-1)-core - std::tie(*modified_graph, std::ignore, std::ignore, std::ignore, renumber_map) = - create_graph_from_edgelist( - handle, - std::nullopt, - std::move(srcs), - std::move(dsts), - std::nullopt, - std::nullopt, - std::nullopt, - cugraph::graph_properties_t{true, graph_view.is_multigraph()}, - true); + auto cur_graph_view = graph_view; + auto unmasked_cur_graph_view = cur_graph_view; - modified_graph_view = (*modified_graph).view(); - } + if (unmasked_cur_graph_view.has_edge_mask()) { unmasked_cur_graph_view.clear_edge_mask(); } + // mask for self-loops and edges not part of k-1 core + cugraph::edge_property_t undirected_mask(handle); + { + // 2.1 Exclude self-loops - // 2. Find (k-1)-core and exclude edges that do not belong to (k-1)-core + if (cur_graph_view.count_self_loops(handle) > edge_t{0}) { + // 2.1. Exclude self-loops - { - auto cur_graph_view = modified_graph_view ? 
*modified_graph_view : graph_view; - - auto vertex_partition_range_lasts = - renumber_map - ? std::make_optional>(cur_graph_view.vertex_partition_range_lasts()) - : std::nullopt; - - rmm::device_uvector core_numbers(cur_graph_view.number_of_vertices(), - handle.get_stream()); - core_number( - handle, cur_graph_view, core_numbers.data(), k_core_degree_type_t::OUT, size_t{2}, size_t{2}); - - raft::device_span core_number_span{core_numbers.data(), core_numbers.size()}; - - auto [srcs, dsts, wgts] = k_core(handle, - cur_graph_view, - edge_weight_view, - k - 1, - std::make_optional(k_core_degree_type_t::OUT), - std::make_optional(core_number_span)); - - if constexpr (multi_gpu) { - std::tie(srcs, dsts, wgts, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore) = - detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( - handle, - std::move(srcs), - std::move(dsts), - std::move(wgts), - std::nullopt, - std::nullopt, - std::nullopt, - std::nullopt); - } + cugraph::edge_property_t self_loop_edge_mask(handle, + cur_graph_view); + cugraph::fill_edge_property( + handle, unmasked_cur_graph_view, self_loop_edge_mask.mutable_view(), false); - std::optional> tmp_renumber_map{std::nullopt}; - std::tie(*modified_graph, edge_weight, std::ignore, std::ignore, tmp_renumber_map) = - create_graph_from_edgelist( + transform_e( handle, - std::nullopt, - std::move(srcs), - std::move(dsts), - std::move(wgts), - std::nullopt, - std::nullopt, - cugraph::graph_properties_t{true, graph_view.is_multigraph()}, - true); + cur_graph_view, + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + [] __device__(auto src, auto dst, auto, auto, auto) { return src != dst; }, + self_loop_edge_mask.mutable_view()); + + undirected_mask = std::move(self_loop_edge_mask); + if (cur_graph_view.has_edge_mask()) { cur_graph_view.clear_edge_mask(); } + cur_graph_view.attach_edge_mask(undirected_mask.view()); + } - 
modified_graph_view = (*modified_graph).view(); + // 2.2 Find (k-1)-core and exclude edges that do not belong to (k-1)-core + { + rmm::device_uvector core_numbers(cur_graph_view.number_of_vertices(), + handle.get_stream()); + core_number(handle, + cur_graph_view, + core_numbers.data(), + k_core_degree_type_t::OUT, + size_t{2}, + size_t{2}); + + edge_src_property_t edge_src_in_k_minus_1_cores( + handle, cur_graph_view); + edge_dst_property_t edge_dst_in_k_minus_1_cores( + handle, cur_graph_view); + auto in_k_minus_1_core_first = + thrust::make_transform_iterator(core_numbers.begin(), is_k_or_greater_t{k - 1}); + rmm::device_uvector in_k_minus_1_core_flags(core_numbers.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + in_k_minus_1_core_first, + in_k_minus_1_core_first + core_numbers.size(), + in_k_minus_1_core_flags.begin()); + update_edge_src_property(handle, + cur_graph_view, + in_k_minus_1_core_flags.begin(), + edge_src_in_k_minus_1_cores.mutable_view()); + update_edge_dst_property(handle, + cur_graph_view, + in_k_minus_1_core_flags.begin(), + edge_dst_in_k_minus_1_cores.mutable_view()); + + cugraph::edge_property_t in_k_minus_1_core_edge_mask( + handle, cur_graph_view); + cugraph::fill_edge_property( + handle, unmasked_cur_graph_view, in_k_minus_1_core_edge_mask.mutable_view(), false); + + transform_e( + handle, + cur_graph_view, + edge_src_in_k_minus_1_cores.view(), + edge_dst_in_k_minus_1_cores.view(), + edge_dummy_property_t{}.view(), + [] __device__(auto, auto, auto src_in_k_minus_1_core, auto dst_in_k_minus_1_core, auto) { + return src_in_k_minus_1_core && dst_in_k_minus_1_core; + }, + in_k_minus_1_core_edge_mask.mutable_view()); - if (renumber_map) { // collapse renumber_map - unrenumber_int_vertices(handle, - (*tmp_renumber_map).data(), - (*tmp_renumber_map).size(), - (*renumber_map).data(), - *vertex_partition_range_lasts); + undirected_mask = std::move(in_k_minus_1_core_edge_mask); + if (cur_graph_view.has_edge_mask()) { 
cur_graph_view.clear_edge_mask(); } + cur_graph_view.attach_edge_mask(undirected_mask.view()); } - - renumber_map = std::move(tmp_renumber_map); } // 3. Keep only the edges from a low-degree vertex to a high-degree vertex. - { - auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view; - - auto vertex_partition_range_lasts = - renumber_map - ? std::make_optional>(cur_graph_view.vertex_partition_range_lasts()) - : std::nullopt; + edge_src_property_t edge_src_out_degrees(handle, + cur_graph_view); + edge_dst_property_t edge_dst_out_degrees(handle, + cur_graph_view); + cugraph::edge_property_t, bool> dodg_mask( + handle, cur_graph_view); + { auto out_degrees = cur_graph_view.compute_out_degrees(handle); - edge_src_property_t edge_src_out_degrees(handle, - cur_graph_view); - edge_dst_property_t edge_dst_out_degrees(handle, - cur_graph_view); update_edge_src_property( handle, cur_graph_view, out_degrees.begin(), edge_src_out_degrees.mutable_view()); update_edge_dst_property( handle, cur_graph_view, out_degrees.begin(), edge_dst_out_degrees.mutable_view()); - rmm::device_uvector srcs(0, handle.get_stream()); - rmm::device_uvector dsts(0, handle.get_stream()); - - edge_weight_view = - edge_weight ? 
std::make_optional((*edge_weight).view()) - : std::optional>{std::nullopt}; - if (edge_weight_view) { - std::tie(srcs, dsts, wgts) = extract_transform_e( - handle, - cur_graph_view, - edge_src_out_degrees.view(), - edge_dst_out_degrees.view(), - *edge_weight_view, - extract_low_to_high_degree_weighted_edges_t{}); - } else { - std::tie(srcs, dsts) = - extract_transform_e(handle, - cur_graph_view, - edge_src_out_degrees.view(), - edge_dst_out_degrees.view(), - edge_dummy_property_t{}.view(), - extract_low_to_high_degree_edges_t{}); - } - - if constexpr (multi_gpu) { - std::tie(srcs, dsts, wgts, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore) = - detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( - handle, - std::move(srcs), - std::move(dsts), - std::move(wgts), - std::nullopt, - std::nullopt, - std::nullopt, - std::nullopt); - } - - std::optional> tmp_renumber_map{std::nullopt}; - - std::tie(*modified_graph, edge_weight, std::ignore, std::ignore, tmp_renumber_map) = - create_graph_from_edgelist( - handle, - std::nullopt, - std::move(srcs), - std::move(dsts), - std::move(wgts), - std::nullopt, - std::nullopt, - cugraph::graph_properties_t{false /* now asymmetric */, cur_graph_view.is_multigraph()}, - true); - - modified_graph_view = (*modified_graph).view(); - if (renumber_map) { // collapse renumber_map - unrenumber_int_vertices(handle, - (*tmp_renumber_map).data(), - (*tmp_renumber_map).size(), - (*renumber_map).data(), - *vertex_partition_range_lasts); - } - renumber_map = std::move(tmp_renumber_map); + cugraph::fill_edge_property( + handle, unmasked_cur_graph_view, dodg_mask.mutable_view(), bool{false}); + + cugraph::transform_e( + handle, + cur_graph_view, + edge_src_out_degrees.view(), + edge_dst_out_degrees.view(), + edge_dummy_property_t{}.view(), + [] __device__(auto src, auto dst, auto src_out_degree, auto dst_out_degree, auto) { + return (src_out_degree < dst_out_degree) ? 
true + : ((src_out_degree == dst_out_degree) && + (src < dst) /* tie-breaking using vertex ID */) + ? true + : false; + }, + dodg_mask.mutable_view(), + do_expensive_check); + + if (cur_graph_view.has_edge_mask()) { cur_graph_view.clear_edge_mask(); } + cur_graph_view.attach_edge_mask(dodg_mask.view()); } // 4. Compute triangle count using nbr_intersection and unroll weak edges { - auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view; + // Mask self loops and edges not being part of k-1 core + auto weak_edges_mask = std::move(undirected_mask); - edge_weight_view = - edge_weight ? std::make_optional((*edge_weight).view()) - : std::optional>{std::nullopt}; + auto edge_triangle_counts = + edge_triangle_count(handle, cur_graph_view, false); - cugraph::edge_property_t edge_mask(handle, cur_graph_view); - cugraph::fill_edge_property(handle, cur_graph_view, edge_mask.mutable_view(), bool{true}); + cugraph::edge_bucket_t edgelist_weak(handle); + cugraph::edge_bucket_t edges_to_decrement_count(handle); + size_t prev_chunk_size = 0; // FIXME: Add support for chunking while (true) { - // FIXME: This approach is very expensive when invalidating only few edges per iteration - // and should be address. 
- auto edge_triangle_counts = - edge_triangle_count(handle, cur_graph_view); + // Extract weak edges + auto [weak_edgelist_srcs, weak_edgelist_dsts] = + extract_transform_e(handle, + cur_graph_view, + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_triangle_counts.view(), + extract_weak_edges{k}); + + auto weak_edgelist_first = + thrust::make_zip_iterator(weak_edgelist_srcs.begin(), weak_edgelist_dsts.begin()); + auto weak_edgelist_last = + thrust::make_zip_iterator(weak_edgelist_srcs.end(), weak_edgelist_dsts.end()); + + thrust::sort(handle.get_thrust_policy(), weak_edgelist_first, weak_edgelist_last); + + // Perform nbr_intersection of the weak edges from the undirected + // graph view + cur_graph_view.clear_edge_mask(); + + // Attach the weak edge mask + cur_graph_view.attach_edge_mask(weak_edges_mask.view()); + + auto [intersection_offsets, intersection_indices] = per_v_pair_dst_nbr_intersection( + handle, cur_graph_view, weak_edgelist_first, weak_edgelist_last, do_expensive_check); + + // This array stores (p, q, r) which are endpoints for the triangles with weak edges + + auto triangles_endpoints = + allocate_dataframe_buffer>( + intersection_indices.size(), handle.get_stream()); + + // Extract endpoints for triangles with weak edges + thrust::tabulate( + handle.get_thrust_policy(), + get_dataframe_buffer_begin(triangles_endpoints), + get_dataframe_buffer_end(triangles_endpoints), + extract_triangles_endpoints{ + prev_chunk_size, + raft::device_span(intersection_offsets.data(), intersection_offsets.size()), + raft::device_span(intersection_indices.data(), + intersection_indices.size()), + raft::device_span(weak_edgelist_srcs.data(), weak_edgelist_srcs.size()), + raft::device_span(weak_edgelist_dsts.data(), weak_edgelist_dsts.size())}); + + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(triangles_endpoints), + get_dataframe_buffer_end(triangles_endpoints)); + + auto unique_triangle_end = 
thrust::unique(handle.get_thrust_policy(), + get_dataframe_buffer_begin(triangles_endpoints), + get_dataframe_buffer_end(triangles_endpoints)); + + auto num_unique_triangles = thrust::distance( // Triangles are represented by their endpoints + get_dataframe_buffer_begin(triangles_endpoints), + unique_triangle_end); + + resize_dataframe_buffer(triangles_endpoints, num_unique_triangles, handle.get_stream()); + + if constexpr (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + auto vertex_partition_range_lasts = cur_graph_view.vertex_partition_range_lasts(); + + rmm::device_uvector d_vertex_partition_range_lasts( + vertex_partition_range_lasts.size(), handle.get_stream()); + + raft::update_device(d_vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.size(), + handle.get_stream()); + + // Shuffle the edges with respect to the undirected graph view to the GPU + // owning edge (p, q). 
Remember that the triplet (p, q, r) is ordered based on the + // vertex ID and not the degree so (p, q) might not be an edge in the DODG but is + // surely an edge in the undirected graph + std::tie(triangles_endpoints, std::ignore) = groupby_gpu_id_and_shuffle_values( + handle.get_comms(), + get_dataframe_buffer_begin(triangles_endpoints), + get_dataframe_buffer_end(triangles_endpoints), + + [key_func = + cugraph::detail::compute_gpu_id_from_int_edge_endpoints_t{ + raft::device_span(d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()), + comm_size, + major_comm_size, + minor_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(triangles_endpoints), + get_dataframe_buffer_end(triangles_endpoints)); + + unique_triangle_end = thrust::unique(handle.get_thrust_policy(), + get_dataframe_buffer_begin(triangles_endpoints), + get_dataframe_buffer_end(triangles_endpoints)); + + num_unique_triangles = + thrust::distance(get_dataframe_buffer_begin(triangles_endpoints), unique_triangle_end); + resize_dataframe_buffer(triangles_endpoints, num_unique_triangles, handle.get_stream()); + } + + auto edgelist_to_update_count = allocate_dataframe_buffer>( + 3 * num_unique_triangles, handle.get_stream()); + + // The order no longer matters since duplicated triangles have been removed + // Flatten the endpoints to a list of egdes. 
+ thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(size_dataframe_buffer(edgelist_to_update_count)), + get_dataframe_buffer_begin(edgelist_to_update_count), + [num_unique_triangles, + triangles_endpoints = + get_dataframe_buffer_begin(triangles_endpoints)] __device__(auto idx) { + auto idx_triangle = idx % num_unique_triangles; + auto idx_vertex_in_triangle = idx / num_unique_triangles; + auto triangle = (triangles_endpoints + idx_triangle).get_iterator_tuple(); + vertex_t src; + vertex_t dst; + + if (idx_vertex_in_triangle == 0) { + src = *(thrust::get<0>(triangle)); + dst = *(thrust::get<1>(triangle)); + } + + if (idx_vertex_in_triangle == 1) { + src = *(thrust::get<0>(triangle)); + dst = *(thrust::get<2>(triangle)); + } + + if (idx_vertex_in_triangle == 2) { + src = *(thrust::get<1>(triangle)); + dst = *(thrust::get<2>(triangle)); + } + + return thrust::make_tuple(src, dst); + }); + + if constexpr (multi_gpu) { + std::tie(std::get<0>(edgelist_to_update_count), + std::get<1>(edgelist_to_update_count), + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = + detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle, + std::move(std::get<0>(edgelist_to_update_count)), + std::move(std::get<1>(edgelist_to_update_count)), + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + cur_graph_view.vertex_partition_range_lasts()); + } + + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(edgelist_to_update_count), + get_dataframe_buffer_end(edgelist_to_update_count)); + + auto unique_pair_count = + thrust::unique_count(handle.get_thrust_policy(), + get_dataframe_buffer_begin(edgelist_to_update_count), + get_dataframe_buffer_end(edgelist_to_update_count)); + + auto vertex_pair_buffer_unique = allocate_dataframe_buffer>( + unique_pair_count, handle.get_stream()); + + rmm::device_uvector 
decrease_count(unique_pair_count, handle.get_stream()); + + thrust::reduce_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(edgelist_to_update_count), + get_dataframe_buffer_end(edgelist_to_update_count), + thrust::make_constant_iterator(size_t{1}), + get_dataframe_buffer_begin(vertex_pair_buffer_unique), + decrease_count.begin(), + thrust::equal_to>{}); + + std::tie(std::get<0>(vertex_pair_buffer_unique), + std::get<1>(vertex_pair_buffer_unique), + decrease_count) = + extract_transform_e( + handle, + cur_graph_view, + edge_src_out_degrees.view(), + edge_dst_out_degrees.view(), + edge_dummy_property_t{}.view(), + extract_low_to_high_degree_edges_from_endpoints_t{ + raft::device_span(std::get<0>(vertex_pair_buffer_unique).data(), + std::get<0>(vertex_pair_buffer_unique).size()), + raft::device_span(std::get<1>(vertex_pair_buffer_unique).data(), + std::get<1>(vertex_pair_buffer_unique).size()), + raft::device_span(decrease_count.data(), decrease_count.size())}); + + if constexpr (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto vertex_partition_range_lasts = cur_graph_view.vertex_partition_range_lasts(); + + rmm::device_uvector d_vertex_partition_range_lasts( + vertex_partition_range_lasts.size(), handle.get_stream()); + raft::update_device(d_vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.size(), + handle.get_stream()); + + std::forward_as_tuple(std::tie(std::get<0>(vertex_pair_buffer_unique), + std::get<1>(vertex_pair_buffer_unique), + decrease_count), + std::ignore) = + groupby_gpu_id_and_shuffle_values( + handle.get_comms(), + 
thrust::make_zip_iterator(std::get<0>(vertex_pair_buffer_unique).begin(), + std::get<1>(vertex_pair_buffer_unique).begin(), + decrease_count.begin()), + thrust::make_zip_iterator(std::get<0>(vertex_pair_buffer_unique).end(), + std::get<1>(vertex_pair_buffer_unique).end(), + decrease_count.end()), + [key_func = + cugraph::detail::compute_gpu_id_from_int_edge_endpoints_t{ + raft::device_span(d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()), + comm_size, + major_comm_size, + minor_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } + + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(vertex_pair_buffer_unique), + get_dataframe_buffer_end(vertex_pair_buffer_unique), + decrease_count.begin()); + + // Update count of weak edges + edges_to_decrement_count.clear(); + + edges_to_decrement_count.insert(std::get<0>(vertex_pair_buffer_unique).begin(), + std::get<0>(vertex_pair_buffer_unique).end(), + std::get<1>(vertex_pair_buffer_unique).begin()); + + cur_graph_view.clear_edge_mask(); + // Check for edge existance on the directed graph view + cur_graph_view.attach_edge_mask(dodg_mask.view()); + + // Update count of weak edges from the DODG view + cugraph::transform_e( + handle, + cur_graph_view, + edges_to_decrement_count, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + edge_triangle_counts.view(), + [edge_buffer_first = + thrust::make_zip_iterator(std::get<0>(vertex_pair_buffer_unique).begin(), + std::get<1>(vertex_pair_buffer_unique).begin()), + edge_buffer_last = thrust::make_zip_iterator(std::get<0>(vertex_pair_buffer_unique).end(), + std::get<1>(vertex_pair_buffer_unique).end()), + decrease_count = raft::device_span( + decrease_count.data(), decrease_count.size())] __device__(auto src, + auto dst, + cuda::std::nullopt_t, + cuda::std::nullopt_t, + edge_t count) { + auto itr_pair = 
thrust::lower_bound( + thrust::seq, edge_buffer_first, edge_buffer_last, thrust::make_tuple(src, dst)); + auto idx_pair = thrust::distance(edge_buffer_first, itr_pair); + count -= decrease_count[idx_pair]; + + return count; + }, + edge_triangle_counts.mutable_view(), + do_expensive_check); + + edgelist_weak.clear(); + + thrust::sort( + handle.get_thrust_policy(), + thrust::make_zip_iterator(weak_edgelist_srcs.begin(), weak_edgelist_dsts.begin()), + thrust::make_zip_iterator(weak_edgelist_srcs.end(), weak_edgelist_dsts.end())); - // Mask all the edges that have k - 2 count + edgelist_weak.insert( + weak_edgelist_srcs.begin(), weak_edgelist_srcs.end(), weak_edgelist_dsts.begin()); + + // Get undirected graph view + cur_graph_view.clear_edge_mask(); + cur_graph_view.attach_edge_mask(weak_edges_mask.view()); auto prev_number_of_edges = cur_graph_view.compute_number_of_edges(handle); cugraph::transform_e( handle, cur_graph_view, + edgelist_weak, cugraph::edge_src_dummy_property_t{}.view(), cugraph::edge_dst_dummy_property_t{}.view(), - edge_triangle_counts.view(), - [k] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, auto count) { - return count >= k - 2; + cugraph::edge_dummy_property_t{}.view(), + [] __device__( + auto src, auto dst, cuda::std::nullopt_t, cuda::std::nullopt_t, cuda::std::nullopt_t) { + return false; }, - edge_mask.mutable_view(), - false); + weak_edges_mask.mutable_view(), + do_expensive_check); + + edgelist_weak.clear(); + + // shuffle the edges if multi_gpu + if constexpr (multi_gpu) { + std::tie(weak_edgelist_dsts, + weak_edgelist_srcs, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = + detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle, + std::move(weak_edgelist_dsts), + std::move(weak_edgelist_srcs), + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + cur_graph_view.vertex_partition_range_lasts()); + } + + 
thrust::sort( + handle.get_thrust_policy(), + thrust::make_zip_iterator(weak_edgelist_dsts.begin(), weak_edgelist_srcs.begin()), + thrust::make_zip_iterator(weak_edgelist_dsts.end(), weak_edgelist_srcs.end())); + + edgelist_weak.insert( + weak_edgelist_dsts.begin(), weak_edgelist_dsts.end(), weak_edgelist_srcs.begin()); + + cugraph::transform_e( + handle, + cur_graph_view, + edgelist_weak, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + [] __device__( + auto src, auto dst, cuda::std::nullopt_t, cuda::std::nullopt_t, cuda::std::nullopt_t) { + return false; + }, + weak_edges_mask.mutable_view(), + do_expensive_check); - cur_graph_view.attach_edge_mask(edge_mask.view()); + cur_graph_view.attach_edge_mask(weak_edges_mask.view()); if (prev_number_of_edges == cur_graph_view.compute_number_of_edges(handle)) { break; } + + cur_graph_view.clear_edge_mask(); + cur_graph_view.attach_edge_mask(dodg_mask.view()); } + cur_graph_view.clear_edge_mask(); + cur_graph_view.attach_edge_mask(dodg_mask.view()); + + cugraph::transform_e( + handle, + cur_graph_view, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + edge_triangle_counts.view(), + [] __device__(auto src, auto dst, cuda::std::nullopt_t, cuda::std::nullopt_t, auto count) { + return count == 0 ? false : true; + }, + dodg_mask.mutable_view(), + do_expensive_check); + rmm::device_uvector edgelist_srcs(0, handle.get_stream()); rmm::device_uvector edgelist_dsts(0, handle.get_stream()); std::optional> edgelist_wgts{std::nullopt}; @@ -362,11 +728,10 @@ k_truss(raft::handle_t const& handle, decompress_to_edgelist( handle, cur_graph_view, - edge_weight_view ? 
std::make_optional(*edge_weight_view) : std::nullopt, + edge_weight_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, - std::make_optional( - raft::device_span((*renumber_map).data(), (*renumber_map).size()))); + std::optional>{std::nullopt}); std::tie(edgelist_srcs, edgelist_dsts, diff --git a/cpp/src/community/triangle_count_impl.cuh b/cpp/src/community/triangle_count_impl.cuh index 100451f06f3..6718d08cdd7 100644 --- a/cpp/src/community/triangle_count_impl.cuh +++ b/cpp/src/community/triangle_count_impl.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -64,19 +64,20 @@ struct is_two_or_greater_t { template struct extract_low_to_high_degree_edges_t { - __device__ thrust::optional> operator()(vertex_t src, - vertex_t dst, - edge_t src_out_degree, - edge_t dst_out_degree, - thrust::nullopt_t) const + __device__ cuda::std::optional> operator()( + vertex_t src, + vertex_t dst, + edge_t src_out_degree, + edge_t dst_out_degree, + cuda::std::nullopt_t) const { return (src_out_degree < dst_out_degree) - ? thrust::optional>{thrust::make_tuple(src, dst)} + ? cuda::std::optional>{thrust::make_tuple(src, dst)} : (((src_out_degree == dst_out_degree) && (src < dst) /* tie-breaking using vertex ID */) - ? thrust::optional>{thrust::make_tuple(src, - dst)} - : thrust::nullopt); + ? 
cuda::std::optional>{thrust::make_tuple(src, + dst)} + : cuda::std::nullopt); } }; @@ -85,8 +86,8 @@ struct intersection_op_t { __device__ thrust::tuple operator()( vertex_t, vertex_t, - thrust::nullopt_t, - thrust::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t, raft::device_span intersection) const { return thrust::make_tuple(static_cast(intersection.size()), diff --git a/cpp/src/components/mis_impl.cuh b/cpp/src/components/mis_impl.cuh index b593c639946..2714d7e3d63 100644 --- a/cpp/src/components/mis_impl.cuh +++ b/cpp/src/components/mis_impl.cuh @@ -1,6 +1,6 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,13 +32,13 @@ #include #include #include -#include #include #include #include #include #include +#include namespace cugraph { diff --git a/cpp/src/components/vertex_coloring_impl.cuh b/cpp/src/components/vertex_coloring_impl.cuh index fa7fb1f6099..7ad06d12027 100644 --- a/cpp/src/components/vertex_coloring_impl.cuh +++ b/cpp/src/components/vertex_coloring_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,6 +24,8 @@ #include #include +#include + namespace cugraph { namespace detail { @@ -52,7 +54,8 @@ rmm::device_uvector vertex_coloring( edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), cugraph::edge_dummy_property_t{}.view(), - [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) { + [] __device__( + auto src, auto dst, cuda::std::nullopt_t, cuda::std::nullopt_t, cuda::std::nullopt_t) { return !(src == dst); // mask out self-loop }, edge_masks_even.mutable_view()); @@ -119,7 +122,7 @@ rmm::device_uvector vertex_coloring( is_vertex_in_mis.begin(), vertex_t{0}), cugraph::edge_dummy_property_t{}.view(), [color_id] __device__( - auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, thrust::nullopt_t) { + auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, cuda::std::nullopt_t) { return !((is_src_in_mis == uint8_t{true}) || (is_dst_in_mis == uint8_t{true})); }, edge_masks_odd.mutable_view()); @@ -140,7 +143,7 @@ rmm::device_uvector vertex_coloring( is_vertex_in_mis.begin(), vertex_t{0}), cugraph::edge_dummy_property_t{}.view(), [color_id] __device__( - auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, thrust::nullopt_t) { + auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, cuda::std::nullopt_t) { return !((is_src_in_mis == uint8_t{true}) || (is_dst_in_mis == uint8_t{true})); }, edge_masks_even.mutable_view()); diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index e791f4dcad3..46db347e0bc 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -45,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -189,11 +189,11 @@ struct e_op_t { EdgeIterator edge_buffer_first{}; size_t* num_edge_inserts{}; - __device__ 
thrust::optional operator()(thrust::tuple tagged_src, - vertex_t dst, - thrust::nullopt_t, - thrust::nullopt_t, - thrust::nullopt_t) const + __device__ cuda::std::optional operator()(thrust::tuple tagged_src, + vertex_t dst, + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) const { auto tag = thrust::get<1>(tagged_src); auto dst_offset = dst - dst_first; @@ -207,8 +207,8 @@ struct e_op_t { *(edge_buffer_first + edge_idx) = tag >= old ? thrust::make_tuple(tag, old) : thrust::make_tuple(old, tag); } - return old == invalid_component_id::value ? thrust::optional{tag} - : thrust::nullopt; + return old == invalid_component_id::value ? cuda::std::optional{tag} + : cuda::std::nullopt; } }; @@ -231,9 +231,10 @@ struct v_op_t { size_t bucket_idx_conflict{}; // relevant only if GraphViewType::is_multi_gpu is true template - __device__ std::enable_if_t, thrust::optional>> - operator()(thrust::tuple tagged_v, int /* v_val */) const + __device__ + std::enable_if_t, cuda::std::optional>> + operator()(thrust::tuple tagged_v, int /* v_val */) const { auto tag = thrust::get<1>(tagged_v); auto v_offset = @@ -242,22 +243,23 @@ struct v_op_t { auto old = invalid_component_id::value; bool success = v_component.compare_exchange_strong(old, tag, cuda::std::memory_order_relaxed); if (!success && (old != tag)) { // conflict - return thrust::make_tuple(thrust::optional{bucket_idx_conflict}, - thrust::optional{std::byte{0}} /* dummy */); + return thrust::make_tuple(cuda::std::optional{bucket_idx_conflict}, + cuda::std::optional{std::byte{0}} /* dummy */); } else { return thrust::make_tuple( - success ? thrust::optional{bucket_idx_next} : thrust::nullopt, - success ? thrust::optional{std::byte{0}} /* dummy */ : thrust::nullopt); + success ? cuda::std::optional{bucket_idx_next} : cuda::std::nullopt, + success ? 
cuda::std::optional{std::byte{0}} /* dummy */ : cuda::std::nullopt); } } template - __device__ std::enable_if_t, thrust::optional>> - operator()(thrust::tuple /* tagged_v */, int /* v_val */) const + __device__ + std::enable_if_t, cuda::std::optional>> + operator()(thrust::tuple /* tagged_v */, int /* v_val */) const { - return thrust::make_tuple(thrust::optional{bucket_idx_next}, - thrust::optional{std::byte{0}} /* dummy */); + return thrust::make_tuple(cuda::std::optional{bucket_idx_next}, + cuda::std::optional{std::byte{0}} /* dummy */); } }; diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index a2b6f6430f0..f1ff0912002 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include +#include #include #include #include @@ -35,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -53,10 +53,10 @@ struct e_op_t { size_t k{}; edge_t delta{}; - __device__ thrust::optional operator()( - vertex_t, vertex_t, thrust::nullopt_t, edge_t dst_val, thrust::nullopt_t) const + __device__ cuda::std::optional operator()( + vertex_t, vertex_t, cuda::std::nullopt_t, edge_t dst_val, cuda::std::nullopt_t) const { - return dst_val >= k ? thrust::optional{delta} : thrust::nullopt; + return dst_val >= k ? cuda::std::optional{delta} : cuda::std::nullopt; } }; @@ -251,8 +251,8 @@ void core_number(raft::handle_t const& handle, auto new_core_number = v_val >= pushed_val ? v_val - pushed_val : edge_t{0}; new_core_number = new_core_number < (k - delta) ? (k - delta) : new_core_number; new_core_number = new_core_number < k_first ? 
edge_t{0} : new_core_number; - return thrust::make_tuple(thrust::optional{bucket_idx_next}, - thrust::optional{new_core_number}); + return thrust::make_tuple(cuda::std::optional{bucket_idx_next}, + cuda::std::optional{new_core_number}); }); } diff --git a/cpp/src/detail/permute_range.cuh b/cpp/src/detail/permute_range.cuh index c7cd57c2048..a9d1b27f52d 100644 --- a/cpp/src/detail/permute_range.cuh +++ b/cpp/src/detail/permute_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ rmm::device_uvector permute_range(raft::handle_t const& handle, sub_range_sizes.begin(), sub_range_sizes.end(), sub_range_sizes.begin(), global_start); CUGRAPH_EXPECTS( sub_range_sizes[comm_rank] == local_range_start, - "Invalid input arguments: a rage must have contiguous and non-overlapping values"); + "Invalid input arguments: a range must have contiguous and non-overlapping values"); } rmm::device_uvector permuted_integers(local_range_size, handle.get_stream()); diff --git a/cpp/src/detail/permute_range_v32.cu b/cpp/src/detail/permute_range_v32.cu index 6a7bc059901..91d23487f03 100644 --- a/cpp/src/detail/permute_range_v32.cu +++ b/cpp/src/detail/permute_range_v32.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,23 +16,6 @@ #include "detail/permute_range.cuh" -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include - namespace cugraph { namespace detail { diff --git a/cpp/src/detail/permute_range_v64.cu b/cpp/src/detail/permute_range_v64.cu index ad7daf16419..a6dbc9a72ae 100644 --- a/cpp/src/detail/permute_range_v64.cu +++ b/cpp/src/detail/permute_range_v64.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,23 +16,6 @@ #include "detail/permute_range.cuh" -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include - namespace cugraph { namespace detail { diff --git a/cpp/src/lookup/lookup_src_dst_impl.cuh b/cpp/src/lookup/lookup_src_dst_impl.cuh index 45bbf870d80..dd03e621022 100644 --- a/cpp/src/lookup/lookup_src_dst_impl.cuh +++ b/cpp/src/lookup/lookup_src_dst_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,8 @@ #include +#include + namespace cugraph { template @@ -370,7 +372,7 @@ EdgeTypeAndIdToSrcDstLookupContainerType build_edge_id_and_type_to_src_dst_looku EdgeIdInputWrapper edge_id_view, EdgeTypeInputWrapper edge_type_view) { - static_assert(!std::is_same_v, + static_assert(!std::is_same_v, "Can not create edge id lookup table without edge ids"); using vertex_t = typename GraphViewType::vertex_type; @@ -411,17 +413,17 @@ EdgeTypeAndIdToSrcDstLookupContainerType build_edge_id_and_type_to_src_dst_looku cugraph::edge_src_dummy_property_t{}.view(), cugraph::edge_dst_dummy_property_t{}.view(), view_concat(edge_id_view, edge_type_view), - cuda::proclaim_return_type>>( + cuda::proclaim_return_type>>( [key_func = cugraph::detail::compute_gpu_id_from_ext_edge_id_t{ comm_size, major_comm_size, minor_comm_size}] __device__(auto, auto, - thrust::nullopt_t, - thrust::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t, thrust::tuple id_and_type) { - return thrust::optional>{thrust::make_tuple( + return cuda::std::optional>{thrust::make_tuple( key_func(thrust::get<0>(id_and_type)), thrust::get<1>(id_and_type))}; })); @@ -518,9 +520,9 @@ EdgeTypeAndIdToSrcDstLookupContainerType build_edge_id_and_type_to_src_dst_looku cugraph::edge_src_dummy_property_t{}.view(), cugraph::edge_dst_dummy_property_t{}.view(), edge_type_view, - cuda::proclaim_return_type>( - [] __device__(auto, auto, thrust::nullopt_t, thrust::nullopt_t, edge_type_t et) { - return thrust::optional{et}; + cuda::proclaim_return_type>( + [] __device__(auto, auto, cuda::std::nullopt_t, cuda::std::nullopt_t, edge_type_t et) { + return cuda::std::optional{et}; })); thrust::sort(handle.get_thrust_policy(), edge_types.begin(), edge_types.end()); diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 2b89d214fd7..9816753852f 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ 
b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -47,7 +48,6 @@ #include #include #include -#include #include #include @@ -138,7 +138,7 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, BufferKeyOutputIterator buffer_key_output_first, BufferValueOutputIterator buffer_value_output_first, size_t* buffer_idx_ptr, @@ -238,7 +238,7 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( if (edge_partition_e_mask) { for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; + e_op_result_t e_op_result{cuda::std::nullopt}; if (i < static_cast(num_edges_this_warp)) { auto key_idx_this_warp = static_cast(thrust::distance( @@ -261,7 +261,7 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } else { for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; + e_op_result_t e_op_result{cuda::std::nullopt}; if (i < static_cast(num_edges_this_warp)) { auto key_idx_this_warp = static_cast(thrust::distance( @@ -304,7 +304,7 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, 
EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, BufferKeyOutputIterator buffer_key_output_first, BufferValueOutputIterator buffer_value_output_first, size_t* buffer_idx_ptr, @@ -359,7 +359,7 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( if (edge_partition_e_mask) { for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; + e_op_result_t e_op_result{cuda::std::nullopt}; if ((i < static_cast(local_degree)) && ((*edge_partition_e_mask).get(local_edge_offset + i))) { e_op_result = call_e_op(i); @@ -370,7 +370,7 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( } } else { for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; + e_op_result_t e_op_result{cuda::std::nullopt}; if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } warp_push_buffer_elements( @@ -400,7 +400,7 @@ __global__ static void extract_transform_v_frontier_e_high_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, BufferKeyOutputIterator buffer_key_output_first, BufferValueOutputIterator buffer_value_output_first, size_t* buffer_idx_ptr, @@ -429,7 +429,7 @@ __global__ static void extract_transform_v_frontier_e_high_degree( ((static_cast(num_edges) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); while (idx < rounded_up_num_edges) { - e_op_result_t e_op_result{thrust::nullopt}; + e_op_result_t e_op_result{cuda::std::nullopt}; if (idx < num_edges) { auto key_idx = thrust::distance( key_local_degree_offsets.begin() + 1, @@ -494,7 +494,7 @@ void 
extract_transform_v_frontier_e_edge_partition( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, OptionalOutputKeyIterator output_key_first, OptionalOutputValueIterator output_value_first, raft::device_span count /* size = 1 */, @@ -665,21 +665,21 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, EdgeOp>::type; using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, edge_partition_endpoint_dummy_property_device_view_t, edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeSrcValueInputWrapper::value_iterator, typename EdgeSrcValueInputWrapper::value_type>>; using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, edge_partition_endpoint_dummy_property_device_view_t, edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeDstValueInputWrapper::value_iterator, typename EdgeDstValueInputWrapper::value_type>>; using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -692,12 +692,12 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, static_assert(!std::is_same_v); static_assert( std::is_same_v && - !std::is_same_v, - thrust::optional>, - std::conditional_t, - thrust::optional, - thrust::optional>>>); + std::conditional_t< + !std::is_same_v && !std::is_same_v, + cuda::std::optional>, + std::conditional_t, + cuda::std::optional, + cuda::std::optional>>>); constexpr bool try_bitmap = GraphViewType::is_multi_gpu && std::is_same_v && KeyBucketType::is_sorted_unique; @@ -1401,10 +1401,10 @@ extract_transform_v_frontier_e(raft::handle_t 
const& handle, graph_view.local_edge_partition_view(partition_idx)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, partition_idx) - : thrust::nullopt; + : cuda::std::nullopt; size_t num_streams_per_loop{1}; if (stream_pool_indices) { assert((*stream_pool_indices).size() >= num_concurrent_loops); diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh index 847c1db6937..d21a8153dc6 100644 --- a/cpp/src/prims/detail/nbr_intersection.cuh +++ b/cpp/src/prims/detail/nbr_intersection.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -49,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -106,7 +106,7 @@ struct update_rx_major_local_degree_t { int minor_comm_size{}; edge_partition_device_view_t edge_partition{}; - thrust::optional> + cuda::std::optional> edge_partition_e_mask{}; size_t reordered_idx_first{}; @@ -155,7 +155,7 @@ struct update_rx_major_local_nbrs_t { edge_partition_device_view_t edge_partition{}; edge_partition_e_input_device_view_t edge_partition_e_value_input{}; - thrust::optional> + cuda::std::optional> edge_partition_e_mask{}; size_t reordered_idx_first{}; @@ -214,7 +214,7 @@ struct update_rx_major_local_nbrs_t { if (local_degree > 0) { if (edge_partition_e_mask) { auto mask_first = (*edge_partition_e_mask).value_first(); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { auto input_first = thrust::make_zip_iterator(indices, edge_partition_e_value_input.value_first()); copy_if_mask_set(input_first, @@ -233,7 +233,7 @@ struct update_rx_major_local_nbrs_t { 
local_degree); } } else { - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { auto input_first = thrust::make_zip_iterator(indices, edge_partition_e_value_input.value_first()) + edge_offset; @@ -278,7 +278,7 @@ struct pick_min_degree_t { raft::device_span second_element_offsets{}; edge_partition_device_view_t edge_partition{}; - thrust::optional> + cuda::std::optional> edge_partition_e_mask{}; __device__ edge_t operator()(thrust::tuple pair) const @@ -413,7 +413,7 @@ struct copy_intersecting_nbrs_and_update_intersection_size_t { edge_partition_device_view_t edge_partition{}; edge_partition_e_input_device_view_t edge_partition_e_value_input{}; - thrust::optional> + cuda::std::optional> edge_partition_e_mask{}; VertexPairIterator vertex_pair_first; @@ -430,7 +430,7 @@ struct copy_intersecting_nbrs_and_update_intersection_size_t { auto pair = *(vertex_pair_first + i); vertex_t const* indices0{}; - std::conditional_t, + std::conditional_t, edge_property_value_t const*, void*> edge_property_values0{}; @@ -439,7 +439,7 @@ struct copy_intersecting_nbrs_and_update_intersection_size_t { edge_t local_degree0{0}; if constexpr (std::is_same_v) { indices0 = edge_partition.indices(); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { edge_property_values0 = edge_partition_e_value_input.value_first(); } @@ -468,7 +468,7 @@ struct copy_intersecting_nbrs_and_update_intersection_size_t { } } else { indices0 = first_element_indices.begin(); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { edge_property_values0 = first_element_edge_property_values; } @@ -478,7 +478,7 @@ struct copy_intersecting_nbrs_and_update_intersection_size_t { } vertex_t const* indices1{}; - std::conditional_t, + std::conditional_t, edge_property_value_t const*, void*> edge_property_values1{}; @@ -487,7 +487,7 @@ struct copy_intersecting_nbrs_and_update_intersection_size_t { edge_t local_degree1{0}; if constexpr (std::is_same_v) { indices1 = 
edge_partition.indices(); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { edge_property_values1 = edge_partition_e_value_input.value_first(); } @@ -516,7 +516,7 @@ struct copy_intersecting_nbrs_and_update_intersection_size_t { } } else { indices1 = second_element_indices.begin(); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { edge_property_values1 = second_element_edge_property_values; } @@ -618,7 +618,7 @@ struct gatherv_indices_t { // in a single warp (better optimize if this becomes a performance bottleneck) for (int j = 0; j < minor_comm_size; ++j) { - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { auto zipped_gathered_begin = thrust::make_zip_iterator( thrust::make_tuple(gathered_intersection_indices.begin(), gathered_nbr_intersection_e_property_values0, @@ -664,7 +664,7 @@ struct gatherv_indices_t { // number of groups" is recommended for load-balancing. template std::conditional_t< - !std::is_same_v, + !std::is_same_v, std::tuple, rmm::device_uvector, rmm::device_uvector, @@ -684,7 +684,7 @@ nbr_intersection(raft::handle_t const& handle, using edge_property_value_t = typename EdgeValueInputIterator::value_type; using edge_partition_e_input_device_view_t = - std::conditional_t, + std::conditional_t, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -692,16 +692,16 @@ nbr_intersection(raft::handle_t const& handle, edge_property_value_t>>; using optional_property_buffer_value_type = - std::conditional_t, + std::conditional_t, edge_property_value_t, void>; using optional_property_buffer_view_t = - std::conditional_t, + std::conditional_t, edge_property_value_t const*, void*>; using optional_property_buffer_mutable_view_t = - std::conditional_t, + std::conditional_t, edge_property_value_t*, void*>; @@ -907,11 +907,11 @@ nbr_intersection(raft::handle_t const& handle, graph_view.local_edge_partition_view(i)); auto 
edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail:: edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); auto reordered_idx_first = (i == size_t{0}) ? size_t{0} : h_rx_reordered_group_lasts[i * major_comm_size - 1]; @@ -950,7 +950,7 @@ nbr_intersection(raft::handle_t const& handle, optional_property_buffer_mutable_view_t optional_local_e_property_values{}; - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { local_e_property_values_for_rx_majors.resize(local_nbrs_for_rx_majors.size(), handle.get_stream()); optional_local_e_property_values = local_e_property_values_for_rx_majors.data(); @@ -964,11 +964,11 @@ nbr_intersection(raft::handle_t const& handle, edge_partition_e_input_device_view_t(edge_value_input, i); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail:: edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); auto reordered_idx_first = @@ -1045,7 +1045,7 @@ nbr_intersection(raft::handle_t const& handle, std::tie(major_nbr_indices, std::ignore) = shuffle_values( major_comm, local_nbrs_for_rx_majors.begin(), local_nbr_counts, handle.get_stream()); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { std::tie(major_e_property_values, std::ignore) = shuffle_values(major_comm, local_e_property_values_for_rx_majors.begin(), @@ -1132,16 +1132,18 @@ nbr_intersection(raft::handle_t const& handle, edge_partition_nbr_intersection_sizes.reserve(graph_view.number_of_local_edge_partitions()); edge_partition_nbr_intersection_indices.reserve(graph_view.number_of_local_edge_partitions()); - [[maybe_unused]] std::conditional_t, - std::vector>, - std::byte /* dummy */> + 
[[maybe_unused]] std::conditional_t< + !std::is_same_v, + std::vector>, + std::byte /* dummy */> edge_partition_nbr_intersection_e_property_values0{}; - [[maybe_unused]] std::conditional_t, - std::vector>, - std::byte /* dummy */> + [[maybe_unused]] std::conditional_t< + !std::is_same_v, + std::vector>, + std::byte /* dummy */> edge_partition_nbr_intersection_e_property_values1{}; - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { edge_partition_nbr_intersection_e_property_values0.reserve( graph_view.number_of_local_edge_partitions()); edge_partition_nbr_intersection_e_property_values1.reserve( @@ -1198,10 +1200,10 @@ nbr_intersection(raft::handle_t const& handle, edge_partition_e_input_device_view_t(edge_value_input, i); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); @@ -1249,7 +1251,7 @@ nbr_intersection(raft::handle_t const& handle, optional_property_buffer_mutable_view_t rx_v_pair_optional_nbr_intersection_e_property_values1{}; - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { rx_v_pair_nbr_intersection_e_property_values0.resize( rx_v_pair_nbr_intersection_indices.size(), handle.get_stream()); rx_v_pair_nbr_intersection_e_property_values1.resize( @@ -1264,7 +1266,7 @@ nbr_intersection(raft::handle_t const& handle, if (intersect_minor_nbr[0] && intersect_minor_nbr[1]) { optional_property_buffer_view_t optional_major_e_property_values{}; - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { optional_major_e_property_values = major_e_property_values.data(); } @@ -1309,7 +1311,7 @@ nbr_intersection(raft::handle_t const& handle, CUGRAPH_FAIL("unimplemented."); } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { 
rx_v_pair_nbr_intersection_indices.resize( thrust::distance(rx_v_pair_nbr_intersection_indices.begin(), thrust::remove(handle.get_thrust_policy(), @@ -1515,7 +1517,7 @@ nbr_intersection(raft::handle_t const& handle, rx_displacements.back() + gathered_nbr_intersection_index_rx_counts.back(), handle.get_stream()); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { device_multicast_sendrecv(minor_comm, rx_v_pair_nbr_intersection_e_property_values0.begin(), rx_v_pair_nbr_intersection_index_tx_counts, @@ -1548,7 +1550,7 @@ nbr_intersection(raft::handle_t const& handle, gathered_nbr_intersection_e_property_values1.size(), handle.get_stream()); } - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), @@ -1598,7 +1600,7 @@ nbr_intersection(raft::handle_t const& handle, edge_partition_nbr_intersection_sizes.push_back(std::move(combined_nbr_intersection_sizes)); edge_partition_nbr_intersection_indices.push_back( std::move(combined_nbr_intersection_indices)); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { edge_partition_nbr_intersection_e_property_values0.push_back( std::move(combined_nbr_intersection_e_property_values0)); edge_partition_nbr_intersection_e_property_values1.push_back( @@ -1612,7 +1614,7 @@ nbr_intersection(raft::handle_t const& handle, num_nbr_intersection_indices += edge_partition_nbr_intersection_indices[i].size(); } nbr_intersection_indices.resize(num_nbr_intersection_indices, handle.get_stream()); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { nbr_intersection_e_property_values0.resize(nbr_intersection_indices.size(), handle.get_stream()); nbr_intersection_e_property_values1.resize(nbr_intersection_indices.size(), @@ -1631,7 +1633,7 @@ nbr_intersection(raft::handle_t const& handle, edge_partition_nbr_intersection_indices[i].end(), nbr_intersection_indices.begin() + index_offset); - if 
constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { thrust::copy(handle.get_thrust_policy(), edge_partition_nbr_intersection_e_property_values0[i].begin(), edge_partition_nbr_intersection_e_property_values0[i].end(), @@ -1660,10 +1662,10 @@ nbr_intersection(raft::handle_t const& handle, auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, 0); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, 0) - : thrust::nullopt; + : cuda::std::nullopt; rmm::device_uvector nbr_intersection_sizes( input_size, @@ -1699,7 +1701,7 @@ nbr_intersection(raft::handle_t const& handle, optional_property_buffer_mutable_view_t optional_nbr_intersection_e_property_values0{}; optional_property_buffer_mutable_view_t optional_nbr_intersection_e_property_values1{}; - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { nbr_intersection_e_property_values0.resize(nbr_intersection_indices.size(), handle.get_stream()); nbr_intersection_e_property_values1.resize(nbr_intersection_indices.size(), @@ -1770,7 +1772,7 @@ nbr_intersection(raft::handle_t const& handle, size_t{1} << 27, static_cast(thrust::distance(nbr_intersection_indices.begin() + num_scanned, nbr_intersection_indices.end()))); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { num_copied += static_cast(thrust::distance( tmp_indices.begin() + num_copied, thrust::copy_if(handle.get_thrust_policy(), @@ -1804,12 +1806,12 @@ nbr_intersection(raft::handle_t const& handle, num_scanned += this_scan_size; } nbr_intersection_indices = std::move(tmp_indices); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { nbr_intersection_e_property_values0 = std::move(tmp_property_values0); nbr_intersection_e_property_values1 = std::move(tmp_property_values1); } #else - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { 
nbr_intersection_indices.resize( thrust::distance(nbr_intersection_indices.begin(), thrust::remove(handle.get_thrust_policy(), @@ -1845,7 +1847,7 @@ nbr_intersection(raft::handle_t const& handle, // 5. Return - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { return std::make_tuple(std::move(nbr_intersection_offsets), std::move(nbr_intersection_indices)); diff --git a/cpp/src/prims/detail/partition_v_frontier.cuh b/cpp/src/prims/detail/partition_v_frontier.cuh index 018960d9a54..f5249aec304 100644 --- a/cpp/src/prims/detail/partition_v_frontier.cuh +++ b/cpp/src/prims/detail/partition_v_frontier.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,14 +35,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include namespace cugraph { diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index c521774a50d..1e47bb53a9e 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -46,6 +46,7 @@ #include #include +#include #include #include #include @@ -54,7 +55,6 @@ #include #include #include -#include #include #include #include @@ -265,7 +265,7 @@ __global__ static void per_v_transform_reduce_e_hypersparse( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, @@ -296,7 +296,7 @@ __global__ static void per_v_transform_reduce_e_hypersparse( 
while (idx < key_count) { key_t key{}; vertex_t major{}; - thrust::optional major_idx{}; + cuda::std::optional major_idx{}; if constexpr (use_input_key) { key = *(key_first + idx); major = thrust_tuple_get_or_identity(key); @@ -402,7 +402,7 @@ __global__ static void per_v_transform_reduce_e_low_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, @@ -512,7 +512,7 @@ __global__ static void per_v_transform_reduce_e_mid_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, @@ -596,7 +596,7 @@ __global__ static void per_v_transform_reduce_e_mid_degree( ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { - thrust::optional e_op_result{thrust::nullopt}; + cuda::std::optional e_op_result{cuda::std::nullopt}; if ((i < static_cast(local_degree)) && (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { e_op_result = call_e_op(i); @@ -630,7 +630,7 @@ __global__ static void per_v_transform_reduce_e_mid_degree( ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { - thrust::optional e_op_result{thrust::nullopt}; + 
cuda::std::optional e_op_result{cuda::std::nullopt}; if (i < static_cast(local_degree) && call_pred_op(i)) { e_op_result = call_e_op(i); } @@ -699,7 +699,7 @@ __global__ static void per_v_transform_reduce_e_high_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, @@ -790,7 +790,7 @@ __global__ static void per_v_transform_reduce_e_high_degree( per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { - thrust::optional e_op_result{thrust::nullopt}; + cuda::std::optional e_op_result{cuda::std::nullopt}; if ((i < static_cast(local_degree)) && (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { e_op_result = call_e_op(i); @@ -835,7 +835,7 @@ __global__ static void per_v_transform_reduce_e_high_degree( per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { - thrust::optional e_op_result{thrust::nullopt}; + cuda::std::optional e_op_result{cuda::std::nullopt}; if ((i < static_cast(local_degree)) && call_pred_op(i)) { e_op_result = call_e_op(i); } @@ -1141,7 +1141,7 @@ void per_v_transform_reduce_e_edge_partition( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, 
ResultValueOutputIteratorOrWrapper output_buffer, EdgeOp e_op, T major_init, @@ -1415,21 +1415,21 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, typename iterator_value_type_or_default_t::value_type; using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeSrcValueInputWrapper::value_iterator, typename EdgeSrcValueInputWrapper::value_type>>; using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeDstValueInputWrapper::value_iterator, typename EdgeDstValueInputWrapper::value_type>>; using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -1519,10 +1519,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, graph_view.local_edge_partition_view(static_cast(minor_comm_rank))); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, static_cast(minor_comm_rank)) - : thrust::nullopt; + : cuda::std::nullopt; std::optional> edge_partition_stream_pool_indices{std::nullopt}; if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { @@ -1737,10 +1737,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, sorted_unique_key_first, sorted_unique_nzd_key_last, deg1_v_first = (filter_input_key && graph_view.use_dcs()) - ? 
thrust::make_optional(graph_view.local_vertex_partition_range_first() + - (*local_vertex_partition_segment_offsets)[3] + - *((*hypersparse_degree_offsets).rbegin() + 1)) - : thrust::nullopt, + ? cuda::std::make_optional(graph_view.local_vertex_partition_range_first() + + (*local_vertex_partition_segment_offsets)[3] + + *((*hypersparse_degree_offsets).rbegin() + 1)) + : cuda::std::nullopt, vertex_partition_range_first = graph_view.local_vertex_partition_range_first()] __device__(size_t i) { if (i == 0) { @@ -3102,10 +3102,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, graph_view.local_edge_partition_view(partition_idx)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, partition_idx) - : thrust::nullopt; + : cuda::std::nullopt; size_t num_streams_per_loop{1}; if (stream_pool_indices) { assert((*stream_pool_indices).size() >= num_concurrent_loops); diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index dd0da77851b..3e38b85f105 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,12 +38,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include #include @@ -474,10 +474,10 @@ compute_valid_local_nbr_count_inclusive_sums( graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? 
cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; auto edge_partition_frontier_major_first = aggregate_local_frontier_major_first + local_frontier_displacements[i]; @@ -1255,10 +1255,10 @@ compute_aggregate_local_frontier_local_degrees( graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; auto edge_partition_frontier_major_first = aggregate_local_frontier_major_first + local_frontier_displacements[i]; @@ -1501,9 +1501,9 @@ rmm::device_uvector convert_to_unmasked_local thrust::make_counting_iterator(size_t{0}), cuda::proclaim_return_type( [K, - key_indices = key_indices ? thrust::make_optional>( + key_indices = key_indices ? cuda::std::make_optional>( (*key_indices).data(), (*key_indices).size()) - : thrust::nullopt] __device__(size_t i) { + : cuda::std::nullopt] __device__(size_t i) { return key_indices ? (*key_indices)[i] : i / K; })); auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), sample_major_idx_first); @@ -1513,10 +1513,10 @@ rmm::device_uvector convert_to_unmasked_local graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; auto edge_partition_frontier_major_first = aggregate_local_frontier_major_first + local_frontier_displacements[i]; @@ -1863,10 +1863,10 @@ biased_sample_and_compute_local_nbr_indices( sample_local_random_numbers.data() + local_frontier_sample_offsets[i], local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]), key_indices = - key_indices ? thrust::make_optional>( + key_indices ? 
cuda::std::make_optional>( (*key_indices).data() + local_frontier_sample_offsets[i], local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]) - : thrust::nullopt, + : cuda::std::nullopt, key_idx_to_unique_key_idx = raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_displacements[i], diff --git a/cpp/src/prims/detail/transform_v_frontier_e.cuh b/cpp/src/prims/detail/transform_v_frontier_e.cuh index 5ebcddfe8da..8f414391596 100644 --- a/cpp/src/prims/detail/transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/transform_v_frontier_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,9 +29,9 @@ #include +#include #include #include -#include #include #include @@ -389,21 +389,21 @@ auto transform_v_frontier_e(raft::handle_t const& handle, static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeSrcValueInputWrapper::value_iterator, typename EdgeSrcValueInputWrapper::value_type>>; using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeDstValueInputWrapper::value_iterator, typename EdgeDstValueInputWrapper::value_type>>; using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -424,10 +424,10 @@ auto 
transform_v_frontier_e(raft::handle_t const& handle, graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; auto edge_partition_frontier_key_first = aggregate_local_frontier_key_first + local_frontier_displacements[i]; @@ -470,10 +470,10 @@ auto transform_v_frontier_e(raft::handle_t const& handle, graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; auto edge_partition_frontier_key_first = aggregate_local_frontier_key_first + local_frontier_displacements[i]; diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh index 5741c98d90e..bb003aa8747 100644 --- a/cpp/src/prims/extract_transform_e.cuh +++ b/cpp/src/prims/extract_transform_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,9 +69,9 @@ namespace cugraph { * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not * access edge property values). * @param e_op Quinary operator takes edge source, edge destination, property values for the source, - * property values for the destination, and property values for the edge and returns thrust::nullopt - * (if the return value is to be discarded) or a valid @p e_op output to be extracted and - * accumulated. 
+ * property values for the destination, and property values for the edge and returns + * cuda::std::nullopt (if the return value is to be discarded) or a valid @p e_op output to be + * extracted and accumulated. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). * @return Dataframe buffer object storing extracted and accumulated valid @p e_op return values. */ diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh index ba227b263bc..46984d6b4e5 100644 --- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh +++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,9 +57,9 @@ namespace cugraph { * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not * access edge property values). * @param e_op Quinary operator takes edge source, edge destination, property values for the source, - * property values for the destination, and property values for the edge and returns thrust::nullopt - * (if the return value is to be discarded) or a valid @p e_op output to be extracted and - * accumulated. + * property values for the destination, and property values for the edge and returns + * cuda::std::nullopt (if the return value is to be discarded) or a valid @p e_op output to be + * extracted and accumulated. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). * @return Dataframe buffer object storing extracted and accumulated valid @p e_op return values. 
*/ diff --git a/cpp/src/prims/fill_edge_property.cuh b/cpp/src/prims/fill_edge_property.cuh index 54d0c454ec2..3e1383707a2 100644 --- a/cpp/src/prims/fill_edge_property.cuh +++ b/cpp/src/prims/fill_edge_property.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include +#include #include #include @@ -50,10 +51,10 @@ void fill_edge_property(raft::handle_t const& handle, for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; if constexpr (cugraph::has_packed_bool_element< std::remove_reference_t, diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index f03e8f54fb2..728c1eac2bd 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,6 +32,7 @@ #include +#include #include #include #include @@ -43,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -109,7 +109,7 @@ struct call_intersection_op_t { typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition{}; - thrust::optional> unique_vertices; + cuda::std::optional> unique_vertices; VertexValueInputIterator vertex_property_first; IntersectionOp intersection_op{}; size_t const* nbr_offsets{nullptr}; @@ -135,17 +135,17 @@ struct call_intersection_op_t { auto intersection = raft::device_span( nbr_indices + nbr_offsets[i], nbr_indices + nbr_offsets[i + 1]); - std::conditional_t, + std::conditional_t, raft::device_span, std::byte /* dummy */> property_values0{}; - std::conditional_t, + std::conditional_t, raft::device_span, std::byte /* dummy */> property_values1{}; - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { property_values0 = raft::device_span( nbr_intersection_property_values0 + nbr_offsets[i], nbr_intersection_property_values0 + +nbr_offsets[i + 1]); @@ -392,7 +392,7 @@ void per_v_pair_transform_dst_nbr_intersection( [[maybe_unused]] rmm::device_uvector r_nbr_intersection_property_values1(size_t{0}, handle.get_stream()); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { std::tie(intersection_offsets, intersection_indices, r_nbr_intersection_property_values0, @@ -430,7 +430,7 @@ void per_v_pair_transform_dst_nbr_intersection( VertexPairIterator, VertexPairValueOutputIterator>{ edge_partition, - thrust::make_optional>( + cuda::std::make_optional>( (*sorted_unique_vertices).data(), (*sorted_unique_vertices).size()), vertex_value_input_for_sorted_unique_vertices_first, intersection_op, @@ -442,28 +442,29 @@ void per_v_pair_transform_dst_nbr_intersection( vertex_pair_first, vertex_pair_value_output_first}); } else { - thrust::for_each(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - 
thrust::make_counting_iterator(this_chunk_size), - detail::call_intersection_op_t< - GraphViewType, - VertexValueInputIterator, - typename decltype(r_nbr_intersection_property_values0)::const_pointer, - IntersectionOp, - decltype(chunk_vertex_pair_index_first), - VertexPairIterator, - VertexPairValueOutputIterator>{ - edge_partition, - thrust::optional>{thrust::nullopt}, - vertex_value_input_first, - intersection_op, - intersection_offsets.data(), - intersection_indices.data(), - r_nbr_intersection_property_values0.data(), - r_nbr_intersection_property_values1.data(), - chunk_vertex_pair_index_first, - vertex_pair_first, - vertex_pair_value_output_first}); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(this_chunk_size), + detail::call_intersection_op_t< + GraphViewType, + VertexValueInputIterator, + typename decltype(r_nbr_intersection_property_values0)::const_pointer, + IntersectionOp, + decltype(chunk_vertex_pair_index_first), + VertexPairIterator, + VertexPairValueOutputIterator>{ + edge_partition, + cuda::std::optional>{cuda::std::nullopt}, + vertex_value_input_first, + intersection_op, + intersection_offsets.data(), + intersection_indices.data(), + r_nbr_intersection_property_values0.data(), + r_nbr_intersection_property_values1.data(), + chunk_vertex_pair_index_first, + vertex_pair_first, + vertex_pair_value_output_first}); } chunk_vertex_pair_index_first += this_chunk_size; diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 30706632ad2..812e0a9b926 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,11 +35,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include @@ -93,7 +93,7 @@ struct transform_local_nbr_indices_t { using edge_t = typename GraphViewType::edge_type; edge_partition_device_view_t edge_partition{}; - thrust::optional local_key_indices{thrust::nullopt}; + cuda::std::optional local_key_indices{cuda::std::nullopt}; KeyIterator key_first{}; LocalNbrIdxIterator local_nbr_idx_first{}; EdgePartitionSrcValueInputWrapper edge_partition_src_value_input; @@ -101,7 +101,7 @@ struct transform_local_nbr_indices_t { EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input; EdgeOp e_op{}; edge_t invalid_idx{}; - thrust::optional invalid_value{thrust::nullopt}; + cuda::std::optional invalid_value{cuda::std::nullopt}; size_t K{}; __device__ T operator()(size_t i) const @@ -241,21 +241,21 @@ per_v_random_select_transform_e(raft::handle_t const& handle, using key_buffer_t = dataframe_buffer_type_t; using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, edge_partition_endpoint_dummy_property_device_view_t, edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeSrcValueInputWrapper::value_iterator, typename EdgeSrcValueInputWrapper::value_type>>; using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, edge_partition_endpoint_dummy_property_device_view_t, edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeDstValueInputWrapper::value_iterator, typename EdgeDstValueInputWrapper::value_type>>; using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -430,7 +430,7 @@ 
per_v_random_select_transform_e(raft::handle_t const& handle, EdgeOp, T>{ edge_partition, - thrust::make_optional(edge_partition_sample_key_index_first), + cuda::std::make_optional(edge_partition_sample_key_index_first), edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, @@ -454,7 +454,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, edge_partition_e_input_device_view_t, EdgeOp, T>{edge_partition, - thrust::nullopt, + cuda::std::nullopt, edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index c13816242bc..4ebda9d42c4 100644 --- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -49,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -89,15 +89,16 @@ struct tuple_to_minor_comm_rank_t { int minor_comm_size{}; template - __device__ std::enable_if_t, int> operator()( + __device__ std::enable_if_t, int> + operator()( thrust::tuple val /* major, minor key, edge value */) const { return key_func(thrust::get<1>(val)) % minor_comm_size; } template - __device__ std::enable_if_t, int> operator()( - thrust::tuple val /* major, minor key */) const + __device__ std::enable_if_t, int> + operator()(thrust::tuple val /* major, minor key */) const { return key_func(thrust::get<1>(val)) % minor_comm_size; } @@ -123,13 +124,13 @@ template struct call_key_aggregated_e_op_t { EdgePartitionDeviceView edge_partition{}; - thrust::optional edge_major_value_map{}; + cuda::std::optional edge_major_value_map{}; EdgePartitionMajorValueInputWrapper edge_partition_major_value_input{}; EdgeMinorKeyValueMap edge_minor_key_value_map{}; KeyAggregatedEdgeOp key_aggregated_e_op{}; template - __device__ std::enable_if_t, e_op_result_t> + __device__ std::enable_if_t, e_op_result_t> operator()(thrust::tuple val /* major, minor key, aggregated edge value */) const { @@ -145,7 +146,7 @@ struct call_key_aggregated_e_op_t { } template - __device__ std::enable_if_t, e_op_result_t> + __device__ std::enable_if_t, e_op_result_t> operator()(thrust::tuple val /* major, minor key */) const { auto major = thrust::get<0>(val); @@ -155,7 +156,7 @@ struct call_key_aggregated_e_op_t { : edge_partition_major_value_input.get( edge_partition.major_offset_from_major_nocheck(major)); return key_aggregated_e_op( - major, minor_key, major_val, edge_minor_key_value_map.find(minor_key), thrust::nullopt); + major, minor_key, major_val, edge_minor_key_value_map.find(minor_key), cuda::std::nullopt); } }; @@ -284,16 +285,16 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( using edge_value_t = 
typename EdgeValueInputWrapper::value_type; using kv_pair_value_t = typename KVStoreViewType::value_type; using optional_edge_value_buffer_value_type = - std::conditional_t, edge_value_t, void>; + std::conditional_t, edge_value_t, void>; static_assert( - std::is_same_v || std::is_arithmetic_v, + std::is_same_v || std::is_arithmetic_v, "Currently only scalar values are supported, should be extended to support thrust::tuple of " "arithmetic types and void (for dummy property values) to be consistent with other " "primitives."); // this will also require a custom edge value aggregation op. using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, @@ -303,7 +304,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( vertex_t, typename EdgeDstKeyInputWrapper::value_iterator>; using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -315,7 +316,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( auto total_global_mem = handle.get_device_properties().totalGlobalMem; size_t element_size = sizeof(vertex_t) * 2; // major + minor keys - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); if constexpr (is_thrust_tuple_of_arithmetic::value) { element_size += sum_thrust_tuple_element_sizes(); @@ -323,7 +324,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( element_size += sizeof(edge_value_t); } } - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); if constexpr (is_thrust_tuple_of_arithmetic::value) { element_size += 
sum_thrust_tuple_element_sizes(); @@ -350,10 +351,10 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; auto edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input, i); @@ -472,7 +473,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( 1, handle.get_stream()); handle.sync_stream(); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { detail::copy_if_mask_set( handle, thrust::make_zip_iterator(minor_key_first, @@ -505,7 +506,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( (offsets_with_mask ? (*offsets_with_mask).data() : edge_partition.offsets()) + h_vertex_offsets[j], detail::rebase_offset_t{h_edge_offsets[j]}); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { cub::DeviceSegmentedSort::SortPairs( static_cast(nullptr), tmp_storage_bytes, @@ -536,7 +537,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( if (tmp_storage_bytes > d_tmp_storage.size()) { d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); } - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { cub::DeviceSegmentedSort::SortPairs( d_tmp_storage.data(), tmp_storage_bytes, @@ -573,7 +574,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( thrust::make_zip_iterator(unreduced_majors.begin(), unreduced_minor_keys.begin()); auto output_key_first = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin()); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { reduced_size += thrust::distance(output_key_first + reduced_size, thrust::get<0>(thrust::reduce_by_key( @@ -626,7 +627,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( auto const 
minor_comm_size = minor_comm.get_size(); rmm::device_uvector d_tx_value_counts(0, handle.get_stream()); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { auto triplet_first = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin(), @@ -782,7 +783,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( tmp_minor_keys.resize(0, handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { std::tie(rx_key_aggregated_edge_values, std::ignore) = shuffle_values(minor_comm, detail::get_optional_dataframe_buffer_begin( @@ -795,7 +796,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( detail::shrink_to_fit_optional_dataframe_buffer( tmp_key_aggregated_edge_values, handle.get_stream()); } else { - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { auto triplet_first = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin(), @@ -820,7 +821,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( } auto key_pair_first = thrust::make_zip_iterator(rx_majors.begin(), rx_minor_keys.begin()); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { if (rx_majors.size() > mem_frugal_threshold) { // trade-off parallelism to lower peak memory auto second_first = @@ -956,15 +957,15 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( auto major_value_map_device_view = (GraphViewType::is_multi_gpu && edge_src_value_input.keys()) - ? thrust::make_optionalview())>>(multi_gpu_major_value_map_ptr->view()) - : thrust::nullopt; + : cuda::std::nullopt; std::conditional_t, detail::kv_cuco_store_find_device_view_t> dst_key_value_map_device_view( GraphViewType::is_multi_gpu ? 
multi_gpu_minor_key_value_map_ptr->view() : kv_store_view); - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { auto triplet_first = thrust::make_zip_iterator( tmp_majors.begin(), tmp_minor_keys.begin(), diff --git a/cpp/src/prims/property_op_utils.cuh b/cpp/src/prims/property_op_utils.cuh index 04ad22cbf71..2cab42c1dc8 100644 --- a/cpp/src/prims/property_op_utils.cuh +++ b/cpp/src/prims/property_op_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -127,9 +127,7 @@ template typename Op> struct property_op : public Op {}; template typename Op> -struct property_op, Op> - : public thrust:: - binary_function, thrust::tuple, thrust::tuple> { +struct property_op, Op> { using Type = thrust::tuple; private: diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh index aaa2703f1ae..d5d64f708ba 100644 --- a/cpp/src/prims/transform_e.cuh +++ b/cpp/src/prims/transform_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,6 +28,7 @@ #include +#include #include #include #include @@ -273,21 +274,21 @@ void transform_e(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeSrcValueInputWrapper::value_iterator, typename EdgeSrcValueInputWrapper::value_type>>; using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeDstValueInputWrapper::value_iterator, typename EdgeDstValueInputWrapper::value_type>>; using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -306,10 +307,10 @@ void transform_e(raft::handle_t const& handle, graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? 
cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -464,21 +465,21 @@ void transform_e(raft::handle_t const& handle, std::is_same_v>); using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeSrcValueInputWrapper::value_iterator, typename EdgeSrcValueInputWrapper::value_type>>; using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeDstValueInputWrapper::value_iterator, typename EdgeDstValueInputWrapper::value_type>>; using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -541,10 +542,10 @@ void transform_e(raft::handle_t const& handle, graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? 
cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; if (do_expensive_check) { CUGRAPH_EXPECTS( diff --git a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh index c938b10fbbb..eef34938c57 100644 --- a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh +++ b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,6 +32,7 @@ #include +#include #include #include #include @@ -43,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -249,14 +249,14 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v( using weight_t = float; // dummy using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeSrcValueInputWrapper::value_iterator, typename EdgeSrcValueInputWrapper::value_type>>; using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh index 43722550c58..1e45fea0608 100644 --- a/cpp/src/prims/transform_reduce_e.cuh +++ b/cpp/src/prims/transform_reduce_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,11 +35,11 @@ #include +#include #include #include #include #include -#include #include #include #include @@ -68,7 +68,7 @@ __global__ static void transform_reduce_e_hypersparse( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -162,7 +162,7 @@ __global__ static void transform_reduce_e_low_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -251,7 +251,7 @@ __global__ static void transform_reduce_e_mid_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -329,7 +329,7 @@ __global__ static void transform_reduce_e_high_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, + cuda::std::optional edge_partition_e_mask, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -445,21 +445,21 @@ T transform_reduce_e(raft::handle_t const& handle, using edge_t = typename 
GraphViewType::edge_type; using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeSrcValueInputWrapper::value_iterator, typename EdgeSrcValueInputWrapper::value_type>>; using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeDstValueInputWrapper::value_iterator, typename EdgeDstValueInputWrapper::value_type>>; using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -486,10 +486,10 @@ T transform_reduce_e(raft::handle_t const& handle, graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh index 3abce6f8bd5..8786336bd10 100644 --- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh +++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,10 +31,10 @@ #include +#include #include #include #include -#include #include #include #include @@ -108,7 +108,7 @@ __global__ static void transform_reduce_by_src_dst_key_hypersparse( EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input, EdgePartitionEdgeMaskWrapper edge_partition_e_mask, - thrust::optional> + cuda::std::optional> edge_offsets_with_mask, EdgeOp e_op, typename GraphViewType::vertex_type* keys, @@ -198,7 +198,7 @@ __global__ static void transform_reduce_by_src_dst_key_low_degree( EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input, EdgePartitionEdgeMaskWrapper edge_partition_e_mask, - thrust::optional> + cuda::std::optional> edge_offsets_with_mask, EdgeOp e_op, typename GraphViewType::vertex_type* keys, @@ -284,7 +284,7 @@ __global__ static void transform_reduce_by_src_dst_key_mid_degree( EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input, EdgePartitionEdgeMaskWrapper edge_partition_e_mask, - thrust::optional> + cuda::std::optional> edge_offsets_with_mask, EdgeOp e_op, typename GraphViewType::vertex_type* keys, @@ -383,7 +383,7 @@ __global__ static void transform_reduce_by_src_dst_key_high_degree( EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input, EdgePartitionEdgeMaskWrapper edge_partition_e_mask, - thrust::optional> + cuda::std::optional> edge_offsets_with_mask, EdgeOp e_op, typename GraphViewType::vertex_type* keys, @@ -520,21 +520,21 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, 
detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeSrcValueInputWrapper::value_iterator, typename EdgeSrcValueInputWrapper::value_type>>; using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_endpoint_dummy_property_device_view_t, detail::edge_partition_endpoint_property_device_view_t< vertex_t, typename EdgeDstValueInputWrapper::value_iterator, typename EdgeDstValueInputWrapper::value_type>>; using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, + std::is_same_v, detail::edge_partition_edge_dummy_property_device_view_t, detail::edge_partition_edge_property_device_view_t< edge_t, @@ -556,10 +556,10 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; rmm::device_uvector tmp_keys(0, handle.get_stream()); std::optional> edge_offsets_with_mask{std::nullopt}; @@ -627,9 +627,9 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, edge_partition_src_dst_key_input, edge_partition_e_mask, edge_offsets_with_mask - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size()) - : thrust::nullopt, + : cuda::std::nullopt, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -650,9 +650,9 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, edge_partition_src_dst_key_input, edge_partition_e_mask, edge_offsets_with_mask - ? thrust::make_optional>( + ? 
cuda::std::make_optional>( (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size()) - : thrust::nullopt, + : cuda::std::nullopt, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -673,9 +673,9 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, edge_partition_src_dst_key_input, edge_partition_e_mask, edge_offsets_with_mask - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size()) - : thrust::nullopt, + : cuda::std::nullopt, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -695,9 +695,9 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, edge_partition_src_dst_key_input, edge_partition_e_mask, edge_offsets_with_mask - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size()) - : thrust::nullopt, + : cuda::std::nullopt, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -719,9 +719,9 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, edge_partition_src_dst_key_input, edge_partition_e_mask, edge_offsets_with_mask - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size()) - : thrust::nullopt, + : cuda::std::nullopt, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 87f590f571f..884079d103d 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -52,7 +53,6 @@ #include #include #include -#include #include #include #include @@ -86,7 +86,7 @@ template && !std::is_same_v, thrust::tuple, std::conditional_t, key_t, payload_t>>> @@ -106,7 +106,7 @@ struct transform_reduce_v_frontier_call_e_op_t { thrust::get<1>(*e_op_result)); } } else { - return thrust::nullopt; + return cuda::std::nullopt; } } }; @@ -121,7 +121,7 @@ struct update_keep_flag_t { raft::device_span keep_flags{}; key_t v_range_first{}; InputKeyIterator input_key_first{}; - thrust::optional invalid_input_key{}; + cuda::std::optional invalid_input_key{}; __device__ void operator()(size_t i) const { @@ -1058,10 +1058,10 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, graph_view.local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -1110,9 +1110,9 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor * outputs by (tagged-)destination ID. * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if KeyBucketType::key_type is a tuple of a vertex type and a tag - * type (KeyBucketType::key_type is identical to a vertex type otherwise). + * Edge functor outputs are cuda::std::optional objects and invalid if cuda::std::nullopt. 
Vertices + * are assumed to be tagged if KeyBucketType::key_type is a tuple of a vertex type and a tag type + * (KeyBucketType::key_type is identical to a vertex type otherwise). * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam KeyBucketType Type of the vertex frontier bucket class which abstracts the @@ -1141,10 +1141,10 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not * access edge property values). * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for - * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be - * discarded); 2) dummy (but valid) thrust::optional object (e.g. - * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is - * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be + * the source, destination, and edge and returns 1) cuda::std::nullopt (if invalid and to be + * discarded); 2) dummy (but valid) cuda::std::optional object (e.g. + * cuda::std::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type + * is void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type * is not void). diff --git a/cpp/src/prims/update_v_frontier.cuh b/cpp/src/prims/update_v_frontier.cuh index a9b0a6b823b..0516ebc7d31 100644 --- a/cpp/src/prims/update_v_frontier.cuh +++ b/cpp/src/prims/update_v_frontier.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -157,10 +157,10 @@ struct check_invalid_bucket_idx_t { * graph_view.local_vertex_partition_range_size(). * @param v_op Ternary operator that takes (tagged-)vertex ID, *(@p vertex_value_input_first + i) * (where i is [0, @p graph_view.local_vertex_partition_range_size())) and the payload value for the - * (tagged-)vertex ID and returns a tuple of 1) a thrust::optional object optionally storing a - * bucket index and 2) a thrust::optional object optionally storing a new vertex property value. If - * the first element of the returned tuple is thrust::nullopt, this (tagged-)vertex won't be - * inserted to the vertex frontier. If the second element is thrust::nullopt, the vertex property + * (tagged-)vertex ID and returns a tuple of 1) a cuda::std::optional object optionally storing a + * bucket index and 2) a cuda::std::optional object optionally storing a new vertex property value. + * If the first element of the returned tuple is cuda::std::nullopt, this (tagged-)vertex won't be + * inserted to the vertex frontier. If the second element is cuda::std::nullopt, the vertex property * value for this vertex won't be updated. Note that it is currently undefined behavior if there are * multiple tagged-vertices with the same vertex ID (but with different tags) AND @p v_op results on * the tagged-vertices with the same vertex ID have more than one valid new vertex property values. @@ -286,13 +286,13 @@ void update_v_frontier(raft::handle_t const& handle, * graph_view.local_vertex_partition_range_size(). 
* @param v_op Binary operator that takes (tagged-)vertex ID, and *(@p vertex_value_input_first + i) * (where i is [0, @p graph_view.local_vertex_partition_range_size())) and returns a tuple of 1) a - * thrust::optional object optionally storing a bucket index and 2) a thrust::optional object + * cuda::std::optional object optionally storing a bucket index and 2) a cuda::std::optional object * optionally storing a new vertex property value. If the first element of the returned tuple is - * thrust::nullopt, this (tagged-)vertex won't be inserted to the vertex frontier. If the second - * element is thrust::nullopt, the vertex property value for this vertex won't be updated. Note that - * it is currently undefined behavior if there are multiple tagged-vertices with the same vertex ID - * (but with different tags) AND @p v_op results on the tagged-vertices with the same vertex ID have - * more than one valid new vertex property values. + * cuda::std::nullopt, this (tagged-)vertex won't be inserted to the vertex frontier. If the second + * element is cuda::std::nullopt, the vertex property value for this vertex won't be updated. Note + * that it is currently undefined behavior if there are multiple tagged-vertices with the same + * vertex ID (but with different tags) AND @p v_op results on the tagged-vertices with the same + * vertex ID have more than one valid new vertex property values. 
*/ template -#include +#include #include namespace cugraph { @@ -40,8 +40,8 @@ struct return_edges_with_properties_e_op { template auto __host__ __device__ operator()(key_t optionally_tagged_src, vertex_t dst, - thrust::nullopt_t, - thrust::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t, EdgeProperties edge_properties) const { static_assert(std::is_same_v || @@ -51,43 +51,43 @@ struct return_edges_with_properties_e_op { if constexpr (std::is_same_v) { vertex_t src{optionally_tagged_src}; - if constexpr (std::is_same_v) { - return thrust::make_optional(thrust::make_tuple(src, dst)); + if constexpr (std::is_same_v) { + return cuda::std::make_optional(thrust::make_tuple(src, dst)); } else if constexpr (std::is_arithmetic::value) { - return thrust::make_optional(thrust::make_tuple(src, dst, edge_properties)); + return cuda::std::make_optional(thrust::make_tuple(src, dst, edge_properties)); } else if constexpr (cugraph::is_thrust_tuple_of_arithmetic::value && (thrust::tuple_size::value == 2)) { - return thrust::make_optional(thrust::make_tuple( + return cuda::std::make_optional(thrust::make_tuple( src, dst, thrust::get<0>(edge_properties), thrust::get<1>(edge_properties))); } else if constexpr (cugraph::is_thrust_tuple_of_arithmetic::value && (thrust::tuple_size::value == 3)) { - return thrust::make_optional(thrust::make_tuple(src, - dst, - thrust::get<0>(edge_properties), - thrust::get<1>(edge_properties), - thrust::get<2>(edge_properties))); + return cuda::std::make_optional(thrust::make_tuple(src, + dst, + thrust::get<0>(edge_properties), + thrust::get<1>(edge_properties), + thrust::get<2>(edge_properties))); } } else if constexpr (std::is_same_v>) { vertex_t src{thrust::get<0>(optionally_tagged_src)}; int32_t label{thrust::get<1>(optionally_tagged_src)}; src = thrust::get<0>(optionally_tagged_src); - if constexpr (std::is_same_v) { - return thrust::make_optional(thrust::make_tuple(src, dst, label)); + if constexpr (std::is_same_v) { + return 
cuda::std::make_optional(thrust::make_tuple(src, dst, label)); } else if constexpr (std::is_arithmetic::value) { - return thrust::make_optional(thrust::make_tuple(src, dst, edge_properties, label)); + return cuda::std::make_optional(thrust::make_tuple(src, dst, edge_properties, label)); } else if constexpr (cugraph::is_thrust_tuple_of_arithmetic::value && (thrust::tuple_size::value == 2)) { - return thrust::make_optional(thrust::make_tuple( + return cuda::std::make_optional(thrust::make_tuple( src, dst, thrust::get<0>(edge_properties), thrust::get<1>(edge_properties), label)); } else if constexpr (cugraph::is_thrust_tuple_of_arithmetic::value && (thrust::tuple_size::value == 3)) { - return thrust::make_optional(thrust::make_tuple(src, - dst, - thrust::get<0>(edge_properties), - thrust::get<1>(edge_properties), - thrust::get<2>(edge_properties), - label)); + return cuda::std::make_optional(thrust::make_tuple(src, + dst, + thrust::get<0>(edge_properties), + thrust::get<1>(edge_properties), + thrust::get<2>(edge_properties), + label)); } } } diff --git a/cpp/src/sampling/detail/prepare_next_frontier_impl.cuh b/cpp/src/sampling/detail/prepare_next_frontier_impl.cuh index 5c04d628f09..2a6136fb96a 100644 --- a/cpp/src/sampling/detail/prepare_next_frontier_impl.cuh +++ b/cpp/src/sampling/detail/prepare_next_frontier_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,11 +29,12 @@ #include -#include #include #include #include +#include + namespace cugraph { namespace detail { diff --git a/cpp/src/sampling/detail/remove_visited_vertices_from_frontier.cuh b/cpp/src/sampling/detail/remove_visited_vertices_from_frontier.cuh index f6793c4a157..9ce8edbb9c1 100644 --- a/cpp/src/sampling/detail/remove_visited_vertices_from_frontier.cuh +++ b/cpp/src/sampling/detail/remove_visited_vertices_from_frontier.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,10 +20,11 @@ #include #include -#include #include #include +#include + namespace cugraph { namespace detail { diff --git a/cpp/src/sampling/detail/remove_visited_vertices_from_frontier_sg_v32_e32.cu b/cpp/src/sampling/detail/remove_visited_vertices_from_frontier_sg_v32_e32.cu index 89634253ee7..35c2fc5abb1 100644 --- a/cpp/src/sampling/detail/remove_visited_vertices_from_frontier_sg_v32_e32.cu +++ b/cpp/src/sampling/detail/remove_visited_vertices_from_frontier_sg_v32_e32.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,10 +21,11 @@ #include #include -#include #include #include +#include + namespace cugraph { namespace detail { diff --git a/cpp/src/sampling/detail/remove_visited_vertices_from_frontier_sg_v64_e64.cu b/cpp/src/sampling/detail/remove_visited_vertices_from_frontier_sg_v64_e64.cu index 41cb7413bc4..b627431d53f 100644 --- a/cpp/src/sampling/detail/remove_visited_vertices_from_frontier_sg_v64_e64.cu +++ b/cpp/src/sampling/detail/remove_visited_vertices_from_frontier_sg_v64_e64.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,10 +21,11 @@ #include #include -#include #include #include +#include + namespace cugraph { namespace detail { diff --git a/cpp/src/sampling/detail/sample_edges.cuh b/cpp/src/sampling/detail/sample_edges.cuh index 0c670c6507e..a4e228522aa 100644 --- a/cpp/src/sampling/detail/sample_edges.cuh +++ b/cpp/src/sampling/detail/sample_edges.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ #include -#include +#include #include #include @@ -41,12 +41,12 @@ struct sample_edges_op_t { template auto __host__ __device__ operator()(vertex_t src, vertex_t dst, - thrust::nullopt_t, - thrust::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t, EdgeProperties edge_properties) const { // FIXME: A solution using thrust_tuple_cat would be more flexible here - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { return thrust::make_tuple(src, dst); } else if constexpr (std::is_arithmetic::value) { return thrust::make_tuple(src, dst, edge_properties); @@ -68,7 +68,7 @@ struct sample_edges_op_t { template struct sample_edge_biases_op_t { auto __host__ __device__ - operator()(vertex_t, vertex_t, thrust::nullopt_t, thrust::nullopt_t, bias_t bias) const + operator()(vertex_t, vertex_t, cuda::std::nullopt_t, cuda::std::nullopt_t, bias_t bias) const { return bias; } diff --git a/cpp/src/sampling/detail/sampling_utils.hpp b/cpp/src/sampling/detail/sampling_utils.hpp index 17eb8dd0873..71387eb0e63 100644 --- a/cpp/src/sampling/detail/sampling_utils.hpp +++ b/cpp/src/sampling/detail/sampling_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ #include -#include +#include namespace cugraph { namespace detail { diff --git a/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh b/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh index 391dd99b1df..ce4888e3359 100644 --- a/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh +++ b/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,10 +32,11 @@ #include -#include #include #include +#include + namespace cugraph { namespace detail { diff --git a/cpp/src/sampling/negative_sampling_impl.cuh b/cpp/src/sampling/negative_sampling_impl.cuh index 541eda67860..9aedc5dfc35 100644 --- a/cpp/src/sampling/negative_sampling_impl.cuh +++ b/cpp/src/sampling/negative_sampling_impl.cuh @@ -16,8 +16,11 @@ #pragma once +#include "cugraph/detail/collect_comm_wrapper.hpp" +#include "cugraph/utilities/device_comm.hpp" #include "prims/reduce_v.cuh" #include "prims/update_edge_src_dst_property.cuh" +#include "thrust/iterator/zip_iterator.h" #include "utilities/collect_comm.cuh" #include @@ -26,6 +29,10 @@ #include #include +#include +#include +#include + #include #include @@ -37,6 +44,8 @@ #include #include +#include + namespace cugraph { namespace detail { @@ -265,11 +274,19 @@ std::tuple, rmm::device_uvector> negativ bool exact_number_of_samples, bool do_expensive_check) { - rmm::device_uvector src(0, handle.get_stream()); - rmm::device_uvector dst(0, handle.get_stream()); + rmm::device_uvector srcs(0, handle.get_stream()); + rmm::device_uvector dsts(0, handle.get_stream()); // Optimistically 
assume we can do this in one pass - size_t samples_in_this_batch = num_samples; + size_t total_samples{num_samples}; + std::vector samples_per_gpu; + + if constexpr (multi_gpu) { + samples_per_gpu = host_scalar_allgather(handle.get_comms(), num_samples, handle.get_stream()); + total_samples = std::reduce(samples_per_gpu.begin(), samples_per_gpu.end()); + } + + size_t samples_in_this_batch = total_samples; // Normalize the biases and (for MG) determine how the biases are // distributed across the GPUs. @@ -298,16 +315,16 @@ std::tuple, rmm::device_uvector> negativ : 0); } - auto batch_src = create_local_samples( + auto batch_srcs = create_local_samples( handle, rng_state, graph_view, normalized_src_biases, gpu_src_biases, samples_in_this_batch); - auto batch_dst = create_local_samples( + auto batch_dsts = create_local_samples( handle, rng_state, graph_view, normalized_dst_biases, gpu_dst_biases, samples_in_this_batch); if constexpr (multi_gpu) { auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); - std::tie(batch_src, - batch_dst, + std::tie(batch_srcs, + batch_dsts, std::ignore, std::ignore, std::ignore, @@ -320,8 +337,8 @@ std::tuple, rmm::device_uvector> negativ int32_t, int32_t>( handle, - std::move(batch_src), - std::move(batch_dst), + std::move(batch_srcs), + std::move(batch_dsts), std::nullopt, std::nullopt, std::nullopt, @@ -333,42 +350,43 @@ std::tuple, rmm::device_uvector> negativ if (remove_existing_edges) { auto has_edge_flags = graph_view.has_edge(handle, - raft::device_span{batch_src.data(), batch_src.size()}, - raft::device_span{batch_dst.data(), batch_dst.size()}, + raft::device_span{batch_srcs.data(), batch_srcs.size()}, + raft::device_span{batch_dsts.data(), batch_dsts.size()}, do_expensive_check); - auto begin_iter = thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()); + auto begin_iter = thrust::make_zip_iterator(batch_srcs.begin(), batch_dsts.begin()); auto new_end = 
thrust::remove_if(handle.get_thrust_policy(), begin_iter, - begin_iter + batch_src.size(), + begin_iter + batch_srcs.size(), has_edge_flags.begin(), thrust::identity()); - batch_src.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); - batch_dst.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + batch_srcs.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + batch_dsts.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); } if (remove_duplicates) { thrust::sort(handle.get_thrust_policy(), - thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), - thrust::make_zip_iterator(batch_src.end(), batch_dst.end())); + thrust::make_zip_iterator(batch_srcs.begin(), batch_dsts.begin()), + thrust::make_zip_iterator(batch_srcs.end(), batch_dsts.end())); - auto new_end = thrust::unique(handle.get_thrust_policy(), - thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), - thrust::make_zip_iterator(batch_src.end(), batch_dst.end())); + auto new_end = + thrust::unique(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_srcs.begin(), batch_dsts.begin()), + thrust::make_zip_iterator(batch_srcs.end(), batch_dsts.end())); - size_t new_size = - thrust::distance(thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), new_end); + size_t new_size = thrust::distance( + thrust::make_zip_iterator(batch_srcs.begin(), batch_dsts.begin()), new_end); - if (src.size() > 0) { - rmm::device_uvector new_src(src.size() + new_size, handle.get_stream()); - rmm::device_uvector new_dst(dst.size() + new_size, handle.get_stream()); + if (srcs.size() > 0) { + rmm::device_uvector new_src(srcs.size() + new_size, handle.get_stream()); + rmm::device_uvector new_dst(dsts.size() + new_size, handle.get_stream()); thrust::merge(handle.get_thrust_policy(), - thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_srcs.begin(), batch_dsts.begin()), new_end, - 
thrust::make_zip_iterator(src.begin(), dst.begin()), - thrust::make_zip_iterator(src.end(), dst.end()), + thrust::make_zip_iterator(srcs.begin(), dsts.begin()), + thrust::make_zip_iterator(srcs.end(), dsts.end()), thrust::make_zip_iterator(new_src.begin(), new_dst.begin())); new_end = thrust::unique(handle.get_thrust_policy(), @@ -378,32 +396,32 @@ std::tuple, rmm::device_uvector> negativ new_size = thrust::distance(thrust::make_zip_iterator(new_src.begin(), new_dst.begin()), new_end); - src = std::move(new_src); - dst = std::move(new_dst); + srcs = std::move(new_src); + dsts = std::move(new_dst); } else { - src = std::move(batch_src); - dst = std::move(batch_dst); + srcs = std::move(batch_srcs); + dsts = std::move(batch_dsts); } - src.resize(new_size, handle.get_stream()); - dst.resize(new_size, handle.get_stream()); - } else if (src.size() > 0) { - size_t current_end = src.size(); + srcs.resize(new_size, handle.get_stream()); + dsts.resize(new_size, handle.get_stream()); + } else if (srcs.size() > 0) { + size_t current_end = srcs.size(); - src.resize(src.size() + batch_src.size(), handle.get_stream()); - dst.resize(dst.size() + batch_dst.size(), handle.get_stream()); + srcs.resize(srcs.size() + batch_srcs.size(), handle.get_stream()); + dsts.resize(dsts.size() + batch_dsts.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), - thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), - thrust::make_zip_iterator(batch_src.end(), batch_dst.end()), - thrust::make_zip_iterator(src.begin(), dst.begin()) + current_end); + thrust::make_zip_iterator(batch_srcs.begin(), batch_dsts.begin()), + thrust::make_zip_iterator(batch_srcs.end(), batch_dsts.end()), + thrust::make_zip_iterator(srcs.begin(), dsts.begin()) + current_end); } else { - src = std::move(batch_src); - dst = std::move(batch_dst); + srcs = std::move(batch_srcs); + dsts = std::move(batch_dsts); } if (exact_number_of_samples) { - size_t current_sample_size = src.size(); + size_t 
current_sample_size = srcs.size(); if constexpr (multi_gpu) { current_sample_size = cugraph::host_scalar_allreduce( handle.get_comms(), current_sample_size, raft::comms::op_t::SUM, handle.get_stream()); @@ -412,16 +430,142 @@ std::tuple, rmm::device_uvector> negativ // FIXME: We could oversample and discard the unnecessary samples // to reduce the number of iterations in the outer loop, but it seems like // exact_number_of_samples is an edge case not worth optimizing for at this time. - samples_in_this_batch = num_samples - current_sample_size; + samples_in_this_batch = total_samples - current_sample_size; } else { samples_in_this_batch = 0; } } - src.shrink_to_fit(handle.get_stream()); - dst.shrink_to_fit(handle.get_stream()); + srcs.shrink_to_fit(handle.get_stream()); + dsts.shrink_to_fit(handle.get_stream()); + + if constexpr (multi_gpu) { + auto const& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + // Randomly shuffle the samples so that each gpu gets their + // desired number of samples + + if (!exact_number_of_samples) { + // If we didn't force generating the exact number of samples, + // we might have fewer samples than requested. We need to + // accommodate this situation. For now we'll just + // uniformly(-ish) reduce the requested size. 
+ size_t total_extracted = host_scalar_allreduce( + handle.get_comms(), srcs.size(), raft::comms::op_t::SUM, handle.get_stream()); + size_t reduction = total_samples - total_extracted; + + while (reduction > 0) { + size_t est_reduction_per_gpu = (reduction + comm_size - 1) / comm_size; + for (size_t i = 0; i < samples_per_gpu.size(); ++i) { + if (samples_per_gpu[i] > est_reduction_per_gpu) { + samples_per_gpu[i] -= est_reduction_per_gpu; + reduction -= est_reduction_per_gpu; + } else { + reduction -= samples_per_gpu[i]; + samples_per_gpu[i] = 0; + } + + if (reduction < est_reduction_per_gpu) est_reduction_per_gpu = reduction; + } + } + num_samples = samples_per_gpu[comm_rank]; + } + + // Mimic the logic of permute_range... + // + // 1) Randomly assign each entry to a GPU + // 2) Count how many are assigned to each GPU + // 3) Allgatherv (allgather?) to give each GPU a count for how many entries are destined for + // that GPU 4) Identify extras/deficits for each GPU, arbitrarily adjust counts to make correct + // 5) Shuffle accordingly + // + rmm::device_uvector gpu_assignment(srcs.size(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + gpu_assignment.data(), + gpu_assignment.size(), + int{0}, + int{comm_size}, + rng_state); + + thrust::sort_by_key(handle.get_thrust_policy(), + gpu_assignment.begin(), + gpu_assignment.end(), + thrust::make_zip_iterator(srcs.begin(), dsts.begin())); + + rmm::device_uvector d_send_counts(comm_size, handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_send_counts.begin(), + d_send_counts.end(), + [gpu_assignment_span = raft::device_span{ + gpu_assignment.data(), gpu_assignment.size()}] __device__(size_t i) { + auto begin = thrust::lower_bound( + thrust::seq, gpu_assignment_span.begin(), gpu_assignment_span.end(), static_cast(i)); + auto end = + thrust::upper_bound(thrust::seq, begin, gpu_assignment_span.end(), static_cast(i)); + return thrust::distance(begin, end); + }); + + 
std::vector tx_value_counts(comm_size, 0); + raft::update_host( + tx_value_counts.data(), d_send_counts.data(), d_send_counts.size(), handle.get_stream()); + + std::forward_as_tuple(std::tie(srcs, dsts), std::ignore) = + cugraph::shuffle_values(handle.get_comms(), + thrust::make_zip_iterator(srcs.begin(), dsts.begin()), + tx_value_counts, + handle.get_stream()); + + rmm::device_uvector fractional_random_numbers(srcs.size(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + fractional_random_numbers.data(), + fractional_random_numbers.size(), + float{0.0}, + float{1.0}, + rng_state); + thrust::sort_by_key(handle.get_thrust_policy(), + fractional_random_numbers.begin(), + fractional_random_numbers.end(), + thrust::make_zip_iterator(srcs.begin(), dsts.begin())); + + size_t nr_extras{0}; + size_t nr_deficits{0}; + if (srcs.size() > num_samples) { + nr_extras = srcs.size() - static_cast(num_samples); + } else { + nr_deficits = static_cast(num_samples) - srcs.size(); + } + + auto extra_srcs = cugraph::detail::device_allgatherv( + handle, comm, raft::device_span(srcs.data() + num_samples, nr_extras)); + // nr_extras > 0 ? nr_extras : 0)); + auto extra_dsts = cugraph::detail::device_allgatherv( + handle, comm, raft::device_span(dsts.data() + num_samples, nr_extras)); + // nr_extras > 0 ? 
nr_extras : 0)); + + srcs.resize(num_samples, handle.get_stream()); + dsts.resize(num_samples, handle.get_stream()); + auto deficits = + cugraph::host_scalar_allgather(handle.get_comms(), nr_deficits, handle.get_stream()); + + std::exclusive_scan(deficits.begin(), deficits.end(), deficits.begin(), vertex_t{0}); + + raft::copy(srcs.data() + num_samples - nr_deficits, + extra_srcs.begin() + deficits[comm_rank], + nr_deficits, + handle.get_stream()); + + raft::copy(dsts.data() + num_samples - nr_deficits, + extra_dsts.begin() + deficits[comm_rank], + nr_deficits, + handle.get_stream()); + } - return std::make_tuple(std::move(src), std::move(dst)); + return std::make_tuple(std::move(srcs), std::move(dsts)); } } // namespace cugraph diff --git a/cpp/src/sampling/neighbor_sampling_impl.hpp b/cpp/src/sampling/neighbor_sampling_impl.hpp index bbc0fbc17af..b759e479bc6 100644 --- a/cpp/src/sampling/neighbor_sampling_impl.hpp +++ b/cpp/src/sampling/neighbor_sampling_impl.hpp @@ -31,6 +31,7 @@ #include +#include #include namespace cugraph { @@ -123,9 +124,9 @@ neighbor_sample_impl(raft::handle_t const& handle, *edge_type_view, [valid_edge_type = i] __device__(auto src, auto dst, - thrust::nullopt_t, - thrust::nullopt_t, - /*thrust::nullopt_t*/ auto edge_type) { + cuda::std::nullopt_t, + cuda::std::nullopt_t, + /*cuda::std::nullopt_t*/ auto edge_type) { return edge_type == valid_edge_type; }, edge_mask.mutable_view(), diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 0b1d9dcdb56..440c0c7a6ec 100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -43,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -216,8 +216,8 @@ struct col_indx_extract_t { ptr_d_coalesced_v = original::raw_const_ptr(d_coalesced_src_v), row_offsets = row_offsets_, col_indices = col_indices_, - values = values_ ? thrust::optional{*values_} - : thrust::nullopt] __device__(auto indx, auto col_indx) { + values = values_ ? cuda::std::optional{*values_} + : cuda::std::nullopt] __device__(auto indx, auto col_indx) { auto delta = ptr_d_sizes[indx] - 1; auto v_indx = ptr_d_coalesced_v[indx * max_depth + delta]; auto start_row = row_offsets[v_indx]; diff --git a/cpp/src/sampling/random_walks_impl.cuh b/cpp/src/sampling/random_walks_impl.cuh index 6c10fc473f3..fbf0836dac5 100644 --- a/cpp/src/sampling/random_walks_impl.cuh +++ b/cpp/src/sampling/random_walks_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -39,7 +39,7 @@ #include -#include +#include #include #include @@ -54,14 +54,14 @@ template struct sample_edges_op_t { template __device__ std::enable_if_t, vertex_t> operator()( - vertex_t, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + vertex_t, vertex_t dst, cuda::std::nullopt_t, cuda::std::nullopt_t, cuda::std::nullopt_t) const { return dst; } template __device__ std::enable_if_t, thrust::tuple> operator()( - vertex_t, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, W w) const + vertex_t, vertex_t dst, cuda::std::nullopt_t, cuda::std::nullopt_t, W w) const { return thrust::make_tuple(dst, w); } @@ -69,8 +69,8 @@ struct sample_edges_op_t { template struct biased_random_walk_e_bias_op_t { - __device__ bias_t - operator()(vertex_t, vertex_t, bias_t src_out_weight_sum, thrust::nullopt_t, bias_t weight) const + __device__ bias_t operator()( + vertex_t, vertex_t, bias_t src_out_weight_sum, cuda::std::nullopt_t, bias_t weight) const { return weight / src_out_weight_sum; } @@ -79,7 +79,7 @@ struct biased_random_walk_e_bias_op_t { template struct biased_sample_edges_op_t { __device__ thrust::tuple operator()( - vertex_t, vertex_t dst, weight_t, thrust::nullopt_t, weight_t weight) const + vertex_t, vertex_t dst, weight_t, cuda::std::nullopt_t, weight_t weight) const { return thrust::make_tuple(dst, weight); } @@ -99,9 +99,9 @@ struct node2vec_random_walk_e_bias_op_t { __device__ std::enable_if_t, bias_t> operator()( thrust::tuple tagged_src, vertex_t dst, - thrust::nullopt_t, - thrust::nullopt_t, - thrust::nullopt_t) const + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) const { // Check tag (prev vert) for destination if (dst == thrust::get<1>(tagged_src)) { return 1.0 / p_; } @@ -126,8 +126,8 @@ struct node2vec_random_walk_e_bias_op_t { __device__ std::enable_if_t, bias_t> operator()( thrust::tuple tagged_src, vertex_t dst, - thrust::nullopt_t, - thrust::nullopt_t, + cuda::std::nullopt_t, + 
cuda::std::nullopt_t, W) const { // Check tag (prev vert) for destination @@ -155,9 +155,9 @@ struct node2vec_sample_edges_op_t { __device__ std::enable_if_t, vertex_t> operator()( thrust::tuple tagged_src, vertex_t dst, - thrust::nullopt_t, - thrust::nullopt_t, - thrust::nullopt_t) const + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) const { return dst; } @@ -166,8 +166,8 @@ struct node2vec_sample_edges_op_t { __device__ std::enable_if_t, thrust::tuple> operator()( thrust::tuple tagged_src, vertex_t dst, - thrust::nullopt_t, - thrust::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t, W w) const { return thrust::make_tuple(dst, w); diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp index 2c5658b32a5..d371b4141e0 100644 --- a/cpp/src/sampling/rw_traversals.hpp +++ b/cpp/src/sampling/rw_traversals.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,12 +27,12 @@ #include #include +#include #include #include #include #include #include -#include #include #include @@ -138,7 +138,7 @@ struct uniform_selector_t { { } - __device__ thrust::optional> operator()( + __device__ cuda::std::optional> operator()( vertex_t src_v, real_t rnd_val, vertex_t = 0 /* not used*/, @@ -146,7 +146,7 @@ struct uniform_selector_t { bool = false /* not used*/) const { auto crt_out_deg = ptr_d_cache_out_degs_[src_v]; - if (crt_out_deg == 0) return thrust::nullopt; // src_v is a sink + if (crt_out_deg == 0) return cuda::std::nullopt; // src_v is a sink vertex_t v_indx = static_cast(rnd_val >= 1.0 ? crt_out_deg - 1 : rnd_val * crt_out_deg); @@ -156,7 +156,8 @@ struct uniform_selector_t { auto weight_value = (values_ == nullptr ? 
weight_t{1} : values_[start_row + col_indx]); // account for un-weighted graphs - return thrust::optional{thrust::make_tuple(col_indices_[start_row + col_indx], weight_value)}; + return cuda::std::optional{ + thrust::make_tuple(col_indices_[start_row + col_indx], weight_value)}; } private: @@ -211,7 +212,7 @@ struct biased_selector_t { // Sum(weights(neighborhood(src_v))) are pre-computed and // stored in ptr_d_sum_weights_ (too expensive to check, here); // - __device__ thrust::optional> operator()( + __device__ cuda::std::optional> operator()( vertex_t src_v, real_t rnd_val, vertex_t = 0 /* not used*/, @@ -223,7 +224,7 @@ struct biased_selector_t { auto col_indx_begin = row_offsets_[src_v]; auto col_indx_end = row_offsets_[src_v + 1]; - if (col_indx_begin == col_indx_end) return thrust::nullopt; // src_v is a sink + if (col_indx_begin == col_indx_end) return cuda::std::nullopt; // src_v is a sink auto col_indx = col_indx_begin; auto prev_col_indx = col_indx; @@ -234,7 +235,7 @@ struct biased_selector_t { run_sum_w += values_[col_indx]; prev_col_indx = col_indx; } - return thrust::optional{ + return cuda::std::optional{ thrust::make_tuple(col_indices_[prev_col_indx], values_[prev_col_indx])}; } @@ -293,9 +294,9 @@ struct node2vec_selector_t { q_(q), coalesced_alpha_{ (max_degree > 0) && (num_paths > 0) && (ptr_alpha != nullptr) - ? thrust::optional>{thrust::make_tuple( + ? 
cuda::std::optional>{thrust::make_tuple( max_degree, num_paths, ptr_alpha)} - : thrust::nullopt} + : cuda::std::nullopt} { } @@ -324,7 +325,7 @@ struct node2vec_selector_t { } } - __device__ thrust::optional> operator()( + __device__ cuda::std::optional> operator()( vertex_t src_v, real_t rnd_val, vertex_t prev_v, edge_t path_index, bool start_path) const { auto const offset_indx_begin = row_offsets_[src_v]; @@ -333,7 +334,7 @@ struct node2vec_selector_t { weight_t sum_scaled_weights{0}; auto offset_indx = offset_indx_begin; - if (offset_indx_begin == offset_indx_end) return thrust::nullopt; // src_v is a sink + if (offset_indx_begin == offset_indx_end) return cuda::std::nullopt; // src_v is a sink // for 1st vertex in path just use biased random selection: // @@ -359,7 +360,7 @@ struct node2vec_selector_t { run_sum_w += crt_weight; prev_offset_indx = offset_indx; } - return thrust::optional{ + return cuda::std::optional{ thrust::make_tuple(col_indices_[prev_offset_indx], values_ == nullptr ? weight_t{1} : values_[prev_offset_indx])}; } @@ -402,7 +403,7 @@ struct node2vec_selector_t { run_sum_w += ptr_d_scaled_weights[start_alpha_offset + nghbr_indx]; prev_offset_indx = offset_indx; } - return thrust::optional{ + return cuda::std::optional{ thrust::make_tuple(col_indices_[prev_offset_indx], values_ == nullptr ? weight_t{1} : values_[prev_offset_indx])}; @@ -435,7 +436,7 @@ struct node2vec_selector_t { run_sum_w += scaled_weight; prev_offset_indx = offset_indx; } - return thrust::optional{ + return cuda::std::optional{ thrust::make_tuple(col_indices_[prev_offset_indx], values_ == nullptr ? 
weight_t{1} : values_[prev_offset_indx])}; } @@ -459,7 +460,7 @@ struct node2vec_selector_t { // this is information related to a scratchpad buffer, used as cache, hence mutable; // (necessary, because get_strategy() is const) // - mutable thrust::optional> + mutable cuda::std::optional> coalesced_alpha_; // tuple }; diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index 151350dad6d..ef1a31400f7 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -51,9 +52,9 @@ namespace { template struct edge_order_t { - thrust::optional> edgelist_label_offsets{thrust::nullopt}; - thrust::optional> edgelist_edge_types{thrust::nullopt}; - thrust::optional> edgelist_hops{thrust::nullopt}; + cuda::std::optional> edgelist_label_offsets{cuda::std::nullopt}; + cuda::std::optional> edgelist_edge_types{cuda::std::nullopt}; + cuda::std::optional> edgelist_hops{cuda::std::nullopt}; raft::device_span edgelist_majors{}; raft::device_span edgelist_minors{}; @@ -99,8 +100,8 @@ struct edge_order_t { template struct is_first_triplet_in_run_t { - thrust::optional> edgelist_label_offsets{thrust::nullopt}; - thrust::optional> edgelist_hops{thrust::nullopt}; + cuda::std::optional> edgelist_label_offsets{cuda::std::nullopt}; + cuda::std::optional> edgelist_hops{cuda::std::nullopt}; raft::device_span edgelist_majors{}; __device__ bool operator()(size_t i) const @@ -142,7 +143,7 @@ struct compute_label_index_t { template struct optionally_compute_label_index_t { - thrust::optional> edgelist_label_offsets{thrust::nullopt}; + cuda::std::optional> edgelist_label_offsets{cuda::std::nullopt}; __device__ label_index_t operator()(size_t i) const { @@ -2370,9 +2371,9 @@ heterogeneous_renumber_sampled_edgelist( cuda::proclaim_return_type( [edgelist_label_offsets = 
detail::to_thrust_optional(edgelist_label_offsets), edge_types = edgelist_edge_types - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*edgelist_edge_types).data(), (*edgelist_edge_types).size()) - : thrust::nullopt, + : cuda::std::nullopt, renumber_map = raft::device_span(segment_sorted_edge_id_renumber_map.data(), segment_sorted_edge_id_renumber_map.size()), @@ -2499,17 +2500,17 @@ sort_sampled_edge_tuples(raft::handle_t const& handle, handle.get_stream()); thrust::sequence(handle.get_thrust_policy(), indices.begin(), indices.end(), size_t{0}); edge_order_t edge_order_comp{ - edgelist_label_offsets ? thrust::make_optional>( + edgelist_label_offsets ? cuda::std::make_optional>( (*edgelist_label_offsets).data() + h_label_offsets[i], (h_label_offsets[i + 1] - h_label_offsets[i]) + 1) - : thrust::nullopt, + : cuda::std::nullopt, edgelist_edge_types && use_edge_type_as_sort_key - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*edgelist_edge_types).data() + h_edge_offsets[i], indices.size()) - : thrust::nullopt, - edgelist_hops ? thrust::make_optional>( + : cuda::std::nullopt, + edgelist_hops ? cuda::std::make_optional>( (*edgelist_hops).data() + h_edge_offsets[i], indices.size()) - : thrust::nullopt, + : cuda::std::nullopt, raft::device_span(edgelist_majors.data() + h_edge_offsets[i], indices.size()), raft::device_span(edgelist_minors.data() + h_edge_offsets[i], indices.size())}; @@ -2686,8 +2687,8 @@ renumber_and_compress_sampled_edgelist( auto label_index_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), optionally_compute_label_index_t{ - edgelist_label_offsets ? thrust::make_optional(*edgelist_label_offsets) - : thrust::nullopt}); + edgelist_label_offsets ? 
cuda::std::make_optional(*edgelist_label_offsets) + : cuda::std::nullopt}); auto input_key_first = thrust::make_zip_iterator(label_index_first, (*edgelist_hops).begin()); rmm::device_uvector unique_key_label_indices(min_vertices.size(), handle.get_stream()); @@ -2781,9 +2782,9 @@ renumber_and_compress_sampled_edgelist( thrust::make_counting_iterator(edgelist_majors.size()), is_first_triplet_in_run_t{ detail::to_thrust_optional(edgelist_label_offsets), - edgelist_hops ? thrust::make_optional>( + edgelist_hops ? cuda::std::make_optional>( (*edgelist_hops).data(), (*edgelist_hops).size()) - : thrust::nullopt, + : cuda::std::nullopt, raft::device_span( edgelist_majors.data(), edgelist_majors.size())}); // number of unique ((label), (hop), major) triplets @@ -2910,15 +2911,15 @@ renumber_and_compress_sampled_edgelist( major_vertex_counts.begin(), major_vertex_counts.end(), [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), - edgelist_hops = edgelist_hops ? thrust::make_optional>( + edgelist_hops = edgelist_hops ? cuda::std::make_optional>( (*edgelist_hops).data(), (*edgelist_hops).size()) - : thrust::nullopt, + : cuda::std::nullopt, edgelist_majors = raft::device_span(edgelist_majors.data(), edgelist_majors.size()), seed_vertices = renumbered_seed_vertices - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*renumbered_seed_vertices).data(), (*renumbered_seed_vertices).size()) - : thrust::nullopt, + : cuda::std::nullopt, seed_vertex_label_offsets = detail::to_thrust_optional(seed_vertex_label_offsets), num_hops, compress_per_hop] __device__(size_t i) { @@ -3045,9 +3046,9 @@ renumber_and_compress_sampled_edgelist( [major_vertex_counts = raft::device_span(major_vertex_counts.data(), major_vertex_counts.size()), minor_vertex_counts = minor_vertex_counts - ? thrust::make_optional>( + ? 
cuda::std::make_optional>( (*minor_vertex_counts).data(), (*minor_vertex_counts).size()) - : thrust::nullopt, + : cuda::std::nullopt, num_hops, compress_per_hop] __device__(size_t i) { auto vertex_count = major_vertex_counts[i]; @@ -3310,9 +3311,9 @@ renumber_and_sort_sampled_edgelist( (*edgelist_label_hop_offsets).begin(), cuda::proclaim_return_type( [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), - edgelist_hops = edgelist_hops ? thrust::make_optional>( + edgelist_hops = edgelist_hops ? cuda::std::make_optional>( (*edgelist_hops).data(), (*edgelist_hops).size()) - : thrust::nullopt, + : cuda::std::nullopt, num_hops, num_edges = edgelist_majors.size()] __device__(size_t i) { size_t start_offset{0}; @@ -3500,12 +3501,12 @@ heterogeneous_renumber_and_sort_sampled_edgelist( cuda::proclaim_return_type( [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), edgelist_edge_types = edgelist_edge_types - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*edgelist_edge_types).data(), (*edgelist_edge_types).size()) - : thrust::nullopt, - edgelist_hops = edgelist_hops ? thrust::make_optional>( + : cuda::std::nullopt, + edgelist_hops = edgelist_hops ? cuda::std::make_optional>( (*edgelist_hops).data(), (*edgelist_hops).size()) - : thrust::nullopt, + : cuda::std::nullopt, num_edge_types, num_hops, num_edges = edgelist_majors.size()] __device__(size_t i) { @@ -3653,9 +3654,9 @@ sort_sampled_edgelist(raft::handle_t const& handle, (*edgelist_label_hop_offsets).begin(), cuda::proclaim_return_type( [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), - edgelist_hops = edgelist_hops ? thrust::make_optional>( + edgelist_hops = edgelist_hops ? 
cuda::std::make_optional>( (*edgelist_hops).data(), (*edgelist_hops).size()) - : thrust::nullopt, + : cuda::std::nullopt, num_hops, num_edges = edgelist_majors.size()] __device__(size_t i) { size_t start_offset{0}; diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 31de9b1e5d3..f526a6788e1 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -47,7 +48,6 @@ #include #include #include -#include #include #include #include @@ -126,8 +126,8 @@ rmm::device_uvector compute_major_degrees( partition.vertex_partition_range_first(major_range_vertex_partition_id); auto offsets = edge_partition_offsets[i]; - auto masks = - edge_partition_masks ? thrust::make_optional((*edge_partition_masks)[i]) : thrust::nullopt; + auto masks = edge_partition_masks ? cuda::std::make_optional((*edge_partition_masks)[i]) + : cuda::std::nullopt; auto segment_offset_size_per_partition = edge_partition_segment_offsets.size() / static_cast(minor_comm_size); auto num_local_degrees = @@ -202,7 +202,8 @@ rmm::device_uvector compute_major_degrees( handle.get_thrust_policy(), degrees.begin(), degrees.end(), - [offsets, masks = masks ? thrust::make_optional(*masks) : thrust::nullopt] __device__(auto i) { + [offsets, + masks = masks ? cuda::std::make_optional(*masks) : cuda::std::nullopt] __device__(auto i) { auto local_degree = offsets[i + 1] - offsets[i]; if (masks) { local_degree = @@ -842,10 +843,10 @@ graph_view_t(this->local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? 
cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; thrust::transform(handle.get_thrust_policy(), sorted_edge_first + edge_partition_offsets[i], sorted_edge_first + edge_partition_offsets[i + 1], @@ -913,10 +914,10 @@ graph_view_t(this->local_edge_partition_view()); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, 0) - : thrust::nullopt; + : cuda::std::nullopt; thrust::transform( handle.get_thrust_policy(), edge_first, @@ -987,10 +988,10 @@ graph_view_t(this->local_edge_partition_view(i)); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) - : thrust::nullopt; + : cuda::std::nullopt; thrust::transform( handle.get_thrust_policy(), sorted_edge_first + edge_partition_offsets[i], @@ -1058,10 +1059,10 @@ graph_view_t(this->local_edge_partition_view()); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, 0) - : thrust::nullopt; + : cuda::std::nullopt; thrust::transform( handle.get_thrust_policy(), edge_first, diff --git a/cpp/src/structure/induced_subgraph_impl.cuh b/cpp/src/structure/induced_subgraph_impl.cuh index 3822055b037..e319dc03bbb 100644 --- a/cpp/src/structure/induced_subgraph_impl.cuh +++ b/cpp/src/structure/induced_subgraph_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,6 +34,7 @@ #include +#include #include #include #include @@ -44,7 +45,6 @@ #include #include #include -#include #include #include #include @@ -58,7 +58,7 @@ namespace detail { template struct induced_subgraph_weighted_edge_op { - using return_type = thrust::optional>; + using return_type = cuda::std::optional>; raft::device_span dst_subgraph_offsets; raft::device_span dst_subgraph_vertices; @@ -74,15 +74,15 @@ struct induced_subgraph_weighted_edge_op { dst_subgraph_vertices.data() + dst_subgraph_offsets[subgraph], dst_subgraph_vertices.data() + dst_subgraph_offsets[subgraph + 1], dst) - ? thrust::make_optional( + ? cuda::std::make_optional( thrust::make_tuple(thrust::get<0>(tagged_src), dst, wgt, subgraph)) - : thrust::nullopt; + : cuda::std::nullopt; } }; template struct induced_subgraph_unweighted_edge_op { - using return_type = thrust::optional>; + using return_type = cuda::std::optional>; raft::device_span dst_subgraph_offsets; raft::device_span dst_subgraph_vertices; @@ -91,15 +91,16 @@ struct induced_subgraph_unweighted_edge_op { vertex_t dst, property_t sv, property_t dv, - thrust::nullopt_t) const + cuda::std::nullopt_t) const { size_t subgraph = thrust::get<1>(tagged_src); return thrust::binary_search(thrust::seq, dst_subgraph_vertices.data() + dst_subgraph_offsets[subgraph], dst_subgraph_vertices.data() + dst_subgraph_offsets[subgraph + 1], dst) - ? thrust::make_optional(thrust::make_tuple(thrust::get<0>(tagged_src), dst, subgraph)) - : thrust::nullopt; + ? cuda::std::make_optional( + thrust::make_tuple(thrust::get<0>(tagged_src), dst, subgraph)) + : cuda::std::nullopt; } }; @@ -203,8 +204,8 @@ extract_induced_subgraphs( dst_subgraph_vertices = raft::device_span(dst_subgraph_vertices_v.data(), dst_subgraph_vertices_v.size()); - // 3. Call extract_transform_v_frontier_outgoing_e with a functor that returns thrust::nullopt if - // the destination vertex has a property of 0, return the edge if the destination vertex has a + // 3. 
Call extract_transform_v_frontier_outgoing_e with a functor that returns cuda::std::nullopt + // if the destination vertex has a property of 0, return the edge if the destination vertex has a // property of 1 vertex_frontier_t vertex_frontier(handle, 1); @@ -233,7 +234,7 @@ extract_induced_subgraphs( edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), *edge_weight_view, - detail::induced_subgraph_weighted_edge_op{ + detail::induced_subgraph_weighted_edge_op{ dst_subgraph_offsets, dst_subgraph_vertices}, do_expensive_check); @@ -253,7 +254,7 @@ extract_induced_subgraphs( edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), - detail::induced_subgraph_unweighted_edge_op{ + detail::induced_subgraph_unweighted_edge_op{ dst_subgraph_offsets, dst_subgraph_vertices}, do_expensive_check); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index ba40db1f085..2c2674f5bbe 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,6 +31,7 @@ #include +#include #include #include #include @@ -40,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -78,20 +78,26 @@ struct topdown_e_op_t { detail::edge_partition_endpoint_property_device_view_t visited_flags{}; vertex_t dst_first{}; - __device__ thrust::optional operator()( - vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + __device__ cuda::std::optional operator()(vertex_t src, + vertex_t dst, + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) const { auto dst_offset = dst - dst_first; auto old = prev_visited_flags.get(dst_offset); if (!old) { old = visited_flags.atomic_or(dst_offset, true); } - return old ? thrust::nullopt : thrust::optional{src}; + return old ? cuda::std::nullopt : cuda::std::optional{src}; } }; template struct bottomup_e_op_t { - __device__ vertex_t operator()( - vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + __device__ vertex_t operator()(vertex_t src, + vertex_t dst, + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) const { return dst; } @@ -103,8 +109,11 @@ struct bottomup_pred_op_t { prev_visited_flags{}; // visited in the previous iterations vertex_t dst_first{}; - __device__ bool operator()( - vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + __device__ bool operator()(vertex_t src, + vertex_t dst, + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) const { return prev_visited_flags.get(dst - dst_first); } @@ -260,10 +269,10 @@ void bfs(raft::handle_t const& handle, auto edge_mask_view = graph_view.edge_mask_view(); auto edge_partition_e_mask = edge_mask_view - ? thrust::make_optional< + ? 
cuda::std::make_optional< detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, partition_idx) - : thrust::nullopt; + : cuda::std::nullopt; auto high_and_mid_degree_segment_size = (*segment_offsets)[2]; // compute local degrees for high & mid degree segments only, for // low & hypersparse segments, use low_degree_threshold * diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh index d228460bec3..d0a7979d14b 100644 --- a/cpp/src/traversal/extract_bfs_paths_impl.cuh +++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include +#include #include #include #include @@ -52,7 +53,7 @@ template struct compute_max { vertex_t __device__ operator()(vertex_t lhs, vertex_t rhs) { - return thrust::max(lhs, rhs); + return cuda::std::max(lhs, rhs); } }; diff --git a/cpp/src/traversal/k_hop_nbrs_impl.cuh b/cpp/src/traversal/k_hop_nbrs_impl.cuh index 44fa21a5252..be462720e51 100644 --- a/cpp/src/traversal/k_hop_nbrs_impl.cuh +++ b/cpp/src/traversal/k_hop_nbrs_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,13 +30,13 @@ #include +#include #include #include #include #include #include #include -#include #include #include @@ -48,11 +48,11 @@ namespace { template struct e_op_t { - __device__ thrust::optional operator()(thrust::tuple tagged_src, - vertex_t, - thrust::nullopt_t, - thrust::nullopt_t, - thrust::nullopt_t) const + __device__ cuda::std::optional operator()(thrust::tuple tagged_src, + vertex_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) const { return thrust::get<1>(tagged_src); } diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index b3cd0d57c67..ffe706ca45d 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,12 +37,12 @@ #include #include +#include #include #include #include #include #include -#include #include #include #include @@ -133,11 +133,11 @@ struct e_op_t { weight_t cutoff{}; weight_t invalid_distance{}; - __device__ thrust::optional> operator()( + __device__ cuda::std::optional> operator()( thrust::tuple tagged_src, vertex_t dst, - thrust::nullopt_t, - thrust::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t, weight_t w) const { aggregate_vi_t aggregator{num_origins}; @@ -150,9 +150,9 @@ struct e_op_t { auto dst_val = key_to_dist_map.find(aggregator(thrust::make_tuple(dst, origin_idx))); if (dst_val != invalid_distance) { threshold = dst_val < threshold ? dst_val : threshold; } return (new_distance < threshold) - ? thrust::optional>{thrust::make_tuple(origin_idx, - new_distance)} - : thrust::nullopt; + ? 
cuda::std::optional>{thrust::make_tuple(origin_idx, + new_distance)} + : cuda::std::nullopt; } }; @@ -644,8 +644,8 @@ rmm::device_uvector od_shortest_distances( thrust::tuple, weight_t, vertex_t, - thrust::nullopt_t, - thrust::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t, weight_t, e_op_t> e_op_wrapper{e_op}; diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index 3429672b151..8006bbf4063 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,11 +32,11 @@ #include +#include #include #include #include #include -#include #include #include @@ -52,8 +52,8 @@ struct e_op_t { weight_t const* distances{}; weight_t cutoff{}; - __device__ thrust::optional> operator()( - vertex_t src, vertex_t dst, weight_t src_val, thrust::nullopt_t, weight_t w) const + __device__ cuda::std::optional> operator()( + vertex_t src, vertex_t dst, weight_t src_val, cuda::std::nullopt_t, weight_t w) const { auto push = true; auto new_distance = src_val + w; @@ -65,9 +65,9 @@ struct e_op_t { threshold = old_distance < threshold ? old_distance : threshold; } if (new_distance >= threshold) { push = false; } - return push ? thrust::optional>{thrust::make_tuple( + return push ? cuda::std::optional>{thrust::make_tuple( new_distance, src)} - : thrust::nullopt; + : cuda::std::nullopt; } }; @@ -223,11 +223,11 @@ void sssp(raft::handle_t const& handle, auto new_dist = thrust::get<0>(pushed_val); auto update = (new_dist < v_val); return thrust::make_tuple( - update ? thrust::optional{new_dist < near_far_threshold ? bucket_idx_next_near - : bucket_idx_far} - : thrust::nullopt, - update ? thrust::optional>{pushed_val} - : thrust::nullopt); + update ? 
cuda::std::optional{new_dist < near_far_threshold ? bucket_idx_next_near + : bucket_idx_far} + : cuda::std::nullopt, + update ? cuda::std::optional>{pushed_val} + : cuda::std::nullopt); }); vertex_frontier.bucket(bucket_idx_cur_near).clear(); @@ -250,9 +250,9 @@ void sssp(raft::handle_t const& handle, auto dist = *(distances + vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v)); return dist >= old_near_far_threshold - ? thrust::optional{dist < near_far_threshold ? bucket_idx_cur_near - : bucket_idx_far} - : thrust::nullopt; + ? cuda::std::optional{dist < near_far_threshold ? bucket_idx_cur_near + : bucket_idx_far} + : cuda::std::nullopt; }); near_size = vertex_frontier.bucket(bucket_idx_cur_near).aggregate_size(); far_size = vertex_frontier.bucket(bucket_idx_far).aggregate_size(); diff --git a/cpp/tests/components/mg_vertex_coloring_test.cu b/cpp/tests/components/mg_vertex_coloring_test.cu index 17327e35c97..89b1df9264e 100644 --- a/cpp/tests/components/mg_vertex_coloring_test.cu +++ b/cpp/tests/components/mg_vertex_coloring_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,6 +34,8 @@ #include +#include + #include #include @@ -130,7 +132,7 @@ class Tests_MGGraphColoring : cugraph::detail::edge_minor_property_view_t( d_colors.data(), vertex_t{0}), cugraph::edge_dummy_property_t{}.view(), - [] __device__(auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) { + [] __device__(auto src, auto dst, auto src_color, auto dst_color, cuda::std::nullopt_t) { if ((src != dst) && (src_color == dst_color)) { return uint8_t{1}; } else { @@ -168,7 +170,7 @@ class Tests_MGGraphColoring d_colors.begin(), vertex_t{0}), cugraph::edge_dummy_property_t{}.view(), [renumber_map = (*mg_renumber_map).data()] __device__( - auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) { + auto src, auto dst, auto src_color, auto dst_color, cuda::std::nullopt_t) { if ((src != dst) && (src_color == dst_color)) { return vertex_t{1}; } else { diff --git a/cpp/tests/components/vertex_coloring_test.cu b/cpp/tests/components/vertex_coloring_test.cu index fed64f272d7..cf55146c5ba 100644 --- a/cpp/tests/components/vertex_coloring_test.cu +++ b/cpp/tests/components/vertex_coloring_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,6 +30,8 @@ #include +#include + #include #include @@ -107,7 +109,7 @@ class Tests_SGGraphColoring cugraph::detail::edge_minor_property_view_t(d_colors.data(), vertex_t{0}), cugraph::edge_dummy_property_t{}.view(), - [] __device__(auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) { + [] __device__(auto src, auto dst, auto src_color, auto dst_color, cuda::std::nullopt_t) { if ((src != dst) && (src_color == dst_color)) { return uint8_t{1}; } else { @@ -142,7 +144,7 @@ class Tests_SGGraphColoring vertex_t{0}), cugraph::edge_dummy_property_t{}.view(), [renumber_map = (*sg_renumber_map).data()] __device__( - auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) { + auto src, auto dst, auto src_color, auto dst_color, cuda::std::nullopt_t) { if ((src != dst) && (src_color == dst_color)) { return vertex_t{1}; } else { diff --git a/cpp/tests/prims/mg_count_if_e.cu b/cpp/tests/prims/mg_count_if_e.cu index 63a785fb182..8796383f45d 100644 --- a/cpp/tests/prims/mg_count_if_e.cu +++ b/cpp/tests/prims/mg_count_if_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,11 +37,11 @@ #include #include +#include #include #include #include #include -#include #include #include @@ -127,15 +127,16 @@ class Tests_MGCountIfE hr_timer.start("MG count_if_e"); } - auto result = count_if_e( - *handle_, - mg_graph_view, - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - [] __device__(auto row, auto col, auto src_property, auto dst_property, thrust::nullopt_t) { - return src_property < dst_property; - }); + auto result = + count_if_e(*handle_, + mg_graph_view, + mg_src_prop.view(), + mg_dst_prop.view(), + cugraph::edge_dummy_property_t{}.view(), + [] __device__( + auto row, auto col, auto src_property, auto dst_property, cuda::std::nullopt_t) { + return src_property < dst_property; + }); if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -179,7 +180,7 @@ class Tests_MGCountIfE sg_dst_prop.view(), cugraph::edge_dummy_property_t{}.view(), [] __device__( - auto row, auto col, auto src_property, auto dst_property, thrust::nullopt_t) { + auto row, auto col, auto src_property, auto dst_property, cuda::std::nullopt_t) { return src_property < dst_property; }); ASSERT_TRUE(expected_result == result); diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu index d3d6524cbdb..27e3f471c5b 100644 --- a/cpp/tests/prims/mg_extract_transform_e.cu +++ b/cpp/tests/prims/mg_extract_transform_e.cu @@ -1,6 +1,6 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,11 +40,11 @@ #include #include +#include #include #include #include #include -#include #include #include #include @@ -65,12 +65,12 @@ struct e_op_t { std::is_same_v>); using return_type = - thrust::optional, - thrust::tuple, - thrust::tuple>>; + cuda::std::optional, + thrust::tuple, + thrust::tuple>>; __device__ return_type operator()( - vertex_t src, vertex_t dst, property_t src_val, property_t dst_val, thrust::nullopt_t) const + vertex_t src, vertex_t dst, property_t src_val, property_t dst_val, cuda::std::nullopt_t) const { auto output_payload = static_cast(1); if (src_val < dst_val) { @@ -82,7 +82,7 @@ struct e_op_t { src, dst, thrust::get<0>(output_payload), thrust::get<1>(output_payload)); } } else { - return thrust::nullopt; + return cuda::std::nullopt; } } }; diff --git a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu index a8393d84e43..0c625da0a6d 100644 --- a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu +++ b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -39,11 +39,11 @@ #include #include +#include #include #include #include #include -#include #include #include #include @@ -65,7 +65,7 @@ struct e_op_t { static_assert(std::is_same_v || std::is_same_v>); - using return_type = thrust::optional, std::conditional_t, thrust::tuple, @@ -78,7 +78,7 @@ struct e_op_t { vertex_t dst, property_t src_val, property_t dst_val, - thrust::nullopt_t) const + cuda::std::nullopt_t) const { auto output_payload = static_cast(1); if (src_val < dst_val) { @@ -109,7 +109,7 @@ struct e_op_t { } } } else { - return thrust::nullopt; + return cuda::std::nullopt; } } }; diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu index 386fce24a87..30a53cd15a4 100644 --- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,9 +41,9 @@ #include +#include #include #include -#include #include #include @@ -53,7 +53,7 @@ template struct e_bias_op_t { __device__ bias_t - operator()(vertex_t, vertex_t, thrust::nullopt_t, thrust::nullopt_t, bias_t bias) const + operator()(vertex_t, vertex_t, cuda::std::nullopt_t, cuda::std::nullopt_t, bias_t bias) const { return bias; } @@ -65,8 +65,11 @@ struct e_op_t { cugraph::to_thrust_tuple(property_t{}), cugraph::to_thrust_tuple(property_t{}))); - __device__ result_t operator()( - vertex_t src, vertex_t dst, property_t src_prop, property_t dst_prop, thrust::nullopt_t) const + __device__ result_t operator()(vertex_t src, + vertex_t dst, + property_t src_prop, + property_t dst_prop, + cuda::std::nullopt_t) const { if constexpr (cugraph::is_thrust_tuple_of_arithmetic::value) { static_assert(thrust::tuple_size::value == size_t{2}); @@ -401,18 +404,19 @@ class Tests_MGPerVRandomSelectTransformOutgoingE thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(mg_aggregate_frontier_vertices.size()), [frontier_vertex_first = mg_aggregate_frontier_vertices.begin(), - sample_offsets = mg_aggregate_sample_offsets ? thrust::make_optional( + sample_offsets = mg_aggregate_sample_offsets ? cuda::std::make_optional( (*mg_aggregate_sample_offsets).data()) - : thrust::nullopt, + : cuda::std::nullopt, sample_e_op_result_first = cugraph::get_dataframe_buffer_begin(mg_aggregate_sample_e_op_results), sg_offsets = sg_offsets.begin(), sg_indices = sg_indices.begin(), - sg_biases = sg_biases ? thrust::make_optional((*sg_biases).begin()) : thrust::nullopt, - K = prims_usecase.K, + sg_biases = + sg_biases ? cuda::std::make_optional((*sg_biases).begin()) : cuda::std::nullopt, + K = prims_usecase.K, with_replacement = prims_usecase.with_replacement, - invalid_value = - invalid_value ? thrust::make_optional(*invalid_value) : thrust::nullopt, + invalid_value = invalid_value ? 
cuda::std::make_optional(*invalid_value) + : cuda::std::nullopt, property_transform = cugraph::test::detail::vertex_property_transform{ hash_bin_count}] __device__(size_t i) { @@ -461,8 +465,8 @@ class Tests_MGPerVRandomSelectTransformOutgoingE auto sg_nbr_first = sg_indices + *(sg_offsets + sg_src); auto sg_nbr_last = sg_indices + *(sg_offsets + (sg_src + vertex_t{1})); auto sg_nbr_bias_first = - sg_biases ? thrust::make_optional((*sg_biases) + *(sg_offsets + sg_src)) - : thrust::nullopt; + sg_biases ? cuda::std::make_optional((*sg_biases) + *(sg_offsets + sg_src)) + : cuda::std::nullopt; if (sg_src != v) { return true; } if (sg_nbr_bias_first) { diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu index 3dd256544b4..040e0a6d716 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,12 +40,12 @@ #include #include +#include #include #include #include #include #include -#include #include #include @@ -218,7 +218,7 @@ class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE cugraph::edge_dummy_property_t{}.view(), mg_dst_key.view(), mg_kv_store.view(), - key_aggregated_e_op_t{}, + key_aggregated_e_op_t{}, property_initial_value, cugraph::reduce_op::plus{}, cugraph::get_dataframe_buffer_begin(mg_results[i])); @@ -245,7 +245,7 @@ class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE cugraph::edge_dummy_property_t{}.view(), mg_dst_key.view(), mg_kv_store.view(), - key_aggregated_e_op_t{}, + key_aggregated_e_op_t{}, property_initial_value, cugraph::reduce_op::elementwise_minimum{}, cugraph::get_dataframe_buffer_begin(mg_results[i])); @@ -272,7 +272,7 @@ class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE cugraph::edge_dummy_property_t{}.view(), mg_dst_key.view(), mg_kv_store.view(), - key_aggregated_e_op_t{}, + key_aggregated_e_op_t{}, property_initial_value, cugraph::reduce_op::elementwise_maximum{}, cugraph::get_dataframe_buffer_begin(mg_results[i])); @@ -414,7 +414,7 @@ class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE cugraph::edge_dummy_property_t{}.view(), sg_dst_key.view(), sg_kv_store.view(), - key_aggregated_e_op_t{}, + key_aggregated_e_op_t{}, property_initial_value, cugraph::reduce_op::plus{}, cugraph::get_dataframe_buffer_begin(global_result)); @@ -441,7 +441,7 @@ class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE cugraph::edge_dummy_property_t{}.view(), sg_dst_key.view(), sg_kv_store.view(), - key_aggregated_e_op_t{}, + key_aggregated_e_op_t{}, property_initial_value, cugraph::reduce_op::elementwise_minimum{}, cugraph::get_dataframe_buffer_begin(global_result)); @@ -468,7 +468,7 @@ class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE cugraph::edge_dummy_property_t{}.view(), sg_dst_key.view(), sg_kv_store.view(), - key_aggregated_e_op_t{}, + key_aggregated_e_op_t{}, property_initial_value, 
cugraph::reduce_op::elementwise_maximum{}, cugraph::get_dataframe_buffer_begin(global_result)); diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu index 41830b3017c..57d77f6c4bd 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,12 +40,12 @@ #include #include +#include #include #include #include #include #include -#include #include #include @@ -62,7 +62,7 @@ struct e_op_t { vertex_t dst, result_t src_property, result_t dst_property, - thrust::nullopt_t) const + cuda::std::nullopt_t) const { if (src_property < dst_property) { return src_property; diff --git a/cpp/tests/prims/mg_transform_e.cu b/cpp/tests/prims/mg_transform_e.cu index 3984c7cd86b..c94637cc657 100644 --- a/cpp/tests/prims/mg_transform_e.cu +++ b/cpp/tests/prims/mg_transform_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,10 +37,10 @@ #include +#include #include #include #include -#include #include #include @@ -177,7 +177,8 @@ class Tests_MGTransformE mg_src_prop.view(), mg_dst_prop.view(), cugraph::edge_dummy_property_t{}.view(), - [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) { + [] __device__( + auto src, auto dst, auto src_property, auto dst_property, cuda::std::nullopt_t) { if (src_property < dst_property) { return src_property; } else { @@ -192,7 +193,8 @@ class Tests_MGTransformE mg_src_prop.view(), mg_dst_prop.view(), cugraph::edge_dummy_property_t{}.view(), - [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) { + [] __device__( + auto src, auto dst, auto src_property, auto dst_property, cuda::std::nullopt_t) { if (src_property < dst_property) { return src_property; } else { diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu index e290f05e9e4..b5dcfaa7aa7 100644 --- a/cpp/tests/prims/mg_transform_reduce_e.cu +++ b/cpp/tests/prims/mg_transform_reduce_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,11 +38,11 @@ #include #include +#include #include #include #include #include -#include #include #include @@ -138,7 +138,8 @@ class Tests_MGTransformReduceE mg_src_prop.view(), mg_dst_prop.view(), cugraph::edge_dummy_property_t{}.view(), - [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) { + [] __device__( + auto src, auto dst, auto src_property, auto dst_property, cuda::std::nullopt_t) { if (src_property < dst_property) { return src_property; } else { @@ -189,7 +190,7 @@ class Tests_MGTransformReduceE sg_dst_prop.view(), cugraph::edge_dummy_property_t{}.view(), [] __device__( - auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) { + auto src, auto dst, auto src_property, auto dst_property, cuda::std::nullopt_t) { if (src_property < dst_property) { return src_property; } else { diff --git a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu index b050e314a15..830b48acade 100644 --- a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu +++ b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -39,11 +39,11 @@ #include #include +#include #include #include #include #include -#include #include #include @@ -149,7 +149,8 @@ class Tests_MGTransformReduceEBySrcDstKey mg_dst_prop.view(), cugraph::edge_dummy_property_t{}.view(), mg_src_key.view(), - [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) { + [] __device__( + auto src, auto dst, auto src_property, auto dst_property, cuda::std::nullopt_t) { if (src_property < dst_property) { return src_property; } else { @@ -179,7 +180,8 @@ class Tests_MGTransformReduceEBySrcDstKey mg_dst_prop.view(), cugraph::edge_dummy_property_t{}.view(), mg_dst_key.view(), - [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) { + [] __device__( + auto src, auto dst, auto src_property, auto dst_property, cuda::std::nullopt_t) { if (src_property < dst_property) { return src_property; } else { @@ -279,7 +281,7 @@ class Tests_MGTransformReduceEBySrcDstKey cugraph::edge_dummy_property_t{}.view(), sg_src_key.view(), [] __device__( - auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) { + auto src, auto dst, auto src_property, auto dst_property, cuda::std::nullopt_t) { if (src_property < dst_property) { return src_property; } else { @@ -301,7 +303,7 @@ class Tests_MGTransformReduceEBySrcDstKey cugraph::edge_dummy_property_t{}.view(), sg_dst_key.view(), [] __device__( - auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) { + auto src, auto dst, auto src_property, auto dst_property, cuda::std::nullopt_t) { if (src_property < dst_property) { return src_property; } else { diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu index 085077017b3..acc89491e56 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,11 +38,11 @@ #include #include +#include #include #include #include #include -#include #include #include #include @@ -62,25 +62,25 @@ struct e_op_t { vertex_t dst, property_t src_val, property_t dst_val, - thrust::nullopt_t) const + cuda::std::nullopt_t) const { if constexpr (std::is_same_v) { if constexpr (std::is_same_v) { - return src_val < dst_val ? thrust::optional{std::byte{0}} /* dummy */ - : thrust::nullopt; + return src_val < dst_val ? cuda::std::optional{std::byte{0}} /* dummy */ + : cuda::std::nullopt; } else { - return src_val < dst_val ? thrust::optional{static_cast(1)} - : thrust::nullopt; + return src_val < dst_val ? cuda::std::optional{static_cast(1)} + : cuda::std::nullopt; } } else { auto tag = thrust::get<1>(optionally_tagged_src); if constexpr (std::is_same_v) { - return src_val < dst_val ? thrust::optional{tag} : thrust::nullopt; + return src_val < dst_val ? cuda::std::optional{tag} : cuda::std::nullopt; } else { return src_val < dst_val - ? thrust::optional>{thrust::make_tuple( + ? cuda::std::optional>{thrust::make_tuple( tag, static_cast(1))} - : thrust::nullopt; + : cuda::std::nullopt; } } } diff --git a/cpp/tests/prims/result_compare.cuh b/cpp/tests/prims/result_compare.cuh index 5a1abb90e3c..7ee87d402cd 100644 --- a/cpp/tests/prims/result_compare.cuh +++ b/cpp/tests/prims/result_compare.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,8 +19,8 @@ #include +#include #include -#include #include #include @@ -36,7 +36,7 @@ namespace detail { template __host__ __device__ bool compare_arithmetic_scalar(T val0, T val1, - thrust::optional threshold_ratio) + cuda::std::optional threshold_ratio) { if (threshold_ratio) { return std::abs(val0 - val1) <= (std::max(std::abs(val0), std::abs(val1)) * *threshold_ratio); @@ -58,15 +58,16 @@ struct comparator { return detail::compare_arithmetic_scalar( t0, t1, - std::is_floating_point_v ? thrust::optional{threshold_ratio} : thrust::nullopt); + std::is_floating_point_v ? cuda::std::optional{threshold_ratio} : cuda::std::nullopt); } else { - auto val0 = thrust::get<0>(t0); - auto val1 = thrust::get<0>(t1); - auto passed = detail::compare_arithmetic_scalar( - val0, - val1, - std::is_floating_point_v ? thrust::optional{threshold_ratio} - : thrust::nullopt); + auto val0 = thrust::get<0>(t0); + auto val1 = thrust::get<0>(t1); + auto passed = + detail::compare_arithmetic_scalar(val0, + val1, + std::is_floating_point_v + ? cuda::std::optional{threshold_ratio} + : cuda::std::nullopt); if (!passed) return false; if constexpr (thrust::tuple_size::value >= 2) { @@ -76,8 +77,8 @@ struct comparator { detail::compare_arithmetic_scalar(val0, val1, std::is_floating_point_v - ? thrust::optional{threshold_ratio} - : thrust::nullopt); + ? cuda::std::optional{threshold_ratio} + : cuda::std::nullopt); if (!passed) return false; } if constexpr (thrust::tuple_size::value >= 3) { diff --git a/cpp/tests/sampling/detail/sampling_post_processing_validate.cu b/cpp/tests/sampling/detail/sampling_post_processing_validate.cu index a0babc3b921..ac0523bbce7 100644 --- a/cpp/tests/sampling/detail/sampling_post_processing_validate.cu +++ b/cpp/tests/sampling/detail/sampling_post_processing_validate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -383,26 +384,27 @@ bool compare_heterogeneous_edgelist( this_label_org_sorted_indices.begin(), this_label_org_sorted_indices.end(), [edge_types = org_edgelist_edge_types - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*org_edgelist_edge_types).data() + label_start_offset, label_end_offset - label_start_offset) - : thrust::nullopt, - hops = org_edgelist_hops ? thrust::make_optional>( + : cuda::std::nullopt, + hops = org_edgelist_hops ? cuda::std::make_optional>( (*org_edgelist_hops).data() + label_start_offset, label_end_offset - label_start_offset) - : thrust::nullopt, + : cuda::std::nullopt, srcs = raft::device_span(org_edgelist_srcs.data() + label_start_offset, label_end_offset - label_start_offset), dsts = raft::device_span(org_edgelist_dsts.data() + label_start_offset, label_end_offset - label_start_offset), - weights = org_edgelist_weights ? thrust::make_optional>( + weights = org_edgelist_weights ? cuda::std::make_optional>( (*org_edgelist_weights).data() + label_start_offset, label_end_offset - label_start_offset) - : thrust::nullopt, - edge_ids = org_edgelist_edge_ids ? thrust::make_optional>( - (*org_edgelist_edge_ids).data() + label_start_offset, - label_end_offset - label_start_offset) - : thrust::nullopt] __device__(size_t l_idx, size_t r_idx) { + : cuda::std::nullopt, + edge_ids = org_edgelist_edge_ids + ? 
cuda::std::make_optional>( + (*org_edgelist_edge_ids).data() + label_start_offset, + label_end_offset - label_start_offset) + : cuda::std::nullopt] __device__(size_t l_idx, size_t r_idx) { edge_type_t l_edge_type{0}; edge_type_t r_edge_type{0}; if (edge_types) { @@ -673,15 +675,15 @@ bool compare_heterogeneous_edgelist( raft::device_span(this_edge_type_unrenumbered_edgelist_dsts.data(), this_edge_type_unrenumbered_edgelist_dsts.size()), weights = renumbered_edgelist_weights - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*renumbered_edgelist_weights).data() + edge_type_start_offset, edge_type_end_offset - edge_type_start_offset) - : thrust::nullopt, + : cuda::std::nullopt, edge_ids = renumbered_edgelist_edge_ids - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*renumbered_edgelist_edge_ids).data() + edge_type_start_offset, edge_type_end_offset - edge_type_start_offset) - : thrust::nullopt] __device__(size_t l_idx, size_t r_idx) { + : cuda::std::nullopt] __device__(size_t l_idx, size_t r_idx) { vertex_t l_src = srcs[l_idx]; vertex_t r_src = srcs[r_idx]; @@ -721,15 +723,15 @@ bool compare_heterogeneous_edgelist( raft::device_span(org_edgelist_dsts.data() + label_start_offset, label_end_offset - label_start_offset), org_weights = org_edgelist_weights - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*org_edgelist_weights).data() + label_start_offset, label_end_offset - label_start_offset) - : thrust::nullopt, + : cuda::std::nullopt, org_edge_ids = org_edgelist_edge_ids - ? thrust::make_optional>( + ? 
cuda::std::make_optional>( (*org_edgelist_edge_ids).data() + label_start_offset, label_end_offset - label_start_offset) - : thrust::nullopt, + : cuda::std::nullopt, unrenumbered_srcs = raft::device_span(this_edge_type_unrenumbered_edgelist_srcs.data(), this_edge_type_unrenumbered_edgelist_srcs.size()), @@ -738,16 +740,16 @@ bool compare_heterogeneous_edgelist( this_edge_type_unrenumbered_edgelist_dsts.size()), unrenumbered_weights = renumbered_edgelist_weights - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*renumbered_edgelist_weights).data() + edge_type_start_offset, edge_type_end_offset - edge_type_start_offset) - : thrust::nullopt, + : cuda::std::nullopt, unrenumbered_edge_ids = unrenumbered_edgelist_edge_ids - ? thrust::make_optional>( + ? cuda::std::make_optional>( (*unrenumbered_edgelist_edge_ids).data(), (*unrenumbered_edgelist_edge_ids).size()) - : thrust:: + : cuda::std:: nullopt] __device__(size_t org_idx /* from label_start_offset */, size_t unrenumbered_idx /* from edge_type_start_offset */) { diff --git a/cpp/tests/sampling/mg_negative_sampling.cpp b/cpp/tests/sampling/mg_negative_sampling.cpp index 7c64bb7fbbb..eb9f4fbb394 100644 --- a/cpp/tests/sampling/mg_negative_sampling.cpp +++ b/cpp/tests/sampling/mg_negative_sampling.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "cugraph/utilities/host_scalar_comm.hpp" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" #include "utilities/property_generator_utilities.hpp" @@ -85,8 +86,9 @@ class Tests_MGNegative_Sampling : public ::testing::TestWithParamview()); } - size_t num_samples = - graph_view.compute_number_of_edges(*handle_) * negative_sampling_usecase.sample_multiplier; + size_t num_samples = graph_view.compute_number_of_edges(*handle_) * + negative_sampling_usecase.sample_multiplier / + handle_->get_comms().get_size(); rmm::device_uvector src_bias_v(0, handle_->get_stream()); rmm::device_uvector dst_bias_v(0, handle_->get_stream()); @@ -150,26 +152,8 @@ class Tests_MGNegative_Sampling : public ::testing::TestWithParam{src_out.data(), src_out.size()}, raft::device_span{dst_out.data(), dst_out.size()}); - // TODO: Move this to validation_utilities... - auto h_vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); - rmm::device_uvector d_vertex_partition_range_lasts( - h_vertex_partition_range_lasts.size(), handle_->get_stream()); - raft::update_device(d_vertex_partition_range_lasts.data(), - h_vertex_partition_range_lasts.data(), - h_vertex_partition_range_lasts.size(), - handle_->get_stream()); - - size_t error_count = cugraph::test::count_edges_on_wrong_int_gpu( - *handle_, - raft::device_span{src_out.data(), src_out.size()}, - raft::device_span{dst_out.data(), dst_out.size()}, - raft::device_span{d_vertex_partition_range_lasts.data(), - d_vertex_partition_range_lasts.size()}); - - ASSERT_EQ(error_count, 0) << "generate edges out of range > 0"; - if ((negative_sampling_usecase.remove_duplicates) && (src_out.size() > 0)) { - error_count = cugraph::test::count_duplicate_vertex_pairs_sorted( + size_t error_count = cugraph::test::count_duplicate_vertex_pairs_sorted( *handle_, raft::device_span{src_out.data(), src_out.size()}, raft::device_span{dst_out.data(), dst_out.size()}); @@ -184,7 +168,7 @@ class 
Tests_MGNegative_Sampling : public ::testing::TestWithParam( *handle_, graph_view, std::nullopt, std::nullopt, std::nullopt, std::nullopt); - error_count = cugraph::test::count_intersection( + size_t error_count = cugraph::test::count_intersection( *handle_, raft::device_span{graph_src.data(), graph_src.size()}, raft::device_span{graph_dst.data(), graph_dst.size()}, @@ -202,7 +186,9 @@ class Tests_MGNegative_Sampling : public ::testing::TestWithParamget_comms(), src_out.size(), raft::comms::op_t::SUM, handle_->get_stream()); - ASSERT_EQ(sz, num_samples) << "Expected exact number of samples"; + size_t aggregate_sample_count = cugraph::host_scalar_allreduce( + handle_->get_comms(), num_samples, raft::comms::op_t::SUM, handle_->get_stream()); + ASSERT_EQ(sz, aggregate_sample_count) << "Expected exact number of samples"; } // TBD: How do we determine if we have properly reflected the biases? diff --git a/cpp/tests/utilities/check_utilities.hpp b/cpp/tests/utilities/check_utilities.hpp index a22d95c87de..6974d14be04 100644 --- a/cpp/tests/utilities/check_utilities.hpp +++ b/cpp/tests/utilities/check_utilities.hpp @@ -97,7 +97,7 @@ struct device_nearly_equal { bool __device__ operator()(type_t lhs, type_t rhs) const { return std::abs(lhs - rhs) < - cuda::std::max(thrust::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + cuda::std::max(cuda::std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); } }; diff --git a/cpp/tests/utilities/property_generator_kernels.cuh b/cpp/tests/utilities/property_generator_kernels.cuh index 78b22e0dac2..5c4bc00cdfa 100644 --- a/cpp/tests/utilities/property_generator_kernels.cuh +++ b/cpp/tests/utilities/property_generator_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ #include "prims/update_edge_src_dst_property.cuh" #include "utilities/property_generator_utilities.hpp" -#include +#include #include #include @@ -69,8 +69,11 @@ template struct edge_property_transform { int32_t mod{}; - constexpr __device__ property_t operator()( - vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + constexpr __device__ property_t operator()(vertex_t src, + vertex_t dst, + cuda::std::nullopt_t, + cuda::std::nullopt_t, + cuda::std::nullopt_t) const { static_assert(cugraph::is_thrust_tuple_of_arithmetic::value || std::is_arithmetic_v); diff --git a/dependencies.yaml b/dependencies.yaml index 02fa03cff70..5419be9beda 100755 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: [conda] matrix: - cuda: ["11.8", "12.5"] + cuda: ["11.8", "12.8"] arch: [x86_64] includes: - checks @@ -279,6 +279,10 @@ dependencies: cuda: "12.5" packages: - cuda-version=12.5 + - matrix: + cuda: "12.8" + packages: + - cuda-version=12.8 cuda: specific: - output_types: [conda] @@ -300,7 +304,7 @@ dependencies: - cuda-nvtx common_build: common: - - output_types: [conda, pyproject] + - output_types: [conda, requirements, pyproject] packages: - &cmake_ver cmake>=3.26.4,!=3.30.0 - ninja diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index 2edafe95716..6ac4ca142ee 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -223,6 +223,7 @@ def __get_call_groups( input_id: TensorType, seeds_per_call: int, assume_equal_input_size: bool = False, + label: Optional[TensorType] = None, ): torch = import_optional("torch") @@ -231,6 +232,8 @@ def __get_call_groups( # many batches. 
seeds_call_groups = torch.split(seeds, seeds_per_call, dim=-1) index_call_groups = torch.split(input_id, seeds_per_call, dim=-1) + if label is not None: + label_call_groups = torch.split(label, seeds_per_call, dim=-1) # Need to add empties to the list of call groups to handle the case # where not all ranks have the same number of call groups. This @@ -251,8 +254,16 @@ def __get_call_groups( [torch.tensor([], dtype=torch.int64, device=input_id.device)] * (int(num_call_groups) - len(index_call_groups)) ) + if label is not None: + label_call_groups = list(label_call_groups) + ( + [torch.tensor([], dtype=label.dtype, device=label.device)] + * (int(num_call_groups) - len(label_call_groups)) + ) - return seeds_call_groups, index_call_groups + if label is not None: + return seeds_call_groups, index_call_groups, label_call_groups + else: + return seeds_call_groups, index_call_groups def sample_from_nodes( self, @@ -344,7 +355,7 @@ def sample_from_nodes( def __sample_from_edges_func( self, call_id: int, - current_seeds_and_ix: Tuple["torch.Tensor", "torch.Tensor"], + current_seeds_and_ix: Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"], batch_id_start: int, batch_size: int, batches_per_call: int, @@ -353,7 +364,7 @@ def __sample_from_edges_func( ) -> Union[None, Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]]: torch = import_optional("torch") - current_seeds, current_ix = current_seeds_and_ix + current_seeds, current_ix, current_label = current_seeds_and_ix num_seed_edges = current_ix.numel() # The index gets stored as-is regardless of what makes it into @@ -468,6 +479,7 @@ def __sample_from_edges_func( random_state=random_state, ) minibatch_dict["input_index"] = current_ix.cuda() + minibatch_dict["input_label"] = current_label.cuda() minibatch_dict["input_offsets"] = input_offsets minibatch_dict[ "edge_inverse" @@ -505,6 +517,7 @@ def sample_from_edges( random_state: int = 62, assume_equal_input_size: bool = False, input_id: Optional[TensorType] = None, + 
input_label: Optional[TensorType] = None, ) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]: """ Performs sampling starting from seed edges. @@ -527,6 +540,10 @@ def sample_from_edges( Input ids corresponding to the original batch tensor, if it was permuted prior to calling this function. If present, will be saved with the samples. + input_label: Optional[TensorType] + Input labels corresponding to the input seeds. Typically used + for link prediction sampling. If present, will be saved with + the samples. Generally not compatible with negative sampling. """ torch = import_optional("torch") @@ -545,12 +562,20 @@ def sample_from_edges( local_num_batches, assume_equal_input_size=assume_equal_input_size ) - edges_call_groups, index_call_groups = self.__get_call_groups( + groups = self.__get_call_groups( edges, input_id, actual_seed_edges_per_call, assume_equal_input_size=input_size_is_equal, + label=input_label, ) + if len(groups) == 2: + edges_call_groups, index_call_groups = groups + label_call_groups = [torch.tensor([], dtype=torch.int32)] * len( + edges_call_groups + ) + else: + edges_call_groups, index_call_groups, label_call_groups = groups sample_args = [ batch_id_start, @@ -563,14 +588,14 @@ def sample_from_edges( if self.__writer is None: # Buffered sampling return BufferedSampleReader( - zip(edges_call_groups, index_call_groups), + zip(edges_call_groups, index_call_groups, label_call_groups), self.__sample_from_edges_func, *sample_args, ) else: # Unbuffered sampling for i, current_seeds_and_ix in enumerate( - zip(edges_call_groups, index_call_groups) + zip(edges_call_groups, index_call_groups, label_call_groups) ): sample_args[0] = self.__sample_from_edges_func( i, diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index dfe3b085fdf..060d4ee1e99 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -72,7 +72,7 @@ build-dir = "build/{wheel_tag}" cmake.build-type = "Release" cmake.version = 
"CMakeLists.txt" minimum-version = "build-system.requires" -ninja.make-fallback = true +ninja.make-fallback = false sdist.reproducible = true wheel.packages = ["cugraph"] diff --git a/python/libcugraph/pyproject.toml b/python/libcugraph/pyproject.toml index a6191e28000..9d85bfa5dac 100644 --- a/python/libcugraph/pyproject.toml +++ b/python/libcugraph/pyproject.toml @@ -53,14 +53,14 @@ select = [ ] # detect when package size grows significantly -max_allowed_size_compressed = '1.2G' +max_allowed_size_compressed = '1.4G' [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" cmake.version = "CMakeLists.txt" minimum-version = "build-system.requires" -ninja.make-fallback = true +ninja.make-fallback = false sdist.reproducible = true wheel.packages = ["libcugraph"] wheel.install-dir = "libcugraph" diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml index ac124e1fd5f..3c50a79bfa3 100644 --- a/python/pylibcugraph/pyproject.toml +++ b/python/pylibcugraph/pyproject.toml @@ -57,7 +57,7 @@ build-dir = "build/{wheel_tag}" cmake.build-type = "Release" cmake.version = "CMakeLists.txt" minimum-version = "build-system.requires" -ninja.make-fallback = true +ninja.make-fallback = false sdist.reproducible = true wheel.packages = ["pylibcugraph"]