diff --git a/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h
index b536a87fb63..9d9c7b7c8d4 100644
--- a/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h
+++ b/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h
@@ -27,6 +27,8 @@
 #include <cuda/std/cstddef>
 #include <cuda/std/cstdint>
 
+#if _CCCL_HAS_CUDA_COMPILER
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 /*************************************************************
@@ -101,9 +103,9 @@ inline _CCCL_DEVICE _Tp* __from_ptr_gmem(_CUDA_VSTD::size_t __ptr)
 template <typename _Tp>
 inline _CCCL_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val)
 {
-#if _CCCL_STD_VER >= 2017
+#  if _CCCL_STD_VER >= 2017
   static_assert(sizeof(_Tp) == 4, "");
-#endif // _CCCL_STD_VER >= 2017
+#  endif // _CCCL_STD_VER >= 2017
   // Consider using std::bitcast
   return *reinterpret_cast<_CUDA_VSTD::uint32_t*>(&__val);
 }
@@ -111,13 +113,15 @@ inline _CCCL_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val)
 template <typename _Tp>
 inline _CCCL_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val)
 {
-#if _CCCL_STD_VER >= 2017
+#  if _CCCL_STD_VER >= 2017
   static_assert(sizeof(_Tp) == 8, "");
-#endif // _CCCL_STD_VER >= 2017
+#  endif // _CCCL_STD_VER >= 2017
   // Consider using std::bitcast
   return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val);
 }
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
 
+#endif // _CCCL_HAS_CUDA_COMPILER
+
 #endif // _CUDA_PTX_HELPER_FUNCTIONS_H_
diff --git a/libcudacxx/include/cuda/discard_memory b/libcudacxx/include/cuda/discard_memory
index 6da2ea209c4..5177b7ee407 100644
--- a/libcudacxx/include/cuda/discard_memory
+++ b/libcudacxx/include/cuda/discard_memory
@@ -21,11 +21,12 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/std/cstddef>
 #include <cuda/std/cstdint>
 
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
-inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, size_t __nbytes) noexcept
+inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, _CUDA_VSTD::size_t __nbytes) noexcept
 {
   // The discard PTX instruction is only available with PTX ISA 7.4 and later
 #if __cccl_ptx_isa < 740ULL
diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline
index d034c931644..c9ee75ae111 100644
--- a/libcudacxx/include/cuda/pipeline
+++ b/libcudacxx/include/cuda/pipeline
@@ -141,6 +141,8 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__memcpy_async/completion_mechanism.h>
+#include <cuda/__memcpy_async/memcpy_async_barrier.h>
 #include <cuda/atomic>
 #include <cuda/barrier>
 #include <cuda/std/chrono>
diff --git a/libcudacxx/include/cuda/std/__exception/cuda_error.h b/libcudacxx/include/cuda/std/__exception/cuda_error.h
index 40af7d6c3e6..fdc32cf0571 100644
--- a/libcudacxx/include/cuda/std/__exception/cuda_error.h
+++ b/libcudacxx/include/cuda/std/__exception/cuda_error.h
@@ -22,10 +22,6 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_CUDA_COMPILER(CLANG)
-#  include <cuda_runtime_api.h>
-#endif // _CCCL_CUDA_COMPILER(CLANG)
-
 #include <cuda/std/__exception/terminate.h>
 
 #if !_CCCL_COMPILER(NVRTC)
@@ -40,8 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 /**
  * @brief Exception thrown when a CUDA error is encountered.
  */
-#if _CCCL_HAS_CUDA_COMPILER
-#  ifndef _CCCL_NO_EXCEPTIONS
+#ifndef _CCCL_NO_EXCEPTIONS
 class cuda_error : public ::std::runtime_error
 {
 private:
@@ -50,37 +45,36 @@ class cuda_error : public ::std::runtime_error
     char __buffer[256];
   };
 
-  static char* __format_cuda_error(::cudaError_t __status, const char* __msg, char* __msg_buffer) noexcept
+  static char* __format_cuda_error(const int __status, const char* __msg, char* __msg_buffer) noexcept
   {
     ::snprintf(__msg_buffer, 256, "cudaError %d: %s", __status, __msg);
     return __msg_buffer;
   }
 
 public:
-  cuda_error(::cudaError_t __status, const char* __msg, __msg_storage __msg_buffer = {0}) noexcept
+  cuda_error(const int __status, const char* __msg, __msg_storage __msg_buffer = {0}) noexcept
       : ::std::runtime_error(__format_cuda_error(__status, __msg, __msg_buffer.__buffer))
   {}
 };
 
-_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(::cudaError_t __status, const char* __msg)
+_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(const int __status, const char* __msg)
 {
   NV_IF_ELSE_TARGET(NV_IS_HOST,
                     (throw ::cuda::cuda_error(__status, __msg);),
                     ((void) __status; (void) __msg; _CUDA_VSTD_NOVERSION::terminate();))
 }
-#  else // ^^^ !_CCCL_NO_EXCEPTIONS ^^^ / vvv _CCCL_NO_EXCEPTIONS vvv
+#else // ^^^ !_CCCL_NO_EXCEPTIONS ^^^ / vvv _CCCL_NO_EXCEPTIONS vvv
 class cuda_error
 {
 public:
-  _LIBCUDACXX_HIDE_FROM_ABI cuda_error(::cudaError_t, const char*) noexcept {}
+  _LIBCUDACXX_HIDE_FROM_ABI cuda_error(const int, const char*) noexcept {}
 };
 
-_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(::cudaError_t, const char*)
+_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(const int, const char*)
 {
   _CUDA_VSTD_NOVERSION::terminate();
 }
-#  endif // _CCCL_NO_EXCEPTIONS
-#endif // _CCCL_CUDA_COMPILER
+#endif // _CCCL_NO_EXCEPTIONS
 
 _LIBCUDACXX_END_NAMESPACE_CUDA
 
diff --git a/libcudacxx/include/cuda/std/detail/__access_property b/libcudacxx/include/cuda/std/detail/__access_property
index 9ab8eac51d8..8d85b166938 100644
--- a/libcudacxx/include/cuda/std/detail/__access_property
+++ b/libcudacxx/include/cuda/std/detail/__access_property
@@ -129,6 +129,8 @@
  * (v. August 20, 2021)
  */
 
+#include <cuda_runtime_api.h>
+
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
 namespace __detail_ap
@@ -136,12 +138,12 @@ namespace __detail_ap
 
 _CCCL_HOST_DEVICE constexpr uint32_t __ap_floor_log2(uint32_t __x)
 {
-  return (__x == 1 | __x == 0) ? 0 : 1 + __ap_floor_log2(__x >> 1);
+  return ((__x == 1) || (__x == 0)) ? 0 : 1 + __ap_floor_log2(__x >> 1);
 }
 
 _CCCL_HOST_DEVICE constexpr uint32_t __ap_ceil_log2(uint32_t __x)
 {
-  return (__x == 1 | __x == 0) ? 0 : __ap_floor_log2(__x - 1) + 1;
+  return ((__x == 1) || (__x == 0)) ? 0 : __ap_floor_log2(__x - 1) + 1;
 }
 
 _CCCL_HOST_DEVICE constexpr uint32_t __ap_min(uint32_t __a, uint32_t __b) noexcept
@@ -429,7 +431,7 @@ _CCCL_HOST_DEVICE constexpr std::uint64_t __block(
   cudaAccessProperty __hit_prop,
   cudaAccessProperty __miss_prop = cudaAccessPropertyNormal)
 {
-  return (__total_bytes <= (size_t{0xFFFFFFFF}) & __total_bytes != 0 & __hit_bytes <= __total_bytes)
+  return (__total_bytes <= (size_t{0xFFFFFFFF}) && __total_bytes != 0 && __hit_bytes <= __total_bytes)
          ? __sm_80::__block_descriptor_builder(
              reinterpret_cast<std::uintptr_t>(__ptr),
              __hit_bytes,
diff --git a/libcudacxx/include/cuda/std/detail/__annotated_ptr b/libcudacxx/include/cuda/std/detail/__annotated_ptr
index 1991fdab2e2..7a477245cc4 100644
--- a/libcudacxx/include/cuda/std/detail/__annotated_ptr
+++ b/libcudacxx/include/cuda/std/detail/__annotated_ptr
@@ -137,15 +137,16 @@ namespace __detail_ap
 template <typename _Property>
 _CCCL_DEVICE void* __associate_address_space(void* __ptr, _Property __prop)
 {
+#if _CCCL_HAS_CUDA_COMPILER
   if (std::is_same<_Property, access_property::shared>::value == true)
   {
     bool __b = __isShared(__ptr);
     _CCCL_ASSERT(__b, "");
-#if defined(_CCCL_BUILTIN_ASSUME)
+#  if defined(_CCCL_BUILTIN_ASSUME)
     _CCCL_BUILTIN_ASSUME(__b);
-#else // ^^^ _CCCL_BUILTIN_ASSUME ^^^ / vvv !_CCCL_BUILTIN_ASSUME vvv
+#  else // ^^^ _CCCL_BUILTIN_ASSUME ^^^ / vvv !_CCCL_BUILTIN_ASSUME vvv
     (void) __b;
-#endif // !_CCCL_BUILTIN_ASSUME
+#  endif // !_CCCL_BUILTIN_ASSUME
   }
   else if (std::is_same<_Property, access_property::global>::value == true
            || std::is_same<_Property, access_property::normal>::value == true
@@ -155,12 +156,13 @@ _CCCL_DEVICE void* __associate_address_space(void* __ptr, _Property __prop)
   {
     bool __b = __isGlobal(__ptr);
     _CCCL_ASSERT(__b, "");
-#if defined(_CCCL_BUILTIN_ASSUME)
+#  if defined(_CCCL_BUILTIN_ASSUME)
     _CCCL_BUILTIN_ASSUME(__b);
-#else // ^^^ !_CCCL_BUILTIN_ASSUME ^^^ / vvv _CCCL_BUILTIN_ASSUME vvv
+#  else // ^^^ _CCCL_BUILTIN_ASSUME ^^^ / vvv !_CCCL_BUILTIN_ASSUME vvv
     (void) __b;
-#endif // !_CCCL_BUILTIN_ASSUME
+#  endif // !_CCCL_BUILTIN_ASSUME
   }
+#endif // _CCCL_HAS_CUDA_COMPILER
 
   return __ptr;
 }
@@ -174,6 +176,7 @@ _CCCL_DEVICE void* __associate_descriptor(void* __ptr, __Prop __prop)
 template <>
 inline _CCCL_DEVICE void* __associate_descriptor(void* __ptr, std::uint64_t __prop)
 {
+  (void) __prop;
   NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __nv_associate_access_property(__ptr, __prop);), (return __ptr;))
 }
 
diff --git a/libcudacxx/include/cuda/stream_ref b/libcudacxx/include/cuda/stream_ref
index a8b044909eb..857a35f6da4 100644
--- a/libcudacxx/include/cuda/stream_ref
+++ b/libcudacxx/include/cuda/stream_ref
@@ -38,9 +38,6 @@ private:
 }  // cuda
 */
 
-#include <cuda_runtime_api.h>
-// cuda_runtime_api needs to come first
-
 #include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
@@ -51,6 +48,8 @@ private:
 #  pragma system_header
 #endif // no system header
 
+#include <cuda_runtime_api.h>
+
 #include <cuda/std/__cuda/api_wrapper.h>
 #include <cuda/std/__exception/cuda_error.h>
 #include <cuda/std/cstddef>
diff --git a/libcudacxx/test/public_headers_host_only/CMakeLists.txt b/libcudacxx/test/public_headers_host_only/CMakeLists.txt
index 1bc51d17e10..3cc44d510c0 100644
--- a/libcudacxx/test/public_headers_host_only/CMakeLists.txt
+++ b/libcudacxx/test/public_headers_host_only/CMakeLists.txt
@@ -2,12 +2,18 @@
 # without anything else but also pretents to be a std header
 add_custom_target(libcudacxx.test.public_headers_host_only)
 
+if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+  find_package(NVHPC)
+else()
+  find_package(CUDAToolkit)
+endif()
+
 # Grep all public headers
 file(GLOB public_headers_host_only
   LIST_DIRECTORIES false
   RELATIVE "${libcudacxx_SOURCE_DIR}/include/"
   CONFIGURE_DEPENDS
-  "${libcudacxx_SOURCE_DIR}/include/cuda/std/*"
+  "${libcudacxx_SOURCE_DIR}/include/cuda/*"
 )
 
 # mdspan is currently not supported on msvc outside of C++20
@@ -36,6 +42,13 @@ function(libcudacxx_add_std_header_test header)
     target_compile_definitions(headertest_std_${header_name} PRIVATE CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING)
   endif()
 
+  # Headers within <cuda/> must build with a host-only compiler, but they still need cuda_runtime_api.h, so link the CUDA runtime
+  if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    target_link_libraries(headertest_std_${header_name} NVHPC::CUDART)
+  else()
+    target_link_libraries(headertest_std_${header_name} CUDA::cudart)
+  endif()
+
   add_dependencies(libcudacxx.test.public_headers_host_only headertest_std_${header_name})
 endfunction()