diff --git a/libcudacxx/include/cuda/std/__bit/clz.h b/libcudacxx/include/cuda/std/__bit/clz.h index 279253ef559..4b40d0e1330 100644 --- a/libcudacxx/include/cuda/std/__bit/clz.h +++ b/libcudacxx/include/cuda/std/__bit/clz.h @@ -104,15 +104,18 @@ _LIBCUDACXX_HIDE_FROM_ABI int __runtime_clz(_Tp __x) noexcept NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return sizeof(_Tp) == sizeof(uint32_t) ? __clz(static_cast(__x)) // : __clzll(static_cast(__x));), - (return _CUDA_VSTD::__host_runtime_clz(__x);)) + (return __x == 0 ? numeric_limits<_Tp>::digits : _CUDA_VSTD::__host_runtime_clz(__x);)) } +// __cccl_clz returns numeric_limits<_Tp>::digits if __x == 0 on both host and device template _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(_Tp __x) noexcept { static_assert(is_same_v<_Tp, uint32_t> || is_same_v<_Tp, uint64_t>); #if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) - return is_constant_evaluated() ? _CUDA_VSTD::__constexpr_clz(__x) : _CUDA_VSTD::__runtime_clz(__x); + return is_constant_evaluated() + ? (__x == 0 ? numeric_limits<_Tp>::digits : _CUDA_VSTD::__constexpr_clz(__x)) + : _CUDA_VSTD::__runtime_clz(__x); #else return _CUDA_VSTD::__constexpr_clz(__x); #endif diff --git a/libcudacxx/include/cuda/std/__bit/countl.h b/libcudacxx/include/cuda/std/__bit/countl.h index e59a968fcb0..b1060e7c332 100644 --- a/libcudacxx/include/cuda/std/__bit/countl.h +++ b/libcudacxx/include/cuda/std/__bit/countl.h @@ -40,12 +40,9 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept { return numeric_limits<_Tp>::digits; } - using _Sp = _If; - auto __clz_result = _CUDA_VSTD::__cccl_clz(static_cast<_Sp>(__t)); - __clz_result -= numeric_limits<_Sp>::digits - numeric_limits<_Tp>::digits; - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (return __clz_result;), // if __t == 0 __clz_result is already equal to numeric_limits<_Tp>::digits - (return __t == 0 ? 
numeric_limits<_Tp>::digits : __clz_result;)) + using _Sp = _If; + constexpr auto __digits_diff = numeric_limits<_Sp>::digits - numeric_limits<_Tp>::digits; + return _CUDA_VSTD::__cccl_clz(static_cast<_Sp>(__t)) - __digits_diff; } _CCCL_TEMPLATE(class _Tp) diff --git a/libcudacxx/include/cuda/std/__bit/countr.h b/libcudacxx/include/cuda/std/__bit/countr.h index aaa1fa4a32c..fb81f44b4db 100644 --- a/libcudacxx/include/cuda/std/__bit/countr.h +++ b/libcudacxx/include/cuda/std/__bit/countr.h @@ -40,12 +40,8 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countr_zero(_Tp __t) noexcept { return numeric_limits<_Tp>::digits; } - using _Sp = _If; - auto __ctz_result = _CUDA_VSTD::__cccl_ctz(static_cast<_Sp>(__t)); - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - // if __t == 0 __ctz_result is already equal to numeric_limits<_Tp>::digits - (return sizeof(_Tp) < sizeof(uint32_t) && __t == 0 ? numeric_limits<_Tp>::digits : __ctz_result;), - (return __t == 0 ? numeric_limits<_Tp>::digits : __ctz_result;)) + using _Sp = _If; + return _CUDA_VSTD::__cccl_ctz(static_cast<_Sp>(__t)); } _CCCL_TEMPLATE(class _Tp) diff --git a/libcudacxx/include/cuda/std/__bit/ctz.h b/libcudacxx/include/cuda/std/__bit/ctz.h index ded2f316a3d..66b8c20f30f 100644 --- a/libcudacxx/include/cuda/std/__bit/ctz.h +++ b/libcudacxx/include/cuda/std/__bit/ctz.h @@ -114,15 +114,18 @@ _LIBCUDACXX_HIDE_FROM_ABI int __runtime_ctz(_Tp __x) noexcept NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return sizeof(_Tp) == sizeof(uint32_t) ? __clz(__brev(static_cast(__x))) : __clzll(__brevll(static_cast(__x)));), - (return _CUDA_VSTD::__host_runtime_ctz(__x);)) + (return __x == 0 ? 
numeric_limits<_Tp>::digits : _CUDA_VSTD::__host_runtime_ctz(__x);)) } +// __cccl_ctz returns numeric_limits<_Tp>::digits if __x == 0 on both host and device template _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_ctz(_Tp __x) noexcept { static_assert(is_same_v<_Tp, uint32_t> || is_same_v<_Tp, uint64_t>); #if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) - return is_constant_evaluated() ? _CUDA_VSTD::__constexpr_ctz(__x) : _CUDA_VSTD::__runtime_ctz(__x); + return is_constant_evaluated() + ? (__x == 0 ? numeric_limits<_Tp>::digits : _CUDA_VSTD::__constexpr_ctz(__x)) + : _CUDA_VSTD::__runtime_ctz(__x); #else NV_IF_ELSE_TARGET(NV_IS_DEVICE, // (return _CUDA_VSTD::__constexpr_ctz(__x);),