diff --git a/CMakeLists.txt b/CMakeLists.txt index de88e943..3e9138a8 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,6 @@ option(PopSift_USE_NVTX_PROFILING "Use CUDA NVTX for profiling." OFF) option(PopSift_ERRCHK_AFTER_KERNEL "Synchronize and check CUDA error after every kernel." OFF) option(PopSift_USE_POSITION_INDEPENDENT_CODE "Generate position independent code." ON) option(PopSift_USE_GRID_FILTER "Switch off grid filtering to massively reduce compile time while debugging other things." ON) -option(PopSift_USE_NORMF "The __normf function computes Euclidean distance on large arrays. Fast but stability is uncertain." OFF) option(PopSift_NVCC_WARNINGS "Switch on several additional warning for CUDA nvcc" OFF) option(PopSift_USE_TEST_CMD "Add testing step for functional verification" OFF) option(BUILD_SHARED_LIBS "Build shared libraries" ON) @@ -141,12 +140,6 @@ set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD}) set(CMAKE_CUDA_STANDARD_REQUIRED ON) -if(PopSift_USE_NORMF AND CUDA_VERSION VERSION_GREATER_EQUAL "7.5") - set(PopSift_HAVE_NORMF 1) -else() - set(PopSift_HAVE_NORMF 0) -endif() - if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0") set(PopSift_HAVE_SHFL_DOWN_SYNC 1) else() diff --git a/cmake/sift_config.h.in b/cmake/sift_config.h.in index 86095a55..b6807983 100644 --- a/cmake/sift_config.h.in +++ b/cmake/sift_config.h.in @@ -12,7 +12,6 @@ #define POPSIFT_IS_UNDEFINED(F) F() == 0 #define POPSIFT_HAVE_SHFL_DOWN_SYNC() @PopSift_HAVE_SHFL_DOWN_SYNC@ -#define POPSIFT_HAVE_NORMF() @PopSift_HAVE_NORMF@ #define POPSIFT_DISABLE_GRID_FILTER() @DISABLE_GRID_FILTER@ #define POPSIFT_USE_NVTX() @PopSift_USE_NVTX@ diff --git a/src/popsift/s_desc_norm_l2.h b/src/popsift/s_desc_norm_l2.h index 3a7ed858..b067d71f 100644 --- a/src/popsift/s_desc_norm_l2.h +++ b/src/popsift/s_desc_norm_l2.h @@ -50,24 +50,10 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool float4 descr; descr = ptr4[threadIdx.x]; -#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_NORMF) 
- // normf() is an elegant function: sqrt(sum_0^127{v^2}) - // It exists from CUDA 7.5 but the trouble with CUB on the GTX 980 Ti forces - // us to with CUDA 7.0 right now - float norm; - if( threadIdx.x == 0 ) { - norm = normf( 128, src_desc ); - } - __syncthreads(); - norm = popsift::shuffle( norm, 0 ); - - descr.x = min( descr.x, 0.2f*norm ); - descr.y = min( descr.y, 0.2f*norm ); - descr.z = min( descr.z, 0.2f*norm ); - descr.w = min( descr.w, 0.2f*norm ); - + // 32 threads compute 4 squares each, then shuffle to perform an addition by + // reduction for the sum of 128 squares, result in thread 0 norm = descr.x * descr.x + descr.y * descr.y + descr.z * descr.z @@ -77,34 +63,25 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool norm += popsift::shuffle_down( norm, 4 ); norm += popsift::shuffle_down( norm, 2 ); norm += popsift::shuffle_down( norm, 1 ); - if( threadIdx.x == 0 ) { - // norm = __fsqrt_rn( norm ); - // norm = __fdividef( 512.0f, norm ); - norm = __frsqrt_rn( norm ); // inverse square root - norm = scalbnf( norm, d_consts.norm_multi ); - } -#else // not HAVE_NORMF - float norm; - norm = descr.x * descr.x - + descr.y * descr.y - + descr.z * descr.z - + descr.w * descr.w; - norm += popsift::shuffle_down( norm, 16 ); - norm += popsift::shuffle_down( norm, 8 ); - norm += popsift::shuffle_down( norm, 4 ); - norm += popsift::shuffle_down( norm, 2 ); - norm += popsift::shuffle_down( norm, 1 ); if( threadIdx.x == 0 ) { - norm = __fsqrt_rn( norm ); + // compute 1 / sqrt(sum) in round-to-nearest even mode in thread 0 + norm = __frsqrt_rn( norm ); } + + // spread the inverted norm from thread 0 to all threads in the warp norm = popsift::shuffle( norm, 0 ); - descr.x = min( descr.x, 0.2f*norm ); - descr.y = min( descr.y, 0.2f*norm ); - descr.z = min( descr.z, 0.2f*norm ); - descr.w = min( descr.w, 0.2f*norm ); + // quasi-normalize all 128 floats + descr.x = min( descr.x*norm, 0.2f ); + descr.y = min( descr.y*norm, 0.2f ); + 
descr.z = min( descr.z*norm, 0.2f ); + descr.w = min( descr.w*norm, 0.2f ); + // Repeat the procedure, but also add a multiplier. E.g., if the user wants the + // descriptors as bytes rather than floats, multiply by 256 - or even by 512 + // for better accuracy, which is OK because a point cannot be a keypoint if more + // than half of its gradient is in a single direction. norm = descr.x * descr.x + descr.y * descr.y + descr.z * descr.z @@ -114,13 +91,12 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool norm += popsift::shuffle_down( norm, 4 ); norm += popsift::shuffle_down( norm, 2 ); norm += popsift::shuffle_down( norm, 1 ); + if( threadIdx.x == 0 ) { - // norm = __fsqrt_rn( norm ); - // norm = __fdividef( 512.0f, norm ); norm = __frsqrt_rn( norm ); // inverse square root norm = scalbnf( norm, d_consts.norm_multi ); } -#endif // HAVE_NORMF + norm = popsift::shuffle( norm, 0 ); descr.x = descr.x * norm;