From 8623b697ac632f40d74f8773d646d6a804e89893 Mon Sep 17 00:00:00 2001
From: Carsten Griwodz <griff@uio.no>
Date: Wed, 14 Aug 2024 21:16:51 +0200
Subject: [PATCH] Remove broken and unused code path from L2 normalization
 (#166)

undefined
---
 CMakeLists.txt               |  7 -----
 cmake/sift_config.h.in       |  1 -
 src/popsift/s_desc_norm_l2.h | 58 +++++++++++-------------------------
 3 files changed, 17 insertions(+), 49 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index de88e943..3e9138a8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,7 +32,6 @@ option(PopSift_USE_NVTX_PROFILING     "Use CUDA NVTX for profiling." OFF)
 option(PopSift_ERRCHK_AFTER_KERNEL     "Synchronize and check CUDA error after every kernel." OFF)
 option(PopSift_USE_POSITION_INDEPENDENT_CODE "Generate position independent code." ON)
 option(PopSift_USE_GRID_FILTER "Switch off grid filtering to massively reduce compile time while debugging other things." ON)
-option(PopSift_USE_NORMF "The __normf function computes Euclidean distance on large arrays. Fast but stability is uncertain." OFF)
 option(PopSift_NVCC_WARNINGS "Switch on several additional warning for CUDA nvcc" OFF)
 option(PopSift_USE_TEST_CMD "Add testing step for functional verification" OFF)
 option(BUILD_SHARED_LIBS "Build shared libraries" ON)
@@ -141,12 +140,6 @@ set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
 
-if(PopSift_USE_NORMF AND CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
-  set(PopSift_HAVE_NORMF   1)
-else()
-  set(PopSift_HAVE_NORMF   0)
-endif()
-
 if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
   set(PopSift_HAVE_SHFL_DOWN_SYNC   1)
 else()
diff --git a/cmake/sift_config.h.in b/cmake/sift_config.h.in
index 86095a55..b6807983 100644
--- a/cmake/sift_config.h.in
+++ b/cmake/sift_config.h.in
@@ -12,7 +12,6 @@
 #define POPSIFT_IS_UNDEFINED(F) F() == 0
 
 #define POPSIFT_HAVE_SHFL_DOWN_SYNC()     @PopSift_HAVE_SHFL_DOWN_SYNC@
-#define POPSIFT_HAVE_NORMF()              @PopSift_HAVE_NORMF@
 #define POPSIFT_DISABLE_GRID_FILTER()     @DISABLE_GRID_FILTER@
 #define POPSIFT_USE_NVTX()                @PopSift_USE_NVTX@
 
diff --git a/src/popsift/s_desc_norm_l2.h b/src/popsift/s_desc_norm_l2.h
index 3a7ed858..b067d71f 100644
--- a/src/popsift/s_desc_norm_l2.h
+++ b/src/popsift/s_desc_norm_l2.h
@@ -50,24 +50,10 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
     float4 descr;
     descr = ptr4[threadIdx.x];
 
-#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_NORMF)
-    // normf() is an elegant function: sqrt(sum_0^127{v^2})
-    // It exists from CUDA 7.5 but the trouble with CUB on the GTX 980 Ti forces
-    // us to with CUDA 7.0 right now
-
     float norm;
 
-    if( threadIdx.x == 0 ) {
-        norm = normf( 128, src_desc );
-    }
-    __syncthreads();
-    norm = popsift::shuffle( norm, 0 );
-
-    descr.x = min( descr.x, 0.2f*norm );
-    descr.y = min( descr.y, 0.2f*norm );
-    descr.z = min( descr.z, 0.2f*norm );
-    descr.w = min( descr.w, 0.2f*norm );
-
+    // 32 threads compute 4 squares each, then shuffle to performing a addition by
+    // reduction for the sum of 128 squares, result in thread 0
     norm = descr.x * descr.x
          + descr.y * descr.y
          + descr.z * descr.z
@@ -77,34 +63,25 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
     norm += popsift::shuffle_down( norm,  4 );
     norm += popsift::shuffle_down( norm,  2 );
     norm += popsift::shuffle_down( norm,  1 );
-    if( threadIdx.x == 0 ) {
-        // norm = __fsqrt_rn( norm );
-        // norm = __fdividef( 512.0f, norm );
-        norm = __frsqrt_rn( norm ); // inverse square root
-        norm = scalbnf( norm, d_consts.norm_multi );
-    }
-#else // not HAVE_NORMF
-    float norm;
 
-    norm = descr.x * descr.x
-         + descr.y * descr.y
-         + descr.z * descr.z
-         + descr.w * descr.w;
-    norm += popsift::shuffle_down( norm, 16 );
-    norm += popsift::shuffle_down( norm,  8 );
-    norm += popsift::shuffle_down( norm,  4 );
-    norm += popsift::shuffle_down( norm,  2 );
-    norm += popsift::shuffle_down( norm,  1 );
     if( threadIdx.x == 0 ) {
-        norm = __fsqrt_rn( norm );
+        // compute 1 / sqrt(sum) in round-to-nearest even mode in thread 0
+        norm = __frsqrt_rn( norm );
     }
+
+    // spread the inverted norm from thread 0 to all threads in the warp
     norm = popsift::shuffle( norm,  0 );
 
-    descr.x = min( descr.x, 0.2f*norm );
-    descr.y = min( descr.y, 0.2f*norm );
-    descr.z = min( descr.z, 0.2f*norm );
-    descr.w = min( descr.w, 0.2f*norm );
+    // quasi-normalize all 128 floats
+    descr.x = min( descr.x*norm, 0.2f );
+    descr.y = min( descr.y*norm, 0.2f );
+    descr.z = min( descr.z*norm, 0.2f );
+    descr.w = min( descr.w*norm, 0.2f );
 
+    // Repeat the procedure, but also add a multiplier. E.g., if the user wants to
+    // descriptors as bytes rather than floats, multiply by 256 - or even by 512
+    // for better accuracy, which is OK because a point cannot be a keypoint if more
+    // than half of its gradient is in a single direction.
     norm = descr.x * descr.x
          + descr.y * descr.y
          + descr.z * descr.z
@@ -114,13 +91,12 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
     norm += popsift::shuffle_down( norm,  4 );
     norm += popsift::shuffle_down( norm,  2 );
     norm += popsift::shuffle_down( norm,  1 );
+
     if( threadIdx.x == 0 ) {
-        // norm = __fsqrt_rn( norm );
-        // norm = __fdividef( 512.0f, norm );
         norm = __frsqrt_rn( norm ); // inverse square root
         norm = scalbnf( norm, d_consts.norm_multi );
     }
-#endif // HAVE_NORMF
+
     norm = popsift::shuffle( norm,  0 );
 
     descr.x = descr.x * norm;