Skip to content

Commit

Permalink
Merge branch 'develop' into dev/add-cuda-managed-memory
Browse files Browse the repository at this point in the history
  • Loading branch information
Carsten Griwodz committed Aug 15, 2024
2 parents 23811c2 + 8623b69 commit 29f8f35
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 49 deletions.
7 changes: 0 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ option(PopSift_USE_NVTX_PROFILING "Use CUDA NVTX for profiling." OFF)
option(PopSift_ERRCHK_AFTER_KERNEL "Synchronize and check CUDA error after every kernel." OFF)
option(PopSift_USE_POSITION_INDEPENDENT_CODE "Generate position independent code." ON)
option(PopSift_USE_GRID_FILTER "Switch off grid filtering to massively reduce compile time while debugging other things." ON)
option(PopSift_USE_NORMF "The __normf function computes Euclidean distance on large arrays. Fast but stability is uncertain." OFF)
option(PopSift_NVCC_WARNINGS "Switch on several additional warning for CUDA nvcc" OFF)
option(PopSift_USE_TEST_CMD "Add testing step for functional verification" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries" ON)
Expand Down Expand Up @@ -141,12 +140,6 @@ set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CUDA_STANDARD_REQUIRED ON)


if(PopSift_USE_NORMF AND CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
set(PopSift_HAVE_NORMF 1)
else()
set(PopSift_HAVE_NORMF 0)
endif()

if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
set(PopSift_HAVE_SHFL_DOWN_SYNC 1)
else()
Expand Down
1 change: 0 additions & 1 deletion cmake/sift_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
#define POPSIFT_IS_UNDEFINED(F) F() == 0

#define POPSIFT_HAVE_SHFL_DOWN_SYNC() @PopSift_HAVE_SHFL_DOWN_SYNC@
#define POPSIFT_HAVE_NORMF() @PopSift_HAVE_NORMF@
#define POPSIFT_DISABLE_GRID_FILTER() @DISABLE_GRID_FILTER@
#define POPSIFT_USE_NVTX() @PopSift_USE_NVTX@

58 changes: 17 additions & 41 deletions src/popsift/s_desc_norm_l2.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,24 +50,10 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
float4 descr;
descr = ptr4[threadIdx.x];

#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_NORMF)
// normf() is an elegant function: sqrt(sum_0^127{v^2})
// It exists since CUDA 7.5, but trouble with CUB on the GTX 980 Ti forces
// us to stick with CUDA 7.0 right now

float norm;

if( threadIdx.x == 0 ) {
norm = normf( 128, src_desc );
}
__syncthreads();
norm = popsift::shuffle( norm, 0 );

descr.x = min( descr.x, 0.2f*norm );
descr.y = min( descr.y, 0.2f*norm );
descr.z = min( descr.z, 0.2f*norm );
descr.w = min( descr.w, 0.2f*norm );

// 32 threads compute 4 squares each, then shuffle to perform an addition by
// reduction for the sum of 128 squares, leaving the result in thread 0
norm = descr.x * descr.x
+ descr.y * descr.y
+ descr.z * descr.z
Expand All @@ -77,34 +63,25 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
norm += popsift::shuffle_down( norm, 4 );
norm += popsift::shuffle_down( norm, 2 );
norm += popsift::shuffle_down( norm, 1 );
if( threadIdx.x == 0 ) {
// norm = __fsqrt_rn( norm );
// norm = __fdividef( 512.0f, norm );
norm = __frsqrt_rn( norm ); // inverse square root
norm = scalbnf( norm, d_consts.norm_multi );
}
#else // not HAVE_NORMF
float norm;

norm = descr.x * descr.x
+ descr.y * descr.y
+ descr.z * descr.z
+ descr.w * descr.w;
norm += popsift::shuffle_down( norm, 16 );
norm += popsift::shuffle_down( norm, 8 );
norm += popsift::shuffle_down( norm, 4 );
norm += popsift::shuffle_down( norm, 2 );
norm += popsift::shuffle_down( norm, 1 );
if( threadIdx.x == 0 ) {
norm = __fsqrt_rn( norm );
// compute 1 / sqrt(sum) in round-to-nearest even mode in thread 0
norm = __frsqrt_rn( norm );
}

// spread the inverted norm from thread 0 to all threads in the warp
norm = popsift::shuffle( norm, 0 );

descr.x = min( descr.x, 0.2f*norm );
descr.y = min( descr.y, 0.2f*norm );
descr.z = min( descr.z, 0.2f*norm );
descr.w = min( descr.w, 0.2f*norm );
// quasi-normalize all 128 floats
descr.x = min( descr.x*norm, 0.2f );
descr.y = min( descr.y*norm, 0.2f );
descr.z = min( descr.z*norm, 0.2f );
descr.w = min( descr.w*norm, 0.2f );

// Repeat the procedure, but also add a multiplier. E.g., if the user wants
// the descriptors as bytes rather than floats, multiply by 256 - or even by 512
// for better accuracy, which is OK because a point cannot be a keypoint if more
// than half of its gradient is in a single direction.
norm = descr.x * descr.x
+ descr.y * descr.y
+ descr.z * descr.z
Expand All @@ -114,13 +91,12 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
norm += popsift::shuffle_down( norm, 4 );
norm += popsift::shuffle_down( norm, 2 );
norm += popsift::shuffle_down( norm, 1 );

if( threadIdx.x == 0 ) {
// norm = __fsqrt_rn( norm );
// norm = __fdividef( 512.0f, norm );
norm = __frsqrt_rn( norm ); // inverse square root
norm = scalbnf( norm, d_consts.norm_multi );
}
#endif // HAVE_NORMF

norm = popsift::shuffle( norm, 0 );

descr.x = descr.x * norm;
Expand Down

0 comments on commit 29f8f35

Please sign in to comment.