diff --git a/cmake/pcl_find_cuda.cmake b/cmake/pcl_find_cuda.cmake index 26b6d908d5d..466d033feac 100644 --- a/cmake/pcl_find_cuda.cmake +++ b/cmake/pcl_find_cuda.cmake @@ -12,6 +12,7 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.11) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.17) find_package(CUDAToolkit QUIET) + set(CUDA_TOOLKIT_INCLUDE ${CUDAToolkit_INCLUDE_DIRS}) else() set(CUDA_FIND_QUIETLY TRUE) find_package(CUDA 9.0) diff --git a/gpu/features/src/centroid.cu b/gpu/features/src/centroid.cu index 5e73df35d80..8ba0f80eefc 100644 --- a/gpu/features/src/centroid.cu +++ b/gpu/features/src/centroid.cu @@ -36,6 +36,7 @@ #include "internal.hpp" +#include #include #include #include @@ -56,26 +57,26 @@ namespace pcl struct PlusFloat3 { - __device__ __forceinline__ float3 operator()(const float3& e1, const float3& e2) const - { - return make_float3(e1.x + e2.x, e1.y + e2.y, e1.z + e2.z); + __device__ __forceinline__ float3 operator()(const float3& e1, const float3& e2) const + { + return make_float3(e1.x + e2.x, e1.y + e2.y, e1.z + e2.z); } }; - + struct TupleDistCvt { float3 pivot_; TupleDistCvt(const float3& pivot) : pivot_(pivot) {} - __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& t) const - { - float4 point = t.get<0>(); + __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& t) const + { + float4 point = thrust::get<0>(t); float dx = pivot_.x - point.x; float dy = pivot_.y - point.y; float dz = pivot_.z - point.z; float dist = sqrt(dx*dx + dy*dy + dz*dz); - return thrust::tuple(dist, t.get<1>()); + return thrust::tuple(dist, thrust::get<1>(t)); } }; @@ -87,7 +88,7 @@ void pcl::device::compute3DCentroid(const DeviceArray& cloud, float3& ce { thrust::device_ptr src_beg((PointT*)cloud.ptr()); thrust::device_ptr src_end = src_beg + cloud.size(); - + centroid = transform_reduce(src_beg, src_beg, PointT2float3(), make_float3(0.f, 0.f, 0.f), PlusFloat3()); centroid *= 1.f/cloud.size(); } @@ -99,13 +100,13 @@ void pcl::device::compute3DCentroid(const DeviceArray& cloud, const Indi compute3DCentroid(cloud, centroid); else { - thrust::device_ptr src_beg((PointT*)cloud.ptr()); + thrust::device_ptr src_beg((PointT*)cloud.ptr()); thrust::device_ptr map_beg((int*)indices.ptr()); thrust::device_ptr map_end = map_beg + indices.size(); centroid = transform_reduce(make_permutation_iterator(src_beg, map_beg), - make_permutation_iterator(src_beg, map_end), + make_permutation_iterator(src_beg, map_end), PointT2float3(), make_float3(0.f, 0.f, 0.f), PlusFloat3()); centroid *= 1.f/indices.size(); @@ -114,7 +115,7 @@ void pcl::device::compute3DCentroid(const DeviceArray& cloud, const Indi template float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const float3& pivot) -{ +{ thrust::device_ptr src_beg((PointT*)cloud.ptr()); thrust::device_ptr src_end = src_beg + cloud.size(); @@ -123,14 +124,14 @@ float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const float thrust::tuple init(0.f, 0); thrust::maximum> op; - + thrust::tuple res = transform_reduce( make_zip_iterator(make_tuple( src_beg, cf )), make_zip_iterator(make_tuple( src_beg, ce )), TupleDistCvt(pivot), init, op); - float4 point = src_beg[res.get<1>()]; + float4 point = src_beg[thrust::get<1>(res)]; return make_float3(point.x, point.y, point.z); } @@ -138,11 +139,11 @@ float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const float template float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const Indices& indices, const float3& pivot) -{ +{ if (indices.empty()) return 
getMaxDistance(cloud, pivot); - thrust::device_ptr src_beg((PointT*)cloud.ptr()); + thrust::device_ptr src_beg((PointT*)cloud.ptr()); thrust::device_ptr map_beg((int*)indices.ptr()); thrust::device_ptr map_end = map_beg + indices.size(); @@ -151,13 +152,13 @@ float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const Indic thrust::tuple init(0.f, 0); thrust::maximum> op; - + thrust::tuple res = transform_reduce( make_zip_iterator(make_tuple( make_permutation_iterator(src_beg, map_beg), cf )), make_zip_iterator(make_tuple( make_permutation_iterator(src_beg, map_end), ce )), TupleDistCvt(pivot), init, op); - float4 point = src_beg[map_beg[res.get<1>()]]; + float4 point = src_beg[map_beg[thrust::get<1>(res)]]; return make_float3(point.x, point.y, point.z); } diff --git a/gpu/octree/src/cuda/octree_builder.cu b/gpu/octree/src/cuda/octree_builder.cu index 9616143695c..7ca1110e363 100644 --- a/gpu/octree/src/cuda/octree_builder.cu +++ b/gpu/octree/src/cuda/octree_builder.cu @@ -43,6 +43,7 @@ #include "utils/scan_block.hpp" #include "utils/morton.hpp" +#include #include #include #include @@ -51,7 +52,7 @@ using namespace pcl::gpu; -namespace pcl +namespace pcl { namespace device { @@ -64,21 +65,21 @@ namespace pcl result.x = fmin(e1.x, e2.x); result.y = fmin(e1.y, e2.y); result.z = fmin(e1.z, e2.z); - return result; - } + return result; + } }; template struct SelectMaxPoint { - __host__ __device__ __forceinline__ PointType operator()(const PointType& e1, const PointType& e2) const - { + __host__ __device__ __forceinline__ PointType operator()(const PointType& e1, const PointType& e2) const + { PointType result; result.x = fmax(e1.x, e2.x); result.y = fmax(e1.y, e2.y); result.z = fmax(e1.z, e2.z); - return result; - } + return result; + } }; @@ -88,9 +89,9 @@ namespace pcl __device__ __forceinline__ thrust::tuple operator()(const PointType& arg) const { thrust::tuple res; - res.get<0>() = arg.x; - res.get<1>() = arg.y; - res.get<2>() = arg.z; + thrust::get<0>(res) = arg.x; + thrust::get<1>(res) = arg.y; + thrust::get<2>(res) = arg.z; return res; } }; @@ -102,8 +103,8 @@ namespace pcl { const static int max_points_per_leaf = 96; - enum - { + enum + { GRID_SIZE = 1, CTA_SIZE = 1024-32, STRIDE = CTA_SIZE, @@ -116,7 +117,7 @@ namespace pcl __shared__ int tasks_beg; __shared__ int tasks_end; __shared__ int total_new; - __shared__ volatile int offsets[CTA_SIZE]; + __shared__ volatile int offsets[CTA_SIZE]; struct SingleStepBuild { @@ -127,14 +128,14 @@ namespace pcl static __device__ __forceinline__ int divUp(int total, int grain) { return (total + grain - 1) / grain; }; __device__ __forceinline__ int FindCells(int task, int level, int cell_begs[], char cell_code[]) const - { + { int cell_count = 0; int beg = octree.begs[task]; - int end = octree.ends[task]; + int end = octree.ends[task]; if (end - beg < max_points_per_leaf) - { + { //cell_count == 0; } else @@ -142,13 +143,13 @@ namespace pcl int cur_code = Morton::extractLevelCode(codes[beg], level); cell_begs[cell_count] = beg; - cell_code[cell_count] = cur_code; - ++cell_count; + cell_code[cell_count] = cur_code; + ++cell_count; int last_code = Morton::extractLevelCode(codes[end - 1], level); if (last_code == cur_code) { - cell_begs[cell_count] = end; + cell_begs[cell_count] = end; } else { @@ -162,7 +163,7 @@ namespace pcl } int morton_code = Morton::shiftLevelCode(search_code, level); - int pos = lower_bound(codes + beg, codes + end, morton_code, CompareByLevelCode(level)) - codes; + int pos = lower_bound(codes + beg, codes + end, 
morton_code, CompareByLevelCode(level)) - codes; if (pos == end) { @@ -175,7 +176,7 @@ namespace pcl cell_code[cell_count] = cur_code; ++cell_count; beg = pos; - } + } } } return cell_count; @@ -183,7 +184,7 @@ namespace pcl __device__ __forceinline__ void operator()() const - { + { //32 is a performance penalty step for search static_assert((max_points_per_leaf % 32) == 0, "max_points_per_leaf must be a multiple of 32"); @@ -196,7 +197,7 @@ namespace pcl octree. ends[0] = points_number; octree.parent[0] = -1; - //init shared + //init shared nodes_num = 1; tasks_beg = 0; tasks_end = 1; @@ -211,8 +212,8 @@ namespace pcl __syncthreads(); while (tasks_beg < tasks_end && level < Morton::levels) - { - int task_count = tasks_end - tasks_beg; + { + int task_count = tasks_end - tasks_beg; int iters = divUp(task_count, CTA_SIZE); int task = tasks_beg + threadIdx.x; @@ -220,14 +221,14 @@ namespace pcl //__syncthreads(); // extra?? for(int it = 0; it < iters; ++it, task += STRIDE) - { + { int cell_count = (task < tasks_end) ? FindCells(task, level, cell_begs, cell_code) : 0; - + offsets[threadIdx.x] = cell_count; __syncthreads(); scan_block(offsets); - + //__syncthreads(); //because sync is inside the scan above if (task < tasks_end) @@ -255,24 +256,24 @@ namespace pcl __syncthreads(); if (threadIdx.x == CTA_SIZE - 1) - { + { total_new += cell_count + offsets[threadIdx.x]; nodes_num += cell_count + offsets[threadIdx.x]; - } - __syncthreads(); + } + __syncthreads(); } /* for(int it = 0; it < iters; ++it, task += STRIDE) */ //__syncthreads(); //extra ?? if (threadIdx.x == CTA_SIZE - 1) - { + { tasks_beg = tasks_end; - tasks_end += total_new; + tasks_end += total_new; total_new = 0; } ++level; - __syncthreads(); + __syncthreads(); } if (threadIdx.x == CTA_SIZE - 1) @@ -285,7 +286,7 @@ namespace pcl } void pcl::device::OctreeImpl::build() -{ +{ using namespace pcl::device; host_octree.downloaded = false; @@ -293,7 +294,7 @@ void pcl::device::OctreeImpl::build() //allocatations { - //ScopeTimer timer("new_allocs"); + //ScopeTimer timer("new_allocs"); //+1 codes * points_num * sizeof(int) //+1 indices * points_num * sizeof(int) //+1 octreeGlobal.nodes * points_num * sizeof(int) @@ -306,22 +307,22 @@ void pcl::device::OctreeImpl::build() //+3 points_sorted * points_num * sizeof(float) //== - // 10 rows + // 10 rows - //left - //octreeGlobal.nodes_num * 1 * sizeof(int) + //left + //octreeGlobal.nodes_num * 1 * sizeof(int) //== - // 3 * sizeof(int) => +1 row + // 3 * sizeof(int) => +1 row const int transaction_size = 128 / sizeof(int); int cols = std::max(points_num, transaction_size * 4); int rows = 10 + 1; // = 13 - + storage.create(rows, cols); - + codes = DeviceArray(storage.ptr(0), points_num); indices = DeviceArray(storage.ptr(1), points_num); - + octreeGlobal.nodes = storage.ptr(2); octreeGlobal.codes = storage.ptr(3); octreeGlobal.begs = storage.ptr(4); @@ -332,10 +333,10 @@ void pcl::device::OctreeImpl::build() points_sorted = DeviceArray2D(3, points_num, storage.ptr(8), storage.step()); } - + { - //ScopeTimer timer("reduce-morton-sort-permutations"); - + //ScopeTimer timer("reduce-morton-sort-permutations"); + thrust::device_ptr beg(points.ptr()); thrust::device_ptr end = beg + points.size(); @@ -345,49 +346,49 @@ void pcl::device::OctreeImpl::build() atmin.x = atmin.y = atmin.z = std::numeric_limits::lowest(); atmax.w = atmin.w = 0; - //ScopeTimer timer("reduce"); + //ScopeTimer timer("reduce"); PointType minp = thrust::reduce(beg, end, atmax, SelectMinPoint()); PointType maxp = thrust::reduce(beg, 
end, atmin, SelectMaxPoint()); octreeGlobal.minp = make_float3(minp.x, minp.y, minp.z); octreeGlobal.maxp = make_float3(maxp.x, maxp.y, maxp.z); } - + thrust::device_ptr codes_beg(codes.ptr()); thrust::device_ptr codes_end = codes_beg + codes.size(); { - //ScopeTimer timer("morton"); + //ScopeTimer timer("morton"); thrust::transform(beg, end, codes_beg, CalcMorton(octreeGlobal.minp, octreeGlobal.maxp)); } thrust::device_ptr indices_beg(indices.ptr()); thrust::device_ptr indices_end = indices_beg + indices.size(); { - //ScopeTimer timer("sort"); + //ScopeTimer timer("sort"); thrust::sequence(indices_beg, indices_end); - thrust::sort_by_key(codes_beg, codes_end, indices_beg ); + thrust::sort_by_key(codes_beg, codes_end, indices_beg ); } { - ////ScopeTimer timer("perm"); + ////ScopeTimer timer("perm"); //thrust::copy(make_permutation_iterator(beg, indices_beg), - // make_permutation_iterator(end, indices_end), device_ptr(points_sorted.ptr())); + // make_permutation_iterator(end, indices_end), device_ptr(points_sorted.ptr())); + - } { thrust::device_ptr xs(points_sorted.ptr(0)); thrust::device_ptr ys(points_sorted.ptr(1)); thrust::device_ptr zs(points_sorted.ptr(2)); - //ScopeTimer timer("perm2"); + //ScopeTimer timer("perm2"); thrust::transform(make_permutation_iterator(beg, indices_beg), - make_permutation_iterator(end, indices_end), + make_permutation_iterator(end, indices_end), make_zip_iterator(make_tuple(xs, ys, zs)), PointType_to_tuple()); - + } } - + SingleStepBuild ssb; ssb.octree = octreeGlobal; ssb.codes = codes; diff --git a/gpu/octree/src/utils/approx_nearest_utils.hpp b/gpu/octree/src/utils/approx_nearest_utils.hpp index 247eda0dc9b..9bec17e74d6 100644 --- a/gpu/octree/src/utils/approx_nearest_utils.hpp +++ b/gpu/octree/src/utils/approx_nearest_utils.hpp @@ -12,6 +12,7 @@ #include "morton.hpp" #include +#include #include #include #include diff --git a/gpu/surface/src/cuda/convex_hull.cu b/gpu/surface/src/cuda/convex_hull.cu index c1f20234eba..98e89cd703d 100644 --- a/gpu/surface/src/cuda/convex_hull.cu +++ b/gpu/surface/src/cuda/convex_hull.cu @@ -61,89 +61,89 @@ namespace pcl { namespace device - { + { template struct IndOp { __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& e1, const thrust::tuple& e2) const - { + { thrust::tuple res; - + if (use_max) - res.get<0>() = fmax(e1.get<0>(), e2.get<0>()); + thrust::get<0>(res) = fmax(thrust::get<0>(e1), thrust::get<0>(e2)); else - res.get<0>() = fmin(e1.get<0>(), e2.get<0>()); + thrust::get<0>(res) = fmin(thrust::get<0>(e1), thrust::get<0>(e2)); - res.get<1>() = (res.get<0>() == e1.get<0>()) ? e1.get<1>() : e2.get<1>(); - return res; - } + thrust::get<1>(res) = (thrust::get<0>(res) == thrust::get<0>(e1)) ? 
thrust::get<1>(e1) : thrust::get<1>(e2); + return res; + } }; struct X - { - __device__ __forceinline__ - thrust::tuple + { + __device__ __forceinline__ + thrust::tuple operator()(const thrust::tuple& in) const { - return thrust::tuple(in.get<0>().x, in.get<1>()); + return thrust::tuple(thrust::get<0>(in).x, thrust::get<1>(in)); } }; struct Y - { + { __device__ __forceinline__ float operator()(const PointType& in) const { return in.y; } }; struct Z - { + { __device__ __forceinline__ float operator()(const PointType& in) const { return in.z; } }; - + struct LineDist { float3 x1, x2; LineDist(const PointType& p1, const PointType& p2) : x1(tr(p1)), x2(tr(p2)) {} - + __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& in) const - { - float3 x0 = tr(in.get<0>()); + { + float3 x0 = tr(thrust::get<0>(in)); - float dist = norm(cross(x0 - x1, x0 - x2))/norm(x1 - x2); - return thrust::tuple(dist, in.get<1>()); - } + float dist = norm(cross(x0 - x1, x0 - x2))/norm(x1 - x2); + return thrust::tuple(dist, thrust::get<1>(in)); + } }; struct PlaneDist - { + { float3 x1, n; PlaneDist(const PointType& p1, const PointType& p2, const PointType& p3) : x1(tr(p1)) { float3 x2 = tr(p2), x3 = tr(p3); n = normalized(cross(x2 - x1, x3 - x1)); } - + __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& in) const { - float3 x0 = tr(in.get<0>()); + float3 x0 = tr(thrust::get<0>(in)); float dist = std::abs(dot(n, x0 - x1)); - return thrust::tuple(dist, in.get<1>()); + return thrust::tuple(dist, thrust::get<1>(in)); } }; - + template int transform_reduce_index(It beg, It end, Unary unop, Init init, Binary binary) { thrust::counting_iterator cbeg(0); thrust::counting_iterator cend = cbeg + thrust::distance(beg, end); - - thrust::tuple t = transform_reduce( - make_zip_iterator(thrust::make_tuple(beg, cbeg)), - make_zip_iterator(thrust::make_tuple(end, cend)), + + thrust::tuple t = transform_reduce( + make_zip_iterator(thrust::make_tuple(beg, cbeg)), + make_zip_iterator(thrust::make_tuple(end, cend)), unop, init, binary); - - return t.get<1>(); + + return thrust::get<1>(t); } template @@ -158,35 +158,35 @@ namespace pcl { thrust::tuple max_tuple(std::numeric_limits::min(), 0); return transform_reduce_index(beg, end, unop, max_tuple, IndOp()); - } + } } } pcl::device::PointStream::PointStream(const Cloud& cloud_) : cloud(cloud_) -{ +{ cloud_size = cloud.size(); facets_dists.create(cloud_size); perm.create(cloud_size); - thrust::device_ptr pbeg(perm.ptr()); + thrust::device_ptr pbeg(perm.ptr()); thrust::sequence(pbeg, pbeg + cloud_size); } void pcl::device::PointStream::computeInitalSimplex() { - thrust::device_ptr beg(cloud.ptr()); + thrust::device_ptr beg(cloud.ptr()); thrust::device_ptr end = beg + cloud_size; - + int minx = transform_reduce_min_index(beg, end, X()); int maxx = transform_reduce_max_index(beg, end, X()); PointType p1 = *(beg + minx); PointType p2 = *(beg + maxx); - + int maxl = transform_reduce_max_index(beg, end, LineDist(p1, p2)); PointType p3 = *(beg + maxl); - + int maxp = transform_reduce_max_index(beg, end, PlaneDist(p1, p2, p3)); PointType p4 = *(beg + maxp); @@ -194,12 +194,12 @@ void pcl::device::PointStream::computeInitalSimplex() simplex.x1 = tr(p1); simplex.x2 = tr(p2); simplex.x3 = tr(p3); simplex.x4 = tr(p4); simplex.i1 = minx; simplex.i2 = maxx; simplex.i3 = maxl; simplex.i4 = maxp; - float maxy = transform_reduce(beg, end, Y(), std::numeric_limits::min(), thrust::maximum()); - float miny = transform_reduce(beg, end, Y(), std::numeric_limits::max(), 
thrust::minimum()); + float maxy = transform_reduce(beg, end, Y(), std::numeric_limits::min(), thrust::maximum()); + float miny = transform_reduce(beg, end, Y(), std::numeric_limits::max(), thrust::minimum()); + + float maxz = transform_reduce(beg, end, Z(), std::numeric_limits::min(), thrust::maximum()); + float minz = transform_reduce(beg, end, Z(), std::numeric_limits::max(), thrust::minimum()); - float maxz = transform_reduce(beg, end, Z(), std::numeric_limits::min(), thrust::maximum()); - float minz = transform_reduce(beg, end, Z(), std::numeric_limits::max(), thrust::minimum()); - float dx = (p2.x - p1.x); float dy = (maxy - miny); float dz = (maxz - minz); @@ -209,7 +209,7 @@ void pcl::device::PointStream::computeInitalSimplex() simplex.p1 = compute_plane(simplex.x4, simplex.x2, simplex.x3, simplex.x1); simplex.p2 = compute_plane(simplex.x3, simplex.x1, simplex.x4, simplex.x2); simplex.p3 = compute_plane(simplex.x2, simplex.x1, simplex.x4, simplex.x3); - simplex.p4 = compute_plane(simplex.x1, simplex.x2, simplex.x3, simplex.x4); + simplex.p4 = compute_plane(simplex.x1, simplex.x2, simplex.x3, simplex.x4); } namespace pcl @@ -217,7 +217,7 @@ namespace pcl namespace device { __global__ void init_fs(int i1, int i2, int i3, int i4, PtrStep verts_inds) - { + { *(int4*)verts_inds.ptr(0) = make_int4(i2, i1, i1, i1); *(int4*)verts_inds.ptr(1) = make_int4(i3, i3, i2, i2); *(int4*)verts_inds.ptr(2) = make_int4(i4, i4, i4, i3); @@ -227,10 +227,10 @@ namespace pcl } void pcl::device::FacetStream::setInitialFacets(const InitalSimplex& s) -{ - init_fs<<<1, 1>>>(s.i1, s.i2, s.i3, s.i4, verts_inds); +{ + init_fs<<<1, 1>>>(s.i1, s.i2, s.i3, s.i4, verts_inds); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); facet_count = 4; } @@ -245,20 +245,20 @@ namespace pcl { float diag; float4 pl1, pl2, pl3, pl4; - - InitalClassify(const float4& p1, const float4& p2, const float4& p3, const float4& p4, float diagonal) + + InitalClassify(const float4& p1, const float4& p2, const float4& p3, const float4& p4, float diagonal) : diag(diagonal), pl1(p1), pl2(p2), pl3(p3), pl4(p4) - { + { pl1 *= compue_inv_normal_norm(pl1); pl2 *= compue_inv_normal_norm(pl2); pl3 *= compue_inv_normal_norm(pl3); pl4 *= compue_inv_normal_norm(pl4); } - + __device__ __forceinline__ - std::uint64_t + std::uint64_t operator()(const PointType& p) const - { + { float4 x = p; x.w = 1; @@ -270,7 +270,7 @@ namespace pcl float dists[] = { d0, d1, d2, d3 }; int negs_inds[4]; int neg_count = 0; - + int idx = std::numeric_limits::max(); float dist = 0; @@ -283,7 +283,7 @@ namespace pcl { int i1 = negs_inds[1]; int i2 = negs_inds[2]; - + int ir = std::abs(dists[i1]) < std::abs(dists[i2]) ? i2 : i1; negs_inds[1] = ir; --neg_count; @@ -293,10 +293,10 @@ namespace pcl { int i1 = negs_inds[0]; int i2 = negs_inds[1]; - + int ir = std::abs(dists[i1]) < std::abs(dists[i2]) ? 
i2 : i1; negs_inds[0] = ir; - --neg_count; + --neg_count; } if (neg_count == 1) @@ -311,28 +311,28 @@ namespace pcl std::uint64_t res = idx; res <<= 32; return res + *reinterpret_cast(&dist); - } - }; + } + }; - __global__ void initalClassifyKernel(const InitalClassify ic, const PointType* points, int cloud_size, std::uint64_t* output) - { + __global__ void initalClassifyKernel(const InitalClassify ic, const PointType* points, int cloud_size, std::uint64_t* output) + { int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < cloud_size) - output[index] = ic(points[index]); + if (index < cloud_size) + output[index] = ic(points[index]); } } } void pcl::device::PointStream::initalClassify() -{ +{ //thrust::device_ptr beg(cloud.ptr()); //thrust::device_ptr end = beg + cloud_size; thrust::device_ptr out(facets_dists.ptr()); - + InitalClassify ic(simplex.p1, simplex.p2, simplex.p3, simplex.p4, cloud_diag); //thrust::transform(beg, end, out, ic); - + //printFuncAttrib(initalClassifyKernel); initalClassifyKernel<<>>(ic, cloud, cloud_size, facets_dists); @@ -350,9 +350,9 @@ namespace pcl { namespace device { - __device__ int new_cloud_size; + __device__ int new_cloud_size; struct SearchFacetHeads - { + { std::uint64_t *facets_dists; int cloud_size; int facet_count; @@ -361,25 +361,25 @@ namespace pcl mutable int* head_points; //bool logger; - + __device__ __forceinline__ void operator()(int facet) const - { + { const std::uint64_t* b = facets_dists; const std::uint64_t* e = b + cloud_size; bool last_thread = facet == facet_count; - int search_value = !last_thread ? facet : std::numeric_limits::max(); - int index = lower_bound(b, e, search_value, LessThanByFacet()) - b; - + int search_value = !last_thread ? facet : std::numeric_limits::max(); + int index = lower_bound(b, e, search_value, LessThanByFacet()) - b; + if (last_thread) new_cloud_size = index; else { bool not_found = index == cloud_size || (facet != (facets_dists[index] >> 32)); - head_points[facet] = not_found ? -1 : perm[index]; + head_points[facet] = not_found ? 
-1 : perm[index]; } } }; @@ -403,18 +403,18 @@ int pcl::device::PointStream::searchFacetHeads(std::size_t facet_count, DeviceAr sfh.facet_count = (int)facet_count; sfh.perm = perm; sfh.points = cloud.ptr(); - sfh.head_points = head_points; - + sfh.head_points = head_points; + //thrust::counting_iterator b(0); - //thrust::counting_iterator e = b + facet_count + 1; + //thrust::counting_iterator e = b + facet_count + 1; //thrust::for_each(b, e, sfh); searchFacetHeadsKernel<<>>(sfh); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); int new_size; - cudaSafeCall( cudaMemcpyFromSymbol( (void*)&new_size, pcl::device::new_cloud_size, sizeof(new_size)) ); + cudaSafeCall( cudaMemcpyFromSymbol( (void*)&new_size, pcl::device::new_cloud_size, sizeof(new_size)) ); return new_size; } @@ -434,7 +434,7 @@ namespace pcl struct Compaction { - enum + enum { CTA_SIZE = 256, @@ -443,18 +443,18 @@ namespace pcl int* head_points_in; PtrStep verts_inds_in; - + int *scan_buffer; int facet_count; mutable int* head_points_out; mutable PtrStep verts_inds_out; - + mutable PtrStep empty_facets; mutable int *empty_count; - + __device__ __forceinline__ void operator()() const { @@ -473,16 +473,16 @@ namespace pcl int offset = scan_buffer[idx]; head_points_out[offset] = head_idx; - + verts_inds_out.ptr(0)[offset] = verts_inds_in.ptr(0)[idx]; verts_inds_out.ptr(1)[offset] = verts_inds_in.ptr(1)[idx]; verts_inds_out.ptr(2)[offset] = verts_inds_in.ptr(2)[idx]; - - + + } - else - empty = 1; + else + empty = 1; } int total = __popc (__ballot_sync (__activemask (), empty)); @@ -498,7 +498,7 @@ namespace pcl if (laneid == 0) { int old = atomicAdd(empty_count, total); - wapr_buffer[warpid] = old; + wapr_buffer[warpid] = old; } int old = wapr_buffer[warpid]; @@ -506,11 +506,11 @@ namespace pcl { empty_facets.ptr(0)[old + offset] = verts_inds_in.ptr(0)[idx]; empty_facets.ptr(1)[old + offset] = verts_inds_in.ptr(1)[idx]; - empty_facets.ptr(2)[old + offset] = verts_inds_in.ptr(2)[idx]; + empty_facets.ptr(2)[old + offset] = verts_inds_in.ptr(2)[idx]; int a1 = verts_inds_in.ptr(0)[idx], a2 = verts_inds_in.ptr(1)[idx], a3 = verts_inds_in.ptr(2)[idx]; } - } + } } }; @@ -521,19 +521,19 @@ namespace pcl void pcl::device::FacetStream::compactFacets() { - int old_empty_count; - empty_count.download(&old_empty_count); + int old_empty_count; + empty_count.download(&old_empty_count); thrust::device_ptr b(head_points.ptr()); thrust::device_ptr e = b + facet_count; thrust::device_ptr o(scan_buffer.ptr()); - - thrust::transform_exclusive_scan(b, e, o, NotMinus1(), 0, thrust::plus()); - + + thrust::transform_exclusive_scan(b, e, o, NotMinus1(), 0, thrust::plus()); + Compaction c; c.verts_inds_in = verts_inds; - c.head_points_in = head_points; + c.head_points_in = head_points; c.scan_buffer = scan_buffer; c.facet_count = facet_count; @@ -543,20 +543,20 @@ void pcl::device::FacetStream::compactFacets() c.empty_facets = empty_facets; c.empty_count = empty_count; - + int block = Compaction::CTA_SIZE; int grid = divUp(facet_count, block); - compactionKernel<<>>(c); + compactionKernel<<>>(c); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); - + verts_inds.swap(verts_inds2); head_points.swap(head_points2); - int new_empty_count; - empty_count.download(&new_empty_count); - + int new_empty_count; + empty_count.download(&new_empty_count); + facet_count -= new_empty_count - old_empty_count; } @@ -583,11 +583,11 @@ namespace pcl int facet_count; - __device__ 
__forceinline__ + __device__ __forceinline__ void operator()(int point_idx) const { int perm_index = perm[point_idx]; - + int facet = facets_dists[point_idx] >> 32; facet = scan_buffer[facet]; @@ -596,10 +596,10 @@ namespace pcl if (hi == perm_index) { std::uint64_t res = std::numeric_limits::max(); - res <<= 32; + res <<= 32; facets_dists[point_idx] = res; } - else + else { int i1 = verts_inds.ptr(0)[facet]; @@ -610,16 +610,16 @@ namespace pcl float3 v1 = tr( points[ i1 ] ); float3 v2 = tr( points[ i2 ] ); float3 v3 = tr( points[ i3 ] ); - + float4 p0 = compute_plane(hp, v1, v2, /*opposite*/v3); // j float4 p1 = compute_plane(hp, v2, v3, /*opposite*/v1); // facet_count + j - float4 p2 = compute_plane(hp, v3, v1, /*opposite*/v2); // facet_count + j*2 + float4 p2 = compute_plane(hp, v3, v1, /*opposite*/v2); // facet_count + j*2 p0 *= compue_inv_normal_norm(p0); p1 *= compue_inv_normal_norm(p1); p2 *= compue_inv_normal_norm(p2); - + float4 p = points[perm_index]; p.w = 1; @@ -640,12 +640,12 @@ namespace pcl for(int i = 0; i < 3; ++i) if (dists[i] < 0) negs_inds[neg_count++] = i; - + if (neg_count == 3) { int i1 = negs_inds[1]; int i2 = negs_inds[2]; - + int ir = std::abs(dists[i1]) < std::abs(dists[i2]) ? i2 : i1; negs_inds[1] = ir; --neg_count; @@ -655,10 +655,10 @@ namespace pcl { int i1 = negs_inds[0]; int i2 = negs_inds[1]; - + int ir = std::abs(dists[i1]) < std::abs(dists[i2]) ? i2 : i1; negs_inds[0] = ir; - --neg_count; + --neg_count; } if (neg_count == 1) @@ -670,16 +670,16 @@ namespace pcl // if (neg_count == 0) // new_idx = std::numeric_limits::max() ==>> internal point - + std::uint64_t res = new_idx; res <<= 32; res += *reinterpret_cast(&dist); facets_dists[point_idx] = res; - } /* if (hi == perm_index) */ + } /* if (hi == perm_index) */ } - }; + }; __global__ void classifyKernel(const Classify c, int cloud_size) { @@ -692,7 +692,7 @@ namespace pcl } void pcl::device::PointStream::classify(FacetStream& fs) -{ +{ Classify c; c.facets_dists = facets_dists; @@ -706,16 +706,16 @@ void pcl::device::PointStream::classify(FacetStream& fs) c.diag = cloud_diag; c.facet_count = fs.facet_count; - //thrust::counting_iterator b(0); + //thrust::counting_iterator b(0); //thrust::for_each(b, b + cloud_size, c); classifyKernel<<>>(c, cloud_size); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); - + thrust::device_ptr beg(facets_dists.ptr()); thrust::device_ptr end = beg + cloud_size; - + thrust::device_ptr pbeg(perm.ptr()); thrust::sort_by_key(beg, end, pbeg); } @@ -731,14 +731,14 @@ namespace pcl mutable PtrStep verts_inds; - __device__ __forceinline__ + __device__ __forceinline__ void operator()(int facet) const { int hi = head_points[facet]; int i1 = verts_inds.ptr(0)[facet]; int i2 = verts_inds.ptr(1)[facet]; int i3 = verts_inds.ptr(2)[facet]; - + make_facet(hi, i1, i2, facet); make_facet(hi, i2, i3, facet + facet_count); make_facet(hi, i3, i1, facet + facet_count * 2); @@ -757,8 +757,8 @@ namespace pcl { int facet = threadIdx.x + blockIdx.x * blockDim.x; - if (facet < sf.facet_count) - sf(facet); + if (facet < sf.facet_count) + sf(facet); } } } @@ -769,9 +769,9 @@ void pcl::device::FacetStream::splitFacets() sf.head_points = head_points; sf.verts_inds = verts_inds; sf.facet_count = facet_count; - - //thrust::counting_iterator b(0); + + //thrust::counting_iterator b(0); //thrust::for_each(b, b + facet_count, sf); splitFacetsKernel<<>>(sf); @@ -786,8 +786,8 @@ size_t pcl::device::remove_duplicates(DeviceArray& indeces) thrust::device_ptr beg(indeces.ptr()); 
thrust::device_ptr end = beg + indeces.size(); - thrust::sort(beg, end); - return (std::size_t)(thrust::unique(beg, end) - beg); + thrust::sort(beg, end); + return (std::size_t)(thrust::unique(beg, end) - beg); } @@ -810,15 +810,15 @@ void pcl::device::pack_hull(const DeviceArray& points, const DeviceAr { output.create(indeces.size()); - //thrust::device_ptr in(points.ptr()); - + //thrust::device_ptr in(points.ptr()); + //thrust::device_ptr mb(indeces.ptr()); //thrust::device_ptr me = mb + indeces.size(); - //thrust::device_ptr out(output.ptr()); + //thrust::device_ptr out(output.ptr()); //thrust::gather(mb, me, in, out); - + gatherKernel<<>>(indeces, points, output); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() );
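
The change repeated throughout centroid.cu, octree_builder.cu and convex_hull.cu is switching tuple element access from the member form `t.get<N>()` to the free function `thrust::get<N>(t)`, which is the form newer Thrust releases (where `thrust::tuple` is backed by `cuda::std::tuple`) still provide. A minimal, self-contained sketch of the same pattern — an argmax reduction over a zip of values and indices, as in `getMaxDistance` / `transform_reduce_index` — follows; `MaxByValue` and `Identity` are illustrative stand-ins, not functors from the patch:

```cpp
// Sketch only: free-function tuple access plus a (value, index) reduction.
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <cstdio>

// Keeps the larger of two (value, index) pairs.
struct MaxByValue
{
  __host__ __device__
  thrust::tuple<float, int> operator()(const thrust::tuple<float, int>& a,
                                       const thrust::tuple<float, int>& b) const
  {
    // thrust::get<0>(a), not a.get<0>()
    return thrust::get<0>(a) > thrust::get<0>(b) ? a : b;
  }
};

// Pass-through unary op so transform_reduce can be used as a plain reduce.
struct Identity
{
  __host__ __device__
  thrust::tuple<float, int> operator()(const thrust::tuple<float, int>& t) const { return t; }
};

int main()
{
  thrust::device_vector<float> v(4);
  v[0] = 1.f; v[1] = 5.f; v[2] = 3.f; v[3] = 2.f;

  thrust::counting_iterator<int> idx(0);
  auto beg = thrust::make_zip_iterator(thrust::make_tuple(v.begin(), idx));
  auto end = thrust::make_zip_iterator(thrust::make_tuple(v.end(), idx + v.size()));

  thrust::tuple<float, int> init(-1.f, -1);
  thrust::tuple<float, int> best =
      thrust::transform_reduce(beg, end, Identity(), init, MaxByValue());

  std::printf("max %.1f at index %d\n", thrust::get<0>(best), thrust::get<1>(best));
  return 0;
}
```

Compiled with nvcc as a .cu file this prints `max 5.0 at index 1`.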
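`compute3DCentroid` with an index list reduces only the selected points by walking a `permutation_iterator` over the cloud. A sketch under assumed float4 points, with `ToFloat3` standing in for the patch's `PointT2float3` and `PlusFloat3` mirroring the functor in the diff:

```cpp
// Sketch only: centroid of an indexed subset via permutation_iterator.
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/permutation_iterator.h>
#include <cstdio>

struct ToFloat3   // stand-in for PointT2float3
{
  __host__ __device__ float3 operator()(const float4& p) const
  { return make_float3(p.x, p.y, p.z); }
};

struct PlusFloat3 // mirrors the patch's PlusFloat3
{
  __host__ __device__ float3 operator()(const float3& a, const float3& b) const
  { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
};

int main()
{
  thrust::device_vector<float4> cloud(3);
  cloud[0] = make_float4(0.f, 0.f, 0.f, 0.f);
  cloud[1] = make_float4(2.f, 2.f, 2.f, 0.f);
  cloud[2] = make_float4(9.f, 9.f, 9.f, 0.f); // excluded by the index set

  thrust::device_vector<int> indices(2);
  indices[0] = 0; indices[1] = 1;

  auto beg = thrust::make_permutation_iterator(cloud.begin(), indices.begin());
  auto end = thrust::make_permutation_iterator(cloud.begin(), indices.end());

  float3 sum = thrust::transform_reduce(beg, end, ToFloat3(),
                                        make_float3(0.f, 0.f, 0.f), PlusFloat3());
  float3 c = make_float3(sum.x / indices.size(),
                         sum.y / indices.size(),
                         sum.z / indices.size());
  std::printf("centroid: %.1f %.1f %.1f\n", c.x, c.y, c.z);
  return 0;
}
```

With the two selected points above, this prints `centroid: 1.0 1.0 1.0`.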
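The octree builder's sort step pairs `thrust::sequence` with `thrust::sort_by_key`: indices start as 0..n-1 and end up as the permutation that orders points by Morton code, which the later permutation-iterator transform uses to fill `points_sorted`. A small sketch with hypothetical code values:

```cpp
// Sketch only: "sort codes, carry indices" as in OctreeImpl::build().
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <cstdio>

int main()
{
  // Hypothetical pre-computed Morton codes, one per point.
  thrust::device_vector<unsigned int> codes(4);
  codes[0] = 7; codes[1] = 1; codes[2] = 5; codes[3] = 3;

  thrust::device_vector<int> indices(codes.size());
  thrust::sequence(indices.begin(), indices.end());              // 0 1 2 3
  thrust::sort_by_key(codes.begin(), codes.end(), indices.begin());

  for (std::size_t i = 0; i < codes.size(); ++i)
    std::printf("code %u came from point %d\n",
                (unsigned int)codes[i], (int)indices[i]);
  return 0;
}
```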
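`FacetStream::compactFacets` sizes its compaction with `thrust::transform_exclusive_scan` over the head-point array: facets whose head point is -1 contribute 0, all others 1, so each surviving facet's scan value is its output slot. The patch only calls `NotMinus1()`; the definition below is an assumed equivalent, not taken from the source:

```cpp
// Sketch only: exclusive scan of a keep-flag to compute compaction offsets.
#include <thrust/device_vector.h>
#include <thrust/transform_scan.h>
#include <thrust/functional.h>
#include <cstdio>

struct NotMinus1   // assumed: 1 for facets that still have a head point
{
  __host__ __device__ int operator()(int head) const { return head == -1 ? 0 : 1; }
};

int main()
{
  thrust::device_vector<int> head_points(5);
  head_points[0] = 10; head_points[1] = -1; head_points[2] = 42;
  head_points[3] = -1; head_points[4] = 7;

  thrust::device_vector<int> scan(head_points.size());
  thrust::transform_exclusive_scan(head_points.begin(), head_points.end(),
                                   scan.begin(), NotMinus1(), 0, thrust::plus<int>());

  // scan now holds 0 1 1 2 2: the compaction kernel writes kept facet i
  // to output slot scan[i].
  for (std::size_t i = 0; i < scan.size(); ++i)
    std::printf("%d ", (int)scan[i]);
  std::printf("\n");
  return 0;
}
```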