diff --git a/cmake/pcl_find_cuda.cmake b/cmake/pcl_find_cuda.cmake index 26b6d908d5d..466d033feac 100644 --- a/cmake/pcl_find_cuda.cmake +++ b/cmake/pcl_find_cuda.cmake @@ -12,6 +12,7 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.11) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.17) find_package(CUDAToolkit QUIET) + set(CUDA_TOOLKIT_INCLUDE ${CUDAToolkit_INCLUDE_DIRS}) else() set(CUDA_FIND_QUIETLY TRUE) find_package(CUDA 9.0) diff --git a/gpu/features/src/centroid.cu b/gpu/features/src/centroid.cu index 5e73df35d80..8ba0f80eefc 100644 --- a/gpu/features/src/centroid.cu +++ b/gpu/features/src/centroid.cu @@ -36,6 +36,7 @@ #include "internal.hpp" +#include #include #include #include @@ -56,26 +57,26 @@ namespace pcl struct PlusFloat3 { - __device__ __forceinline__ float3 operator()(const float3& e1, const float3& e2) const - { - return make_float3(e1.x + e2.x, e1.y + e2.y, e1.z + e2.z); + __device__ __forceinline__ float3 operator()(const float3& e1, const float3& e2) const + { + return make_float3(e1.x + e2.x, e1.y + e2.y, e1.z + e2.z); } }; - + struct TupleDistCvt { float3 pivot_; TupleDistCvt(const float3& pivot) : pivot_(pivot) {} - __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& t) const - { - float4 point = t.get<0>(); + __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& t) const + { + float4 point = thrust::get<0>(t); float dx = pivot_.x - point.x; float dy = pivot_.y - point.y; float dz = pivot_.z - point.z; float dist = sqrt(dx*dx + dy*dy + dz*dz); - return thrust::tuple(dist, t.get<1>()); + return thrust::tuple(dist, thrust::get<1>(t)); } }; @@ -87,7 +88,7 @@ void pcl::device::compute3DCentroid(const DeviceArray& cloud, float3& ce { thrust::device_ptr src_beg((PointT*)cloud.ptr()); thrust::device_ptr src_end = src_beg + cloud.size(); - + centroid = transform_reduce(src_beg, src_beg, PointT2float3(), make_float3(0.f, 0.f, 0.f), PlusFloat3()); centroid *= 1.f/cloud.size(); } @@ -99,13 +100,13 @@ void pcl::device::compute3DCentroid(const DeviceArray& cloud, const Indi compute3DCentroid(cloud, centroid); else { - thrust::device_ptr src_beg((PointT*)cloud.ptr()); + thrust::device_ptr src_beg((PointT*)cloud.ptr()); thrust::device_ptr map_beg((int*)indices.ptr()); thrust::device_ptr map_end = map_beg + indices.size(); centroid = transform_reduce(make_permutation_iterator(src_beg, map_beg), - make_permutation_iterator(src_beg, map_end), + make_permutation_iterator(src_beg, map_end), PointT2float3(), make_float3(0.f, 0.f, 0.f), PlusFloat3()); centroid *= 1.f/indices.size(); @@ -114,7 +115,7 @@ void pcl::device::compute3DCentroid(const DeviceArray& cloud, const Indi template float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const float3& pivot) -{ +{ thrust::device_ptr src_beg((PointT*)cloud.ptr()); thrust::device_ptr src_end = src_beg + cloud.size(); @@ -123,14 +124,14 @@ float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const float thrust::tuple init(0.f, 0); thrust::maximum> op; - + thrust::tuple res = transform_reduce( make_zip_iterator(make_tuple( src_beg, cf )), make_zip_iterator(make_tuple( src_beg, ce )), TupleDistCvt(pivot), init, op); - float4 point = src_beg[res.get<1>()]; + float4 point = src_beg[thrust::get<1>(res)]; return make_float3(point.x, point.y, point.z); } @@ -138,11 +139,11 @@ float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const float template float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const Indices& indices, const float3& pivot) -{ +{ if (indices.empty()) return 
getMaxDistance(cloud, pivot); - thrust::device_ptr src_beg((PointT*)cloud.ptr()); + thrust::device_ptr src_beg((PointT*)cloud.ptr()); thrust::device_ptr map_beg((int*)indices.ptr()); thrust::device_ptr map_end = map_beg + indices.size(); @@ -151,13 +152,13 @@ float3 pcl::device::getMaxDistance(const DeviceArray& cloud, const Indic thrust::tuple init(0.f, 0); thrust::maximum> op; - + thrust::tuple res = transform_reduce( make_zip_iterator(make_tuple( make_permutation_iterator(src_beg, map_beg), cf )), make_zip_iterator(make_tuple( make_permutation_iterator(src_beg, map_end), ce )), TupleDistCvt(pivot), init, op); - float4 point = src_beg[map_beg[res.get<1>()]]; + float4 point = src_beg[map_beg[thrust::get<1>(res)]]; return make_float3(point.x, point.y, point.z); } diff --git a/gpu/octree/src/cuda/octree_builder.cu b/gpu/octree/src/cuda/octree_builder.cu index 9616143695c..7ca1110e363 100644 --- a/gpu/octree/src/cuda/octree_builder.cu +++ b/gpu/octree/src/cuda/octree_builder.cu @@ -43,6 +43,7 @@ #include "utils/scan_block.hpp" #include "utils/morton.hpp" +#include #include #include #include @@ -51,7 +52,7 @@ using namespace pcl::gpu; -namespace pcl +namespace pcl { namespace device { @@ -64,21 +65,21 @@ namespace pcl result.x = fmin(e1.x, e2.x); result.y = fmin(e1.y, e2.y); result.z = fmin(e1.z, e2.z); - return result; - } + return result; + } }; template struct SelectMaxPoint { - __host__ __device__ __forceinline__ PointType operator()(const PointType& e1, const PointType& e2) const - { + __host__ __device__ __forceinline__ PointType operator()(const PointType& e1, const PointType& e2) const + { PointType result; result.x = fmax(e1.x, e2.x); result.y = fmax(e1.y, e2.y); result.z = fmax(e1.z, e2.z); - return result; - } + return result; + } }; @@ -88,9 +89,9 @@ namespace pcl __device__ __forceinline__ thrust::tuple operator()(const PointType& arg) const { thrust::tuple res; - res.get<0>() = arg.x; - res.get<1>() = arg.y; - res.get<2>() = arg.z; + thrust::get<0>(res) = arg.x; + thrust::get<1>(res) = arg.y; + thrust::get<2>(res) = arg.z; return res; } }; @@ -102,8 +103,8 @@ namespace pcl { const static int max_points_per_leaf = 96; - enum - { + enum + { GRID_SIZE = 1, CTA_SIZE = 1024-32, STRIDE = CTA_SIZE, @@ -116,7 +117,7 @@ namespace pcl __shared__ int tasks_beg; __shared__ int tasks_end; __shared__ int total_new; - __shared__ volatile int offsets[CTA_SIZE]; + __shared__ volatile int offsets[CTA_SIZE]; struct SingleStepBuild { @@ -127,14 +128,14 @@ namespace pcl static __device__ __forceinline__ int divUp(int total, int grain) { return (total + grain - 1) / grain; }; __device__ __forceinline__ int FindCells(int task, int level, int cell_begs[], char cell_code[]) const - { + { int cell_count = 0; int beg = octree.begs[task]; - int end = octree.ends[task]; + int end = octree.ends[task]; if (end - beg < max_points_per_leaf) - { + { //cell_count == 0; } else @@ -142,13 +143,13 @@ namespace pcl int cur_code = Morton::extractLevelCode(codes[beg], level); cell_begs[cell_count] = beg; - cell_code[cell_count] = cur_code; - ++cell_count; + cell_code[cell_count] = cur_code; + ++cell_count; int last_code = Morton::extractLevelCode(codes[end - 1], level); if (last_code == cur_code) { - cell_begs[cell_count] = end; + cell_begs[cell_count] = end; } else { @@ -162,7 +163,7 @@ namespace pcl } int morton_code = Morton::shiftLevelCode(search_code, level); - int pos = lower_bound(codes + beg, codes + end, morton_code, CompareByLevelCode(level)) - codes; + int pos = lower_bound(codes + beg, codes + end, 
morton_code, CompareByLevelCode(level)) - codes; if (pos == end) { @@ -175,7 +176,7 @@ namespace pcl cell_code[cell_count] = cur_code; ++cell_count; beg = pos; - } + } } } return cell_count; @@ -183,7 +184,7 @@ namespace pcl __device__ __forceinline__ void operator()() const - { + { //32 is a performance penalty step for search static_assert((max_points_per_leaf % 32) == 0, "max_points_per_leaf must be a multiple of 32"); @@ -196,7 +197,7 @@ namespace pcl octree. ends[0] = points_number; octree.parent[0] = -1; - //init shared + //init shared nodes_num = 1; tasks_beg = 0; tasks_end = 1; @@ -211,8 +212,8 @@ namespace pcl __syncthreads(); while (tasks_beg < tasks_end && level < Morton::levels) - { - int task_count = tasks_end - tasks_beg; + { + int task_count = tasks_end - tasks_beg; int iters = divUp(task_count, CTA_SIZE); int task = tasks_beg + threadIdx.x; @@ -220,14 +221,14 @@ namespace pcl //__syncthreads(); // extra?? for(int it = 0; it < iters; ++it, task += STRIDE) - { + { int cell_count = (task < tasks_end) ? FindCells(task, level, cell_begs, cell_code) : 0; - + offsets[threadIdx.x] = cell_count; __syncthreads(); scan_block(offsets); - + //__syncthreads(); //because sync is inside the scan above if (task < tasks_end) @@ -255,24 +256,24 @@ namespace pcl __syncthreads(); if (threadIdx.x == CTA_SIZE - 1) - { + { total_new += cell_count + offsets[threadIdx.x]; nodes_num += cell_count + offsets[threadIdx.x]; - } - __syncthreads(); + } + __syncthreads(); } /* for(int it = 0; it < iters; ++it, task += STRIDE) */ //__syncthreads(); //extra ?? if (threadIdx.x == CTA_SIZE - 1) - { + { tasks_beg = tasks_end; - tasks_end += total_new; + tasks_end += total_new; total_new = 0; } ++level; - __syncthreads(); + __syncthreads(); } if (threadIdx.x == CTA_SIZE - 1) @@ -285,7 +286,7 @@ namespace pcl } void pcl::device::OctreeImpl::build() -{ +{ using namespace pcl::device; host_octree.downloaded = false; @@ -293,7 +294,7 @@ void pcl::device::OctreeImpl::build() //allocatations { - //ScopeTimer timer("new_allocs"); + //ScopeTimer timer("new_allocs"); //+1 codes * points_num * sizeof(int) //+1 indices * points_num * sizeof(int) //+1 octreeGlobal.nodes * points_num * sizeof(int) @@ -306,22 +307,22 @@ void pcl::device::OctreeImpl::build() //+3 points_sorted * points_num * sizeof(float) //== - // 10 rows + // 10 rows - //left - //octreeGlobal.nodes_num * 1 * sizeof(int) + //left + //octreeGlobal.nodes_num * 1 * sizeof(int) //== - // 3 * sizeof(int) => +1 row + // 3 * sizeof(int) => +1 row const int transaction_size = 128 / sizeof(int); int cols = std::max(points_num, transaction_size * 4); int rows = 10 + 1; // = 13 - + storage.create(rows, cols); - + codes = DeviceArray(storage.ptr(0), points_num); indices = DeviceArray(storage.ptr(1), points_num); - + octreeGlobal.nodes = storage.ptr(2); octreeGlobal.codes = storage.ptr(3); octreeGlobal.begs = storage.ptr(4); @@ -332,10 +333,10 @@ void pcl::device::OctreeImpl::build() points_sorted = DeviceArray2D(3, points_num, storage.ptr(8), storage.step()); } - + { - //ScopeTimer timer("reduce-morton-sort-permutations"); - + //ScopeTimer timer("reduce-morton-sort-permutations"); + thrust::device_ptr beg(points.ptr()); thrust::device_ptr end = beg + points.size(); @@ -345,49 +346,49 @@ void pcl::device::OctreeImpl::build() atmin.x = atmin.y = atmin.z = std::numeric_limits::lowest(); atmax.w = atmin.w = 0; - //ScopeTimer timer("reduce"); + //ScopeTimer timer("reduce"); PointType minp = thrust::reduce(beg, end, atmax, SelectMinPoint()); PointType maxp = thrust::reduce(beg, 
end, atmin, SelectMaxPoint()); octreeGlobal.minp = make_float3(minp.x, minp.y, minp.z); octreeGlobal.maxp = make_float3(maxp.x, maxp.y, maxp.z); } - + thrust::device_ptr codes_beg(codes.ptr()); thrust::device_ptr codes_end = codes_beg + codes.size(); { - //ScopeTimer timer("morton"); + //ScopeTimer timer("morton"); thrust::transform(beg, end, codes_beg, CalcMorton(octreeGlobal.minp, octreeGlobal.maxp)); } thrust::device_ptr indices_beg(indices.ptr()); thrust::device_ptr indices_end = indices_beg + indices.size(); { - //ScopeTimer timer("sort"); + //ScopeTimer timer("sort"); thrust::sequence(indices_beg, indices_end); - thrust::sort_by_key(codes_beg, codes_end, indices_beg ); + thrust::sort_by_key(codes_beg, codes_end, indices_beg ); } { - ////ScopeTimer timer("perm"); + ////ScopeTimer timer("perm"); //thrust::copy(make_permutation_iterator(beg, indices_beg), - // make_permutation_iterator(end, indices_end), device_ptr(points_sorted.ptr())); + // make_permutation_iterator(end, indices_end), device_ptr(points_sorted.ptr())); + - } { thrust::device_ptr xs(points_sorted.ptr(0)); thrust::device_ptr ys(points_sorted.ptr(1)); thrust::device_ptr zs(points_sorted.ptr(2)); - //ScopeTimer timer("perm2"); + //ScopeTimer timer("perm2"); thrust::transform(make_permutation_iterator(beg, indices_beg), - make_permutation_iterator(end, indices_end), + make_permutation_iterator(end, indices_end), make_zip_iterator(make_tuple(xs, ys, zs)), PointType_to_tuple()); - + } } - + SingleStepBuild ssb; ssb.octree = octreeGlobal; ssb.codes = codes; diff --git a/gpu/octree/src/utils/approx_nearest_utils.hpp b/gpu/octree/src/utils/approx_nearest_utils.hpp index 247eda0dc9b..9bec17e74d6 100644 --- a/gpu/octree/src/utils/approx_nearest_utils.hpp +++ b/gpu/octree/src/utils/approx_nearest_utils.hpp @@ -12,6 +12,7 @@ #include "morton.hpp" #include +#include #include #include #include diff --git a/gpu/surface/src/cuda/convex_hull.cu b/gpu/surface/src/cuda/convex_hull.cu index c1f20234eba..98e89cd703d 100644 --- a/gpu/surface/src/cuda/convex_hull.cu +++ b/gpu/surface/src/cuda/convex_hull.cu @@ -61,89 +61,89 @@ namespace pcl { namespace device - { + { template struct IndOp { __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& e1, const thrust::tuple& e2) const - { + { thrust::tuple res; - + if (use_max) - res.get<0>() = fmax(e1.get<0>(), e2.get<0>()); + thrust::get<0>(res) = fmax(thrust::get<0>(e1), thrust::get<0>(e2)); else - res.get<0>() = fmin(e1.get<0>(), e2.get<0>()); + thrust::get<0>(res) = fmin(thrust::get<0>(e1), thrust::get<0>(e2)); - res.get<1>() = (res.get<0>() == e1.get<0>()) ? e1.get<1>() : e2.get<1>(); - return res; - } + thrust::get<1>(res) = (thrust::get<0>(res) == thrust::get<0>(e1)) ? 
thrust::get<1>(e1) : thrust::get<1>(e2); + return res; + } }; struct X - { - __device__ __forceinline__ - thrust::tuple + { + __device__ __forceinline__ + thrust::tuple operator()(const thrust::tuple& in) const { - return thrust::tuple(in.get<0>().x, in.get<1>()); + return thrust::tuple(thrust::get<0>(in).x, thrust::get<1>(in)); } }; struct Y - { + { __device__ __forceinline__ float operator()(const PointType& in) const { return in.y; } }; struct Z - { + { __device__ __forceinline__ float operator()(const PointType& in) const { return in.z; } }; - + struct LineDist { float3 x1, x2; LineDist(const PointType& p1, const PointType& p2) : x1(tr(p1)), x2(tr(p2)) {} - + __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& in) const - { - float3 x0 = tr(in.get<0>()); + { + float3 x0 = tr(thrust::get<0>(in)); - float dist = norm(cross(x0 - x1, x0 - x2))/norm(x1 - x2); - return thrust::tuple(dist, in.get<1>()); - } + float dist = norm(cross(x0 - x1, x0 - x2))/norm(x1 - x2); + return thrust::tuple(dist, thrust::get<1>(in)); + } }; struct PlaneDist - { + { float3 x1, n; PlaneDist(const PointType& p1, const PointType& p2, const PointType& p3) : x1(tr(p1)) { float3 x2 = tr(p2), x3 = tr(p3); n = normalized(cross(x2 - x1, x3 - x1)); } - + __device__ __forceinline__ thrust::tuple operator()(const thrust::tuple& in) const { - float3 x0 = tr(in.get<0>()); + float3 x0 = tr(thrust::get<0>(in)); float dist = std::abs(dot(n, x0 - x1)); - return thrust::tuple(dist, in.get<1>()); + return thrust::tuple(dist, thrust::get<1>(in)); } }; - + template int transform_reduce_index(It beg, It end, Unary unop, Init init, Binary binary) { thrust::counting_iterator cbeg(0); thrust::counting_iterator cend = cbeg + thrust::distance(beg, end); - - thrust::tuple t = transform_reduce( - make_zip_iterator(thrust::make_tuple(beg, cbeg)), - make_zip_iterator(thrust::make_tuple(end, cend)), + + thrust::tuple t = transform_reduce( + make_zip_iterator(thrust::make_tuple(beg, cbeg)), + make_zip_iterator(thrust::make_tuple(end, cend)), unop, init, binary); - - return t.get<1>(); + + return thrust::get<1>(t); } template @@ -158,35 +158,35 @@ namespace pcl { thrust::tuple max_tuple(std::numeric_limits::min(), 0); return transform_reduce_index(beg, end, unop, max_tuple, IndOp()); - } + } } } pcl::device::PointStream::PointStream(const Cloud& cloud_) : cloud(cloud_) -{ +{ cloud_size = cloud.size(); facets_dists.create(cloud_size); perm.create(cloud_size); - thrust::device_ptr pbeg(perm.ptr()); + thrust::device_ptr pbeg(perm.ptr()); thrust::sequence(pbeg, pbeg + cloud_size); } void pcl::device::PointStream::computeInitalSimplex() { - thrust::device_ptr beg(cloud.ptr()); + thrust::device_ptr beg(cloud.ptr()); thrust::device_ptr end = beg + cloud_size; - + int minx = transform_reduce_min_index(beg, end, X()); int maxx = transform_reduce_max_index(beg, end, X()); PointType p1 = *(beg + minx); PointType p2 = *(beg + maxx); - + int maxl = transform_reduce_max_index(beg, end, LineDist(p1, p2)); PointType p3 = *(beg + maxl); - + int maxp = transform_reduce_max_index(beg, end, PlaneDist(p1, p2, p3)); PointType p4 = *(beg + maxp); @@ -194,12 +194,12 @@ void pcl::device::PointStream::computeInitalSimplex() simplex.x1 = tr(p1); simplex.x2 = tr(p2); simplex.x3 = tr(p3); simplex.x4 = tr(p4); simplex.i1 = minx; simplex.i2 = maxx; simplex.i3 = maxl; simplex.i4 = maxp; - float maxy = transform_reduce(beg, end, Y(), std::numeric_limits::min(), thrust::maximum()); - float miny = transform_reduce(beg, end, Y(), std::numeric_limits::max(), 
thrust::minimum()); + float maxy = transform_reduce(beg, end, Y(), std::numeric_limits::min(), thrust::maximum()); + float miny = transform_reduce(beg, end, Y(), std::numeric_limits::max(), thrust::minimum()); + + float maxz = transform_reduce(beg, end, Z(), std::numeric_limits::min(), thrust::maximum()); + float minz = transform_reduce(beg, end, Z(), std::numeric_limits::max(), thrust::minimum()); - float maxz = transform_reduce(beg, end, Z(), std::numeric_limits::min(), thrust::maximum()); - float minz = transform_reduce(beg, end, Z(), std::numeric_limits::max(), thrust::minimum()); - float dx = (p2.x - p1.x); float dy = (maxy - miny); float dz = (maxz - minz); @@ -209,7 +209,7 @@ void pcl::device::PointStream::computeInitalSimplex() simplex.p1 = compute_plane(simplex.x4, simplex.x2, simplex.x3, simplex.x1); simplex.p2 = compute_plane(simplex.x3, simplex.x1, simplex.x4, simplex.x2); simplex.p3 = compute_plane(simplex.x2, simplex.x1, simplex.x4, simplex.x3); - simplex.p4 = compute_plane(simplex.x1, simplex.x2, simplex.x3, simplex.x4); + simplex.p4 = compute_plane(simplex.x1, simplex.x2, simplex.x3, simplex.x4); } namespace pcl @@ -217,7 +217,7 @@ namespace pcl namespace device { __global__ void init_fs(int i1, int i2, int i3, int i4, PtrStep verts_inds) - { + { *(int4*)verts_inds.ptr(0) = make_int4(i2, i1, i1, i1); *(int4*)verts_inds.ptr(1) = make_int4(i3, i3, i2, i2); *(int4*)verts_inds.ptr(2) = make_int4(i4, i4, i4, i3); @@ -227,10 +227,10 @@ namespace pcl } void pcl::device::FacetStream::setInitialFacets(const InitalSimplex& s) -{ - init_fs<<<1, 1>>>(s.i1, s.i2, s.i3, s.i4, verts_inds); +{ + init_fs<<<1, 1>>>(s.i1, s.i2, s.i3, s.i4, verts_inds); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); facet_count = 4; } @@ -245,20 +245,20 @@ namespace pcl { float diag; float4 pl1, pl2, pl3, pl4; - - InitalClassify(const float4& p1, const float4& p2, const float4& p3, const float4& p4, float diagonal) + + InitalClassify(const float4& p1, const float4& p2, const float4& p3, const float4& p4, float diagonal) : diag(diagonal), pl1(p1), pl2(p2), pl3(p3), pl4(p4) - { + { pl1 *= compue_inv_normal_norm(pl1); pl2 *= compue_inv_normal_norm(pl2); pl3 *= compue_inv_normal_norm(pl3); pl4 *= compue_inv_normal_norm(pl4); } - + __device__ __forceinline__ - std::uint64_t + std::uint64_t operator()(const PointType& p) const - { + { float4 x = p; x.w = 1; @@ -270,7 +270,7 @@ namespace pcl float dists[] = { d0, d1, d2, d3 }; int negs_inds[4]; int neg_count = 0; - + int idx = std::numeric_limits::max(); float dist = 0; @@ -283,7 +283,7 @@ namespace pcl { int i1 = negs_inds[1]; int i2 = negs_inds[2]; - + int ir = std::abs(dists[i1]) < std::abs(dists[i2]) ? i2 : i1; negs_inds[1] = ir; --neg_count; @@ -293,10 +293,10 @@ namespace pcl { int i1 = negs_inds[0]; int i2 = negs_inds[1]; - + int ir = std::abs(dists[i1]) < std::abs(dists[i2]) ? 
i2 : i1; negs_inds[0] = ir; - --neg_count; + --neg_count; } if (neg_count == 1) @@ -311,28 +311,28 @@ namespace pcl std::uint64_t res = idx; res <<= 32; return res + *reinterpret_cast(&dist); - } - }; + } + }; - __global__ void initalClassifyKernel(const InitalClassify ic, const PointType* points, int cloud_size, std::uint64_t* output) - { + __global__ void initalClassifyKernel(const InitalClassify ic, const PointType* points, int cloud_size, std::uint64_t* output) + { int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < cloud_size) - output[index] = ic(points[index]); + if (index < cloud_size) + output[index] = ic(points[index]); } } } void pcl::device::PointStream::initalClassify() -{ +{ //thrust::device_ptr beg(cloud.ptr()); //thrust::device_ptr end = beg + cloud_size; thrust::device_ptr out(facets_dists.ptr()); - + InitalClassify ic(simplex.p1, simplex.p2, simplex.p3, simplex.p4, cloud_diag); //thrust::transform(beg, end, out, ic); - + //printFuncAttrib(initalClassifyKernel); initalClassifyKernel<<>>(ic, cloud, cloud_size, facets_dists); @@ -350,9 +350,9 @@ namespace pcl { namespace device { - __device__ int new_cloud_size; + __device__ int new_cloud_size; struct SearchFacetHeads - { + { std::uint64_t *facets_dists; int cloud_size; int facet_count; @@ -361,25 +361,25 @@ namespace pcl mutable int* head_points; //bool logger; - + __device__ __forceinline__ void operator()(int facet) const - { + { const std::uint64_t* b = facets_dists; const std::uint64_t* e = b + cloud_size; bool last_thread = facet == facet_count; - int search_value = !last_thread ? facet : std::numeric_limits::max(); - int index = lower_bound(b, e, search_value, LessThanByFacet()) - b; - + int search_value = !last_thread ? facet : std::numeric_limits::max(); + int index = lower_bound(b, e, search_value, LessThanByFacet()) - b; + if (last_thread) new_cloud_size = index; else { bool not_found = index == cloud_size || (facet != (facets_dists[index] >> 32)); - head_points[facet] = not_found ? -1 : perm[index]; + head_points[facet] = not_found ? 
-1 : perm[index]; } } }; @@ -403,18 +403,18 @@ int pcl::device::PointStream::searchFacetHeads(std::size_t facet_count, DeviceAr sfh.facet_count = (int)facet_count; sfh.perm = perm; sfh.points = cloud.ptr(); - sfh.head_points = head_points; - + sfh.head_points = head_points; + //thrust::counting_iterator b(0); - //thrust::counting_iterator e = b + facet_count + 1; + //thrust::counting_iterator e = b + facet_count + 1; //thrust::for_each(b, e, sfh); searchFacetHeadsKernel<<>>(sfh); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); int new_size; - cudaSafeCall( cudaMemcpyFromSymbol( (void*)&new_size, pcl::device::new_cloud_size, sizeof(new_size)) ); + cudaSafeCall( cudaMemcpyFromSymbol( (void*)&new_size, pcl::device::new_cloud_size, sizeof(new_size)) ); return new_size; } @@ -434,7 +434,7 @@ namespace pcl struct Compaction { - enum + enum { CTA_SIZE = 256, @@ -443,18 +443,18 @@ namespace pcl int* head_points_in; PtrStep verts_inds_in; - + int *scan_buffer; int facet_count; mutable int* head_points_out; mutable PtrStep verts_inds_out; - + mutable PtrStep empty_facets; mutable int *empty_count; - + __device__ __forceinline__ void operator()() const { @@ -473,16 +473,16 @@ namespace pcl int offset = scan_buffer[idx]; head_points_out[offset] = head_idx; - + verts_inds_out.ptr(0)[offset] = verts_inds_in.ptr(0)[idx]; verts_inds_out.ptr(1)[offset] = verts_inds_in.ptr(1)[idx]; verts_inds_out.ptr(2)[offset] = verts_inds_in.ptr(2)[idx]; - - + + } - else - empty = 1; + else + empty = 1; } int total = __popc (__ballot_sync (__activemask (), empty)); @@ -498,7 +498,7 @@ namespace pcl if (laneid == 0) { int old = atomicAdd(empty_count, total); - wapr_buffer[warpid] = old; + wapr_buffer[warpid] = old; } int old = wapr_buffer[warpid]; @@ -506,11 +506,11 @@ namespace pcl { empty_facets.ptr(0)[old + offset] = verts_inds_in.ptr(0)[idx]; empty_facets.ptr(1)[old + offset] = verts_inds_in.ptr(1)[idx]; - empty_facets.ptr(2)[old + offset] = verts_inds_in.ptr(2)[idx]; + empty_facets.ptr(2)[old + offset] = verts_inds_in.ptr(2)[idx]; int a1 = verts_inds_in.ptr(0)[idx], a2 = verts_inds_in.ptr(1)[idx], a3 = verts_inds_in.ptr(2)[idx]; } - } + } } }; @@ -521,19 +521,19 @@ namespace pcl void pcl::device::FacetStream::compactFacets() { - int old_empty_count; - empty_count.download(&old_empty_count); + int old_empty_count; + empty_count.download(&old_empty_count); thrust::device_ptr b(head_points.ptr()); thrust::device_ptr e = b + facet_count; thrust::device_ptr o(scan_buffer.ptr()); - - thrust::transform_exclusive_scan(b, e, o, NotMinus1(), 0, thrust::plus()); - + + thrust::transform_exclusive_scan(b, e, o, NotMinus1(), 0, thrust::plus()); + Compaction c; c.verts_inds_in = verts_inds; - c.head_points_in = head_points; + c.head_points_in = head_points; c.scan_buffer = scan_buffer; c.facet_count = facet_count; @@ -543,20 +543,20 @@ void pcl::device::FacetStream::compactFacets() c.empty_facets = empty_facets; c.empty_count = empty_count; - + int block = Compaction::CTA_SIZE; int grid = divUp(facet_count, block); - compactionKernel<<>>(c); + compactionKernel<<>>(c); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); - + verts_inds.swap(verts_inds2); head_points.swap(head_points2); - int new_empty_count; - empty_count.download(&new_empty_count); - + int new_empty_count; + empty_count.download(&new_empty_count); + facet_count -= new_empty_count - old_empty_count; } @@ -583,11 +583,11 @@ namespace pcl int facet_count; - __device__ 
__forceinline__ + __device__ __forceinline__ void operator()(int point_idx) const { int perm_index = perm[point_idx]; - + int facet = facets_dists[point_idx] >> 32; facet = scan_buffer[facet]; @@ -596,10 +596,10 @@ namespace pcl if (hi == perm_index) { std::uint64_t res = std::numeric_limits::max(); - res <<= 32; + res <<= 32; facets_dists[point_idx] = res; } - else + else { int i1 = verts_inds.ptr(0)[facet]; @@ -610,16 +610,16 @@ namespace pcl float3 v1 = tr( points[ i1 ] ); float3 v2 = tr( points[ i2 ] ); float3 v3 = tr( points[ i3 ] ); - + float4 p0 = compute_plane(hp, v1, v2, /*opposite*/v3); // j float4 p1 = compute_plane(hp, v2, v3, /*opposite*/v1); // facet_count + j - float4 p2 = compute_plane(hp, v3, v1, /*opposite*/v2); // facet_count + j*2 + float4 p2 = compute_plane(hp, v3, v1, /*opposite*/v2); // facet_count + j*2 p0 *= compue_inv_normal_norm(p0); p1 *= compue_inv_normal_norm(p1); p2 *= compue_inv_normal_norm(p2); - + float4 p = points[perm_index]; p.w = 1; @@ -640,12 +640,12 @@ namespace pcl for(int i = 0; i < 3; ++i) if (dists[i] < 0) negs_inds[neg_count++] = i; - + if (neg_count == 3) { int i1 = negs_inds[1]; int i2 = negs_inds[2]; - + int ir = std::abs(dists[i1]) < std::abs(dists[i2]) ? i2 : i1; negs_inds[1] = ir; --neg_count; @@ -655,10 +655,10 @@ namespace pcl { int i1 = negs_inds[0]; int i2 = negs_inds[1]; - + int ir = std::abs(dists[i1]) < std::abs(dists[i2]) ? i2 : i1; negs_inds[0] = ir; - --neg_count; + --neg_count; } if (neg_count == 1) @@ -670,16 +670,16 @@ namespace pcl // if (neg_count == 0) // new_idx = std::numeric_limits::max() ==>> internal point - + std::uint64_t res = new_idx; res <<= 32; res += *reinterpret_cast(&dist); facets_dists[point_idx] = res; - } /* if (hi == perm_index) */ + } /* if (hi == perm_index) */ } - }; + }; __global__ void classifyKernel(const Classify c, int cloud_size) { @@ -692,7 +692,7 @@ namespace pcl } void pcl::device::PointStream::classify(FacetStream& fs) -{ +{ Classify c; c.facets_dists = facets_dists; @@ -706,16 +706,16 @@ void pcl::device::PointStream::classify(FacetStream& fs) c.diag = cloud_diag; c.facet_count = fs.facet_count; - //thrust::counting_iterator b(0); + //thrust::counting_iterator b(0); //thrust::for_each(b, b + cloud_size, c); classifyKernel<<>>(c, cloud_size); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); - + thrust::device_ptr beg(facets_dists.ptr()); thrust::device_ptr end = beg + cloud_size; - + thrust::device_ptr pbeg(perm.ptr()); thrust::sort_by_key(beg, end, pbeg); } @@ -731,14 +731,14 @@ namespace pcl mutable PtrStep verts_inds; - __device__ __forceinline__ + __device__ __forceinline__ void operator()(int facet) const { int hi = head_points[facet]; int i1 = verts_inds.ptr(0)[facet]; int i2 = verts_inds.ptr(1)[facet]; int i3 = verts_inds.ptr(2)[facet]; - + make_facet(hi, i1, i2, facet); make_facet(hi, i2, i3, facet + facet_count); make_facet(hi, i3, i1, facet + facet_count * 2); @@ -757,8 +757,8 @@ namespace pcl { int facet = threadIdx.x + blockIdx.x * blockDim.x; - if (facet < sf.facet_count) - sf(facet); + if (facet < sf.facet_count) + sf(facet); } } } @@ -769,9 +769,9 @@ void pcl::device::FacetStream::splitFacets() sf.head_points = head_points; sf.verts_inds = verts_inds; sf.facet_count = facet_count; - - //thrust::counting_iterator b(0); + + //thrust::counting_iterator b(0); //thrust::for_each(b, b + facet_count, sf); splitFacetsKernel<<>>(sf); @@ -786,8 +786,8 @@ size_t pcl::device::remove_duplicates(DeviceArray& indeces) thrust::device_ptr beg(indeces.ptr()); 
thrust::device_ptr end = beg + indeces.size(); - thrust::sort(beg, end); - return (std::size_t)(thrust::unique(beg, end) - beg); + thrust::sort(beg, end); + return (std::size_t)(thrust::unique(beg, end) - beg); } @@ -810,15 +810,15 @@ void pcl::device::pack_hull(const DeviceArray& points, const DeviceAr { output.create(indeces.size()); - //thrust::device_ptr in(points.ptr()); - + //thrust::device_ptr in(points.ptr()); + //thrust::device_ptr mb(indeces.ptr()); //thrust::device_ptr me = mb + indeces.size(); - //thrust::device_ptr out(output.ptr()); + //thrust::device_ptr out(output.ptr()); //thrust::gather(mb, me, in, out); - + gatherKernel<<>>(indeces, points, output); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() );
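
The change repeated throughout centroid.cu, octree_builder.cu and convex_hull.cu is switching tuple element access from the member form `t.get<N>()` to the free function `thrust::get<N>(t)`, which is the form newer Thrust releases (where `thrust::tuple` is backed by `cuda::std::tuple`) still provide. A minimal, self-contained sketch of the same pattern — an argmax reduction over a zip of values and indices, as in `getMaxDistance` / `transform_reduce_index` — follows; `MaxByValue` and `Identity` are illustrative stand-ins, not functors from the patch:

```cpp
// Sketch only: free-function tuple access plus a (value, index) reduction.
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <cstdio>

// Keeps the larger of two (value, index) pairs.
struct MaxByValue
{
  __host__ __device__
  thrust::tuple<float, int> operator()(const thrust::tuple<float, int>& a,
                                       const thrust::tuple<float, int>& b) const
  {
    // thrust::get<0>(a), not a.get<0>()
    return thrust::get<0>(a) > thrust::get<0>(b) ? a : b;
  }
};

// Pass-through unary op so transform_reduce can be used as a plain reduce.
struct Identity
{
  __host__ __device__
  thrust::tuple<float, int> operator()(const thrust::tuple<float, int>& t) const { return t; }
};

int main()
{
  thrust::device_vector<float> v(4);
  v[0] = 1.f; v[1] = 5.f; v[2] = 3.f; v[3] = 2.f;

  thrust::counting_iterator<int> idx(0);
  auto beg = thrust::make_zip_iterator(thrust::make_tuple(v.begin(), idx));
  auto end = thrust::make_zip_iterator(thrust::make_tuple(v.end(), idx + v.size()));

  thrust::tuple<float, int> init(-1.f, -1);
  thrust::tuple<float, int> best =
      thrust::transform_reduce(beg, end, Identity(), init, MaxByValue());

  std::printf("max %.1f at index %d\n", thrust::get<0>(best), thrust::get<1>(best));
  return 0;
}
```

Compiled with nvcc as a .cu file this prints `max 5.0 at index 1`.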
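`compute3DCentroid` with an index list reduces only the selected points by walking a `permutation_iterator` over the cloud. A sketch under assumed float4 points, with `ToFloat3` standing in for the patch's `PointT2float3` and `PlusFloat3` mirroring the functor in the diff:

```cpp
// Sketch only: centroid of an indexed subset via permutation_iterator.
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/permutation_iterator.h>
#include <cstdio>

struct ToFloat3   // stand-in for PointT2float3
{
  __host__ __device__ float3 operator()(const float4& p) const
  { return make_float3(p.x, p.y, p.z); }
};

struct PlusFloat3 // mirrors the patch's PlusFloat3
{
  __host__ __device__ float3 operator()(const float3& a, const float3& b) const
  { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
};

int main()
{
  thrust::device_vector<float4> cloud(3);
  cloud[0] = make_float4(0.f, 0.f, 0.f, 0.f);
  cloud[1] = make_float4(2.f, 2.f, 2.f, 0.f);
  cloud[2] = make_float4(9.f, 9.f, 9.f, 0.f); // excluded by the index set

  thrust::device_vector<int> indices(2);
  indices[0] = 0; indices[1] = 1;

  auto beg = thrust::make_permutation_iterator(cloud.begin(), indices.begin());
  auto end = thrust::make_permutation_iterator(cloud.begin(), indices.end());

  float3 sum = thrust::transform_reduce(beg, end, ToFloat3(),
                                        make_float3(0.f, 0.f, 0.f), PlusFloat3());
  float3 c = make_float3(sum.x / indices.size(),
                         sum.y / indices.size(),
                         sum.z / indices.size());
  std::printf("centroid: %.1f %.1f %.1f\n", c.x, c.y, c.z);
  return 0;
}
```

With the two selected points above, this prints `centroid: 1.0 1.0 1.0`.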
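The octree builder's sort step pairs `thrust::sequence` with `thrust::sort_by_key`: indices start as 0..n-1 and end up as the permutation that orders points by Morton code, which the later permutation-iterator transform uses to fill `points_sorted`. A small sketch with hypothetical code values:

```cpp
// Sketch only: "sort codes, carry indices" as in OctreeImpl::build().
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <cstdio>

int main()
{
  // Hypothetical pre-computed Morton codes, one per point.
  thrust::device_vector<unsigned int> codes(4);
  codes[0] = 7; codes[1] = 1; codes[2] = 5; codes[3] = 3;

  thrust::device_vector<int> indices(codes.size());
  thrust::sequence(indices.begin(), indices.end());              // 0 1 2 3
  thrust::sort_by_key(codes.begin(), codes.end(), indices.begin());

  for (std::size_t i = 0; i < codes.size(); ++i)
    std::printf("code %u came from point %d\n",
                (unsigned int)codes[i], (int)indices[i]);
  return 0;
}
```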
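`FacetStream::compactFacets` sizes its compaction with `thrust::transform_exclusive_scan` over the head-point array: facets whose head point is -1 contribute 0, all others 1, so each surviving facet's scan value is its output slot. The patch only calls `NotMinus1()`; the definition below is an assumed equivalent, not taken from the source:

```cpp
// Sketch only: exclusive scan of a keep-flag to compute compaction offsets.
#include <thrust/device_vector.h>
#include <thrust/transform_scan.h>
#include <thrust/functional.h>
#include <cstdio>

struct NotMinus1   // assumed: 1 for facets that still have a head point
{
  __host__ __device__ int operator()(int head) const { return head == -1 ? 0 : 1; }
};

int main()
{
  thrust::device_vector<int> head_points(5);
  head_points[0] = 10; head_points[1] = -1; head_points[2] = 42;
  head_points[3] = -1; head_points[4] = 7;

  thrust::device_vector<int> scan(head_points.size());
  thrust::transform_exclusive_scan(head_points.begin(), head_points.end(),
                                   scan.begin(), NotMinus1(), 0, thrust::plus<int>());

  // scan now holds 0 1 1 2 2: the compaction kernel writes kept facet i
  // to output slot scan[i].
  for (std::size_t i = 0; i < scan.size(); ++i)
    std::printf("%d ", (int)scan[i]);
  std::printf("\n");
  return 0;
}
```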