diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h
new file mode 100644
index 0000000000000..c477922e59533
--- /dev/null
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameChunk.h
@@ -0,0 +1,150 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+///
+
+#ifndef TRACKINGITSGPU_INCLUDE_TIMEFRAMECHUNKGPU_H
+#define TRACKINGITSGPU_INCLUDE_TIMEFRAMECHUNKGPU_H
+
+#include "ITStracking/Configuration.h"
+#include "ITStracking/TimeFrame.h"
+
+#include "ITStrackingGPU/ClusterLinesGPU.h"
+#include "ITStrackingGPU/Array.h"
+#include "ITStrackingGPU/Vector.h"
+#include "ITStrackingGPU/Stream.h"
+
+#include <gsl/gsl>
+
+namespace o2::its::gpu
+{
+template <int nLayers = 7>
+struct StaticTrackingParameters {
+  StaticTrackingParameters<nLayers>& operator=(const StaticTrackingParameters<nLayers>& t) = default;
+  void set(const TrackingParameters& pars)
+  {
+    ClusterSharing = pars.ClusterSharing;
+    MinTrackLength = pars.MinTrackLength;
+    NSigmaCut = pars.NSigmaCut;
+    PVres = pars.PVres;
+    DeltaROF = pars.DeltaROF;
+    ZBins = pars.ZBins;
+    PhiBins = pars.PhiBins;
+    CellDeltaTanLambdaSigma = pars.CellDeltaTanLambdaSigma;
+  }
+
+  /// General parameters
+  int ClusterSharing = 0;
+  int MinTrackLength = nLayers;
+  float NSigmaCut = 5;
+  float PVres = 1.e-2f;
+  int DeltaROF = 0;
+  int ZBins{256};
+  int PhiBins{128};
+
+  /// Cell finding cuts
+  float CellDeltaTanLambdaSigma = 0.007f;
+};
+
+template <int nLayers = 7>
+class GpuTimeFrameChunk
+{
+ public:
+  static size_t computeScalingSizeBytes(const int, const TimeFrameGPUParameters&);
+  static size_t computeFixedSizeBytes(const TimeFrameGPUParameters&);
+  static size_t computeRofPerChunk(const TimeFrameGPUParameters&, const size_t);
+
+  GpuTimeFrameChunk() = delete;
+  GpuTimeFrameChunk(o2::its::TimeFrame* tf, TimeFrameGPUParameters& conf)
+  {
+    mTimeFramePtr = tf;
+    mTFGPUParams = &conf;
+  }
+  ~GpuTimeFrameChunk();
+
+  /// Most relevant operations
+  void allocate(const size_t, Stream&);
+  void reset(const Task, Stream&);
+  size_t loadDataOnDevice(const size_t, const size_t, const int, Stream&);
+
+  /// Interface
+  Cluster* getDeviceClusters(const int);
+  int* getDeviceClusterExternalIndices(const int);
+  int* getDeviceIndexTables(const int);
+  Tracklet* getDeviceTracklets(const int);
+  int* getDeviceTrackletsLookupTables(const int);
+  CellSeed* getDeviceCells(const int);
+  int* getDeviceCellsLookupTables(const int);
+  int* getDeviceRoadsLookupTables(const int);
+  TimeFrameGPUParameters* getTimeFrameGPUParameters() const { return mTFGPUParams; }
+
+  int* getDeviceCUBTmpBuffer() { return mCUBTmpBufferDevice; }
+  int* getDeviceFoundTracklets() { return mFoundTrackletsDevice; }
+  int* getDeviceNFoundCells() { return mNFoundCellsDevice; }
+  int* getDeviceCellNeigboursLookupTables(const int);
+  int* getDeviceCellNeighbours(const int);
+  CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; }
+  int** getDeviceArrayNeighboursCell() const { return mNeighboursCellDeviceArray; }
+  int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLookupTablesDeviceArray; }
+
+  /// Vertexer only
+  int* getDeviceNTrackletCluster(const int combid) { return mNTrackletsPerClusterDevice[combid]; }
+  Line* getDeviceLines() { return mLinesDevice; };
+  int* getDeviceNFoundLines() { return mNFoundLinesDevice; }
+  int* getDeviceNExclusiveFoundLines() { return mNExclusiveFoundLinesDevice; }
+  unsigned char* getDeviceUsedTracklets() { return mUsedTrackletsDevice; }
+  int* getDeviceClusteredLines() { return mClusteredLinesDevice; }
+  size_t getNPopulatedRof() const { return mNPopulatedRof; }
+
+ private:
+  /// Host
+  std::array<gsl::span<const Cluster>, nLayers> mHostClusters;
+  std::array<gsl::span<const int>, nLayers> mHostIndexTables;
+
+  /// Device
+  std::array<Cluster*, nLayers> mClustersDevice;
+  std::array<int*, nLayers> mClusterExternalIndicesDevice;
+  std::array<int*, nLayers> mIndexTablesDevice;
+  std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
+  std::array<int*, nLayers - 1> mTrackletsLookupTablesDevice;
+  std::array<CellSeed*, nLayers - 2> mCellsDevice;
+  // Road<nLayers - 2>* mRoadsDevice;
+  std::array<int*, nLayers - 2> mCellsLookupTablesDevice;
+  std::array<int*, nLayers - 3> mNeighboursCellDevice;
+  std::array<int*, nLayers - 3> mNeighboursCellLookupTablesDevice;
+  std::array<int*, nLayers - 2> mRoadsLookupTablesDevice;
+
+  // These are to make them accessible using layer index
+  CellSeed** mCellsDeviceArray;
+  int** mNeighboursCellDeviceArray;
+  int** mNeighboursCellLookupTablesDeviceArray;
+
+  // Small accessory buffers
+  int* mCUBTmpBufferDevice;
+  int* mFoundTrackletsDevice;
+  int* mNFoundCellsDevice;
+
+  /// Vertexer only
+  Line* mLinesDevice;
+  int* mNFoundLinesDevice;
+  int* mNExclusiveFoundLinesDevice;
+  unsigned char* mUsedTrackletsDevice;
+  std::array<int*, 2> mNTrackletsPerClusterDevice;
+  int* mClusteredLinesDevice;
+
+  /// State and configuration
+  bool mAllocated = false;
+  size_t mNRof = 0;
+  size_t mNPopulatedRof = 0;
+  o2::its::TimeFrame* mTimeFramePtr = nullptr;
+  TimeFrameGPUParameters* mTFGPUParams = nullptr;
+};
+} // namespace o2::its::gpu
+#endif
\ No newline at end of file
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index 73955be325ff7..db1bfd836e8e6 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -20,23 +20,14 @@
 #include "ITStrackingGPU/Array.h"
 #include "ITStrackingGPU/Vector.h"
 #include "ITStrackingGPU/Stream.h"
+#include "ITStrackingGPU/TimeFrameChunk.h"
 
 #include <gsl/gsl>
 
 namespace o2
 {
-namespace gpu
-{
-class GPUChainITS;
-}
 namespace its
 {
-template <typename T1, typename T2>
-struct gpuPair {
-  T1 first;
-  T2 second;
-};
-
 namespace gpu
 {
 
@@ -44,132 +35,6 @@
 class DefaultGPUAllocator : public ExternalAllocator
 {
   void* allocate(size_t size) override;
 };
-[… ~125 deleted lines: StaticTrackingParameters<nLayers>, enum class Task { Tracker, Vertexer } and the full GpuTimeFrameChunk<nLayers> declaration, moved verbatim to TimeFrameChunk.h above …]
 
 template <int nLayers = 7>
 class TimeFrameGPU : public TimeFrame
@@ -191,13 +56,19 @@ class TimeFrameGPU : public TimeFrame
   void loadClustersDevice();
   void loadTrackletsDevice();
   void loadCellsDevice();
+  void loadCellsLUT();
   void loadTrackSeedsDevice();
   void loadTrackSeedsChi2Device();
   void loadRoadsDevice();
   void loadTrackSeedsDevice(std::vector<CellSeed>&);
-  void createCellNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
+  void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
+  void createNeighboursLUTDevice(const int, const unsigned int);
   void createTrackITSExtDevice(std::vector<CellSeed>&);
   void downloadTrackITSExtDevice(std::vector<CellSeed>&);
+  void downloadCellsNeighbours(std::vector<std::vector<std::pair<int, int>>>&, const int);
+  void downloadNeighboursLUT(std::vector<int>&, const int);
+  void downloadCellsDevice(const int);
+  void unregisterRest();
   void initDeviceChunks(const int, const int);
   template <Task task>
   size_t loadChunkData(const size_t, const size_t, const size_t);
@@ -224,6 +95,7 @@ class TimeFrameGPU : public TimeFrame
   // Hybrid
   Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
   TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
+  int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
   gpuPair<int, int>* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
   TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
   // TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() { return mTrackingFrameInfoDeviceArray; }
@@ -231,10 +103,14 @@ class TimeFrameGPU : public TimeFrame
   Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
   Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
   Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; }
+  int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; }
+  int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; }
   CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; }
   CellSeed* getDeviceTrackSeeds() { return mTrackSeedsDevice; }
   o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
   float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
+  int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
+
   void setDevicePropagator(const o2::base::PropagatorImpl<float>*) override;
 
   // Host-specific getters
@@ -263,7 +139,13 @@ class TimeFrameGPU : public TimeFrame
   Cluster** mUnsortedClustersDeviceArray;
   std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
   Tracklet** mTrackletsDeviceArray;
+  std::array<int*, nLayers - 2> mCellsLUTDevice;
+  std::array<int*, nLayers - 2> mNeighboursLUTDevice;
+  int** mCellsLUTDeviceArray;
+  int** mNeighboursCellDeviceArray;
+  int** mNeighboursCellLUTDeviceArray;
   std::array<CellSeed*, nLayers - 2> mCellsDevice;
+  std::array<int*, nLayers - 2> mNeighboursIndexTablesDevice;
   CellSeed* mTrackSeedsDevice;
   CellSeed** mCellsDeviceArray;
   std::array mCellSeedsDevice;
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
index cc74456bbb1aa..167baa905f790 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -49,6 +49,37 @@ GPUg() void fitTrackSeedsKernel(
   const o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorF::MatCorrType::USEMatCorrLUT);
 #endif
 } // namespace gpu
+void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
+                                int* neighboursLUTs,
+                                int** cellsLUTs,
+                                gpuPair<int, int>* cellNeighbours,
+                                int* neighboursIndexTable,
+                                const float maxChi2ClusterAttachment,
+                                const float bz,
+                                const int layerIndex,
+                                const unsigned int nCells,
+                                const unsigned int nCellsNext,
+                                const int maxCellNeighbours,
+                                const int nBlocks,
+                                const int nThreads);
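// [editor's aside, not part of the patch] count/compute look like the usual
// two-pass "count, prefix-sum, fill" pattern: the count pass writes only
// per-cell multiplicities into neighboursLUTs; an exclusive prefix sum then
// turns those counts into write offsets, after which the compute pass fills
// the actual gpuPair<int, int> entries. A hedged sketch of the calling order:
//   countCellNeighboursHandler(/* ... */);   // pass 1: count neighbours per cell
//   // exclusive scan over the LUT (e.g. thrust::exclusive_scan)
//   computeCellNeighboursHandler(/* ... */); // pass 2: write the pairs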
+
+void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
+                                  int* neighboursLUTs,
+                                  int** cellsLUTs,
+                                  gpuPair<int, int>* cellNeighbours,
+                                  int* neighboursIndexTable,
+                                  const float maxChi2ClusterAttachment,
+                                  const float bz,
+                                  const int layerIndex,
+                                  const unsigned int nCells,
+                                  const unsigned int nCellsNext,
+                                  const int maxCellNeighbours,
+                                  const int nBlocks,
+                                  const int nThreads);
+
+void filterCellNeighboursHandler(std::vector<int>&,
+                                 gpuPair<int, int>*,
+                                 unsigned int);
 
 void trackSeedHandler(CellSeed* trackSeeds,
                       const TrackingFrameInfo** foundTrackingFrameInfo,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
index 42b2777d9b4dd..66244bf854b5f 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
@@ -23,8 +23,20 @@
 namespace o2
 {
 namespace its
 {
+template <typename T1, typename T2>
+struct gpuPair {
+  T1 first;
+  T2 second;
+};
+
 namespace gpu
 {
+
+enum class Task {
+  Tracker = 0,
+  Vertexer = 1
+};
+
 template <typename T>
 GPUhd() T* getPtrFromRuler(int index, T* src, const int* ruler, const int stride = 1)
 {
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
index 2f8422112dc58..c8e1d0a910e5b 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
@@ -13,13 +13,14 @@
 if(CUDA_ENABLED)
   find_package(CUDAToolkit)
   message(STATUS "Building ITS CUDA tracker")
-
+# add_compile_options(-O0 -g -lineinfo -fPIC)
 o2_add_library(ITStrackingCUDA
                SOURCES ClusterLinesGPU.cu
                        Context.cu
                        Stream.cu
                        TrackerTraitsGPU.cxx
                        TimeFrameGPU.cu
+                       TimeFrameChunk.cu
                        TracerGPU.cu
                        TrackingKernels.cu
                        VertexingKernels.cu
@@ -31,7 +32,7 @@ o2_add_library(ITStrackingCUDA
                        O2::SimulationDataFormat
                        O2::ReconstructionDataFormats
                        O2::GPUCommon
-                       CUDA::nvToolsExt # TODO: change to CUDA::nvtx3 when CMake bump >= 3.25
+                       CUDA::nvToolsExt
                PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider
                TARGETVARNAME targetName)
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu
new file mode 100644
index 0000000000000..8353b6ff0aa8b
--- /dev/null
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameChunk.cu
@@ -0,0 +1,293 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
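// [editor's aside, not part of the patch] The chunk sizing implemented in this
// file reduces to the following arithmetic, mirroring computeRofPerChunk()
// below (`m` is the memory budget in GB, split evenly across the chunks):
//   float chunkCapacity = m * GB / (float)config.nTimeFrameChunks;
//   size_t rofPerChunk = (chunkCapacity - computeFixedSizeBytes(config)) /
//                        (float)computeScalingSizeBytes(1, config);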
+ +#include +#include +#include + +#include "ITStracking/Constants.h" + +#include "ITStrackingGPU/Utils.h" +#include "ITStrackingGPU/TracerGPU.h" + +#include "ITStrackingGPU/TimeFrameChunk.h" + +#include +#include + +#include "GPUCommonDef.h" +#include "GPUCommonMath.h" +#include "GPUCommonLogger.h" + +#ifndef __HIPCC__ +#define THRUST_NAMESPACE thrust::cuda +#else +#define THRUST_NAMESPACE thrust::hip +#endif + +namespace o2::its +{ +using constants::GB; +using constants::MB; +namespace gpu +{ +using utils::checkGPUError; + +template +GpuTimeFrameChunk::~GpuTimeFrameChunk() +{ + if (mAllocated) { + for (int i = 0; i < nLayers; ++i) { + checkGPUError(cudaFree(mClustersDevice[i])); + // checkGPUError(cudaFree(mTrackingFrameInfoDevice[i])); + checkGPUError(cudaFree(mClusterExternalIndicesDevice[i])); + checkGPUError(cudaFree(mIndexTablesDevice[i])); + if (i < nLayers - 1) { + checkGPUError(cudaFree(mTrackletsDevice[i])); + checkGPUError(cudaFree(mTrackletsLookupTablesDevice[i])); + if (i < nLayers - 2) { + checkGPUError(cudaFree(mCellsDevice[i])); + checkGPUError(cudaFree(mCellsLookupTablesDevice[i])); + checkGPUError(cudaFree(mRoadsLookupTablesDevice[i])); + if (i < nLayers - 3) { + checkGPUError(cudaFree(mNeighboursCellLookupTablesDevice[i])); + checkGPUError(cudaFree(mNeighboursCellDevice[i])); + } + } + } + } + // checkGPUError(cudaFree(mRoadsDevice)); + checkGPUError(cudaFree(mCUBTmpBufferDevice)); + checkGPUError(cudaFree(mFoundTrackletsDevice)); + checkGPUError(cudaFree(mNFoundCellsDevice)); + checkGPUError(cudaFree(mCellsDeviceArray)); + checkGPUError(cudaFree(mNeighboursCellDeviceArray)); + checkGPUError(cudaFree(mNeighboursCellLookupTablesDeviceArray)); + } +} + +template +void GpuTimeFrameChunk::allocate(const size_t nrof, Stream& stream) +{ + RANGE("device_partition_allocation", 2); + mNRof = nrof; + // for (int i = 0; i < nLayers; ++i) { + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mClustersDevice[i])), sizeof(Cluster) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); + // // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mTrackingFrameInfoDevice[i])), sizeof(TrackingFrameInfo) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mClusterExternalIndicesDevice[i])), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mIndexTablesDevice[i])), sizeof(int) * (256 * 128 + 1) * nrof, &stream, true); + // if (i < nLayers - 1) { + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mTrackletsLookupTablesDevice[i])), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mTrackletsDevice[i])), sizeof(Tracklet) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); + // if (i < nLayers - 2) { + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mCellsLookupTablesDevice[i])), sizeof(int) * mTFGPUParams->validatedTrackletsCapacity * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mCellsDevice[i])), sizeof(CellSeed) * mTFGPUParams->maxNeighboursSize * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mRoadsLookupTablesDevice[i]), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, &stream, true); + // if (i < nLayers - 3) { + // 
static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mNeighboursCellLookupTablesDevice[i])), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mNeighboursCellDevice[i])), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, &stream, true); + // } + // if (i < 2) { + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mNTrackletsPerClusterDevice[i])), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); + // } + // } + // } + // } + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mCUBTmpBufferDevice), mTFGPUParams->tmpCUBBufferSize * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mLinesDevice), sizeof(Line) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNFoundLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNExclusiveFoundLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof + 1, &stream, true); // + 1 for cub::DeviceScan::ExclusiveSum, to cover cases where we have maximum number of clusters per ROF + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mUsedTrackletsDevice), sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mClusteredLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * nrof, &stream, true); + // // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mRoadsDevice), sizeof(Road) * mTFGPUParams->maxRoadPerRofSize * nrof, &stream, true); + + // /// Invariant allocations + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mFoundTrackletsDevice), (nLayers - 1) * sizeof(int) * nrof, &stream, true); // No need to reset, we always read it after writing + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNFoundCellsDevice), (nLayers - 2) * sizeof(int) * nrof, &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNeighboursCellDeviceArray), (nLayers - 3) * sizeof(int*), &stream, true); + // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNeighboursCellLookupTablesDeviceArray), (nLayers - 3) * sizeof(int*), &stream, true); + + // /// Copy pointers of allocated memory to regrouping arrays + // checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, stream.get())); + // checkGPUError(cudaMemcpyAsync(mNeighboursCellDeviceArray, mNeighboursCellDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get())); + // checkGPUError(cudaMemcpyAsync(mNeighboursCellLookupTablesDeviceArray, mNeighboursCellLookupTablesDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get())); + + mAllocated = true; +} + +template +void GpuTimeFrameChunk::reset(const Task task, Stream& stream) +{ + RANGE("buffer_reset", 0); + // if ((bool)task) { // Vertexer-only initialisation (cannot be constexpr: due to the presence of gpu raw calls can't be 
put in header) + // for (int i = 0; i < 2; i++) { + // auto thrustTrackletsBegin = thrust::device_ptr(mTrackletsDevice[i]); + // auto thrustTrackletsEnd = thrustTrackletsBegin + mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof; + // thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{}); + // checkGPUError(cudaMemsetAsync(mNTrackletsPerClusterDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); + // } + // checkGPUError(cudaMemsetAsync(mUsedTrackletsDevice, false, sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); + // checkGPUError(cudaMemsetAsync(mClusteredLinesDevice, -1, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * mNRof, stream.get())); + // } else { + // for (int i = 0; i < nLayers; ++i) { + // if (i < nLayers - 1) { + // checkGPUError(cudaMemsetAsync(mTrackletsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); + // auto thrustTrackletsBegin = thrust::device_ptr(mTrackletsDevice[i]); + // auto thrustTrackletsEnd = thrustTrackletsBegin + mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof; + // thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{}); + // if (i < nLayers - 2) { + // checkGPUError(cudaMemsetAsync(mCellsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->cellsLUTsize * mNRof, stream.get())); + // checkGPUError(cudaMemsetAsync(mRoadsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); + // if (i < nLayers - 3) { + // checkGPUError(cudaMemsetAsync(mNeighboursCellLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); + // checkGPUError(cudaMemsetAsync(mNeighboursCellDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); + // } + // } + // } + // } + // checkGPUError(cudaMemsetAsync(mNFoundCellsDevice, 0, (nLayers - 2) * sizeof(int), stream.get())); + // } +} + +template +size_t GpuTimeFrameChunk::computeScalingSizeBytes(const int nrof, const TimeFrameGPUParameters& config) +{ + size_t rofsize = nLayers * sizeof(int); // number of clusters per ROF + // rofsize += nLayers * sizeof(Cluster) * config.clustersPerROfCapacity; // clusters + // rofsize += nLayers * sizeof(TrackingFrameInfo) * config.clustersPerROfCapacity; // tracking frame info + // rofsize += nLayers * sizeof(int) * config.clustersPerROfCapacity; // external cluster indices + // rofsize += nLayers * sizeof(int) * (256 * 128 + 1); // index tables + // rofsize += (nLayers - 1) * sizeof(int) * config.clustersPerROfCapacity; // tracklets lookup tables + // rofsize += (nLayers - 1) * sizeof(Tracklet) * config.maxTrackletsPerCluster * config.clustersPerROfCapacity; // tracklets + // rofsize += 2 * sizeof(int) * config.clustersPerROfCapacity; // tracklets found per cluster (vertexer) + // rofsize += sizeof(unsigned char) * config.maxTrackletsPerCluster * config.clustersPerROfCapacity; // used tracklets (vertexer) + // rofsize += (nLayers - 2) * sizeof(int) * config.validatedTrackletsCapacity; // cells lookup tables + // rofsize += (nLayers - 2) * sizeof(CellSeed) * config.maxNeighboursSize; // cells + // rofsize += (nLayers - 3) * sizeof(int) * config.maxNeighboursSize; // cell neighbours lookup tables + // rofsize += (nLayers - 3) 
* sizeof(int) * config.maxNeighboursSize; // cell neighbours + // rofsize += sizeof(Road) * config.maxRoadPerRofSize; // roads + // rofsize += (nLayers - 2) * sizeof(int) * config.maxNeighboursSize; // road LUT + // rofsize += sizeof(Line) * config.maxTrackletsPerCluster * config.clustersPerROfCapacity; // lines + // rofsize += sizeof(int) * config.clustersPerROfCapacity; // found lines + // rofsize += sizeof(int) * config.clustersPerROfCapacity; // found lines exclusive sum + // rofsize += sizeof(int) * config.clustersPerROfCapacity * config.maxTrackletsPerCluster; // lines used in clusterlines + + // rofsize += (nLayers - 1) * sizeof(int); // total found tracklets + // rofsize += (nLayers - 2) * sizeof(int); // total found cells + + return rofsize * nrof; +} + +template +size_t GpuTimeFrameChunk::computeFixedSizeBytes(const TimeFrameGPUParameters& config) +{ + size_t total = config.tmpCUBBufferSize; // CUB tmp buffers + total += sizeof(gpu::StaticTrackingParameters); // static parameters loaded once + return total; +} + +template +size_t GpuTimeFrameChunk::computeRofPerChunk(const TimeFrameGPUParameters& config, const size_t m) +{ + return (m * GB / (float)(config.nTimeFrameChunks) - GpuTimeFrameChunk::computeFixedSizeBytes(config)) / (float)GpuTimeFrameChunk::computeScalingSizeBytes(1, config); +} + +/// Interface +template +Cluster* GpuTimeFrameChunk::getDeviceClusters(const int layer) +{ + return mClustersDevice[layer]; +} + +template +int* GpuTimeFrameChunk::getDeviceClusterExternalIndices(const int layer) +{ + return mClusterExternalIndicesDevice[layer]; +} + +template +int* GpuTimeFrameChunk::getDeviceIndexTables(const int layer) +{ + return mIndexTablesDevice[layer]; +} + +template +Tracklet* GpuTimeFrameChunk::getDeviceTracklets(const int layer) +{ + return mTrackletsDevice[layer]; +} + +template +int* GpuTimeFrameChunk::getDeviceTrackletsLookupTables(const int layer) +{ + return mTrackletsLookupTablesDevice[layer]; +} + +template +CellSeed* GpuTimeFrameChunk::getDeviceCells(const int layer) +{ + return mCellsDevice[layer]; +} + +template +int* GpuTimeFrameChunk::getDeviceCellsLookupTables(const int layer) +{ + return mCellsLookupTablesDevice[layer]; +} + +template +int* GpuTimeFrameChunk::getDeviceCellNeigboursLookupTables(const int layer) +{ + return mNeighboursCellLookupTablesDevice[layer]; +} + +template +int* GpuTimeFrameChunk::getDeviceCellNeighbours(const int layer) +{ + return mNeighboursCellDevice[layer]; +} + +template +int* GpuTimeFrameChunk::getDeviceRoadsLookupTables(const int layer) +{ + return mRoadsLookupTablesDevice[layer]; +} + +// Load data +template +size_t GpuTimeFrameChunk::loadDataOnDevice(const size_t startRof, const size_t maxRof, const int maxLayers, Stream& stream) +{ + RANGE("load_clusters_data", 5); + // auto nRofs = std::min(maxRof - startRof, mNRof); + // mNPopulatedRof = mTimeFramePtr->getNClustersROFrange(startRof, nRofs, 0).size(); + // for (int i = 0; i < maxLayers; ++i) { + // mHostClusters[i] = mTimeFramePtr->getClustersPerROFrange(startRof, nRofs, i); + // mHostIndexTables[i] = mTimeFramePtr->getIndexTablePerROFrange(startRof, nRofs, i); + // if (mHostClusters[i].size() > mTFGPUParams->clustersPerROfCapacity * nRofs) { + // LOGP(warning, "Clusters on layer {} exceed the expected value, resizing to config value: {}, will lose information!", i, mTFGPUParams->clustersPerROfCapacity * nRofs); + // } + // checkGPUError(cudaMemcpyAsync(mClustersDevice[i], + // mHostClusters[i].data(), + // (int)std::min(mHostClusters[i].size(), 
mTFGPUParams->clustersPerROfCapacity * nRofs) * sizeof(Cluster), + // cudaMemcpyHostToDevice, stream.get())); + // if (mHostIndexTables[i].data()) { + // checkGPUError(cudaMemcpyAsync(mIndexTablesDevice[i], + // mHostIndexTables[i].data(), + // mHostIndexTables[i].size() * sizeof(int), + // cudaMemcpyHostToDevice, stream.get())); + // } + // } + return mNPopulatedRof; // return the number of ROFs we loaded the data for. +} +template class GpuTimeFrameChunk<7>; +} // namespace gpu +} // namespace o2::its \ No newline at end of file diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 05edc847f1e05..c9c6792b5417b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -45,291 +45,224 @@ using utils::checkGPUError; void* DefaultGPUAllocator::allocate(size_t size) { - LOGP(info, "Called DefaultGPUAllocator::allocate with size {}", size); + LOGP(fatal, "Called DefaultGPUAllocator::allocate with size {}", size); return nullptr; // to be implemented } -///////////////////////////////////////////////////////////////////////////////////////// -// GpuChunk -///////////////////////////////////////////////////////////////////////////////////////// template -GpuTimeFrameChunk::~GpuTimeFrameChunk() +TimeFrameGPU::TimeFrameGPU() { - if (mAllocated) { - for (int i = 0; i < nLayers; ++i) { - checkGPUError(cudaFree(mClustersDevice[i])); - // checkGPUError(cudaFree(mTrackingFrameInfoDevice[i])); - checkGPUError(cudaFree(mClusterExternalIndicesDevice[i])); - checkGPUError(cudaFree(mIndexTablesDevice[i])); - if (i < nLayers - 1) { - checkGPUError(cudaFree(mTrackletsDevice[i])); - checkGPUError(cudaFree(mTrackletsLookupTablesDevice[i])); - if (i < nLayers - 2) { - checkGPUError(cudaFree(mCellsDevice[i])); - checkGPUError(cudaFree(mCellsLookupTablesDevice[i])); - checkGPUError(cudaFree(mRoadsLookupTablesDevice[i])); - if (i < nLayers - 3) { - checkGPUError(cudaFree(mNeighboursCellLookupTablesDevice[i])); - checkGPUError(cudaFree(mNeighboursCellDevice[i])); - } - } - } - } - // checkGPUError(cudaFree(mRoadsDevice)); - checkGPUError(cudaFree(mCUBTmpBufferDevice)); - checkGPUError(cudaFree(mFoundTrackletsDevice)); - checkGPUError(cudaFree(mNFoundCellsDevice)); - checkGPUError(cudaFree(mCellsDeviceArray)); - checkGPUError(cudaFree(mNeighboursCellDeviceArray)); - checkGPUError(cudaFree(mNeighboursCellLookupTablesDeviceArray)); - } + mIsGPU = true; + utils::getDeviceProp(0, true); } template -void GpuTimeFrameChunk::allocate(const size_t nrof, Stream& stream) -{ - RANGE("device_partition_allocation", 2); - mNRof = nrof; - // for (int i = 0; i < nLayers; ++i) { - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mClustersDevice[i])), sizeof(Cluster) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); - // // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mTrackingFrameInfoDevice[i])), sizeof(TrackingFrameInfo) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mClusterExternalIndicesDevice[i])), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mIndexTablesDevice[i])), sizeof(int) * (256 * 128 + 1) * nrof, &stream, true); - // if (i < nLayers - 1) { - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mTrackletsLookupTablesDevice[i])), 
sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mTrackletsDevice[i])), sizeof(Tracklet) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); - // if (i < nLayers - 2) { - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mCellsLookupTablesDevice[i])), sizeof(int) * mTFGPUParams->validatedTrackletsCapacity * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mCellsDevice[i])), sizeof(CellSeed) * mTFGPUParams->maxNeighboursSize * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mRoadsLookupTablesDevice[i]), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, &stream, true); - // if (i < nLayers - 3) { - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mNeighboursCellLookupTablesDevice[i])), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mNeighboursCellDevice[i])), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, &stream, true); - // } - // if (i < 2) { - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&(mNTrackletsPerClusterDevice[i])), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); - // } - // } - // } - // } - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mCUBTmpBufferDevice), mTFGPUParams->tmpCUBBufferSize * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mLinesDevice), sizeof(Line) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNFoundLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNExclusiveFoundLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof + 1, &stream, true); // + 1 for cub::DeviceScan::ExclusiveSum, to cover cases where we have maximum number of clusters per ROF - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mUsedTrackletsDevice), sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mClusteredLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * nrof, &stream, true); - // // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mRoadsDevice), sizeof(Road) * mTFGPUParams->maxRoadPerRofSize * nrof, &stream, true); - - // /// Invariant allocations - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mFoundTrackletsDevice), (nLayers - 1) * sizeof(int) * nrof, &stream, true); // No need to reset, we always read it after writing - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNFoundCellsDevice), (nLayers - 2) * sizeof(int) * nrof, &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNeighboursCellDeviceArray), (nLayers - 3) * sizeof(int*), &stream, true); - // static_cast*>(mTimeFramePtr)->allocMemAsync(reinterpret_cast(&mNeighboursCellLookupTablesDeviceArray), (nLayers - 3) 
* sizeof(int*), &stream, true); - - // /// Copy pointers of allocated memory to regrouping arrays - // checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, stream.get())); - // checkGPUError(cudaMemcpyAsync(mNeighboursCellDeviceArray, mNeighboursCellDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get())); - // checkGPUError(cudaMemcpyAsync(mNeighboursCellLookupTablesDeviceArray, mNeighboursCellLookupTablesDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get())); - - mAllocated = true; -} +TimeFrameGPU::~TimeFrameGPU() = default; template -void GpuTimeFrameChunk::reset(const Task task, Stream& stream) +void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream* strPtr, bool extAllocator) { - RANGE("buffer_reset", 0); - // if ((bool)task) { // Vertexer-only initialisation (cannot be constexpr: due to the presence of gpu raw calls can't be put in header) - // for (int i = 0; i < 2; i++) { - // auto thrustTrackletsBegin = thrust::device_ptr(mTrackletsDevice[i]); - // auto thrustTrackletsEnd = thrustTrackletsBegin + mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof; - // thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{}); - // checkGPUError(cudaMemsetAsync(mNTrackletsPerClusterDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); - // } - // checkGPUError(cudaMemsetAsync(mUsedTrackletsDevice, false, sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); - // checkGPUError(cudaMemsetAsync(mClusteredLinesDevice, -1, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * mNRof, stream.get())); - // } else { - // for (int i = 0; i < nLayers; ++i) { - // if (i < nLayers - 1) { - // checkGPUError(cudaMemsetAsync(mTrackletsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mNRof, stream.get())); - // auto thrustTrackletsBegin = thrust::device_ptr(mTrackletsDevice[i]); - // auto thrustTrackletsEnd = thrustTrackletsBegin + mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * mNRof; - // thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{}); - // if (i < nLayers - 2) { - // checkGPUError(cudaMemsetAsync(mCellsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->cellsLUTsize * mNRof, stream.get())); - // checkGPUError(cudaMemsetAsync(mRoadsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); - // if (i < nLayers - 3) { - // checkGPUError(cudaMemsetAsync(mNeighboursCellLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); - // checkGPUError(cudaMemsetAsync(mNeighboursCellDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get())); - // } - // } - // } - // } - // checkGPUError(cudaMemsetAsync(mNFoundCellsDevice, 0, (nLayers - 2) * sizeof(int), stream.get())); - // } + if (extAllocator) { + *ptr = mAllocator->allocate(size); + } else { + LOGP(info, "Calling default CUDA allocator"); + checkGPUError(cudaMallocAsync(reinterpret_cast(ptr), size, strPtr->get())); + } } template -size_t GpuTimeFrameChunk::computeScalingSizeBytes(const int nrof, const TimeFrameGPUParameters& config) +void TimeFrameGPU::setDevicePropagator(const 
o2::base::PropagatorImpl* propagator) { - size_t rofsize = nLayers * sizeof(int); // number of clusters per ROF - // rofsize += nLayers * sizeof(Cluster) * config.clustersPerROfCapacity; // clusters - // rofsize += nLayers * sizeof(TrackingFrameInfo) * config.clustersPerROfCapacity; // tracking frame info - // rofsize += nLayers * sizeof(int) * config.clustersPerROfCapacity; // external cluster indices - // rofsize += nLayers * sizeof(int) * (256 * 128 + 1); // index tables - // rofsize += (nLayers - 1) * sizeof(int) * config.clustersPerROfCapacity; // tracklets lookup tables - // rofsize += (nLayers - 1) * sizeof(Tracklet) * config.maxTrackletsPerCluster * config.clustersPerROfCapacity; // tracklets - // rofsize += 2 * sizeof(int) * config.clustersPerROfCapacity; // tracklets found per cluster (vertexer) - // rofsize += sizeof(unsigned char) * config.maxTrackletsPerCluster * config.clustersPerROfCapacity; // used tracklets (vertexer) - // rofsize += (nLayers - 2) * sizeof(int) * config.validatedTrackletsCapacity; // cells lookup tables - // rofsize += (nLayers - 2) * sizeof(CellSeed) * config.maxNeighboursSize; // cells - // rofsize += (nLayers - 3) * sizeof(int) * config.maxNeighboursSize; // cell neighbours lookup tables - // rofsize += (nLayers - 3) * sizeof(int) * config.maxNeighboursSize; // cell neighbours - // rofsize += sizeof(Road) * config.maxRoadPerRofSize; // roads - // rofsize += (nLayers - 2) * sizeof(int) * config.maxNeighboursSize; // road LUT - // rofsize += sizeof(Line) * config.maxTrackletsPerCluster * config.clustersPerROfCapacity; // lines - // rofsize += sizeof(int) * config.clustersPerROfCapacity; // found lines - // rofsize += sizeof(int) * config.clustersPerROfCapacity; // found lines exclusive sum - // rofsize += sizeof(int) * config.clustersPerROfCapacity * config.maxTrackletsPerCluster; // lines used in clusterlines - - // rofsize += (nLayers - 1) * sizeof(int); // total found tracklets - // rofsize += (nLayers - 2) * sizeof(int); // total found cells - - return rofsize * nrof; + mPropagatorDevice = propagator; } template -size_t GpuTimeFrameChunk::computeFixedSizeBytes(const TimeFrameGPUParameters& config) +void TimeFrameGPU::loadUnsortedClustersDevice() { - size_t total = config.tmpCUBBufferSize; // CUB tmp buffers - total += sizeof(gpu::StaticTrackingParameters); // static parameters loaded once - return total; + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", mUnsortedClusters[iLayer].size(), iLayer, mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB); + allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), mUnsortedClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator()); + // Register and move data + checkGPUError(cudaHostRegister(mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, 
mGpuStreams[0].get())); } template -size_t GpuTimeFrameChunk::computeRofPerChunk(const TimeFrameGPUParameters& config, const size_t m) +void TimeFrameGPU::loadClustersDevice() { - return (m * GB / (float)(config.nTimeFrameChunks) - GpuTimeFrameChunk::computeFixedSizeBytes(config)) / (float)GpuTimeFrameChunk::computeScalingSizeBytes(1, config); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", mClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(Cluster) / MB); + allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), mClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator()); + // Register and move data + checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mClustersDevice[iLayer], mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } -/// Interface template -Cluster* GpuTimeFrameChunk::getDeviceClusters(const int layer) +void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) { - return mClustersDevice[layer]; + if (!iteration) { + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", mTrackingFrameInfo[iLayer].size(), iLayer, mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), nullptr, getExtAllocator()); + // Register and move data + checkGPUError(cudaHostRegister(mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } } template -int* GpuTimeFrameChunk::getDeviceClusterExternalIndices(const int layer) +void TimeFrameGPU::loadTrackletsDevice() { - return mClusterExternalIndicesDevice[layer]; + for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", mTracklets[iLayer].size(), iLayer, mTracklets[iLayer].size() * sizeof(Tracklet) / MB); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mTracklets[iLayer].size() * sizeof(Tracklet), nullptr, getExtAllocator()); + // Register and move data + checkGPUError(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), 
cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template -int* GpuTimeFrameChunk::getDeviceIndexTables(const int layer) +void TimeFrameGPU::loadCellsDevice() { - return mIndexTablesDevice[layer]; + for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", mCells[iLayer].size(), iLayer, mCells[iLayer].size() * sizeof(CellSeed) / MB); + allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), mCells[iLayer].size() * sizeof(CellSeed), nullptr, getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mCells[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); // accessory for the neigh. finding. + checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + // Register and move data + checkGPUError(cudaHostRegister(mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template -Tracklet* GpuTimeFrameChunk::getDeviceTracklets(const int layer) +void TimeFrameGPU::loadCellsLUT() { - return mTrackletsDevice[layer]; + for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} cell LUTs on layer {}, for {} MB.", mCellsLookupTable[iLayer].size(), iLayer, mCellsLookupTable[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&(mCellsLUTDevice[iLayer])), sizeof(int) * mCellsLookupTable[iLayer].size(), nullptr, getExtAllocator()); + // Register and move data + checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template -int* GpuTimeFrameChunk::getDeviceTrackletsLookupTables(const int layer) +void TimeFrameGPU::loadRoadsDevice() { - 
return mTrackletsLookupTablesDevice[layer]; + LOGP(debug, "gpu-transfer: loading {} roads, for {} MB.", mRoads.size(), mRoads.size() * sizeof(Road) / MB); + allocMemAsync(reinterpret_cast(&mRoadsDevice), mRoads.size() * sizeof(Road), &(mGpuStreams[0]), getExtAllocator()); + checkGPUError(cudaHostRegister(mRoads.data(), mRoads.size() * sizeof(Road), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mRoadsDevice, mRoads.data(), mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template -CellSeed* GpuTimeFrameChunk::getDeviceCells(const int layer) +void TimeFrameGPU::loadTrackSeedsDevice(std::vector& seeds) { - return mCellsDevice[layer]; + LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB); + allocMemAsync(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), &(mGpuStreams[0]), getExtAllocator()); + checkGPUError(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template -int* GpuTimeFrameChunk::getDeviceCellsLookupTables(const int layer) +void TimeFrameGPU::createNeighboursDevice(const unsigned int& layer, std::vector>& neighbours) { - return mCellsLookupTablesDevice[layer]; + mCellsNeighbours[layer].clear(); + mCellsNeighbours[layer].resize(neighbours.size()); + LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); + allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); + checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); } template -int* GpuTimeFrameChunk::getDeviceCellNeigboursLookupTables(const int layer) +void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) { - return mNeighboursCellLookupTablesDevice[layer]; + LOGP(debug, "gpu-allocation: reserving {} slots for neighbours LUT, for {} MB.", nCells + 1, (nCells + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc + checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); } template -int* GpuTimeFrameChunk::getDeviceCellNeighbours(const int layer) +void TimeFrameGPU::createTrackITSExtDevice(std::vector& seeds) { - return mNeighboursCellDevice[layer]; + mTrackITSExt.clear(); + mTrackITSExt.resize(seeds.size()); + LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB); + allocMemAsync(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), &(mGpuStreams[0]), getExtAllocator()); + checkGPUError(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get())); + checkGPUError(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); } template -int* GpuTimeFrameChunk::getDeviceRoadsLookupTables(const int layer) +void TimeFrameGPU::downloadCellsDevice(const int layer) { - return mRoadsLookupTablesDevice[layer]; + LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", 
mCells[layer].size(), layer, mCells[layer].size() * sizeof(CellSeed) / MB);
+  checkGPUError(cudaMemcpyAsync(mCells[layer].data(), mCellsDevice[layer], mCells[layer].size() * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+  checkGPUError(cudaHostUnregister(mCells[layer].data()));
 }
 
-// Load data
-template <int nLayers>
-size_t GpuTimeFrameChunk<nLayers>::loadDataOnDevice(const size_t startRof, const size_t maxRof, const int maxLayers, Stream& stream)
+template <int nLayers>
+void TimeFrameGPU<nLayers>::downloadCellsNeighbours(std::vector<std::vector<std::pair<int, int>>>& neighbours, const int layer)
 {
-  RANGE("load_clusters_data", 5);
-[… ~20 deleted lines: the commented-out loadDataOnDevice body, identical to the copy kept in TimeFrameChunk.cu …]
-  return mNPopulatedRof; // return the number of ROFs we loaded the data for.
+  LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
+  // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
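//   [editor's aside, not part of the patch] a cheap compile-time guard for the
//   layout assumption above; size equality is necessary, though not sufficient:
//   static_assert(sizeof(gpuPair<int, int>) == sizeof(std::pair<int, int>),
//                 "gpuPair/std::pair memcpy relies on matching layouts");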
 
-/////////////////////////////////////////////////////////////////////////////////////////
-// TimeFrameGPU
-/////////////////////////////////////////////////////////////////////////////////////////
 template <int nLayers>
-TimeFrameGPU<nLayers>::TimeFrameGPU()
+void TimeFrameGPU<nLayers>::downloadNeighboursLUT(std::vector<int>& lut, const int layer)
 {
-  mIsGPU = true;
-  utils::getDeviceProp(0, true);
+  LOGP(debug, "gpu-transfer: downloading {} neighbours lut, for {} MB.", lut.size(), lut.size() * sizeof(int) / MB);
+  checkGPUError(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
 }
 
 template <int nLayers>
-TimeFrameGPU<nLayers>::~TimeFrameGPU() = default;
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::allocMemAsync(void** ptr, size_t size, Stream* strPtr, bool extAllocator)
+void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(std::vector<CellSeed>& seeds)
 {
-  if (extAllocator) {
-    *ptr = mAllocator->allocate(size);
-  } else {
-    LOGP(info, "Calling default CUDA allocator");
-    checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(ptr), size, strPtr->get()));
-  }
+  LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB);
+  checkGPUError(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+  checkGPUError(cudaHostUnregister(mTrackITSExt.data()));
+  checkGPUError(cudaHostUnregister(seeds.data()));
+  // discardResult(cudaDeviceSynchronize());
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::setDevicePropagator(const o2::base::PropagatorImpl<float>* propagator)
+void TimeFrameGPU<nLayers>::unregisterRest()
 {
-  mPropagatorDevice = propagator;
+  LOGP(debug, "unregistering rest of the host memory...");
+  checkGPUError(cudaHostUnregister(mCells[0].data()));
+  checkGPUError(cudaHostUnregister(mCellsDevice.data()));
+  checkGPUError(cudaHostUnregister(mCellsLUTDevice.data()));
+  for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
+    checkGPUError(cudaHostUnregister(mCellsLookupTable[iLayer].data()));
+  }
 }
-
+////////////////////////////////////////////////////////////////////////
+/// Legacy
 template <int nLayers>
 void TimeFrameGPU<nLayers>::registerHostMemory(const int maxLayers)
 {
@@ -429,133 +362,6 @@ void TimeFrameGPU<nLayers>::initDevice(IndexTableUtils* utils,
   // checkGPUError(cudaMemcpy(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice));
 }
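The removed loaders below all end with the same idiom: after the per-layer buffers are uploaded, the host-side array of device pointers is itself copied to the device, so kernels can take a single T** and index it by layer (this is what getDeviceArrayCells() returns further on). A sketch of the idiom with hypothetical names:

#include <cuda_runtime.h>
#include <array>

constexpr int kLayers = 7;

float** makeLayerTable(std::array<float*, kLayers>& perLayerDevPtrs)
{
  float** table = nullptr;
  // Room for kLayers device pointers, on the device...
  cudaMalloc(reinterpret_cast<void**>(&table), kLayers * sizeof(float*));
  // ...filled from the host-side table of (already valid) device pointers.
  cudaMemcpy(table, perLayerDevPtrs.data(), kLayers * sizeof(float*), cudaMemcpyHostToDevice);
  return table; // usable as a kernel argument: data[layer][index]
}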
 
-template <int nLayers>
-void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice()
-{
-  for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", mUnsortedClusters[iLayer].size(), iLayer, mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDevice[iLayer]), mUnsortedClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator());
-    // Register and move data
-    checkGPUError(cudaHostRegister(mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  }
-  allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::loadClustersDevice()
-{
-  for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", mClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(Cluster) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mClustersDevice[iLayer]), mClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator());
-    // Register and move data
-    checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mClustersDevice[iLayer], mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  }
-  allocMemAsync(reinterpret_cast<void**>(&mClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
-{
-  if (!iteration) {
-    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      LOGP(info, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", mTrackingFrameInfo[iLayer].size(), iLayer, mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB);
-      allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDevice[iLayer]), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), nullptr, getExtAllocator());
-      // Register and move data
-      checkGPUError(cudaHostRegister(mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable));
-      checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    }
-    allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), nullptr, getExtAllocator());
-    checkGPUError(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  }
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::loadTrackletsDevice()
-{
-  for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", mTracklets[iLayer].size(), iLayer, mTracklets[iLayer].size() * sizeof(Tracklet) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iLayer]), mTracklets[iLayer].size() * sizeof(Tracklet), nullptr, getExtAllocator());
-    // Register and move data
-    checkGPUError(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  }
-  allocMemAsync(reinterpret_cast<void**>(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::loadCellsDevice()
-{
-  for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", mCells[iLayer].size(), iLayer, mCells[iLayer].size() * sizeof(CellSeed) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), mCells[iLayer].size() * sizeof(CellSeed), nullptr, getExtAllocator());
-    // Register and move data
-    checkGPUError(cudaHostRegister(mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  }
-  allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::loadRoadsDevice()
-{
-  LOGP(debug, "gpu-transfer: loading {} roads, for {} MB.", mRoads.size(), mRoads.size() * sizeof(Road) / MB);
-  allocMemAsync(reinterpret_cast<void**>(&mRoadsDevice), mRoads.size() * sizeof(Road), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaHostRegister(mRoads.data(), mRoads.size() * sizeof(Road), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mRoadsDevice, mRoads.data(), mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(std::vector<CellSeed>& seeds)
-{
-  LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB);
-  allocMemAsync(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::createCellNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours)
-{
-  mCellsNeighbours[layer].clear();
-  mCellsNeighbours[layer].resize(neighbours.size());
-  LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], 0, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
-  checkGPUError(cudaHostRegister(neighbours.data(), neighbours.size() * sizeof(std::pair<int, int>), cudaHostRegisterPortable));
-}
-
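One notable difference between the removed createCellNeighboursDevice and its replacement earlier in this file: the pair buffer is now cleared with -1 instead of 0. cudaMemsetAsync writes a byte value, and a four-byte int whose bytes are all 0xFF reads back as -1, so every pair starts out as {-1, -1} -- the sentinel that the filtering step later sorts to the front and trims. A host-side sketch of the byte-level fact:

#include <cstring>

int main()
{
  int v = 0;
  std::memset(&v, -1, sizeof v); // every byte becomes 0xFF
  return v == -1 ? 0 : 1;        // two's complement: 0xFFFFFFFF == -1
}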
-template <int nLayers>
-void TimeFrameGPU<nLayers>::createTrackITSExtDevice(std::vector<CellSeed>& seeds)
-{
-  mTrackITSExt.clear();
-  mTrackITSExt.resize(seeds.size());
-  LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB);
-  allocMemAsync(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get()));
-  checkGPUError(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(std::vector<CellSeed>& seeds)
-{
-  LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB);
-  checkGPUError(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
-  checkGPUError(cudaHostUnregister(mTrackITSExt.data()));
-  checkGPUError(cudaHostUnregister(seeds.data()));
-  discardResult(cudaDeviceSynchronize());
-}
-
 template <int nLayers>
 unsigned char* TimeFrameGPU<nLayers>::getDeviceUsedClusters(const int layer)
 {
@@ -575,7 +381,6 @@ gsl::span<int> TimeFrameGPU<nLayers>::getHostNCells(const int chunkId)
 }
 
 template class TimeFrameGPU<7>;
-template class GpuTimeFrameChunk<7>;
 } // namespace gpu
 } // namespace its
 } // namespace o2
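The translation unit ends with an explicit instantiation: TimeFrameGPU is a class template, but only the 7-layer ITS flavour is ever needed, so the definitions live in the .cu file and a single `template class TimeFrameGPU<7>;` forces the compiler to emit them. A minimal sketch of the mechanism (hypothetical class):

template <int nLayers>
class LayerCounter
{
 public:
  int count() const { return nLayers; }
};

// Emits every member of LayerCounter<7> into this translation unit,
// so other files can link against it without seeing the definitions.
template class LayerCounter<7>;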
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
index ac8b3f87b874c..45fee9976bca6 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -21,7 +21,6 @@
 #include "ITStrackingGPU/TrackerTraitsGPU.h"
 #include "ITStrackingGPU/TrackingKernels.h"
 #include "ITStracking/TrackingConfigParam.h"
-
 namespace o2::its
 {
 constexpr int UnusedIndex{-1};
@@ -324,45 +323,62 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
 {
-  TrackerTraits<nLayers>::findCellsNeighbours(iteration);
-  // for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
-  //   const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer + 1].size())};
-  //   mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear();
-  //   mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0);
-  //   if (mTimeFrameGPU->getCells()[iLayer + 1].empty() ||
-  //       mTimeFrameGPU->getCellsLookupTable()[iLayer].empty()) {
-  //     mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
-  //     continue;
-  //   }
-
-  //   int layerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer].size())};
-  //   std::vector<std::pair<int, int>> cellsNeighbours;
-  //   cellsNeighbours.reserve(nextLayerCellsNum);
-  //   mTimeFrameGPU->createCellNeighboursDevice(iLayer, cellsNeighbours);
-
-  //   // // [...]
-  //   // cellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbours(iLayer));
-  //   // // // Compute Cell Neighbours LUT
-  //   // // checkGPUError(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(),                       // d_temp_storage
-  //   // //                                             mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes
-  //   // //                                             mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer),    // d_in
-  //   // //                                             mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer),    // d_out
-  //   // //                                             mTimeFrameGPU->getHostNCells(chunkId)[iLayer + 1],                              // num_items
-  //   // //                                             mTimeFrameGPU->getStream(chunkId).get()));
-
-  //   // cellsNeighboursHandler(mTimeFrameGPU->getDeviceNeighbours(iLayer));
-  //   // // [...]
-
-  //   std::sort(cellsNeighbours.begin(), cellsNeighbours.end(), [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
-  //     return a.second < b.second;
-  //   });
-  //   mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
-  //   mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighbours.size());
-  //   for (auto& cellNeighboursIndex : cellsNeighbours) {
-  //     mTimeFrameGPU->getCellsNeighbours()[iLayer].push_back(cellNeighboursIndex.first);
-  //   }
-  //   std::inclusive_scan(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].end(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin());
-  // }
+  auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
+  mTimeFrameGPU->loadCellsDevice();
+  mTimeFrameGPU->loadCellsLUT();
+  std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
+  for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
+    const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer + 1].size())};
+    mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear();
+    mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0);
+
+    if (mTimeFrameGPU->getCells()[iLayer + 1].empty() ||
+        mTimeFrameGPU->getCellsLookupTable()[iLayer].empty()) {
+      mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
+      continue;
+    }
+
+    int layerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer].size())};
+    mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
+    countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
+                               mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
+                               mTimeFrameGPU->getDeviceArrayCellsLUT(),
+                               mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                               mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
+                               mTrkParams[0].MaxChi2ClusterAttachment,
+                               mBz,
+                               iLayer,
+                               layerCellsNum,
+                               nextLayerCellsNum,
+                               1e2,
+                               conf.nBlocks,
+                               conf.nThreads);
+    mTimeFrameGPU->downloadNeighboursLUT(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer], iLayer);
+    // Get the number of found cells from LUT
+    cellsNeighboursLayer[iLayer].resize(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].back());
+    mTimeFrameGPU->createNeighboursDevice(iLayer, cellsNeighboursLayer[iLayer]);
+    computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
+                                 mTimeFrameGPU->getDeviceNeighboursLUT(iLayer),
+                                 mTimeFrameGPU->getDeviceArrayCellsLUT(),
+                                 mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                                 mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
+                                 mTrkParams[0].MaxChi2ClusterAttachment,
+                                 mBz,
+                                 iLayer,
+                                 layerCellsNum,
+                                 nextLayerCellsNum,
+                                 1e2,
+                                 conf.nBlocks,
+                                 conf.nThreads);
+    mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
+    mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size());
+    mTimeFrameGPU->downloadCellsDevice(iLayer + 1); // Cells on layer 0 did not change.
+
+    filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer],
+                                mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                                cellsNeighboursLayer[iLayer].size());
+  }
+  mTimeFrameGPU->unregisterRest();
 };
 
 template <int nLayers>
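findCellsNeighboursHybrid above is a textbook two-pass GPU pattern: a counting launch fills a per-cell LUT, a prefix sum turns the counts into offsets, the output buffer is sized from the last LUT entry, and a second launch writes the actual pairs. A self-contained CUDA sketch of the same idiom, independent of the O2 types:

#include <cuda_runtime.h>

// Pass 1: count one output per odd input (a stand-in for the neighbour predicate).
__global__ void countKernel(const int* in, int n, int* counts)
{
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
    counts[i] = (in[i] % 2 != 0) ? 1 : 0;
  }
}

// Pass 2: offsets = exclusive prefix sum of counts; each producer writes at its own slot.
__global__ void fillKernel(const int* in, int n, const int* offsets, int* out)
{
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
    if (in[i] % 2 != 0) {
      out[offsets[i]] = in[i];
    }
  }
}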
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index 60683e5fea30b..9d00892f4b680 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -19,11 +19,13 @@
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include "ITStracking/Constants.h"
 #include "ITStracking/Configuration.h"
@@ -40,6 +42,14 @@
 #define THRUST_NAMESPACE thrust::hip
 #endif
 
+#ifdef GPUCA_NO_FAST_MATH
+#define GPU_BLOCKS 1
+#define GPU_THREADS 1
+#else
+#define GPU_BLOCKS 99999
+#define GPU_THREADS 99999
+#endif
+
 // O2 track model
 #include "ReconstructionDataFormats/Track.h"
 #include "DetectorsBase/Propagator.h"
@@ -184,25 +194,25 @@ GPUg() void fitTrackSeedsKernel(
 template <bool initRun> // Version for new tracker to supersede the old one
 GPUg() void computeLayerCellNeighboursKernel(
-  CellSeed* cellsCurrentLayer,
-  CellSeed* cellsNextLayer,
+  CellSeed** cellSeedArray,
   int* neighboursLUT,
-  const int* cellsNextLayerLUT,
+  int* neighboursIndexTable,
+  int** cellsLUTs,
   gpuPair<int, int>* cellNeighbours,
   const float maxChi2ClusterAttachment,
   const float bz,
   const int layerIndex,
-  const int* nCells,
+  const unsigned int nCells,
   const int maxCellNeighbours = 1e2)
 {
-  for (int iCurrentCellIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCellIndex < nCells[layerIndex]; iCurrentCellIndex += blockDim.x * gridDim.x) {
-    const auto& currentCellSeed{cellsCurrentLayer[iCurrentCellIndex]};
+  for (int iCurrentCellIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCellIndex < nCells; iCurrentCellIndex += blockDim.x * gridDim.x) {
+    const auto& currentCellSeed{cellSeedArray[layerIndex][iCurrentCellIndex]};
     const int nextLayerTrackletIndex{currentCellSeed.getSecondTrackletIndex()};
-    const int nextLayerFirstCellIndex{cellsNextLayerLUT[nextLayerTrackletIndex]};
-    const int nextLayerLastCellIndex{cellsNextLayerLUT[nextLayerTrackletIndex + 1]};
+    const int nextLayerFirstCellIndex{cellsLUTs[layerIndex][nextLayerTrackletIndex]};
+    const int nextLayerLastCellIndex{cellsLUTs[layerIndex][nextLayerTrackletIndex + 1]};
     int foundNeighbours{0};
     for (int iNextCell{nextLayerFirstCellIndex}; iNextCell < nextLayerLastCellIndex; ++iNextCell) {
-      CellSeed nextCellSeed{cellsNextLayer[iNextCell]}; // Copy
+      CellSeed nextCellSeed{cellSeedArray[layerIndex + 1][iNextCell]}; // Copy
       if (nextCellSeed.getFirstTrackletIndex() != nextLayerTrackletIndex) { // Check if cells share the same tracklet
         break;
       }
@@ -217,23 +227,54 @@ GPUg() void computeLayerCellNeighboursKernel(
       }
       if constexpr (initRun) {
        atomicAdd(neighboursLUT + iNextCell, 1);
+        foundNeighbours++;
+        neighboursIndexTable[iCurrentCellIndex]++;
       } else {
-        if (foundNeighbours >= maxCellNeighbours) {
-          printf("its-gpu-neighbours-finder: data loss on layer: %d: number of neightbours exceeded the threshold!\n");
-          continue;
-        }
-        cellNeighbours[neighboursLUT[iNextCell] + foundNeighbours++] = {iCurrentCellIndex, iNextCell};
-
+        cellNeighbours[neighboursIndexTable[iCurrentCellIndex] + foundNeighbours] = {iCurrentCellIndex, iNextCell};
+        foundNeighbours++;
         // FIXME: this is prone to race conditions: check on level is not atomic
         const int currentCellLevel{currentCellSeed.getLevel()};
         if (currentCellLevel >= nextCellSeed.getLevel()) {
-          atomicExch(cellsNextLayer[iNextCell].getLevelPtr(), currentCellLevel + 1); // Update level on corresponding cell
+          // atomicExch(cellSeedArray[layerIndex + 1][iNextCell].getLevelPtr(), currentCellLevel + 1); // Update level on corresponding cell
+          cellSeedArray[layerIndex + 1][iNextCell].setLevel(currentCellLevel + 1);
         }
       }
     }
   }
 }
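The FIXME in the kernel above flags that the read-then-set on the next cell's level can race when several cells update the same neighbour concurrently. A common lock-free alternative (a sketch of the technique, not what this patch does) collapses the comparison and the store into a single atomicMax, which is equivalent to "set to currentCellLevel + 1 if that is larger":

__device__ void bumpLevel(int* levelPtr, int currentCellLevel)
{
  // Atomically keeps the running maximum: *levelPtr = max(*levelPtr, currentCellLevel + 1).
  atomicMax(levelPtr, currentCellLevel + 1);
}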
 
+template <typename T1, typename T2>
+struct pair_to_first : public thrust::unary_function<gpuPair<T1, T2>, T1> {
+  GPUhd() int operator()(const gpuPair<T1, T2>& a) const
+  {
+    return a.first;
+  }
+};
+
+template <typename T1, typename T2>
+struct pair_to_second : public thrust::unary_function<gpuPair<T1, T2>, T2> {
+  GPUhd() int operator()(const gpuPair<T1, T2>& a) const
+  {
+    return a.second;
+  }
+};
+
+template <typename T1, typename T2>
+struct is_invalid_pair {
+  GPUhd() bool operator()(const gpuPair<T1, T2>& p) const
+  {
+    return p.first == -1 && p.second == -1;
+  }
+};
+
+template <typename T1, typename T2>
+struct is_valid_pair {
+  GPUhd() bool operator()(const gpuPair<T1, T2>& p) const
+  {
+    return !(p.first == -1 && p.second == -1);
+  }
+};
+
 ////////////////////////////////////////////////////////////////////////////////
 // Legacy Kernels, to possibly take inspiration from
 ////////////////////////////////////////////////////////////////////////////////
@@ -344,6 +385,18 @@ GPUg() void printTrackletsNotStrided(const Tracklet* t,
   }
 }
 
+GPUg() void printNeighbours(const gpuPair<int, int>* neighbours,
+                            const int* nNeighboursIndexTable,
+                            const unsigned int nCells,
+                            const unsigned int tId = 0)
+{
+  for (unsigned int iNeighbour{0}; iNeighbour < nNeighboursIndexTable[nCells]; ++iNeighbour) {
+    if (threadIdx.x == tId) {
+      printf("%d -> %d\n", neighbours[iNeighbour].first, neighbours[iNeighbour].second);
+    }
+  }
+}
+
 // Compute the tracklets for a given layer
 template <int nLayers>
 GPUg() void computeLayerTrackletsKernelSingleRof(
@@ -597,7 +650,7 @@ GPUg() void computeLayerCellsKernel(
   const int* trackletsCurrentLayerLUT,
   const int nTrackletsCurrent,
   CellSeed* cells,
-  int* cellsLUT,
+  int* cellsLUTs,
   const StaticTrackingParameters<nLayers>* trkPars)
 {
   for (int iCurrentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentTrackletIndex < nTrackletsCurrent; iCurrentTrackletIndex += blockDim.x * gridDim.x) {
@@ -618,17 +671,17 @@
       if (deltaTanLambda / trkPars->CellDeltaTanLambdaSigma < trkPars->NSigmaCut) {
         if constexpr (!initRun) {
-          new (cells + cellsLUT[iCurrentTrackletIndex] + foundCells) Cell{currentTracklet.firstClusterIndex, nextTracklet.firstClusterIndex,
-                                                                          nextTracklet.secondClusterIndex,
-                                                                          iCurrentTrackletIndex,
-                                                                          iNextTrackletIndex};
+          new (cells + cellsLUTs[iCurrentTrackletIndex] + foundCells) Cell{currentTracklet.firstClusterIndex, nextTracklet.firstClusterIndex,
+                                                                           nextTracklet.secondClusterIndex,
+                                                                           iCurrentTrackletIndex,
+                                                                           iNextTrackletIndex};
         }
         ++foundCells;
       }
     }
     if constexpr (initRun) {
       // Fill cell Lookup table
-      cellsLUT[iCurrentTrackletIndex] = foundCells;
+      cellsLUTs[iCurrentTrackletIndex] = foundCells;
     }
   }
 }
@@ -683,29 +736,120 @@ GPUg() void computeLayerRoadsKernel(
   }
 }
 } // namespace gpu
 
-template <bool initRun>
-void cellNeighboursHandler(CellSeed* cellsCurrentLayer,
-                           CellSeed* cellsNextLayer,
-                           int* neighboursLUT,
-                           const int* cellsNextLayerLUT,
-                           gpuPair<int, int>* cellNeighbours,
-                           const float maxChi2ClusterAttachment,
-                           const float bz,
-                           const int layerIndex,
-                           const int* nCells,
-                           const int maxCellNeighbours = 1e2)
+void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
+                                int* neighboursLUT,
+                                int** cellsLUTs,
+                                gpuPair<int, int>* cellNeighbours,
+                                int* neighboursIndexTable,
+                                const float maxChi2ClusterAttachment,
+                                const float bz,
+                                const int layerIndex,
+                                const unsigned int nCells,
+                                const unsigned int nCellsNext,
+                                const int maxCellNeighbours,
+                                const int nBlocks,
+                                const int nThreads)
+{
+  gpu::computeLayerCellNeighboursKernel<true><<<nBlocks, nThreads>>>(
+    cellsLayersDevice,
+    neighboursLUT,
+    neighboursIndexTable,
+    cellsLUTs,
+    cellNeighbours,
+    maxChi2ClusterAttachment,
+    bz,
+    layerIndex,
+    nCells,
+    maxCellNeighbours);
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+  void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr;
+  size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0;
+  gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage,     // d_temp_storage
+                                              temp_storage_bytes, // temp_storage_bytes
+                                              neighboursLUT,      // d_in
+                                              neighboursLUT,      // d_out
+                                              nCellsNext));       // num_items
+  discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+  gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage,     // d_temp_storage
+                                              temp_storage_bytes, // temp_storage_bytes
+                                              neighboursLUT,      // d_in
+                                              neighboursLUT,      // d_out
+                                              nCellsNext));       // num_items
+  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,     // d_temp_storage
+                                              temp_storage_bytes_2, // temp_storage_bytes
+                                              neighboursIndexTable, // d_in
+                                              neighboursIndexTable, // d_out
+                                              nCells + 1,           // num_items
+                                              0));
+  discardResult(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2));
+  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,     // d_temp_storage
+                                              temp_storage_bytes_2, // temp_storage_bytes
+                                              neighboursIndexTable, // d_in
+                                              neighboursIndexTable, // d_out
+                                              nCells + 1,           // num_items
+                                              0));
+  gpuCheckError(cudaFree(d_temp_storage));
+  gpuCheckError(cudaFree(d_temp_storage_2));
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+}
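Both scans in countCellNeighboursHandler follow CUB's two-call protocol: invoked with a null temp-storage pointer, DeviceScan only reports the required scratch size; the identical call repeated with an allocated buffer performs the scan. A minimal sketch:

#include <cub/cub.cuh>

void inclusiveScanInPlace(int* d_data, int n)
{
  void* d_tmp = nullptr;
  size_t tmpBytes = 0;
  // First call: d_tmp == nullptr, so only tmpBytes is written.
  cub::DeviceScan::InclusiveSum(d_tmp, tmpBytes, d_data, d_data, n);
  cudaMalloc(&d_tmp, tmpBytes);
  // Second call, same arguments: the scan actually runs.
  cub::DeviceScan::InclusiveSum(d_tmp, tmpBytes, d_data, d_data, n);
  cudaFree(d_tmp);
}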
+
+void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
+                                  int* neighboursLUT,
+                                  int** cellsLUTs,
+                                  gpuPair<int, int>* cellNeighbours,
+                                  int* neighboursIndexTable,
+                                  const float maxChi2ClusterAttachment,
+                                  const float bz,
+                                  const int layerIndex,
+                                  const unsigned int nCells,
+                                  const unsigned int nCellsNext,
+                                  const int maxCellNeighbours,
+                                  const int nBlocks,
+                                  const int nThreads)
+{
+
+  gpu::computeLayerCellNeighboursKernel<false><<<nBlocks, nThreads>>>(
+    cellsLayersDevice,
+    neighboursLUT,
+    neighboursIndexTable,
+    cellsLUTs,
+    cellNeighbours,
+    maxChi2ClusterAttachment,
+    bz,
+    layerIndex,
+    nCells,
+    maxCellNeighbours);
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+}
+
+void filterCellNeighboursHandler(std::vector<int>& neighHost,
+                                 gpuPair<int, int>* cellNeighbours,
+                                 unsigned int nNeigh)
 {
-  gpu::computeLayerCellNeighboursKernel<initRun><<<20, 512>>>(
-    cellsCurrentLayer,        // CellSeed* cellsCurrentLayer,
-    cellsNextLayer,           // CellSeed* cellsNextLayer,
-    neighboursLUT,            // int* neighboursLUT,
-    cellsNextLayerLUT,        // const int* cellsNextLayerLUT,
-    cellNeighbours,           // gpuPair<int, int>* cellNeighbours,
-    maxChi2ClusterAttachment, // const float maxChi2ClusterAttachment,
-    bz,                       // const float bz,
-    layerIndex,               // const int layerIndex,
-    nCells,                   // const int* nCells,
-    maxCellNeighbours);       // const int maxCellNeighbours = 1e2
+  thrust::device_ptr<gpuPair<int, int>> neighVector(cellNeighbours);
+  thrust::device_vector<int> keys(nNeigh); // TODO: externally allocate.
+  thrust::device_vector<int> vals(nNeigh); // TODO: externally allocate.
+  thrust::copy(thrust::make_transform_iterator(neighVector, gpu::pair_to_second<int, int>()),
+               thrust::make_transform_iterator(neighVector + nNeigh, gpu::pair_to_second<int, int>()),
+               keys.begin());
+  thrust::sequence(vals.begin(), vals.end());
+  thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
+  thrust::device_vector<gpuPair<int, int>> sortedNeigh(nNeigh);
+  thrust::copy(thrust::make_permutation_iterator(neighVector, vals.begin()),
+               thrust::make_permutation_iterator(neighVector, vals.end()),
+               sortedNeigh.begin());
+  discardResult(cudaDeviceSynchronize());
+  auto trimmedBegin = thrust::find_if(sortedNeigh.begin(), sortedNeigh.end(), gpu::is_valid_pair<int, int>()); // trim leading -1s
+  auto trimmedSize = sortedNeigh.end() - trimmedBegin;
+  thrust::device_vector<int> validNeigh(trimmedSize);
+  neighHost.resize(trimmedSize);
+  thrust::transform(trimmedBegin, sortedNeigh.end(), validNeigh.begin(), gpu::pair_to_first<int, int>());
+  gpuCheckError(cudaMemcpy(neighHost.data(), thrust::raw_pointer_cast(validNeigh.data()), trimmedSize * sizeof(int), cudaMemcpyDeviceToHost));
 }
 
 void trackSeedHandler(CellSeed* trackSeeds,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
index f51544eaa970c..0b686273a159a 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt
@@ -16,6 +16,7 @@ if(HIP_ENABLED)
                SOURCES ../cuda/ClusterLinesGPU.cu
                        ../cuda/Context.cu
                        ../cuda/TimeFrameGPU.cu
+                       ../cuda/TimeFrameChunk.cu
                        ../cuda/Stream.cu
                        ../cuda/TrackerTraitsGPU.cxx
                        ../cuda/TracerGPU.cu
@@ -31,4 +32,4 @@ if(HIP_ENABLED)
                        hip::host
                PRIVATE_LINK_LIBRARIES O2::GPUTrackingHIPExternalProvider
                TARGETVARNAME targetName)
-endif()
+endif()
\ No newline at end of file
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
index 08e50cf9ea824..906eb0fa5c21e 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
@@ -288,6 +288,7 @@ class TimeFrame
   std::vector<Road<5>> mRoads;
   std::vector<std::vector<TrackITSExt>> mTracks;
   std::vector<std::vector<int>> mCellsNeighbours;
+  std::vector<std::vector<int>> mCellsLookupTable;
   const o2::base::PropagatorImpl<float>* mPropagatorDevice = nullptr; // Needed only for GPU
 
 protected:
@@ -315,7 +316,6 @@ class TimeFrame
   std::vector<std::pair<float, float>> mPValphaX; /// PV x and alpha for track propagation
   std::vector<std::vector<MCCompLabel>> mTrackletLabels;
   std::vector<std::vector<MCCompLabel>> mCellLabels;
-  std::vector<std::vector<int>> mCellsLookupTable;
   std::vector<std::vector<int>> mCellsNeighboursLUT;
   std::vector<std::vector<MCCompLabel>> mTracksLabel;
   std::vector<int> mBogusClusters; /// keep track of clusters with wild coordinates
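filterCellNeighboursHandler above compacts the neighbour list by sorting it (keyed on the second index of each pair) and trimming the leading {-1, -1} sentinels left over from the oversized, memset-initialised buffer. When the ordering is re-established separately, the same compaction can be expressed more directly with thrust::remove_if; a sketch of that variant:

#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/sort.h>

struct IsSentinel {
  __host__ __device__ bool operator()(int v) const { return v == -1; }
};

void compactAndSort(thrust::device_vector<int>& vals)
{
  // Drop the -1 sentinels in place...
  auto newEnd = thrust::remove_if(vals.begin(), vals.end(), IsSentinel{});
  vals.erase(newEnd, vals.end());
  // ...then sort the survivors on the device.
  thrust::sort(vals.begin(), vals.end());
}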