From d025f3d758632272e31661da57d04063701bdaa7 Mon Sep 17 00:00:00 2001
From: David Rohr
Date: Mon, 5 Feb 2024 20:06:50 +0100
Subject: [PATCH] GPU TPC: Create and distribute TPC Cluster Occupancy Map

---
 GPU/GPUTracking/Base/GPUParam.inc             |  4 +-
 GPU/GPUTracking/Base/GPUReconstructionCPU.h   |  1 +
 .../Base/GPUReconstructionIncludesDevice.h    |  3 +
 GPU/GPUTracking/CMakeLists.txt                |  2 +
 GPU/GPUTracking/DataTypes/GPUOutputControl.h  |  1 +
 .../DataTypes/GPUTPCClusterOccupancyMap.cxx   | 29 +++++++++
 .../DataTypes/GPUTPCClusterOccupancyMap.h     | 33 ++++++++++
 .../Definitions/GPUDefGPUParameters.h         |  6 ++
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  1 +
 GPU/GPUTracking/Global/GPUChain.h             |  4 ++
 .../Global/GPUChainTrackingSliceTracker.cxx   | 17 +++++-
 .../Global/GPUTrackingInputProvider.cxx       | 13 +++-
 .../Global/GPUTrackingInputProvider.h         |  6 ++
 .../SliceTracker/GPUTPCCreateOccupancyMap.cxx | 60 +++++++++++++++++++
 .../SliceTracker/GPUTPCCreateOccupancyMap.h   | 39 ++++++++++++
 GPU/GPUTracking/kernels.cmake                 |  2 +
 GPU/Workflow/src/GPUWorkflowSpec.cxx          |  6 ++
 17 files changed, 223 insertions(+), 4 deletions(-)
 create mode 100644 GPU/GPUTracking/DataTypes/GPUTPCClusterOccupancyMap.cxx
 create mode 100644 GPU/GPUTracking/DataTypes/GPUTPCClusterOccupancyMap.h
 create mode 100644 GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx
 create mode 100644 GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h

diff --git a/GPU/GPUTracking/Base/GPUParam.inc b/GPU/GPUTracking/Base/GPUParam.inc
index 2cb5f2275b13d..bfbedb374d098 100644
--- a/GPU/GPUTracking/Base/GPUParam.inc
+++ b/GPU/GPUTracking/Base/GPUParam.inc
@@ -77,9 +77,9 @@ GPUdi() float MEM_LG(GPUParam)::GetSystematicClusterErrorIFC2(float x, float z,
   } else if (rec.tpc.sysClusErrorZRegion > kEpsZBoundary && sideC) {
     return 0; // don't apply to C-side clusters if the Z-boundary is for A-region
   }
-  const float dz = CAMath::Abs((rec.tpc.sysClusErrorZRegion-z)*rec.tpc.sysClusErrorZRegionSigInv);
+  const float dz = CAMath::Abs((rec.tpc.sysClusErrorZRegion - z) * rec.tpc.sysClusErrorZRegionSigInv);
   if (dz < kMaxExpArgZ) { // is it small enough to call exp?
-    argExp += 0.5f * dz*dz;
+    argExp += 0.5f * dz * dz;
     if (argExp

 getTimer(name, num); }

+  // Get GRID with NBLOCKS minimal such that nThreads * NBLOCKS >= totalItems
   krnlExec GetGrid(unsigned int totalItems, unsigned int nThreads, int stream, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
+  // Get GRID with NBLOCKS minimal such that ideal number of threads * NBLOCKS >= totalItems
   krnlExec GetGrid(unsigned int totalItems, int stream, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
+  // Get GRID with specified number of blocks, each block with ideal number of threads
   krnlExec GetGridBlk(unsigned int nBlocks, int stream, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
   krnlExec GetGridBlkStep(unsigned int nBlocks, int stream, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
+  // Get GRID with ideal number of threads / blocks for GPU
   krnlExec GetGridAuto(int stream, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
   krnlExec GetGridAutoStep(int stream, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
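The comments added to GPUChain.h above describe how a launch grid is derived from a total work-item count. As a standalone illustration (not the GPUChain implementation; the helper name below is made up), the block count "minimal such that nThreads * NBLOCKS >= totalItems" is a ceiling division:

#include <cstdio>

// Hypothetical stand-in for the grid-size rule described in the comments above:
// the smallest NBLOCKS with nThreads * NBLOCKS >= totalItems.
static unsigned int minimalBlockCount(unsigned int totalItems, unsigned int nThreads)
{
  return (totalItems + nThreads - 1) / nThreads; // ceiling division
}

int main()
{
  const unsigned int totalItems = 36 * 152; // e.g. one work item per TPC sector/row pair
  const unsigned int nThreads = 256;        // assumed "ideal" threads per block
  std::printf("%u items -> %u blocks of %u threads\n", totalItems, minimalBlockCount(totalItems, nThreads), nThreads);
  return 0;
}

This is the kind of grid requested further below via GetGridBlk(GPUCA_NSLICES * GPUCA_ROW_COUNT, ...) for GPUTPCCreateOccupancyMap, where one thread processes one (sector, row) pair.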
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
index 4d023cc8bca22..379c302159ef5 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
@@ -17,6 +17,8 @@
 #include "GPUO2DataTypes.h"
 #include "GPUMemorySizeScalers.h"
 #include "GPUTPCClusterData.h"
+#include "GPUTrackingInputProvider.h"
+#include "GPUTPCClusterOccupancyMap.h"
 #include "utils/strtag.h"

 #include
@@ -143,6 +145,20 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
     return (2);
   }

+  if (param().rec.tpc.occupancyMapTimeBins) {
+    AllocateRegisteredMemory(mInputsHost->mResourceOccupancyMap, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcOccupancyMap)]);
+    ReleaseEvent(mEvents->init);
+    auto* ptr = doGPU ? mInputsShadow->mTPCClusterOccupancyMap : mInputsHost->mTPCClusterOccupancyMap;
+    runKernel<GPUMemClean16>(GetGridAutoStep(mRec->NStreams() - 1, RecoStep::TPCSliceTracking), krnlRunRangeNone, {}, ptr, GPUTPCClusterOccupancyMapBin::getTotalSize(param()));
+    runKernel<GPUTPCCreateOccupancyMap, GPUTPCCreateOccupancyMap::fill>(GetGridBlk(GPUCA_NSLICES * GPUCA_ROW_COUNT, mRec->NStreams() - 1), krnlRunRangeNone, krnlEventNone, ptr);
+    runKernel<GPUTPCCreateOccupancyMap, GPUTPCCreateOccupancyMap::fold>(GetGridBlk(GPUCA_NSLICES * GPUCA_ROW_COUNT, mRec->NStreams() - 1), krnlRunRangeNone, {&mEvents->init}, ptr);
+    if (doGPU) {
+      TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap);
+    } else {
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap);
+    }
+  }
+
   int streamMap[NSLICES];
   bool error = false;
@@ -170,7 +186,6 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
         GPUInfo("Waiting for helper thread %d", iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1);
       }
       while (HelperDone(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1) < (int)iSlice) {
-        ;
       }
       if (HelperError(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1)) {
         error = 1;
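The new DataTypes/GPUTPCClusterOccupancyMap.{h,cxx} files are not reproduced in this excerpt. A minimal sketch of a bin layout that would be consistent with how the kernels later in this patch index the map (map[bin].bin[slice][row] with unsigned short counters) and with the getNBins()/getTotalSize() helpers used above could look as follows; the names and the exact bin-count formula are assumptions, not the committed code:

#include <cstdint>

constexpr unsigned int kNSlices = 36;   // TPC sectors (GPUCA_NSLICES)
constexpr unsigned int kRowCount = 152; // TPC pad rows per sector (GPUCA_ROW_COUNT)

// Assumed layout: one bin covers occupancyMapTimeBins TPC time bins and holds a
// cluster counter per (sector, row).
struct ClusterOccupancyMapBinSketch {
  uint16_t bin[kNSlices][kRowCount];

  // Number of time windows needed to cover time bins [0, maxTimeBin].
  static unsigned int getNBins(unsigned int maxTimeBin, unsigned int occupancyMapTimeBins)
  {
    return maxTimeBin / occupancyMapTimeBins + 1;
  }
  // Total byte size of the map, e.g. for the memory-clear call above.
  static unsigned int getTotalSize(unsigned int maxTimeBin, unsigned int occupancyMapTimeBins)
  {
    return getNBins(maxTimeBin, occupancyMapTimeBins) * static_cast<unsigned int>(sizeof(ClusterOccupancyMapBinSketch));
  }
};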
diff --git a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx
index 2d9f7aea723d4..f541b4a6be8d1 100644
--- a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx
+++ b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx
@@ -16,6 +16,7 @@
 #include "GPUDataTypes.h"
 #include "GPUTRDTrackletWord.h"
 #include "GPUReconstruction.h"
+#include "GPUTPCClusterOccupancyMap.h"
 #include "GPUErrors.h"

 using namespace GPUCA_NAMESPACE::gpu;
@@ -75,10 +76,19 @@ void* GPUTrackingInputProvider::SetPointersInputTRD(void* mem)
   return mem;
 }

+void* GPUTrackingInputProvider::SetPointersTPCOccupancyMap(void* mem)
+{
+  if (mHoldTPCOccupancyMap) {
+    computePointerWithAlignment(mem, mTPCClusterOccupancyMap, GPUTPCClusterOccupancyMapBin::getNBins(mRec->GetParam()));
+  }
+  return mem;
+}
+
 void GPUTrackingInputProvider::RegisterMemoryAllocation()
 {
   mResourceErrorCodes = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersErrorCodes, GPUMemoryResource::MEMORY_PERMANENT, "ErrorCodes");
   mResourceZS = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersInputZS, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_PERMANENT, "InputZS");
+  mResourceOccupancyMap = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersTPCOccupancyMap, GPUMemoryResource::MEMORY_INOUT | GPUMemoryResource::MEMORY_CUSTOM, "OccupancyMap");
   mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT, "ClusterNativeAccess");
   mResourceClusterNativeBuffer = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersInputClusterNativeBuffer, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_EXTERNAL | GPUMemoryResource::MEMORY_CUSTOM, "ClusterNativeBuffer");
   mResourceClusterNativeOutput = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersInputClusterNativeOutput, GPUMemoryResource::MEMORY_OUTPUT_FLAG | GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_CUSTOM, "ClusterNativeOutput");
@@ -89,5 +99,6 @@ void GPUTrackingInputProvider::SetMaxData(const GPUTrackingInOutPointers& io)
 {
   mHoldTPCZS = io.tpcZS && (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding);
   mHoldTPCClusterNative = (io.tpcZS || io.tpcPackedDigits || io.clustersNative || io.tpcCompressedClusters) && mRec->IsGPU();
-  mHoldTPCClusterNativeOutput = (io.tpcZS || io.tpcPackedDigits || io.tpcCompressedClusters);
+  mHoldTPCOccupancyMap = (io.tpcZS || io.tpcPackedDigits || io.clustersNative || io.tpcCompressedClusters) && mRec->GetParam().rec.tpc.occupancyMapTimeBins;
+  mHoldTPCClusterNativeOutput = io.tpcZS || io.tpcPackedDigits || io.tpcCompressedClusters;
 }
diff --git a/GPU/GPUTracking/Global/GPUTrackingInputProvider.h b/GPU/GPUTracking/Global/GPUTrackingInputProvider.h
index a14dcdebc39de..0f958353fa464 100644
--- a/GPU/GPUTracking/Global/GPUTrackingInputProvider.h
+++ b/GPU/GPUTracking/Global/GPUTrackingInputProvider.h
@@ -33,6 +33,7 @@ namespace gpu
 {

 struct GPUTrackingInOutZS;
+struct GPUTPCClusterOccupancyMapBin;
 class GPUTRDTrackletWord;
 class GPUTRDSpacePoint;

@@ -44,6 +45,7 @@ class GPUTrackingInputProvider : public GPUProcessor
   void RegisterMemoryAllocation();
   void SetMaxData(const GPUTrackingInOutPointers& io);

+  void* SetPointersTPCOccupancyMap(void* mem);
   void* SetPointersInputZS(void* mem);
   void* SetPointersInputClusterNativeAccess(void* mem);
   void* SetPointersInputClusterNativeBuffer(void* mem);
@@ -58,10 +60,12 @@ class GPUTrackingInputProvider : public GPUProcessor
   unsigned short mResourceClusterNativeOutput = -1;
   unsigned short mResourceErrorCodes = -1;
   unsigned short mResourceTRD = -1;
+  unsigned short mResourceOccupancyMap = -1;

   bool mHoldTPCZS = false;
   bool mHoldTPCClusterNative = false;
   bool mHoldTPCClusterNativeOutput = false;
+  bool mHoldTPCOccupancyMap = false;

   unsigned int mNClusterNative = 0;
   GPUTrackingInOutZS* mPzsMeta = nullptr;
@@ -81,6 +85,8 @@ class GPUTrackingInputProvider : public GPUProcessor
   o2::tpc::ClusterNative* mPclusterNativeBuffer = nullptr;
   o2::tpc::ClusterNative* mPclusterNativeOutput = nullptr;

+  GPUTPCClusterOccupancyMapBin* mTPCClusterOccupancyMap = nullptr;
+
   unsigned int* mErrorCodes = nullptr;
 };
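SetPointersTPCOccupancyMap above follows the usual GPUProcessor pattern: a registered callback lays the map out inside a larger memory block by bumping a cursor. A self-contained sketch of that pointer-arithmetic pattern (the helper below is a stand-in, not the O2 computePointerWithAlignment implementation):

#include <cstddef>
#include <cstdint>

// Stand-in for the pointer-bump pattern used by the SetPointers* callbacks:
// align the running cursor, hand out a typed sub-buffer of n elements, advance the cursor.
template <class T>
void computePointerWithAlignmentSketch(void*& mem, T*& ptr, std::size_t n, std::size_t alignment = 64)
{
  std::uintptr_t p = reinterpret_cast<std::uintptr_t>(mem);
  p = (p + alignment - 1) & ~(std::uintptr_t)(alignment - 1); // round the cursor up to the alignment
  ptr = reinterpret_cast<T*>(p);                              // sub-buffer handed to the caller
  mem = reinterpret_cast<void*>(p + n * sizeof(T));           // cursor now points past it
}

Running such a callback once with a dummy base pointer to measure the required size, and again with the real host or device buffer, is what allows a framework like this to register the occupancy map as a custom memory resource.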
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx
new file mode 100644
index 0000000000000..749ea81dd528b
--- /dev/null
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx
@@ -0,0 +1,60 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCCreateOccupancyMap.cxx
+/// \author David Rohr
+
+#include "GPUTPCCreateOccupancyMap.h"
+#include "GPUTPCClusterOccupancyMap.h"
+
+using namespace GPUCA_NAMESPACE::gpu;
+
+template <>
+GPUdii() void GPUTPCCreateOccupancyMap::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors, GPUTPCClusterOccupancyMapBin* GPUrestrict() map)
+{
+  const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;
+  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = ioPtrs.clustersNative;
+  GPUParam& GPUrestrict() param = processors.param;
+  const int iSliceRow = iBlock * nThreads + iThread;
+  if (iSliceRow >= GPUCA_ROW_COUNT * GPUCA_NSLICES) {
+    return;
+  }
+  const unsigned int iSlice = iSliceRow / GPUCA_ROW_COUNT;
+  const unsigned int iRow = iSliceRow % GPUCA_ROW_COUNT;
+  for (unsigned int i = 0; i < clusters->nClusters[iSlice][iRow]; i++) {
+    const unsigned int bin = clusters->clusters[iSlice][iRow][i].getTime() / param.rec.tpc.occupancyMapTimeBins;
+    map[bin].bin[iSlice][iRow]++;
+  }
+}
+
+template <>
+GPUdii() void GPUTPCCreateOccupancyMap::Thread<1>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors, GPUTPCClusterOccupancyMapBin* GPUrestrict() map)
+{
+  GPUParam& GPUrestrict() param = processors.param;
+  const int iSliceRow = iBlock * nThreads + iThread;
+  if (iSliceRow >= GPUCA_ROW_COUNT * GPUCA_NSLICES) {
+    return;
+  }
+  const unsigned int iSlice = iSliceRow / GPUCA_ROW_COUNT;
+  const unsigned int iRow = iSliceRow % GPUCA_ROW_COUNT;
+  const unsigned int nBins = GPUTPCClusterOccupancyMapBin::getNBins(param);
+  const unsigned int nFoldBins = CAMath::Min(5u, nBins);
+  unsigned int sum = 0;
+  for (unsigned int i = 0; i < nFoldBins; i++) {
+    sum += map[i].bin[iSlice][iRow];
+  }
+  unsigned short lastVal;
+  for (unsigned int i = 0; i < nBins; i++) {
+    lastVal = map[i].bin[iSlice][iRow];
+    map[i].bin[iSlice][iRow] = sum / nFoldBins;
+    sum += map[CAMath::Min(i + nFoldBins, nBins - 1)].bin[iSlice][iRow] - lastVal;
+  }
+}
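The fold kernel above replaces each time window's count with a forward moving average over up to nFoldBins windows, using a rolling sum so that each (sector, row) series is smoothed in a single pass. A plain host-side reference of the same smoothing, written against a std::vector for clarity (a sketch, not code from the patch):

#include <algorithm>
#include <cstdint>
#include <vector>

// Forward moving average over a window of nFold bins, clamped at the end of the
// series; this mirrors the rolling-sum update the fold kernel applies in place per (sector, row).
std::vector<uint16_t> foldOccupancySketch(const std::vector<uint16_t>& bins, unsigned int nFold = 5)
{
  const unsigned int nBins = static_cast<unsigned int>(bins.size());
  nFold = std::min(nFold, nBins);
  if (nFold == 0) {
    return {};
  }
  unsigned int sum = 0;
  for (unsigned int i = 0; i < nFold; i++) {
    sum += bins[i]; // initial window [0, nFold - 1]
  }
  std::vector<uint16_t> out(nBins);
  for (unsigned int i = 0; i < nBins; i++) {
    out[i] = static_cast<uint16_t>(sum / nFold);
    // Slide the window: drop bins[i], add the next bin (repeating the last one near the end).
    sum += bins[std::min(i + nFold, nBins - 1)] - bins[i];
  }
  return out;
}

With the default window of 5 bins this acts as a small boxcar filter, so a single noisy time window does not dominate the occupancy estimate.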
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h
new file mode 100644
index 0000000000000..33a0d18b92a30
--- /dev/null
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h
@@ -0,0 +1,39 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCCreateOccupancyMap.h
+/// \author David Rohr
+
+#ifndef GPUTPCCREATEOCCUPANCYMAP_H
+#define GPUTPCCREATEOCCUPANCYMAP_H
+
+#include "GPUTPCDef.h"
+#include "GPUGeneralKernels.h"
+#include "GPUConstantMem.h"
+
+namespace GPUCA_NAMESPACE::gpu
+{
+struct GPUTPCClusterOccupancyMapBin;
+
+class GPUTPCCreateOccupancyMap : public GPUKernelTemplate
+{
+ public:
+  enum K { defaultKernel = 0,
+           fill = 0,
+           fold = 1 };
+  GPUhdi() CONSTEXPR static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCSliceTracking; }
+  template <int iKernel = defaultKernel>
+  GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors, GPUTPCClusterOccupancyMapBin* map);
+};
+
+} // namespace GPUCA_NAMESPACE::gpu
+
+#endif
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index b06749010d2d9..ff35e7e0e02cc 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -23,6 +23,8 @@ o2_gpu_add_kernel("GPUMemClean16" NO_OCL1 "simple
 o2_gpu_add_kernel("GPUTPCGlobalTrackingCopyNumbers" NO_OCL1 single int n)
 o2_gpu_add_kernel("GPUTPCCreateSliceData" LB single)
 o2_gpu_add_kernel("GPUTPCGlobalTracking" LB single)
+o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fill" LB simple GPUTPCClusterOccupancyMapBin* map)
+o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fold" LB simple GPUTPCClusterOccupancyMapBin* map)
 o2_gpu_add_kernel("GPUTPCGMMergerTrackFit" LB simple int mode)
 o2_gpu_add_kernel("GPUTPCGMMergerFollowLoopers" LB simple)
 o2_gpu_add_kernel("GPUTPCGMMergerUnpackResetIds" LB simple int iSlice)
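kernels.cmake registers two variants of the same kernel class; the K enum in the header above is what selects the matching Thread<> specialization at compile time. A generic, framework-free sketch of that dispatch pattern (names invented, not the O2 kernel framework):

#include <cstdio>

struct OccupancyKernelSketch {
  enum K { fill = 0, fold = 1 };
  // Primary declaration; the variants are provided as explicit specializations below.
  template <int iKernel>
  static void Thread(int item);
};

template <>
void OccupancyKernelSketch::Thread<OccupancyKernelSketch::fill>(int item)
{
  std::printf("fill: histogram cluster times for slice/row %d\n", item);
}

template <>
void OccupancyKernelSketch::Thread<OccupancyKernelSketch::fold>(int item)
{
  std::printf("fold: average neighbouring time windows for slice/row %d\n", item);
}

int main()
{
  OccupancyKernelSketch::Thread<OccupancyKernelSketch::fill>(0);
  OccupancyKernelSketch::Thread<OccupancyKernelSketch::fold>(0);
  return 0;
}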
(DataDescription) "CLUSTERNATIVETMP" : (DataDescription) "CLUSTERNATIVE", NSectors, clusterOutputSectorHeader), sizeof(o2::tpc::ClusterCountIndex)); setOutputAllocator("CLSHAREDMAP", mSpecConfig.outputSharedClusterMap, outputRegions.sharedClusterMap, std::make_tuple(gDataOriginTPC, (DataDescription) "CLSHAREDMAP", 0)); + setOutputAllocator("TPCOCCUPANCYMAP", mSpecConfig.outputSharedClusterMap, outputRegions.tpcOccupancyMap, std::make_tuple(gDataOriginTPC, (DataDescription) "TPCOCCUPANCYMAP", 0)); setOutputAllocator("TRACKS", mSpecConfig.outputTracks, outputRegions.tpcTracksO2, std::make_tuple(gDataOriginTPC, (DataDescription) "TRACKS", 0)); setOutputAllocator("CLUSREFS", mSpecConfig.outputTracks, outputRegions.tpcTracksO2ClusRefs, std::make_tuple(gDataOriginTPC, (DataDescription) "CLUSREFS", 0)); setOutputAllocator("TRACKSMCLBL", mSpecConfig.outputTracks && mSpecConfig.processMC, outputRegions.tpcTracksO2Labels, std::make_tuple(gDataOriginTPC, (DataDescription) "TRACKSMCLBL", 0)); @@ -837,6 +838,10 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) } } + if (mConfig->configReconstruction.tpc.occupancyMapTimeBins == 0) { + pc.outputs().make>({gDataOriginTPC, "TPCOCCUPANCYMAP", 0}, 0u); + } + std::unique_ptr tmpEmptyClNative; if (createEmptyOutput) { memset(&ptrs, 0, sizeof(ptrs)); @@ -1201,6 +1206,7 @@ Outputs GPURecoWorkflowSpec::outputs() } if (mSpecConfig.outputSharedClusterMap) { outputSpecs.emplace_back(gDataOriginTPC, "CLSHAREDMAP", 0, Lifetime::Timeframe); + outputSpecs.emplace_back(gDataOriginTPC, "TPCOCCUPANCYMAP", 0, Lifetime::Timeframe); } if (mSpecConfig.tpcTriggerHandling) { outputSpecs.emplace_back(gDataOriginTPC, "TRIGGERWORDS", 0, Lifetime::Timeframe);