From d025f3d758632272e31661da57d04063701bdaa7 Mon Sep 17 00:00:00 2001
From: David Rohr
Date: Mon, 5 Feb 2024 20:06:50 +0100
Subject: [PATCH] GPU TPC: Create and distribute TPC Cluster Occupancy Map

---
 GPU/GPUTracking/Base/GPUParam.inc             |  4 +-
 GPU/GPUTracking/Base/GPUReconstructionCPU.h   |  1 +
 .../Base/GPUReconstructionIncludesDevice.h    |  3 +
 GPU/GPUTracking/CMakeLists.txt                |  2 +
 GPU/GPUTracking/DataTypes/GPUOutputControl.h  |  1 +
 .../DataTypes/GPUTPCClusterOccupancyMap.cxx   | 29 +++++++++
 .../DataTypes/GPUTPCClusterOccupancyMap.h     | 33 ++++++++++
 .../Definitions/GPUDefGPUParameters.h         |  6 ++
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  1 +
 GPU/GPUTracking/Global/GPUChain.h             |  4 ++
 .../Global/GPUChainTrackingSliceTracker.cxx   | 17 +++++-
 .../Global/GPUTrackingInputProvider.cxx       | 13 +++-
 .../Global/GPUTrackingInputProvider.h         |  6 ++
 .../SliceTracker/GPUTPCCreateOccupancyMap.cxx | 60 +++++++++++++++++++
 .../SliceTracker/GPUTPCCreateOccupancyMap.h   | 39 ++++++++++++
 GPU/GPUTracking/kernels.cmake                 |  2 +
 GPU/Workflow/src/GPUWorkflowSpec.cxx          |  6 ++
 17 files changed, 223 insertions(+), 4 deletions(-)
 create mode 100644 GPU/GPUTracking/DataTypes/GPUTPCClusterOccupancyMap.cxx
 create mode 100644 GPU/GPUTracking/DataTypes/GPUTPCClusterOccupancyMap.h
 create mode 100644 GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx
 create mode 100644 GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h

diff --git a/GPU/GPUTracking/Base/GPUParam.inc b/GPU/GPUTracking/Base/GPUParam.inc
index 2cb5f2275b13d..bfbedb374d098 100644
--- a/GPU/GPUTracking/Base/GPUParam.inc
+++ b/GPU/GPUTracking/Base/GPUParam.inc
@@ -77,9 +77,9 @@ GPUdi() float MEM_LG(GPUParam)::GetSystematicClusterErrorIFC2(float x, float z,
   } else if (rec.tpc.sysClusErrorZRegion > kEpsZBoundary && sideC) {
     return 0; // don't apply to C-side clusters if the Z-boundary is for A-region
   }
-  const float dz = CAMath::Abs((rec.tpc.sysClusErrorZRegion-z)*rec.tpc.sysClusErrorZRegionSigInv);
+  const float dz = CAMath::Abs((rec.tpc.sysClusErrorZRegion - z) * rec.tpc.sysClusErrorZRegionSigInv);
   if (dz < kMaxExpArgZ) { // is it small enough to call exp?
-    argExp += 0.5f * dz*dz;
+    argExp += 0.5f * dz * dz;
     if (argExp

 getTimer(name, num); }

+  // Get GRID with NBLOCKS minimal such that nThreads * NBLOCKS >= totalItems
   krnlExec GetGrid(unsigned int totalItems, unsigned int nThreads, int stream, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
+  // Get GRID with NBLOCKS minimal such that ideal number of threads * NBLOCKS >= totalItems
   krnlExec GetGrid(unsigned int totalItems, int stream, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
+  // Get GRID with specified number of blocks, each block with ideal number of threads
   krnlExec GetGridBlk(unsigned int nBlocks, int stream, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
   krnlExec GetGridBlkStep(unsigned int nBlocks, int stream, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
+  // Get GRID with ideal number of threads / blocks for GPU
   krnlExec GetGridAuto(int stream, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
   krnlExec GetGridAutoStep(int stream, GPUCA_RECO_STEP st = GPUCA_RECO_STEP::NoRecoStep);
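The comments added to GPUChain.h above describe how a launch grid is derived from a total work-item count. As a standalone illustration (not the GPUChain implementation; the helper name below is made up), the block count "minimal such that nThreads * NBLOCKS >= totalItems" is a ceiling division:

#include <cstdio>

// Hypothetical stand-in for the grid-size rule described in the comments above:
// the smallest NBLOCKS with nThreads * NBLOCKS >= totalItems.
static unsigned int minimalBlockCount(unsigned int totalItems, unsigned int nThreads)
{
  return (totalItems + nThreads - 1) / nThreads; // ceiling division
}

int main()
{
  const unsigned int totalItems = 36 * 152; // e.g. one work item per TPC sector/row pair
  const unsigned int nThreads = 256;        // assumed "ideal" threads per block
  std::printf("%u items -> %u blocks of %u threads\n", totalItems, minimalBlockCount(totalItems, nThreads), nThreads);
  return 0;
}

This is the kind of grid requested further below via GetGridBlk(GPUCA_NSLICES * GPUCA_ROW_COUNT, ...) for GPUTPCCreateOccupancyMap, where one thread processes one (sector, row) pair.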
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
index 4d023cc8bca22..379c302159ef5 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
@@ -17,6 +17,8 @@
 #include "GPUO2DataTypes.h"
 #include "GPUMemorySizeScalers.h"
 #include "GPUTPCClusterData.h"
+#include "GPUTrackingInputProvider.h"
+#include "GPUTPCClusterOccupancyMap.h"
 #include "utils/strtag.h"

 #include
@@ -143,6 +145,20 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
     return (2);
   }

+  if (param().rec.tpc.occupancyMapTimeBins) {
+    AllocateRegisteredMemory(mInputsHost->mResourceOccupancyMap, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcOccupancyMap)]);
+    ReleaseEvent(mEvents->init);
+    auto* ptr = doGPU ? mInputsShadow->mTPCClusterOccupancyMap : mInputsHost->mTPCClusterOccupancyMap;
+    runKernel<GPUMemClean16>(GetGridAutoStep(mRec->NStreams() - 1, RecoStep::TPCSliceTracking), krnlRunRangeNone, {}, ptr, GPUTPCClusterOccupancyMapBin::getTotalSize(param()));
+    runKernel<GPUTPCCreateOccupancyMap, GPUTPCCreateOccupancyMap::fill>(GetGridBlk(GPUCA_NSLICES * GPUCA_ROW_COUNT, mRec->NStreams() - 1), krnlRunRangeNone, krnlEventNone, ptr);
+    runKernel<GPUTPCCreateOccupancyMap, GPUTPCCreateOccupancyMap::fold>(GetGridBlk(GPUCA_NSLICES * GPUCA_ROW_COUNT, mRec->NStreams() - 1), krnlRunRangeNone, {&mEvents->init}, ptr);
+    if (doGPU) {
+      TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap);
+    } else {
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap);
+    }
+  }
+
   int streamMap[NSLICES];
   bool error = false;
@@ -170,7 +186,6 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
         GPUInfo("Waiting for helper thread %d", iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1);
       }
       while (HelperDone(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1) < (int)iSlice) {
-        ;
       }
       if (HelperError(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1)) {
         error = 1;
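The new DataTypes/GPUTPCClusterOccupancyMap.{h,cxx} files are not reproduced in this excerpt. A minimal sketch of a bin layout that would be consistent with how the kernels later in this patch index the map (map[bin].bin[slice][row] with unsigned short counters) and with the getNBins()/getTotalSize() helpers used above could look as follows; the names and the exact bin-count formula are assumptions, not the committed code:

#include <cstdint>

constexpr unsigned int kNSlices = 36;   // TPC sectors (GPUCA_NSLICES)
constexpr unsigned int kRowCount = 152; // TPC pad rows per sector (GPUCA_ROW_COUNT)

// Assumed layout: one bin covers occupancyMapTimeBins TPC time bins and holds a
// cluster counter per (sector, row).
struct ClusterOccupancyMapBinSketch {
  uint16_t bin[kNSlices][kRowCount];

  // Number of time windows needed to cover time bins [0, maxTimeBin].
  static unsigned int getNBins(unsigned int maxTimeBin, unsigned int occupancyMapTimeBins)
  {
    return maxTimeBin / occupancyMapTimeBins + 1;
  }
  // Total byte size of the map, e.g. for the memory-clear call above.
  static unsigned int getTotalSize(unsigned int maxTimeBin, unsigned int occupancyMapTimeBins)
  {
    return getNBins(maxTimeBin, occupancyMapTimeBins) * static_cast<unsigned int>(sizeof(ClusterOccupancyMapBinSketch));
  }
};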
diff --git a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx
index 2d9f7aea723d4..f541b4a6be8d1 100644
--- a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx
+++ b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx
@@ -16,6 +16,7 @@
 #include "GPUDataTypes.h"
 #include "GPUTRDTrackletWord.h"
 #include "GPUReconstruction.h"
+#include "GPUTPCClusterOccupancyMap.h"
 #include "GPUErrors.h"

 using namespace GPUCA_NAMESPACE::gpu;
@@ -75,10 +76,19 @@ void* GPUTrackingInputProvider::SetPointersInputTRD(void* mem)
   return mem;
 }

+void* GPUTrackingInputProvider::SetPointersTPCOccupancyMap(void* mem)
+{
+  if (mHoldTPCOccupancyMap) {
+    computePointerWithAlignment(mem, mTPCClusterOccupancyMap, GPUTPCClusterOccupancyMapBin::getNBins(mRec->GetParam()));
+  }
+  return mem;
+}
+
 void GPUTrackingInputProvider::RegisterMemoryAllocation()
 {
   mResourceErrorCodes = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersErrorCodes, GPUMemoryResource::MEMORY_PERMANENT, "ErrorCodes");
   mResourceZS = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersInputZS, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_PERMANENT, "InputZS");
+  mResourceOccupancyMap = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersTPCOccupancyMap, GPUMemoryResource::MEMORY_INOUT | GPUMemoryResource::MEMORY_CUSTOM, "OccupancyMap");
   mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT, "ClusterNativeAccess");
   mResourceClusterNativeBuffer = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersInputClusterNativeBuffer, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_EXTERNAL | GPUMemoryResource::MEMORY_CUSTOM, "ClusterNativeBuffer");
   mResourceClusterNativeOutput = mRec->RegisterMemoryAllocation(this, &GPUTrackingInputProvider::SetPointersInputClusterNativeOutput, GPUMemoryResource::MEMORY_OUTPUT_FLAG | GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_CUSTOM, "ClusterNativeOutput");
@@ -89,5 +99,6 @@ void GPUTrackingInputProvider::SetMaxData(const GPUTrackingInOutPointers& io)
 {
   mHoldTPCZS = io.tpcZS && (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding);
   mHoldTPCClusterNative = (io.tpcZS || io.tpcPackedDigits || io.clustersNative || io.tpcCompressedClusters) && mRec->IsGPU();
-  mHoldTPCClusterNativeOutput = (io.tpcZS || io.tpcPackedDigits || io.tpcCompressedClusters);
+  mHoldTPCOccupancyMap = (io.tpcZS || io.tpcPackedDigits || io.clustersNative || io.tpcCompressedClusters) && mRec->GetParam().rec.tpc.occupancyMapTimeBins;
+  mHoldTPCClusterNativeOutput = io.tpcZS || io.tpcPackedDigits || io.tpcCompressedClusters;
 }
diff --git a/GPU/GPUTracking/Global/GPUTrackingInputProvider.h b/GPU/GPUTracking/Global/GPUTrackingInputProvider.h
index a14dcdebc39de..0f958353fa464 100644
--- a/GPU/GPUTracking/Global/GPUTrackingInputProvider.h
+++ b/GPU/GPUTracking/Global/GPUTrackingInputProvider.h
@@ -33,6 +33,7 @@ namespace gpu
 {

 struct GPUTrackingInOutZS;
+struct GPUTPCClusterOccupancyMapBin;
 class GPUTRDTrackletWord;
 class GPUTRDSpacePoint;

@@ -44,6 +45,7 @@ class GPUTrackingInputProvider : public GPUProcessor
   void RegisterMemoryAllocation();
   void SetMaxData(const GPUTrackingInOutPointers& io);

+  void* SetPointersTPCOccupancyMap(void* mem);
   void* SetPointersInputZS(void* mem);
   void* SetPointersInputClusterNativeAccess(void* mem);
   void* SetPointersInputClusterNativeBuffer(void* mem);
@@ -58,10 +60,12 @@ class GPUTrackingInputProvider : public GPUProcessor
   unsigned short mResourceClusterNativeOutput = -1;
   unsigned short mResourceErrorCodes = -1;
   unsigned short mResourceTRD = -1;
+  unsigned short mResourceOccupancyMap = -1;

   bool mHoldTPCZS = false;
   bool mHoldTPCClusterNative = false;
   bool mHoldTPCClusterNativeOutput = false;
+  bool mHoldTPCOccupancyMap = false;

   unsigned int mNClusterNative = 0;
   GPUTrackingInOutZS* mPzsMeta = nullptr;
@@ -81,6 +85,8 @@ class GPUTrackingInputProvider : public GPUProcessor
   o2::tpc::ClusterNative* mPclusterNativeBuffer = nullptr;
   o2::tpc::ClusterNative* mPclusterNativeOutput = nullptr;

+  GPUTPCClusterOccupancyMapBin* mTPCClusterOccupancyMap = nullptr;
+
   unsigned int* mErrorCodes = nullptr;
 };
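SetPointersTPCOccupancyMap above follows the usual GPUProcessor pattern: a registered callback lays the map out inside a larger memory block by bumping a cursor. A self-contained sketch of that pointer-arithmetic pattern (the helper below is a stand-in, not the O2 computePointerWithAlignment implementation):

#include <cstddef>
#include <cstdint>

// Stand-in for the pointer-bump pattern used by the SetPointers* callbacks:
// align the running cursor, hand out a typed sub-buffer of n elements, advance the cursor.
template <class T>
void computePointerWithAlignmentSketch(void*& mem, T*& ptr, std::size_t n, std::size_t alignment = 64)
{
  std::uintptr_t p = reinterpret_cast<std::uintptr_t>(mem);
  p = (p + alignment - 1) & ~(std::uintptr_t)(alignment - 1); // round the cursor up to the alignment
  ptr = reinterpret_cast<T*>(p);                              // sub-buffer handed to the caller
  mem = reinterpret_cast<void*>(p + n * sizeof(T));           // cursor now points past it
}

Running such a callback once with a dummy base pointer to measure the required size, and again with the real host or device buffer, is what allows a framework like this to register the occupancy map as a custom memory resource.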
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx
new file mode 100644
index 0000000000000..749ea81dd528b
--- /dev/null
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx
@@ -0,0 +1,60 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCCreateOccupancyMap.cxx
+/// \author David Rohr
+
+#include "GPUTPCCreateOccupancyMap.h"
+#include "GPUTPCClusterOccupancyMap.h"
+
+using namespace GPUCA_NAMESPACE::gpu;
+
+template <>
+GPUdii() void GPUTPCCreateOccupancyMap::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors, GPUTPCClusterOccupancyMapBin* GPUrestrict() map)
+{
+  const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;
+  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = ioPtrs.clustersNative;
+  GPUParam& GPUrestrict() param = processors.param;
+  const int iSliceRow = iBlock * nThreads + iThread;
+  if (iSliceRow >= GPUCA_ROW_COUNT * GPUCA_NSLICES) {
+    return;
+  }
+  const unsigned int iSlice = iSliceRow / GPUCA_ROW_COUNT;
+  const unsigned int iRow = iSliceRow % GPUCA_ROW_COUNT;
+  for (unsigned int i = 0; i < clusters->nClusters[iSlice][iRow]; i++) {
+    const unsigned int bin = clusters->clusters[iSlice][iRow][i].getTime() / param.rec.tpc.occupancyMapTimeBins;
+    map[bin].bin[iSlice][iRow]++;
+  }
+}
+
+template <>
+GPUdii() void GPUTPCCreateOccupancyMap::Thread<1>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors, GPUTPCClusterOccupancyMapBin* GPUrestrict() map)
+{
+  GPUParam& GPUrestrict() param = processors.param;
+  const int iSliceRow = iBlock * nThreads + iThread;
+  if (iSliceRow >= GPUCA_ROW_COUNT * GPUCA_NSLICES) {
+    return;
+  }
+  const unsigned int iSlice = iSliceRow / GPUCA_ROW_COUNT;
+  const unsigned int iRow = iSliceRow % GPUCA_ROW_COUNT;
+  const unsigned int nBins = GPUTPCClusterOccupancyMapBin::getNBins(param);
+  const unsigned int nFoldBins = CAMath::Min(5u, nBins);
+  unsigned int sum = 0;
+  for (unsigned int i = 0; i < nFoldBins; i++) {
+    sum += map[i].bin[iSlice][iRow];
+  }
+  unsigned short lastVal;
+  for (unsigned int i = 0; i < nBins; i++) {
+    lastVal = map[i].bin[iSlice][iRow];
+    map[i].bin[iSlice][iRow] = sum / nFoldBins;
+    sum += map[CAMath::Min(i + nFoldBins, nBins - 1)].bin[iSlice][iRow] - lastVal;
+  }
+}
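The fold kernel above replaces each time window's count with a forward moving average over up to nFoldBins windows, using a rolling sum so that each (sector, row) series is smoothed in a single pass. A plain host-side reference of the same smoothing, written against a std::vector for clarity (a sketch, not code from the patch):

#include <algorithm>
#include <cstdint>
#include <vector>

// Forward moving average over a window of nFold bins, clamped at the end of the
// series; this mirrors the rolling-sum update the fold kernel applies in place per (sector, row).
std::vector<uint16_t> foldOccupancySketch(const std::vector<uint16_t>& bins, unsigned int nFold = 5)
{
  const unsigned int nBins = static_cast<unsigned int>(bins.size());
  nFold = std::min(nFold, nBins);
  if (nFold == 0) {
    return {};
  }
  unsigned int sum = 0;
  for (unsigned int i = 0; i < nFold; i++) {
    sum += bins[i]; // initial window [0, nFold - 1]
  }
  std::vector<uint16_t> out(nBins);
  for (unsigned int i = 0; i < nBins; i++) {
    out[i] = static_cast<uint16_t>(sum / nFold);
    // Slide the window: drop bins[i], add the next bin (repeating the last one near the end).
    sum += bins[std::min(i + nFold, nBins - 1)] - bins[i];
  }
  return out;
}

With the default window of 5 bins this acts as a small boxcar filter, so a single noisy time window does not dominate the occupancy estimate.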
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h
new file mode 100644
index 0000000000000..33a0d18b92a30
--- /dev/null
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h
@@ -0,0 +1,39 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCCreateOccupancyMap.h
+/// \author David Rohr
+
+#ifndef GPUTPCCREATEOCCUPANCYMAP_H
+#define GPUTPCCREATEOCCUPANCYMAP_H
+
+#include "GPUTPCDef.h"
+#include "GPUGeneralKernels.h"
+#include "GPUConstantMem.h"
+
+namespace GPUCA_NAMESPACE::gpu
+{
+struct GPUTPCClusterOccupancyMapBin;
+
+class GPUTPCCreateOccupancyMap : public GPUKernelTemplate
+{
+ public:
+  enum K { defaultKernel = 0,
+           fill = 0,
+           fold = 1 };
+  GPUhdi() CONSTEXPR static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCSliceTracking; }
+  template <int iKernel = defaultKernel>
+  GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors, GPUTPCClusterOccupancyMapBin* map);
+};
+
+} // namespace GPUCA_NAMESPACE::gpu
+
+#endif
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index b06749010d2d9..ff35e7e0e02cc 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -23,6 +23,8 @@ o2_gpu_add_kernel("GPUMemClean16" NO_OCL1 "simple
 o2_gpu_add_kernel("GPUTPCGlobalTrackingCopyNumbers" NO_OCL1 single int n)
 o2_gpu_add_kernel("GPUTPCCreateSliceData" LB single)
 o2_gpu_add_kernel("GPUTPCGlobalTracking" LB single)
+o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fill" LB simple GPUTPCClusterOccupancyMapBin* map)
+o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fold" LB simple GPUTPCClusterOccupancyMapBin* map)
 o2_gpu_add_kernel("GPUTPCGMMergerTrackFit" LB simple int mode)
 o2_gpu_add_kernel("GPUTPCGMMergerFollowLoopers" LB simple)
 o2_gpu_add_kernel("GPUTPCGMMergerUnpackResetIds" LB simple int iSlice)
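kernels.cmake registers two variants of the same kernel class; the K enum in the header above is what selects the matching Thread<> specialization at compile time. A generic, framework-free sketch of that dispatch pattern (names invented, not the O2 kernel framework):

#include <cstdio>

struct OccupancyKernelSketch {
  enum K { fill = 0, fold = 1 };
  // Primary declaration; the variants are provided as explicit specializations below.
  template <int iKernel>
  static void Thread(int item);
};

template <>
void OccupancyKernelSketch::Thread<OccupancyKernelSketch::fill>(int item)
{
  std::printf("fill: histogram cluster times for slice/row %d\n", item);
}

template <>
void OccupancyKernelSketch::Thread<OccupancyKernelSketch::fold>(int item)
{
  std::printf("fold: average neighbouring time windows for slice/row %d\n", item);
}

int main()
{
  OccupancyKernelSketch::Thread<OccupancyKernelSketch::fill>(0);
  OccupancyKernelSketch::Thread<OccupancyKernelSketch::fold>(0);
  return 0;
}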
(DataDescription) "CLUSTERNATIVETMP" : (DataDescription) "CLUSTERNATIVE", NSectors, clusterOutputSectorHeader), sizeof(o2::tpc::ClusterCountIndex)); setOutputAllocator("CLSHAREDMAP", mSpecConfig.outputSharedClusterMap, outputRegions.sharedClusterMap, std::make_tuple(gDataOriginTPC, (DataDescription) "CLSHAREDMAP", 0)); + setOutputAllocator("TPCOCCUPANCYMAP", mSpecConfig.outputSharedClusterMap, outputRegions.tpcOccupancyMap, std::make_tuple(gDataOriginTPC, (DataDescription) "TPCOCCUPANCYMAP", 0)); setOutputAllocator("TRACKS", mSpecConfig.outputTracks, outputRegions.tpcTracksO2, std::make_tuple(gDataOriginTPC, (DataDescription) "TRACKS", 0)); setOutputAllocator("CLUSREFS", mSpecConfig.outputTracks, outputRegions.tpcTracksO2ClusRefs, std::make_tuple(gDataOriginTPC, (DataDescription) "CLUSREFS", 0)); setOutputAllocator("TRACKSMCLBL", mSpecConfig.outputTracks && mSpecConfig.processMC, outputRegions.tpcTracksO2Labels, std::make_tuple(gDataOriginTPC, (DataDescription) "TRACKSMCLBL", 0)); @@ -837,6 +838,10 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) } } + if (mConfig->configReconstruction.tpc.occupancyMapTimeBins == 0) { + pc.outputs().make>({gDataOriginTPC, "TPCOCCUPANCYMAP", 0}, 0u); + } + std::unique_ptr tmpEmptyClNative; if (createEmptyOutput) { memset(&ptrs, 0, sizeof(ptrs)); @@ -1201,6 +1206,7 @@ Outputs GPURecoWorkflowSpec::outputs() } if (mSpecConfig.outputSharedClusterMap) { outputSpecs.emplace_back(gDataOriginTPC, "CLSHAREDMAP", 0, Lifetime::Timeframe); + outputSpecs.emplace_back(gDataOriginTPC, "TPCOCCUPANCYMAP", 0, Lifetime::Timeframe); } if (mSpecConfig.tpcTriggerHandling) { outputSpecs.emplace_back(gDataOriginTPC, "TRIGGERWORDS", 0, Lifetime::Timeframe);