From 0eec04648c79d725df0ca6ba38ef856f99275ae8 Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Mon, 9 Sep 2024 23:24:02 +0200
Subject: [PATCH] DCAFitter GPU: add test to validate the implementation
 (#13447)

---
 Common/DCAFitter/GPU/CMakeLists.txt           |  30 +-
 Common/DCAFitter/GPU/DCAFitterN.cu            |  38 --
 Common/DCAFitter/GPU/cuda/CMakeLists.txt      |  32 +
 Common/DCAFitter/GPU/cuda/DCAFitterN.cu       | 106 ++++
 .../GPU/cuda/test/testDCAFitterNGPU.cxx       | 556 ++++++++++++++++++
 Common/DCAFitter/GPU/hip/CMakeLists.txt       |  34 ++
 .../DCAFitter/GPU/test/testDCAFitterNCUDA.cu  |  37 --
 .../DCAFitter/include/DCAFitter/DCAFitterN.h  |  54 +-
 .../DCAFitter/include/DCAFitter/HelixHelper.h |  46 +-
 .../MathUtils/include/MathUtils/Cartesian.h   |  12 +
 .../MathUtils/include/MathUtils/SMatrixGPU.h  |  63 +-
 Common/MathUtils/include/MathUtils/Utils.h    |   6 +-
 .../src/TrackParametrization.cxx              |  11 -
 GPU/Common/GPUCommonMath.h                    |   2 +-
 14 files changed, 844 insertions(+), 183 deletions(-)
 delete mode 100644 Common/DCAFitter/GPU/DCAFitterN.cu
 create mode 100644 Common/DCAFitter/GPU/cuda/CMakeLists.txt
 create mode 100644 Common/DCAFitter/GPU/cuda/DCAFitterN.cu
 create mode 100644 Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
 create mode 100644 Common/DCAFitter/GPU/hip/CMakeLists.txt
 delete mode 100644 Common/DCAFitter/GPU/test/testDCAFitterNCUDA.cu

diff --git a/Common/DCAFitter/GPU/CMakeLists.txt b/Common/DCAFitter/GPU/CMakeLists.txt
index f2206c0bc6ddf..faf15f8aab3df 100644
--- a/Common/DCAFitter/GPU/CMakeLists.txt
+++ b/Common/DCAFitter/GPU/CMakeLists.txt
@@ -9,28 +9,10 @@
 # granted to it by virtue of its status as an Intergovernmental Organization
 # or submit itself to any jurisdiction.
 
-if(CUDA_ENABLED)
-# o2_add_library(DCAFitterCUDA
-#                TARGETVARNAME targetName
-#                SOURCES DCAFitterN.cu
-#                # src/FwdDCAFitterN.cxx
-#                PUBLIC_INCLUDE_DIRECTORIES ../include
-#                PUBLIC_LINK_LIBRARIES O2::MathUtils
-#                                      O2::ReconstructionDataFormats
-#                                      O2::DetectorsBase)
-
-# o2_add_test(DCAFitterNCUDA NAME testDCAFitterNCUDA
-#             SOURCES test/testDCAFitterNCUDA.cu
-#             COMPONENT_NAME DCAFitterCUDA
-#             PUBLIC_LINK_LIBRARIES O2::DCAFitter
-#             COMPONENT_NAME GPU
-#             LABELS gpu vertexing)
+if (CUDA_ENABLED)
+add_subdirectory(cuda)
 endif()
-# if (HIP_ENABLED)
-# o2_add_test(DCAFitterNHIP NAME testDCAFitterNHIP
-#             SOURCES test/testDCAFitterNCUDA.cu
-#             HIPIFIED test
-#             PUBLIC_LINK_LIBRARIES O2::DCAFitterHIP
-#             COMPONENT_NAME GPU
-#             LABELS gpu vertexing)
-# endif()
\ No newline at end of file
+
+if (HIP_ENABLED)
+add_subdirectory(hip)
+endif()
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/DCAFitterN.cu b/Common/DCAFitter/GPU/DCAFitterN.cu
deleted file mode 100644
index 10f2c40dc41e1..0000000000000
--- a/Common/DCAFitter/GPU/DCAFitterN.cu
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-
-#ifdef __HIPCC__
-#include "hip/hip_runtime.h"
-#else
-#include <cuda.h>
-#endif
-
-#include "GPUCommonDef.h"
-#include "DCAFitter/DCAFitterN.h"
-
-namespace o2
-{
-namespace vertexing
-{
-GPUg() void __dummy_instance__()
-{
-#ifdef GPUCA_GPUCODE_DEVICE
-#pragma message "Compiling device code"
-#endif
-  DCAFitter2 ft2;
-  DCAFitter3 ft3;
-  o2::track::TrackParCov tr;
-  ft2.process(tr, tr);
-  ft3.process(tr, tr, tr);
-}
-
-} // namespace vertexing
-} // namespace o2
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/cuda/CMakeLists.txt b/Common/DCAFitter/GPU/cuda/CMakeLists.txt
new file mode 100644
index 0000000000000..a498d0c350202
--- /dev/null
+++ b/Common/DCAFitter/GPU/cuda/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+# All rights not expressly granted are reserved.
+#
+# This software is distributed under the terms of the GNU General Public
+# License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+#
+# In applying this license CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+o2_add_library(DCAFitterCUDA
+               TARGETVARNAME targetName
+               SOURCES DCAFitterN.cu
+               PUBLIC_INCLUDE_DIRECTORIES ../../include
+               PUBLIC_LINK_LIBRARIES O2::MathUtils
+                                     O2::ReconstructionDataFormats
+                                     O2::DetectorsBase
+               PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider)
+set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+
+o2_add_test(DCAFitterNCUDA
+            SOURCES test/testDCAFitterNGPU.cxx
+            PUBLIC_LINK_LIBRARIES O2::ReconstructionDataFormats
+                                  O2::DCAFitterCUDA
+                                  O2::DCAFitter
+                                  ROOT::Core
+                                  ROOT::Physics
+            COMPONENT_NAME gpu
+            LABELS vertexing
+            ENVIRONMENT O2_ROOT=${CMAKE_BINARY_DIR}/stage
+            VMCWORKDIR=${CMAKE_BINARY_DIR}/stage/${CMAKE_INSTALL_DATADIR})
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
new file mode 100644
index 0000000000000..f543b7bc7cdd4
--- /dev/null
+++ b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -0,0 +1,106 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+#ifdef __HIPCC__
+#include "hip/hip_runtime.h"
+#else
+#include <cuda.h>
+#endif
+
+#include "GPUCommonDef.h"
+#include "DCAFitter/DCAFitterN.h"
+// #include "MathUtils/SMatrixGPU.h"
+
+#define gpuCheckError(x)                \
+  {                                     \
+    gpuAssert((x), __FILE__, __LINE__); \
+  }
+inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
+{
+  if (code != cudaSuccess) {
+    std::cout << "GPUassert: " << cudaGetErrorString(code) << " " << file << " " << line << std::endl;
+    if (abort) {
+      throw std::runtime_error("GPU assert failed.");
+    }
+  }
+}
+namespace o2::vertexing::device
+{
+namespace kernel
+{
+GPUg() void printKernel(o2::vertexing::DCAFitterN<2>* ft)
+{
+  if (threadIdx.x == 0) {
+    printf(" =============== GPU DCA Fitter ================\n");
+    ft->print();
+    printf(" ===============================================\n");
+  }
+}
+
+GPUg() void processKernel(o2::vertexing::DCAFitterN<2>* ft, o2::track::TrackParCov* t1, o2::track::TrackParCov* t2, int* res)
+{
+  *res = ft->process(*t1, *t2);
+}
+} // namespace kernel
+
+void print(o2::vertexing::DCAFitterN<2>* ft,
+           const int nBlocks,
+           const int nThreads)
+{
+  DCAFitterN<2>* ft_device;
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(o2::vertexing::DCAFitterN<2>)));
+  gpuCheckError(cudaMemcpy(ft_device, ft, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyHostToDevice));
+
+  kernel::printKernel<<<nBlocks, nThreads>>>(ft_device);
+
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+}
+
+int process(o2::vertexing::DCAFitterN<2>* fitter,
+            o2::track::TrackParCov& track1,
+            o2::track::TrackParCov& track2,
+            const int nBlocks,
+            const int nThreads)
+{
+  DCAFitterN<2>* ft_device;
+  o2::track::TrackParCov* t1_device;
+  o2::track::TrackParCov* t2_device;
+  int result, *result_device;
+
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(o2::vertexing::DCAFitterN<2>)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t1_device), sizeof(o2::track::TrackParCov)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t2_device), sizeof(o2::track::TrackParCov)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));
+
+  gpuCheckError(cudaMemcpy(ft_device, fitter, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyHostToDevice));
+  gpuCheckError(cudaMemcpy(t1_device, &track1, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
+  gpuCheckError(cudaMemcpy(t2_device, &track2, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
+
+  kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, t1_device, t2_device, result_device);
+
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+
+  gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(fitter, ft_device, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(&track1, t1_device, sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(&track2, t2_device, sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaFree(ft_device));
+  gpuCheckError(cudaFree(t1_device));
+  gpuCheckError(cudaFree(t2_device));
+
+  gpuCheckError(cudaFree(result_device));
+
+  return result;
+}
+
+} // namespace o2::vertexing::device
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
new file mode 100644
index 0000000000000..829ecb808f5c2
--- /dev/null
+++ b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
@@ -0,0 +1,556 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+#define BOOST_TEST_MODULE Test DCAFitterN class
+#define BOOST_TEST_MAIN
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+
+#include "DCAFitter/DCAFitterN.h"
+#include "CommonUtils/TreeStreamRedirector.h"
+#include <TRandom.h>
+#include <TGenPhaseSpace.h>
+#include <TLorentzVector.h>
+#include <TStopwatch.h>
+#include <Math/SVector.h>
+#include <array>
+
+namespace o2
+{
+namespace vertexing
+{
+
+using Vec3D = ROOT::Math::SVector<double, 3>;
+
+template <class FITTER>
+float checkResults(o2::utils::TreeStreamRedirector& outs, std::string& treeName, FITTER& fitter,
+                   Vec3D& vgen, TLorentzVector& genPar, const std::vector<double>& dtMass)
+{
+  int nCand = fitter.getNCandidates();
+  std::array<float, 3> p;
+  float distMin = 1e9;
+  bool absDCA = fitter.getUseAbsDCA();
+  bool useWghDCA = fitter.getWeightedFinalPCA();
+  for (int ic = 0; ic < nCand; ic++) {
+    const auto& vtx = fitter.getPCACandidate(ic);
+    auto df = vgen;
+    df -= vtx;
+
+    TLorentzVector moth, prong;
+    for (int i = 0; i < fitter.getNProngs(); i++) {
+      const auto& trc = fitter.getTrack(i, ic);
+      trc.getPxPyPzGlo(p);
+      prong.SetVectM({p[0], p[1], p[2]}, dtMass[i]);
+      moth += prong;
+    }
+    auto nIter = fitter.getNIterations(ic);
+    auto chi2 = fitter.getChi2AtPCACandidate(ic);
+    double dst = TMath::Sqrt(df[0] * df[0] + df[1] * df[1] + df[2] * df[2]);
+    distMin = dst < distMin ? dst : distMin;
+    auto parentTrack = fitter.createParentTrackParCov(ic);
+    //    float genX
+    outs << treeName.c_str() << "cand=" << ic << "ncand=" << nCand << "nIter=" << nIter << "chi2=" << chi2
+         << "genPart=" << genPar << "recPart=" << moth
+         << "genX=" << vgen[0] << "genY=" << vgen[1] << "genZ=" << vgen[2]
+         << "dx=" << df[0] << "dy=" << df[1] << "dz=" << df[2] << "dst=" << dst
+         << "useAbsDCA=" << absDCA << "useWghDCA=" << useWghDCA << "parent=" << parentTrack;
+    for (int i = 0; i < fitter.getNProngs(); i++) {
+      outs << treeName.c_str() << fmt::format("prong{}=", i).c_str() << fitter.getTrack(i, ic);
+    }
+    outs << treeName.c_str() << "\n";
+  }
+  return distMin;
+}
+
+TLorentzVector generate(Vec3D& vtx, std::vector<o2::track::TrackParCov>& vctr, float bz,
+                        TGenPhaseSpace& genPHS, double parMass, const std::vector<double>& dtMass, std::vector<int> forceQ)
+{
+  const float errYZ = 1e-2, errSlp = 1e-3, errQPT = 2e-2;
+  std::array<float, 15> covm = {
+    errYZ * errYZ,
+    0., errYZ * errYZ,
+    0, 0., errSlp * errSlp,
+    0., 0., 0., errSlp * errSlp,
+    0., 0., 0., 0., errQPT * errQPT};
+  bool accept = true;
+  TLorentzVector parent, d0, d1, d2;
+  do {
+    accept = true;
+    double y = gRandom->Rndm() - 0.5;
+    double pt = 0.1 + gRandom->Rndm() * 3;
+    double mt = TMath::Sqrt(parMass * parMass + pt * pt);
+    double pz = mt * TMath::SinH(y);
+    double phi = gRandom->Rndm() * TMath::Pi() * 2;
+    double en = mt * TMath::CosH(y);
+    double rdec = 10.; // radius of the decay
+    vtx[0] = rdec * TMath::Cos(phi);
+    vtx[1] = rdec * TMath::Sin(phi);
+    vtx[2] = rdec * pz / pt;
+    parent.SetPxPyPzE(pt * TMath::Cos(phi), pt * TMath::Sin(phi), pz, en);
+    int nd = dtMass.size();
+    genPHS.SetDecay(parent, nd, dtMass.data());
+    genPHS.Generate();
+    vctr.clear();
+    float p[4];
+    for (int i = 0; i < nd; i++) {
+      auto* dt = genPHS.GetDecay(i);
+      if (dt->Pt() < 0.05) {
+        accept = false;
+        break;
+      }
+      dt->GetXYZT(p);
+      float s, c, x;
+      std::array<float, 5> params;
+      o2::math_utils::sincos(dt->Phi(), s, c);
+      o2::math_utils::rotateZInv(vtx[0], vtx[1], x, params[0], s, c);
+
+      params[1] = vtx[2];
+      params[2] = 0.; // since alpha = phi
+      params[3] = 1. / TMath::Tan(dt->Theta());
+      params[4] = (i % 2 ? -1. : 1.) / dt->Pt();
+      covm[14] = errQPT * errQPT * params[4] * params[4];
+      //
+      // randomize
+      float r1, r2;
+      gRandom->Rannor(r1, r2);
+      params[0] += r1 * errYZ;
+      params[1] += r2 * errYZ;
+      gRandom->Rannor(r1, r2);
+      params[2] += r1 * errSlp;
+      params[3] += r2 * errSlp;
+      params[4] *= gRandom->Gaus(1., errQPT);
+      if (forceQ[i] == 0) {
+        params[4] = 0.; // impose straight track
+      }
+      auto& trc = vctr.emplace_back(x, dt->Phi(), params, covm);
+      float rad = forceQ[i] == 0 ? 600. : TMath::Abs(1. / trc.getCurvature(bz));
+      if (!trc.propagateTo(trc.getX() + (gRandom->Rndm() - 0.5) * rad * 0.05, bz) ||
+          !trc.rotate(trc.getAlpha() + (gRandom->Rndm() - 0.5) * 0.2)) {
+        printf("Failed to randomize ");
+        trc.print();
+      }
+    }
+  } while (!accept);
+
+  return parent;
+}
+
+BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
+{
+  constexpr int NTest = 10000;
+  o2::utils::TreeStreamRedirector outStream("dcafitterNTest.root");
+
+  TGenPhaseSpace genPHS;
+  constexpr double ele = 0.00051;
+  constexpr double gamma = 2 * ele + 1e-6;
+  constexpr double pion = 0.13957;
+  constexpr double k0 = 0.49761;
+  constexpr double kch = 0.49368;
+  constexpr double dch = 1.86965;
+  std::vector<double> gammadec = {ele, ele};
+  std::vector<double> k0dec = {pion, pion};
+  std::vector<double> dchdec = {pion, kch, pion};
+  std::vector<o2::track::TrackParCov> vctracks;
+  Vec3D vtxGen;
+
+  double bz = 5.0;
+  // 2 prongs vertices
+  {
+    LOG(info) << "Processing 2-prong Helix - Helix case";
+    std::vector<int> forceQ{1, 1};
+
+    o2::vertexing::DCAFitterN<2> ft; // 2 prong fitter
+    ft.setBz(bz);
+    ft.setPropagateToPCA(true);  // After finding the vertex, propagate tracks to the DCA. This is default anyway
+    ft.setMaxR(200);             // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
+    ft.setMaxDZIni(4);           // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
+    ft.setMaxDXYIni(4);          // do not consider V0 seeds with tracks XY-distance exceeding this. This is default anyway
+    ft.setMinParamChange(1e-3);  // stop iterations if max correction is below this value. This is default anyway
+    ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor
+
+    std::string treeName2A = "pr2a", treeName2AW = "pr2aw", treeName2W = "pr2w";
+    TStopwatch swA, swAW, swW;
+    int nfoundA = 0, nfoundAW = 0, nfoundW = 0;
+    double meanDA = 0, meanDAW = 0, meanDW = 0;
+    swA.Stop();
+    swAW.Stop();
+    swW.Stop();
+    for (int iev = 0; iev < NTest; iev++) {
+      auto genParent = generate(vtxGen, vctracks, bz, genPHS, k0, k0dec, forceQ);
+      ft.setUseAbsDCA(true);
+      swA.Start(false);
+      int ncA = device::process(&ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      swA.Stop();
+      LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
+      if (ncA) {
+        auto minD = checkResults(outStream, treeName2A, ft, vtxGen, genParent, k0dec);
+        meanDA += minD;
+        nfoundA++;
+      }
+
+      ft.setUseAbsDCA(true);
+      ft.setWeightedFinalPCA(true);
+      swAW.Start(false);
+      int ncAW = device::process(&ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      swAW.Stop();
+      LOG(debug) << "fit abs.dist with final weighted DCA " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
+      if (ncAW) {
+        auto minD = checkResults(outStream, treeName2AW, ft, vtxGen, genParent, k0dec);
+        meanDAW += minD;
+        nfoundAW++;
+      }
+
+      ft.setUseAbsDCA(false);
+      ft.setWeightedFinalPCA(false);
+      swW.Start(false);
+      int ncW = device::process(&ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      swW.Stop();
+      LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
+      if (ncW) {
+        auto minD = checkResults(outStream, treeName2W, ft, vtxGen, genParent, k0dec);
+        meanDW += minD;
+        nfoundW++;
+      }
+    }
+    ft.print();
+    device::print(&ft, 1, 1);
+    meanDA /= nfoundA ? nfoundA : 1;
+    meanDAW /= nfoundA ? nfoundA : 1;
+    meanDW /= nfoundW ? nfoundW : 1;
+    LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix";
+    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+              << " mean.dist to truth: " << meanDA << " GPU time: " << swA.CpuTime();
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+              << " mean.dist to truth: " << meanDAW << " GPU time: " << swAW.CpuTime();
+    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+              << " mean.dist to truth: " << meanDW << " GPU time: " << swW.CpuTime();
+    BOOST_CHECK(nfoundA > 0.99 * NTest);
+    // BOOST_CHECK(nfoundAW > 0.99 * NTest);
+    // BOOST_CHECK(nfoundW > 0.99 * NTest);
+    // BOOST_CHECK(meanDA < 0.1);
+    // BOOST_CHECK(meanDAW < 0.1);
+    // BOOST_CHECK(meanDW < 0.1);
+  }
+
+  // // 2 prongs vertices with collinear tracks (gamma conversion)
+  // {
+  //   LOG(info) << "Processing 2-prong Helix - Helix case gamma conversion";
+  //   std::vector<int> forceQ{1, 1};
+
+  //   o2::vertexing::DCAFitterN<2> ft; // 2 prong fitter
+  //   ft.setBz(bz);
+  //   ft.setPropagateToPCA(true);  // After finding the vertex, propagate tracks to the DCA. This is default anyway
+  //   ft.setMaxR(200);             // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
+  //   ft.setMaxDZIni(4);           // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
+  //   ft.setMaxDXYIni(4);          // do not consider V0 seeds with tracks XY-distance exceeding this. This is default anyway
+  //   ft.setMinParamChange(1e-3);  // stop iterations if max correction is below this value. This is default anyway
+  //   ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor
+
+  //   std::string treeName2A = "gpr2a", treeName2AW = "gpr2aw", treeName2W = "gpr2w";
+  //   TStopwatch swA, swAW, swW;
+  //   int nfoundA = 0, nfoundAW = 0, nfoundW = 0;
+  //   double meanDA = 0, meanDAW = 0, meanDW = 0;
+  //   swA.Stop();
+  //   swAW.Stop();
+  //   swW.Stop();
+  //   for (int iev = 0; iev < NTest; iev++) {
+  //     auto genParent = generate(vtxGen, vctracks, bz, genPHS, gamma, gammadec, forceQ);
+
+  //     ft.setUseAbsDCA(true);
+  //     swA.Start(false);
+  //     int ncA = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+  //     swA.Stop();
+  //     LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncA) {
+  //       auto minD = checkResults(outStream, treeName2A, ft, vtxGen, genParent, gammadec);
+  //       meanDA += minD;
+  //       nfoundA++;
+  //     }
+
+  //     ft.setUseAbsDCA(true);
+  //     ft.setWeightedFinalPCA(true);
+  //     swAW.Start(false);
+  //     int ncAW = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+  //     swAW.Stop();
+  //     LOG(debug) << "fit abs.dist with final weighted DCA " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncAW) {
+  //       auto minD = checkResults(outStream, treeName2AW, ft, vtxGen, genParent, gammadec);
+  //       meanDAW += minD;
+  //       nfoundAW++;
+  //     }
+
+  //     ft.setUseAbsDCA(false);
+  //     ft.setWeightedFinalPCA(false);
+  //     swW.Start(false);
+  //     int ncW = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+  //     swW.Stop();
+  //     LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncW) {
+  //       auto minD = checkResults(outStream, treeName2W, ft, vtxGen, genParent, gammadec);
+  //       meanDW += minD;
+  //       nfoundW++;
+  //     }
+  //   }
+  //   ft.print();
+  //   meanDA /= nfoundA ? nfoundA : 1;
+  //   meanDAW /= nfoundA ? nfoundA : 1;
+  //   meanDW /= nfoundW ? nfoundW : 1;
+  //   LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix from gamma conversion";
+  //   LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+  //             << " mean.dist to truth: " << meanDA << " CPU time: " << swA.CpuTime();
+  //   LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+  //             << " mean.dist to truth: " << meanDAW << " CPU time: " << swAW.CpuTime();
+  //   LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+  //             << " mean.dist to truth: " << meanDW << " CPU time: " << swW.CpuTime();
+  //   BOOST_CHECK(nfoundA > 0.99 * NTest);
+  //   BOOST_CHECK(nfoundAW > 0.99 * NTest);
+  //   BOOST_CHECK(nfoundW > 0.99 * NTest);
+  //   BOOST_CHECK(meanDA < 2.1);
+  //   BOOST_CHECK(meanDAW < 2.1);
+  //   BOOST_CHECK(meanDW < 2.1);
+  // }
+
+  // // 2 prongs vertices with one of charges set to 0: Helix : Line
+  // {
+  //   std::vector<int> forceQ{1, 1};
+  //   LOG(info) << "Processing 2-prong Helix - Line case";
+  //   o2::vertexing::DCAFitterN<2> ft; // 2 prong fitter
+  //   ft.setBz(bz);
+  //   ft.setPropagateToPCA(true);  // After finding the vertex, propagate tracks to the DCA. This is default anyway
+  //   ft.setMaxR(200);             // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
+  //   ft.setMaxDZIni(4);           // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
+  //   ft.setMinParamChange(1e-3);  // stop iterations if max correction is below this value. This is default anyway
+  //   ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor
+
+  //   std::string treeName2A = "pr2aHL", treeName2AW = "pr2awHL", treeName2W = "pr2wHL";
+  //   TStopwatch swA, swAW, swW;
+  //   int nfoundA = 0, nfoundAW = 0, nfoundW = 0;
+  //   double meanDA = 0, meanDAW = 0, meanDW = 0;
+  //   swA.Stop();
+  //   swAW.Stop();
+  //   swW.Stop();
+  //   for (int iev = 0; iev < NTest; iev++) {
+  //     forceQ[iev % 2] = 1;
+  //     forceQ[1 - iev % 2] = 0;
+  //     auto genParent = generate(vtxGen, vctracks, bz, genPHS, k0, k0dec, forceQ);
+
+  //     ft.setUseAbsDCA(true);
+  //     swA.Start(false);
+  //     int ncA = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+  //     swA.Stop();
+  //     LOG(debug) << "fit abs.dist with final weighted DCA " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncA) {
+  //       auto minD = checkResults(outStream, treeName2A, ft, vtxGen, genParent, k0dec);
+  //       meanDA += minD;
+  //       nfoundA++;
+  //     }
+
+  //     ft.setUseAbsDCA(true);
+  //     ft.setWeightedFinalPCA(true);
+  //     swAW.Start(false);
+  //     int ncAW = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+  //     swAW.Stop();
+  //     LOG(debug) << "fit abs.dist  " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncAW) {
+  //       auto minD = checkResults(outStream, treeName2AW, ft, vtxGen, genParent, k0dec);
+  //       meanDAW += minD;
+  //       nfoundAW++;
+  //     }
+
+  //     ft.setUseAbsDCA(false);
+  //     ft.setWeightedFinalPCA(false);
+  //     swW.Start(false);
+  //     int ncW = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+  //     swW.Stop();
+  //     LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncW) {
+  //       auto minD = checkResults(outStream, treeName2W, ft, vtxGen, genParent, k0dec);
+  //       meanDW += minD;
+  //       nfoundW++;
+  //     }
+  //   }
+  //   ft.print();
+  //   meanDA /= nfoundA ? nfoundA : 1;
+  //   meanDAW /= nfoundAW ? nfoundAW : 1;
+  //   meanDW /= nfoundW ? nfoundW : 1;
+  //   LOG(info) << "Processed " << NTest << " 2-prong vertices: Helix : Line";
+  //   LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+  //             << " mean.dist to truth: " << meanDA << " CPU time: " << swA.CpuTime();
+  //   LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+  //             << " mean.dist to truth: " << meanDAW << " CPU time: " << swAW.CpuTime();
+  //   LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+  //             << " mean.dist to truth: " << meanDW << " CPU time: " << swW.CpuTime();
+  //   BOOST_CHECK(nfoundA > 0.99 * NTest);
+  //   BOOST_CHECK(nfoundAW > 0.99 * NTest);
+  //   BOOST_CHECK(nfoundW > 0.99 * NTest);
+  //   BOOST_CHECK(meanDA < 0.1);
+  //   BOOST_CHECK(meanDAW < 0.1);
+  //   BOOST_CHECK(meanDW < 0.1);
+  // }
+
+  // // 2 prongs vertices with both of charges set to 0: Line : Line
+  // {
+  //   std::vector<int> forceQ{0, 0};
+  //   LOG(info) << "Processing 2-prong Line - Line case";
+  //   o2::vertexing::DCAFitterN<2> ft; // 2 prong fitter
+  //   ft.setBz(bz);
+  //   ft.setPropagateToPCA(true);  // After finding the vertex, propagate tracks to the DCA. This is default anyway
+  //   ft.setMaxR(200);             // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
+  //   ft.setMaxDZIni(4);           // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
+  //   ft.setMinParamChange(1e-3);  // stop iterations if max correction is below this value. This is default anyway
+  //   ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor
+
+  //   std::string treeName2A = "pr2aLL", treeName2AW = "pr2awLL", treeName2W = "pr2wLL";
+  //   TStopwatch swA, swAW, swW;
+  //   int nfoundA = 0, nfoundAW = 0, nfoundW = 0;
+  //   double meanDA = 0, meanDAW = 0, meanDW = 0;
+  //   swA.Stop();
+  //   swAW.Stop();
+  //   swW.Stop();
+  //   for (int iev = 0; iev < NTest; iev++) {
+  //     forceQ[0] = forceQ[1] = 0;
+  //     auto genParent = generate(vtxGen, vctracks, bz, genPHS, k0, k0dec, forceQ);
+
+  //     ft.setUseAbsDCA(true);
+  //     swA.Start(false);
+  //     int ncA = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+  //     swA.Stop();
+  //     LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncA) {
+  //       auto minD = checkResults(outStream, treeName2A, ft, vtxGen, genParent, k0dec);
+  //       meanDA += minD;
+  //       nfoundA++;
+  //     }
+
+  //     ft.setUseAbsDCA(true);
+  //     ft.setWeightedFinalPCA(true);
+  //     swAW.Start(false);
+  //     int ncAW = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+  //     swAW.Stop();
+  //     LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncAW) {
+  //       auto minD = checkResults(outStream, treeName2AW, ft, vtxGen, genParent, k0dec);
+  //       meanDAW += minD;
+  //       nfoundAW++;
+  //     }
+
+  //     ft.setUseAbsDCA(false);
+  //     ft.setWeightedFinalPCA(false);
+  //     swW.Start(false);
+  //     int ncW = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+  //     swW.Stop();
+  //     LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncW) {
+  //       auto minD = checkResults(outStream, treeName2W, ft, vtxGen, genParent, k0dec);
+  //       meanDW += minD;
+  //       nfoundW++;
+  //     }
+  //   }
+  //   ft.print();
+  //   meanDA /= nfoundA ? nfoundA : 1;
+  //   meanDAW /= nfoundAW ? nfoundAW : 1;
+  //   meanDW /= nfoundW ? nfoundW : 1;
+  //   LOG(info) << "Processed " << NTest << " 2-prong vertices: Line : Line";
+  //   LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+  //             << " mean.dist to truth: " << meanDA << " CPU time: " << swA.CpuTime();
+  //   LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+  //             << " mean.dist to truth: " << meanDAW << " CPU time: " << swAW.CpuTime();
+  //   LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+  //             << " mean.dist to truth: " << meanDW << " CPU time: " << swW.CpuTime();
+  //   BOOST_CHECK(nfoundA > 0.99 * NTest);
+  //   BOOST_CHECK(nfoundAW > 0.99 * NTest);
+  //   BOOST_CHECK(nfoundW > 0.99 * NTest);
+  //   BOOST_CHECK(meanDA < 0.1);
+  //   BOOST_CHECK(meanDAW < 0.1);
+  //   BOOST_CHECK(meanDW < 0.1);
+  // }
+
+  // // 3 prongs vertices
+  // {
+  //   std::vector<int> forceQ{1, 1, 1};
+
+  //   o2::vertexing::DCAFitterN<3> ft; // 3 prong fitter
+  //   ft.setBz(bz);
+  //   ft.setPropagateToPCA(true);  // After finding the vertex, propagate tracks to the DCA. This is default anyway
+  //   ft.setMaxR(200);             // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
+  //   ft.setMaxDZIni(4);           // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
+  //   ft.setMinParamChange(1e-3);  // stop iterations if max correction is below this value. This is default anyway
+  //   ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor
+
+  //   std::string treeName3A = "pr3a", treeName3AW = "pr3aw", treeName3W = "pr3w";
+  //   TStopwatch swA, swAW, swW;
+  //   int nfoundA = 0, nfoundAW = 0, nfoundW = 0;
+  //   double meanDA = 0, meanDAW = 0, meanDW = 0;
+  //   swA.Stop();
+  //   swAW.Stop();
+  //   swW.Stop();
+  //   for (int iev = 0; iev < NTest; iev++) {
+  //     auto genParent = generate(vtxGen, vctracks, bz, genPHS, dch, dchdec, forceQ);
+
+  //     ft.setUseAbsDCA(true);
+  //     swA.Start(false);
+  //     int ncA = ft.process(vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
+  //     swA.Stop();
+  //     LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncA) {
+  //       auto minD = checkResults(outStream, treeName3A, ft, vtxGen, genParent, dchdec);
+  //       meanDA += minD;
+  //       nfoundA++;
+  //     }
+
+  //     ft.setUseAbsDCA(true);
+  //     ft.setWeightedFinalPCA(true);
+  //     swAW.Start(false);
+  //     int ncAW = ft.process(vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
+  //     swAW.Stop();
+  //     LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncAW) {
+  //       auto minD = checkResults(outStream, treeName3AW, ft, vtxGen, genParent, dchdec);
+  //       meanDAW += minD;
+  //       nfoundAW++;
+  //     }
+
+  //     ft.setUseAbsDCA(false);
+  //     ft.setWeightedFinalPCA(false);
+  //     swW.Start(false);
+  //     int ncW = ft.process(vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
+  //     swW.Stop();
+  //     LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
+  //     if (ncW) {
+  //       auto minD = checkResults(outStream, treeName3W, ft, vtxGen, genParent, dchdec);
+  //       meanDW += minD;
+  //       nfoundW++;
+  //     }
+  //   }
+  //   ft.print();
+  //   meanDA /= nfoundA ? nfoundA : 1;
+  //   meanDAW /= nfoundAW ? nfoundAW : 1;
+  //   meanDW /= nfoundW ? nfoundW : 1;
+  //   LOG(info) << "Processed " << NTest << " 3-prong vertices";
+  //   LOG(info) << "3-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+  //             << " mean.dist to truth: " << meanDA << " CPU time: " << swA.CpuTime();
+  //   LOG(info) << "3-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+  //             << " mean.dist to truth: " << meanDAW << " CPU time: " << swAW.CpuTime();
+  //   LOG(info) << "3-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+  //             << " mean.dist to truth: " << meanDW << " CPU time: " << swW.CpuTime();
+  //   BOOST_CHECK(nfoundA > 0.99 * NTest);
+  //   BOOST_CHECK(nfoundAW > 0.99 * NTest);
+  //   BOOST_CHECK(nfoundW > 0.99 * NTest);
+  //   BOOST_CHECK(meanDA < 0.1);
+  //   BOOST_CHECK(meanDAW < 0.1);
+  //   BOOST_CHECK(meanDW < 0.1);
+  // }
+
+  outStream.Close();
+}
+
+} // namespace vertexing
+} // namespace o2
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/hip/CMakeLists.txt b/Common/DCAFitter/GPU/hip/CMakeLists.txt
new file mode 100644
index 0000000000000..272d18a81bab4
--- /dev/null
+++ b/Common/DCAFitter/GPU/hip/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+# All rights not expressly granted are reserved.
+#
+# This software is distributed under the terms of the GNU General Public
+# License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+#
+# In applying this license CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -fgpu-rdc")
+o2_add_hipified_library(DCAFitterHIP
+                        SOURCES ../cuda/DCAFitterN.cu
+                        PUBLIC_INCLUDE_DIRECTORIES ../../include
+                        PUBLIC_LINK_LIBRARIES O2::MathUtils
+                                              O2::ReconstructionDataFormats
+                                              O2::DetectorsBase
+                                              hip::host
+                        PRIVATE_LINK_LIBRARIES O2::GPUTrackingHIPExternalProvider
+                        TARGETVARNAME targetNAme)
+
+o2_add_test(DCAFitterNHIP
+            SOURCES ../cuda/test/testDCAFitterNGPU.cxx
+            PUBLIC_LINK_LIBRARIES O2::ReconstructionDataFormats
+                                  O2::DCAFitterHIP
+                                  O2::DCAFitter
+                                  ROOT::Core
+                                  ROOT::Physics
+            HIPIFIED test
+            COMPONENT_NAME gpu
+            LABELS vertexing
+            ENVIRONMENT O2_ROOT=${CMAKE_BINARY_DIR}/stage
+            VMCWORKDIR=${CMAKE_BINARY_DIR}/stage/${CMAKE_INSTALL_DATADIR})
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/test/testDCAFitterNCUDA.cu b/Common/DCAFitter/GPU/test/testDCAFitterNCUDA.cu
deleted file mode 100644
index 6e5a3e1b225e9..0000000000000
--- a/Common/DCAFitter/GPU/test/testDCAFitterNCUDA.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-//
-/// \author matteo.concas@cern.ch
-#define BOOST_TEST_MODULE Test DCAFitterN class on GPU
-
-#ifdef __HIPCC__
-#define GPUPLATFORM "HIP"
-#include "hip/hip_runtime.h"
-#else
-#define GPUPLATFORM "CUDA"
-#include <cuda.h>
-#endif
-
-#include "DCAFitter/DCAFitterN.h"
-#include "GPUCommonDef.h"
-#include <boost/test/unit_test.hpp>
-
-namespace gpu
-{
-GPUg() void testDCAFitterInstanceKernel()
-{
-  o2::vertexing::DCAFitterN<2> ft; // 2 prong fitter
-  ft.setBz(0.f);
-}
-} // namespace gpu
-BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
-{
-}
\ No newline at end of file
diff --git a/Common/DCAFitter/include/DCAFitter/DCAFitterN.h b/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
index 340a0be199c68..b74fec89d9f37 100644
--- a/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
+++ b/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
@@ -48,7 +48,7 @@ struct TrackCovI {
       syz = -cyz * detYZI;
       szz = cyy * detYZI;
     } else {
-#ifndef GPUCA_GPUCODE_DEVICE
+#ifndef GPUCA_GPUCODE
       throw std::runtime_error("invalid track covariance");
 #endif
     }
@@ -122,10 +122,10 @@ class DCAFitterN
 
   ///< prepare copies of tracks at the V0 candidate (no check for the candidate validity)
   ///  must be called before getTrack(i,cand) query
-  bool propagateTracksToVertex(int cand = 0);
+  GPUd() bool propagateTracksToVertex(int cand = 0);
 
   ///< check if propagation of tracks to candidate vertex was done
-  bool isPropagateTracksToVertexDone(int cand = 0) const { return mTrPropDone[mOrder[cand]]; }
+  GPUd() bool isPropagateTracksToVertexDone(int cand = 0) const { return mTrPropDone[mOrder[cand]]; }
 
   ///< check if propagation of tracks to candidate vertex was done
   bool isPropagationFailure(int cand = 0) const { return mPropFailed[mOrder[cand]]; }
@@ -153,18 +153,18 @@ class DCAFitterN
   }
 
   ///< create parent track param with errors for decay vertex
-  o2::track::TrackParCov createParentTrackParCov(int cand = 0, bool sectorAlpha = true) const;
+  GPUd() o2::track::TrackParCov createParentTrackParCov(int cand = 0, bool sectorAlpha = true) const;
 
   ///< create parent track param w/o errors for decay vertex
-  o2::track::TrackPar createParentTrackPar(int cand = 0, bool sectorAlpha = true) const;
+  GPUd() o2::track::TrackPar createParentTrackPar(int cand = 0, bool sectorAlpha = true) const;
 
   ///< calculate on the fly track param (no cov mat) at candidate, check isValid to make sure propagation was successful
-  o2::track::TrackPar getTrackParamAtPCA(int i, int cand = 0);
+  GPUd() o2::track::TrackPar getTrackParamAtPCA(int i, int cand = 0);
 
   ///< recalculate PCA as a cov-matrix weighted mean, even if absDCA method was used
-  bool recalculatePCAWithErrors(int cand = 0);
+  GPUd() bool recalculatePCAWithErrors(int cand = 0);
 
-  MatSym3D calcPCACovMatrix(int cand = 0) const;
+  GPUd() MatSym3D calcPCACovMatrix(int cand = 0) const;
 
   o2::gpu::gpustd::array<float, 6> calcPCACovMatrixFlat(int cand = 0) const
   {
@@ -517,7 +517,7 @@ GPUd() void DCAFitterN<N, Args...>::calcResidDerivatives()
         dr2[2] += trDx.d2zdx2;
       }
     } // track over which we differentiate
-  }   // residual being differentiated
+  } // residual being differentiated
 }
 
 //__________________________________________________________________________
@@ -567,7 +567,7 @@ GPUd() void DCAFitterN<N, Args...>::calcResidDerivativesNoErr()
       dr2ji[2] = -trDxi.d2zdx2 * NInv;
 
     } // track over which we differentiate
-  }   // residual being differentiated
+  } // residual being differentiated
 }
 
 //__________________________________________________________________________
@@ -1011,10 +1011,21 @@ GPUd() bool DCAFitterN<N, Args...>::closerToAlternative() const
 template <int N, typename... Args>
 GPUd() void DCAFitterN<N, Args...>::print() const
 {
+#ifndef GPUCA_GPUCODE_DEVICE
   LOG(info) << N << "-prong vertex fitter in " << (mUseAbsDCA ? "abs." : "weighted") << " distance minimization mode";
   LOG(info) << "Bz: " << mBz << " MaxIter: " << mMaxIter << " MaxChi2: " << mMaxChi2;
   LOG(info) << "Stopping condition: Max.param change < " << mMinParamChange << " Rel.Chi2 change > " << mMinRelChi2Change;
   LOG(info) << "Discard candidates for : Rvtx > " << getMaxR() << " DZ between tracks > " << mMaxDZIni;
+#else
+  if (mUseAbsDCA) {
+    printf("%d-prong vertex fitter in abs. distance minimization mode\n", N);
+  } else {
+    printf("%d-prong vertex fitter in weighted distance minimization mode\n", N);
+  }
+  printf("Bz: %f MaxIter: %d  MaxChi2: %f\n", mBz, mMaxIter, mMaxChi2);
+  printf("Stopping condition: Max.param change < %f Rel.Chi2 change > %f\n", mMinParamChange, mMinRelChi2Change);
+  printf("Discard candidates for : Rvtx > %f DZ between tracks > %f\n", getMaxR(), mMaxDZIni);
+#endif
 }
 
 //___________________________________________________________________
@@ -1079,7 +1090,9 @@ GPUdi() bool DCAFitterN<N, Args...>::propagateParamToX(o2::track::TrackPar& t, f
 {
   bool res = true;
   if (mUsePropagator || mMatCorr != o2::base::Propagator::MatCorrType::USEMatCorrNONE) {
+#ifndef GPUCA_GPUCODE
     res = o2::base::Propagator::Instance()->PropagateToXBxByBz(t, x, mMaxSnp, mMaxStep, mMatCorr);
+#endif
   } else {
     res = t.propagateParamTo(x, mBz);
   }
@@ -1095,7 +1108,9 @@ GPUdi() bool DCAFitterN<N, Args...>::propagateToX(o2::track::TrackParCov& t, flo
 {
   bool res = true;
   if (mUsePropagator || mMatCorr != o2::base::Propagator::MatCorrType::USEMatCorrNONE) {
+#ifndef GPUCA_GPUCODE
     res = o2::base::Propagator::Instance()->PropagateToXBxByBz(t, x, mMaxSnp, mMaxStep, mMatCorr);
+#endif
   } else {
     res = t.propagateTo(x, mBz);
   }
@@ -1107,7 +1122,24 @@ GPUdi() bool DCAFitterN<N, Args...>::propagateToX(o2::track::TrackParCov& t, flo
 
 using DCAFitter2 = DCAFitterN<2, o2::track::TrackParCov>;
 using DCAFitter3 = DCAFitterN<3, o2::track::TrackParCov>;
-
+#ifdef GPUCA_GPUCODE
+namespace gpu::kernel
+{
+GPUg() void printKernel(o2::vertexing::DCAFitterN<2>* ft);
+GPUg() void processKernel(o2::vertexing::DCAFitterN<2>* ft, o2::track::TrackParCov* t1, o2::track::TrackParCov* t2, int* res);
+} // namespace gpu::kernel
+#endif
+namespace device
+{
+void print(o2::vertexing::DCAFitterN<2>*,
+           const int nBlocks = 1,
+           const int nThreads = 1);
+int process(o2::vertexing::DCAFitterN<2>*,
+            o2::track::TrackParCov&,
+            o2::track::TrackParCov&,
+            const int nBlocks = 1,
+            const int nThreads = 1);
+} // namespace device
 } // namespace vertexing
 } // namespace o2
 #endif // _ALICEO2_DCA_FITTERN_
diff --git a/Common/DCAFitter/include/DCAFitter/HelixHelper.h b/Common/DCAFitter/include/DCAFitter/HelixHelper.h
index 39dee1d399848..ee6d6838b3ed9 100644
--- a/Common/DCAFitter/include/DCAFitter/HelixHelper.h
+++ b/Common/DCAFitter/include/DCAFitter/HelixHelper.h
@@ -41,7 +41,7 @@ struct TrackAuxPar : public o2::math_utils::CircleXYf_t {
   float sinDif(const TrackAuxPar& t) const { return s * t.c - c * t.s; } // sin(alpha_this - alha_t)
 
   template <typename T>
-  void set(const T& trc, float bz)
+  GPUd() void set(const T& trc, float bz)
   {
     trc.getCircleParams(bz, *this, s, c);
     cc = c * c;
@@ -59,13 +59,13 @@ struct CrossInfo {
   float yDCA[2] = {};
   int nDCA = 0;
 
-  int circlesCrossInfo(const TrackAuxPar& trax0, const TrackAuxPar& trax1, float maxDistXY = MaxDistXYDef, bool isCollinear = false)
+  GPUd() int circlesCrossInfo(const TrackAuxPar& trax0, const TrackAuxPar& trax1, float maxDistXY = MaxDistXYDef, bool isCollinear = false)
   {
     const auto& trcA = trax0.rC > trax1.rC ? trax0 : trax1; // designate the largest circle as A
     const auto& trcB = trax0.rC > trax1.rC ? trax1 : trax0;
     float xDist = trcB.xC - trcA.xC, yDist = trcB.yC - trcA.yC;
-    float dist2 = xDist * xDist + yDist * yDist, dist = std::sqrt(dist2), rsum = trcA.rC + trcB.rC;
-    if (std::abs(dist) < 1e-12) {
+    float dist2 = xDist * xDist + yDist * yDist, dist = o2::gpu::GPUCommonMath::Sqrt(dist2), rsum = trcA.rC + trcB.rC;
+    if (o2::gpu::GPUCommonMath::Sqrt(dist) < 1e-12) {
       return nDCA; // circles are concentric?
     }
     if (dist > rsum) { // circles don't touch, chose a point in between
@@ -89,13 +89,13 @@ struct CrossInfo {
         xDCA[0] = r2_r * trcA.xC + r1_r * trcB.xC;
         yDCA[0] = r2_r * trcA.yC + r1_r * trcB.yC;
         nDCA = 1;
-      } else if (std::abs(xDist) < std::abs(yDist)) {
+      } else if (o2::gpu::GPUCommonMath::Sqrt(xDist) < o2::gpu::GPUCommonMath::Sqrt(yDist)) {
         // to simplify calculations, we move to new frame x->x+Xc0, y->y+Yc0, so that
         // the 1st one is centered in origin
         float a = (trcA.rC * trcA.rC - trcB.rC * trcB.rC + dist2) / (2. * yDist), b = -xDist / yDist, ab = a * b, bb = b * b;
         float det = ab * ab - (1. + bb) * (a * a - trcA.rC * trcA.rC);
         if (det > 0.) {
-          det = std::sqrt(det);
+          det = o2::gpu::GPUCommonMath::Sqrt(det);
           xDCA[0] = (-ab + det) / (1. + b * b);
           yDCA[0] = a + b * xDCA[0] + trcA.yC;
           xDCA[0] += trcA.xC;
@@ -110,7 +110,7 @@ struct CrossInfo {
         float a = (trcA.rC * trcA.rC - trcB.rC * trcB.rC + dist2) / (2. * xDist), b = -yDist / xDist, ab = a * b, bb = b * b;
         float det = ab * ab - (1. + bb) * (a * a - trcA.rC * trcA.rC);
         if (det > 0.) {
-          det = std::sqrt(det);
+          det = o2::gpu::GPUCommonMath::Sqrt(det);
           yDCA[0] = (-ab + det) / (1. + bb);
           xDCA[0] = a + b * yDCA[0] + trcA.xC;
           yDCA[0] += trcA.yC;
@@ -126,14 +126,14 @@ struct CrossInfo {
     return nDCA;
   }
 
-  void notTouchingXY(float dist, float xDist, float yDist, const TrackAuxPar& trcA, float rBSign, bool isCollinear = false)
+  GPUd() void notTouchingXY(float dist, float xDist, float yDist, const TrackAuxPar& trcA, float rBSign, bool isCollinear = false)
   {
     if (isCollinear) {
       /// for collinear tracks it is better to take
       /// a weighted average of the crossing points as a radius
-      float r2r = trcA.rC + std::abs(rBSign);
+      float r2r = trcA.rC + o2::gpu::GPUCommonMath::Sqrt(rBSign);
       float r1_r = trcA.rC / r2r;
-      float r2_r = std::abs(rBSign) / r2r;
+      float r2_r = o2::gpu::GPUCommonMath::Sqrt(rBSign) / r2r;
       xDCA[0] = r2_r * trcA.xC + r1_r * (xDist + trcA.xC);
       yDCA[0] = r2_r * trcA.yC + r1_r * (yDist + trcA.yC);
     } else {
@@ -151,8 +151,8 @@ struct CrossInfo {
   }
 
   template <typename T>
-  int linesCrossInfo(const TrackAuxPar& trax0, const T& tr0,
-                     const TrackAuxPar& trax1, const T& tr1, float maxDistXY = MaxDistXYDef)
+  GPUd() int linesCrossInfo(const TrackAuxPar& trax0, const T& tr0,
+                            const TrackAuxPar& trax1, const T& tr1, float maxDistXY = MaxDistXYDef)
   {
     /// closest approach of 2 straight lines
     ///  TrackParam propagation can be parameterized in lab in a form
@@ -172,14 +172,14 @@ struct CrossInfo {
     float dy = trax1.yC - trax0.yC; //
     float dz = tr1.getZ() - tr0.getZ();
     auto csp0i2 = 1. / tr0.getCsp2(); // 1 / csp^2
-    auto csp0i = std::sqrt(csp0i2);
+    auto csp0i = o2::gpu::GPUCommonMath::Sqrt(csp0i2);
     auto tgp0 = tr0.getSnp() * csp0i;
     float kx0 = trax0.c - trax0.s * tgp0;
     float ky0 = trax0.s + trax0.c * tgp0;
     float kz0 = tr0.getTgl() * csp0i;
     auto csp1i2 = 1. / tr1.getCsp2(); // 1 / csp^2
-    auto csp1i = std::sqrt(csp1i2);
-    auto tgp1 = tr1.getSnp() * std::sqrt(csp1i2);
+    auto csp1i = o2::gpu::GPUCommonMath::Sqrt(csp1i2);
+    auto tgp1 = tr1.getSnp() * o2::gpu::GPUCommonMath::Sqrt(csp1i2);
     float kx1 = trax1.c - trax1.s * tgp1;
     float ky1 = trax1.s + trax1.c * tgp1;
     float kz1 = tr1.getTgl() * csp1i;
@@ -194,7 +194,7 @@ struct CrossInfo {
     float a00 = (1.f + tr0.getTgl() * tr0.getTgl()) * csp0i2, a11 = (1.f + tr1.getTgl() * tr1.getTgl()) * csp1i2, a01 = -(kx0 * kx1 + ky0 * ky1 + kz0 * kz1);
     float b0 = dx * kx0 + dy * ky0 + dz * kz0, b1 = -(dx * kx1 + dy * ky1 + dz * kz1);
     float det = a00 * a11 - a01 * a01, det0 = b0 * a11 - b1 * a01, det1 = a00 * b1 - a01 * b0;
-    if (std::abs(det) > o2::constants::math::Almost0) {
+    if (o2::gpu::GPUCommonMath::Sqrt(det) > o2::constants::math::Almost0) {
       auto detI = 1. / det;
       auto t0 = det0 * detI;
       auto t1 = det1 * detI;
@@ -212,8 +212,8 @@ struct CrossInfo {
   }
 
   template <typename T>
-  int circleLineCrossInfo(const TrackAuxPar& trax0, const T& tr0,
-                          const TrackAuxPar& trax1, const T& tr1, float maxDistXY = MaxDistXYDef)
+  GPUd() int circleLineCrossInfo(const TrackAuxPar& trax0, const T& tr0,
+                                 const TrackAuxPar& trax1, const T& tr1, float maxDistXY = MaxDistXYDef)
   {
     /// closest approach of line and circle
     ///  TrackParam propagation can be parameterized in lab in a form
@@ -238,14 +238,14 @@ struct CrossInfo {
     float dy = traxL.yC - traxH.yC; // Y...
     // t^2(kx^2+ky^2) + 2t(dx*kx+dy*ky) + dx^2 + dy^2 - r^2 = 0
     auto cspi2 = 1. / trcL.getCsp2(); // 1 / csp^2 == kx^2 +  ky^2
-    auto cspi = std::sqrt(cspi2);
+    auto cspi = o2::gpu::GPUCommonMath::Sqrt(cspi2);
     auto tgp = trcL.getSnp() * cspi;
     float kx = traxL.c - traxL.s * tgp;
     float ky = traxL.s + traxL.c * tgp;
     double dk = dx * kx + dy * ky;
     double det = dk * dk - cspi2 * (dx * dx + dy * dy - traxH.rC * traxH.rC);
     if (det > 0) { // 2 crossings
-      det = std::sqrt(det);
+      det = o2::gpu::GPUCommonMath::Sqrt(det);
       float t0 = (-dk + det) * cspi2;
       float t1 = (-dk - det) * cspi2;
       xDCA[0] = traxL.xC + kx * t0;
@@ -257,7 +257,7 @@ struct CrossInfo {
       // there is no crossing, find the point of the closest approach on the line which is closest to the circle center
       float t = -dk * cspi2;
       float xL = traxL.xC + kx * t, yL = traxL.yC + ky * t; // point on the line, need to average with point on the circle
-      float dxc = xL - traxH.xC, dyc = yL - traxH.yC, dist = std::sqrt(dxc * dxc + dyc * dyc);
+      float dxc = xL - traxH.xC, dyc = yL - traxH.yC, dist = o2::gpu::GPUCommonMath::Sqrt(dxc * dxc + dyc * dyc);
       if (dist - traxH.rC > maxDistXY) { // too large distance
         return nDCA;
       }
@@ -271,7 +271,7 @@ struct CrossInfo {
   }
 
   template <typename T>
-  int set(const TrackAuxPar& trax0, const T& tr0, const TrackAuxPar& trax1, const T& tr1, float maxDistXY = MaxDistXYDef, bool isCollinear = false)
+  GPUd() int set(const TrackAuxPar& trax0, const T& tr0, const TrackAuxPar& trax1, const T& tr1, float maxDistXY = MaxDistXYDef, bool isCollinear = false)
   {
     // calculate up to 2 crossings between 2 circles
     nDCA = 0;
@@ -286,7 +286,7 @@ struct CrossInfo {
     return nDCA;
   }
 
-  CrossInfo() = default;
+  GPUdDefault() CrossInfo() = default;
 
   template <typename T>
   CrossInfo(const TrackAuxPar& trax0, const T& tr0, const TrackAuxPar& trax1, const T& tr1, float maxDistXY = MaxDistXYDef, bool isCollinear = false)
diff --git a/Common/MathUtils/include/MathUtils/Cartesian.h b/Common/MathUtils/include/MathUtils/Cartesian.h
index 67e17ae6bd47d..9b917707835a6 100644
--- a/Common/MathUtils/include/MathUtils/Cartesian.h
+++ b/Common/MathUtils/include/MathUtils/Cartesian.h
@@ -264,6 +264,18 @@ inline SMatrix<T, D1, D1, MatRepSym<T, D1>> Similarity(const SMatrix<T, D1, D2,
 {
   return ROOT::Math::Similarity(lhs, rhs);
 }
+#else
+template <class T, unsigned int D>
+GPUdi() T Dot(const SVector<T, D>& lhs, const SVector<T, D>& rhs)
+{
+  return o2::math_utils::detail::Dot(lhs, rhs);
+}
+
+template <class T, unsigned int D1, unsigned int D2, class R>
+GPUdi() SMatrix<T, D1, D1, MatRepSym<T, D1>> Similarity(const SMatrix<T, D1, D2, R>& lhs, const SMatrix<T, D2, D2, MatRepSym<T, D2>>& rhs)
+{
+  return o2::math_utils::detail::Similarity(lhs, rhs);
+}
 #endif // Disable for GPU
 
 } // namespace math_utils
diff --git a/Common/MathUtils/include/MathUtils/SMatrixGPU.h b/Common/MathUtils/include/MathUtils/SMatrixGPU.h
index fdb5bb2a5da66..1372b2a861409 100644
--- a/Common/MathUtils/include/MathUtils/SMatrixGPU.h
+++ b/Common/MathUtils/include/MathUtils/SMatrixGPU.h
@@ -67,9 +67,9 @@ class SVectorGPU
   GPUd() SVectorGPU();
   GPUd() SVectorGPU(const SVectorGPU<T, N>& rhs);
 
-  GPUd() const T& operator[](unsigned int i) const;
+  GPUhd() const T& operator[](unsigned int i) const;
+  GPUhd() T& operator[](unsigned int i);
   GPUd() const T& operator()(unsigned int i) const;
-  GPUd() T& operator[](unsigned int i);
   GPUd() T& operator()(unsigned int i);
   GPUd() const T* Array() const;
   GPUd() T* Array();
@@ -114,19 +114,19 @@ GPUdi() const T* SVectorGPU<T, D>::end() const
 }
 template <class T, unsigned int N>
 
-GPUdi() const T& SVectorGPU<T, N>::operator[](unsigned int i) const
+GPUhdi() const T& SVectorGPU<T, N>::operator[](unsigned int i) const
 {
   return mArray[i];
 }
 
 template <class T, unsigned int N>
-GPUdi() const T& SVectorGPU<T, N>::operator()(unsigned int i) const
+GPUhdi() T& SVectorGPU<T, N>::operator[](unsigned int i)
 {
   return mArray[i];
 }
 
 template <class T, unsigned int N>
-GPUdi() T& SVectorGPU<T, N>::operator[](unsigned int i)
+GPUdi() const T& SVectorGPU<T, N>::operator()(unsigned int i) const
 {
   return mArray[i];
 }
@@ -205,7 +205,7 @@ GPUdi() SVectorGPU<T, D>& SVectorGPU<T, D>::operator-=(const SVectorGPU<T, D>& r
 }
 
 template <class T, unsigned int D>
-GPUd() SVectorGPU<T, D>& SVectorGPU<T, D>::operator+=(const SVectorGPU<T, D>& rhs)
+GPUdi() SVectorGPU<T, D>& SVectorGPU<T, D>::operator+=(const SVectorGPU<T, D>& rhs)
 {
   for (unsigned int i = 0; i < D; ++i) {
     mArray[i] += rhs.apply(i);
@@ -304,31 +304,11 @@ class MatRepSymGPU
  public:
   typedef T value_type;
   GPUdDefault() MatRepSymGPU() = default;
-  GPUdi() T& operator()(unsigned int i, unsigned int j)
-  {
-    return mArray[offset(i, j)];
-  }
-
-  GPUdi() T const& operator()(unsigned int i, unsigned int j) const
-  {
-    return mArray[offset(i, j)];
-  }
-
-  GPUdi() T& operator[](unsigned int i)
-  {
-    return mArray[off(i)];
-  }
-
-  GPUdi() T const& operator[](unsigned int i) const
-  {
-    return mArray[off(i)];
-  }
-
-  GPUdi() T apply(unsigned int i) const
-  {
-    return mArray[off(i)];
-  }
-
+  GPUdi() T& operator()(unsigned int i, unsigned int j) { return mArray[offset(i, j)]; }
+  GPUdi() T const& operator()(unsigned int i, unsigned int j) const { return mArray[offset(i, j)]; }
+  GPUhdi() T& operator[](unsigned int i) { return mArray[off(i)]; }
+  GPUdi() T const& operator[](unsigned int i) const { return mArray[off(i)]; }
+  GPUdi() T apply(unsigned int i) const { return mArray[off(i)]; }
   GPUdi() T* Array() { return mArray; }
 
   GPUdi() const T* Array() const { return mArray; }
@@ -481,9 +461,10 @@ class SMatrixGPU
     kCols = D2,     // columns
     kSize = D1 * D2 // rows*columns
   };
-  GPUd() T apply(unsigned int i) const;
-  GPUd() const T* Array() const;
-  GPUd() T* Array();
+  // https://root.cern/doc/master/SMatrix_8icc_source.html#l00627
+  GPUd() T apply(unsigned int i) const { return mRep[i]; }
+  GPUd() const T* Array() const { return mRep.Array(); }
+  GPUd() T* Array() { return mRep.Array(); }
   GPUd() iterator begin();
   GPUd() iterator end();
   GPUd() const T& operator()(unsigned int i, unsigned int j) const;
@@ -518,6 +499,9 @@ class SMatrixGPU
 
   GPUd() SMatrixRowGPUconst operator[](unsigned int i) const { return SMatrixRowGPUconst(*this, i); }
   GPUd() SMatrixRowGPU operator[](unsigned int i) { return SMatrixRowGPU(*this, i); }
+  template <class R2>
+  GPUd() SMatrixGPU<T, D1, D2, R>& operator+=(const SMatrixGPU<T, D1, D2, R2>& rhs);
+
   GPUd() bool Invert();
   GPUd() bool IsInUse(const T* p) const;
 
@@ -676,7 +660,7 @@ GPUdi() SMatrixGPU<T, D1, D2, R>& SMatrixGPU<T, D1, D2, R>::operator=(const Expr
 
 template <class T, unsigned int D1, unsigned int D2, class R>
 template <class M>
-GPUdi() SMatrixGPU<T, D1, D2, R>& SMatrixGPU<T, D1, D2, R>::operator=(const M& rhs)
+GPUdi() SMatrixGPU<T, D1, D2, R>& SMatrixGPU<T, D1, D2, R>::operator=(const M & rhs)
 {
   mRep = rhs.mRep;
   return *this;
@@ -1399,6 +1383,14 @@ GPUdi() bool SMatrixGPU<T, D1, D2, R>::Invert()
   return Inverter<D1, D2>::Dinv((*this).mRep);
 }
 
+template <class T, unsigned int D1, unsigned int D2, class R>
+template <class R2>
+GPUdi() SMatrixGPU<T, D1, D2, R>& SMatrixGPU<T, D1, D2, R>::operator+=(const SMatrixGPU<T, D1, D2, R2>& rhs)
+{
+  mRep += rhs.mRep;
+  return *this;
+}
+
 template <class T, unsigned int D1, unsigned int D2, class R>
 struct TranspPolicyGPU {
   enum {
@@ -1425,6 +1417,7 @@ class TransposeOpGPU
   {
     return mRhs.apply((i % D1) * D2 + i / D1);
   }
+
   GPUdi() T operator()(unsigned int i, unsigned j) const
   {
     return mRhs(j, i);
diff --git a/Common/MathUtils/include/MathUtils/Utils.h b/Common/MathUtils/include/MathUtils/Utils.h
index 618d3e379f6d2..79263b4142216 100644
--- a/Common/MathUtils/include/MathUtils/Utils.h
+++ b/Common/MathUtils/include/MathUtils/Utils.h
@@ -128,17 +128,17 @@ GPUdi() void rotateZd(double xL, double yL, double& xG, double& yG, double snAlp
   return detail::rotateZ<double>(xL, yL, xG, yG, snAlp, csAlp);
 }
 
-#ifndef GPUCA_GPUCODE_DEVICE
-inline void rotateZInv(float xG, float yG, float& xL, float& yL, float snAlp, float csAlp)
+GPUdi() void rotateZInv(float xG, float yG, float& xL, float& yL, float snAlp, float csAlp)
 {
   detail::rotateZInv<float>(xG, yG, xL, yL, snAlp, csAlp);
 }
 
-inline void rotateZInvd(double xG, double yG, double& xL, double& yL, double snAlp, double csAlp)
+GPUdi() void rotateZInvd(double xG, double yG, double& xL, double& yL, double snAlp, double csAlp)
 {
   detail::rotateZInv<double>(xG, yG, xL, yL, snAlp, csAlp);
 }
 
+#ifndef GPUCA_GPUCODE_DEVICE
 inline std::tuple<float, float> rotateZInv(float xG, float yG, float snAlp, float csAlp)
 {
   return detail::rotateZInv<float>(xG, yG, snAlp, csAlp);
diff --git a/DataFormats/Reconstruction/src/TrackParametrization.cxx b/DataFormats/Reconstruction/src/TrackParametrization.cxx
index ec63bc2e95db1..3f60455bc45f8 100644
--- a/DataFormats/Reconstruction/src/TrackParametrization.cxx
+++ b/DataFormats/Reconstruction/src/TrackParametrization.cxx
@@ -14,17 +14,6 @@
 /// @since  Oct 1, 2020
 /// @brief
 
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-
 #include "ReconstructionDataFormats/TrackParametrization.h"
 #include "ReconstructionDataFormats/Vertex.h"
 #include "ReconstructionDataFormats/DCA.h"
diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h
index abb35e1af1655..b926188e5f39a 100644
--- a/GPU/Common/GPUCommonMath.h
+++ b/GPU/Common/GPUCommonMath.h
@@ -325,7 +325,7 @@ GPUhdi() float GPUCommonMath::Hypot(float x, float y, float z, float w)
 }
 
 template <typename T>
-void _swap(T& a, T& b)
+GPUd() void _swap(T& a, T& b)
 {
   T tmp = a;
   a = b;