From c3acc581cc65ae43427453884fff25dd8f2ecc80 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 28 Jun 2021 09:41:51 +0200
Subject: [PATCH 01/85] Working with CUDA11 without warnings

---
 CUDA/CMakeLists.txt | 2 +-
 setup.py            | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt
index 693714d0..5f80d6b0 100644
--- a/CUDA/CMakeLists.txt
+++ b/CUDA/CMakeLists.txt
@@ -100,8 +100,8 @@ IF(FERMI_GPU)
   list(APPEND MY_NVCC_FLAGS -gencode arch=compute_50,code=sm_50)
   list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=sm_52)
   list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=compute_52)
+  list(APPEND MY_NVCC_FLAGS -gencode arch=compute_50,code=sm_50)
 ELSE(FERMI_GPU)
-  set(MY_NVCC_FLAGS -gencode arch=compute_50,code=sm_50)
   list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=sm_52)
   list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=compute_52)
 
diff --git a/setup.py b/setup.py
index 6b801034..a99c31ca 100644
--- a/setup.py
+++ b/setup.py
@@ -107,6 +107,8 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
+    version="0.2.0",
+    description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),

From f471098a8066e161a637aba3127d81215b2c9a04 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 29 Jun 2021 14:35:15 +0200
Subject: [PATCH 02/85] Free memory issues and update tests

---
 CUDA/src/gpuNUFFT_operator.cpp |  6 ++++--
 python/test_file.py            | 12 ++++++++++++
 python/test_nufftOp.py         | 12 ++++++------
 3 files changed, 22 insertions(+), 8 deletions(-)
 create mode 100644 python/test_file.py

diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 1fd6c352..02a6062c 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -572,7 +572,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(
   // move memory management into constructor/destructor of GpuNUFFT Operator!!!
   //
   freeTotalDeviceMemory(imdata_sum_d, NULL);
-  // this->freeDeviceMemory();
+  this->freeDeviceMemory();
 
   if ((cudaDeviceSynchronize() != cudaSuccess))
     fprintf(stderr, "error in gpuNUFFT_gpu_adj function: %s\n",
@@ -854,6 +854,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(
            cudaGetErrorString(cudaGetLastError()));
 
   freeTotalDeviceMemory(data_d, imdata_d, imdata_sum_d, NULL);
+  this->freeDeviceMemory();
 
   if ((cudaDeviceSynchronize() != cudaSuccess))
     fprintf(stderr, "error in gpuNUFFT_gpu_adj function: %s\n",
@@ -1051,7 +1052,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
   }  // iterate over coils
 
   freeTotalDeviceMemory(imdata_d, NULL);
-  // this->freeDeviceMemory();
+  this->freeDeviceMemory();
 
   if ((cudaDeviceSynchronize() != cudaSuccess))
     fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n",
@@ -1261,6 +1262,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
   }  // iterate over coils
 
   freeTotalDeviceMemory(data_d, imdata_d, NULL);
+  this->freeDeviceMemory();
 
   if ((cudaDeviceSynchronize() != cudaSuccess))
     fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n",
diff --git a/python/test_file.py b/python/test_file.py
new file mode 100644
index 00000000..9ada03eb
--- /dev/null
+++ b/python/test_file.py
@@ -0,0 +1,12 @@
+import numpy as np
+from mri.operators import NonCartesianFFT
+from mri.operators.fourier.utils import estimate_density_compensation
+
+
+traj = np.load('/volatile/temp_traj.npy')
+#D = estimate_density_compensation(traj, (384, 384, 208), 2)
+fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT')
+K = fourier.op(np.zeros((384, 384, 208)))
+print("Forward done")
+I = fourier.adj_op(K)
+print("Backward done")
diff --git a/python/test_nufftOp.py b/python/test_nufftOp.py
index b4d5eab1..4d7a1387 100644
--- a/python/test_nufftOp.py
+++ b/python/test_nufftOp.py
@@ -84,13 +84,13 @@ def setUp(self):
     def test_multicoil_with_sense(self):
         print('Apply forward op')
         operator = self.get_nufft_op(self.coil_maps)
-        x = operator.op(np.reshape(self.img.T, self.img.size))
+        x = operator.op(np.reshape(self.img.T, self.img.size), False)
         y = np.random.random(x.shape)
         print('Output kdata shape is', x.shape)
         print('-------------------------------')
         print('Apply adjoint op')
-        img_adj = operator.adj_op(x)
-        adj_y = operator.adj_op(y)
+        img_adj = operator.adj_op(x, False)
+        adj_y = operator.adj_op(y, False)
         print('Output adjoint img shape is', img_adj.shape)
         img_adj = np.squeeze(img_adj).T
         adj_y = np.squeeze(adj_y).T
@@ -110,18 +110,18 @@ def test_multicoil_without_sense(self):
         operator = self.get_nufft_op()
         x = operator.op(np.asarray(
             [np.reshape(image_ch.T, image_ch.size) for image_ch in self.multi_img]
-        ).T)
+        ).T, False)
         y = np.random.random(x.shape)
         print('Output kdata shape is', x.shape)
         print('-------------------------------')
         print('Apply adjoint op')
-        img_adj = operator.adj_op(x)
+        img_adj = operator.adj_op(x, False)
         print('Output adjoint img shape is', img_adj.shape)
         img_adj = np.squeeze(img_adj)
         img_adj = np.asarray(
                 [image_ch.T for image_ch in img_adj]
             )
-        adj_y = np.squeeze(operator.adj_op(y))
+        adj_y = np.squeeze(operator.adj_op(y), False)
         adj_y = np.asarray(
                 [image_ch.T for image_ch in adj_y]
             )

From 63d36642f4674a1253603d38c94f0b9789aba0e9 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 30 Jun 2021 09:41:32 +0200
Subject: [PATCH 03/85] Add Clear Memory

---
 CUDA/inc/gpuNUFFT_operator.hpp                         |  1 +
 .../gpu/python/gpuNUFFT_operator_python_factory.cpp    |  4 ++++
 CUDA/src/gpuNUFFT_operator.cpp                         | 10 +++++++++-
 python/test_file.py                                    |  4 ++--
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index 1999680a..61ad6f6c 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -344,6 +344,7 @@ class GpuNUFFTOperator
   Array<CufftType> performForwardGpuNUFFT(Array<DType2> imgData,
                                           GpuNUFFTOutput gpuNUFFTOut);
 
+  void clean_memory();
   /** \brief Check if density compensation data is available. */
   bool applyDensComp()
   {
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index a77281f2..8d15631f 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -167,6 +167,10 @@ class GpuNUFFTPythonOperator
         cudaThreadSynchronize();
         return out_result;
     }
+    void clean_memory()
+    {
+       gpuNUFFTOp->clean_memory();
+    }
     ~GpuNUFFTPythonOperator()
     {
         delete gpuNUFFTOp;
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 02a6062c..9fecbff9 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -1178,12 +1178,15 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     performForwardDeapodization(imdata_d, deapo_d, gi_host);
 	  if(gpuNUFFTOut == DENSITY_ESTIMATION)
 	  {
-	      forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d,
+	    forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d,
                        sector_centers_d, gi_host);
         writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
                     (int)this->kSpaceTraj.count(), n_coils_cc);
         copyFromDevice(data_sorted_d, kspaceData.data + data_coil_offset,
                    data_count * n_coils_cc);
+        if ((coil_it + n_coils_cc) < (n_coils))
+            continue;
+        freeTotalDeviceMemory(data_d, imdata_d, NULL);
         return;
     }
     if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
@@ -1311,3 +1314,8 @@ float gpuNUFFT::GpuNUFFTOperator::stopTiming()
   HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
   return time;
 }
+
+void gpuNUFFT::GpuNUFFTOperator::clean_memory()
+{
+    this->freeDeviceMemory();
+}
\ No newline at end of file
diff --git a/python/test_file.py b/python/test_file.py
index 9ada03eb..10f8d355 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -4,8 +4,8 @@
 
 
 traj = np.load('/volatile/temp_traj.npy')
-#D = estimate_density_compensation(traj, (384, 384, 208), 2)
-fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT')
+D = estimate_density_compensation(traj, (384, 384, 208), 10)
+fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', density_comp=D)
 K = fourier.op(np.zeros((384, 384, 208)))
 print("Forward done")
 I = fourier.adj_op(K)

From ea1722e92dd277ea63b16f445563d5c468fdcb53 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 30 Jun 2021 11:13:39 +0200
Subject: [PATCH 04/85] Fix memory leaks for python operator

---
 CUDA/inc/gpuNUFFT_operator_factory.hpp                   | 2 +-
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++--
 CUDA/src/gpuNUFFT_operator.cpp                           | 2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp
index 6df9af70..b4b9f461 100644
--- a/CUDA/inc/gpuNUFFT_operator_factory.hpp
+++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp
@@ -52,7 +52,7 @@ class GpuNUFFTOperatorFactory
   /** \brief Constructor overload
     *
     * @param useTextures Flag to indicate texture interpolation
-    * @param useGpu Flag to indicate gpu usage for precomputation
+    * @param useGpu Flag to indicat&GpuNUFFTPythonOperator::adj_op);e gpu usage for precomputation
     * @param balanceWorkload Flag to indicate load balancing
     */
   GpuNUFFTOperatorFactory(const bool useTextures = true, const bool useGpu = true,
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 8d15631f..8f9dba42 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -173,7 +173,6 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
-        delete gpuNUFFTOp;
         if(has_sense_data == true)
             free(sensArray.data);
     }
@@ -182,6 +181,7 @@ PYBIND11_MODULE(gpuNUFFT, m) {
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
         .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool>())
         .def("op", &GpuNUFFTPythonOperator::op)
-        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op);
+        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op)
+        .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory);
 }
 #endif  // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 9fecbff9..8e375845 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -747,6 +747,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(
         printf("test value at point zero: %f\n", (imgData.data)[0].x);
 
       freeTotalDeviceMemory(data_d, imdata_d, imdata_sum_d, NULL);
+      this->freeDeviceMemory();
       return;
     }
     if ((cudaDeviceSynchronize() != cudaSuccess))
@@ -1187,6 +1188,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
         if ((coil_it + n_coils_cc) < (n_coils))
             continue;
         freeTotalDeviceMemory(data_d, imdata_d, NULL);
+        this->freeDeviceMemory();
         return;
     }
     if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))

From 283ed59841dd3c53ffa9a37eccfc9ac36890a2b9 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 5 Jul 2021 13:59:24 +0200
Subject: [PATCH 05/85] Rename and do right DC

---
 CUDA/src/gpu/std_gpuNUFFT_kernels.cu |  4 ++--
 CUDA/src/gpuNUFFT_operator.cpp       | 11 +----------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
index 3fb39be7..2331c2cf 100644
--- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
+++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
@@ -226,8 +226,8 @@ __global__ void densityCompensationKernel(DType2* data, DType* density_comp, int
     for (int c = threadIdx.z; c < GI.n_coils_cc; c+= blockDim.z)
     {
       DType2 data_p = data[t + c*N]; 
-      data_p.x = data_p.x * sqrt(density_comp[t]);
-      data_p.y = data_p.y * sqrt(density_comp[t]);
+      data_p.x = data_p.x * density_comp[t];
+      data_p.y = data_p.y * density_comp[t];
       data[t + c*N] = data_p;
     }
     t = t+ blockDim.x*gridDim.x;
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 8e375845..03cd13f2 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -1041,10 +1041,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       printf("error: at thread synchronization 8: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     
-    // Also apply density compensation here
-    if (this->applyDensComp())
-      performDensityCompensation(data_d, density_comp_d, gi_host);
-
     // write result in correct order back into output array
     writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
                     (int)this->kSpaceTraj.count(), n_coils_cc);
@@ -1075,7 +1071,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
 //
 // parameters:
 //  * data         : output kspace data
-//  * data_count   : number of samples on trajectory
 //  * n_coils      : number of channels or coils
 //  * crds         : coordinates on trajectory, passed as SoA
 //  * imdata       : input image data
@@ -1254,10 +1249,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       printf("error: at thread synchronization 8: %s\n",
              cudaGetErrorString(cudaGetLastError()));
 
-    // Also apply density compensation here
-    if (this->applyDensComp())
-      performDensityCompensation(data_d, density_comp_d, gi_host);
-
     // write result in correct order back into output array
     writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
                     (int)this->kSpaceTraj.count(), n_coils_cc);
@@ -1320,4 +1311,4 @@ float gpuNUFFT::GpuNUFFTOperator::stopTiming()
 void gpuNUFFT::GpuNUFFTOperator::clean_memory()
 {
     this->freeDeviceMemory();
-}
\ No newline at end of file
+}

From 4a5596ff9797b3d70cf52d3249b9e88040cece6c Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 5 Jul 2021 13:59:56 +0200
Subject: [PATCH 06/85] Minor version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index a99c31ca..de582070 100644
--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.2.0",
+    version="0.2.1",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[

From 71c24659bdb6713287b40b63a10d0f376bff4db5 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 9 Nov 2021 21:56:36 +0100
Subject: [PATCH 07/85] Fix memory

---
 CUDA/inc/balanced_gpuNUFFT_operator.hpp       |  1 -
 .../balanced_texture_gpuNUFFT_operator.hpp    |  3 +--
 CUDA/inc/gpuNUFFT_operator.hpp                | 20 ++++++++-----------
 CUDA/inc/gpuNUFFT_operator_factory.hpp        | 10 +++++++++-
 CUDA/inc/texture_gpuNUFFT_operator.hpp        |  1 -
 .../gpuNUFFT_operator_python_factory.cpp      |  1 +
 CUDA/src/gpuNUFFT_operator_factory.cpp        |  3 +--
 python/test_file.py                           | 12 +++++------
 8 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/CUDA/inc/balanced_gpuNUFFT_operator.hpp b/CUDA/inc/balanced_gpuNUFFT_operator.hpp
index 66246819..e4aa8248 100644
--- a/CUDA/inc/balanced_gpuNUFFT_operator.hpp
+++ b/CUDA/inc/balanced_gpuNUFFT_operator.hpp
@@ -29,7 +29,6 @@ class BalancedGpuNUFFTOperator : public GpuNUFFTOperator,
 
   ~BalancedGpuNUFFTOperator()
   {
-    if (!matlabSharedMem)
       freeLocalMemberArray(this->sectorProcessingOrder.data);
   }
 
diff --git a/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp b/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp
index 3eae6a7c..d7672f73 100644
--- a/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp
+++ b/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp
@@ -36,8 +36,7 @@ class BalancedTextureGpuNUFFTOperator : public TextureGpuNUFFTOperator,
 
   ~BalancedTextureGpuNUFFTOperator()
   {
-    if (!matlabSharedMem)
-      freeLocalMemberArray(this->sectorProcessingOrder.data);
+     freeLocalMemberArray(this->sectorProcessingOrder.data);
   }
 
   // OPERATIONS
diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index 61ad6f6c..2965667c 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -73,17 +73,13 @@ class GpuNUFFTOperator
   virtual ~GpuNUFFTOperator()
   {
     freeLocalMemberArray(this->kernel.data);
-
-    if (!matlabSharedMem) {
-      freeLocalMemberArray(this->dens.data);
-      freeLocalMemberArray(this->deapo.data);
-      freeLocalMemberArray(this->kSpaceTraj.data);
-      freeLocalMemberArray(this->sectorCenters.data);
-      freeLocalMemberArray(this->dataIndices.data);
-      freeLocalMemberArray(this->sectorDataCount.data);
-    }
-
-    freeDeviceMemory();
+    freeLocalMemberArray(this->dens.data);
+    freeLocalMemberArray(this->sens.data);
+    freeLocalMemberArray(this->deapo.data);
+    freeLocalMemberArray(this->kSpaceTraj.data);
+    freeLocalMemberArray(this->sectorCenters.data);
+    freeLocalMemberArray(this->dataIndices.data);
+    freeLocalMemberArray(this->sectorDataCount.data);
   }
 
   friend class GpuNUFFTOperatorFactory;
@@ -373,7 +369,7 @@ class GpuNUFFTOperator
        dataPointer = NULL;
      }
    }
-
+gpuNUFFT::TextureGpuNUFFTOperator::~TextureGpuNUFFTOperator
   /** \brief gpuNUFFT::OperatorType classifier. Value according to sub-class
    * implementation. */
   OperatorType operatorType;
diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp
index b4b9f461..cac4ad88 100644
--- a/CUDA/inc/gpuNUFFT_operator_factory.hpp
+++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp
@@ -176,7 +176,15 @@ class GpuNUFFTOperatorFactory
   void setBalanceWorkload(bool balanceWorkload);
 
  protected:
-  /** \brief Assign the samples on the k-space trajectory to its corresponding
+  template<typename T>
+   void freeLocalMemberArray(T* dataPointer)
+   {
+     if (dataPointer != NULL) {
+       free(dataPointer);
+       dataPointer = NULL;
+     }
+   }
+   /** \brief Assign the samples on the k-space trajectory to its corresponding
     *sector
     *
     * @return array of indices of the assigned sector
diff --git a/CUDA/inc/texture_gpuNUFFT_operator.hpp b/CUDA/inc/texture_gpuNUFFT_operator.hpp
index c9a90eac..5d1bca98 100644
--- a/CUDA/inc/texture_gpuNUFFT_operator.hpp
+++ b/CUDA/inc/texture_gpuNUFFT_operator.hpp
@@ -35,7 +35,6 @@ class TextureGpuNUFFTOperator : public GpuNUFFTOperator
 
   ~TextureGpuNUFFTOperator()
   {
-    freeLookupTable();
   }
 
   virtual OperatorType getType()
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 8f9dba42..ada90ba7 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -173,6 +173,7 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
+        delete gpuNUFFTOp;
         if(has_sense_data == true)
             free(sensArray.data);
     }
diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp
index 28bf206e..8d3a24a9 100644
--- a/CUDA/src/gpuNUFFT_operator_factory.cpp
+++ b/CUDA/src/gpuNUFFT_operator_factory.cpp
@@ -536,8 +536,7 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator(
     gpuNUFFTOp->setSectorCenters(computeSectorCenters2D(gpuNUFFTOp));
 
   // free temporary array
-  free(assignedSectors.data);
-  assignedSectors.data = NULL;
+  freeLocalMemberArray(assignedSectors.data);
 
   gpuNUFFTOp->setDeapodizationFunction(
     this->computeDeapodizationFunction(kernelWidth, osf, imgDims));
diff --git a/python/test_file.py b/python/test_file.py
index 10f8d355..61d41688 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -4,9 +4,9 @@
 
 
 traj = np.load('/volatile/temp_traj.npy')
-D = estimate_density_compensation(traj, (384, 384, 208), 10)
-fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', density_comp=D)
-K = fourier.op(np.zeros((384, 384, 208)))
-print("Forward done")
-I = fourier.adj_op(K)
-print("Backward done")
+for i in range(1):
+    print(i)
+    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT')
+    K = fourier.op(np.zeros((384, 384, 208)))
+    im = fourier.adj_op(K)
+    del fourier

From 1bd12d980cb3ef85a9075e4986d46a7c4da8d235 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 9 Nov 2021 21:58:24 +0100
Subject: [PATCH 08/85] Compile issue

---
 CUDA/inc/gpuNUFFT_operator.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index 2965667c..38988ab9 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -369,7 +369,7 @@ class GpuNUFFTOperator
        dataPointer = NULL;
      }
    }
-gpuNUFFT::TextureGpuNUFFTOperator::~TextureGpuNUFFTOperator
+
   /** \brief gpuNUFFT::OperatorType classifier. Value according to sub-class
    * implementation. */
   OperatorType operatorType;

From 5c50bf426f7fa179e5be10963b9f801d54ec5bd2 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 9 Nov 2021 22:19:12 +0100
Subject: [PATCH 09/85] clean mem on gpu

---
 CUDA/inc/gpuNUFFT_operator.hpp                       |  1 +
 .../gpu/python/gpuNUFFT_operator_python_factory.cpp  | 12 +++++++++++-
 python/test_file.py                                  |  2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index 38988ab9..a18c391a 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -72,6 +72,7 @@ class GpuNUFFTOperator
 
   virtual ~GpuNUFFTOperator()
   {
+    freeDeviceMemory();
     freeLocalMemberArray(this->kernel.data);
     freeLocalMemberArray(this->dens.data);
     freeLocalMemberArray(this->sens.data);
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index ada90ba7..c18728e2 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -171,6 +171,15 @@ class GpuNUFFTPythonOperator
     {
        gpuNUFFTOp->clean_memory();
     }
+    void set_smaps(py::array_t<std::complex<DType>> sense_maps)
+    {
+        free(sensArray.data);
+        sensArray = copyNumpyArray(sense_maps, imgDims.count() * n_coils);
+        sensArray.dim = imgDims;
+        sensArray.dim.channels = n_coils;
+        has_sense_data = true;
+        gpuNUFFTOp->setSens(sensArray);
+    }
     ~GpuNUFFTPythonOperator()
     {
         delete gpuNUFFTOp;
@@ -183,6 +192,7 @@ PYBIND11_MODULE(gpuNUFFT, m) {
         .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool>())
         .def("op", &GpuNUFFTPythonOperator::op)
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op)
-        .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory);
+        .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
+        .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
 }
 #endif  // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED
diff --git a/python/test_file.py b/python/test_file.py
index 61d41688..265c0cc4 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -4,7 +4,7 @@
 
 
 traj = np.load('/volatile/temp_traj.npy')
-for i in range(1):
+for i in range(5):
     print(i)
     fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT')
     K = fourier.op(np.zeros((384, 384, 208)))

From 7ffefeb11627f4ce8aca930f33c4724e39a65c30 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 9 Nov 2021 23:30:57 +0100
Subject: [PATCH 10/85] Remove sensArray free from Python operator destructor

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 --
 python/test_file.py                                      | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index c18728e2..675cae72 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -183,8 +183,6 @@ class GpuNUFFTPythonOperator
     ~GpuNUFFTPythonOperator()
     {
         delete gpuNUFFTOp;
-        if(has_sense_data == true)
-            free(sensArray.data);
     }
 };
 PYBIND11_MODULE(gpuNUFFT, m) {
diff --git a/python/test_file.py b/python/test_file.py
index 265c0cc4..5aece6b3 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -6,7 +6,8 @@
 traj = np.load('/volatile/temp_traj.npy')
 for i in range(5):
     print(i)
-    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT')
+    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=10, smaps=np.ones((10, 384, 384, 208)))
+    fourier.impl.operator.set_smaps(np.ones((10, 384, 384, 208))+1)
     K = fourier.op(np.zeros((384, 384, 208)))
     im = fourier.adj_op(K)
     del fourier

From 432a706a065826b4c435592dd6085c3c291cb22b Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 9 Nov 2021 23:31:38 +0100
Subject: [PATCH 11/85] Version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index de582070..9108af3a 100644
--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.2.1",
+    version="0.3.0",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[

From 2e59fbabce4b109b92194a1e9e406078047f99b0 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 15 Nov 2021 10:12:08 +0100
Subject: [PATCH 12/85] Fix 2D issues

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 675cae72..3a05cc8e 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -141,14 +141,13 @@ class GpuNUFFTPythonOperator
         gpuNUFFT::Dimensions myDims = imgDims;
         if(grid_data)
             myDims = myDims * gpuNUFFTOp->getOsf();
-        int depth = myDims.depth;
         if(dimension==2)
             myDims.depth = 1;
         py::array_t<std::complex<DType>> out_result;
         if(has_sense_data == false)
-            out_result.resize({n_coils, depth, (int)myDims.height, (int)myDims.width});
+            out_result.resize({n_coils, (int)myDims.depth, (int)myDims.height, (int)myDims.width});
         else
-            out_result.resize({depth, (int)myDims.height, (int)myDims.width});
+            out_result.resize({(int)myDims.depth, (int)myDims.height, (int)myDims.width});
         py::buffer_info out = out_result.request();
         std::complex<DType> *t_data = (std::complex<DType> *) out.ptr;
         DType2 *new_data = reinterpret_cast<DType2(&)[0]>(*t_data);

From 471d4e85948af2450a7e1fed818a4e4e9f1cb79b Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 15 Nov 2021 13:54:16 +0100
Subject: [PATCH 13/85] Remove linking issues

---
 CUDA/src/gpu/python/CMakeLists.txt | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt
index 7eeaef6c..494255c2 100644
--- a/CUDA/src/gpu/python/CMakeLists.txt
+++ b/CUDA/src/gpu/python/CMakeLists.txt
@@ -9,7 +9,7 @@ include_directories(
     ${PYTHON_INCLUDE_DIR}
     )
 cuda_include_directories(${GPUNUFFT_INC_DIR})
-cuda_add_library(gpuNUFFT  ${GPU_CU_SOURCES}  ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} gpuNUFFT_operator_python_factory.cpp SHARED)
+cuda_add_library(gpuNUFFT  ${GPU_CU_SOURCES}  ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} gpuNUFFT_operator_python_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../atomic/atomic_gpuNUFFT.cu SHARED)
 set_target_properties(gpuNUFFT PROPERTIES PREFIX "")
 
 if(WIN32)
@@ -18,7 +18,9 @@ if(WIN32)
     MESSAGE("Found ${PYTHON_LIBRARIES}")
     set_target_properties(gpuNUFFT PROPERTIES SUFFIX ".pyd")
 
-    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${GRID_LIB_NAME} ${PYTHON_LIBRARIES})
+    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${PYTHON_LIBRARIES})
 elseif(UNIX)
-    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${GRID_LIB_NAME})
+    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES})
 endif(WIN32)
+CUDA_ADD_CUFFT_TO_TARGET(gpuNUFFT)
+CUDA_ADD_CUBLAS_TO_TARGET(gpuNUFFT)

From 9e2bd119ccf139b8544febb613932942531e536e Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 15 Nov 2021 13:56:10 +0100
Subject: [PATCH 14/85] Version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9108af3a..adde187e 100644
--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.3.0",
+    version="0.3.2",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[

From a175ef8f108d843e56912d4b0685da69a339099e Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 16 Nov 2021 18:58:00 +0100
Subject: [PATCH 15/85] Update in code, added concurrency and async copy

---
 CUDA/inc/cuda_utils.hpp                       | 30 +++++++++-
 CUDA/inc/gpuNUFFT_operator.hpp                |  3 +-
 .../gpuNUFFT_operator_python_factory.cpp      | 32 ++++++----
 CUDA/src/gpu/std_gpuNUFFT_kernels.cu          |  8 +--
 CUDA/src/gpuNUFFT_operator.cpp                | 60 +++++++++++--------
 python/test_file.py                           | 10 +---
 6 files changed, 89 insertions(+), 54 deletions(-)

diff --git a/CUDA/inc/cuda_utils.hpp b/CUDA/inc/cuda_utils.hpp
index ab26d01f..aa0f06cc 100644
--- a/CUDA/inc/cuda_utils.hpp
+++ b/CUDA/inc/cuda_utils.hpp
@@ -59,7 +59,19 @@ inline void copyToDevice(TypeName *host_ptr, TypeName *device_ptr,
   HANDLE_ERROR(cudaMemcpy(device_ptr, host_ptr, num_elements * sizeof(TypeName),
                           cudaMemcpyHostToDevice));
 }
-
+/** \brief CUDA memcpy call to copy data from host to device
+ *
+ * @param host_ptr      host data pointer
+ * @param device_ptr    device pointer
+ * @param num_elements  amount of elements of size TypeName
+ */
+template <typename TypeName>
+inline void copyToDeviceAsync(TypeName *host_ptr, TypeName *device_ptr,
+                         IndType num_elements, cudaStream_t stream=0)
+{
+  HANDLE_ERROR(cudaMemcpyAsync(device_ptr, host_ptr, num_elements * sizeof(TypeName),
+                          cudaMemcpyHostToDevice, stream));
+}
 /** \brief CUDA memory allocation and memcpy call to copy data from host to
  *device
  *
@@ -118,7 +130,19 @@ inline void copyFromDevice(TypeName *device_ptr, TypeName *host_ptr,
   HANDLE_ERROR(cudaMemcpy(host_ptr, device_ptr, num_elements * sizeof(TypeName),
                           cudaMemcpyDeviceToHost));
 }
-
+/** \brief Copy CUDA memory from device to host
+ *
+ * @param device_ptr    device pointer
+ * @param host_ptr      host pointer
+ * @param num_elements  amount of elements of size TypeName
+ */
+template <typename TypeName>
+inline void copyFromDeviceAsync(TypeName *device_ptr, TypeName *host_ptr,
+                           IndType num_elements, cudaStream_t stream=0)
+{
+  HANDLE_ERROR(cudaMemcpyAsync(host_ptr, device_ptr, num_elements * sizeof(TypeName),
+                          cudaMemcpyDeviceToHost, stream));
+}
 /** \brief Free variable list of device pointers. Use NULL as stopping element
  *
  * e.g.: freeTotalDeviceMemory(ptr1*, ptr2*,NULL);
@@ -212,7 +236,7 @@ inline void showMemoryInfo()
  *
  * @param symbol Const symbol name
  */
-void initConstSymbol(const char *symbol, const void *src, IndType count);
+void initConstSymbol(const char *symbol, const void *src, IndType count, cudaStream_t stream=0);
 
 /** \brief Initialize texture memory on device
  *
diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index a18c391a..8e582243 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -75,7 +75,6 @@ class GpuNUFFTOperator
     freeDeviceMemory();
     freeLocalMemberArray(this->kernel.data);
     freeLocalMemberArray(this->dens.data);
-    freeLocalMemberArray(this->sens.data);
     freeLocalMemberArray(this->deapo.data);
     freeLocalMemberArray(this->kSpaceTraj.data);
     freeLocalMemberArray(this->sectorCenters.data);
@@ -574,7 +573,7 @@ class GpuNUFFTOperator
 
   /** \brief Update amount of concurrently computed coils
    */
-  void updateConcurrentCoilCount(int coil_it, int n_coils, int &n_coils_cc);
+  void updateConcurrentCoilCount(int coil_it, int n_coils, int &n_coils_cc, cudaStream_t stream=0);
 
   /** \brief Compute amount of coils which can be computed at once.
    *
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 3a05cc8e..e3c8cab8 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -19,6 +19,8 @@ Carole Lazarus <carole.m.lazarus@gmail.com>
 #include <algorithm>  // std::sort
 #include <vector>     // std::vector
 #include <string>
+#include <cuda.h>
+
 
 namespace py = pybind11;
 
@@ -45,14 +47,15 @@ readNumpyArray(py::array_t<std::complex<DType>> data)
 }
 
 gpuNUFFT::Array<DType2>
-copyNumpyArray(py::array_t<std::complex<DType>> data, unsigned long alloc_size)
+copyNumpyArray(py::array_t<std::complex<DType>> data)
 {
     gpuNUFFT::Array<DType2> dataArray;
     py::buffer_info myData = data.request();
     std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
     DType2 *my_data = reinterpret_cast<DType2(&)[0]>(*t_data);
-    DType2 *copy_data = (DType2 *) malloc(alloc_size*sizeof(DType2));
-    memcpy(copy_data, my_data, alloc_size*sizeof(DType2));
+    DType2 *copy_data;
+    cudaMallocHost((void **)&copy_data, myData.size*sizeof(DType2));
+    memcpy(copy_data, my_data, myData.size*sizeof(DType2));
     dataArray.data = copy_data;
     return dataArray;
 }
@@ -103,7 +106,7 @@ class GpuNUFFTPythonOperator
         }
         else
         {
-            sensArray = copyNumpyArray(sense_maps, imgDims.count() * n_coils);
+            sensArray = copyNumpyArray(sense_maps);
             sensArray.dim = imgDims;
             sensArray.dim.channels = n_coils;
             has_sense_data = true;
@@ -117,16 +120,13 @@ class GpuNUFFTPythonOperator
 
     py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> image, bool interpolate_data=false)
     {
-        py::array_t<std::complex<DType>> out_result({n_coils, trajectory_length});
-        py::buffer_info out = out_result.request();
-        std::complex<DType> *t_data = (std::complex<DType> *) out.ptr;
-        DType2 *new_data = reinterpret_cast<DType2(&)[0]>(*t_data);
+        DType2 *new_data;
+        cudaMallocHost((void **)&new_data, n_coils*trajectory_length*sizeof(DType2));
         gpuNUFFT::Array<CufftType> dataArray;
         dataArray.data = new_data;
         dataArray.dim.length = trajectory_length;
         dataArray.dim.channels = n_coils;
-
-        gpuNUFFT::Array<DType2> imdataArray = readNumpyArray(image);
+        gpuNUFFT::Array<DType2> imdataArray = copyNumpyArray(image);
         imdataArray.dim = imgDims;
         imdataArray.dim.channels = n_coils;
         if(interpolate_data)
@@ -134,7 +134,15 @@ class GpuNUFFTPythonOperator
         else
             gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray);
         cudaThreadSynchronize();
-        return out_result;
+        return py::array_t<std::complex<DType>>(py::buffer_info(
+            new_data,                               /* Pointer to buffer */
+            sizeof(std::complex<DType>),                          /* Size of one scalar */
+            py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
+            2,                                      /* Number of dimensions */
+            { n_coils, trajectory_length },                 /* Buffer dimensions */
+            { sizeof(float) * n_coils,             /* Strides (in bytes) for each index */
+              sizeof(float) }
+        ));
     }
     py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> kspace_data, bool grid_data=false)
     {
@@ -173,7 +181,7 @@ class GpuNUFFTPythonOperator
     void set_smaps(py::array_t<std::complex<DType>> sense_maps)
     {
         free(sensArray.data);
-        sensArray = copyNumpyArray(sense_maps, imgDims.count() * n_coils);
+        sensArray = copyNumpyArray(sense_maps);
         sensArray.dim = imgDims;
         sensArray.dim.channels = n_coils;
         has_sense_data = true;
diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
index 2331c2cf..7d6956d8 100644
--- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
+++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
@@ -8,17 +8,17 @@
 #include "precomp_utils.hpp"
 #include "cuda_utils.cuh"
 
-// Method to initialize CONSTANT memory symbols. Needs to reside in *.cu file 
+// Method to initialize CONSTANT memory symbols. Needs to reside in *.cu file
 // to work properly
 //
 //
-void initConstSymbol(const char* symbol, const void* src, IndType size)
+void initConstSymbol(const char* symbol, const void* src, IndType size, cudaStream_t stream)
 {
   if (std::string("GI").compare(symbol)==0)
-    HANDLE_ERROR(cudaMemcpyToSymbol(GI, src,size));
+    HANDLE_ERROR(cudaMemcpyToSymbolAsync(GI, src, size, 0, cudaMemcpyHostToDevice, stream));
 
   if (std::string("KERNEL").compare(symbol)==0)
-    HANDLE_ERROR(cudaMemcpyToSymbol(KERNEL, src,size));
+    HANDLE_ERROR(cudaMemcpyToSymbolAsync(KERNEL, src, size, 0, cudaMemcpyHostToDevice, stream));
 }
 
 void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count)
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 03cd13f2..d07d517d 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -355,7 +355,8 @@ int gpuNUFFT::GpuNUFFTOperator::computePossibleConcurrentCoilCount(
 
 void gpuNUFFT::GpuNUFFTOperator::updateConcurrentCoilCount(int coil_it,
                                                            int n_coils,
-                                                           int &n_coils_cc)
+                                                           int &n_coils_cc,
+                                                           cudaStream_t stream)
 {
   if ((coil_it + n_coils_cc) >= n_coils)
   {
@@ -363,7 +364,7 @@ void gpuNUFFT::GpuNUFFTOperator::updateConcurrentCoilCount(int coil_it,
     n_coils_cc = n_coils - coil_it;
     // Update Gridding Info struct
     gi_host->n_coils_cc = n_coils_cc;
-    initConstSymbol("GI", gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo));
+    initConstSymbol("GI", gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo), stream);
   }
 }
 
@@ -1135,38 +1136,39 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     printf("Memory allocation: %.2f ms\n", stopTiming());
 
   int err;
-
+  cudaStream_t new_stream, old_stream;
   // iterate over coils and compute result
   for (int coil_it = 0; coil_it < n_coils; coil_it += n_coils_cc)
   {
+    cudaStreamCreate(&new_stream);
     unsigned long int data_coil_offset = (long int) coil_it * data_count;
     unsigned long int im_coil_offset = coil_it * (long int)imdata_count;
 
-    this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc);
+    this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc, new_stream);
 
     if (this->applySensData())
       // perform automatically "repeating" of input image in case
       // of existing sensitivity data
       for (int cnt = 0; cnt < n_coils_cc; cnt++)
-        copyToDevice<DType2>(imgData.data, imdata_d + cnt * imdata_count,
-                             imdata_count);
+        copyToDeviceAsync<DType2>(imgData.data, imdata_d + cnt * imdata_count,
+                             imdata_count, new_stream);
     else
-      copyToDevice<DType2>(imgData.data + im_coil_offset, imdata_d,
-                           imdata_count * n_coils_cc);
+      copyToDeviceAsync<DType2>(imgData.data + im_coil_offset, imdata_d,
+                           imdata_count * n_coils_cc, new_stream);
 
     // reset temp arrays
-    cudaMemset(gdata_d, 0,
-               sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc);
-    cudaMemset(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc);
+    cudaMemsetAsync(gdata_d, 0,
+               sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream);
+    cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream);
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 1: %s\n",
              cudaGetErrorString(cudaGetLastError()));
 
     if (this->applySensData())
     {
-      copyToDevice(this->sens.data + im_coil_offset, sens_d,
-                   imdata_count * n_coils_cc);
+      copyToDeviceAsync(this->sens.data + im_coil_offset, sens_d,
+                        imdata_count * n_coils_cc, new_stream);
       performSensMul(imdata_d, sens_d, gi_host, false);
     }
 
@@ -1174,12 +1176,12 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     performForwardDeapodization(imdata_d, deapo_d, gi_host);
 	  if(gpuNUFFTOut == DENSITY_ESTIMATION)
 	  {
-	    forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d,
+	      forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d,
                        sector_centers_d, gi_host);
         writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
                     (int)this->kSpaceTraj.count(), n_coils_cc);
-        copyFromDevice(data_sorted_d, kspaceData.data + data_coil_offset,
-                   data_count * n_coils_cc);
+        copyFromDeviceAsync(data_sorted_d, kspaceData.data + data_coil_offset,
+                   data_count * n_coils_cc, new_stream);
         if ((coil_it + n_coils_cc) < (n_coils))
             continue;
         freeTotalDeviceMemory(data_d, imdata_d, NULL);
@@ -1195,13 +1197,13 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     if (debugTiming)
       startTiming();
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 3: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     // shift image to get correct zero frequency position
     performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host);
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 4: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     // eventually free imdata_d
@@ -1219,12 +1221,12 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       c++;
     }
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 5: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     performFFTShift(gdata_d, FORWARD, getGridDims(), gi_host);
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 6: %s\n",
              cudaGetErrorString(cudaGetLastError()));
 
@@ -1237,7 +1239,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     // convolution and resampling to non-standard trajectory
     forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d,
                        sector_centers_d, gi_host);
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 7: %s\n",
              cudaGetErrorString(cudaGetLastError()));
 
@@ -1245,16 +1247,21 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       printf("Forward Convolution: %.2f ms\n", stopTiming());
 
     performFFTScaling(data_d, gi_host->data_count, gi_host);
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error: at thread synchronization 8: %s\n",
              cudaGetErrorString(cudaGetLastError()));
 
     // write result in correct order back into output array
     writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
                     (int)this->kSpaceTraj.count(), n_coils_cc);
-    
-    copyFromDevice(data_sorted_d, kspaceData.data + data_coil_offset,
-                   data_count * n_coils_cc);
+    if(coil_it > 1)
+    {
+      cudaStreamSynchronize(old_stream);
+      cudaStreamDestroy(old_stream);
+    }
+    copyFromDeviceAsync(data_sorted_d, kspaceData.data + data_coil_offset,
+                        data_count * n_coils_cc, new_stream);
+    old_stream = new_stream;
   }  // iterate over coils
 
   freeTotalDeviceMemory(data_d, imdata_d, NULL);
@@ -1263,6 +1270,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
   if ((cudaDeviceSynchronize() != cudaSuccess))
     fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n",
             cudaGetErrorString(cudaGetLastError()));
+  cudaStreamDestroy(old_stream);
 }
 
 gpuNUFFT::Array<CufftType>
diff --git a/python/test_file.py b/python/test_file.py
index 5aece6b3..538770aa 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -1,13 +1,9 @@
 import numpy as np
 from mri.operators import NonCartesianFFT
-from mri.operators.fourier.utils import estimate_density_compensation
-
 
 traj = np.load('/volatile/temp_traj.npy')
-for i in range(5):
+
+for i in range(1):
     print(i)
-    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=10, smaps=np.ones((10, 384, 384, 208)))
-    fourier.impl.operator.set_smaps(np.ones((10, 384, 384, 208))+1)
+    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=4, smaps=np.ones((4, 384, 384, 208)), osf=1)
     K = fourier.op(np.zeros((384, 384, 208)))
-    im = fourier.adj_op(K)
-    del fourier

From 91d8129c3924cddfd9682054a208f13001033749 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 17 Nov 2021 09:26:30 +0100
Subject: [PATCH 16/85] Update in code, added concurrency and async copy

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index e3c8cab8..ed7be37c 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -133,7 +133,8 @@ class GpuNUFFTPythonOperator
             gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray, gpuNUFFT::DENSITY_ESTIMATION);
         else
             gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray);
-        cudaThreadSynchronize();
+        cudaDeviceSynchronize();
+        free(imdataArray.data);
         return py::array_t<std::complex<DType>>(py::buffer_info(
             new_data,                               /* Pointer to buffer */
             sizeof(std::complex<DType>),                          /* Size of one scalar */
@@ -171,7 +172,7 @@ class GpuNUFFTPythonOperator
             gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray, gpuNUFFT::DENSITY_ESTIMATION);
         else
             gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray);
-        cudaThreadSynchronize();
+        cudaDeviceSynchronize();
         return out_result;
     }
     void clean_memory()

From b718f007a812cefd32bccedda4bbb6a45abe7304 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 17 Nov 2021 12:57:18 +0100
Subject: [PATCH 17/85] Concurrency working code

---
 CUDA/inc/gpuNUFFT_operator.hpp                |  2 +-
 CUDA/inc/gpuNUFFT_operator_factory.hpp        |  2 +-
 .../gpuNUFFT_operator_python_factory.cpp      | 81 ++++++++++++++++---
 CUDA/src/gpuNUFFT_operator.cpp                |  6 +-
 CUDA/src/gpuNUFFT_operator_factory.cpp        | 11 +--
 5 files changed, 81 insertions(+), 21 deletions(-)

diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index 8e582243..2a9c0df3 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -365,7 +365,7 @@ class GpuNUFFTOperator
    void freeLocalMemberArray(T* dataPointer)
    {
      if (dataPointer != NULL) {
-       free(dataPointer);
+       cudaFree(dataPointer);
        dataPointer = NULL;
      }
    }
diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp
index cac4ad88..9e33d8d0 100644
--- a/CUDA/inc/gpuNUFFT_operator_factory.hpp
+++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp
@@ -180,7 +180,7 @@ class GpuNUFFTOperatorFactory
    void freeLocalMemberArray(T* dataPointer)
    {
      if (dataPointer != NULL) {
-       free(dataPointer);
+       cudaFree(dataPointer);
        dataPointer = NULL;
      }
    }
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index ed7be37c..58889890 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -59,6 +59,19 @@ copyNumpyArray(py::array_t<std::complex<DType>> data)
     dataArray.data = copy_data;
     return dataArray;
 }
+template <typename TType>
+gpuNUFFT::Array<TType>
+copyNumpyArray(py::array_t<std::complex<DType>> data)
+{
+    gpuNUFFT::Array<DType2> dataArray;
+    py::buffer_info myData = data.request();
+    TType *my_data = (TType *) myData.ptr;
+    DType2 *copy_data;
+    cudaMallocHost((void **)&copy_data, myData.size*sizeof(TType));
+    memcpy(copy_data, my_data, myData.size*sizeof(TType));
+    dataArray.data = copy_data;
+    return dataArray;
+}
 
 class GpuNUFFTPythonOperator
 {
@@ -115,7 +128,7 @@ class GpuNUFFTPythonOperator
         gpuNUFFTOp = factory.createGpuNUFFTOperator(
             kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width,
             osr, imgDims);
-        cudaThreadSynchronize();
+        cudaDeviceSynchronize();
     }
 
     py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> image, bool interpolate_data=false)
@@ -126,6 +139,7 @@ class GpuNUFFTPythonOperator
         dataArray.data = new_data;
         dataArray.dim.length = trajectory_length;
         dataArray.dim.channels = n_coils;
+        // Copy array to pinned memory for better memory bandwidths!
         gpuNUFFT::Array<DType2> imdataArray = copyNumpyArray(image);
         imdataArray.dim = imgDims;
         imdataArray.dim.channels = n_coils;
@@ -134,15 +148,19 @@ class GpuNUFFTPythonOperator
         else
             gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray);
         cudaDeviceSynchronize();
-        free(imdataArray.data);
+      // Free the Copied array
+        cudaFree(imdataArray.data);
+        imdataArray.data = NULL;
         return py::array_t<std::complex<DType>>(py::buffer_info(
             new_data,                               /* Pointer to buffer */
             sizeof(std::complex<DType>),                          /* Size of one scalar */
             py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
             2,                                      /* Number of dimensions */
             { n_coils, trajectory_length },                 /* Buffer dimensions */
-            { sizeof(float) * n_coils,             /* Strides (in bytes) for each index */
-              sizeof(float) }
+            {
+                sizeof(DType2) * trajectory_length,             /* Strides (in bytes) for each index */
+                sizeof(DType2)
+            }
         ));
     }
     py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> kspace_data, bool grid_data=false)
@@ -152,20 +170,18 @@ class GpuNUFFTPythonOperator
             myDims = myDims * gpuNUFFTOp->getOsf();
         if(dimension==2)
             myDims.depth = 1;
-        py::array_t<std::complex<DType>> out_result;
+        DType2 *t_data;
         if(has_sense_data == false)
-            out_result.resize({n_coils, (int)myDims.depth, (int)myDims.height, (int)myDims.width});
+            cudaMallocHost((void **)&t_data, n_coils*(int)myDims.depth*(int)myDims.height*(int)myDims.width*sizeof(DType2));
         else
-            out_result.resize({(int)myDims.depth, (int)myDims.height, (int)myDims.width});
-        py::buffer_info out = out_result.request();
-        std::complex<DType> *t_data = (std::complex<DType> *) out.ptr;
+            cudaMallocHost((void **)&t_data, (int)myDims.depth*(int)myDims.height*(int)myDims.width*sizeof(DType2));
         DType2 *new_data = reinterpret_cast<DType2(&)[0]>(*t_data);
         gpuNUFFT::Array<DType2> imdataArray;
         imdataArray.data = new_data;
         imdataArray.dim = myDims;
         if(has_sense_data == false)
             imdataArray.dim.channels = n_coils;
-        gpuNUFFT::Array<CufftType> dataArray = readNumpyArray(kspace_data);
+        gpuNUFFT::Array<CufftType> dataArray = copyNumpyArray(kspace_data);
         dataArray.dim.length = trajectory_length;
         dataArray.dim.channels = n_coils;
         if(grid_data)
@@ -173,7 +189,46 @@ class GpuNUFFTPythonOperator
         else
             gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray);
         cudaDeviceSynchronize();
-        return out_result;
+        // Free the Copied array
+        cudaFree(dataArray.data);
+        dataArray.data = NULL;
+        if(has_sense_data == false)
+          return py::array_t<std::complex<DType>>(py::buffer_info(
+            new_data,                               /* Pointer to buffer */
+            sizeof(std::complex<DType>),                          /* Size of one scalar */
+            py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
+            4,                                                                                    /* Number of dimensions */
+            {
+                n_coils,
+                (int)myDims.depth,
+                (int)myDims.height,
+                (int)myDims.width
+            }, /* Buffer dimensions */
+            {
+                sizeof(DType2) * (int)myDims.depth * (int)myDims.height * (int)myDims.width,
+                sizeof(DType2) * (int)myDims.height * (int)myDims.width,
+                sizeof(DType2) * (int)myDims.width,
+                sizeof(DType2),
+            }
+          ));
+        else
+          return py::array_t<std::complex<DType>>(py::buffer_info(
+            new_data,                               /* Pointer to buffer */
+            sizeof(std::complex<DType>),                          /* Size of one scalar */
+            py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
+            3,                                                                                    /* Number of dimensions */
+            {
+                (int)myDims.depth,
+                (int)myDims.height,
+                (int)myDims.width
+            }, /* Buffer dimensions */
+            {
+                sizeof(DType2) * (int)myDims.height * (int)myDims.width,
+                sizeof(DType2) * (int)myDims.width,
+                sizeof(DType2),
+            }
+          ));
+
     }
     void clean_memory()
     {
@@ -181,6 +236,10 @@ class GpuNUFFTPythonOperator
     }
     void set_smaps(py::array_t<std::complex<DType>> sense_maps)
     {
+        py::buffer_info myData = sense_maps.request();
+        std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
+        DType2 *my_data = reinterpret_cast<DType2(&)[0]>(*t_data);
+        memcpy(sensArray.data, my_data, myData.size*sizeof(DType2));
         free(sensArray.data);
         sensArray = copyNumpyArray(sense_maps);
         sensArray.dim = imgDims;
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index d07d517d..27845579 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -53,8 +53,8 @@ void gpuNUFFT::GpuNUFFTOperator::initKernel()
 gpuNUFFT::GpuNUFFTInfo *
 gpuNUFFT::GpuNUFFTOperator::initGpuNUFFTInfo(int n_coils_cc)
 {
-  gpuNUFFT::GpuNUFFTInfo *gi_host =
-      (gpuNUFFT::GpuNUFFTInfo *)malloc(sizeof(gpuNUFFT::GpuNUFFTInfo));
+  gpuNUFFT::GpuNUFFTInfo *gi_host;
+  cudaMallocHost((void **)&gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo));
 
   gi_host->data_count = (int)this->kSpaceTraj.count();
   gi_host->sector_count = (int)this->gridSectorDims.count();
@@ -300,7 +300,7 @@ void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory()
   if (!gpuMemAllocated)
     return;
 
-  free(gi_host);
+  cudaFree(gi_host);
   cufftDestroy(fft_plan);
   // Destroy the cuFFT plan.
   if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp
index 8d3a24a9..79753844 100644
--- a/CUDA/src/gpuNUFFT_operator_factory.cpp
+++ b/CUDA/src/gpuNUFFT_operator_factory.cpp
@@ -31,7 +31,7 @@ gpuNUFFT::Array<T>
 gpuNUFFT::GpuNUFFTOperatorFactory::initLinArray(IndType arrCount)
 {
   gpuNUFFT::Array<T> new_array;
-  new_array.data = (T *)malloc(arrCount * sizeof(T));
+  cudaMallocHost((void **)&new_array.data, arrCount * sizeof(T));
   new_array.dim.length = arrCount;
   return new_array;
 }
@@ -132,7 +132,7 @@ gpuNUFFT::Array<IndType> gpuNUFFT::GpuNUFFTOperatorFactory::assignSectors(
 
   // create temporary array to store assigned values
   gpuNUFFT::Array<IndType> assignedSectors;
-  assignedSectors.data = (IndType *)malloc(coordCnt * sizeof(IndType));
+  cudaMallocHost((void **) &assignedSectors.data, coordCnt * sizeof(IndType));
   assignedSectors.dim.length = coordCnt;
 
   if (useGpu)
@@ -415,7 +415,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
 
   // cleanup locally initialized arrays here
   free(dataArray.data);
-  free(assignedSectors.data);
+  cudaFree(assignedSectors.data);
 
   // Compute abs values of deapo function and compensate
   // FFT scaling sqrt(N)
@@ -438,7 +438,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
 
   // cleanup
   delete deapoGpuNUFFTOp;
-  free(deapoFunction.data);
+  cudaFree(deapoFunction.data);
   return deapoAbs;
 }
 
@@ -536,7 +536,8 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator(
     gpuNUFFTOp->setSectorCenters(computeSectorCenters2D(gpuNUFFTOp));
 
   // free temporary array
-  freeLocalMemberArray(assignedSectors.data);
+  cudaFree(assignedSectors.data);
+  assignedSectors.data = NULL;
 
   gpuNUFFTOp->setDeapodizationFunction(
     this->computeDeapodizationFunction(kernelWidth, osf, imgDims));

From 27abe64fc6c1ebbad148fda8bb8e52f168d27e19 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 17 Nov 2021 13:10:26 +0100
Subject: [PATCH 18/85] Fix leaks

---
 CUDA/inc/gpuNUFFT_operator.hpp                         |  2 +-
 CUDA/inc/gpuNUFFT_operator_factory.hpp                 |  2 +-
 .../gpu/python/gpuNUFFT_operator_python_factory.cpp    | 10 +++-------
 CUDA/src/gpuNUFFT_operator.cpp                         |  2 +-
 CUDA/src/gpuNUFFT_operator_factory.cpp                 |  6 +++---
 python/test_file.py                                    |  4 +++-
 6 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index 2a9c0df3..cb327e6f 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -365,7 +365,7 @@ class GpuNUFFTOperator
    void freeLocalMemberArray(T* dataPointer)
    {
      if (dataPointer != NULL) {
-       cudaFree(dataPointer);
+       cudaFreeHost(dataPointer);
        dataPointer = NULL;
      }
    }
diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp
index 9e33d8d0..3e1b7a2a 100644
--- a/CUDA/inc/gpuNUFFT_operator_factory.hpp
+++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp
@@ -180,7 +180,7 @@ class GpuNUFFTOperatorFactory
    void freeLocalMemberArray(T* dataPointer)
    {
      if (dataPointer != NULL) {
-       cudaFree(dataPointer);
+       cudaFreeHost(dataPointer);
        dataPointer = NULL;
      }
    }
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 58889890..3e851ca5 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -148,8 +148,8 @@ class GpuNUFFTPythonOperator
         else
             gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray);
         cudaDeviceSynchronize();
-      // Free the Copied array
-        cudaFree(imdataArray.data);
+        // Free the Copied array
+        cudaFreeHost(imdataArray.data);
         imdataArray.data = NULL;
         return py::array_t<std::complex<DType>>(py::buffer_info(
             new_data,                               /* Pointer to buffer */
@@ -190,7 +190,7 @@ class GpuNUFFTPythonOperator
             gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray);
         cudaDeviceSynchronize();
         // Free the Copied array
-        cudaFree(dataArray.data);
+        cudaFreeHost(dataArray.data);
         dataArray.data = NULL;
         if(has_sense_data == false)
           return py::array_t<std::complex<DType>>(py::buffer_info(
@@ -240,10 +240,6 @@ class GpuNUFFTPythonOperator
         std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
         DType2 *my_data = reinterpret_cast<DType2(&)[0]>(*t_data);
         memcpy(sensArray.data, my_data, myData.size*sizeof(DType2));
-        free(sensArray.data);
-        sensArray = copyNumpyArray(sense_maps);
-        sensArray.dim = imgDims;
-        sensArray.dim.channels = n_coils;
         has_sense_data = true;
         gpuNUFFTOp->setSens(sensArray);
     }
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 27845579..770e6333 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -300,7 +300,7 @@ void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory()
   if (!gpuMemAllocated)
     return;
 
-  cudaFree(gi_host);
+  cudaFreeHost(gi_host);
   cufftDestroy(fft_plan);
   // Destroy the cuFFT plan.
   if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp
index 79753844..bb437bd9 100644
--- a/CUDA/src/gpuNUFFT_operator_factory.cpp
+++ b/CUDA/src/gpuNUFFT_operator_factory.cpp
@@ -415,7 +415,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
 
   // cleanup locally initialized arrays here
   free(dataArray.data);
-  cudaFree(assignedSectors.data);
+  cudaFreeHost(assignedSectors.data);
 
   // Compute abs values of deapo function and compensate
   // FFT scaling sqrt(N)
@@ -438,7 +438,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
 
   // cleanup
   delete deapoGpuNUFFTOp;
-  cudaFree(deapoFunction.data);
+  cudaFreeHost(deapoFunction.data);
   return deapoAbs;
 }
 
@@ -536,7 +536,7 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator(
     gpuNUFFTOp->setSectorCenters(computeSectorCenters2D(gpuNUFFTOp));
 
   // free temporary array
-  cudaFree(assignedSectors.data);
+  cudaFreeHost(assignedSectors.data);
   assignedSectors.data = NULL;
 
   gpuNUFFTOp->setDeapodizationFunction(
diff --git a/python/test_file.py b/python/test_file.py
index 538770aa..ae547f27 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -3,7 +3,9 @@
 
 traj = np.load('/volatile/temp_traj.npy')
 
-for i in range(1):
+for i in range(3):
     print(i)
     fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=4, smaps=np.ones((4, 384, 384, 208)), osf=1)
     K = fourier.op(np.zeros((384, 384, 208)))
+    I = fourier.adj_op(K)
+    del fourier
\ No newline at end of file

From 8dd394ac43baae8947f50f40c4dc4309ac37feb9 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Thu, 18 Nov 2021 10:52:01 +0100
Subject: [PATCH 19/85] Do a single cudaMalloc

---
 .../gpuNUFFT_operator_python_factory.cpp      | 121 +++++++-----------
 python/test_file.py                           |   4 +-
 2 files changed, 48 insertions(+), 77 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 3e851ca5..0ca842cc 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -46,31 +46,19 @@ readNumpyArray(py::array_t<std::complex<DType>> data)
     return dataArray;
 }
 
-gpuNUFFT::Array<DType2>
-copyNumpyArray(py::array_t<std::complex<DType>> data)
+void allocate_pinned_memory(gpuNUFFT::Array<DType2> *lin_array, unsigned long int size)
 {
-    gpuNUFFT::Array<DType2> dataArray;
-    py::buffer_info myData = data.request();
-    std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
-    DType2 *my_data = reinterpret_cast<DType2(&)[0]>(*t_data);
-    DType2 *copy_data;
-    cudaMallocHost((void **)&copy_data, myData.size*sizeof(DType2));
-    memcpy(copy_data, my_data, myData.size*sizeof(DType2));
-    dataArray.data = copy_data;
-    return dataArray;
+  DType2 *new_data;
+  cudaMallocHost((void **)&new_data, size);
+  lin_array->data = new_data;
 }
 template <typename TType>
-gpuNUFFT::Array<TType>
-copyNumpyArray(py::array_t<std::complex<DType>> data)
+void copyNumpyArray(py::array_t<std::complex<DType>> data, TType *copy_data)
 {
-    gpuNUFFT::Array<DType2> dataArray;
     py::buffer_info myData = data.request();
-    TType *my_data = (TType *) myData.ptr;
-    DType2 *copy_data;
-    cudaMallocHost((void **)&copy_data, myData.size*sizeof(TType));
+    std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
+    TType *my_data = reinterpret_cast<TType(&)[0]>(*t_data);
     memcpy(copy_data, my_data, myData.size*sizeof(TType));
-    dataArray.data = copy_data;
-    return dataArray;
 }
 
 class GpuNUFFTPythonOperator
@@ -81,7 +69,7 @@ class GpuNUFFTPythonOperator
     bool has_sense_data;
     gpuNUFFT::Dimensions imgDims;
     // sensitivity maps
-    gpuNUFFT::Array<DType2> sensArray;
+    gpuNUFFT::Array<DType2> sensArray, kspace_data, image;
     public:
     GpuNUFFTPythonOperator(py::array_t<DType> kspace_loc, py::array_t<int> image_size, int num_coils,
     py::array_t<std::complex<DType>> sense_maps,  py::array_t<float> density_comp, int kernel_width=3,
@@ -119,40 +107,44 @@ class GpuNUFFTPythonOperator
         }
         else
         {
-            sensArray = copyNumpyArray(sense_maps);
+            allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2));
             sensArray.dim = imgDims;
             sensArray.dim.channels = n_coils;
+            copyNumpyArray(sense_maps, sensArray.data);
             has_sense_data = true;
         }
         factory.setBalanceWorkload(balance_workload);
         gpuNUFFTOp = factory.createGpuNUFFTOperator(
             kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width,
             osr, imgDims);
+        allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2));
+        kspace_data.dim.length = trajectory_length;
+        kspace_data.dim.channels = n_coils;
+        image.dim = imgDims;
+        if(has_sense_data == false)
+        {
+          allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2));
+          image.dim.channels = n_coils;
+        }
+        else
+        {
+          allocate_pinned_memory(&image, imgDims.count() * sizeof(DType2));
+          image.dim.channels = 1;
+        }
         cudaDeviceSynchronize();
     }
 
-    py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> image, bool interpolate_data=false)
+    py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> input_image, bool interpolate_data=false)
     {
-        DType2 *new_data;
-        cudaMallocHost((void **)&new_data, n_coils*trajectory_length*sizeof(DType2));
-        gpuNUFFT::Array<CufftType> dataArray;
-        dataArray.data = new_data;
-        dataArray.dim.length = trajectory_length;
-        dataArray.dim.channels = n_coils;
         // Copy array to pinned memory for better memory bandwidths!
-        gpuNUFFT::Array<DType2> imdataArray = copyNumpyArray(image);
-        imdataArray.dim = imgDims;
-        imdataArray.dim.channels = n_coils;
+        copyNumpyArray(input_image, image.data);
         if(interpolate_data)
-            gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray, gpuNUFFT::DENSITY_ESTIMATION);
+            gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
         else
-            gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray);
+            gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data);
         cudaDeviceSynchronize();
-        // Free the Copied array
-        cudaFreeHost(imdataArray.data);
-        imdataArray.data = NULL;
         return py::array_t<std::complex<DType>>(py::buffer_info(
-            new_data,                               /* Pointer to buffer */
+            kspace_data.data,                               /* Pointer to buffer */
             sizeof(std::complex<DType>),                          /* Size of one scalar */
             py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
             2,                                      /* Number of dimensions */
@@ -163,68 +155,47 @@ class GpuNUFFTPythonOperator
             }
         ));
     }
-    py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> kspace_data, bool grid_data=false)
+    py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> input_kspace_data, bool grid_data=false)
     {
-        gpuNUFFT::Dimensions myDims = imgDims;
-        if(grid_data)
-            myDims = myDims * gpuNUFFTOp->getOsf();
-        if(dimension==2)
-            myDims.depth = 1;
-        DType2 *t_data;
-        if(has_sense_data == false)
-            cudaMallocHost((void **)&t_data, n_coils*(int)myDims.depth*(int)myDims.height*(int)myDims.width*sizeof(DType2));
-        else
-            cudaMallocHost((void **)&t_data, (int)myDims.depth*(int)myDims.height*(int)myDims.width*sizeof(DType2));
-        DType2 *new_data = reinterpret_cast<DType2(&)[0]>(*t_data);
-        gpuNUFFT::Array<DType2> imdataArray;
-        imdataArray.data = new_data;
-        imdataArray.dim = myDims;
-        if(has_sense_data == false)
-            imdataArray.dim.channels = n_coils;
-        gpuNUFFT::Array<CufftType> dataArray = copyNumpyArray(kspace_data);
-        dataArray.dim.length = trajectory_length;
-        dataArray.dim.channels = n_coils;
+        copyNumpyArray(input_kspace_data, kspace_data.data);
         if(grid_data)
-            gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray, gpuNUFFT::DENSITY_ESTIMATION);
+            gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION);
         else
-            gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray);
+            gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image);
         cudaDeviceSynchronize();
-        // Free the Copied array
-        cudaFreeHost(dataArray.data);
-        dataArray.data = NULL;
         if(has_sense_data == false)
           return py::array_t<std::complex<DType>>(py::buffer_info(
-            new_data,                               /* Pointer to buffer */
+            image.data,                               /* Pointer to buffer */
             sizeof(std::complex<DType>),                          /* Size of one scalar */
             py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
             4,                                                                                    /* Number of dimensions */
             {
                 n_coils,
-                (int)myDims.depth,
-                (int)myDims.height,
-                (int)myDims.width
+                (int)image.dim.depth,
+                (int)image.dim.height,
+                (int)image.dim.width
             }, /* Buffer dimensions */
             {
-                sizeof(DType2) * (int)myDims.depth * (int)myDims.height * (int)myDims.width,
-                sizeof(DType2) * (int)myDims.height * (int)myDims.width,
-                sizeof(DType2) * (int)myDims.width,
+                sizeof(DType2) * (int)image.dim.depth * (int)image.dim.height * (int)image.dim.width,
+                sizeof(DType2) * (int)image.dim.height * (int)image.dim.width,
+                sizeof(DType2) * (int)image.dim.width,
                 sizeof(DType2),
             }
           ));
         else
           return py::array_t<std::complex<DType>>(py::buffer_info(
-            new_data,                               /* Pointer to buffer */
+            image.data,                               /* Pointer to buffer */
             sizeof(std::complex<DType>),                          /* Size of one scalar */
             py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
             3,                                                                                    /* Number of dimensions */
             {
-                (int)myDims.depth,
-                (int)myDims.height,
-                (int)myDims.width
+                (int)image.dim.depth,
+                (int)image.dim.height,
+                (int)image.dim.width
             }, /* Buffer dimensions */
             {
-                sizeof(DType2) * (int)myDims.height * (int)myDims.width,
-                sizeof(DType2) * (int)myDims.width,
+                sizeof(DType2) * (int)image.dim.height * (int)image.dim.width,
+                sizeof(DType2) * (int)image.dim.width,
                 sizeof(DType2),
             }
           ));
diff --git a/python/test_file.py b/python/test_file.py
index ae547f27..f1f26a5a 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -3,9 +3,9 @@
 
 traj = np.load('/volatile/temp_traj.npy')
 
+fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=2, smaps=np.ones((2, 384, 384, 208)), osf=1)
+
 for i in range(3):
     print(i)
-    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=4, smaps=np.ones((4, 384, 384, 208)), osf=1)
     K = fourier.op(np.zeros((384, 384, 208)))
     I = fourier.adj_op(K)
-    del fourier
\ No newline at end of file

From 07d0cab1431c2064afe59a81ef5f019c8390fa46 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Thu, 18 Nov 2021 14:46:19 +0100
Subject: [PATCH 20/85] New test

---
 python/test_file.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/python/test_file.py b/python/test_file.py
index 5aece6b3..e6980cc9 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -1,13 +1,11 @@
 import numpy as np
 from mri.operators import NonCartesianFFT
-from mri.operators.fourier.utils import estimate_density_compensation
-
 
 traj = np.load('/volatile/temp_traj.npy')
-for i in range(5):
+
+fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=2)
+
+for i in range(10):
     print(i)
-    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=10, smaps=np.ones((10, 384, 384, 208)))
-    fourier.impl.operator.set_smaps(np.ones((10, 384, 384, 208))+1)
     K = fourier.op(np.zeros((384, 384, 208)))
-    im = fourier.adj_op(K)
-    del fourier
+    I = fourier.adj_op(K)

From 15db29dffefc8f3c6b686a2746ac84c93bb6aa27 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Thu, 18 Nov 2021 18:03:19 +0100
Subject: [PATCH 21/85] Fixed all issues, no copies

---
 .../gpuNUFFT_operator_python_factory.cpp      | 58 +++++++++----------
 python/test_file.py                           |  6 +-
 2 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 0ca842cc..5768a9be 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -137,68 +137,66 @@ class GpuNUFFTPythonOperator
     py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> input_image, bool interpolate_data=false)
     {
         // Copy array to pinned memory for better memory bandwidths!
-        copyNumpyArray(input_image, image.data);
+        //copyNumpyArray(input_image, image.data);
         if(interpolate_data)
             gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
         else
             gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data);
         cudaDeviceSynchronize();
-        return py::array_t<std::complex<DType>>(py::buffer_info(
-            kspace_data.data,                               /* Pointer to buffer */
-            sizeof(std::complex<DType>),                          /* Size of one scalar */
-            py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
-            2,                                      /* Number of dimensions */
-            { n_coils, trajectory_length },                 /* Buffer dimensions */
+        std::complex<DType> *ptr = reinterpret_cast<std::complex<DType>(&)[0]>(*kspace_data.data);
+        auto capsule = py::capsule(ptr, [](void *ptr) { return; });
+        return py::array_t<std::complex<DType>>(
+            { n_coils, trajectory_length },
             {
-                sizeof(DType2) * trajectory_length,             /* Strides (in bytes) for each index */
+                sizeof(DType2) * trajectory_length,
                 sizeof(DType2)
-            }
-        ));
+            },
+            ptr,
+            capsule
+        );
     }
     py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> input_kspace_data, bool grid_data=false)
     {
-        copyNumpyArray(input_kspace_data, kspace_data.data);
+        //copyNumpyArray(input_kspace_data, kspace_data.data);
         if(grid_data)
             gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION);
         else
             gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image);
         cudaDeviceSynchronize();
+        std::complex<DType> *ptr = reinterpret_cast<std::complex<DType>(&)[0]>(*image.data);
+        auto capsule = py::capsule(ptr, [](void *ptr) { return; });
         if(has_sense_data == false)
-          return py::array_t<std::complex<DType>>(py::buffer_info(
-            image.data,                               /* Pointer to buffer */
-            sizeof(std::complex<DType>),                          /* Size of one scalar */
-            py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
-            4,                                                                                    /* Number of dimensions */
+          return py::array_t<std::complex<DType>>(
             {
                 n_coils,
                 (int)image.dim.depth,
                 (int)image.dim.height,
                 (int)image.dim.width
-            }, /* Buffer dimensions */
+            },
             {
                 sizeof(DType2) * (int)image.dim.depth * (int)image.dim.height * (int)image.dim.width,
                 sizeof(DType2) * (int)image.dim.height * (int)image.dim.width,
                 sizeof(DType2) * (int)image.dim.width,
                 sizeof(DType2),
-            }
-          ));
+            },
+            ptr,
+            capsule
+          );
         else
-          return py::array_t<std::complex<DType>>(py::buffer_info(
-            image.data,                               /* Pointer to buffer */
-            sizeof(std::complex<DType>),                          /* Size of one scalar */
-            py::format_descriptor<std::complex<DType>>::format(), /* Python struct-style format descriptor */
-            3,                                                                                    /* Number of dimensions */
+          return py::array_t<std::complex<DType>>(
             {
                 (int)image.dim.depth,
                 (int)image.dim.height,
                 (int)image.dim.width
-            }, /* Buffer dimensions */
+            },
             {
                 sizeof(DType2) * (int)image.dim.height * (int)image.dim.width,
                 sizeof(DType2) * (int)image.dim.width,
                 sizeof(DType2),
-            }
-          ));
+            },
+            ptr,
+            capsule
+      );
 
     }
     void clean_memory()
@@ -216,14 +214,16 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
+        cudaFree(kspace_data.data);
+        cudaFree(image.data);
         delete gpuNUFFTOp;
     }
 };
 PYBIND11_MODULE(gpuNUFFT, m) {
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
         .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool>())
-        .def("op", &GpuNUFFTPythonOperator::op)
-        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op)
+        .def("op", &GpuNUFFTPythonOperator::op, py::return_value_policy::reference)
+        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::return_value_policy::reference)
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
 }
diff --git a/python/test_file.py b/python/test_file.py
index f1f26a5a..7e45fcfd 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -3,9 +3,9 @@
 
 traj = np.load('/volatile/temp_traj.npy')
 
-fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=2, smaps=np.ones((2, 384, 384, 208)), osf=1)
+fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=1)
 
-for i in range(3):
+for i in range(10):
     print(i)
     K = fourier.op(np.zeros((384, 384, 208)))
-    I = fourier.adj_op(K)
+    I = fourier.adj_op(K)
\ No newline at end of file

From 99d03adf1d81b7cd303d396aac79b89214729ae8 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Fri, 19 Nov 2021 13:21:55 +0100
Subject: [PATCH 22/85] Fix leaks

---
 CUDA/inc/gpuNUFFT_operator.hpp                 |  1 +
 .../gpuNUFFT_operator_python_factory.cpp       | 18 ++++++++++--------
 python/test_file.py                            |  5 +++--
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index cb327e6f..4eb94478 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -75,6 +75,7 @@ class GpuNUFFTOperator
     freeDeviceMemory();
     freeLocalMemberArray(this->kernel.data);
     freeLocalMemberArray(this->dens.data);
+    freeLocalMemberArray(this->sens.data);
     freeLocalMemberArray(this->deapo.data);
     freeLocalMemberArray(this->kSpaceTraj.data);
     freeLocalMemberArray(this->sectorCenters.data);
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 5768a9be..d017434e 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -137,14 +137,15 @@ class GpuNUFFTPythonOperator
     py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> input_image, bool interpolate_data=false)
     {
         // Copy array to pinned memory for better memory bandwidths!
-        //copyNumpyArray(input_image, image.data);
+        copyNumpyArray(input_image, image.data);
         if(interpolate_data)
             gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
         else
             gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data);
         cudaDeviceSynchronize();
         std::complex<DType> *ptr = reinterpret_cast<std::complex<DType>(&)[0]>(*kspace_data.data);
-        auto capsule = py::capsule(ptr, [](void *ptr) { return; });
+        auto capsule = py::capsule(ptr, [](void *ptr) { return;
+        });
         return py::array_t<std::complex<DType>>(
             { n_coils, trajectory_length },
             {
@@ -157,14 +158,15 @@ class GpuNUFFTPythonOperator
     }
     py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> input_kspace_data, bool grid_data=false)
     {
-        //copyNumpyArray(input_kspace_data, kspace_data.data);
+        copyNumpyArray(input_kspace_data, kspace_data.data);
         if(grid_data)
             gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION);
         else
             gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image);
         cudaDeviceSynchronize();
         std::complex<DType> *ptr = reinterpret_cast<std::complex<DType>(&)[0]>(*image.data);
-        auto capsule = py::capsule(ptr, [](void *ptr) { return; });
+        auto capsule = py::capsule(ptr, [](void *ptr) { return;
+        });
         if(has_sense_data == false)
           return py::array_t<std::complex<DType>>(
             {
@@ -214,16 +216,16 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
-        cudaFree(kspace_data.data);
-        cudaFree(image.data);
+        cudaFreeHost(kspace_data.data);
+        cudaFreeHost(image.data);
         delete gpuNUFFTOp;
     }
 };
 PYBIND11_MODULE(gpuNUFFT, m) {
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
         .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool>())
-        .def("op", &GpuNUFFTPythonOperator::op, py::return_value_policy::reference)
-        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::return_value_policy::reference)
+        .def("op", &GpuNUFFTPythonOperator::op)
+        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op)
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
 }
diff --git a/python/test_file.py b/python/test_file.py
index 7e45fcfd..83dc2a98 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -3,9 +3,10 @@
 
 traj = np.load('/volatile/temp_traj.npy')
 
-fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=1)
 
 for i in range(10):
+    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=2, smaps=np.ones((2, 384, 384, 208)), osf=1)
     print(i)
     K = fourier.op(np.zeros((384, 384, 208)))
-    I = fourier.adj_op(K)
\ No newline at end of file
+    I = fourier.adj_op(K)
+    del fourier
\ No newline at end of file

From 8d8f24885ed9921bf2478da2496270f4eb4b2b96 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Fri, 19 Nov 2021 16:03:38 +0100
Subject: [PATCH 23/85] Fix mem leaks

---
 CUDA/src/gpuNUFFT_operator_factory.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp
index bb437bd9..9fbae2e9 100644
--- a/CUDA/src/gpuNUFFT_operator_factory.cpp
+++ b/CUDA/src/gpuNUFFT_operator_factory.cpp
@@ -368,7 +368,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
   
   // Data
   gpuNUFFT::Array<DType2> dataArray;
-  dataArray.data = (DType2*)calloc(1, sizeof(DType2)); // re + im
+  cudaMallocHost((void **) &dataArray.data, sizeof(DType2));
   dataArray.dim.length = 1;
   dataArray.data[0].x = 1;
   dataArray.data[0].y = 0;
@@ -377,9 +377,9 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
   // should result in k-space center (0,0,0)
   gpuNUFFT::Array<DType> kSpaceTraj;
   if (deapoGpuNUFFTOp->is3DProcessing())
-    kSpaceTraj.data = (DType*)calloc(3, sizeof(DType)); // x,y,z
+    cudaMallocHost((void **) &kSpaceTraj.data, 3*sizeof(DType));
   else
-    kSpaceTraj.data = (DType*)calloc(2, sizeof(DType)); // x,y
+    cudaMallocHost((void **) &kSpaceTraj.data, 2*sizeof(DType));
   kSpaceTraj.dim.length = 1;
   deapoGpuNUFFTOp->setKSpaceTraj(kSpaceTraj);
   
@@ -391,7 +391,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
   
   // only one data entry, data index = 0
   Array<IndType> dataIndices;
-  dataIndices.data = (IndType*)calloc(1, sizeof(IndType));
+  cudaMallocHost((void **) &dataIndices.data, 2*sizeof(IndType));
   dataIndices.dim.length = 1;
   deapoGpuNUFFTOp->setDataIndices(dataIndices);
   
@@ -414,7 +414,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
   debug("finished deapo computation\n");
 
   // cleanup locally initialized arrays here
-  free(dataArray.data);
+  cudaFreeHost(dataArray.data);
   cudaFreeHost(assignedSectors.data);
 
   // Compute abs values of deapo function and compensate
@@ -438,7 +438,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
 
   // cleanup
   delete deapoGpuNUFFTOp;
-  cudaFreeHost(deapoFunction.data);
+  free(deapoFunction.data);
   return deapoAbs;
 }
 

From 10663bfc31f6535b7c2998da2b46f3eb6fa537d4 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Thu, 25 Nov 2021 10:16:25 +0100
Subject: [PATCH 24/85] test file

---
 python/test_file.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/test_file.py b/python/test_file.py
index 83dc2a98..564212d5 100644
--- a/python/test_file.py
+++ b/python/test_file.py
@@ -1,11 +1,12 @@
 import numpy as np
 from mri.operators import NonCartesianFFT
-
+from mri.operators.fourier.utils import estimate_density_compensation
 traj = np.load('/volatile/temp_traj.npy')
 
 
-for i in range(10):
-    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=2, smaps=np.ones((2, 384, 384, 208)), osf=1)
+for i in range(1):
+    dens = estimate_density_compensation(traj, (384, 384, 208))
+    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=2, density_comp=dens)
     print(i)
     K = fourier.op(np.zeros((384, 384, 208)))
     I = fourier.adj_op(K)

From d32497554ce9a4507f0abd4f70b3b48b28bec8af Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Thu, 25 Nov 2021 11:50:10 +0100
Subject: [PATCH 25/85] Fix minute issues

---
 .../gpuNUFFT_operator_python_factory.cpp      | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index d017434e..2bb88f36 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -158,6 +158,9 @@ class GpuNUFFTPythonOperator
     }
     py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> input_kspace_data, bool grid_data=false)
     {
+        gpuNUFFT::Dimensions myDims = imgDims;
+        if(dimension==2)
+            myDims.depth = 1;
         copyNumpyArray(input_kspace_data, kspace_data.data);
         if(grid_data)
             gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION);
@@ -171,14 +174,14 @@ class GpuNUFFTPythonOperator
           return py::array_t<std::complex<DType>>(
             {
                 n_coils,
-                (int)image.dim.depth,
-                (int)image.dim.height,
-                (int)image.dim.width
+                (int)myDims.depth,
+                (int)myDims.height,
+                (int)myDims.width
             },
             {
-                sizeof(DType2) * (int)image.dim.depth * (int)image.dim.height * (int)image.dim.width,
-                sizeof(DType2) * (int)image.dim.height * (int)image.dim.width,
-                sizeof(DType2) * (int)image.dim.width,
+                sizeof(DType2) * (int)myDims.depth * (int)myDims.height * (int)myDims.width,
+                sizeof(DType2) * (int)myDims.height * (int)myDims.width,
+                sizeof(DType2) * (int)myDims.width,
                 sizeof(DType2),
             },
             ptr,
@@ -187,13 +190,13 @@ class GpuNUFFTPythonOperator
         else
           return py::array_t<std::complex<DType>>(
             {
-                (int)image.dim.depth,
-                (int)image.dim.height,
-                (int)image.dim.width
+                (int)myDims.depth,
+                (int)myDims.height,
+                (int)myDims.width
             },
             {
-                sizeof(DType2) * (int)image.dim.height * (int)image.dim.width,
-                sizeof(DType2) * (int)image.dim.width,
+                sizeof(DType2) * (int)myDims.height * (int)myDims.width,
+                sizeof(DType2) * (int)myDims.width,
                 sizeof(DType2),
             },
             ptr,

From ee3af688389a66eac118ba39aeb2661c16e45b6b Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 11 Jan 2022 18:22:02 +0100
Subject: [PATCH 26/85] Fix for 2D

---
 CUDA/src/gpuNUFFT_operator_factory.cpp | 36 +++++++++++++-------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp
index 9fbae2e9..b922ca63 100644
--- a/CUDA/src/gpuNUFFT_operator_factory.cpp
+++ b/CUDA/src/gpuNUFFT_operator_factory.cpp
@@ -132,7 +132,7 @@ gpuNUFFT::Array<IndType> gpuNUFFT::GpuNUFFTOperatorFactory::assignSectors(
 
   // create temporary array to store assigned values
   gpuNUFFT::Array<IndType> assignedSectors;
-  cudaMallocHost((void **) &assignedSectors.data, coordCnt * sizeof(IndType));
+  assignedSectors.data = (IndType *)malloc(coordCnt * sizeof(IndType));
   assignedSectors.dim.length = coordCnt;
 
   if (useGpu)
@@ -355,46 +355,46 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
   const IndType &kernelWidth, const DType &osf, gpuNUFFT::Dimensions &imgDims)
 {
   debug("compute deapodization function\n");
-  
+
   // Create simple gpuNUFFT Operator
   IndType sectorWidth = 8;
   gpuNUFFT::GpuNUFFTOperator *deapoGpuNUFFTOp;
-  
+
   if (useTextures)
     deapoGpuNUFFTOp = new gpuNUFFT::TextureGpuNUFFTOperator(kernelWidth, sectorWidth, osf,
     imgDims, TEXTURE2D_LOOKUP);
   else
     deapoGpuNUFFTOp = new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims);
-  
+
   // Data
   gpuNUFFT::Array<DType2> dataArray;
-  cudaMallocHost((void **) &dataArray.data, sizeof(DType2));
+  dataArray.data = (DType2*)calloc(1, sizeof(DType2)); // re + im
   dataArray.dim.length = 1;
   dataArray.data[0].x = 1;
   dataArray.data[0].y = 0;
-  
+
   // Coord triplet (x,y,z)
   // should result in k-space center (0,0,0)
   gpuNUFFT::Array<DType> kSpaceTraj;
   if (deapoGpuNUFFTOp->is3DProcessing())
-    cudaMallocHost((void **) &kSpaceTraj.data, 3*sizeof(DType));
+    kSpaceTraj.data = (DType*)calloc(3, sizeof(DType)); // x,y,z
   else
-    cudaMallocHost((void **) &kSpaceTraj.data, 2*sizeof(DType));
+    kSpaceTraj.data = (DType*)calloc(2, sizeof(DType)); // x,y
   kSpaceTraj.dim.length = 1;
   deapoGpuNUFFTOp->setKSpaceTraj(kSpaceTraj);
-  
+
   // assign according sector to k-Space position
   gpuNUFFT::Array<IndType> assignedSectors =
     assignSectors(deapoGpuNUFFTOp, kSpaceTraj);
   deapoGpuNUFFTOp->setSectorDataCount(
     computeSectorDataCount(deapoGpuNUFFTOp, assignedSectors, true));
-  
+
   // only one data entry, data index = 0
   Array<IndType> dataIndices;
-  cudaMallocHost((void **) &dataIndices.data, 2*sizeof(IndType));
+  dataIndices.data = (IndType*)calloc(1, sizeof(IndType));
   dataIndices.dim.length = 1;
   deapoGpuNUFFTOp->setDataIndices(dataIndices);
-  
+
   // sector centers
   if (deapoGpuNUFFTOp->is3DProcessing())
     deapoGpuNUFFTOp->setSectorCenters(computeSectorCenters(deapoGpuNUFFTOp, true));
@@ -405,17 +405,17 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
   debug("compute deapodization\n");
   deapoGpuNUFFTOp->setDebugFunction(std::bind(&gpuNUFFT::GpuNUFFTOperatorFactory::debug, this, std::placeholders::_1));
 
-  // Compute deapodization function by gridding of a single value positioned 
+  // Compute deapodization function by gridding of a single value positioned
   // in the center of k-space and by using the intended oversampling factor
   // and interpolation kernel width
   gpuNUFFT::Array<CufftType> deapoFunction =
     deapoGpuNUFFTOp->performGpuNUFFTAdj(dataArray,FFT);
-  
+
   debug("finished deapo computation\n");
 
   // cleanup locally initialized arrays here
-  cudaFreeHost(dataArray.data);
-  cudaFreeHost(assignedSectors.data);
+  free(dataArray.data);
+  free(assignedSectors.data);
 
   // Compute abs values of deapo function and compensate
   // FFT scaling sqrt(N)
@@ -423,7 +423,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
 
   DType maxDeapoVal = 0;
   DType minDeapoVal = std::numeric_limits<DType>::max();
-  double fft_scaling_factor = std::sqrt(deapoGpuNUFFTOp->getGridDims().count()); 
+  double fft_scaling_factor = std::sqrt(deapoGpuNUFFTOp->getGridDims().count());
 
   for (unsigned cnt = 0; cnt < deapoFunction.count(); cnt++)
   {
@@ -536,7 +536,7 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator(
     gpuNUFFTOp->setSectorCenters(computeSectorCenters2D(gpuNUFFTOp));
 
   // free temporary array
-  cudaFreeHost(assignedSectors.data);
+  free(assignedSectors.data);
   assignedSectors.data = NULL;
 
   gpuNUFFTOp->setDeapodizationFunction(

From bc16ccfb2c496bab9db5d241bd96ccb01eae1809 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Fri, 22 Apr 2022 12:02:02 +0200
Subject: [PATCH 27/85] Fix

---
 CUDA/inc/config.hpp.cmake                       |  2 +-
 CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu | 12 ++++++------
 CUDA/src/gpu/std_gpuNUFFT_kernels.cu            |  4 ++--
 setup.py                                        |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/CUDA/inc/config.hpp.cmake b/CUDA/inc/config.hpp.cmake
index ff61b8a4..bb1965f1 100644
--- a/CUDA/inc/config.hpp.cmake
+++ b/CUDA/inc/config.hpp.cmake
@@ -30,7 +30,7 @@
 #endif
 
 typedef unsigned int SizeType;
-typedef unsigned int IndType;
+typedef unsigned long int IndType;
 
 /** \brief Combined 2-tuple (x,y) of IndType */
 typedef struct IndType2
diff --git a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu
index 4734019e..5004a3f6 100644
--- a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu
+++ b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu
@@ -473,7 +473,7 @@ void performTextureConvolution(DType2 *data_d, DType *crds_d,
 //  * N              : number of threads
 
 __device__ void
-textureForwardConvolutionFunction(int *sec, int sec_max, int sec_offset,
+textureForwardConvolutionFunction(long int *sec, long int sec_max, long int sec_offset,
                                   DType2 *sdata, CufftType *gdata_cache,
                                   DType2 *data, DType *crds, CufftType *gdata,
                                   IndType *sectors, IndType *sector_centers)
@@ -592,7 +592,7 @@ __global__ void textureForwardConvolutionKernel(CufftType *data, DType *crds,
   CufftType *shared_out_data = (CufftType *)&shared[0];
   CufftType *gdata_cache = (CufftType *)&shared[blockDim.x];
 
-  __shared__ int sec[THREAD_BLOCK_SIZE];
+  __shared__ long int sec[THREAD_BLOCK_SIZE];
   sec[threadIdx.x] = blockIdx.x;
 
   // init shared memory
@@ -603,7 +603,7 @@ __global__ void textureForwardConvolutionKernel(CufftType *data, DType *crds,
   // start convolution
   while (sec[threadIdx.x] < N)
   {
-    __shared__ int data_max;
+    __shared__ long int data_max;
     data_max = sectors[sec[threadIdx.x] + 1];
 
     textureForwardConvolutionFunction(sec, data_max, 0, shared_out_data,
@@ -622,8 +622,8 @@ __global__ void balancedTextureForwardConvolutionKernel(
   CufftType *shared_out_data = (CufftType *)&shared[0];
   CufftType *gdata_cache = (CufftType *)&shared[blockDim.x];
 
-  int sec_cnt = blockIdx.x;
-  __shared__ int sec[THREAD_BLOCK_SIZE];
+  long int sec_cnt = blockIdx.x;
+  __shared__ long int sec[THREAD_BLOCK_SIZE];
 
   // init shared memory
   shared_out_data[threadIdx.x].x = (DType)0.0;  // Re
@@ -634,7 +634,7 @@ __global__ void balancedTextureForwardConvolutionKernel(
   while (sec_cnt < N)
   {
     sec[threadIdx.x] = sector_processing_order[sec_cnt].x;
-    __shared__ int data_max;
+    __shared__ long int data_max;
     data_max = min(sectors[sec[threadIdx.x] + 1],
                    sectors[sec[threadIdx.x]] +
                        sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD);
diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
index 7d6956d8..5bdc706f 100644
--- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
+++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
@@ -25,11 +25,11 @@ void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count)
 {
   if (std::string("texDATA").compare(symbol)==0)
   {
-    HANDLE_ERROR (cudaBindTexture(NULL,texDATA, devicePtr,count*sizeof(float2)));
+    HANDLE_ERROR (cudaBindTexture(NULL,texDATA, devicePtr,(unsigned long)count*sizeof(float2)));
   }
   else if (std::string("texGDATA").compare(symbol)==0)
   {
-    HANDLE_ERROR (cudaBindTexture(NULL,texGDATA, devicePtr,count*sizeof(cufftComplex)));
+    cudaBindTexture(NULL,texGDATA, devicePtr,(unsigned long)count*sizeof(cufftComplex));
   }
 }
 
diff --git a/setup.py b/setup.py
index adde187e..30d90731 100644
--- a/setup.py
+++ b/setup.py
@@ -74,7 +74,7 @@ def build_extension(self, ext):
                       "-DGEN_PYTHON_FILES=ON",
                       "-DGEN_MEX_FILES=OFF",
                       "-DPYBIND11_INCLUDE_DIR=" + self.pybind_path]
-        cfg = "Debug" if self.debug else "Release"
+        cfg = "Debug" #if self.debug else "Release"
         build_args = ["--config", cfg]
 
         if platform.system() == "Windows":

From 72bf53771202c7de0642b0e8bc4891108a2fb20d Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 25 Apr 2022 09:11:26 +0200
Subject: [PATCH 28/85] fixed

---
 CUDA/CMakeLists.txt                           |  2 +-
 CUDA/inc/gpuNUFFT_kernels.hpp                 |  2 +-
 CUDA/inc/gpuNUFFT_operator_factory.hpp        |  2 +-
 CUDA/inc/gpuNUFFT_types.hpp                   | 32 +++++++++----------
 .../gpu/atomic/texture_gpuNUFFT_kernels.cu    |  8 ++---
 CUDA/src/gpu/std_gpuNUFFT_kernels.cu          |  8 ++---
 CUDA/src/gpuNUFFT_operator.cpp                |  4 +--
 setup.py                                      |  2 +-
 8 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt
index 5f80d6b0..06883c6b 100644
--- a/CUDA/CMakeLists.txt
+++ b/CUDA/CMakeLists.txt
@@ -126,7 +126,7 @@ ENDIF(FERMI_GPU)
 
 IF(CMAKE_BUILD_TYPE MATCHES Debug)
   MESSAGE("debug mode")
-  list(APPEND CUDA_NVCC_FLAGS ${MY_NVCC_FLAGS} --ptxas-options=-v)
+  list(APPEND CUDA_NVCC_FLAGS ${MY_NVCC_FLAGS} --ptxas-options=-v -G)
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -g -std=c++11")
 ELSE(CMAKE_BUILD_TYPE)
   list(APPEND CUDA_NVCC_FLAGS ${MY_NVCC_FLAGS})
diff --git a/CUDA/inc/gpuNUFFT_kernels.hpp b/CUDA/inc/gpuNUFFT_kernels.hpp
index ec4dad08..9966becc 100644
--- a/CUDA/inc/gpuNUFFT_kernels.hpp
+++ b/CUDA/inc/gpuNUFFT_kernels.hpp
@@ -311,7 +311,7 @@ void performTextureForwardConvolution(CufftType *data_d, DType *crds_d,
   * @param N    Problem size N
   * @param gi_host Info struct with meta information
   */
-void performFFTScaling(CufftType *data, int N, gpuNUFFT::GpuNUFFTInfo *gi_host);
+void performFFTScaling(CufftType *data, long int N, gpuNUFFT::GpuNUFFTInfo *gi_host);
 
 /** \brief Scale each element of the input data by the value of the density
   *compensation function for the corresponding sample point.
diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp
index 3e1b7a2a..5658803f 100644
--- a/CUDA/inc/gpuNUFFT_operator_factory.hpp
+++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp
@@ -55,7 +55,7 @@ class GpuNUFFTOperatorFactory
     * @param useGpu Flag to indicat&GpuNUFFTPythonOperator::adj_op);e gpu usage for precomputation
     * @param balanceWorkload Flag to indicate load balancing
     */
-  GpuNUFFTOperatorFactory(const bool useTextures = true, const bool useGpu = true,
+  GpuNUFFTOperatorFactory(const bool useTextures = false, const bool useGpu = true,
                           bool balanceWorkload = true, bool matlabSharedMem = false)
     : useTextures(useTextures), useGpu(useGpu), balanceWorkload(balanceWorkload),
     matlabSharedMem(matlabSharedMem)
diff --git a/CUDA/inc/gpuNUFFT_types.hpp b/CUDA/inc/gpuNUFFT_types.hpp
index 697cbba8..857f8c45 100644
--- a/CUDA/inc/gpuNUFFT_types.hpp
+++ b/CUDA/inc/gpuNUFFT_types.hpp
@@ -230,27 +230,27 @@ enum OperatorType
 struct GpuNUFFTInfo
 {
   /**\brief Total amount of data samples.*/
-  int data_count;
+  IndType data_count;
   /**\brief Width in grid units of gridding kernel.*/
-  int kernel_width;
+  IndType kernel_width;
   /**\brief Squared kernel_width.*/
-  int kernel_widthSquared;
+  IndType kernel_widthSquared;
   /**\brief Reciprocal value of kernel_widthSquared.*/
   DType kernel_widthInvSquared;
   /**\brief Total amount of kernel entries.*/
-  int kernel_count;
+  IndType kernel_count;
   /**\brief Radius of kernel relative to grid size.*/
   DType kernel_radius;
 
   /**\brief Width of oversampled grid.*/
-  int grid_width_dim;
+  IndType grid_width_dim;
   /**\brief .*/
-  int grid_width_offset;
+  IndType grid_width_offset;
   /**\brief Reciprocal value of grid_width_dim.*/
   DType3 grid_width_inv;
 
   /**\brief Total amount of image nodes.*/
-  int im_width_dim;
+  IndType im_width_dim;
   /**\brief Image offset (imgDims / 2).*/
   IndType3 im_width_offset;  // used in deapodization
 
@@ -258,22 +258,22 @@ struct GpuNUFFTInfo
   DType osr;
 
   /**\brief Total amount of sectors.*/
-  int sector_count;
+  IndType sector_count;
   /**\brief Amount of sectors per dimension.*/
-  int sector_width;
+  IndType sector_width;
 
   /**\brief Padded sector width (sector_width + kernel_width / 2).*/
-  int sector_pad_width;
+  IndType sector_pad_width;
   /**\brief Maximum index per dimension of padded sector (sector_pad_width -
    * 1).*/
-  int sector_pad_max;
+  IndType sector_pad_max;
   /**\brief Total amount of elements in one padded sector.*/
-  int sector_dim;
+  IndType sector_dim;
   /**\brief Offset to zero position inside padded sector (sector_pad_width / 2).
    * Used in combination with the sector center in order to get to the starting
    * index (bottom left of the front slice)
    */
-  int sector_offset;
+  IndType sector_offset;
 
   /**\brief Distance scale in x direction in case of anisotropic grids.*/
   DType aniso_x_scale;
@@ -302,12 +302,12 @@ struct GpuNUFFTInfo
   /**\brief Flag to indicate whether 2-d or 3-d data is processed.*/
   bool is2Dprocessing;
   /**\brief Type used for texture interpolation.*/
-  int interpolationType;
+  IndType interpolationType;
   /**\brief Total amount of sectors which have to be processed.
     * Depends on sector load balancing.*/
-  int sectorsToProcess;
+  IndType sectorsToProcess;
   /**\brief Number of coils processed concurrently */
-  int n_coils_cc;
+  IndType n_coils_cc;
 };
 }
 
diff --git a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu
index 5004a3f6..1fa20f88 100644
--- a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu
+++ b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu
@@ -486,16 +486,16 @@ textureForwardConvolutionFunction(long int *sec, long int sec_max, long int sec_
   center.y = sector_centers[sec[threadIdx.x] * 3 + 1];
   center.z = sector_centers[sec[threadIdx.x] * 3 + 2];
 
-  __shared__ int sector_ind_offset;
+  __shared__ long int sector_ind_offset;
   sector_ind_offset =
       computeXYZ2Lin(center.x - GI.sector_offset, center.y - GI.sector_offset,
                      center.z - GI.sector_offset, GI.gridDims);
 
   // init sector cache
   // preload sector grid data into cache
-  for (int ind = threadIdx.x; ind < GI.sector_dim; ind += blockDim.x)
+  for (long int ind = threadIdx.x; ind < GI.sector_dim; ind += blockDim.x)
   {
-    int grid_index;
+    long int grid_index;
     getCoordsFromIndex(ind, &i, &j, &k, GI.sector_pad_width);
 
     if (isOutlier(i, j, k, center.x, center.y, center.z, GI.gridDims,
@@ -516,7 +516,7 @@ textureForwardConvolutionFunction(long int *sec, long int sec_max, long int sec_
   __syncthreads();
 
   // Grid Points over Threads
-  int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset;
+  long int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset;
 
   while (data_cnt < sec_max)
   {
diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
index 5bdc706f..b0fe0e8b 100644
--- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
+++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
@@ -29,7 +29,7 @@ void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count)
   }
   else if (std::string("texGDATA").compare(symbol)==0)
   {
-    cudaBindTexture(NULL,texGDATA, devicePtr,(unsigned long)count*sizeof(cufftComplex));
+    HANDLE_ERROR (cudaBindTexture(NULL,texGDATA, devicePtr,(unsigned long)count*sizeof(cufftComplex)));
   }
 }
 
@@ -111,9 +111,9 @@ void freeTexture(const char* symbol, cudaArray* devicePtr)
   HANDLE_ERROR(cudaFreeArray(devicePtr));  
 }
 
-__global__ void fftScaleKernel(CufftType* data, DType scaling, int N)
+__global__ void fftScaleKernel(CufftType* data, DType scaling, long int N)
 {
-  int t = threadIdx.x +  blockIdx.x *blockDim.x;
+  long int t = threadIdx.x +  blockIdx.x *blockDim.x;
 
   while (t < N) 
   {
@@ -129,7 +129,7 @@ __global__ void fftScaleKernel(CufftType* data, DType scaling, int N)
   }
 }
 
-void performFFTScaling(CufftType* data,int N, gpuNUFFT::GpuNUFFTInfo* gi_host)
+void performFFTScaling(CufftType* data,long int N, gpuNUFFT::GpuNUFFTInfo* gi_host)
 {
   dim3 block_dim(64, 1, 8);
   //dim3 block_dim(THREAD_BLOCK_SIZE);
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 770e6333..b1612d5f 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -1104,7 +1104,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
   if (debugTiming)
     startTiming();
 
-  int data_count = (int)this->kSpaceTraj.count();
+  long int data_count = (int)this->kSpaceTraj.count();
   int n_coils = (int)kspaceData.dim.channels;
   IndType imdata_count = this->imgDims.count();
 
@@ -1253,7 +1253,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
 
     // write result in correct order back into output array
     writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
-                    (int)this->kSpaceTraj.count(), n_coils_cc);
+                    (long int)this->kSpaceTraj.count(), n_coils_cc);
     if(coil_it > 1)
     {
       cudaStreamSynchronize(old_stream);
diff --git a/setup.py b/setup.py
index 30d90731..adde187e 100644
--- a/setup.py
+++ b/setup.py
@@ -74,7 +74,7 @@ def build_extension(self, ext):
                       "-DGEN_PYTHON_FILES=ON",
                       "-DGEN_MEX_FILES=OFF",
                       "-DPYBIND11_INCLUDE_DIR=" + self.pybind_path]
-        cfg = "Debug" #if self.debug else "Release"
+        cfg = "Debug" if self.debug else "Release"
         build_args = ["--config", cfg]
 
         if platform.system() == "Windows":

From f8b2ccb4cc86f165ff9d0e398477cf63e80a675d Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Thu, 20 Jul 2023 07:48:26 +0000
Subject: [PATCH 29/85] Fixes for missing directories, add CUDA DIR explicitly

---
 CUDA/CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt
index 06883c6b..71260d30 100644
--- a/CUDA/CMakeLists.txt
+++ b/CUDA/CMakeLists.txt
@@ -94,9 +94,10 @@ endif(GPU_DOUBLE_PREC)
 
 SET(FERMI_GPU OFF CACHE BOOL "Enable build for (old) Fermi architectures (Compute capability 2.0)")
 
-
+set(MY_NVCC_FLAGS -I${CUDA_INCLUDE_DIRS})
+set(CMAKE_CXX_FLAGS -I${CUDA_INCLUDE_DIRS})
 IF(FERMI_GPU)
-  set(MY_NVCC_FLAGS -gencode arch=compute_30,code=sm_30)
+  list(APPEND MY_NVCC_FLAGS -gencode arch=compute_30,code=sm_30)
   list(APPEND MY_NVCC_FLAGS -gencode arch=compute_50,code=sm_50)
   list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=sm_52)
   list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=compute_52)
@@ -150,6 +151,7 @@ CONFIGURE_FILE( ${CMAKE_SOURCE_DIR}/inc/cufft_config.hpp.cmake ${CMAKE_SOURCE_DI
 
 #Include dirs
 include_directories(inc)
+message(CUDA_INCLUDE_DIRS : ${CUDA_INCLUDE_DIRS})
 SET(GPUNUFFT_INC_DIR ${CMAKE_SOURCE_DIR}/inc)
 SET(GPUNUFFT_INCLUDE ${GPUNUFFT_INC_DIR}/cuda_utils.hpp 
 										 ${GPUNUFFT_INC_DIR}/cuda_utils.cuh

From 357e548874b075307b72c5f5de457541557e2b4c Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Thu, 20 Jul 2023 09:52:19 +0200
Subject: [PATCH 30/85] Update versioning to take concurrency into account

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index adde187e..dafe5ecc 100644
--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.3.2",
+    version="0.4.2",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[

From 5ca38c9f693c483a65f3f2a2599d259ab30d4d11 Mon Sep 17 00:00:00 2001
From: Pierre-antoine Comby <pierre-antoine.comby@crans.org>
Date: Sun, 5 Nov 2023 15:47:39 +0100
Subject: [PATCH 31/85] fix: allow for non-integer osf.

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 6 +++---
 python/test_nufftOp.py                                   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 2bb88f36..41dd48ab 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -73,7 +73,7 @@ class GpuNUFFTPythonOperator
     public:
     GpuNUFFTPythonOperator(py::array_t<DType> kspace_loc, py::array_t<int> image_size, int num_coils,
     py::array_t<std::complex<DType>> sense_maps,  py::array_t<float> density_comp, int kernel_width=3,
-    int sector_width=8, int osr=2, bool balance_workload=1)
+    int sector_width=8, float osf=2, bool balance_workload=1)
     {
         // k-space coordinates
         py::buffer_info sample_loc = kspace_loc.request();
@@ -116,7 +116,7 @@ class GpuNUFFTPythonOperator
         factory.setBalanceWorkload(balance_workload);
         gpuNUFFTOp = factory.createGpuNUFFTOperator(
             kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width,
-            osr, imgDims);
+            osf, imgDims);
         allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2));
         kspace_data.dim.length = trajectory_length;
         kspace_data.dim.channels = n_coils;
@@ -226,7 +226,7 @@ class GpuNUFFTPythonOperator
 };
 PYBIND11_MODULE(gpuNUFFT, m) {
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
-        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool>())
+        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, float, bool>())
         .def("op", &GpuNUFFTPythonOperator::op)
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op)
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
diff --git a/python/test_nufftOp.py b/python/test_nufftOp.py
index 4d7a1387..68284a15 100644
--- a/python/test_nufftOp.py
+++ b/python/test_nufftOp.py
@@ -24,7 +24,7 @@ def get_nufft_op(self, sens_maps=None):
             self.weights,
             3,
             8,
-            2,
+            2.0,
             True,
         )
 

From 417e1c6d83410558aff75fd629e0d79da34020be Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 15 Nov 2023 15:55:29 +0100
Subject: [PATCH 32/85] Add pinned memory stuff first code, with debug prints

---
 .../gpuNUFFT_operator_python_factory.cpp      | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 2bb88f36..1f4afe4a 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -107,11 +107,22 @@ class GpuNUFFTPythonOperator
         }
         else
         {
-            allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2));
-            sensArray.dim = imgDims;
-            sensArray.dim.channels = n_coils;
-            copyNumpyArray(sense_maps, sensArray.data);
-            has_sense_data = true;
+            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d", cuPointerGetAttribute(sens_maps_buffer.ptr, CU_POINTER_ATTRIBUTE_IS_MANAGED));
+            if(cuPointerGetAttribute(sens_maps_buffer.ptr, CU_POINTER_ATTRIBUTE_IS_MANAGED))
+            {
+                printf("The smaps data is pinned!, skipping copies");
+                std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
+                sensArray.data = reinterpret_cast<DType2(&)[0]>(*t_data);
+            }
+            else
+            {
+                printf("The smaps data is NOT pinned!, DOING copies");
+                allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2));
+                sensArray.dim = imgDims;
+                sensArray.dim.channels = n_coils;
+                copyNumpyArray(sense_maps, sensArray.data);
+                has_sense_data = true;
+            }
         }
         factory.setBalanceWorkload(balance_workload);
         gpuNUFFTOp = factory.createGpuNUFFTOperator(

From e3cdb905083e6f49e7792993c6b3eb89759fa916 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 15 Nov 2023 16:01:21 +0100
Subject: [PATCH 33/85] All setup!

---
 .../gpu/python/gpuNUFFT_operator_python_factory.cpp    | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 1f4afe4a..bafdb89f 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -107,11 +107,15 @@ class GpuNUFFTPythonOperator
         }
         else
         {
-            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d", cuPointerGetAttribute(sens_maps_buffer.ptr, CU_POINTER_ATTRIBUTE_IS_MANAGED));
-            if(cuPointerGetAttribute(sens_maps_buffer.ptr, CU_POINTER_ATTRIBUTE_IS_MANAGED))
+            bool is_pinned_memory;
+            // FIXME, check for errors
+            cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) sense_maps_buffer.ptr);
+            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d", is_pinned_memory);
+            if(is_pinned_memory)
             {
                 printf("The smaps data is pinned!, skipping copies");
-                std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
+                // Just map the memory to sensArray! We dont need to make a copy if the memory is already pinned
+                std::complex<DType> *t_data = (std::complex<DType> *) sense_maps_buffer.ptr;
                 sensArray.data = reinterpret_cast<DType2(&)[0]>(*t_data);
             }
             else

From 31e44c92740fe3a31645453dd9500ac02b1c8776 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 15 Nov 2023 16:13:44 +0100
Subject: [PATCH 34/85] Fix cmake link cuda

---
 CUDA/src/gpu/python/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt
index 494255c2..0931003f 100644
--- a/CUDA/src/gpu/python/CMakeLists.txt
+++ b/CUDA/src/gpu/python/CMakeLists.txt
@@ -20,7 +20,7 @@ if(WIN32)
 
     TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${PYTHON_LIBRARIES})
 elseif(UNIX)
-    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES})
+    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} cuda)
 endif(WIN32)
 CUDA_ADD_CUFFT_TO_TARGET(gpuNUFFT)
 CUDA_ADD_CUBLAS_TO_TARGET(gpuNUFFT)

From f524864be77488aefcffaf76734161dda06b563e Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 15 Nov 2023 16:17:57 +0100
Subject: [PATCH 35/85] \n

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index bafdb89f..fae45e08 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -110,17 +110,17 @@ class GpuNUFFTPythonOperator
             bool is_pinned_memory;
             // FIXME, check for errors
             cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) sense_maps_buffer.ptr);
-            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d", is_pinned_memory);
+            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d\n", is_pinned_memory);
             if(is_pinned_memory)
             {
-                printf("The smaps data is pinned!, skipping copies");
+                printf("The smaps data is pinned!, skipping copies\n");
                 // Just map the memory to sensArray! We dont need to make a copy if the memory is already pinned
                 std::complex<DType> *t_data = (std::complex<DType> *) sense_maps_buffer.ptr;
                 sensArray.data = reinterpret_cast<DType2(&)[0]>(*t_data);
             }
             else
             {
-                printf("The smaps data is NOT pinned!, DOING copies");
+                printf("The smaps data is NOT pinned!, DOING copies\n");
                 allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2));
                 sensArray.dim = imgDims;
                 sensArray.dim.channels = n_coils;

From 4ff0c1a44fdaf974c66edf02b6b9c66c2cf48653 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 15 Nov 2023 16:25:35 +0100
Subject: [PATCH 36/85] mapped

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index fae45e08..c65a8a69 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -109,8 +109,8 @@ class GpuNUFFTPythonOperator
         {
             bool is_pinned_memory;
             // FIXME, check for errors
-            cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) sense_maps_buffer.ptr);
-            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d\n", is_pinned_memory);
+            cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_MAPPED, (CUdeviceptr) sense_maps_buffer.ptr);
+            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %b\n", is_pinned_memory);
             if(is_pinned_memory)
             {
                 printf("The smaps data is pinned!, skipping copies\n");

From 1daf1c5be95d04d7c14c1efb83f04e88c786f63b Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 15 Nov 2023 16:25:48 +0100
Subject: [PATCH 37/85] mapped

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index c65a8a69..d1587a3a 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -110,7 +110,7 @@ class GpuNUFFTPythonOperator
             bool is_pinned_memory;
             // FIXME, check for errors
             cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_MAPPED, (CUdeviceptr) sense_maps_buffer.ptr);
-            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %b\n", is_pinned_memory);
+            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d\n", is_pinned_memory);
             if(is_pinned_memory)
             {
                 printf("The smaps data is pinned!, skipping copies\n");

From caf48ac28b88fc871c7ebb1f1f2ae75a718befbd Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 15 Nov 2023 16:39:43 +0100
Subject: [PATCH 38/85] Fix for memory type

---
 CUDA/src/gpu/python/CMakeLists.txt                       | 2 +-
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt
index 0931003f..494255c2 100644
--- a/CUDA/src/gpu/python/CMakeLists.txt
+++ b/CUDA/src/gpu/python/CMakeLists.txt
@@ -20,7 +20,7 @@ if(WIN32)
 
     TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${PYTHON_LIBRARIES})
 elseif(UNIX)
-    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} cuda)
+    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES})
 endif(WIN32)
 CUDA_ADD_CUFFT_TO_TARGET(gpuNUFFT)
 CUDA_ADD_CUBLAS_TO_TARGET(gpuNUFFT)
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index d1587a3a..da267082 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -107,10 +107,11 @@ class GpuNUFFTPythonOperator
         }
         else
         {
-            bool is_pinned_memory;
+            cudaPointerAttributes attr;
             // FIXME, check for errors
-            cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_MAPPED, (CUdeviceptr) sense_maps_buffer.ptr);
-            printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d\n", is_pinned_memory);
+            cudaPointerGetAttributes(&attr, sense_maps_buffer.ptr);
+            printf("Value of attr.cudaMemoryType = %d\n", attr.type);
+            bool is_pinned_memory = attr.type ==  cudaMemoryTypeHost;
             if(is_pinned_memory)
             {
                 printf("The smaps data is pinned!, skipping copies\n");

From 59545c959d573edeb84da9316af04e5853b0a0e2 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 15 Nov 2023 16:55:06 +0100
Subject: [PATCH 39/85] Fix pointers pointers

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index da267082..fc83acfd 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -109,7 +109,7 @@ class GpuNUFFTPythonOperator
         {
             cudaPointerAttributes attr;
             // FIXME, check for errors
-            cudaPointerGetAttributes(&attr, sense_maps_buffer.ptr);
+            cudaPointerGetAttributes(&attr, &sense_maps_buffer.ptr);
             printf("Value of attr.cudaMemoryType = %d\n", attr.type);
             bool is_pinned_memory = attr.type ==  cudaMemoryTypeHost;
             if(is_pinned_memory)

From dbca20e784389e6875c9457e7d65655df4efeeb2 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Fri, 17 Nov 2023 09:44:49 +0100
Subject: [PATCH 40/85] Added pinned stuff

---
 .../python/gpuNUFFT_operator_python_factory.cpp    | 14 +++++++++-----
 setup.py                                           |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index fc83acfd..ee2d03fe 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -108,20 +108,24 @@ class GpuNUFFTPythonOperator
         else
         {
             cudaPointerAttributes attr;
-            // FIXME, check for errors
-            cudaPointerGetAttributes(&attr, &sense_maps_buffer.ptr);
-            printf("Value of attr.cudaMemoryType = %d\n", attr.type);
+            if(DEBUG)
+                printf("Value of sense_maps pointer == 0x%x or %d\n", sense_maps_buffer.ptr, sense_maps_buffer.ptr);
+            cudaPointerGetAttributes(&attr, sense_maps_buffer.ptr);
+            if(DEBUG)
+                printf("Value of attr.cudaMemoryType2 = %d\n", attr.type);
             bool is_pinned_memory = attr.type ==  cudaMemoryTypeHost;
             if(is_pinned_memory)
             {
-                printf("The smaps data is pinned!, skipping copies\n");
+                if(DEBUG)
+                    printf("The smaps data is pinned!, skipping copies\n");
                 // Just map the memory to sensArray! We dont need to make a copy if the memory is already pinned
                 std::complex<DType> *t_data = (std::complex<DType> *) sense_maps_buffer.ptr;
                 sensArray.data = reinterpret_cast<DType2(&)[0]>(*t_data);
             }
             else
             {
-                printf("The smaps data is NOT pinned!, DOING copies\n");
+                if(DEBUG)
+                    printf("The smaps data is NOT pinned!, DOING copies\n");
                 allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2));
                 sensArray.dim = imgDims;
                 sensArray.dim.channels = n_coils;
diff --git a/setup.py b/setup.py
index dafe5ecc..14cc18f1 100644
--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.4.2",
+    version="0.4.3",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[

From 837bcd0c185dd6814b1f7e9996f4e691f25eb298 Mon Sep 17 00:00:00 2001
From: Pierre-antoine Comby <pierre-antoine.comby@crans.org>
Date: Mon, 20 Nov 2023 10:43:20 +0100
Subject: [PATCH 41/85] feat: add fully on-gpu density compensation estimation.

---
 .../gpuNUFFT_operator_python_factory.cpp      | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index ee2d03fe..c4783f9b 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -237,6 +237,73 @@ class GpuNUFFTPythonOperator
         has_sense_data = true;
         gpuNUFFTOp->setSens(sensArray);
     }
+
+    py::array_t<DType> estimate_density_comp(int num_iter = 10)
+    {
+        IndType n_samples = kspace_data.count();
+        gpuNUFFT::Array<CufftType> densArray;
+        allocate_pinned_memory(&densArray, n_samples * sizeof(CufftType));
+        densArray.dim.length = n_samples;
+
+        // TODO: Allocate directly on device and set with kernel.
+        for (int cnt = 0; cnt < n_samples; cnt++)
+        {
+          densArray.data[cnt].x = 1.0;
+          densArray.data[cnt].y = 0.0;
+        }
+
+        gpuNUFFT::GpuArray<DType2> densArray_gpu;
+        densArray_gpu.dim.length = n_samples;
+        allocateDeviceMem(&densArray_gpu.data, n_samples);
+
+        copyToDeviceAsync(densArray.data, densArray_gpu.data, n_samples);
+
+        gpuNUFFT::GpuArray<CufftType> densEstimation_gpu;
+        densEstimation_gpu.dim.length = n_samples;
+        allocateDeviceMem(&densEstimation_gpu.data, n_samples);
+
+        gpuNUFFT::GpuArray<CufftType> image_gpu;
+        image_gpu.dim = imgDims;
+        allocateDeviceMem(&image_gpu.data, imgDims.count());
+
+        if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+          printf("error at adj thread synchronization a: %s\n",
+                 cudaGetErrorString(cudaGetLastError()));
+        for (int cnt = 0; cnt < num_iter; cnt++)
+        {
+          if (DEBUG)
+                printf("### update %i\n", cnt);
+          gpuNUFFTOp->performGpuNUFFTAdj(densArray_gpu, image_gpu,
+                                         gpuNUFFT::DENSITY_ESTIMATION);
+          gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, densEstimation_gpu,
+                                             gpuNUFFT::DENSITY_ESTIMATION);
+          performUpdateDensityComp(densArray_gpu.data, densEstimation_gpu.data,
+                                   n_samples);
+          if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+                printf("error at adj thread synchronization d: %s\n",
+                       cudaGetErrorString(cudaGetLastError()));
+        }
+        freeDeviceMem(densEstimation_gpu.data);
+        freeDeviceMem(image_gpu.data);
+
+        cudaDeviceSynchronize();
+        // copy only the real part back to cpu
+        DType *tmp_d = (DType *)densArray_gpu.data;
+
+        gpuNUFFT::Array<DType> final_densArray;
+        final_densArray.dim.length = n_samples;
+        allocate_pinned_memory(&final_densArray, n_samples * sizeof(DType));
+        HANDLE_ERROR(cudaMemcpy2DAsync(final_densArray.data, sizeof(DType),
+                                       tmp_d, sizeof(DType2), sizeof(DType),
+                                       n_samples, cudaMemcpyDeviceToHost));
+        cudaDeviceSynchronize();
+        freeDeviceMem(densArray_gpu.data);
+        DType *ptr = reinterpret_cast<DType(&)[0]>(*final_densArray.data);
+        auto capsule = py::capsule(ptr, [](void *ptr) { return; });
+        return py::array_t<DType>({ trajectory_length }, { sizeof(DType) }, ptr,
+                                  capsule);
+    }
+
     ~GpuNUFFTPythonOperator()
     {
         cudaFreeHost(kspace_data.data);
@@ -250,6 +317,7 @@ PYBIND11_MODULE(gpuNUFFT, m) {
         .def("op", &GpuNUFFTPythonOperator::op)
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op)
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
+        .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
 }
 #endif  // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED

From ae7a1e3fbb37c648853bb67bbac65dfe89ec70ee Mon Sep 17 00:00:00 2001
From: Pierre-antoine Comby <pierre-antoine.comby@crans.org>
Date: Mon, 20 Nov 2023 10:54:00 +0100
Subject: [PATCH 42/85] feat: add power method estimation of the spectral
 radius.

---
 .../gpuNUFFT_operator_python_factory.cpp      | 62 ++++++++++++++++++-
 1 file changed, 59 insertions(+), 3 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index ee2d03fe..e08cac84 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -13,13 +13,13 @@ Carole Lazarus <carole.m.lazarus@gmail.com>
 #include "cufft.h"
 #include "cuda_runtime.h"
 #include <cuda.h>
-#include <cublas.h>
+#include <cublas_v2.h>
+#include <curand.h>
 #include "config.hpp"
 #include "gpuNUFFT_operator_factory.hpp"
 #include <algorithm>  // std::sort
 #include <vector>     // std::vector
 #include <string>
-#include <cuda.h>
 
 
 namespace py = pybind11;
@@ -237,6 +237,61 @@ class GpuNUFFTPythonOperator
         has_sense_data = true;
         gpuNUFFTOp->setSens(sensArray);
     }
+
+    float get_spectral_radius(int max_iter = 20,float tolerance = 1e-6)
+    {
+        int im_size = image.count();
+
+        gpuNUFFT::GpuArray<DType2> x_gpu;
+        x_gpu.dim = image.dim;
+        allocateDeviceMem(&x_gpu.data, im_size);
+
+        gpuNUFFT::GpuArray<DType2> tmp_kspace_gpu;
+        tmp_kspace_gpu.dim = kspace_data.dim;
+        allocateDeviceMem(&tmp_kspace_gpu.data, kspace_data.count());
+
+        cudaDeviceSynchronize();
+        DType norm_old = 1.0;
+        DType norm_new = 1.0;
+        DType inv_norm = 1.0;
+        // initialisation: create a random complex image.
+        curandGenerator_t generator;
+        curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW);
+        curandSetPseudoRandomGeneratorSeed(generator, (int)time(NULL));
+
+        // complex value generator by giving twice the size.
+        curandGenerateUniform(generator, (DType *)x_gpu.data, 2 * im_size);
+        // xold = initialize random x of image size.
+        curandDestroyGenerator(generator);
+        // Create a handle
+        cublasHandle_t handle;
+        cublasCreate(&handle);
+
+        cublasScnrm2(handle, im_size, x_gpu.data, 1, &norm_old);
+        inv_norm = 1.0 / norm_old;
+        cublasCsscal(handle, im_size, &inv_norm, x_gpu.data, 1);
+
+        for (int i = 0; i < max_iter; i++)
+        {
+          // compute x_new = adj_op(op(x_old))
+          gpuNUFFTOp->performForwardGpuNUFFT(x_gpu, tmp_kspace_gpu);
+          gpuNUFFTOp->performGpuNUFFTAdj(tmp_kspace_gpu, x_gpu);
+          // compute ||x_new||
+          cublasScnrm2(handle, im_size, x_gpu.data, 1, &norm_new);
+          // x_new <- x_new/ ||x_new||
+          inv_norm = 1.0 / norm_new;
+
+          cublasCsscal(handle, im_size, &inv_norm, x_gpu.data, 1);
+          if (fabs(norm_new - norm_old) < tolerance)
+          {
+                break;
+          }
+          norm_old = norm_new;
+        }
+        freeTotalDeviceMemory(tmp_kspace_gpu.data, x_gpu.data, NULL);
+        cublasDestroy(handle);
+        return norm_new;
+    }
     ~GpuNUFFTPythonOperator()
     {
         cudaFreeHost(kspace_data.data);
@@ -250,6 +305,7 @@ PYBIND11_MODULE(gpuNUFFT, m) {
         .def("op", &GpuNUFFTPythonOperator::op)
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op)
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
-        .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
+        .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps)
+        .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius);
 }
 #endif  // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED

From d2bcf7c806ee4925c4cd3c1a23a074299381e14d Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 11:42:11 +0100
Subject: [PATCH 43/85] Working codes for mem allocations

---
 .../gpuNUFFT_operator_python_factory.cpp      | 97 ++++++++++++++++---
 1 file changed, 83 insertions(+), 14 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index ee2d03fe..b443ccba 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -52,6 +52,13 @@ void allocate_pinned_memory(gpuNUFFT::Array<DType2> *lin_array, unsigned long in
   cudaMallocHost((void **)&new_data, size);
   lin_array->data = new_data;
 }
+
+void deallocate_pinned_memory(gpuNUFFT::Array<DType2> *lin_array)
+{
+  cudaFreeHost(lin_array->data);
+  lin_array->data = NULL;
+}
+
 template <typename TType>
 void copyNumpyArray(py::array_t<std::complex<DType>> data, TType *copy_data)
 {
@@ -61,19 +68,62 @@ void copyNumpyArray(py::array_t<std::complex<DType>> data, TType *copy_data)
     memcpy(copy_data, my_data, myData.size*sizeof(TType));
 }
 
+enum MemoryAllocationType{
+        NEVER_ALLOCATE_MEMORY = 0,
+        ALLOCATE_MEMORY_IN_CONSTRUCTOR = 1,
+        ALLOCATE_MEMORY_IN_OP = 2
+    };
+
 class GpuNUFFTPythonOperator
 {
     gpuNUFFT::GpuNUFFTOperatorFactory factory;
     gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp;
     int trajectory_length, n_coils, dimension;
     bool has_sense_data;
+    MemoryAllocationType when_allocate_memory;
     gpuNUFFT::Dimensions imgDims;
     // sensitivity maps
     gpuNUFFT::Array<DType2> sensArray, kspace_data, image;
+    void allocate_memory_kspace()
+    {
+        allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2));
+        kspace_data.dim.length = trajectory_length;
+        kspace_data.dim.channels = n_coils;
+    }
+    void deallocate_memory_kspace()
+    {
+        deallocate_pinned_memory(&kspace_data);
+        kspace_data.dim.length = 0;
+        kspace_data.dim.channels = 0;
+    }           
+
+    void allocate_memory_image()
+    {
+        image.dim = imgDims;
+        if(has_sense_data == false)
+        {
+          allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2));
+          image.dim.channels = n_coils;
+        }
+        else
+        {
+          allocate_pinned_memory(&image, imgDims.count() * sizeof(DType2));
+          image.dim.channels = 1;
+        }
+    }
+    void deallocate_memory_image()
+    {
+        deallocate_pinned_memory(&image);
+        image.dim.width = 0;
+        image.dim.depth = 0;
+        image.dim.height = 0;
+        image.dim.channels = 0;
+    }
+    
     public:
     GpuNUFFTPythonOperator(py::array_t<DType> kspace_loc, py::array_t<int> image_size, int num_coils,
     py::array_t<std::complex<DType>> sense_maps,  py::array_t<float> density_comp, int kernel_width=3,
-    int sector_width=8, int osr=2, bool balance_workload=1)
+    int sector_width=8, int osr=2, bool balance_workload=1, MemoryAllocationType when_allocate_memory=ALLOCATE_MEMORY_IN_CONSTRUCTOR) : when_allocate_memory(when_allocate_memory)
     {
         // k-space coordinates
         py::buffer_info sample_loc = kspace_loc.request();
@@ -137,25 +187,22 @@ class GpuNUFFTPythonOperator
         gpuNUFFTOp = factory.createGpuNUFFTOperator(
             kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width,
             osr, imgDims);
-        allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2));
-        kspace_data.dim.length = trajectory_length;
-        kspace_data.dim.channels = n_coils;
-        image.dim = imgDims;
-        if(has_sense_data == false)
-        {
-          allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2));
-          image.dim.channels = n_coils;
-        }
-        else
+        
+        if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
         {
-          allocate_pinned_memory(&image, imgDims.count() * sizeof(DType2));
-          image.dim.channels = 1;
+            allocate_memory_kspace();
+            allocate_memory_image();
         }
         cudaDeviceSynchronize();
     }
 
     py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> input_image, bool interpolate_data=false)
     {
+        if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
+        {
+            allocate_memory_kspace();
+            allocate_memory_image();
+        }
         // Copy array to pinned memory for better memory bandwidths!
         copyNumpyArray(input_image, image.data);
         if(interpolate_data)
@@ -166,6 +213,11 @@ class GpuNUFFTPythonOperator
         std::complex<DType> *ptr = reinterpret_cast<std::complex<DType>(&)[0]>(*kspace_data.data);
         auto capsule = py::capsule(ptr, [](void *ptr) { return;
         });
+        if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
+        {
+            // Deallocate the memory (only image) to prevent memory leaks!
+            deallocate_memory_image();
+        }
         return py::array_t<std::complex<DType>>(
             { n_coils, trajectory_length },
             {
@@ -178,6 +230,11 @@ class GpuNUFFTPythonOperator
     }
     py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> input_kspace_data, bool grid_data=false)
     {
+        if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
+        {
+            allocate_memory_kspace();
+            allocate_memory_image();
+        }
         gpuNUFFT::Dimensions myDims = imgDims;
         if(dimension==2)
             myDims.depth = 1;
@@ -190,6 +247,11 @@ class GpuNUFFTPythonOperator
         std::complex<DType> *ptr = reinterpret_cast<std::complex<DType>(&)[0]>(*image.data);
         auto capsule = py::capsule(ptr, [](void *ptr) { return;
         });
+        if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
+        {
+            // Deallocate the memory (only k-space) to prevent memory leaks!
+            deallocate_memory_kspace();
+        }
         if(has_sense_data == false)
           return py::array_t<std::complex<DType>>(
             {
@@ -246,10 +308,17 @@ class GpuNUFFTPythonOperator
 };
 PYBIND11_MODULE(gpuNUFFT, m) {
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
-        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool>())
+        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool, MemoryAllocationType>()) // FIXME : Add defaul values!
         .def("op", &GpuNUFFTPythonOperator::op)
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op)
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
+    
+    py::enum_<MemoryAllocationType>(m, "MemoryAllocationType")
+        .value("NEVER_ALLOCATE_MEMORY", MemoryAllocationType::NEVER_ALLOCATE_MEMORY)
+        .value("ALLOCATE_MEMORY_IN_CONSTRUCTOR", MemoryAllocationType::ALLOCATE_MEMORY_IN_CONSTRUCTOR)
+        .value("ALLOCATE_MEMORY_IN_OP", MemoryAllocationType::ALLOCATE_MEMORY_IN_OP)
+        .export_values();
+
 }
 #endif  // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED

From 11ef27d1bbc3c7f70cd89a80a8610c81f9ebb8c2 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 13:25:39 +0100
Subject: [PATCH 44/85] Working added additional optional input

---
 .../gpu/python/gpuNUFFT_operator_python_factory.cpp    | 10 +++++-----
 setup.py                                               |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index b443ccba..8f7e32fe 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -196,7 +196,7 @@ class GpuNUFFTPythonOperator
         cudaDeviceSynchronize();
     }
 
-    py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> input_image, bool interpolate_data=false)
+    py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> input_image, bool interpolate_data, std::optional<py::array_t<std::complex<DType>>> out_kspace)
     {
         if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
         {
@@ -228,7 +228,7 @@ class GpuNUFFTPythonOperator
             capsule
         );
     }
-    py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> input_kspace_data, bool grid_data=false)
+    py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> input_kspace, bool grid_data, std::optional<py::array_t<std::complex<DType>>> out_image)
     {
         if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
         {
@@ -238,7 +238,7 @@ class GpuNUFFTPythonOperator
         gpuNUFFT::Dimensions myDims = imgDims;
         if(dimension==2)
             myDims.depth = 1;
-        copyNumpyArray(input_kspace_data, kspace_data.data);
+        copyNumpyArray(input_kspace, kspace_data.data);
         if(grid_data)
             gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION);
         else
@@ -309,8 +309,8 @@ class GpuNUFFTPythonOperator
 PYBIND11_MODULE(gpuNUFFT, m) {
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
         .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool, MemoryAllocationType>()) // FIXME : Add defaul values!
-        .def("op", &GpuNUFFTPythonOperator::op)
-        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op)
+        .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none())
+        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none())
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
     
diff --git a/setup.py b/setup.py
index 14cc18f1..bd0f53e7 100644
--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.4.3",
+    version="0.5.0",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[

From b7cf9b8c313d896415fe605db9b7f4ba5c71a057 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 13:40:29 +0100
Subject: [PATCH 45/85] Completed coding the entire end to end

---
 .../gpuNUFFT_operator_python_factory.cpp      | 59 +++++++++++++------
 1 file changed, 40 insertions(+), 19 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 8f7e32fe..31253bbc 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -68,6 +68,15 @@ void copyNumpyArray(py::array_t<std::complex<DType>> data, TType *copy_data)
     memcpy(copy_data, my_data, myData.size*sizeof(TType));
 }
 
+template <typename TType>
+void cast_pointer(py::array_t<std::complex<DType>> data, gpuNUFFT::Array<TType> &copy_data)
+{
+    py::buffer_info myData = data.request();
+    std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
+    TType *my_data = reinterpret_cast<TType(&)[0]>(*t_data);
+    copy_data.data = my_data;
+}
+
 enum MemoryAllocationType{
         NEVER_ALLOCATE_MEMORY = 0,
         ALLOCATE_MEMORY_IN_CONSTRUCTOR = 1,
@@ -90,12 +99,6 @@ class GpuNUFFTPythonOperator
         kspace_data.dim.length = trajectory_length;
         kspace_data.dim.channels = n_coils;
     }
-    void deallocate_memory_kspace()
-    {
-        deallocate_pinned_memory(&kspace_data);
-        kspace_data.dim.length = 0;
-        kspace_data.dim.channels = 0;
-    }           
 
     void allocate_memory_image()
     {
@@ -111,14 +114,6 @@ class GpuNUFFTPythonOperator
           image.dim.channels = 1;
         }
     }
-    void deallocate_memory_image()
-    {
-        deallocate_pinned_memory(&image);
-        image.dim.width = 0;
-        image.dim.depth = 0;
-        image.dim.height = 0;
-        image.dim.channels = 0;
-    }
     
     public:
     GpuNUFFTPythonOperator(py::array_t<DType> kspace_loc, py::array_t<int> image_size, int num_coils,
@@ -202,9 +197,21 @@ class GpuNUFFTPythonOperator
         {
             allocate_memory_kspace();
             allocate_memory_image();
+            // Copy array to pinned memory for better memory bandwidths!
+            copyNumpyArray(input_image, image.data);
+        }
+        else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY)
+        {
+            cast_pointer(input_image, image);
+            if(out_kspace.has_value())
+                cast_pointer(out_kspace.value(), kspace_data);
+            else
+            {
+                // We dont have out_kspace allocated. Warn and then allocate
+                py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!");
+                allocate_memory_kspace();
+            }
         }
-        // Copy array to pinned memory for better memory bandwidths!
-        copyNumpyArray(input_image, image.data);
         if(interpolate_data)
             gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
         else
@@ -216,7 +223,7 @@ class GpuNUFFTPythonOperator
         if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
         {
             // Deallocate the memory (only image) to prevent memory leaks!
-            deallocate_memory_image();
+            deallocate_pinned_memory(&image);
         }
         return py::array_t<std::complex<DType>>(
             { n_coils, trajectory_length },
@@ -234,11 +241,25 @@ class GpuNUFFTPythonOperator
         {
             allocate_memory_kspace();
             allocate_memory_image();
+            // Copy array to pinned memory for better memory bandwidths!
+            copyNumpyArray(input_kspace, kspace_data.data);
+        }
+        else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY)
+        {
+            cast_pointer(input_kspace, kspace_data);
+            // Check if we have out image allocated
+            if (out_image.has_value())
+                cast_pointer(out_image.value(), image);
+            else
+            {
+                // We dont have out_image allocated. Warn and then allocate
+                py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!");
+                allocate_memory_image();
+            }
         }
         gpuNUFFT::Dimensions myDims = imgDims;
         if(dimension==2)
             myDims.depth = 1;
-        copyNumpyArray(input_kspace, kspace_data.data);
         if(grid_data)
             gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION);
         else
@@ -250,7 +271,7 @@ class GpuNUFFTPythonOperator
         if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
         {
             // Deallocate the memory (only k-space) to prevent memory leaks!
-            deallocate_memory_kspace();
+            deallocate_pinned_memory(&kspace_data);
         }
         if(has_sense_data == false)
           return py::array_t<std::complex<DType>>(

From 0d3dcf59195f3f3c1690bff9fa8de07161cfebd7 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 13:45:52 +0100
Subject: [PATCH 46/85] Completed coding the entire end to end

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 31253bbc..69fd8d41 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -329,7 +329,9 @@ class GpuNUFFTPythonOperator
 };
 PYBIND11_MODULE(gpuNUFFT, m) {
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
-        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool, MemoryAllocationType>()) // FIXME : Add defaul values!
+        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool, MemoryAllocationType>(),
+            py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_CONSTRUCTOR
+        ) // FIXME : Add defaul values!
         .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none())
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none())
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)

From 3f73176c0b79e2859b3969fe2fbea48da58d8064 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 15:19:25 +0100
Subject: [PATCH 47/85] Added codes

---
 .../gpuNUFFT_operator_python_factory.cpp      | 20 +++++++++++--------
 python/test_file.py                           | 13 ------------
 2 files changed, 12 insertions(+), 21 deletions(-)
 delete mode 100644 python/test_file.py

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 69fd8d41..05b13bd8 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -9,6 +9,7 @@ Carole Lazarus <carole.m.lazarus@gmail.com>
 #define GPUNUFFT_OPERATOR_PYTHON_FACTORY_H_INCLUDED
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
+#include <pybind11/stl.h>
 #include <pybind11/complex.h>
 #include "cufft.h"
 #include "cuda_runtime.h"
@@ -21,7 +22,6 @@ Carole Lazarus <carole.m.lazarus@gmail.com>
 #include <string>
 #include <cuda.h>
 
-
 namespace py = pybind11;
 
 template <typename TType>
@@ -202,7 +202,9 @@ class GpuNUFFTPythonOperator
         }
         else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY)
         {
+            printf("Not allocating memory in op\n");
             cast_pointer(input_image, image);
+            printf("Output kspace : %d\n", out_kspace.has_value());
             if(out_kspace.has_value())
                 cast_pointer(out_kspace.value(), kspace_data);
             else
@@ -210,6 +212,7 @@ class GpuNUFFTPythonOperator
                 // We dont have out_kspace allocated. Warn and then allocate
                 py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!");
                 allocate_memory_kspace();
+                cudaDeviceSynchronize();
             }
         }
         if(interpolate_data)
@@ -327,21 +330,22 @@ class GpuNUFFTPythonOperator
         delete gpuNUFFTOp;
     }
 };
+
 PYBIND11_MODULE(gpuNUFFT, m) {
+    py::enum_<MemoryAllocationType>(m, "MemoryAllocationType")
+        .value("NEVER_ALLOCATE_MEMORY", MemoryAllocationType::NEVER_ALLOCATE_MEMORY)
+        .value("ALLOCATE_MEMORY_IN_CONSTRUCTOR", MemoryAllocationType::ALLOCATE_MEMORY_IN_CONSTRUCTOR)
+        .value("ALLOCATE_MEMORY_IN_OP", MemoryAllocationType::ALLOCATE_MEMORY_IN_OP)
+        .export_values();
+
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
         .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool, MemoryAllocationType>(),
             py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_CONSTRUCTOR
-        ) // FIXME : Add defaul values!
+        )
         .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none())
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none())
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
     
-    py::enum_<MemoryAllocationType>(m, "MemoryAllocationType")
-        .value("NEVER_ALLOCATE_MEMORY", MemoryAllocationType::NEVER_ALLOCATE_MEMORY)
-        .value("ALLOCATE_MEMORY_IN_CONSTRUCTOR", MemoryAllocationType::ALLOCATE_MEMORY_IN_CONSTRUCTOR)
-        .value("ALLOCATE_MEMORY_IN_OP", MemoryAllocationType::ALLOCATE_MEMORY_IN_OP)
-        .export_values();
-
 }
 #endif  // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED
diff --git a/python/test_file.py b/python/test_file.py
deleted file mode 100644
index e150184f..00000000
--- a/python/test_file.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import numpy as np
-from mri.operators import NonCartesianFFT
-from mri.operators.fourier.utils import estimate_density_compensation
-traj = np.load('/volatile/temp_traj.npy')
-
-
-for i in range(1):
-    dens = estimate_density_compensation(traj, (384, 384, 208))
-    fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=2, density_comp=dens)
-    print(i)
-    K = fourier.op(np.zeros((384, 384, 208)))
-    I = fourier.adj_op(K)
-    del fourier

From 04b420707112bd6917fb8039b1f793bdb66a3a3b Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 15:24:46 +0100
Subject: [PATCH 48/85] Fixes

---
 .../gpuNUFFT_operator_python_factory.cpp      | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 05b13bd8..236546da 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -96,23 +96,16 @@ class GpuNUFFTPythonOperator
     void allocate_memory_kspace()
     {
         allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2));
-        kspace_data.dim.length = trajectory_length;
-        kspace_data.dim.channels = n_coils;
+        
     }
 
     void allocate_memory_image()
     {
         image.dim = imgDims;
         if(has_sense_data == false)
-        {
           allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2));
-          image.dim.channels = n_coils;
-        }
         else
-        {
           allocate_pinned_memory(&image, imgDims.count() * sizeof(DType2));
-          image.dim.channels = 1;
-        }
     }
     
     public:
@@ -142,7 +135,9 @@ class GpuNUFFTPythonOperator
             imgDims.depth = 0;
 
         n_coils = num_coils;
-
+        kspace_data.dim.length = trajectory_length;
+        kspace_data.dim.channels = n_coils;
+        
         // sensitivity maps
         py::buffer_info sense_maps_buffer = sense_maps.request();
         if (sense_maps_buffer.shape.size()==0)
@@ -182,7 +177,11 @@ class GpuNUFFTPythonOperator
         gpuNUFFTOp = factory.createGpuNUFFTOperator(
             kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width,
             osr, imgDims);
-        
+
+        if(has_sense_data == false)
+            image.dim.channels = n_coils;
+        else
+            image.dim.channels = 1;
         if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
         {
             allocate_memory_kspace();

From 7d17703e99cb13b0c3ffcbb261b1b6c2020eda47 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 15:29:29 +0100
Subject: [PATCH 49/85] fixes

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 236546da..46c4d4b5 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -96,7 +96,6 @@ class GpuNUFFTPythonOperator
     void allocate_memory_kspace()
     {
         allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2));
-        
     }
 
     void allocate_memory_image()
@@ -135,9 +134,12 @@ class GpuNUFFTPythonOperator
             imgDims.depth = 0;
 
         n_coils = num_coils;
+
+        // Setup all the sizes
         kspace_data.dim.length = trajectory_length;
-        kspace_data.dim.channels = n_coils;
-        
+        kspace_data.dim.channels = num_coils;
+        image.dim = imgDims;
+
         // sensitivity maps
         py::buffer_info sense_maps_buffer = sense_maps.request();
         if (sense_maps_buffer.shape.size()==0)

From 08059e7cd650fc0d8ca25df731726a49f3fccfed Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 15:36:02 +0100
Subject: [PATCH 50/85] major fixes

---
 .../python/gpuNUFFT_operator_python_factory.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 46c4d4b5..93fffc65 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -100,7 +100,6 @@ class GpuNUFFTPythonOperator
 
     void allocate_memory_image()
     {
-        image.dim = imgDims;
         if(has_sense_data == false)
           allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2));
         else
@@ -172,8 +171,8 @@ class GpuNUFFTPythonOperator
                 sensArray.dim = imgDims;
                 sensArray.dim.channels = n_coils;
                 copyNumpyArray(sense_maps, sensArray.data);
-                has_sense_data = true;
             }
+            has_sense_data = true;
         }
         factory.setBalanceWorkload(balance_workload);
         gpuNUFFTOp = factory.createGpuNUFFTOperator(
@@ -198,8 +197,6 @@ class GpuNUFFTPythonOperator
         {
             allocate_memory_kspace();
             allocate_memory_image();
-            // Copy array to pinned memory for better memory bandwidths!
-            copyNumpyArray(input_image, image.data);
         }
         else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY)
         {
@@ -216,6 +213,11 @@ class GpuNUFFTPythonOperator
                 cudaDeviceSynchronize();
             }
         }
+        if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
+        {
+            // Copy array to pinned memory for better memory bandwidths!
+            copyNumpyArray(input_image, image.data);
+        }
         if(interpolate_data)
             gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
         else
@@ -245,8 +247,6 @@ class GpuNUFFTPythonOperator
         {
             allocate_memory_kspace();
             allocate_memory_image();
-            // Copy array to pinned memory for better memory bandwidths!
-            copyNumpyArray(input_kspace, kspace_data.data);
         }
         else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY)
         {
@@ -261,6 +261,11 @@ class GpuNUFFTPythonOperator
                 allocate_memory_image();
             }
         }
+        if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
+        {
+            // Copy array to pinned memory for better memory bandwidths!
+            copyNumpyArray(input_kspace, kspace_data.data);
+        }
         gpuNUFFT::Dimensions myDims = imgDims;
         if(dimension==2)
             myDims.depth = 1;

From 10d461822962b5b6f13117464b3ccd8c8bf19174 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 15:42:55 +0100
Subject: [PATCH 51/85] Fix density comp

---
 .../gpu/python/gpuNUFFT_operator_python_factory.cpp   | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 93fffc65..ab248a31 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -108,7 +108,7 @@ class GpuNUFFTPythonOperator
     
     public:
     GpuNUFFTPythonOperator(py::array_t<DType> kspace_loc, py::array_t<int> image_size, int num_coils,
-    py::array_t<std::complex<DType>> sense_maps,  py::array_t<float> density_comp, int kernel_width=3,
+    py::array_t<std::complex<DType>> sense_maps,  std::optional<py::array_t<float>> density_comp, int kernel_width=3,
     int sector_width=8, int osr=2, bool balance_workload=1, MemoryAllocationType when_allocate_memory=ALLOCATE_MEMORY_IN_CONSTRUCTOR) : when_allocate_memory(when_allocate_memory)
     {
         // k-space coordinates
@@ -119,8 +119,13 @@ class GpuNUFFTPythonOperator
         kSpaceTraj.dim.length = trajectory_length;
 
         // density compensation weights
-        gpuNUFFT::Array<DType> density_compArray = readNumpyArray(density_comp);
-        density_compArray.dim.length = trajectory_length;
+        gpuNUFFT::Array<DType> density_compArray;
+        if(density_comp.has_value())
+        {
+            density_compArray = readNumpyArray(density_comp.value());
+            density_compArray.dim.length = trajectory_length;
+            // No need else as the init is by default with 0 length and density comp is not applied
+        }
 
         // image size
         py::buffer_info img_dim = image_size.request();

From e656fc40ba95393521264859012378d24bc04a6f Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 15:57:55 +0100
Subject: [PATCH 52/85] All fixes

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index ab248a31..beedd73b 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -108,7 +108,7 @@ class GpuNUFFTPythonOperator
     
     public:
     GpuNUFFTPythonOperator(py::array_t<DType> kspace_loc, py::array_t<int> image_size, int num_coils,
-    py::array_t<std::complex<DType>> sense_maps,  std::optional<py::array_t<float>> density_comp, int kernel_width=3,
+    py::array_t<std::complex<DType>> sense_maps,  std::optional<py::array_t<DType>> density_comp, int kernel_width=3,
     int sector_width=8, int osr=2, bool balance_workload=1, MemoryAllocationType when_allocate_memory=ALLOCATE_MEMORY_IN_CONSTRUCTOR) : when_allocate_memory(when_allocate_memory)
     {
         // k-space coordinates
@@ -350,7 +350,7 @@ PYBIND11_MODULE(gpuNUFFT, m) {
         .export_values();
 
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
-        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<float>, int, int, int, bool, MemoryAllocationType>(),
+        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, std::optional<py::array_t<DType>>, int, int, int, bool, MemoryAllocationType>(),
             py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_CONSTRUCTOR
         )
         .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none())

From 4d993b1752713e2f8010cd0cbbcba78dfdd7ef15 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 16:03:42 +0100
Subject: [PATCH 53/85] Fix free

---
 .../gpu/python/gpuNUFFT_operator_python_factory.cpp    | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index beedd73b..ac0c2c84 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -205,9 +205,7 @@ class GpuNUFFTPythonOperator
         }
         else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY)
         {
-            printf("Not allocating memory in op\n");
             cast_pointer(input_image, image);
-            printf("Output kspace : %d\n", out_kspace.has_value());
             if(out_kspace.has_value())
                 cast_pointer(out_kspace.value(), kspace_data);
             else
@@ -336,8 +334,12 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
-        cudaFreeHost(kspace_data.data);
-        cudaFreeHost(image.data);
+        if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
+        {
+            cudaFreeHost(kspace_data.data);
+            cudaFreeHost(image.data);
+        
+        }
         delete gpuNUFFTOp;
     }
 };

From 57d0160f18dae672ce07e78331b83e6ba458e8de Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 20 Nov 2023 16:04:00 +0100
Subject: [PATCH 54/85] added tests

---
 python/test_mem.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 python/test_mem.py

diff --git a/python/test_mem.py b/python/test_mem.py
new file mode 100644
index 00000000..80109902
--- /dev/null
+++ b/python/test_mem.py
@@ -0,0 +1,31 @@
+"""Script to test gpuNUFFT wrapper.
+Authors:
+Chaithya G R <chaithyagr@gmail.com>
+"""
+
+import numpy as np
+from gpuNUFFT import NUFFTOp, MemoryAllocationType
+import pytest
+
+
+def test_memory_allocation_types():
+    kspace_loc = np.random.random((5000, 3)) - 0.5
+    img_size = [256, 256, 256]
+    n_coils = 1
+    image = np.random.random(img_size) + 1j * np.random.random(img_size)
+    kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0]))
+    kspace_out = []
+    images_out = []
+    for mem_allocation_type in list(MemoryAllocationType.__members__.values()):
+        nufft_op = NUFFTOp(
+            kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32),
+            image_size=img_size,
+            num_coils=n_coils,
+            when_allocate_memory=mem_allocation_type,
+        )
+        kspace_out.append(nufft_op.op(input_image=image))
+        images_out.append(nufft_op.adj_op(kspace))
+    kspace_out
+    images_out
+    images_out
+    
\ No newline at end of file

From 212469371d9b93071cbd3227b6eaaca8cb40dbd4 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 21 Nov 2023 10:09:42 +0100
Subject: [PATCH 55/85] Temp fixes

---
 .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 13 ++++++-------
 python/test_mem.py                                  | 11 +++++++----
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index ac0c2c84..85cb8573 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -213,7 +213,6 @@ class GpuNUFFTPythonOperator
                 // We dont have out_kspace allocated. Warn and then allocate
                 py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!");
                 allocate_memory_kspace();
-                cudaDeviceSynchronize();
             }
         }
         if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
@@ -334,12 +333,12 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
-        if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
-        {
-            cudaFreeHost(kspace_data.data);
-            cudaFreeHost(image.data);
-        
-        }
+        py::print("Destructor called :: ", when_allocate_memory);
+        // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
+        // {
+        //     cudaFreeHost(kspace_data.data);
+        //     cudaFreeHost(image.data);
+        // }
         delete gpuNUFFTOp;
     }
 };
diff --git a/python/test_mem.py b/python/test_mem.py
index 80109902..17f9733f 100644
--- a/python/test_mem.py
+++ b/python/test_mem.py
@@ -16,15 +16,18 @@ def test_memory_allocation_types():
     kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0]))
     kspace_out = []
     images_out = []
+    nufft_ops = []
     for mem_allocation_type in list(MemoryAllocationType.__members__.values()):
-        nufft_op = NUFFTOp(
+        nufft_ops.append(NUFFTOp(
             kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32),
             image_size=img_size,
             num_coils=n_coils,
             when_allocate_memory=mem_allocation_type,
-        )
-        kspace_out.append(nufft_op.op(input_image=image))
-        images_out.append(nufft_op.adj_op(kspace))
+        ))
+        kspace_out.append(nufft_ops[-1].op(input_image=image))
+        images_out.append(nufft_ops[-1].adj_op(input_kspace=kspace))
+        if len(nufft_ops) > 1:
+            del nufft_ops[-2]
     kspace_out
     images_out
     images_out

From e9c9fef0c444ea68cbd9aca748c182a6077ec756 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 21 Nov 2023 10:20:36 +0100
Subject: [PATCH 56/85] Test

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 85cb8573..89bec316 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -334,10 +334,11 @@ class GpuNUFFTPythonOperator
     ~GpuNUFFTPythonOperator()
     {
         py::print("Destructor called :: ", when_allocate_memory);
+        // We cant deallocate as we could have passed the memory to python!
         // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
         // {
-        //     cudaFreeHost(kspace_data.data);
-        //     cudaFreeHost(image.data);
+        //     deallocate_pinned_memory(&kspace_data);
+        //     deallocate_pinned_memory(&image);
         // }
         delete gpuNUFFTOp;
     }

From 38cefe94bc3359b347731d141011367d70a5ea94 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 21 Nov 2023 10:44:28 +0100
Subject: [PATCH 57/85] Added memory warnings

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 89bec316..d6483950 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -190,6 +190,10 @@ class GpuNUFFTPythonOperator
             image.dim.channels = 1;
         if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
         {
+            py::print("
+                WARNING: Allocation in Memory will be deprecated in futurte due to memory handeling issues.\n 
+                Please consider providing pinned memory yourself for speed and efficiency"
+            );
             allocate_memory_kspace();
             allocate_memory_image();
         }
@@ -335,6 +339,7 @@ class GpuNUFFTPythonOperator
     {
         py::print("Destructor called :: ", when_allocate_memory);
         // We cant deallocate as we could have passed the memory to python!
+        // FIXME, we will no longer support this!
         // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
         // {
         //     deallocate_pinned_memory(&kspace_data);

From 56298f83b631d48cfa0b4f5aa0ffb7a5aefceda1 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 21 Nov 2023 11:15:46 +0100
Subject: [PATCH 58/85] Added additional tests, just before removing all
 options!

---
 .../gpuNUFFT_operator_python_factory.cpp      | 11 ++---
 python/test_mem.py                            | 48 +++++++++++++++----
 python/test_nufftOp.py                        |  1 -
 3 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index d6483950..a5317fa5 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -190,10 +190,10 @@ class GpuNUFFTPythonOperator
             image.dim.channels = 1;
         if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
         {
-            py::print("
-                WARNING: Allocation in Memory will be deprecated in futurte due to memory handeling issues.\n 
-                Please consider providing pinned memory yourself for speed and efficiency"
-            );
+            py::print(" \
+                WARNING: Allocation in Memory will be deprecated in futurte due to memory handeling issues.\
+                \nPlease consider providing pinned memory yourself for speed and efficiency\
+            ");
             allocate_memory_kspace();
             allocate_memory_image();
         }
@@ -337,7 +337,6 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
-        py::print("Destructor called :: ", when_allocate_memory);
         // We cant deallocate as we could have passed the memory to python!
         // FIXME, we will no longer support this!
         // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
@@ -358,7 +357,7 @@ PYBIND11_MODULE(gpuNUFFT, m) {
 
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
         .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, std::optional<py::array_t<DType>>, int, int, int, bool, MemoryAllocationType>(),
-            py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_CONSTRUCTOR
+            py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_OP
         )
         .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none())
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none())
diff --git a/python/test_mem.py b/python/test_mem.py
index 17f9733f..93adb852 100644
--- a/python/test_mem.py
+++ b/python/test_mem.py
@@ -8,6 +8,8 @@
 import pytest
 
 
+
+
 def test_memory_allocation_types():
     kspace_loc = np.random.random((5000, 3)) - 0.5
     img_size = [256, 256, 256]
@@ -16,19 +18,49 @@ def test_memory_allocation_types():
     kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0]))
     kspace_out = []
     images_out = []
-    nufft_ops = []
+    nufft_op = []
     for mem_allocation_type in list(MemoryAllocationType.__members__.values()):
-        nufft_ops.append(NUFFTOp(
+        nufft_op = NUFFTOp(
             kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32),
             image_size=img_size,
             num_coils=n_coils,
             when_allocate_memory=mem_allocation_type,
-        ))
-        kspace_out.append(nufft_ops[-1].op(input_image=image))
-        images_out.append(nufft_ops[-1].adj_op(input_kspace=kspace))
-        if len(nufft_ops) > 1:
-            del nufft_ops[-2]
+        )
+        kspace_out.append(nufft_op.op(input_image=image))
+        images_out.append(nufft_op.adj_op(input_kspace=kspace))
+        del nufft_op
     kspace_out
     images_out
     images_out
-    
\ No newline at end of file
+    
+    
+def test_pinned_memory_provided():
+    import cupyx as cpx
+    
+    kspace_loc = np.random.random((5000, 3)) - 0.5
+    img_size = [256, 256, 256]
+    n_coils = 1
+    image = np.random.random(img_size) + 1j * np.random.random(img_size)
+    kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0]))
+    
+    image_out = cpx.empty_like_pinned(image)
+    kspace_out = cpx.empty_like_pinned(kspace)
+    
+    nufft_ori = NUFFTOp(
+        kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32),
+        image_size=img_size,
+        num_coils=n_coils,
+        when_allocate_memory=MemoryAllocationType.ALLOCATE_MEMORY_IN_OP,
+    )
+    ori_kspace_out = nufft_ori.op(input_image=image)
+    ori_image_out = nufft_ori.adj_op(input_kspace=kspace)
+    
+    nufft_op = NUFFTOp(
+        kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32),
+        image_size=img_size,
+        num_coils=n_coils,
+        when_allocate_memory=MemoryAllocationType.NEVER_ALLOCATE_MEMORY,
+    )
+    out_ksp = nufft_op.op(input_image=image, out_kspace=kspace_out)
+    out_im = nufft_op.adj_op(input_kspace=kspace, out_image=image_out)
+    out_ksp
\ No newline at end of file
diff --git a/python/test_nufftOp.py b/python/test_nufftOp.py
index 4d7a1387..8e272fdb 100644
--- a/python/test_nufftOp.py
+++ b/python/test_nufftOp.py
@@ -5,7 +5,6 @@
 """
 
 import numpy as np
-import numpy.matlib
 import matplotlib.pyplot as plt
 from gpuNUFFT import NUFFTOp
 import unittest

From 436a875152ef8478d58cc30220545a15219b9e9b Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 21 Nov 2023 11:21:18 +0100
Subject: [PATCH 59/85] Added additional tests, just before removing all
 options!

---
 .../gpuNUFFT_operator_python_factory.cpp      | 105 +++---------------
 1 file changed, 13 insertions(+), 92 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index a5317fa5..4cfb4e1e 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -77,11 +77,6 @@ void cast_pointer(py::array_t<std::complex<DType>> data, gpuNUFFT::Array<TType>
     copy_data.data = my_data;
 }
 
-enum MemoryAllocationType{
-        NEVER_ALLOCATE_MEMORY = 0,
-        ALLOCATE_MEMORY_IN_CONSTRUCTOR = 1,
-        ALLOCATE_MEMORY_IN_OP = 2
-    };
 
 class GpuNUFFTPythonOperator
 {
@@ -89,7 +84,6 @@ class GpuNUFFTPythonOperator
     gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp;
     int trajectory_length, n_coils, dimension;
     bool has_sense_data;
-    MemoryAllocationType when_allocate_memory;
     gpuNUFFT::Dimensions imgDims;
     // sensitivity maps
     gpuNUFFT::Array<DType2> sensArray, kspace_data, image;
@@ -109,7 +103,7 @@ class GpuNUFFTPythonOperator
     public:
     GpuNUFFTPythonOperator(py::array_t<DType> kspace_loc, py::array_t<int> image_size, int num_coils,
     py::array_t<std::complex<DType>> sense_maps,  std::optional<py::array_t<DType>> density_comp, int kernel_width=3,
-    int sector_width=8, int osr=2, bool balance_workload=1, MemoryAllocationType when_allocate_memory=ALLOCATE_MEMORY_IN_CONSTRUCTOR) : when_allocate_memory(when_allocate_memory)
+    int sector_width=8, int osr=2, bool balance_workload=1) 
     {
         // k-space coordinates
         py::buffer_info sample_loc = kspace_loc.request();
@@ -188,42 +182,13 @@ class GpuNUFFTPythonOperator
             image.dim.channels = n_coils;
         else
             image.dim.channels = 1;
-        if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
-        {
-            py::print(" \
-                WARNING: Allocation in Memory will be deprecated in futurte due to memory handeling issues.\
-                \nPlease consider providing pinned memory yourself for speed and efficiency\
-            ");
-            allocate_memory_kspace();
-            allocate_memory_image();
-        }
         cudaDeviceSynchronize();
     }
 
-    py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> input_image, bool interpolate_data, std::optional<py::array_t<std::complex<DType>>> out_kspace)
+    py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> in_image, py::array_t<std::complex<DType>> out_kspace, bool interpolate_data)
     {
-        if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
-        {
-            allocate_memory_kspace();
-            allocate_memory_image();
-        }
-        else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY)
-        {
-            cast_pointer(input_image, image);
-            if(out_kspace.has_value())
-                cast_pointer(out_kspace.value(), kspace_data);
-            else
-            {
-                // We dont have out_kspace allocated. Warn and then allocate
-                py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!");
-                allocate_memory_kspace();
-            }
-        }
-        if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
-        {
-            // Copy array to pinned memory for better memory bandwidths!
-            copyNumpyArray(input_image, image.data);
-        }
+        cast_pointer(in_image, image);
+        cast_pointer(out_kspace.value(), kspace_data);
         if(interpolate_data)
             gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
         else
@@ -232,11 +197,6 @@ class GpuNUFFTPythonOperator
         std::complex<DType> *ptr = reinterpret_cast<std::complex<DType>(&)[0]>(*kspace_data.data);
         auto capsule = py::capsule(ptr, [](void *ptr) { return;
         });
-        if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
-        {
-            // Deallocate the memory (only image) to prevent memory leaks!
-            deallocate_pinned_memory(&image);
-        }
         return py::array_t<std::complex<DType>>(
             { n_coils, trajectory_length },
             {
@@ -247,31 +207,10 @@ class GpuNUFFTPythonOperator
             capsule
         );
     }
-    py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> input_kspace, bool grid_data, std::optional<py::array_t<std::complex<DType>>> out_image)
+    py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> in_kspace, py::array_t<std::complex<DType>> out_image, bool grid_data)
     {
-        if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
-        {
-            allocate_memory_kspace();
-            allocate_memory_image();
-        }
-        else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY)
-        {
-            cast_pointer(input_kspace, kspace_data);
-            // Check if we have out image allocated
-            if (out_image.has_value())
-                cast_pointer(out_image.value(), image);
-            else
-            {
-                // We dont have out_image allocated. Warn and then allocate
-                py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!");
-                allocate_memory_image();
-            }
-        }
-        if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
-        {
-            // Copy array to pinned memory for better memory bandwidths!
-            copyNumpyArray(input_kspace, kspace_data.data);
-        }
+        cast_pointer(in_kspace, kspace_data);
+        cast_pointer(out_image.value(), image);
         gpuNUFFT::Dimensions myDims = imgDims;
         if(dimension==2)
             myDims.depth = 1;
@@ -283,11 +222,6 @@ class GpuNUFFTPythonOperator
         std::complex<DType> *ptr = reinterpret_cast<std::complex<DType>(&)[0]>(*image.data);
         auto capsule = py::capsule(ptr, [](void *ptr) { return;
         });
-        if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP)
-        {
-            // Deallocate the memory (only k-space) to prevent memory leaks!
-            deallocate_pinned_memory(&kspace_data);
-        }
         if(has_sense_data == false)
           return py::array_t<std::complex<DType>>(
             {
@@ -320,12 +254,13 @@ class GpuNUFFTPythonOperator
             ptr,
             capsule
       );
-
     }
+
     void clean_memory()
     {
        gpuNUFFTOp->clean_memory();
     }
+
     void set_smaps(py::array_t<std::complex<DType>> sense_maps)
     {
         py::buffer_info myData = sense_maps.request();
@@ -337,30 +272,16 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
-        // We cant deallocate as we could have passed the memory to python!
-        // FIXME, we will no longer support this!
-        // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR)
-        // {
-        //     deallocate_pinned_memory(&kspace_data);
-        //     deallocate_pinned_memory(&image);
-        // }
         delete gpuNUFFTOp;
     }
 };
 
 PYBIND11_MODULE(gpuNUFFT, m) {
-    py::enum_<MemoryAllocationType>(m, "MemoryAllocationType")
-        .value("NEVER_ALLOCATE_MEMORY", MemoryAllocationType::NEVER_ALLOCATE_MEMORY)
-        .value("ALLOCATE_MEMORY_IN_CONSTRUCTOR", MemoryAllocationType::ALLOCATE_MEMORY_IN_CONSTRUCTOR)
-        .value("ALLOCATE_MEMORY_IN_OP", MemoryAllocationType::ALLOCATE_MEMORY_IN_OP)
-        .export_values();
-
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
-        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, std::optional<py::array_t<DType>>, int, int, int, bool, MemoryAllocationType>(),
-            py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_OP
-        )
-        .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none())
-        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none())
+        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, std::optional<py::array_t<DType>>, int, int, int, bool>(),
+            py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true)
+        .def("op", &GpuNUFFTPythonOperator::op, py::arg("in_image"), py::arg("out_kspace"), py::arg("interpolate_data") = false)
+        .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::arg("in_kspace"), py::arg("out_image"),  py::arg("grid_data") = false)
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
     

From a5549262436ecefceb17c31d21261f6a9f186b79 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 21 Nov 2023 11:34:10 +0100
Subject: [PATCH 60/85] Working with warnings

---
 .../gpuNUFFT_operator_python_factory.cpp      | 50 +++++++++----------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 4cfb4e1e..ef4fee0f 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -46,6 +46,21 @@ readNumpyArray(py::array_t<std::complex<DType>> data)
     return dataArray;
 }
 
+
+void warn_pinned_memory(py::array_t<std::complex<DType>> array)
+{
+    py::buffer_info buffer = array.request();
+    cudaPointerAttributes attr;
+    if(DEBUG)
+        printf("Value of sense_maps pointer == 0x%x\n", buffer.ptr);
+    cudaPointerGetAttributes(&attr, buffer.ptr);
+    if(DEBUG)
+        printf("Value of attr.cudaMemoryType2 = %d\n", attr.type);
+    bool is_pinned_memory = attr.type ==  cudaMemoryTypeHost;
+    if(!is_pinned_memory)
+        py::print("WARNING:: The data is NOT pinned! This will be slow, consider pinning\n");
+}
+
 void allocate_pinned_memory(gpuNUFFT::Array<DType2> *lin_array, unsigned long int size)
 {
   DType2 *new_data;
@@ -69,15 +84,19 @@ void copyNumpyArray(py::array_t<std::complex<DType>> data, TType *copy_data)
 }
 
 template <typename TType>
-void cast_pointer(py::array_t<std::complex<DType>> data, gpuNUFFT::Array<TType> &copy_data)
+void cast_pointer(py::array_t<std::complex<DType>> data, gpuNUFFT::Array<TType> &copy_data, bool warn=true)
 {
     py::buffer_info myData = data.request();
     std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
     TType *my_data = reinterpret_cast<TType(&)[0]>(*t_data);
     copy_data.data = my_data;
+    if (warn)
+        warn_pinned_memory(data);
 }
 
 
+
+
 class GpuNUFFTPythonOperator
 {
     gpuNUFFT::GpuNUFFTOperatorFactory factory;
@@ -147,30 +166,7 @@ class GpuNUFFTPythonOperator
         }
         else
         {
-            cudaPointerAttributes attr;
-            if(DEBUG)
-                printf("Value of sense_maps pointer == 0x%x or %d\n", sense_maps_buffer.ptr, sense_maps_buffer.ptr);
-            cudaPointerGetAttributes(&attr, sense_maps_buffer.ptr);
-            if(DEBUG)
-                printf("Value of attr.cudaMemoryType2 = %d\n", attr.type);
-            bool is_pinned_memory = attr.type ==  cudaMemoryTypeHost;
-            if(is_pinned_memory)
-            {
-                if(DEBUG)
-                    printf("The smaps data is pinned!, skipping copies\n");
-                // Just map the memory to sensArray! We dont need to make a copy if the memory is already pinned
-                std::complex<DType> *t_data = (std::complex<DType> *) sense_maps_buffer.ptr;
-                sensArray.data = reinterpret_cast<DType2(&)[0]>(*t_data);
-            }
-            else
-            {
-                if(DEBUG)
-                    printf("The smaps data is NOT pinned!, DOING copies\n");
-                allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2));
-                sensArray.dim = imgDims;
-                sensArray.dim.channels = n_coils;
-                copyNumpyArray(sense_maps, sensArray.data);
-            }
+            cast_pointer(sense_maps, sensArray);
             has_sense_data = true;
         }
         factory.setBalanceWorkload(balance_workload);
@@ -188,7 +184,7 @@ class GpuNUFFTPythonOperator
     py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> in_image, py::array_t<std::complex<DType>> out_kspace, bool interpolate_data)
     {
         cast_pointer(in_image, image);
-        cast_pointer(out_kspace.value(), kspace_data);
+        cast_pointer(out_kspace, kspace_data);
         if(interpolate_data)
             gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
         else
@@ -210,7 +206,7 @@ class GpuNUFFTPythonOperator
     py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> in_kspace, py::array_t<std::complex<DType>> out_image, bool grid_data)
     {
         cast_pointer(in_kspace, kspace_data);
-        cast_pointer(out_image.value(), image);
+        cast_pointer(out_image, image);
         gpuNUFFT::Dimensions myDims = imgDims;
         if(dimension==2)
             myDims.depth = 1;

From c71792cd8e741ff2ff112b86486cd26a48779e6f Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 21 Nov 2023 12:51:26 +0100
Subject: [PATCH 61/85] Committed

---
 .../gpuNUFFT_operator_python_factory.cpp      | 27 ++++++++++---------
 python/test_mem.py                            | 19 ++++---------
 2 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index ef4fee0f..8e8f9bbd 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -21,6 +21,7 @@ Carole Lazarus <carole.m.lazarus@gmail.com>
 #include <vector>     // std::vector
 #include <string>
 #include <cuda.h>
+#define CAST_POINTER_VARNAME(x, y)   cast_pointer(x, y, #x)
 
 namespace py = pybind11;
 
@@ -47,18 +48,18 @@ readNumpyArray(py::array_t<std::complex<DType>> data)
 }
 
 
-void warn_pinned_memory(py::array_t<std::complex<DType>> array)
+void warn_pinned_memory(py::array_t<std::complex<DType>> array, const char * name)
 {
     py::buffer_info buffer = array.request();
     cudaPointerAttributes attr;
-    if(DEBUG)
-        printf("Value of sense_maps pointer == 0x%x\n", buffer.ptr);
+    // if(DEBUG)
+        printf("%s => Value of pointer == 0x%x\n", name, buffer.ptr);
     cudaPointerGetAttributes(&attr, buffer.ptr);
-    if(DEBUG)
-        printf("Value of attr.cudaMemoryType2 = %d\n", attr.type);
+    //if(DEBUG)
+        printf("%s => of attr.cudaMemoryType = %d\n", name, attr.type);
     bool is_pinned_memory = attr.type ==  cudaMemoryTypeHost;
     if(!is_pinned_memory)
-        py::print("WARNING:: The data is NOT pinned! This will be slow, consider pinning\n");
+        py::print("WARNING:: The data", name , "is NOT pinned! This will be slow, consider pinning\n");
 }
 
 void allocate_pinned_memory(gpuNUFFT::Array<DType2> *lin_array, unsigned long int size)
@@ -84,14 +85,14 @@ void copyNumpyArray(py::array_t<std::complex<DType>> data, TType *copy_data)
 }
 
 template <typename TType>
-void cast_pointer(py::array_t<std::complex<DType>> data, gpuNUFFT::Array<TType> &copy_data, bool warn=true)
+void cast_pointer(py::array_t<std::complex<DType>> data, gpuNUFFT::Array<TType> &copy_data, const char * name , bool warn=true)
 {
     py::buffer_info myData = data.request();
     std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
     TType *my_data = reinterpret_cast<TType(&)[0]>(*t_data);
     copy_data.data = my_data;
     if (warn)
-        warn_pinned_memory(data);
+        warn_pinned_memory(data, name);
 }
 
 
@@ -166,7 +167,7 @@ class GpuNUFFTPythonOperator
         }
         else
         {
-            cast_pointer(sense_maps, sensArray);
+            CAST_POINTER_VARNAME(sense_maps, sensArray);
             has_sense_data = true;
         }
         factory.setBalanceWorkload(balance_workload);
@@ -183,8 +184,8 @@ class GpuNUFFTPythonOperator
 
     py::array_t<std::complex<DType>> op(py::array_t<std::complex<DType>> in_image, py::array_t<std::complex<DType>> out_kspace, bool interpolate_data)
     {
-        cast_pointer(in_image, image);
-        cast_pointer(out_kspace, kspace_data);
+        CAST_POINTER_VARNAME(in_image, image);
+        CAST_POINTER_VARNAME(out_kspace, kspace_data);
         if(interpolate_data)
             gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
         else
@@ -205,8 +206,8 @@ class GpuNUFFTPythonOperator
     }
     py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> in_kspace, py::array_t<std::complex<DType>> out_image, bool grid_data)
     {
-        cast_pointer(in_kspace, kspace_data);
-        cast_pointer(out_image, image);
+        CAST_POINTER_VARNAME(in_kspace, kspace_data);
+        CAST_POINTER_VARNAME(out_image, image);
         gpuNUFFT::Dimensions myDims = imgDims;
         if(dimension==2)
             myDims.depth = 1;
diff --git a/python/test_mem.py b/python/test_mem.py
index 93adb852..880f521e 100644
--- a/python/test_mem.py
+++ b/python/test_mem.py
@@ -4,7 +4,7 @@
 """
 
 import numpy as np
-from gpuNUFFT import NUFFTOp, MemoryAllocationType
+from gpuNUFFT import NUFFTOp
 import pytest
 
 
@@ -43,24 +43,15 @@ def test_pinned_memory_provided():
     image = np.random.random(img_size) + 1j * np.random.random(img_size)
     kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0]))
     
-    image_out = cpx.empty_like_pinned(image)
-    kspace_out = cpx.empty_like_pinned(kspace)
+    image_out = cpx.zeros_like_pinned(image)
+    kspace_out = cpx.zeros_like_pinned(kspace)
     
-    nufft_ori = NUFFTOp(
-        kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32),
-        image_size=img_size,
-        num_coils=n_coils,
-        when_allocate_memory=MemoryAllocationType.ALLOCATE_MEMORY_IN_OP,
-    )
-    ori_kspace_out = nufft_ori.op(input_image=image)
-    ori_image_out = nufft_ori.adj_op(input_kspace=kspace)
     
     nufft_op = NUFFTOp(
         kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32),
         image_size=img_size,
         num_coils=n_coils,
-        when_allocate_memory=MemoryAllocationType.NEVER_ALLOCATE_MEMORY,
     )
-    out_ksp = nufft_op.op(input_image=image, out_kspace=kspace_out)
-    out_im = nufft_op.adj_op(input_kspace=kspace, out_image=image_out)
+    out_ksp = nufft_op.op(in_image=image, out_kspace=kspace_out)
+    out_im = nufft_op.adj_op(in_kspace=kspace, out_image=image_out)
     out_ksp
\ No newline at end of file

From 7cdeaff6d577175f2548d6a2137d8b4b3e248dee Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 21 Nov 2023 16:00:25 +0100
Subject: [PATCH 62/85] Added codes

---
 .../gpu/python/gpuNUFFT_operator_python_factory.cpp    |  4 ++--
 python/test_mem.py                                     | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 8e8f9bbd..8a8992b6 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -52,10 +52,10 @@ void warn_pinned_memory(py::array_t<std::complex<DType>> array, const char * nam
 {
     py::buffer_info buffer = array.request();
     cudaPointerAttributes attr;
-    // if(DEBUG)
+    if(DEBUG)
         printf("%s => Value of pointer == 0x%x\n", name, buffer.ptr);
     cudaPointerGetAttributes(&attr, buffer.ptr);
-    //if(DEBUG)
+    if(DEBUG)
         printf("%s => of attr.cudaMemoryType = %d\n", name, attr.type);
     bool is_pinned_memory = attr.type ==  cudaMemoryTypeHost;
     if(!is_pinned_memory)
diff --git a/python/test_mem.py b/python/test_mem.py
index 880f521e..2c373ae7 100644
--- a/python/test_mem.py
+++ b/python/test_mem.py
@@ -40,18 +40,18 @@ def test_pinned_memory_provided():
     kspace_loc = np.random.random((5000, 3)) - 0.5
     img_size = [256, 256, 256]
     n_coils = 1
-    image = np.random.random(img_size) + 1j * np.random.random(img_size)
-    kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0]))
+    image = (np.random.random(img_size) + 1j * np.random.random(img_size)).astype(np.complex64)
+    kspace = (np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0]))).astype(np.complex64)
     
     image_out = cpx.zeros_like_pinned(image)
     kspace_out = cpx.zeros_like_pinned(kspace)
-    
-    
+    print("Addresses: ", hex(kspace_out.ctypes.data), hex(image_out.ctypes.data))
+ 
     nufft_op = NUFFTOp(
         kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32),
         image_size=img_size,
         num_coils=n_coils,
     )
-    out_ksp = nufft_op.op(in_image=image, out_kspace=kspace_out)
+    out_ksp = nufft_op.op(image, kspace_out)
     out_im = nufft_op.adj_op(in_kspace=kspace, out_image=image_out)
     out_ksp
\ No newline at end of file

From 8e3bb20d2ad5e1db6e53ea470d5dc91daa6d0f15 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 21 Nov 2023 16:11:08 +0100
Subject: [PATCH 63/85] Added to stderr

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 8a8992b6..aafcb60e 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -59,7 +59,7 @@ void warn_pinned_memory(py::array_t<std::complex<DType>> array, const char * nam
         printf("%s => of attr.cudaMemoryType = %d\n", name, attr.type);
     bool is_pinned_memory = attr.type ==  cudaMemoryTypeHost;
     if(!is_pinned_memory)
-        py::print("WARNING:: The data", name , "is NOT pinned! This will be slow, consider pinning\n");
+        std::cerr<<"WARNING:: The data"<<name<<"is NOT pinned! This will be slow, consider pinning\n";
 }
 
 void allocate_pinned_memory(gpuNUFFT::Array<DType2> *lin_array, unsigned long int size)

From 6b2ef10037ab07d91adf80c3fced79780ce43096 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Tue, 28 Nov 2023 16:26:35 +0100
Subject: [PATCH 64/85] Fixes for smaps

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index aafcb60e..3c98ffd3 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -168,6 +168,8 @@ class GpuNUFFTPythonOperator
         else
         {
             CAST_POINTER_VARNAME(sense_maps, sensArray);
+            sensArray.dim = imgDims;
+            sensArray.dim.channels = n_coils;
             has_sense_data = true;
         }
         factory.setBalanceWorkload(balance_workload);

From e31085f7929703b5150e780266187e8aab994cfe Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Wed, 29 Nov 2023 16:26:41 +0100
Subject: [PATCH 65/85] Update gpuNUFFT_operator_python_factory.cpp

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 3c98ffd3..d5f1e214 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -262,10 +262,7 @@ class GpuNUFFTPythonOperator
 
     void set_smaps(py::array_t<std::complex<DType>> sense_maps)
     {
-        py::buffer_info myData = sense_maps.request();
-        std::complex<DType> *t_data = (std::complex<DType> *) myData.ptr;
-        DType2 *my_data = reinterpret_cast<DType2(&)[0]>(*t_data);
-        memcpy(sensArray.data, my_data, myData.size*sizeof(DType2));
+        CAST_POINTER_VARNAME(sense_maps, sensArray);
         has_sense_data = true;
         gpuNUFFTOp->setSens(sensArray);
     }

From 4517fcb5d1b70785656613627558a3f3baa19fc6 Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Fri, 5 Jan 2024 09:14:28 +0100
Subject: [PATCH 66/85] Update setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index bd0f53e7..2f80798c 100644
--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.5.0",
+    version="0.6.0",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[

From c0969138b29decec2bd875e5536cba02ce94df6f Mon Sep 17 00:00:00 2001
From: GILIYAR RADHAKRISHNA Chaithya <cg260486@is247382.intra.cea.fr>
Date: Fri, 5 Jan 2024 17:29:52 +0100
Subject: [PATCH 67/85] version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 2f80798c..757a6775 100644
--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.6.0",
+    version="0.6.1",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[

From a8e2b259f35f6be6c19eb77e46a9ca2b63873ace Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Wed, 10 Jan 2024 16:52:30 +0100
Subject: [PATCH 68/85] Update setup.py

---
 setup.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index 757a6775..700d0635 100644
--- a/setup.py
+++ b/setup.py
@@ -6,10 +6,7 @@
 import platform
 from pprint import pprint
 import subprocess
-try:
-    from pip._internal.main import main as pip_main
-except ImportError:
-    from pip._internal import main as pip_main
+
 
 release_info = {}
 
@@ -31,8 +28,9 @@ def _preinstall(package_list, options=[]):
 
         if not isinstance(package_list, list) or not isinstance(options, list):
             raise TypeError('preinstall inputs must be of type list.')
-
-        pip_main(['install'] + options + package_list)
+        subprocess.check_call(
+            [sys.executable, '-m', 'pip', 'install', options + package_list]
+        )
 
 
     def _set_pybind_path(self):
@@ -107,7 +105,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.6.1",
+    version="0.6.2",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     package_dir={"": "CUDA/bin"},
     ext_modules=[

From 611790a6643b1cf1fdcfc05f23353e0e1ff5d39c Mon Sep 17 00:00:00 2001
From: GILIYAR RADHAKRISHNA Chaithya <cg260486@is247382.intra.cea.fr>
Date: Wed, 10 Jan 2024 17:21:19 +0100
Subject: [PATCH 69/85] Fix setup

---
 setup.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index 700d0635..3af7ebc9 100644
--- a/setup.py
+++ b/setup.py
@@ -22,14 +22,12 @@ class CMakeBuild(build_ext):
     """
 
     @staticmethod
-    def _preinstall(package_list, options=[]):
+    def _preinstall(package):
         """ Pre-install PyPi packages before running cmake.
         """
 
-        if not isinstance(package_list, list) or not isinstance(options, list):
-            raise TypeError('preinstall inputs must be of type list.')
         subprocess.check_call(
-            [sys.executable, '-m', 'pip', 'install', options + package_list]
+            [sys.executable, '-m', 'pip', 'install', package]
         )
 
 
@@ -42,10 +40,10 @@ def run(self):
         """ Redifine the run method.
         """
         # Set preinstall requirements
-        preinstall_list = ["pybind11"]
+        preinstall = "pybind11"
 
         # Preinstall packages
-        self._preinstall(preinstall_list)
+        self._preinstall(preinstall)
 
         # Set Pybind11 path
         self._set_pybind_path()

From f38ee69cfba661a231cdf9fded167fd2a5e6c57a Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 12 Feb 2024 13:30:59 +0100
Subject: [PATCH 70/85] Working with fixed python Lib

---
 CUDA/src/gpu/python/CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt
index 494255c2..200b884b 100644
--- a/CUDA/src/gpu/python/CMakeLists.txt
+++ b/CUDA/src/gpu/python/CMakeLists.txt
@@ -1,6 +1,5 @@
 cmake_minimum_required(VERSION 3.15)
-find_package(PythonInterp 3.5 REQUIRED)
-find_package(PythonLibs 3.5 REQUIRED)
+find_package(Python 3.8 REQUIRED)
 MESSAGE(STATUS "Building Python interface")
 
 include_directories(

From 06e10b46c992a41cd881043ac9d8c2de5e32446b Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Mon, 12 Feb 2024 15:42:03 +0100
Subject: [PATCH 71/85] FIXED Dens

---
 CUDA/inc/gpuNUFFT_kernels.hpp                 |  2 ++
 CUDA/src/gpu/python/CMakeLists.txt            | 14 ++++----
 .../gpuNUFFT_operator_python_factory.cpp      | 33 ++++++++++++++-----
 CUDA/src/gpu/std_gpuNUFFT_kernels.cu          | 19 +++++++++++
 MANIFEST.in                                   |  5 +++
 setup.py                                      |  3 +-
 6 files changed, 60 insertions(+), 16 deletions(-)
 create mode 100644 MANIFEST.in

diff --git a/CUDA/inc/gpuNUFFT_kernels.hpp b/CUDA/inc/gpuNUFFT_kernels.hpp
index 9966becc..cd4861dd 100644
--- a/CUDA/inc/gpuNUFFT_kernels.hpp
+++ b/CUDA/inc/gpuNUFFT_kernels.hpp
@@ -415,4 +415,6 @@ void performPadding(DType2 *imdata_d, CufftType *gdata_d,
   */
 void precomputeDeapodization(DType *deapo_d, gpuNUFFT::GpuNUFFTInfo *gi_host);
 
+void performUpdateDensityComp(DType2* density_data, DType2* estimation_data, long int n_samples);
+ 
 #endif
diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt
index 200b884b..722fab0c 100644
--- a/CUDA/src/gpu/python/CMakeLists.txt
+++ b/CUDA/src/gpu/python/CMakeLists.txt
@@ -1,23 +1,25 @@
 cmake_minimum_required(VERSION 3.15)
-find_package(Python 3.8 REQUIRED)
+find_package(Python3 3.8 REQUIRED COMPONENTS Interpreter Development)
+
 MESSAGE(STATUS "Building Python interface")
+MESSAGE("Pybind11 include dir ${PYBIND11_INCLUDE_DIR}")
+MESSAGE("Python include dir ${Python3_INCLUDE_DIRS}")
+MESSAGE("Found ${Python3_LIBRARIES}")
 
 include_directories(
     ${GPUNUFFT_INC_DIR}
     ${PYBIND11_INCLUDE_DIR}
-    ${PYTHON_INCLUDE_DIR}
+    ${Python3_INCLUDE_DIRS}
     )
 cuda_include_directories(${GPUNUFFT_INC_DIR})
 cuda_add_library(gpuNUFFT  ${GPU_CU_SOURCES}  ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} gpuNUFFT_operator_python_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../atomic/atomic_gpuNUFFT.cu SHARED)
 set_target_properties(gpuNUFFT PROPERTIES PREFIX "")
 
+
 if(WIN32)
-    MESSAGE("Pybind11 include dir ${PYBIND11_INCLUDE_DIR}")
-    MESSAGE("Python include dir ${PYTHON_INCLUDE_DIR}")
-    MESSAGE("Found ${PYTHON_LIBRARIES}")
     set_target_properties(gpuNUFFT PROPERTIES SUFFIX ".pyd")
 
-    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${PYTHON_LIBRARIES})
+    TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${Python3_LIBRARIES})
 elseif(UNIX)
     TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES})
 endif(WIN32)
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 2dbf0775..330c735e 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -22,6 +22,8 @@ Carole Lazarus <carole.m.lazarus@gmail.com>
 #include <vector>     // std::vector
 #include <string>
 #include <cuda.h>
+#include <cstdint>
+
 #define CAST_POINTER_VARNAME(x, y)   cast_pointer(x, y, #x)
 
 namespace py = pybind11;
@@ -63,9 +65,10 @@ void warn_pinned_memory(py::array_t<std::complex<DType>> array, const char * nam
         std::cerr<<"WARNING:: The data"<<name<<"is NOT pinned! This will be slow, consider pinning\n";
 }
 
-void allocate_pinned_memory(gpuNUFFT::Array<DType2> *lin_array, unsigned long int size)
+template <typename TType>
+void allocate_pinned_memory(gpuNUFFT::Array<TType> *lin_array, unsigned long int size)
 {
-  DType2 *new_data;
+  TType *new_data;
   cudaMallocHost((void **)&new_data, size);
   lin_array->data = new_data;
 }
@@ -206,6 +209,18 @@ class GpuNUFFTPythonOperator
             capsule
         );
     }
+
+    void op_direct(intptr_t in_image, intptr_t out_kspace, bool interpolate_data)
+    {
+        image.data = reinterpret_cast<DType2(&)[0]>(in_image);
+        kspace_data.data = reinterpret_cast<DType2(&)[0]>(out_kspace);
+        if(interpolate_data)
+            gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
+        else
+            gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data);
+        cudaDeviceSynchronize();
+    }
+    
     py::array_t<std::complex<DType>> adj_op(py::array_t<std::complex<DType>> in_kspace, py::array_t<std::complex<DType>> out_image, bool grid_data)
     {
         CAST_POINTER_VARNAME(in_kspace, kspace_data);
@@ -275,7 +290,7 @@ class GpuNUFFTPythonOperator
         densArray.dim.length = n_samples;
 
         // TODO: Allocate directly on device and set with kernel.
-        for (int cnt = 0; cnt < n_samples; cnt++)
+        for (long int cnt = 0; cnt < n_samples; cnt++)
         {
           densArray.data[cnt].x = 1.0;
           densArray.data[cnt].y = 0.0;
@@ -306,8 +321,7 @@ class GpuNUFFTPythonOperator
                                          gpuNUFFT::DENSITY_ESTIMATION);
           gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, densEstimation_gpu,
                                              gpuNUFFT::DENSITY_ESTIMATION);
-          performUpdateDensityComp(densArray_gpu.data, densEstimation_gpu.data,
-                                   n_samples);
+          performUpdateDensityComp(densArray_gpu.data, densEstimation_gpu.data, n_samples);
           if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
                 printf("error at adj thread synchronization d: %s\n",
                        cudaGetErrorString(cudaGetLastError()));
@@ -333,6 +347,9 @@ class GpuNUFFTPythonOperator
                                   capsule);
     }
 
+    
+
+
     float get_spectral_radius(int max_iter = 20,float tolerance = 1e-6)
     {
         int im_size = image.count();
@@ -400,8 +417,8 @@ PYBIND11_MODULE(gpuNUFFT, m) {
         .def("op", &GpuNUFFTPythonOperator::op, py::arg("in_image"), py::arg("out_kspace"), py::arg("interpolate_data") = false)
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::arg("in_kspace"), py::arg("out_image"),  py::arg("grid_data") = false)
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
-        .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py:arg("max_iter") = 10)
-        .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps);
-        .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py:arg("max_iter") = 20, py:arg("tolerance") = 1e-6);
+        .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py::arg("max_iter") = 10)
+        .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps)
+        .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6);
 }
 #endif  // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED
diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
index b0fe0e8b..a48d5532 100644
--- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
+++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
@@ -33,6 +33,25 @@ void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count)
   }
 }
 
+__global__ void updateDensityCompKernel(DType2* density_data, DType2* estimation_data, long int N)
+{
+  long int t = threadIdx.x + blockIdx.x * blockDim.x;
+  while (t < N)
+  {
+    DType2 data_p = density_data[t];
+    DType2 esti_p = estimation_data[t];
+    data_p.x *= rsqrtf(esti_p.x * esti_p.x + esti_p.y * esti_p.y);
+    density_data[t] = data_p;
+    t = t + blockDim.x*gridDim.x;
+  } 
+}
+
+void performUpdateDensityComp(DType2* density_data, DType2* estimation_data, long int n_samples)
+{ // Launches updateDensityCompKernel over the n_samples entries of density_data / estimation_data.
+  dim3 block_dim(THREAD_BLOCK_SIZE); // 1-D block: the kernel indexes by threadIdx.x only, so extra z-threads would repeat (and race on) the same density_data[t] update
+  dim3 grid_dim(getOptimalGridDim(n_samples,THREAD_BLOCK_SIZE));
+  updateDensityCompKernel<<<grid_dim,block_dim>>>(density_data, estimation_data, n_samples);
+}
 
 void initTexture(const char* symbol, cudaArray** devicePtr, gpuNUFFT::Array<DType> hostTexture)
 {
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..af69ea9f
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,5 @@
+recursive-include CUDA/src *
+recursive-include CUDA/doc *
+recursive-include CUDA/inc *
+include CUDA/CMakeLists.txt
+
diff --git a/setup.py b/setup.py
index 3af7ebc9..2d513628 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 import os
 import sys
-from setuptools import setup, Extension, find_packages
+from setuptools import setup, Extension, find_namespace_packages
 from setuptools.command.build_ext import build_ext
 from importlib import import_module
 import platform
@@ -105,7 +105,6 @@ def build_extension(self, ext):
     name="gpuNUFFT",
     version="0.6.2",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
-    package_dir={"": "CUDA/bin"},
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),
     ],

From 5351ac709321a44917c134d3383b385cea942e82 Mon Sep 17 00:00:00 2001
From: GILIYAR RADHAKRISHNA Chaithya <cg260486@is247382.intra.cea.fr>
Date: Tue, 13 Feb 2024 11:19:44 +0100
Subject: [PATCH 72/85] Working build: GPU and CPU both present

---
 .../gpuNUFFT_operator_python_factory.cpp      | 46 +++++++++++++------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 330c735e..a5f0de7f 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -110,7 +110,9 @@ class GpuNUFFTPythonOperator
     bool has_sense_data;
     gpuNUFFT::Dimensions imgDims;
     // sensitivity maps
-    gpuNUFFT::Array<DType2> sensArray, kspace_data, image;
+    gpuNUFFT::GpuArray<DType2> image_gpu;
+    gpuNUFFT::GpuArray<CufftType> kspace_data_gpu;
+    gpuNUFFT::Array<CufftType> sensArray, kspace_data, image;
     void allocate_memory_kspace()
     {
         allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2));
@@ -126,7 +128,7 @@ class GpuNUFFTPythonOperator
     
     public:
     GpuNUFFTPythonOperator(py::array_t<DType> kspace_loc, py::array_t<int> image_size, int num_coils,
-    py::array_t<std::complex<DType>> sense_maps,  std::optional<py::array_t<DType>> density_comp, int kernel_width=3,
+    py::array_t<std::complex<DType>> sense_maps,  py::array_t<DType> density_comp, int kernel_width=3,
     int sector_width=8, float osr=2, bool balance_workload=1) 
     {
         // k-space coordinates
@@ -138,12 +140,12 @@ class GpuNUFFTPythonOperator
 
         // density compensation weights
         gpuNUFFT::Array<DType> density_compArray;
-        if(density_comp.has_value())
-        {
-            density_compArray = readNumpyArray(density_comp.value());
+        //if(density_comp.has_value())
+        //{
+            density_compArray = readNumpyArray(density_comp);
             density_compArray.dim.length = trajectory_length;
             // No need else as the init is by default with 0 length and density comp is not applied
-        }
+        //}
 
         // image size
         py::buffer_info img_dim = image_size.request();
@@ -161,7 +163,10 @@ class GpuNUFFTPythonOperator
         kspace_data.dim.length = trajectory_length;
         kspace_data.dim.channels = num_coils;
         image.dim = imgDims;
-
+        kspace_data_gpu.dim.length = trajectory_length;
+        kspace_data_gpu.dim.channels = num_coils;
+        image_gpu.dim = imgDims;
+        
         // sensitivity maps
         py::buffer_info sense_maps_buffer = sense_maps.request();
         if (sense_maps_buffer.shape.size()==0)
@@ -210,14 +215,25 @@ class GpuNUFFTPythonOperator
         );
     }
 
-    void op_direct(intptr_t in_image, intptr_t out_kspace, bool interpolate_data)
+    void op_direct(uintptr_t in_image, uintptr_t out_kspace, bool interpolate_data)
     {
-        image.data = reinterpret_cast<DType2(&)[0]>(in_image);
-        kspace_data.data = reinterpret_cast<DType2(&)[0]>(out_kspace);
+        image_gpu.data = (DType2*) in_image;
+        kspace_data_gpu.data = (CufftType*) out_kspace;
         if(interpolate_data)
-            gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION);
+            gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu, gpuNUFFT::DENSITY_ESTIMATION);
         else
-            gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data);
+            gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu);
+        cudaDeviceSynchronize();
+    }
+
+    void adj_op_direct(uintptr_t in_kspace, uintptr_t out_image, bool grid_data)
+    {
+        kspace_data_gpu.data = (CufftType*) in_kspace;
+        image_gpu.data = (DType2*) out_image;
+        if(grid_data)
+            gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION);
+        else
+            gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image);
         cudaDeviceSynchronize();
     }
     
@@ -412,13 +428,15 @@ class GpuNUFFTPythonOperator
 
 PYBIND11_MODULE(gpuNUFFT, m) {
     py::class_<GpuNUFFTPythonOperator>(m, "NUFFTOp")
-        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, std::optional<py::array_t<DType>>, int, int, float, bool>(),
+        .def(py::init<py::array_t<DType>, py::array_t<int>, int, py::array_t<std::complex<DType>>, py::array_t<DType>, int, int, float, bool>(),
             py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true)
         .def("op", &GpuNUFFTPythonOperator::op, py::arg("in_image"), py::arg("out_kspace"), py::arg("interpolate_data") = false)
+        .def("op_direct", &GpuNUFFTPythonOperator::op_direct, py::arg("in_image"), py::arg("out_kspace"), py::arg("interpolate_data") = false)
+        .def("adj_op_direct", &GpuNUFFTPythonOperator::adj_op_direct, py::arg("in_kspace"), py::arg("out_image"), py::arg("grid_data") = false)
         .def("adj_op",  &GpuNUFFTPythonOperator::adj_op, py::arg("in_kspace"), py::arg("out_image"),  py::arg("grid_data") = false)
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
         .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py::arg("max_iter") = 10)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps)
         .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6);
 }
-#endif  // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED
+#endif  // GPUNUFFT_OPERATOR_PYTHONFACTORY_H_INCLUDED

From 9f31c7811c950440ca5ee9a2764e7012d653ca05 Mon Sep 17 00:00:00 2001
From: GILIYAR RADHAKRISHNA Chaithya <cg260486@is247382.intra.cea.fr>
Date: Thu, 15 Feb 2024 10:02:06 +0100
Subject: [PATCH 73/85] Final fixes

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index a5f0de7f..f29e1ea3 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -231,9 +231,9 @@ class GpuNUFFTPythonOperator
         kspace_data_gpu.data = (CufftType*) in_kspace;
         image_gpu.data = (DType2*) out_image;
         if(grid_data)
-            gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION);
+            gpuNUFFTOp->performGpuNUFFTAdj(kspace_data_gpu, image_gpu, gpuNUFFT::DENSITY_ESTIMATION);
         else
-            gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image);
+            gpuNUFFTOp->performGpuNUFFTAdj(kspace_data_gpu, image_gpu);
         cudaDeviceSynchronize();
     }
     

From 0bb69c1441e4a35c74aee1545958000482933aa6 Mon Sep 17 00:00:00 2001
From: GILIYAR RADHAKRISHNA Chaithya <cg260486@is247382.intra.cea.fr>
Date: Thu, 15 Feb 2024 11:05:48 +0100
Subject: [PATCH 74/85] Bump gpuNUFFT version to 0.7.0

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 2d513628..a075d919 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.6.2",
+    version="0.7.0",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),

From 06dd0b69e8d5a267f9a5fbb6a1a2633eae1a0ba3 Mon Sep 17 00:00:00 2001
From: GILIYAR RADHAKRISHNA Chaithya <cg260486@is247382.intra.cea.fr>
Date: Thu, 15 Feb 2024 11:39:59 +0100
Subject: [PATCH 75/85] Fix cuRAND

---
 CUDA/src/gpu/atomic/CMakeLists.txt | 5 +++--
 CUDA/src/gpu/python/CMakeLists.txt | 2 ++
 setup.py                           | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/CUDA/src/gpu/atomic/CMakeLists.txt b/CUDA/src/gpu/atomic/CMakeLists.txt
index dffcdf15..061b9c12 100644
--- a/CUDA/src/gpu/atomic/CMakeLists.txt
+++ b/CUDA/src/gpu/atomic/CMakeLists.txt
@@ -11,10 +11,11 @@ set(GPU_CU_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/atomic_gpuNUFFT.cu
 					#${CMAKE_CURRENT_SOURCE_DIR}/../std_gpuNUFFT_kernels.cu
 									 )
 if(WIN32)
-    CUDA_ADD_LIBRARY(${GRID_LIB_ATM_NAME} ${GPU_CU_SOURCES}  ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE})
+    CUDA_ADD_LIBRARY(${GRID_LIB_ATM_NAME} ${GPU_CU_SOURCES}  ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} )
 else(WIN32)
-    CUDA_ADD_LIBRARY(${GRID_LIB_ATM_NAME} ${GPU_CU_SOURCES}  ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} SHARED)
+    CUDA_ADD_LIBRARY(${GRID_LIB_ATM_NAME} ${GPU_CU_SOURCES}  ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE}  SHARED)
 endif(WIN32)
 
 CUDA_ADD_CUFFT_TO_TARGET(${GRID_LIB_ATM_NAME})
 CUDA_ADD_CUBLAS_TO_TARGET(${GRID_LIB_ATM_NAME})
+target_link_libraries(${GRID_LIB_ATM_NAME} ${CUDA_curand_LIBRARY})
\ No newline at end of file
diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt
index 722fab0c..fac39c03 100644
--- a/CUDA/src/gpu/python/CMakeLists.txt
+++ b/CUDA/src/gpu/python/CMakeLists.txt
@@ -25,3 +25,5 @@ elseif(UNIX)
 endif(WIN32)
 CUDA_ADD_CUFFT_TO_TARGET(gpuNUFFT)
 CUDA_ADD_CUBLAS_TO_TARGET(gpuNUFFT)
+target_link_libraries(gpuNUFFT ${CUDA_curand_LIBRARY})
+
diff --git a/setup.py b/setup.py
index a075d919..f3bb04da 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.7.0",
+    version="0.7.1",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),

From 611f56e09bfd68df08ee65ce41fb22e9746f32c4 Mon Sep 17 00:00:00 2001
From: GILIYAR RADHAKRISHNA Chaithya <cg260486@is247382.intra.cea.fr>
Date: Mon, 19 Feb 2024 17:05:17 +0100
Subject: [PATCH 76/85] Fix issues

---
 CUDA/inc/cuda_utils.hpp              | 17 ++++++-
 CUDA/src/gpu/std_gpuNUFFT_kernels.cu |  1 +
 CUDA/src/gpuNUFFT_operator.cpp       | 70 +++++++++++++++++++---------
 3 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/CUDA/inc/cuda_utils.hpp b/CUDA/inc/cuda_utils.hpp
index aa0f06cc..8893383c 100644
--- a/CUDA/inc/cuda_utils.hpp
+++ b/CUDA/inc/cuda_utils.hpp
@@ -110,13 +110,28 @@ inline void allocateAndSetMem(TypeName **device_ptr, IndType num_elements,
  */
 template <typename TypeName>
 inline void copyDeviceToDevice(TypeName *device_ptr_src,
-                               TypeName *device_ptr_dest, IndType num_elements)
+                               TypeName *device_ptr_dest, IndType num_elements
+                               )
 {
   HANDLE_ERROR(cudaMemcpy(device_ptr_dest, device_ptr_src,
                           num_elements * sizeof(TypeName),
                           cudaMemcpyDeviceToDevice));
 }
 
+/** \brief Asynchronous CUDA memcpy (cudaMemcpyAsync) from device ptr to device ptr
+ *
+ * @param device_ptr_src   source device pointer
+ * @param device_ptr_dest  destination device pointer
+ * @param num_elements     amount of elements of size TypeName
+ * @param stream           CUDA stream passed to cudaMemcpyAsync (default 0) */
+template <typename TypeName>
+inline void copyDeviceToDeviceAsync(TypeName *device_ptr_src,
+                               TypeName *device_ptr_dest, IndType num_elements, cudaStream_t stream=0)
+{
+  HANDLE_ERROR(cudaMemcpyAsync(device_ptr_dest, device_ptr_src,
+                          num_elements * sizeof(TypeName),
+                          cudaMemcpyDeviceToDevice, stream));
+}
 /** \brief Copy CUDA memory from device to host
  *
  * @param device_ptr    device pointer
diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
index a48d5532..bff9118c 100644
--- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
+++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
@@ -41,6 +41,7 @@ __global__ void updateDensityCompKernel(DType2* density_data, DType2* estimation
     DType2 data_p = density_data[t];
     DType2 esti_p = estimation_data[t];
     data_p.x *= rsqrtf(esti_p.x * esti_p.x + esti_p.y * esti_p.y);
+    data_p.y = 0;
     density_data[t] = data_p;
     t = t + blockDim.x*gridDim.x;
   } 
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index b1612d5f..35a4f07a 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -457,7 +457,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(
     if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
       fprintf(stderr, "error at adj  thread synchronization 2: %s\n",
               cudaGetErrorString(cudaGetLastError()));
-    if (gpuNUFFTOut == CONVOLUTION)
+    if (gpuNUFFTOut == CONVOLUTION || gpuNUFFTOut == DENSITY_ESTIMATION)
     {
       if (DEBUG)
         printf("stopping output after CONVOLUTION step\n");
@@ -939,10 +939,11 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     printf("Memory allocation: %.2f ms\n", stopTiming());
 
   int err;
-
+  cudaStream_t new_stream, old_stream;
   // iterate over coils and compute result
   for (int coil_it = 0; coil_it < n_coils; coil_it += n_coils_cc)
   {
+    cudaStreamCreate(&new_stream);
     unsigned long int data_coil_offset = (long int)coil_it * data_count;
     unsigned long int im_coil_offset = coil_it * (long int)imdata_count;
 
@@ -954,32 +955,52 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       // perform automatically "repeating" of input image in case
       // of existing sensitivity data
       for (int cnt = 0; cnt < n_coils_cc; cnt++)
-        copyDeviceToDevice<DType2>(imgData_gpu.data,
-                                   imdata_d + cnt * imdata_count, imdata_count);
+        copyDeviceToDeviceAsync<DType2>(imgData_gpu.data,
+                                   imdata_d + cnt * imdata_count, imdata_count, new_stream);
     else
-      copyDeviceToDevice<DType2>(imgData_gpu.data + im_coil_offset, imdata_d,
-                                 imdata_count * n_coils_cc);
+      copyDeviceToDeviceAsync<DType2>(imgData_gpu.data + im_coil_offset, imdata_d,
+                                 imdata_count * n_coils_cc, new_stream);
 
     // reset temp arrays
-    cudaMemset(gdata_d, 0,
-               sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc);
-    cudaMemset(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc);
+    cudaMemsetAsync(gdata_d, 0,
+               sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream);
+    cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream);
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream)!= cudaSuccess))
       printf("error at thread synchronization 1: %s\n",
              cudaGetErrorString(cudaGetLastError()));
 
     if (this->applySensData())
     {
-      copyToDevice(this->sens.data + im_coil_offset, sens_d,
-                   imdata_count * n_coils_cc);
+      copyToDeviceAsync(this->sens.data + im_coil_offset, sens_d,
+                   imdata_count * n_coils_cc, new_stream);
       performSensMul(imdata_d, sens_d, gi_host, false);
     }
 
     // apodization Correction
     performForwardDeapodization(imdata_d, deapo_d, gi_host);
-	  
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+	 if(gpuNUFFTOut == DENSITY_ESTIMATION)
+      {
+        // convolution and resampling to non-standard trajectory
+        forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d,
+                       sector_centers_d, gi_host);
+        if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
+              printf("error at thread synchronization 7: %s\n",
+                 cudaGetErrorString(cudaGetLastError()));
+
+        if (debugTiming)
+          printf("Forward Convolution: %.2f ms\n", stopTiming());
+        // write result in correct order back into output array
+        writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
+                    (int)this->kSpaceTraj.count(), n_coils_cc);
+        copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream);
+        if ((coil_it + n_coils_cc) < (n_coils))
+            continue;
+        freeTotalDeviceMemory(imdata_d, NULL);
+        this->freeDeviceMemory();
+        return;
+    } 
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 2: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     // resize by oversampling factor and zero pad
@@ -988,13 +1009,13 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     if (debugTiming)
       startTiming();
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 3: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     // shift image to get correct zero frequency position
     performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host);
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 4: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     // eventually free imdata_d
@@ -1012,12 +1033,12 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       c++;
     }
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 5: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     performFFTShift(gdata_d, FORWARD, getGridDims(), gi_host);
 
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 6: %s\n",
              cudaGetErrorString(cudaGetLastError()));
 
@@ -1030,7 +1051,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     // convolution and resampling to non-standard trajectory
     forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d,
                        sector_centers_d, gi_host);
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 7: %s\n",
              cudaGetErrorString(cudaGetLastError()));
 
@@ -1038,15 +1059,20 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       printf("Forward Convolution: %.2f ms\n", stopTiming());
 
     performFFTScaling(data_d, gi_host->data_count, gi_host);
-    if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
+    if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error: at thread synchronization 8: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     
     // write result in correct order back into output array
     writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
                     (int)this->kSpaceTraj.count(), n_coils_cc);
-
-    copyDeviceToDevice(data_sorted_d, data_d, data_count * n_coils_cc);
+    if(coil_it > 1)
+    {
+      cudaStreamSynchronize(old_stream);
+      cudaStreamDestroy(old_stream);
+    }
+    copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream);
+    old_stream = new_stream;
   }  // iterate over coils
 
   freeTotalDeviceMemory(imdata_d, NULL);

From e86dbc6381824a19e5608aab2958180cba2c4aec Mon Sep 17 00:00:00 2001
From: GILIYAR RADHAKRISHNA Chaithya <cg260486@is247382.intra.cea.fr>
Date: Mon, 19 Feb 2024 17:05:37 +0100
Subject: [PATCH 77/85] Version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f3bb04da..d2de2b98 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.7.1",
+    version="0.7.2",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),

From 7d097e3d9166d92d94c4c9a6f410ee6326aecaf3 Mon Sep 17 00:00:00 2001
From: GILIYAR RADHAKRISHNA Chaithya <cg260486@is247382.intra.cea.fr>
Date: Wed, 21 Feb 2024 10:50:04 +0100
Subject: [PATCH 78/85] Fixes added

---
 .../python/gpuNUFFT_operator_python_factory.cpp   | 15 +++++++--------
 CUDA/src/gpuNUFFT_operator.cpp                    | 11 ++++++-----
 setup.py                                          |  2 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index f29e1ea3..1e023a80 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -107,6 +107,7 @@ class GpuNUFFTPythonOperator
     gpuNUFFT::GpuNUFFTOperatorFactory factory;
     gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp;
     int trajectory_length, n_coils, dimension;
+    float osr;
     bool has_sense_data;
     gpuNUFFT::Dimensions imgDims;
     // sensitivity maps
@@ -129,7 +130,7 @@ class GpuNUFFTPythonOperator
     public:
     GpuNUFFTPythonOperator(py::array_t<DType> kspace_loc, py::array_t<int> image_size, int num_coils,
     py::array_t<std::complex<DType>> sense_maps,  py::array_t<DType> density_comp, int kernel_width=3,
-    int sector_width=8, float osr=2, bool balance_workload=1) 
+    int sector_width=8, float osr=2, bool balance_workload=1) : osr(osr)
     {
         // k-space coordinates
         py::buffer_info sample_loc = kspace_loc.request();
@@ -140,12 +141,11 @@ class GpuNUFFTPythonOperator
 
         // density compensation weights
         gpuNUFFT::Array<DType> density_compArray;
-        //if(density_comp.has_value())
-        //{
+        if(density_comp == Py_None)
+        {
             density_compArray = readNumpyArray(density_comp);
             density_compArray.dim.length = trajectory_length;
-            // No need else as the init is by default with 0 length and density comp is not applied
-        //}
+        }
 
         // image size
         py::buffer_info img_dim = image_size.request();
@@ -165,7 +165,6 @@ class GpuNUFFTPythonOperator
         image.dim = imgDims;
         kspace_data_gpu.dim.length = trajectory_length;
         kspace_data_gpu.dim.channels = num_coils;
-        image_gpu.dim = imgDims;
         
         // sensitivity maps
         py::buffer_info sense_maps_buffer = sense_maps.request();
@@ -189,6 +188,7 @@ class GpuNUFFTPythonOperator
             image.dim.channels = n_coils;
         else
             image.dim.channels = 1;
+        image_gpu.dim = imgDims;
         cudaDeviceSynchronize();
     }
 
@@ -305,7 +305,6 @@ class GpuNUFFTPythonOperator
         allocate_pinned_memory(&densArray, n_samples * sizeof(CufftType));
         densArray.dim.length = n_samples;
 
-        // TODO: Allocate directly on device and set with kernel.
         for (long int cnt = 0; cnt < n_samples; cnt++)
         {
           densArray.data[cnt].x = 1.0;
@@ -324,7 +323,7 @@ class GpuNUFFTPythonOperator
 
         gpuNUFFT::GpuArray<CufftType> image_gpu;
         image_gpu.dim = imgDims;
-        allocateDeviceMem(&image_gpu.data, imgDims.count());
+        allocateDeviceMem(&image_gpu.data, image_gpu.dim.count());
 
         if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
           printf("error at adj thread synchronization a: %s\n",
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 35a4f07a..50753f76 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -394,10 +394,11 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(
 
   // more than 2 coil sets are not sensible to reconstruct in one
   // adjoint kernel call , since the used shared memory is limited
+  // FIXME: We now limit to 1 as 2 has errors right now
   int n_coils_cc = this->is2DProcessing()
                        ? std::min(this->computePossibleConcurrentCoilCount(
                                       n_coils, kspaceData_gpu.dim),
-                                  2)
+                                  1)
                        : 1;
   if (DEBUG)
     printf("Computing %d coils concurrently.\n", n_coils_cc);
@@ -977,8 +978,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       performSensMul(imdata_d, sens_d, gi_host, false);
     }
 
-    // apodization Correction
-    performForwardDeapodization(imdata_d, deapo_d, gi_host);
 	 if(gpuNUFFTOut == DENSITY_ESTIMATION)
       {
         // convolution and resampling to non-standard trajectory
@@ -1000,6 +999,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
         this->freeDeviceMemory();
         return;
     } 
+    // apodization Correction
+    performForwardDeapodization(imdata_d, deapo_d, gi_host);
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 2: %s\n",
              cudaGetErrorString(cudaGetLastError()));
@@ -1198,8 +1199,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       performSensMul(imdata_d, sens_d, gi_host, false);
     }
 
-    // apodization Correction
-    performForwardDeapodization(imdata_d, deapo_d, gi_host);
 	  if(gpuNUFFTOut == DENSITY_ESTIMATION)
 	  {
 	      forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d,
@@ -1214,6 +1213,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
         this->freeDeviceMemory();
         return;
     }
+    // apodization Correction
+    performForwardDeapodization(imdata_d, deapo_d, gi_host);
     if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
       printf("error at thread synchronization 2: %s\n",
              cudaGetErrorString(cudaGetLastError()));
diff --git a/setup.py b/setup.py
index d2de2b98..0709e4f3 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.7.2",
+    version="0.7.4",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),

From 0c34013ad599fafc857dbda863a4e90141bb6a18 Mon Sep 17 00:00:00 2001
From: chaithyagr <chaithyagr@gitlab.com>
Date: Wed, 21 Feb 2024 15:47:31 +0100
Subject: [PATCH 79/85] Update with final fixes, v0.7.5

---
 CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 +-
 setup.py                                                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 1e023a80..a91c3f60 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -141,7 +141,7 @@ class GpuNUFFTPythonOperator
 
         // density compensation weights
         gpuNUFFT::Array<DType> density_compArray;
-        if(density_comp == Py_None)
+        if(density_comp != Py_None)
         {
             density_compArray = readNumpyArray(density_comp);
             density_compArray.dim.length = trajectory_length;
diff --git a/setup.py b/setup.py
index 0709e4f3..29c340c2 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.7.4",
+    version="0.7.5",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),

From 01b9cab8ca2c3a12019dea0da5acf0677cd9d72c Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Mon, 3 Jun 2024 09:22:27 +0200
Subject: [PATCH 80/85] Autograd support added

---
 CUDA/inc/gpuNUFFT_operator.hpp                   | 16 ++++++++++++++--
 .../python/gpuNUFFT_operator_python_factory.cpp  |  7 +++++++
 CUDA/src/gpuNUFFT_operator.cpp                   |  8 ++++----
 setup.py                                         |  2 +-
 4 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index 4eb94478..2bbc6cd0 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -53,13 +53,13 @@ class GpuNUFFTOperator
   GpuNUFFTOperator(IndType kernelWidth, IndType sectorWidth, DType osf,
                    Dimensions imgDims, bool loadKernel = true,
                    OperatorType operatorType = DEFAULT,
-                   bool matlabSharedMem = false)
+                   bool matlabSharedMem = false, bool grad_mode = false)
     : operatorType(operatorType), osf(osf), kernelWidth(kernelWidth),
       sectorWidth(sectorWidth), imgDims(imgDims), gpuMemAllocated(false),
       debugTiming(DEBUG), sens_d(NULL), crds_d(NULL), density_comp_d(NULL),
       deapo_d(NULL), gdata_d(NULL), sector_centers_d(NULL), sectors_d(NULL),
       data_indices_d(NULL), data_sorted_d(NULL), allocatedCoils(0),
-      matlabSharedMem(matlabSharedMem)
+      matlabSharedMem(matlabSharedMem), grad_mode(grad_mode)
   {
     if (loadKernel)
       initKernel();
@@ -342,6 +342,14 @@ class GpuNUFFTOperator
                                           GpuNUFFTOutput gpuNUFFTOut);
 
   void clean_memory();
+
+  void setGradMode(bool grad_mode) {
+    this->grad_mode = grad_mode;
+  }
+
+  bool getGradMode() {
+    return this->grad_mode;
+  }
   /** \brief Check if density compensation data is available. */
   bool applyDensComp()
   {
@@ -452,6 +460,10 @@ class GpuNUFFTOperator
   */
   bool matlabSharedMem;
 
+  /** \brief Flag for changing the isign, mainly used for gradients 
+  */
+  bool grad_mode;
+
   /** \brief Return Grid Width (ImageWidth * osf) */
   IndType getGridWidth()
   {
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index a91c3f60..5d0fa888 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -291,6 +291,12 @@ class GpuNUFFTPythonOperator
        gpuNUFFTOp->clean_memory();
     }
 
+    void toggle_grad_mode()
+    {
+        bool current_mode = gpuNUFFTOp->getGradMode();
+        gpuNUFFTOp->setGradMode(!current_mode);
+    }
+
     void set_smaps(py::array_t<std::complex<DType>> sense_maps)
     {
         CAST_POINTER_VARNAME(sense_maps, sensArray);
@@ -436,6 +442,7 @@ PYBIND11_MODULE(gpuNUFFT, m) {
         .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory)
         .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py::arg("max_iter") = 10)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps)
+        .def("toggle_grad_mode", &GpuNUFFTPythonOperator::toggle_grad_mode)
         .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6);
 }
 #endif  // GPUNUFFT_OPERATOR_PYTHONFACTORY_H_INCLUDED
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 50753f76..4f370e87 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -1026,7 +1026,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     {
       if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count,
                               gdata_d + c * gi_host->gridDims_count,
-                              CUFFT_FORWARD)) != CUFFT_SUCCESS)
+                              grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS)
       {
         fprintf(stderr, "cufft has failed with err %i \n", err);
         showMemoryInfo(true, stderr);
@@ -1037,7 +1037,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 5: %s\n",
              cudaGetErrorString(cudaGetLastError()));
-    performFFTShift(gdata_d, FORWARD, getGridDims(), gi_host);
+    performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host);
 
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 6: %s\n",
@@ -1240,7 +1240,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     {
       if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count,
                               gdata_d + c * gi_host->gridDims_count,
-                              CUFFT_FORWARD)) != CUFFT_SUCCESS)
+                              grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS)
       {
         fprintf(stderr, "cufft has failed with err %i \n", err);
         showMemoryInfo(true, stderr);
@@ -1251,7 +1251,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 5: %s\n",
              cudaGetErrorString(cudaGetLastError()));
-    performFFTShift(gdata_d, FORWARD, getGridDims(), gi_host);
+    performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host);
 
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 6: %s\n",
diff --git a/setup.py b/setup.py
index 29c340c2..b96fa86c 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.7.5",
+    version="0.8.0",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),

From 6b66ee000ac811ab52ba258646d875043a61a410 Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Fri, 21 Jun 2024 10:31:51 +0200
Subject: [PATCH 81/85] Added support for set_pts

---
 CUDA/inc/gpuNUFFT_operator_factory.hpp        | 11 ++++++
 .../gpuNUFFT_operator_python_factory.cpp      | 18 ++++++++-
 CUDA/src/gpuNUFFT_operator_factory.cpp        | 37 ++++++++++++-------
 setup.py                                      |  2 +-
 4 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp
index 5658803f..1c4bd992 100644
--- a/CUDA/inc/gpuNUFFT_operator_factory.hpp
+++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp
@@ -174,6 +174,15 @@ class GpuNUFFTOperatorFactory
   void setUseTextures(bool useTextures);
 
   void setBalanceWorkload(bool balanceWorkload);
+  
+  /**
+  * \brief Set k-space locations and corresponding density. This can also be used 
+  * to update them
+  * 
+  */
+  void set_pts(
+    gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp, gpuNUFFT::Array<DType> &kSpaceTraj,
+    gpuNUFFT::Array<DType> &densCompData);
 
  protected:
   template<typename T>
@@ -315,7 +324,9 @@ class GpuNUFFTOperatorFactory
   */
   gpuNUFFT::Array<DType> computeDeapodizationFunction(const IndType &kernelWidth,
     const DType &osf, gpuNUFFT::Dimensions &imgDims);
+  
 
+ 
  private:
   /** \brief Flag to indicate texture interpolation */
   bool useTextures;
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 5d0fa888..2b3664f2 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -304,6 +304,21 @@ class GpuNUFFTPythonOperator
         gpuNUFFTOp->setSens(sensArray);
     }
 
+    void set_pts(py::array_t<DType> kspace_loc, py::array_t<DType> density_comp)
+    {
+        gpuNUFFT::Array<DType> kSpaceTraj = readNumpyArray(kspace_loc);
+        kSpaceTraj.dim.length = trajectory_length;
+
+        // density compensation weights
+        gpuNUFFT::Array<DType> density_compArray;
+        if(density_comp != Py_None)
+        {
+            density_compArray = readNumpyArray(density_comp);
+            density_compArray.dim.length = trajectory_length;
+        }
+        factory.set_pts(gpuNUFFTOp, kSpaceTraj, density_compArray);
+
+    }
     py::array_t<DType> estimate_density_comp(int max_iter = 10)
     {
         IndType n_samples = kspace_data.count();
@@ -443,6 +458,7 @@ PYBIND11_MODULE(gpuNUFFT, m) {
         .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py::arg("max_iter") = 10)
         .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps)
         .def("toggle_grad_mode", &GpuNUFFTPythonOperator::toggle_grad_mode)
-        .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6);
+        .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6)
+        .def("set_pts", &GpuNUFFTPythonOperator::set_pts, py::arg("kspace_loc"), py::arg("density_comp") = py::none());
 }
 #endif  // GPUNUFFT_OPERATOR_PYTHONFACTORY_H_INCLUDED
diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp
index b6c4d734..647c8840 100644
--- a/CUDA/src/gpuNUFFT_operator_factory.cpp
+++ b/CUDA/src/gpuNUFFT_operator_factory.cpp
@@ -457,10 +457,6 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator(
   checkMemoryConsumption(kSpaceTraj.dim, sectorWidth, osf, imgDims,
                          densCompData.dim, sensData.dim);
 
-  if (kSpaceTraj.dim.channels > 1)
-    throw std::invalid_argument(
-        "Trajectory dimension must not contain a channel size greater than 1!");
-
   if (imgDims.channels > 1)
     throw std::invalid_argument(
         "Image dimensions must not contain a channel size greater than 1!");
@@ -470,6 +466,29 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator(
   gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp =
       createNewGpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims);
 
+  // Set points and density compensation
+  set_pts(gpuNUFFTOp, kSpaceTraj, densCompData);
+  
+  if (sensData.data != NULL)
+    gpuNUFFTOp->setSens(sensData);
+  
+  gpuNUFFTOp->setDeapodizationFunction(
+    this->computeDeapodizationFunction(kernelWidth, osf, imgDims));
+    
+  debug("finished creation of gpuNUFFT operator\n");
+  
+  return gpuNUFFTOp;
+}
+
+
+void gpuNUFFT::GpuNUFFTOperatorFactory::set_pts(
+    gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp, gpuNUFFT::Array<DType> &kSpaceTraj,
+    gpuNUFFT::Array<DType> &densCompData)
+{
+  if (kSpaceTraj.dim.channels > 1)
+    throw std::invalid_argument(
+        "Trajectory dimension must not contain a channel size greater than 1!");
+
   // assign according sector to k-Space position
   gpuNUFFT::Array<IndType> assignedSectors =
       assignSectors(gpuNUFFTOp, kSpaceTraj);
@@ -487,9 +506,6 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator(
   if (densCompData.data != NULL)
     densData = initDensData(gpuNUFFTOp, coordCnt);
 
-  if (sensData.data != NULL)
-    gpuNUFFTOp->setSens(sensData);
-
   if (useGpu)
   {
     sortArrays(gpuNUFFTOp, assignedSectorsAndIndicesSorted,
@@ -543,13 +559,6 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator(
   // free temporary array
   free(assignedSectors.data);
   assignedSectors.data = NULL;
-
-  gpuNUFFTOp->setDeapodizationFunction(
-    this->computeDeapodizationFunction(kernelWidth, osf, imgDims));
-    
-  debug("finished creation of gpuNUFFT operator\n");
-  
-  return gpuNUFFTOp;
 }
 
 gpuNUFFT::GpuNUFFTOperator *
diff --git a/setup.py b/setup.py
index b96fa86c..6f0dd605 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.8.0",
+    version="0.8.1",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),

From 2fde394291457966228a58f1c594840538d9a93b Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Fri, 2 Aug 2024 17:18:42 +0200
Subject: [PATCH 82/85] Synchronize and destroy last CUDA stream before freeing device memory

---
 CUDA/src/gpuNUFFT_operator.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 4f370e87..05387b74 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -1076,6 +1076,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     old_stream = new_stream;
   }  // iterate over coils
 
+  cudaStreamSynchronize(old_stream);
+  cudaStreamDestroy(old_stream);
   freeTotalDeviceMemory(imdata_d, NULL);
   this->freeDeviceMemory();
 
@@ -1291,13 +1293,14 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     old_stream = new_stream;
   }  // iterate over coils
 
+  cudaStreamSynchronize(old_stream);
+  cudaStreamDestroy(old_stream);
   freeTotalDeviceMemory(data_d, imdata_d, NULL);
   this->freeDeviceMemory();
 
   if ((cudaDeviceSynchronize() != cudaSuccess))
     fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n",
             cudaGetErrorString(cudaGetLastError()));
-  cudaStreamDestroy(old_stream);
 }
 
 gpuNUFFT::Array<CufftType>

From 3d9809c79f96f1860c63f703fc839c9b330f5006 Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Fri, 2 Aug 2024 20:56:46 +0200
Subject: [PATCH 83/85] WIP debug: force DEBUG build, stub out forward NUFFT steps, add stress loop

---
 CUDA/CMakeLists.txt                           |  4 +-
 CUDA/inc/cuda_utils.hpp                       |  2 +-
 CUDA/src/balanced_gpuNUFFT_operator.cpp       |  8 +--
 .../gpuNUFFT_operator_python_factory.cpp      |  9 ++-
 CUDA/src/gpuNUFFT_operator.cpp                | 65 ++++++++++---------
 setup.py                                      |  2 +-
 6 files changed, 49 insertions(+), 41 deletions(-)

diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt
index 71260d30..3235a868 100644
--- a/CUDA/CMakeLists.txt
+++ b/CUDA/CMakeLists.txt
@@ -70,8 +70,8 @@ endif(GEN_MEX_FILES)
 
 #Options
 #General DEBUG output 
-SET (DEBUG false)
-OPTION(WITH_DEBUG "Enable DEBUG messages" OFF)
+SET (DEBUG true)
+OPTION(WITH_DEBUG "Enable DEBUG messages" ON)
 if (WITH_DEBUG)
  SET (DEBUG true)
 endif()
diff --git a/CUDA/inc/cuda_utils.hpp b/CUDA/inc/cuda_utils.hpp
index 8893383c..ee1c76d9 100644
--- a/CUDA/inc/cuda_utils.hpp
+++ b/CUDA/inc/cuda_utils.hpp
@@ -220,7 +220,7 @@ inline void showMemoryInfo(bool force, FILE *stream)
   size_t total_mem = 0;
   cudaMemGetInfo(&free_mem, &total_mem);
   if (DEBUG || force)
-    fprintf(stream, "memory usage, free: %lu total: %lu\n", (SizeType)(free_mem),
+    printf("memory usage, free: %lu total: %lu\n", (SizeType)(free_mem),
     (SizeType)(total_mem));
 }
 
diff --git a/CUDA/src/balanced_gpuNUFFT_operator.cpp b/CUDA/src/balanced_gpuNUFFT_operator.cpp
index 3e621044..613ee1c8 100644
--- a/CUDA/src/balanced_gpuNUFFT_operator.cpp
+++ b/CUDA/src/balanced_gpuNUFFT_operator.cpp
@@ -99,12 +99,12 @@ void gpuNUFFT::BalancedGpuNUFFTOperator::performForwardGpuNUFFT(
     printf(
         "BGpuNUFFT: allocate and copy sector processing order of size %d...\n",
         this->sectorProcessingOrder.count());
-  allocateAndCopyToDeviceMem<IndType2>(&sector_processing_order_d,
-                                       this->sectorProcessingOrder.data,
-                                       this->sectorProcessingOrder.count());
+  //allocateAndCopyToDeviceMem<IndType2>(&sector_processing_order_d,
+    //                                   this->sectorProcessingOrder.data,
+      //                                 this->sectorProcessingOrder.count());
 
   GpuNUFFTOperator::performForwardGpuNUFFT(imgData, kspaceData, gpuNUFFTOut);
 
-  freeTotalDeviceMemory(sector_processing_order_d, NULL);  // NULL as stop token
+//  freeTotalDeviceMemory(sector_processing_order_d, NULL);  // NULL as stop token
 }
 
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 2b3664f2..628538b7 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -222,7 +222,13 @@ class GpuNUFFTPythonOperator
         if(interpolate_data)
             gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu, gpuNUFFT::DENSITY_ESTIMATION);
         else
-            gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu);
+        {
+            for(long int i=0; i<100000; i++)
+            {
+                printf("i = %ld\n", i);
+                gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu);
+            }
+        }
         cudaDeviceSynchronize();
     }
 
@@ -442,6 +448,7 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
+        printf("Destructor called\n");
         delete gpuNUFFTOp;
     }
 };
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 05387b74..78b22786 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -287,11 +287,11 @@ void gpuNUFFT::GpuNUFFTOperator::initDeviceMemory(int n_coils, int n_coils_cc)
     printf("creating cufft plan with %d,%d,%d dimensions\n",
            DEFAULT_VALUE(gi_host->gridDims.z), gi_host->gridDims.y,
            gi_host->gridDims.x);
-  cufftResult res = cufftPlan3d(
-      &fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z),
-      (int)gi_host->gridDims.y, (int)gi_host->gridDims.x, CufftTransformType);
-  if (res != CUFFT_SUCCESS)
-    fprintf(stderr, "error on CUFFT Plan creation!!! %d\n", res);
+  // cufftResult res = cufftPlan3d(
+  //     &fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z),
+  //     (int)gi_host->gridDims.y, (int)gi_host->gridDims.x, CufftTransformType);
+  // if (res != CUFFT_SUCCESS)
+  //   fprintf(stderr, "error on CUFFT Plan creation!!! %d\n", res);
   gpuMemAllocated = true;
 }
 
@@ -301,7 +301,9 @@ void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory()
     return;
 
   cudaFreeHost(gi_host);
-  cufftDestroy(fft_plan);
+  cufftResult res = cufftDestroy(fft_plan);
+  if (res != CUFFT_SUCCESS)
+    fprintf(stderr, "error on CUFFT Plan destruction!!! %d\n", res);
   // Destroy the cuFFT plan.
   if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess))
     printf("error at thread synchronization 9: %s\n",
@@ -312,13 +314,13 @@ void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory()
                         sectors_d, sector_centers_d, NULL);  // NULL as stop
 
   if (deapo_d != NULL)
-    cudaFree(deapo_d);
+    freeDeviceMem((void *)deapo_d);
 
   if (this->applySensData())
-    cudaFree(sens_d);
+    freeDeviceMem((void *)sens_d);
 
   if (this->applyDensComp())
-    cudaFree(density_comp_d);
+    freeDeviceMem((void *)density_comp_d);
 
   showMemoryInfo();
   gpuMemAllocated = false;
@@ -934,7 +936,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
   if (DEBUG)
     printf("allocate and copy imdata of size %d...\n",
            imdata_count * n_coils_cc);
-  allocateDeviceMem<DType2>(&imdata_d, imdata_count * n_coils_cc);
+  //allocateDeviceMem<DType2>(&imdata_d, imdata_count * n_coils_cc);
 
   if (debugTiming)
     printf("Memory allocation: %.2f ms\n", stopTiming());
@@ -948,9 +950,9 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     unsigned long int data_coil_offset = (long int)coil_it * data_count;
     unsigned long int im_coil_offset = coil_it * (long int)imdata_count;
 
-    data_d = kspaceData_gpu.data + data_coil_offset;
+    //data_d = kspaceData_gpu.data + data_coil_offset;
 
-    this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc);
+//    this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc);
 
     if (this->applySensData())
       // perform automatically "repeating" of input image in case
@@ -959,13 +961,13 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
         copyDeviceToDeviceAsync<DType2>(imgData_gpu.data,
                                    imdata_d + cnt * imdata_count, imdata_count, new_stream);
     else
-      copyDeviceToDeviceAsync<DType2>(imgData_gpu.data + im_coil_offset, imdata_d,
-                                 imdata_count * n_coils_cc, new_stream);
+//      copyDeviceToDeviceAsync<DType2>(imgData_gpu.data + im_coil_offset, imdata_d,
+  //                               imdata_count * n_coils_cc, new_stream);
 
     // reset temp arrays
-    cudaMemsetAsync(gdata_d, 0,
-               sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream);
-    cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream);
+//    cudaMemsetAsync(gdata_d, 0,
+  //             sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream);
+    //cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream);
 
     if (DEBUG && (cudaStreamSynchronize(new_stream)!= cudaSuccess))
       printf("error at thread synchronization 1: %s\n",
@@ -1000,12 +1002,12 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
         return;
     } 
     // apodization Correction
-    performForwardDeapodization(imdata_d, deapo_d, gi_host);
+    //performForwardDeapodization(imdata_d, deapo_d, gi_host);
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 2: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     // resize by oversampling factor and zero pad
-    performPadding(imdata_d, gdata_d, gi_host);
+    //performPadding(imdata_d, gdata_d, gi_host);
 
     if (debugTiming)
       startTiming();
@@ -1014,7 +1016,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       printf("error at thread synchronization 3: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     // shift image to get correct zero frequency position
-    performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host);
+    //performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host);
 
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 4: %s\n",
@@ -1024,9 +1026,9 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     int c = 0;
     while (c < n_coils_cc)
     {
-      if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count,
-                              gdata_d + c * gi_host->gridDims_count,
-                              grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS)
+     // if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count,
+       //                       gdata_d + c * gi_host->gridDims_count,
+         //                     grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS)
       {
         fprintf(stderr, "cufft has failed with err %i \n", err);
         showMemoryInfo(true, stderr);
@@ -1037,7 +1039,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 5: %s\n",
              cudaGetErrorString(cudaGetLastError()));
-    performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host);
+    //performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host);
 
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 6: %s\n",
@@ -1050,8 +1052,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       startTiming();
 
     // convolution and resampling to non-standard trajectory
-    forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d,
-                       sector_centers_d, gi_host);
+    //forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d,
+     //                  sector_centers_d, gi_host);
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 7: %s\n",
              cudaGetErrorString(cudaGetLastError()));
@@ -1059,28 +1061,27 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     if (debugTiming)
       printf("Forward Convolution: %.2f ms\n", stopTiming());
 
-    performFFTScaling(data_d, gi_host->data_count, gi_host);
+   // performFFTScaling(data_d, gi_host->data_count, gi_host);
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error: at thread synchronization 8: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     
     // write result in correct order back into output array
-    writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
-                    (int)this->kSpaceTraj.count(), n_coils_cc);
+  //  writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
+    //                (int)this->kSpaceTraj.count(), n_coils_cc);
     if(coil_it > 1)
     {
       cudaStreamSynchronize(old_stream);
       cudaStreamDestroy(old_stream);
     }
-    copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream);
+   // copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream);
     old_stream = new_stream;
   }  // iterate over coils
 
   cudaStreamSynchronize(old_stream);
   cudaStreamDestroy(old_stream);
-  freeTotalDeviceMemory(imdata_d, NULL);
+ // freeTotalDeviceMemory(imdata_d, NULL);
   this->freeDeviceMemory();
-
   if ((cudaDeviceSynchronize() != cudaSuccess))
     fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n",
             cudaGetErrorString(cudaGetLastError()));
diff --git a/setup.py b/setup.py
index 6f0dd605..e12094c9 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@ def build_extension(self, ext):
                       "-DGEN_PYTHON_FILES=ON",
                       "-DGEN_MEX_FILES=OFF",
                       "-DPYBIND11_INCLUDE_DIR=" + self.pybind_path]
-        cfg = "Debug" if self.debug else "Release"
+        cfg = "Debug"# if self.debug else "Release"
         build_args = ["--config", cfg]
 
         if platform.system() == "Windows":

From 0dc96874a0cd0acde99b22dbbc3dfdd6cf9c5e4c Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Mon, 5 Aug 2024 10:05:31 +0200
Subject: [PATCH 84/85] A bunch of fixes to support CUDA12.0

---
 CUDA/CMakeLists.txt                           |    8 +-
 .../balanced_texture_gpuNUFFT_operator.hpp    |   91 --
 CUDA/inc/cuda_utils.cuh                       |   64 -
 CUDA/inc/cuda_utils.hpp                       |   33 -
 CUDA/inc/gpuNUFFT_kernels.hpp                 |  149 ---
 CUDA/inc/gpuNUFFT_operator_factory.hpp        |   16 +-
 CUDA/inc/texture_gpuNUFFT_operator.hpp        |   70 -
 CUDA/src/CMakeLists.txt                       |    4 +-
 CUDA/src/balanced_gpuNUFFT_operator.cpp       |    8 +-
 .../balanced_texture_gpuNUFFT_operator.cpp    |  126 --
 CUDA/src/gpu/atomic/CMakeLists.txt            |    1 -
 CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu        |    1 -
 .../gpu/atomic/texture_gpuNUFFT_kernels.cu    | 1153 -----------------
 .../gpuNUFFT_operator_python_factory.cpp      |    9 +-
 CUDA/src/gpu/std_gpuNUFFT_kernels.cu          |   89 --
 CUDA/src/gpuNUFFT_operator.cpp                |   73 +-
 CUDA/src/gpuNUFFT_operator_factory.cpp        |   50 +-
 CUDA/src/texture_gpuNUFFT_operator.cpp        |  103 --
 setup.py                                      |    4 +-
 19 files changed, 66 insertions(+), 1986 deletions(-)
 delete mode 100644 CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp
 delete mode 100644 CUDA/inc/texture_gpuNUFFT_operator.hpp
 delete mode 100644 CUDA/src/balanced_texture_gpuNUFFT_operator.cpp
 delete mode 100644 CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu
 delete mode 100644 CUDA/src/texture_gpuNUFFT_operator.cpp

diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt
index 3235a868..10958e50 100644
--- a/CUDA/CMakeLists.txt
+++ b/CUDA/CMakeLists.txt
@@ -70,8 +70,8 @@ endif(GEN_MEX_FILES)
 
 #Options
 #General DEBUG output 
-SET (DEBUG true)
-OPTION(WITH_DEBUG "Enable DEBUG messages" ON)
+SET (DEBUG false)
+OPTION(WITH_DEBUG "Enable DEBUG messages" OFF)
 if (WITH_DEBUG)
  SET (DEBUG true)
 endif()
@@ -163,10 +163,8 @@ SET(GPUNUFFT_INCLUDE ${GPUNUFFT_INC_DIR}/cuda_utils.hpp
 										 ${GPUNUFFT_INC_DIR}/precomp_utils.hpp
                      ${GPUNUFFT_INC_DIR}/gpuNUFFT_operator.hpp
 										 ${GPUNUFFT_INC_DIR}/balanced_operator.hpp
-										 ${GPUNUFFT_INC_DIR}/texture_gpuNUFFT_operator.hpp
 										 ${GPUNUFFT_INC_DIR}/balanced_gpuNUFFT_operator.hpp
-                     ${GPUNUFFT_INC_DIR}/gpuNUFFT_operator_factory.hpp
-										 ${GPUNUFFT_INC_DIR}/balanced_texture_gpuNUFFT_operator.hpp)
+                     ${GPUNUFFT_INC_DIR}/gpuNUFFT_operator_factory.hpp)
 					 
 SET(MATLAB_HELPER_INCLUDE ${GPUNUFFT_INC_DIR}/matlab_helper.h)
 SET(CONFIG_INCLUDE ${GPUNUFFT_INC_DIR}/config.hpp ${GPUNUFFT_INC_DIR}/cufft_config.hpp)
diff --git a/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp b/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp
deleted file mode 100644
index d7672f73..00000000
--- a/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef BALANCED_TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED
-#define BALANCED_TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED
-
-#include "gpuNUFFT_types.hpp"
-#include "texture_gpuNUFFT_operator.hpp"
-#include "balanced_operator.hpp"
-
-namespace gpuNUFFT
-{
-/**
-  * \brief GpuNUFFTOperator with load balancing and texture memory lookup
-  *
-  * Changes the behaviour of the default GpuNUFFTOperator by balancing the
-  * work load by sector to a maximum amount of samples per sector
-  *(MAXIMUM_PAYLOAD).
-  * Thus, sectors with a high density of data points are split into multiple
-  *ones,
-  * which are processed in parallel.
-  *
-  * Furthermore, the kernel interpolation is performed by using gpu texture
-  *memory.
-  *
-  */
-class BalancedTextureGpuNUFFTOperator : public TextureGpuNUFFTOperator,
-                                        public BalancedOperator
-{
- public:
-  BalancedTextureGpuNUFFTOperator(IndType kernelWidth, IndType sectorWidth,
-                                  DType osf, Dimensions imgDims,
-                                  InterpolationType interpolationType = TEXTURE2D_LOOKUP,
-                                  bool matlabSharedMem = false)
-    : TextureGpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims,
-                              interpolationType, matlabSharedMem)
-  {
-  }
-
-  ~BalancedTextureGpuNUFFTOperator()
-  {
-     freeLocalMemberArray(this->sectorProcessingOrder.data);
-  }
-
-  // OPERATIONS
-  void performGpuNUFFTAdj(Array<DType2> kspaceData, Array<CufftType> &imgData,
-                          GpuNUFFTOutput gpuNUFFTOut = DEAPODIZATION);
-  void performGpuNUFFTAdj(GpuArray<DType2> kspaceData_gpu,
-                          GpuArray<CufftType> &imgData_gpu,
-                          GpuNUFFTOutput gpuNUFFTOut = DEAPODIZATION);
-
-  void performForwardGpuNUFFT(Array<DType2> imgData,
-                              Array<CufftType> &kspaceData,
-                              GpuNUFFTOutput gpuNUFFTOut = DEAPODIZATION);
-  void performForwardGpuNUFFT(GpuArray<DType2> imgData_gpu,
-                              GpuArray<CufftType> &kspaceData,
-                              GpuNUFFTOutput gpuNUFFTOut = DEAPODIZATION);
-
-  // Getter and Setter for Processing Order
-  Array<IndType2> getSectorProcessingOrder()
-  {
-    return this->sectorProcessingOrder;
-  }
-  void setSectorProcessingOrder(Array<IndType2> sectorProcessingOrder)
-  {
-    this->sectorProcessingOrder = sectorProcessingOrder;
-  }
-
-  OperatorType getType()
-  {
-    return gpuNUFFT::BALANCED_TEXTURE;
-  }
-  // OPERATIONS
- private:
-  GpuNUFFTInfo *initAndCopyGpuNUFFTInfo(int n_coils_cc = 1);
-
-  // sectorProcessingOrder
-  Array<IndType2> sectorProcessingOrder;
-
-  IndType2 *sector_processing_order_d;
-
-  void adjConvolution(DType2 *data_d, DType *crds_d, CufftType *gdata_d,
-                      DType *kernel_d, IndType *sectors_d,
-                      IndType *sector_centers_d,
-                      gpuNUFFT::GpuNUFFTInfo *gi_host);
-
-  void forwardConvolution(CufftType *data_d, DType *crds_d, CufftType *gdata_d,
-                          DType *kernel_d, IndType *sectors_d,
-                          IndType *sector_centers_d,
-                          gpuNUFFT::GpuNUFFTInfo *gi_host);
-};
-}
-
-#endif  // BALANCED_TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED
diff --git a/CUDA/inc/cuda_utils.cuh b/CUDA/inc/cuda_utils.cuh
index 9a522e78..f57b44d6 100644
--- a/CUDA/inc/cuda_utils.cuh
+++ b/CUDA/inc/cuda_utils.cuh
@@ -7,70 +7,6 @@ __constant__ gpuNUFFT::GpuNUFFTInfo GI;
 
 __constant__ DType KERNEL[10000];
 
-texture<float, 1, cudaReadModeElementType> texKERNEL;
-texture<float, 2, cudaReadModeElementType> texKERNEL2D;
-texture<float, 3, cudaReadModeElementType> texKERNEL3D;
-
-texture<float2> texDATA;
-texture<cufftComplex> texGDATA;
-
-__inline__ __device__ float compute1DTextureLookup(float x, float y)
-{
-  return tex1D(texKERNEL, x) * tex1D(texKERNEL, y);
-}
-
-__inline__ __device__ float compute1DTextureLookup(float x, float y, float z)
-{
-  return tex1D(texKERNEL, x) * tex1D(texKERNEL, y) * tex1D(texKERNEL, z);
-}
-
-__inline__ __device__ float compute2DTextureLookup(float x, float y)
-{
-  return (float)tex2D(texKERNEL2D, (float)x, (float)y);
-}
-
-__inline__ __device__ float compute2DTextureLookup(float x, float y, float z)
-{
-  return (float)tex2D(texKERNEL2D, (float)x, (float)y) *
-         tex2D(texKERNEL2D, (float)z, 0);
-}
-
-__inline__ __device__ float compute3DTextureLookup(float x, float y)
-{
-  return tex3D(texKERNEL3D, x, y, 0);
-}
-
-__inline__ __device__ float compute3DTextureLookup(float x, float y, float z)
-{
-  return tex3D(texKERNEL3D, x, y, z);
-}
-
-__inline__ __device__ float computeTextureLookup(float x, float y)
-{
-  // wired to 2d
-  return compute2DTextureLookup((float)x, (float)y);
-  // switch(GI.interpolationType)
-  //{
-  //  case 1: return compute1DTextureLookup(x,y);
-  //  case 2: return compute2DTextureLookup(x,y);
-  //  case 3: return compute3DTextureLookup(x,y);
-  //  default: return (float)0.0;
-  //}
-}
-
-__inline__ __device__ float computeTextureLookup(float x, float y, float z)
-{
-  // wired to 2d
-  return compute2DTextureLookup(x, y, z);
-  // switch(GI.interpolationType)
-  //{
-  //  case 1: return compute1DTextureLookup(x,y,z);
-  //  case 2: return compute2DTextureLookup(x,y,z);
-  //  case 3: return compute3DTextureLookup(x,y,z);
-  //  default: return (float)0.0;
-  //}
-}
-
 #if __CUDA_ARCH__ < 200
 #define THREAD_BLOCK_SIZE 256
 #else
diff --git a/CUDA/inc/cuda_utils.hpp b/CUDA/inc/cuda_utils.hpp
index ee1c76d9..23e76bc7 100644
--- a/CUDA/inc/cuda_utils.hpp
+++ b/CUDA/inc/cuda_utils.hpp
@@ -253,37 +253,4 @@ inline void showMemoryInfo()
  */
 void initConstSymbol(const char *symbol, const void *src, IndType count, cudaStream_t stream=0);
 
-/** \brief Initialize texture memory on device
- *
- * CUDA Kernel function prototype.
- *
- * @param symbol Texture symbol name
- */
-void initTexture(const char *symbol, cudaArray **devicePtr,
-                 gpuNUFFT::Array<DType> hostTexture);
-
-/** \brief Bind to 1-d texture on device
- *
- * CUDA Kernel function prototype.
- *
- * @param symbol Texture symbol name
- */
-void bindTo1DTexture(const char *symbol, void *devicePtr, IndType count);
-
-/** \brief Unbind from device texture
- *
- * CUDA Kernel function prototype.
- *
- * @param symbol Texture symbol name
- */
-void unbindTexture(const char *symbol);
-
-/** \brief Free texture memory on device
- *
- * CUDA Kernel function prototype.
- *
- * @param symbol Texture symbol name
- */
-void freeTexture(const char *symbol, cudaArray *devicePtr);
-
 #endif
diff --git a/CUDA/inc/gpuNUFFT_kernels.hpp b/CUDA/inc/gpuNUFFT_kernels.hpp
index cd4861dd..48439259 100644
--- a/CUDA/inc/gpuNUFFT_kernels.hpp
+++ b/CUDA/inc/gpuNUFFT_kernels.hpp
@@ -80,80 +80,6 @@ void performConvolution(DType2 *data_d, DType *crds_d, CufftType *gdata_d,
                         IndType *sector_centers_d,
                         gpuNUFFT::GpuNUFFTInfo *gi_host);
 
-/**
- * \brief Adjoint gridding convolution implementation on GPU using textures for
- *kernel lookup.
- *
- * Performs the adjoint gridding convolution step on the GPU, thus the
- *interpolation
- * from non-uniform sampled k-space data onto the uniform oversampled grid.
- *
- * The distance from each sample to its neighboring grid positions is computed
- *and the corresponding
- * data value is weighted by the kernel function according to the distance.
- *
- * The kernel lookup is performed by the use of gpu textures.
- *
- * CUDA function prototype.
- *
- *
- * @param data_d            Input k-space sample data value, complex, sorted due
- *to precomputation
- * @param crds_d            k-space sample coordinate (non-cartesian),
- *linearized array (x1,x2,x3,...,xn,y1,y2,y3,...,yn,z1,z2,z3,...zn)
- * @param gdata_d           Outpu k-space grid (cartesian)
- * @param kernel_d          precomputed interpolation kernel
- * @param sectors_d         precomputed data-sector mapping, defines the range
- *of data elements per sector, e.g. 0,3,4,4,10 -> maps data points 0..3 to
- *sector id 0, 3..4 to sector 1, no data point to sector 2, 4..10 to sector 3 an
- *so on
- * @param sector_processing_order_d precomputed sector processing order
- * @param sector_centers_d  precomputed coordinates (x,y,z) of sector centers
- * @param gi_host           info struct with meta information
- */
-void performTextureConvolution(DType2 *data_d, DType *crds_d,
-                               CufftType *gdata_d, DType *kernel_d,
-                               IndType *sectors_d, IndType *sector_centers_d,
-                               gpuNUFFT::GpuNUFFTInfo *gi_host);
-
-/**
- * \brief Adjoint gridding convolution implementation on GPU using textures and
- *sector load balancing.
- *
- * Performs the adjoint gridding convolution step on the GPU, thus the
- *interpolation
- * from non-uniform sampled k-space data onto the uniform oversampled grid.
- *
- * The distance from each sample to its neighboring grid positions is computed
- *and the corresponding
- * data value is weighted by the kernel function according to the distance.
- *
- * The kernel lookup is performed by the use of gpu textures and the workload is
- *balanced.
- *
- * CUDA function prototype.
- *
- * @param data_d            Input k-space sample data value, complex, sorted due
- *to precomputation
- * @param crds_d            k-space sample coordinate (non-cartesian),
- *linearized array (x1,x2,x3,...,xn,y1,y2,y3,...,yn,z1,z2,z3,...zn)
- * @param gdata_d           Output k-space grid (cartesian)
- * @param kernel_d          precomputed interpolation kernel
- * @param sectors_d         precomputed data-sector mapping, defines the range
- *of data elements per sector, e.g. 0,3,4,4,10 -> maps data points 0..3 to
- *sector id 0, 3..4 to sector 1, no data point to sector 2, 4..10 to sector 3 an
- *so on
- * @param sector_processing_order_d precomputed sector processing order
- * @param sector_centers_d  precomputed coordinates (x,y,z) of sector centers
- * @param gi_host           info struct with meta information
- */
-void performTextureConvolution(DType2 *data_d, DType *crds_d,
-                               CufftType *gdata_d, DType *kernel_d,
-                               IndType *sectors_d,
-                               IndType2 *sector_processing_order_d,
-                               IndType *sector_centers_d,
-                               gpuNUFFT::GpuNUFFTInfo *gi_host);
-
 // FORWARD Operations
 
 /**
@@ -228,81 +154,6 @@ void performForwardConvolution(CufftType *data_d, DType *crds_d,
                                IndType *sector_centers_d,
                                gpuNUFFT::GpuNUFFTInfo *gi_host);
 
-/**
- * \brief Forward gridding convolution implementation on GPU using textures .
- *
- * Performs the forward gridding convolution step on the GPU, thus the
- *interpolation
- * from uniform oversampled grid positions to non-uniform sampled k-space data
- *points.
- *
- * The distance from each sample to its neighboring grid positions is computed
- *and the corresponding
- * data value is weighted by the kernel function according to the distance.
- *
- * The kernel lookup is performed by the use of gpu textures.
- *
- * CUDA function prototype.
- *
- * @param data_d            Output k-space sample data value, complex, sorted
- *due to precomputation
- * @param crds_d            k-space sample coordinate (non-cartesian),
- *linearized array (x1,x2,x3,...,xn,y1,y2,y3,...,yn,z1,z2,z3,...zn)
- * @param gdata_d           Input k-space grid (cartesian)
- * @param kernel_d          precomputed interpolation kernel
- * @param sectors_d         precomputed data-sector mapping, defines the range
- *of data elements per sector, e.g. 0,3,4,4,10 -> maps data points 0..3 to
- *sector id 0, 3..4 to sector 1, no data point to sector 2, 4..10 to sector 3 an
- *so on
- * @param sector_centers_d  precomputed coordinates (x,y,z) of sector centers
- * @param gi_host           info struct with meta information
- */
-void performTextureForwardConvolution(CufftType *data_d, DType *crds_d,
-                                      CufftType *gdata_d, DType *kernel_d,
-                                      IndType *sectors_d,
-                                      IndType *sector_centers_d,
-                                      gpuNUFFT::GpuNUFFTInfo *gi_host);
-
-/**
- * \brief Forward gridding convolution implementation on GPU using sector load
- *balancing and textures.
- *
- * Performs the forward gridding convolution step on the GPU, thus the
- *interpolation
- * from uniform oversampled grid positions to non-uniform sampled k-space data
- *points.
- *
- * The distance from each sample to its neighboring grid positions is computed
- *and the corresponding
- * data value is weighted by the kernel function according to the distance.
- *
- * The kernel lookup is performed by the use of gpu textures.
- * In order to balance the work load per thread block a sector processing order
- *is precomputed.
- *
- * CUDA function prototype.
- *
- * @param data_d            Output k-space sample data value, complex, sorted
- *due to precomputation
- * @param crds_d            k-space sample coordinate (non-cartesian),
- *linearized array (x1,x2,x3,...,xn,y1,y2,y3,...,yn,z1,z2,z3,...zn)
- * @param gdata_d           Input k-space grid (cartesian)
- * @param kernel_d          precomputed interpolation kernel
- * @param sectors_d         precomputed data-sector mapping, defines the range
- *of data elements per sector, e.g. 0,3,4,4,10 -> maps data points 0..3 to
- *sector id 0, 3..4 to sector 1, no data point to sector 2, 4..10 to sector 3 an
- *so on
- * @param sector_processing_order_d precomputed sector processing order
- * @param sector_centers_d  precomputed coordinates (x,y,z) of sector centers
- * @param gi_host           info struct with meta information
- */
-void performTextureForwardConvolution(CufftType *data_d, DType *crds_d,
-                                      CufftType *gdata_d, DType *kernel_d,
-                                      IndType *sectors_d,
-                                      IndType2 *sector_processing_order_d,
-                                      IndType *sector_centers_d,
-                                      gpuNUFFT::GpuNUFFTInfo *gi_host);
-
 // UTIL Functions
 /** \brief Scale each element by the total number of elements.
   *
diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp
index 1c4bd992..24b937e7 100644
--- a/CUDA/inc/gpuNUFFT_operator_factory.hpp
+++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp
@@ -4,8 +4,6 @@
 #include "config.hpp"
 #include "gpuNUFFT_operator.hpp"
 #include "balanced_gpuNUFFT_operator.hpp"
-#include "texture_gpuNUFFT_operator.hpp"
-#include "balanced_texture_gpuNUFFT_operator.hpp"
 #include <algorithm>  // std::sort
 #include <vector>     // std::vector
 #include <string>
@@ -27,8 +25,6 @@ namespace gpuNUFFT
  *   operator like from subsequent matlab calls
  *
  * The factory defines how the operator is going to process (load balancing
- *and/or
- * texture interpolation).
  *
  * Sector mapping:
  *
@@ -51,13 +47,12 @@ class GpuNUFFTOperatorFactory
 
   /** \brief Constructor overload
     *
-    * @param useTextures Flag to indicate texture interpolation
     * @param useGpu Flag to indicat&GpuNUFFTPythonOperator::adj_op);e gpu usage for precomputation
     * @param balanceWorkload Flag to indicate load balancing
     */
-  GpuNUFFTOperatorFactory(const bool useTextures = false, const bool useGpu = true,
+  GpuNUFFTOperatorFactory(const bool useGpu = true,
                           bool balanceWorkload = true, bool matlabSharedMem = false)
-    : useTextures(useTextures), useGpu(useGpu), balanceWorkload(balanceWorkload),
+    : useGpu(useGpu), balanceWorkload(balanceWorkload),
     matlabSharedMem(matlabSharedMem)
   {
   }
@@ -171,8 +166,6 @@ class GpuNUFFTOperatorFactory
       Array<DType> &deapoData, const IndType &kernelWidth, const IndType &sectorWidth, 
       const DType &osf, Dimensions &imgDims);
 
-  void setUseTextures(bool useTextures);
-
   void setBalanceWorkload(bool balanceWorkload);
   
   /**
@@ -298,8 +291,6 @@ class GpuNUFFTOperatorFactory
     *
     * - default: GpuNUFFTOperator
     * - balanceWorkload = true: BalancedGpuNUFFTOperator
-    * - useTextures = true: TextureGpuNUFFTOperator
-    * - balanceWorkload + useTextures = true: BalancedTextureGpuNUFFTOperator
     *
     * @return New allocated GpuNUFFTOperator or sub class
     */
@@ -328,9 +319,6 @@ class GpuNUFFTOperatorFactory
 
  
  private:
-  /** \brief Flag to indicate texture interpolation */
-  bool useTextures;
-
   /** \brief Flag to indicate gpu usage for precomputation */
   bool useGpu;
 
diff --git a/CUDA/inc/texture_gpuNUFFT_operator.hpp b/CUDA/inc/texture_gpuNUFFT_operator.hpp
deleted file mode 100644
index 5d1bca98..00000000
--- a/CUDA/inc/texture_gpuNUFFT_operator.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED
-#define TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED
-
-#include <typeinfo>
-#include <stdexcept>
-#include "gpuNUFFT_types.hpp"
-#include "gpuNUFFT_operator.hpp"
-
-namespace gpuNUFFT
-{
-/**
-* \brief GpuNUFFTOperator with texture memory lookup
-*
-* Changes the behaviour of the default GpuNUFFTOperator by using gpu texture
-*memory
-* in the kernel interpolation step.
-*
-*/
-class TextureGpuNUFFTOperator : public GpuNUFFTOperator
-{
- public:
-  TextureGpuNUFFTOperator(IndType kernelWidth, IndType sectorWidth, DType osf,
-                          Dimensions imgDims,
-                          InterpolationType interpolationType = TEXTURE2D_LOOKUP,
-                          bool matlabSharedMem = false)
-    : GpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims, false, TEXTURE, matlabSharedMem),
-    interpolationType(interpolationType), kernel_d(NULL)
-  {
-    if (typeid(DType) == typeid(double))
-      throw std::runtime_error(
-          "Double precision textures are not supported yet!");
-
-    initKernel();
-  }
-
-  ~TextureGpuNUFFTOperator()
-  {
-  }
-
-  virtual OperatorType getType()
-  {
-    return gpuNUFFT::TEXTURE;
-  }
-
- protected:
-  void initKernel();
-
-  cudaArray *kernel_d;
-  InterpolationType interpolationType;
-  const char *getInterpolationTypeName();
-
-  // OPERATIONS
- private:
-  GpuNUFFTInfo *initAndCopyGpuNUFFTInfo(int n_coils_cc = 1);
-
-  virtual void adjConvolution(DType2 *data_d, DType *crds_d, CufftType *gdata_d,
-                              DType *kernel_d, IndType *sectors_d,
-                              IndType *sector_centers_d,
-                              gpuNUFFT::GpuNUFFTInfo *gi_host);
-  virtual void forwardConvolution(CufftType *data_d, DType *crds_d,
-                                  CufftType *gdata_d, DType *kernel_d,
-                                  IndType *sectors_d, IndType *sector_centers_d,
-                                  gpuNUFFT::GpuNUFFTInfo *gi_host);
-
-  void initLookupTable();
-  void freeLookupTable();
-};
-}
-
-#endif  // TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED
diff --git a/CUDA/src/CMakeLists.txt b/CUDA/src/CMakeLists.txt
index b3b9a21b..8fb4f83c 100644
--- a/CUDA/src/CMakeLists.txt
+++ b/CUDA/src/CMakeLists.txt
@@ -4,9 +4,7 @@ SET(GPUNUFFT_SRC_DIR ${CMAKE_SOURCE_DIR}/src)
 SET(GPUNUFFT_SOURCES ${GPUNUFFT_SRC_DIR}/gpuNUFFT_utils.cpp
                      ${GPUNUFFT_SRC_DIR}/gpuNUFFT_operator_factory.cpp
                      ${GPUNUFFT_SRC_DIR}/gpuNUFFT_operator.cpp
-										 ${GPUNUFFT_SRC_DIR}/texture_gpuNUFFT_operator.cpp
-										 ${GPUNUFFT_SRC_DIR}/balanced_gpuNUFFT_operator.cpp
-										 ${GPUNUFFT_SRC_DIR}/balanced_texture_gpuNUFFT_operator.cpp)
+					 ${GPUNUFFT_SRC_DIR}/balanced_gpuNUFFT_operator.cpp)
 
 ADD_SUBDIRECTORY(gpu)
 
diff --git a/CUDA/src/balanced_gpuNUFFT_operator.cpp b/CUDA/src/balanced_gpuNUFFT_operator.cpp
index 613ee1c8..3e621044 100644
--- a/CUDA/src/balanced_gpuNUFFT_operator.cpp
+++ b/CUDA/src/balanced_gpuNUFFT_operator.cpp
@@ -99,12 +99,12 @@ void gpuNUFFT::BalancedGpuNUFFTOperator::performForwardGpuNUFFT(
     printf(
         "BGpuNUFFT: allocate and copy sector processing order of size %d...\n",
         this->sectorProcessingOrder.count());
-  //allocateAndCopyToDeviceMem<IndType2>(&sector_processing_order_d,
-    //                                   this->sectorProcessingOrder.data,
-      //                                 this->sectorProcessingOrder.count());
+  allocateAndCopyToDeviceMem<IndType2>(&sector_processing_order_d,
+                                       this->sectorProcessingOrder.data,
+                                       this->sectorProcessingOrder.count());
 
   GpuNUFFTOperator::performForwardGpuNUFFT(imgData, kspaceData, gpuNUFFTOut);
 
-//  freeTotalDeviceMemory(sector_processing_order_d, NULL);  // NULL as stop token
+  freeTotalDeviceMemory(sector_processing_order_d, NULL);  // NULL as stop token
 }
 
diff --git a/CUDA/src/balanced_texture_gpuNUFFT_operator.cpp b/CUDA/src/balanced_texture_gpuNUFFT_operator.cpp
deleted file mode 100644
index 1ad519c6..00000000
--- a/CUDA/src/balanced_texture_gpuNUFFT_operator.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#include "balanced_texture_gpuNUFFT_operator.hpp"
-
-gpuNUFFT::GpuNUFFTInfo *
-gpuNUFFT::BalancedTextureGpuNUFFTOperator::initAndCopyGpuNUFFTInfo(
-    int n_coils_cc)
-{
-  gpuNUFFT::GpuNUFFTInfo *gi_host = initGpuNUFFTInfo(n_coils_cc);
-
-  gi_host->sectorsToProcess = sectorProcessingOrder.count();
-  gi_host->interpolationType = interpolationType;
-
-  if (DEBUG)
-    printf("copy GpuNUFFT Info to symbol memory... size = %lu \n",
-      (SizeType)sizeof(gpuNUFFT::GpuNUFFTInfo));
-
-  initConstSymbol("GI", gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo));
-
-  if (DEBUG)
-    printf("...done!\n");
-  return gi_host;
-}
-
-void gpuNUFFT::BalancedTextureGpuNUFFTOperator::adjConvolution(
-    DType2 *data_d, DType *crds_d, CufftType *gdata_d, DType *kernel_d,
-    IndType *sectors_d, IndType *sector_centers_d,
-    gpuNUFFT::GpuNUFFTInfo *gi_host)
-{
-  bindTo1DTexture("texDATA", data_d,
-                  this->kSpaceTraj.count() * gi_host->n_coils_cc);
-
-  // call balanced texture kernel
-  performTextureConvolution(data_d, crds_d, gdata_d, kernel_d, sectors_d,
-                            sector_processing_order_d, sector_centers_d,
-                            gi_host);
-
-  unbindTexture("texDATA");
-}
-
-void gpuNUFFT::BalancedTextureGpuNUFFTOperator::forwardConvolution(
-    CufftType *data_d, DType *crds_d, CufftType *gdata_d, DType *kernel_d,
-    IndType *sectors_d, IndType *sector_centers_d,
-    gpuNUFFT::GpuNUFFTInfo *gi_host)
-{
-  bindTo1DTexture("texGDATA", gdata_d,
-                  gi_host->grid_width_dim * gi_host->n_coils_cc);
-
-  // call balanced texture kernel
-  performTextureForwardConvolution(data_d, crds_d, gdata_d, kernel_d, sectors_d,
-                                   sector_processing_order_d, sector_centers_d,
-                                   gi_host);
-
-  unbindTexture("texGDATA");
-}
-
-// Adds behaviour of GpuNUFFTOperator by
-// adding a sector processing order
-void gpuNUFFT::BalancedTextureGpuNUFFTOperator::performGpuNUFFTAdj(
-    gpuNUFFT::Array<DType2> kspaceData, gpuNUFFT::Array<CufftType> &imgData,
-    GpuNUFFTOutput gpuNUFFTOut)
-{
-  if (DEBUG)
-    printf(
-        "BTGpuNUFFT: allocate and copy sector processing order of size %d...\n",
-        this->sectorProcessingOrder.count());
-  allocateAndCopyToDeviceMem<IndType2>(&sector_processing_order_d,
-                                       this->sectorProcessingOrder.data,
-                                       this->sectorProcessingOrder.count());
-
-  TextureGpuNUFFTOperator::performGpuNUFFTAdj(kspaceData, imgData, gpuNUFFTOut);
-
-  freeTotalDeviceMemory(sector_processing_order_d, NULL);  // NULL as stop token
-}
-
-void gpuNUFFT::BalancedTextureGpuNUFFTOperator::performGpuNUFFTAdj(
-    GpuArray<DType2> kspaceData_gpu, GpuArray<CufftType> &imgData_gpu,
-    GpuNUFFTOutput gpuNUFFTOut)
-{
-  if (DEBUG)
-    printf(
-        "BTGpuNUFFT: allocate and copy sector processing order of size %d...\n",
-        this->sectorProcessingOrder.count());
-  allocateAndCopyToDeviceMem<IndType2>(&sector_processing_order_d,
-                                       this->sectorProcessingOrder.data,
-                                       this->sectorProcessingOrder.count());
-
-  TextureGpuNUFFTOperator::performGpuNUFFTAdj(kspaceData_gpu, imgData_gpu,
-                                              gpuNUFFTOut);
-
-  freeTotalDeviceMemory(sector_processing_order_d, NULL);  // NULL as stop token
-}
-
-void gpuNUFFT::BalancedTextureGpuNUFFTOperator::performForwardGpuNUFFT(
-    gpuNUFFT::Array<DType2> imgData, gpuNUFFT::Array<CufftType> &kspaceData,
-    GpuNUFFTOutput gpuNUFFTOut)
-{
-  if (DEBUG)
-    printf(
-        "BTGpuNUFFT: allocate and copy sector processing order of size %d...\n",
-        this->sectorProcessingOrder.count());
-  allocateAndCopyToDeviceMem<IndType2>(&sector_processing_order_d,
-                                       this->sectorProcessingOrder.data,
-                                       this->sectorProcessingOrder.count());
-
-  TextureGpuNUFFTOperator::performForwardGpuNUFFT(imgData, kspaceData,
-                                                  gpuNUFFTOut);
-
-  freeTotalDeviceMemory(sector_processing_order_d, NULL);  // NULL as stop token
-}
-
-void gpuNUFFT::BalancedTextureGpuNUFFTOperator::performForwardGpuNUFFT(
-    gpuNUFFT::GpuArray<DType2> imgData,
-    gpuNUFFT::GpuArray<CufftType> &kspaceData, GpuNUFFTOutput gpuNUFFTOut)
-{
-  if (DEBUG)
-    printf(
-        "BTGpuNUFFT: allocate and copy sector processing order of size %d...\n",
-        this->sectorProcessingOrder.count());
-  allocateAndCopyToDeviceMem<IndType2>(&sector_processing_order_d,
-                                       this->sectorProcessingOrder.data,
-                                       this->sectorProcessingOrder.count());
-
-  TextureGpuNUFFTOperator::performForwardGpuNUFFT(imgData, kspaceData,
-                                                  gpuNUFFTOut);
-
-  freeTotalDeviceMemory(sector_processing_order_d, NULL);  // NULL as stop token
-}
diff --git a/CUDA/src/gpu/atomic/CMakeLists.txt b/CUDA/src/gpu/atomic/CMakeLists.txt
index 061b9c12..3dd73742 100644
--- a/CUDA/src/gpu/atomic/CMakeLists.txt
+++ b/CUDA/src/gpu/atomic/CMakeLists.txt
@@ -7,7 +7,6 @@ cuda_include_directories(${GPUNUFFT_INC_DIR})
 
 set(GPU_CU_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/atomic_gpuNUFFT.cu 	
 					#${CMAKE_CURRENT_SOURCE_DIR}/atomic_gpuNUFFT_kernels.cu 
-					#${CMAKE_CURRENT_SOURCE_DIR}/texture_gpuNUFFT_kernels.cu 
 					#${CMAKE_CURRENT_SOURCE_DIR}/../std_gpuNUFFT_kernels.cu
 									 )
 if(WIN32)
diff --git a/CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu b/CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu
index bf2e9622..87df6de0 100644
--- a/CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu
+++ b/CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu
@@ -1,2 +1 @@
 #include "atomic_gpuNUFFT_kernels.cu"
-#include "texture_gpuNUFFT_kernels.cu"
diff --git a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu
deleted file mode 100644
index e143ddcb..00000000
--- a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu
+++ /dev/null
@@ -1,1153 +0,0 @@
-#ifndef TEXTURE_GPUNUFFT_KERNELS_H
-#define TEXTURE_GPUNUFFT_KERNELS_H
-#include "gpuNUFFT_kernels.hpp"
-#include "../std_gpuNUFFT_kernels.cu"
-#include "cuda_utils.cuh"
-
-// ----------------------------------------------------------------------------
-// convolutionKernel: NUFFT^H kernel
-//
-// Performs the gpuNUFFT step by convolution of sample points with
-// interpolation function and resampling onto grid. Basic concept based on Zwart
-// et al.
-//
-// parameters:
-//  * data           : complex input sample points
-//  * crds           : coordinates of data points (x,y,z)
-//  * gdata          : output grid data
-//  * sectors        : mapping of sample indices according to each sector
-//  * sector_centers : coordinates (x,y,z) of sector centers
-//  * temp_gdata     : temporary grid data
-//  * N              : number of threads
-__device__ void textureConvolutionFunction(int *sec, int sec_max,
-                                           int sec_offset, DType2 *sdata,
-                                           DType2 *data, DType *crds,
-                                           CufftType *gdata, IndType *sectors,
-                                           IndType *sector_centers)
-{
-  // start convolution
-  int ind, x, y, z;
-  int imin, imax, jmin, jmax, kmin, kmax;
-
-  DType dx_sqr, dy_sqr, dz_sqr, val, ix, jy, kz;
-
-  __shared__ IndType3 center;
-  center.x = sector_centers[sec[threadIdx.x] * 3];
-  center.y = sector_centers[sec[threadIdx.x] * 3 + 1];
-  center.z = sector_centers[sec[threadIdx.x] * 3 + 2];
-
-  // Grid Points over Threads
-  int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset;
-  // loop over all data points of the current sector, and check if grid position
-  // lies inside
-  // affected region, if so, add data point weighted to grid position value
-  while (data_cnt < sec_max)
-  {
-    DType3 data_point;  // datapoint per thread
-    data_point.x = crds[data_cnt];
-    data_point.y = crds[data_cnt + GI.data_count];
-    data_point.z = crds[data_cnt + 2 * GI.data_count];
-
-    // set the boundaries of final dataset for gpuNUFFT this point
-    ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x,
-                         GI.sector_offset);
-    set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius);
-    jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y,
-                         GI.sector_offset);
-    set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius);
-    kz = mapKSpaceToGrid(data_point.z, GI.gridDims.z, center.z,
-                         GI.sector_offset);
-    set_minmax(&kz, &kmin, &kmax, GI.sector_pad_max, GI.kernel_radius);
-
-    // grid this point onto its cartesian points neighbors
-    for (int k = kmin; k <= kmax; k++)
-    {
-      kz = mapGridToKSpace(k, GI.gridDims.z, center.z, GI.sector_offset);
-      dz_sqr = (kz - data_point.z) * GI.aniso_z_scale;
-      dz_sqr *= dz_sqr;
-      for (int j = jmin; j <= jmax; j++)
-      {
-        jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset);
-        dy_sqr = (jy - data_point.y) * GI.aniso_y_scale;
-        dy_sqr *= dy_sqr;
-
-        for (int i = imin; i <= imax; i++)
-        {
-          ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset);
-          dx_sqr = (ix - data_point.x) * GI.aniso_x_scale;
-          dx_sqr *= dx_sqr;
-          // get kernel value
-          val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv,
-                                     dy_sqr * GI.radiusSquared_inv,
-                                     dz_sqr * GI.radiusSquared_inv);
-
-          ind = getIndex(i, j, k, GI.sector_pad_width);
-
-          // multiply data by current kernel val
-          // grid complex or scalar
-          atomicAdd(&(sdata[ind].x),
-              val *
-              tex1Dfetch(texDATA, data_cnt).x);
-
-          atomicAdd(&(sdata[ind].y),
-              val *
-              tex1Dfetch(texDATA, data_cnt).y);
-        }  // x
-      }  // y
-    }  // z
-    data_cnt = data_cnt + blockDim.x;
-  }  // grid points per sector
-
-  // write shared data to output grid
-  __syncthreads();
-  // int sector_ind_offset = sec * GI.sector_dim;
-  __shared__ int sector_ind_offset;
-  sector_ind_offset =
-      computeXYZ2Lin(center.x - GI.sector_offset, center.y - GI.sector_offset,
-                     center.z - GI.sector_offset, GI.gridDims);
-
-  // each thread writes one position from shared mem to global mem
-  for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x)
-  {
-    getCoordsFromIndex(s_ind, &x, &y, &z, GI.sector_pad_width);
-
-    if (isOutlier(x, y, z, center.x, center.y, center.z, GI.gridDims,
-                  GI.sector_offset))
-      // calculate opposite index
-      ind = computeXYZ2Lin(
-          calculateOppositeIndex(x, center.x, GI.gridDims.x, GI.sector_offset),
-          calculateOppositeIndex(y, center.y, GI.gridDims.y, GI.sector_offset),
-          calculateOppositeIndex(z, center.z, GI.gridDims.z, GI.sector_offset),
-          GI.gridDims);
-    else
-      ind = sector_ind_offset +
-            computeXYZ2Lin(x, y, z, GI.gridDims);  // index in output grid
-
-    atomicAdd(&(gdata[ind].x), sdata[s_ind].x);  // Re
-    atomicAdd(&(gdata[ind].y), sdata[s_ind].y);  // Im
-    // reset shared mem
-    sdata[s_ind].x = (DType)0.0;
-    sdata[s_ind].y = (DType)0.0;
-  }
-  __syncthreads();
-}
-
-__global__ void textureConvolutionKernel(DType2 *data, DType *crds,
-                                         CufftType *gdata, IndType *sectors,
-                                         IndType *sector_centers, int N)
-{
-  extern __shared__ DType2 sdata[];  // externally managed shared memory
-
-  // init shared memory
-  for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x)
-  {
-    sdata[s_ind].x = (DType)0.0;  // Re
-    sdata[s_ind].y = (DType)0.0;  // Im
-  }
-  __syncthreads();
-
-  __shared__ int sec[THREAD_BLOCK_SIZE];
-  sec[threadIdx.x] = blockIdx.x;
-  while (sec[threadIdx.x] < N)
-  {
-    __shared__ int data_max;
-    data_max = sectors[sec[threadIdx.x] + 1];
-    textureConvolutionFunction(sec, data_max, 0, sdata, data, crds, gdata,
-                               sectors, sector_centers);
-    __syncthreads();
-    sec[threadIdx.x] = sec[threadIdx.x] + gridDim.x;
-  }  // sec < sector_count
-}
-
-__global__ void balancedTextureConvolutionKernel(
-    DType2 *data, DType *crds, CufftType *gdata, IndType *sectors,
-    IndType2 *sector_processing_order, IndType *sector_centers, int N)
-{
-  extern __shared__ DType2 sdata[];  // externally managed shared memory
-
-  // init shared memory
-  for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x)
-  {
-    sdata[s_ind].x = (DType)0.0;  // Re
-    sdata[s_ind].y = (DType)0.0;  // Im
-  }
-  __syncthreads();
-
-  int sec_cnt = blockIdx.x;
-  __shared__ int sec[THREAD_BLOCK_SIZE];
-
-  while (sec_cnt < N)
-  {
-    sec[threadIdx.x] = sector_processing_order[sec_cnt].x;
-    __shared__ int data_max;
-    data_max = min(sectors[sec[threadIdx.x] + 1],
-                   sectors[sec[threadIdx.x]] +
-                       sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD);
-    textureConvolutionFunction(sec, data_max,
-                               sector_processing_order[sec_cnt].y, sdata, data,
-                               crds, gdata, sectors, sector_centers);
-    __syncthreads();
-    sec_cnt = sec_cnt + gridDim.x;
-  }  // sec < sector_count
-}
-
-// ----------------------------------------------------------------------------
-// convolutionKernel: NUFFT^H kernel
-//
-// Performs the gpuNUFFT step by convolution of sample points with
-// interpolation function and resampling onto grid. Basic concept based on Zwart
-// et al.
-//
-// parameters:
-//  * data           : complex input sample points
-//  * crds           : coordinates of data points (x,y,z)
-//  * gdata          : output grid data
-//  * sectors        : mapping of sample indices according to each sector
-//  * sector_centers : coordinates (x,y,z) of sector centers
-//  * temp_gdata     : temporary grid data
-//  * N              : number of threads
-__device__ void textureConvolutionFunction2D(DType2 *sdata, int *sec,
-                                             int sec_max, int sec_offset,
-                                             DType2 *data, DType *crds,
-                                             CufftType *gdata, IndType *sectors,
-                                             IndType *sector_centers)
-{
-  // start convolution
-  int ind, x, y;
-  int imin, imax, jmin, jmax;
-
-  DType dx_sqr, dy_sqr, val, ix, jy;
-
-  __shared__ IndType2 center;
-  center.x = sector_centers[sec[threadIdx.x] * 2];
-  center.y = sector_centers[sec[threadIdx.x] * 2 + 1];
-
-  // Grid Points over Threads
-  int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset;
-  // loop over all data points of the current sector, and check if grid position
-  // lies inside
-  // affected region, if so, add data point weighted to grid position value
-  while (data_cnt < sec_max)
-  {
-    DType2 data_point;  // datapoint per thread
-    data_point.x = crds[data_cnt];
-    data_point.y = crds[data_cnt + GI.data_count];
-
-    // set the boundaries of final dataset for gpuNUFFT this point
-    ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x,
-                         GI.sector_offset);
-    set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius);
-    jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y,
-                         GI.sector_offset);
-    set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius);
-
-    // grid this point onto its cartesian points neighbors
-    for (int j = jmin; j <= jmax; j++)
-    {
-      jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset);
-      dy_sqr = (jy - data_point.y) * GI.aniso_y_scale;
-      dy_sqr *= dy_sqr;
-
-      for (int i = imin; i <= imax; i++)
-      {
-        ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset);
-        dx_sqr = (ix - data_point.x) * GI.aniso_x_scale;
-        dx_sqr *= dx_sqr;
-        // get kernel value
-        // Calculate Separable Filters
-        val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv,
-                                   dy_sqr * GI.radiusSquared_inv);
-
-        ind = getIndex2D(i, j, GI.sector_pad_width);
-
-        // multiply data by current kernel val
-        // grid complex or scalar
-        for (int c = threadIdx.z; c < GI.n_coils_cc; c += blockDim.z)
-        {
-          atomicAdd(&(sdata[ind + c * GI.sector_dim].x),
-                    val * tex1Dfetch(texDATA, data_cnt + c * GI.data_count).x);
-          atomicAdd(&(sdata[ind + c * GI.sector_dim].y),
-                    val * tex1Dfetch(texDATA, data_cnt + c * GI.data_count).y);
-        }
-      }  // x
-    }  // y
-    data_cnt = data_cnt + blockDim.x;
-  }  // grid points per sector
-
-  // write shared data to output grid
-  __syncthreads();
-  // int sector_ind_offset = sec * GI.sector_dim;
-  __shared__ int sector_ind_offset;
-  sector_ind_offset = computeXY2Lin(center.x - GI.sector_offset,
-                                    center.y - GI.sector_offset, GI.gridDims);
-
-  // each thread writes one position from shared mem to global mem
-  for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x)
-  {
-    getCoordsFromIndex2D(s_ind, &x, &y, GI.sector_pad_width);
-
-    if (isOutlier2D(x, y, center.x, center.y, GI.gridDims, GI.sector_offset))
-      // calculate opposite index
-      ind = computeXY2Lin(
-          calculateOppositeIndex(x, center.x, GI.gridDims.x, GI.sector_offset),
-          calculateOppositeIndex(y, center.y, GI.gridDims.y, GI.sector_offset),
-          GI.gridDims);
-    else
-      ind = sector_ind_offset +
-            computeXY2Lin(x, y, GI.gridDims);  // index in output grid
-
-    for (int c = threadIdx.z; c < GI.n_coils_cc; c += blockDim.z)
-    {
-      atomicAdd(&(gdata[ind + c * GI.gridDims_count].x),
-                sdata[s_ind + c * GI.sector_dim].x);  // Re
-      atomicAdd(&(gdata[ind + c * GI.gridDims_count].y),
-                sdata[s_ind + c * GI.sector_dim].y);  // Im
-
-      // reset shared mem
-      sdata[s_ind + c * GI.sector_dim].x = (DType)0.0;
-      sdata[s_ind + c * GI.sector_dim].y = (DType)0.0;
-    }
-  }
-}
-
-__global__ void textureConvolutionKernel2D(DType2 *data, DType *crds,
-                                           CufftType *gdata, IndType *sectors,
-                                           IndType *sector_centers, int N)
-{
-  extern __shared__ DType2 sdata[];  // externally managed shared memory
-
-  // init shared memory
-  for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x)
-  {
-    for (int c = threadIdx.z; c < GI.n_coils_cc; c += blockDim.z)
-    {
-      sdata[s_ind + c * GI.sector_dim].x = 0.0f;  // Re
-      sdata[s_ind + c * GI.sector_dim].y = 0.0f;  // Im
-    }
-  }
-  __syncthreads();
-
-  __shared__ int sec[THREAD_BLOCK_SIZE];
-  sec[threadIdx.x] = blockIdx.x;
-  while (sec[threadIdx.x] < N)
-  {
-    __shared__ int data_max;
-    data_max = sectors[sec[threadIdx.x] + 1];
-    textureConvolutionFunction2D(sdata, sec, data_max, 0, data, crds, gdata,
-                                 sectors, sector_centers);
-    __syncthreads();
-    sec[threadIdx.x] = sec[threadIdx.x] + gridDim.x;
-  }  // sec < sector_count
-}
-
-__global__ void balancedTextureConvolutionKernel2D(
-    DType2 *data, DType *crds, CufftType *gdata, IndType *sectors,
-    IndType2 *sector_processing_order, IndType *sector_centers, int N)
-{
-  extern __shared__ DType2 sdata[];  // externally managed shared memory
-
-  // init shared memory
-  for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x)
-  {
-    for (int c = threadIdx.z; c < GI.n_coils_cc; c += blockDim.z)
-    {
-      sdata[s_ind + c * GI.sector_dim].x = 0.0f;  // Re
-      sdata[s_ind + c * GI.sector_dim].y = 0.0f;  // Im
-    }
-  }
-  __syncthreads();
-
-  int sec_cnt = blockIdx.x;
-  __shared__ int sec[THREAD_BLOCK_SIZE];
-
-  while (sec_cnt < N)
-  {
-    sec[threadIdx.x] = sector_processing_order[sec_cnt].x;
-    __shared__ int data_max;
-    data_max = min(sectors[sec[threadIdx.x] + 1],
-                   sectors[sec[threadIdx.x]] 
-                      + sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD);
-    textureConvolutionFunction2D(sdata, sec, data_max,
-                                 sector_processing_order[sec_cnt].y, data, crds,
-                                 gdata, sectors, sector_centers);
-    __syncthreads();
-    sec_cnt = sec_cnt + gridDim.x;
-  }  // sec < sector_count
-}
-
-void performTextureConvolution(DType2 *data_d, DType *crds_d,
-                               CufftType *gdata_d, DType *kernel_d,
-                               IndType *sectors_d, IndType *sector_centers_d,
-                               gpuNUFFT::GpuNUFFTInfo *gi_host)
-{
-  long shared_mem_size =
-      (gi_host->sector_dim) * sizeof(DType2) * gi_host->n_coils_cc;
-  int thread_size = THREAD_BLOCK_SIZE;
-
-  dim3 block_dim(thread_size);
-  dim3 grid_dim(getOptimalGridDim(gi_host->sector_count, 1));
-  if (DEBUG)
-  {
-    printf("adjoint texture convolution requires %ld bytes of shared memory!\n",
-           shared_mem_size);
-    printf("grid dim %u, block dim %u \n", grid_dim.x, block_dim.x);
-  }
-  if (gi_host->is2Dprocessing)
-  {
-    dim3 block_dim(
-        64, 1,
-        DEFAULT_VALUE(gi_host->n_coils_cc > 4 ? 4 : gi_host->n_coils_cc));
-    textureConvolutionKernel2D <<<grid_dim, block_dim, shared_mem_size>>>
-        (data_d, crds_d, gdata_d, sectors_d, sector_centers_d,
-         gi_host->sector_count);
-  }
-  else
-    textureConvolutionKernel <<<grid_dim, block_dim, shared_mem_size>>>
-        (data_d, crds_d, gdata_d, sectors_d, sector_centers_d,
-         gi_host->sector_count);
-
-  if (DEBUG)
-    printf("...finished with: %s\n", cudaGetErrorString(cudaGetLastError()));
-}
-
-void performTextureConvolution(DType2 *data_d, DType *crds_d,
-                               CufftType *gdata_d, DType *kernel_d,
-                               IndType *sectors_d,
-                               IndType2 *sector_processing_order_d,
-                               IndType *sector_centers_d,
-                               gpuNUFFT::GpuNUFFTInfo *gi_host)
-{
-  long shared_mem_size =
-      (gi_host->sector_dim) * sizeof(DType2) * gi_host->n_coils_cc;
-  int thread_size = THREAD_BLOCK_SIZE;
-
-  dim3 block_dim(thread_size);
-  dim3 grid_dim(getOptimalGridDim(gi_host->sector_count, 1));
-  if (DEBUG)
-  {
-    printf("adjoint balanced texture convolution requires %ld bytes of shared "
-           "memory!\n",
-           shared_mem_size);
-    printf("grid dim %u, block dim %u \n", grid_dim.x, block_dim.x);
-  }
-  if (gi_host->is2Dprocessing)
-  {
-    dim3 block_dim(
-        64, 1,
-        DEFAULT_VALUE(gi_host->n_coils_cc > 4 ? 4 : gi_host->n_coils_cc));
-    //printf("block dims: %u %u %u!\n", block_dim.x, block_dim.y, block_dim.z);
-    balancedTextureConvolutionKernel2D
-            <<<grid_dim, block_dim, shared_mem_size>>>
-        (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d,
-         sector_centers_d, gi_host->sectorsToProcess);
-  }
-  else
-    balancedTextureConvolutionKernel <<<grid_dim, block_dim, shared_mem_size>>>
-        (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d,
-         sector_centers_d, gi_host->sectorsToProcess);
-
-  if (DEBUG)
-    printf("...finished with: %s\n", cudaGetErrorString(cudaGetLastError()));
-}
-
-// ----------------------------------------------------------------------------
-// forwardConvolutionKernel: NUFFT kernel
-//
-// Performs the inverse gpuNUFFT step by convolution of grid points with
-// interpolation function and resampling onto trajectory.
-//
-// parameters:
-//  * data           : complex output sample points
-//  * crds           : coordinates of data points (x,y,z)
-//  * gdata          : input grid data
-//  * sectors        : mapping of sample indices according to each sector
-//  * sector_centers : coordinates (x,y,z) of sector centers
-//  * N              : number of threads
-
-__device__ void
-textureForwardConvolutionFunction(long int *sec, long int sec_max, long int sec_offset,
-                                  DType2 *sdata, CufftType *gdata_cache,
-                                  DType2 *data, DType *crds, CufftType *gdata,
-                                  IndType *sectors, IndType *sector_centers)
-{
-  int ind, imin, imax, jmin, jmax, kmin, kmax, ii, jj, kk;
-  DType dx_sqr, dy_sqr, dz_sqr, val, ix, jy, kz;
-
-  __shared__ IndType3 center;
-  center.x = sector_centers[sec[threadIdx.x] * 3];
-  center.y = sector_centers[sec[threadIdx.x] * 3 + 1];
-  center.z = sector_centers[sec[threadIdx.x] * 3 + 2];
-
-  __shared__ long int sector_ind_offset;
-  sector_ind_offset =
-      computeXYZ2Lin(center.x - GI.sector_offset, center.y - GI.sector_offset,
-                     center.z - GI.sector_offset, GI.gridDims);
-
-  // init sector cache
-  // preload sector grid data into cache
-  for (long int ind = threadIdx.x; ind < GI.sector_dim; ind += blockDim.x)
-  {
-    long int grid_index;
-    getCoordsFromIndex(ind, &ii, &jj, &kk, GI.sector_pad_width);
-
-    if (isOutlier(ii, jj, kk, center.x, center.y, center.z, GI.gridDims,
-                  GI.sector_offset))
-      // calculate opposite index
-      grid_index = computeXYZ2Lin(
-          calculateOppositeIndex(ii, center.x, GI.gridDims.x, GI.sector_offset),
-          calculateOppositeIndex(jj, center.y, GI.gridDims.y, GI.sector_offset),
-          calculateOppositeIndex(kk, center.z, GI.gridDims.z, GI.sector_offset),
-          GI.gridDims);
-    else
-      grid_index = (sector_ind_offset + computeXYZ2Lin(ii, jj, kk, GI.gridDims));
-
-    gdata_cache[ind].x = tex1Dfetch(texGDATA, grid_index).x;
-    gdata_cache[ind].y = tex1Dfetch(texGDATA, grid_index).y;
-  }
-
-  __syncthreads();
-
-  // Grid Points over Threads
-  long int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset;
-
-  while (data_cnt < sec_max)
-  {
-    DType3 data_point;  // datapoint per thread
-    data_point.x = crds[data_cnt];
-    data_point.y = crds[data_cnt + GI.data_count];
-    data_point.z = crds[data_cnt + 2 * GI.data_count];
-
-    // set the boundaries of final dataset for gpuNUFFT this point
-    ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x,
-                         GI.sector_offset);
-    set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius);
-    jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y,
-                         GI.sector_offset);
-    set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius);
-    kz = mapKSpaceToGrid(data_point.z, GI.gridDims.z, center.z,
-                         GI.sector_offset);
-    set_minmax(&kz, &kmin, &kmax, GI.sector_pad_max, GI.kernel_radius);
-
-    // convolve neighboring cartesian points to this data point
-    for (int k = kmin; k <= kmax; k++)
-    {
-      kz = mapGridToKSpace(k, GI.gridDims.z, center.z, GI.sector_offset);
-      dz_sqr = (kz - data_point.z) * GI.aniso_z_scale;
-      dz_sqr *= dz_sqr;
-
-      for (int j = jmin; j <= jmax; j++)
-      {
-        jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset);
-        dy_sqr = (jy - data_point.y) * GI.aniso_y_scale;
-        dy_sqr *= dy_sqr;
-
-        for (int i = imin; i <= imax; i++)
-        {
-          ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset);
-          dx_sqr = (ix - data_point.x) * GI.aniso_x_scale;
-          dx_sqr *= dx_sqr;
-
-          // get kernel value
-          val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv,
-                                     dy_sqr * GI.radiusSquared_inv,
-                                     dz_sqr * GI.radiusSquared_inv);
-
-          ind = getIndex(i, j, k, GI.sector_pad_width);
-
-          sdata[threadIdx.x].x += gdata_cache[ind].x * val;
-          sdata[threadIdx.x].y += gdata_cache[ind].y * val;
-        }  // x loop
-      }  // y loop
-    }  // z loop
-    atomicAdd(&(data[data_cnt].x), sdata[threadIdx.x].x);
-    atomicAdd(&(data[data_cnt].y), sdata[threadIdx.x].y);
-
-    data_cnt = data_cnt + blockDim.x;
-
-    sdata[threadIdx.x].x = (DType)0.0;  // Re
-    sdata[threadIdx.x].y = (DType)0.0;  // Im
-  }  // data points per sector
-}
-
-__global__ void textureForwardConvolutionKernel(CufftType *data, DType *crds,
-                                                CufftType *gdata,
-                                                IndType *sectors,
-                                                IndType *sector_centers, int N)
-{
-  extern __shared__ CufftType shared[];  // externally managed shared memory
-  CufftType *shared_out_data = (CufftType *)&shared[0];
-  CufftType *gdata_cache = (CufftType *)&shared[blockDim.x];
-
-  __shared__ long int sec[THREAD_BLOCK_SIZE];
-  sec[threadIdx.x] = blockIdx.x;
-
-  // init shared memory
-  shared_out_data[threadIdx.x].x = (DType)0.0;  // Re
-  shared_out_data[threadIdx.x].y = (DType)0.0;  // Im
-
-  __syncthreads();
-  // start convolution
-  while (sec[threadIdx.x] < N)
-  {
-    __shared__ long int data_max;
-    data_max = sectors[sec[threadIdx.x] + 1];
-
-    textureForwardConvolutionFunction(sec, data_max, 0, shared_out_data,
-                                      gdata_cache, data, crds, gdata, sectors,
-                                      sector_centers);
-    __syncthreads();
-    sec[threadIdx.x] = sec[threadIdx.x] + gridDim.x;
-  }  // sector check
-}
-
-__global__ void balancedTextureForwardConvolutionKernel(
-    CufftType *data, DType *crds, CufftType *gdata, IndType *sectors,
-    IndType2 *sector_processing_order, IndType *sector_centers, int N)
-{
-  extern __shared__ CufftType shared[];  // externally managed shared memory
-  CufftType *shared_out_data = (CufftType *)&shared[0];
-  CufftType *gdata_cache = (CufftType *)&shared[blockDim.x];
-
-  long int sec_cnt = blockIdx.x;
-  __shared__ long int sec[THREAD_BLOCK_SIZE];
-
-  // init shared memory
-  shared_out_data[threadIdx.x].x = (DType)0.0;  // Re
-  shared_out_data[threadIdx.x].y = (DType)0.0;  // Im
-
-  __syncthreads();
-  // start convolution
-  while (sec_cnt < N)
-  {
-    sec[threadIdx.x] = sector_processing_order[sec_cnt].x;
-    __shared__ long int data_max;
-    data_max = min(sectors[sec[threadIdx.x] + 1],
-                   sectors[sec[threadIdx.x]] +
-                       sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD);
-
-    textureForwardConvolutionFunction(
-        sec, data_max, sector_processing_order[sec_cnt].y, shared_out_data,
-        gdata_cache, data, crds, gdata, sectors, sector_centers);
-    __syncthreads();
-    sec_cnt = sec_cnt + gridDim.x;
-  }  // sector check
-}
-
-__device__ void
-textureForwardConvolutionFunction2D(int *sec, int sec_max, int sec_offset,
-                                    DType2 *sdata, CufftType *gdata_cache,
-                                    DType2 *data, DType *crds, CufftType *gdata,
-                                    IndType *sectors, IndType *sector_centers)
-{
-  int ind, imin, imax, jmin, jmax, ii, jj;
-  DType val, ix, jy;
-
-  __shared__ IndType2 center;
-  center.x = sector_centers[sec[threadIdx.x] * 2];
-  center.y = sector_centers[sec[threadIdx.x] * 2 + 1];
-
-  __shared__ int sector_ind_offset;
-  sector_ind_offset = computeXY2Lin(center.x - GI.sector_offset,
-                                    center.y - GI.sector_offset, GI.gridDims);
-
-  // init sector cache
-  // preload sector grid data into cache
-  for (int ind = threadIdx.x; ind < GI.sector_dim; ind += blockDim.x)
-  {
-    int grid_index;
-    getCoordsFromIndex2D(ind, &ii, &jj, GI.sector_pad_width);
-
-    // multiply data by current kernel val
-    // grid complex or scalar
-    if (isOutlier2D(ii, jj, center.x, center.y, GI.gridDims, GI.sector_offset))
-      // calculate opposite index
-      grid_index = getIndex2D(
-          calculateOppositeIndex(ii, center.x, GI.gridDims.x, GI.sector_offset),
-          calculateOppositeIndex(jj, center.y, GI.gridDims.y, GI.sector_offset),
-          GI.gridDims.x);
-    else
-      grid_index = (sector_ind_offset + getIndex2D(ii, jj, GI.gridDims.x));
-
-    for (int c = 0; c < GI.n_coils_cc; c++)
-    {
-      gdata_cache[ind + c * GI.sector_dim].x =
-          tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).x;
-      gdata_cache[ind + c * GI.sector_dim].y =
-          tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).y;
-    }
-  }
-  __syncthreads();
-
-  // Grid Points over Threads
-  int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset;
-
-  while (data_cnt < sec_max)
-  {
-    DType2 data_point;  // datapoint per thread
-    data_point.x = crds[data_cnt];
-    data_point.y = crds[data_cnt + GI.data_count];
-
-    // set the boundaries of final dataset for gpuNUFFT this point
-    ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x,
-                         GI.sector_offset);
-    set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius);
-    jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y,
-                         GI.sector_offset);
-    set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius);
-
-    // convolve neighboring cartesian points to this data point
-    for (int j = jmin; j <= jmax; j++)
-    {
-      jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset);
-      DType dy_sqr = (jy - data_point.y) * GI.aniso_y_scale;
-      dy_sqr *= dy_sqr;
-
-      for (int i = imin; i <= imax; i++)
-      {
-        ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset);
-        DType dx_sqr = (ix - data_point.x) * GI.aniso_x_scale;
-        dx_sqr *= dx_sqr;
-        // get kernel value
-        // calc as separable filter
-        val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv,
-                                   dy_sqr * GI.radiusSquared_inv);
-
-        ind = getIndex2D(i, j, GI.sector_pad_width);
-
-        for (int c = 0; c < GI.n_coils_cc; c++)
-        {
-          sdata[threadIdx.x + c * blockDim.x].x +=
-              gdata_cache[ind + c * GI.sector_dim].x * val;
-          sdata[threadIdx.x + c * blockDim.x].y +=
-              gdata_cache[ind + c * GI.sector_dim].y * val;
-        }
-      }  // x loop
-    }  // y loop
-
-    for (int c = 0; c < GI.n_coils_cc; c++)
-    {
-      atomicAdd(&(data[data_cnt + c * GI.data_count].x),
-                sdata[threadIdx.x + c * blockDim.x].x);
-      atomicAdd(&(data[data_cnt + c * GI.data_count].y),
-                sdata[threadIdx.x + c * blockDim.x].y);
-      sdata[threadIdx.x + c * blockDim.x].x = (DType)0.0;  // Re
-      sdata[threadIdx.x + c * blockDim.x].y = (DType)0.0;  // Im
-    }
-
-    data_cnt = data_cnt + blockDim.x;
-  }  // data points per sector
-}
-
-__device__ void textureForwardConvolutionFunction22D(
-    int *sec, int sec_max, int sec_offset, DType2 *data,
-    DType *crds, CufftType *gdata, IndType *sectors, IndType *sector_centers)
-{
-  int imin, imax, jmin, jmax, i, j;
-  DType val, ix, jy;
-
-  IndType2 center;
-  int sector_ind_offset;
-  center.x = sector_centers[sec[threadIdx.x] * 2];
-  center.y = sector_centers[sec[threadIdx.x] * 2 + 1];
-
-  sector_ind_offset = computeXY2Lin(center.x - GI.sector_offset,
-      center.y - GI.sector_offset, GI.gridDims);
-
-  // Grid Points over Threads
-  int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset;
-  __syncthreads();
-
-  while (data_cnt < sec_max)
-  {
-    DType2 data_point;  // datapoint per thread
-    data_point.x = crds[data_cnt];
-    data_point.y = crds[data_cnt + GI.data_count];
-
-    // set the boundaries of final dataset for gpuNUFFT this point
-    ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x,
-                         GI.sector_offset);
-    set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius);
-    jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y,
-                         GI.sector_offset);
-    set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius);
-
-    // convolve neighboring cartesian points to this data point
-    int rangeX = imax - imin + 1;
-    int rangeY = jmax - jmin + 1;
-    int idx = threadIdx.y;
-    int grid_index;
-
-    while (idx < (rangeX * rangeY))
-    {
-      getCoordsFromIndex2D(idx, &i, &j, rangeX, rangeY);
-      i += imin;
-      j += jmin;
-      if (j <= jmax && j >= jmin)
-      {
-        jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset);
-        DType dy_sqr = (jy - data_point.y) * GI.aniso_y_scale;
-        dy_sqr *= dy_sqr;
-        if (i <= imax && i >= imin)
-        {
-          ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset);
-          DType dx_sqr = (ix - data_point.x) * GI.aniso_x_scale;
-          dx_sqr *= dx_sqr;
-          // get kernel value
-          // calc as separable filter
-          val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv,
-                                     dy_sqr * GI.radiusSquared_inv);
-
-          if (isOutlier2D(i, j, center.x, center.y, GI.gridDims,
-                          GI.sector_offset))
-            // calculate opposite index
-            grid_index =
-                getIndex2D(calculateOppositeIndex(i, center.x, GI.gridDims.x,
-                                                  GI.sector_offset),
-                           calculateOppositeIndex(j, center.y, GI.gridDims.y,
-                                                  GI.sector_offset),
-                           GI.gridDims.x);
-          else
-            grid_index = (sector_ind_offset + getIndex2D(i, j, GI.gridDims.x));
-
-          for (int c = 0; c < GI.n_coils_cc; c++)
-          {
-            atomicAdd(&(data[data_cnt + c * GI.data_count].x), tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).x * val);
-            atomicAdd(&(data[data_cnt + c * GI.data_count].y), tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).y * val);
-          }
-        }  // x if
-      }    // y if
-      idx = idx + blockDim.y;
-    }
-    data_cnt = data_cnt + blockDim.x;
-  }  // data points per sector
-}
-
-__device__ void textureForwardConvolutionFunction32D(
-    int *sec, int sec_max, int sec_offset, DType *cache, DType2 *data,
-    DType *crds, CufftType *gdata, IndType *sectors, IndType *sector_centers)
-{
-  int imin, imax, jmin, jmax, i, j;
-  DType val, ix, jy;
-
-  __shared__ IndType2 center;
-  center.x = sector_centers[sec[threadIdx.x] * 2];
-  center.y = sector_centers[sec[threadIdx.x] * 2 + 1];
-
-  __shared__ int sector_ind_offset;
-  sector_ind_offset = computeXY2Lin(center.x - GI.sector_offset,
-      center.y - GI.sector_offset, GI.gridDims);
-  int grid_index;
-
-  // Grid Points over Threads
-  int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset;
-
-  while (data_cnt < sec_max)
-  {
-    DType2 data_point;  // datapoint per thread
-    data_point.x = crds[data_cnt];
-    data_point.y = crds[data_cnt + GI.data_count];
-
-    // set the boundaries of final dataset for gpuNUFFT this point
-    ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x,
-        GI.sector_offset);
-    set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius);
-    jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y,
-        GI.sector_offset);
-    set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius);
-
-    // convolve neighboring cartesian points to this data point
-    int idx = threadIdx.y;
-    getCoordsFromIndex2D(idx, &i, &j, GI.kernel_width + 1, GI.kernel_width + 1);
-    i += imin;
-    j += jmin;
-    if (j <= jmax && j >= jmin)
-    {
-      jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset);
-      DType dy_sqr = (jy - data_point.y) * GI.aniso_y_scale;
-      dy_sqr *= dy_sqr;
-      if (i <= imax && i >= imin)
-      {
-        ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset);
-        DType dx_sqr = (ix - data_point.x) * GI.aniso_x_scale;
-        dx_sqr *= dx_sqr;
-        // get kernel value
-        // calc as separable filter
-        val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv,
-            dy_sqr * GI.radiusSquared_inv);
-        cache[GI.kernel_widthSquared * threadIdx.x + threadIdx.y] = val;
-
-        if (isOutlier2D(i, j, center.x, center.y, GI.gridDims,
-              GI.sector_offset))
-          // calculate opposite index
-          grid_index =
-            getIndex2D(calculateOppositeIndex(i, center.x, GI.gridDims.x,
-                  GI.sector_offset),
-                calculateOppositeIndex(j, center.y, GI.gridDims.y,
-                  GI.sector_offset),
-                GI.gridDims.x);
-        else
-          grid_index = (sector_ind_offset + getIndex2D(i, j, GI.gridDims.x));
-
-        for (int c = 0; c < GI.n_coils_cc; c++)
-        {
-          atomicAdd(
-              &(data[data_cnt + c * GI.data_count].x),
-              cache[GI.kernel_widthSquared * threadIdx.x + threadIdx.y] *
-              tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).x);
-          atomicAdd(
-              &(data[data_cnt + c * GI.data_count].y),
-              cache[GI.kernel_widthSquared * threadIdx.x + threadIdx.y] *
-              tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).y);
-        }
-      }  // x if
-    }    // y if
-
-    cache[GI.kernel_widthSquared * threadIdx.x + threadIdx.y] = 0;
-    data_cnt = data_cnt + blockDim.x;
-  }  // data points per sector
-}
-
-__global__ void textureForwardConvolutionKernel2D(CufftType *data, DType *crds,
-                                                  CufftType *gdata,
-                                                  IndType *sectors,
-                                                  IndType *sector_centers,
-                                                  int N)
-{
-  extern __shared__ CufftType shared[];  // externally managed shared memory
-  CufftType *shared_out_data = (CufftType *)&shared[0];
-  CufftType *gdata_cache = (CufftType *)&shared[blockDim.x * GI.n_coils_cc];
-
-  __shared__ int sec[THREAD_BLOCK_SIZE];
-  sec[threadIdx.x] = blockIdx.x;
-
-  // init shared memory
-  for (int c = 0; c < GI.n_coils_cc; c++)
-  {
-    shared_out_data[threadIdx.x + c * blockDim.x].x = 0.0f;  // Re
-    shared_out_data[threadIdx.x + c * blockDim.x].y = 0.0f;  // Im
-  }
-  __syncthreads();
-  // start convolution
-  while (sec[threadIdx.x] < N)
-  {
-    __shared__ int data_max;
-    data_max = sectors[sec[threadIdx.x] + 1];
-
-    textureForwardConvolutionFunction2D(sec, data_max, 0, shared_out_data,
-                                        gdata_cache, data, crds, gdata, sectors,
-                                        sector_centers);
-
-    __syncthreads();
-    sec[threadIdx.x] = sec[threadIdx.x] + gridDim.x;
-  }  // sector check
-}
-
-__global__ void balancedTextureForwardConvolutionKernel2D(
-    CufftType *data, DType *crds, CufftType *gdata, IndType *sectors,
-    IndType2 *sector_processing_order, IndType *sector_centers, int N)
-{
-  extern __shared__ CufftType shared[];  // externally managed shared memory
-  CufftType *shared_out_data = (CufftType *)&shared[0];
-  CufftType *gdata_cache = (CufftType *)&shared[blockDim.x * GI.n_coils_cc];
-
-  __shared__ int sec[THREAD_BLOCK_SIZE];
-
-  // init shared memory
-  for (int c = 0; c < GI.n_coils_cc; c++)
-  {
-    shared_out_data[threadIdx.x + c * blockDim.x].x = 0.0f;  // Re
-    shared_out_data[threadIdx.x + c * blockDim.x].y = 0.0f;  // Im
-  }
-  __syncthreads();
-  // start convolution
-  for (int sec_cnt = blockIdx.x; sec_cnt < N; sec_cnt += gridDim.x)
-  {
-    sec[threadIdx.x] = sector_processing_order[sec_cnt].x;
-    __shared__ int data_max;
-    data_max = min(sectors[sec[threadIdx.x] + 1],
-        sectors[sec[threadIdx.x]] + 
-        sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD);
-
-    textureForwardConvolutionFunction2D(
-        sec, data_max, sector_processing_order[sec_cnt].y, shared_out_data,
-        gdata_cache, data, crds, gdata, sectors, sector_centers);
-
-    __syncthreads();
-  }  // sector check
-}
-
-__global__ void balancedTextureForwardConvolutionKernel22D(
-    CufftType *data, DType *crds, CufftType *gdata, IndType *sectors,
-    IndType2 *sector_processing_order, IndType *sector_centers, int N)
-{
-  int sec_cnt = blockIdx.x;
-  __shared__ int sec[THREAD_BLOCK_SIZE];
-
-  // init shared memory
-  // start convolution
-  while (sec_cnt < N)
-  {
-    int data_max;
-    if (threadIdx.y == 0)
-    {
-      sec[threadIdx.x] = sector_processing_order[sec_cnt].x;
-    }
-    __syncthreads();
-
-    data_max = min(sectors[sec[threadIdx.x] + 1],
-          sectors[sec[threadIdx.x]]
-          + sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD);
-
-    textureForwardConvolutionFunction22D(
-        sec, data_max, sector_processing_order[sec_cnt].y, data, crds,
-        gdata, sectors, sector_centers);
-
-    sec_cnt = sec_cnt + gridDim.x;
-    __syncthreads();
-  }  // sector check
-}
-
-__global__ void balancedTextureForwardConvolutionKernel32D(
-        CufftType *data, DType *crds, CufftType *gdata, IndType *sectors,
-            IndType2 *sector_processing_order, IndType *sector_centers, int N)
-{
-  extern __shared__ DType shared_cache[];  // externally managed shared memory
-  DType *cache = (DType *)&shared_cache[0];
-
-  int sec_cnt = blockIdx.x;
-  __shared__ int sec[THREAD_BLOCK_SIZE];
-
-  // init shared memory
-  cache[threadIdx.x * blockDim.y + threadIdx.y] = (DType)0.0;
-  __syncthreads();
-  // start convolution
-  while (sec_cnt < N)
-  {
-      sec[threadIdx.x] = sector_processing_order[sec_cnt].x;
-      __shared__ int data_max;
-      data_max = min(sectors[sec[threadIdx.x] + 1],
-          sectors[sec[threadIdx.x]] + 
-          sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD);
-
-      textureForwardConvolutionFunction32D(
-                  sec, data_max, sector_processing_order[sec_cnt].y, cache, data, crds,
-                          gdata, sectors, sector_centers);
-
-      __syncthreads();
-      sec_cnt = sec_cnt + gridDim.x;
-    }  // sector check
-}
-
-void performTextureForwardConvolution(CufftType *data_d, DType *crds_d,
-                                      CufftType *gdata_d, DType *kernel_d,
-                                      IndType *sectors_d,
-                                      IndType *sector_centers_d,
-                                      gpuNUFFT::GpuNUFFTInfo *gi_host)
-{
-  int thread_size = 192;
-  long shared_mem_size = (thread_size + gi_host->sector_dim) *
-                         gi_host->n_coils_cc * sizeof(CufftType);
-
-  dim3 block_dim(thread_size);
-  dim3 grid_dim(getOptimalGridDim(gi_host->sector_count, thread_size));
-
-  if (DEBUG)
-    printf("texture forward convolution requires %ld bytes of shared memory!\n",
-           shared_mem_size);
-  if (gi_host->is2Dprocessing)
-  {
-    // dim3 block_dim(thread_size, 1, DEFAULT_VALUE(gi_host->n_coils_cc > 4 ? 1
-    // : gi_host->n_coils_cc));
-    dim3 block_dim(thread_size, 1, 1);  // DEFAULT_VALUE(gi_host->n_coils_cc > 4
-                                        // ? 1 : gi_host->n_coils_cc));
-    textureForwardConvolutionKernel2D
-            <<<grid_dim, block_dim, shared_mem_size>>>
-        (data_d, crds_d, gdata_d, sectors_d, sector_centers_d,
-         gi_host->sector_count);
-  }
-  else
-    textureForwardConvolutionKernel <<<grid_dim, block_dim, shared_mem_size>>>
-        (data_d, crds_d, gdata_d, sectors_d, sector_centers_d,
-         gi_host->sector_count);
-}
-
-void performTextureForwardConvolution(CufftType *data_d, DType *crds_d,
-                                      CufftType *gdata_d, DType *kernel_d,
-                                      IndType *sectors_d,
-                                      IndType2 *sector_processing_order_d,
-                                      IndType *sector_centers_d,
-                                      gpuNUFFT::GpuNUFFTInfo *gi_host)
-{
-  int thread_size = THREAD_BLOCK_SIZE;
-  long shared_mem_size = (thread_size + gi_host->sector_dim) *
-                         gi_host->n_coils_cc * sizeof(CufftType);
-
-  dim3 block_dim(thread_size);
-  dim3 grid_dim(getOptimalGridDim(gi_host->sector_count, thread_size));
-
-  if (DEBUG)
-    printf("balanced texture forward convolution requires %ld bytes of shared "
-           "memory!\n",
-           shared_mem_size);
-  if (gi_host->is2Dprocessing)
-  {
-    bool useV2cached = false;
-
-    if (useV2cached)
-    {
-      int thread_size = 32;
-      int threadY = (gi_host->kernel_width + 1) * (gi_host->kernel_width + 1);
-
-      long shared_mem_size =
-        (threadY * thread_size) * sizeof(DType);
-
-      grid_dim = dim3(getOptimalGridDim(gi_host->sector_count, 1));
-
-      block_dim = getOptimal2DBlockDim(thread_size, threadY);
-
-      if (DEBUG)
-      {
-        printf("balanced texture forward convolution 2 (2d) requires %ld bytes "
-            "of shared memory!\n",
-            shared_mem_size);
-        printf("block dims: %u %u %u!\n", block_dim.x, block_dim.y, block_dim.z);
-        printf("grid dims: %u %u %u!\n", grid_dim.x, grid_dim.y, grid_dim.z);
-      }
-
-      balancedTextureForwardConvolutionKernel32D<<<grid_dim, block_dim, shared_mem_size>>>
-        (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d, sector_centers_d, gi_host->sectorsToProcess);
-    }
-    else
-    {
-      int thread_size = 32;
-      long shared_mem_size =
-        (gi_host->kernel_widthSquared * thread_size) * sizeof(DType);
-
-      grid_dim = dim3(getOptimalGridDim(gi_host->sector_count, 1));
-
-      //TODO maybe it's better to round kwSqrd to the next multiple of 2
-      block_dim = getOptimal2DBlockDim(thread_size, gi_host->kernel_widthSquared);
-
-      if (DEBUG)
-      {
-        printf("balanced texture forward convolution 2 (2d) requires %ld bytes "
-            "of shared memory!\n",
-            shared_mem_size);
-        printf("grid dims: %u %u %u!\n", grid_dim.x, grid_dim.y, grid_dim.z);
-        printf("block dims: %u %u %u!\n", block_dim.x, block_dim.y, block_dim.z);
-      }
-
-      balancedTextureForwardConvolutionKernel22D<<<grid_dim, block_dim, shared_mem_size>>>
-        (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d, sector_centers_d, gi_host->sectorsToProcess);
-    }
-  }
-  else
-  {
-    balancedTextureForwardConvolutionKernel
-            <<<grid_dim, block_dim, shared_mem_size>>>
-        (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d,
-         sector_centers_d, gi_host->sectorsToProcess);
-  }
-}
-
-#endif
diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
index 628538b7..2b3664f2 100644
--- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
+++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp
@@ -222,13 +222,7 @@ class GpuNUFFTPythonOperator
         if(interpolate_data)
             gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu, gpuNUFFT::DENSITY_ESTIMATION);
         else
-        {
-            for(long int i=0; i<100000; i++)
-            {
-                printf("i = %ld\n", i);
-                gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu);
-            }
-        }
+            gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu);
         cudaDeviceSynchronize();
     }
 
@@ -448,7 +442,6 @@ class GpuNUFFTPythonOperator
     }
     ~GpuNUFFTPythonOperator()
     {
-        printf("Destructor called\n");
         delete gpuNUFFTOp;
     }
 };
diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
index bff9118c..3eb8c192 100644
--- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
+++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu
@@ -21,18 +21,6 @@ void initConstSymbol(const char* symbol, const void* src, IndType size, cudaStre
     HANDLE_ERROR(cudaMemcpyToSymbolAsync(KERNEL, src, size, 0, cudaMemcpyHostToDevice, stream));
 }
 
-void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count)
-{
-  if (std::string("texDATA").compare(symbol)==0)
-  {
-    HANDLE_ERROR (cudaBindTexture(NULL,texDATA, devicePtr,(unsigned long)count*sizeof(float2)));
-  }
-  else if (std::string("texGDATA").compare(symbol)==0)
-  {
-    HANDLE_ERROR (cudaBindTexture(NULL,texGDATA, devicePtr,(unsigned long)count*sizeof(cufftComplex)));
-  }
-}
-
 __global__ void updateDensityCompKernel(DType2* density_data, DType2* estimation_data, long int N)
 {
   long int t = threadIdx.x + blockIdx.x * blockDim.x;
@@ -54,83 +42,6 @@ void performUpdateDensityComp(DType2* density_data, DType2* estimation_data, lon
   updateDensityCompKernel<<<grid_dim,block_dim>>>(density_data, estimation_data, n_samples);
 }
 
-void initTexture(const char* symbol, cudaArray** devicePtr, gpuNUFFT::Array<DType> hostTexture)
-{
-  if (std::string("texKERNEL").compare(symbol)==0)
-  {
-    HANDLE_ERROR(cudaMallocArray (devicePtr, &texKERNEL.channelDesc, hostTexture.dim.width, 1));
-    HANDLE_ERROR(cudaBindTextureToArray(texKERNEL, *devicePtr));
-    HANDLE_ERROR(cudaMemcpyToArray(*devicePtr, 0, 0, hostTexture.data, sizeof(float)*hostTexture.count(), cudaMemcpyHostToDevice));
-    
-    texKERNEL.filterMode = cudaFilterModePoint;
-    texKERNEL.normalized = true;
-    texKERNEL.addressMode[0] = cudaAddressModeClamp;
-  }
-  else if (std::string("texKERNEL2D").compare(symbol)==0)
-  {
-    HANDLE_ERROR(cudaMallocArray (devicePtr, &texKERNEL2D.channelDesc, hostTexture.dim.width, hostTexture.dim.height));
-
-    HANDLE_ERROR(cudaBindTextureToArray(texKERNEL2D, *devicePtr));
-    HANDLE_ERROR(cudaMemcpyToArray(*devicePtr, 0, 0, hostTexture.data, sizeof(float)*hostTexture.count(), cudaMemcpyHostToDevice));
-    
-    texKERNEL2D.filterMode = cudaFilterModeLinear;
-    texKERNEL2D.normalized = true;
-    texKERNEL2D.addressMode[0] = cudaAddressModeClamp;
-    texKERNEL2D.addressMode[1] = cudaAddressModeClamp;
-  }
-  else if (std::string("texKERNEL3D").compare(symbol)==0)
-  {
-    cudaExtent volumesize=make_cudaExtent(hostTexture.dim.width, hostTexture.dim.height, hostTexture.dim.depth); 
-    cudaMalloc3DArray(devicePtr,&texKERNEL3D.channelDesc,volumesize); 
-
-    cudaMemcpy3DParms copyparams = {0};
-    copyparams.extent=volumesize; 
-    copyparams.dstArray=*devicePtr; 
-    copyparams.kind=cudaMemcpyHostToDevice; 
-    copyparams.srcPtr= make_cudaPitchedPtr((void*)hostTexture.data,sizeof(float)*hostTexture.dim.width,hostTexture.dim.height,hostTexture.dim.depth); 
-
-    HANDLE_ERROR(cudaMemcpy3D(&copyparams)); 
-    HANDLE_ERROR(cudaBindTextureToArray(texKERNEL3D, *devicePtr));
-  
-    texKERNEL3D.filterMode = cudaFilterModeLinear;
-    texKERNEL3D.normalized = true;
-    texKERNEL3D.addressMode[0] = cudaAddressModeClamp;
-    texKERNEL3D.addressMode[1] = cudaAddressModeClamp;
-    texKERNEL3D.addressMode[2] = cudaAddressModeClamp;
-  }
-}
-
-void unbindTexture(const char* symbol)
-{
-  if (std::string("texKERNEL").compare(symbol)==0)
-  {
-    HANDLE_ERROR(cudaUnbindTexture(texKERNEL));    
-  }
-  else if (std::string("texKERNEL2D").compare(symbol)==0)
-  {
-    HANDLE_ERROR(cudaUnbindTexture(texKERNEL2D));    
-  }
-  else if (std::string("texKERNEL3D").compare(symbol)==0)
-  {
-    HANDLE_ERROR(cudaUnbindTexture(texKERNEL3D));    
-  }
-  else if (std::string("texDATA").compare(symbol)==0)
-  {
-    HANDLE_ERROR(cudaUnbindTexture(texDATA));    
-  }
-  else if (std::string("texGDATA").compare(symbol)==0)
-  {
-    HANDLE_ERROR(cudaUnbindTexture(texGDATA));    
-  }
-}
-
-
-void freeTexture(const char* symbol, cudaArray* devicePtr)
-{
-  unbindTexture(symbol);
-  HANDLE_ERROR(cudaFreeArray(devicePtr));  
-}
-
 __global__ void fftScaleKernel(CufftType* data, DType scaling, long int N)
 {
   long int t = threadIdx.x +  blockIdx.x *blockDim.x;
diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp
index 78b22786..bda4e983 100644
--- a/CUDA/src/gpuNUFFT_operator.cpp
+++ b/CUDA/src/gpuNUFFT_operator.cpp
@@ -243,7 +243,6 @@ void gpuNUFFT::GpuNUFFTOperator::initDeviceMemory(int n_coils, int n_coils_cc)
 
   initLookupTable();
 
-  // allocateAndCopyToDeviceMem<DType>(&kernel_d,kernel,kernel_count);
   if (DEBUG)
     printf("allocate and copy sectors of size %d...\n", sector_count + 1);
   allocateAndCopyToDeviceMem<IndType>(&sectors_d, this->sectorDataCount.data,
@@ -287,11 +286,11 @@ void gpuNUFFT::GpuNUFFTOperator::initDeviceMemory(int n_coils, int n_coils_cc)
     printf("creating cufft plan with %d,%d,%d dimensions\n",
            DEFAULT_VALUE(gi_host->gridDims.z), gi_host->gridDims.y,
            gi_host->gridDims.x);
-  // cufftResult res = cufftPlan3d(
-  //     &fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z),
-  //     (int)gi_host->gridDims.y, (int)gi_host->gridDims.x, CufftTransformType);
-  // if (res != CUFFT_SUCCESS)
-  //   fprintf(stderr, "error on CUFFT Plan creation!!! %d\n", res);
+  cufftResult res = cufftPlan3d(
+       &fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z),
+       (int)gi_host->gridDims.y, (int)gi_host->gridDims.x, CufftTransformType);
+  if (res != CUFFT_SUCCESS)
+     fprintf(stderr, "error on CUFFT Plan creation!!! %d\n", res);
   gpuMemAllocated = true;
 }
 
@@ -471,6 +470,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(
         continue;
 
       freeTotalDeviceMemory(imdata_sum_d, NULL);
+      this->freeDeviceMemory();
       return;
     }
     if ((cudaDeviceSynchronize() != cudaSuccess))
@@ -936,7 +936,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
   if (DEBUG)
     printf("allocate and copy imdata of size %d...\n",
            imdata_count * n_coils_cc);
-  //allocateDeviceMem<DType2>(&imdata_d, imdata_count * n_coils_cc);
+  allocateDeviceMem<DType2>(&imdata_d, imdata_count * n_coils_cc);
 
   if (debugTiming)
     printf("Memory allocation: %.2f ms\n", stopTiming());
@@ -950,9 +950,9 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     unsigned long int data_coil_offset = (long int)coil_it * data_count;
     unsigned long int im_coil_offset = coil_it * (long int)imdata_count;
 
-    //data_d = kspaceData_gpu.data + data_coil_offset;
+    data_d = kspaceData_gpu.data + data_coil_offset;
 
-//    this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc);
+    this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc);
 
     if (this->applySensData())
       // perform automatically "repeating" of input image in case
@@ -961,13 +961,13 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
         copyDeviceToDeviceAsync<DType2>(imgData_gpu.data,
                                    imdata_d + cnt * imdata_count, imdata_count, new_stream);
     else
-//      copyDeviceToDeviceAsync<DType2>(imgData_gpu.data + im_coil_offset, imdata_d,
-  //                               imdata_count * n_coils_cc, new_stream);
+      copyDeviceToDeviceAsync<DType2>(imgData_gpu.data + im_coil_offset, imdata_d,
+                                 imdata_count * n_coils_cc, new_stream);
 
     // reset temp arrays
-//    cudaMemsetAsync(gdata_d, 0,
-  //             sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream);
-    //cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream);
+    cudaMemsetAsync(gdata_d, 0,
+               sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream);
+    cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream);
 
     if (DEBUG && (cudaStreamSynchronize(new_stream)!= cudaSuccess))
       printf("error at thread synchronization 1: %s\n",
@@ -995,19 +995,28 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
         writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
                     (int)this->kSpaceTraj.count(), n_coils_cc);
         copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream);
+        if(coil_it > 1)
+        {
+            cudaStreamSynchronize(old_stream);
+            cudaStreamDestroy(old_stream);
+        }
+        old_stream = new_stream;
         if ((coil_it + n_coils_cc) < (n_coils))
             continue;
+        
+        cudaStreamSynchronize(old_stream);
+        cudaStreamDestroy(old_stream);
         freeTotalDeviceMemory(imdata_d, NULL);
         this->freeDeviceMemory();
         return;
     } 
     // apodization Correction
-    //performForwardDeapodization(imdata_d, deapo_d, gi_host);
+    performForwardDeapodization(imdata_d, deapo_d, gi_host);
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 2: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     // resize by oversampling factor and zero pad
-    //performPadding(imdata_d, gdata_d, gi_host);
+    performPadding(imdata_d, gdata_d, gi_host);
 
     if (debugTiming)
       startTiming();
@@ -1016,7 +1025,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       printf("error at thread synchronization 3: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     // shift image to get correct zero frequency position
-    //performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host);
+    performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host);
 
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 4: %s\n",
@@ -1026,9 +1035,9 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     int c = 0;
     while (c < n_coils_cc)
     {
-     // if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count,
-       //                       gdata_d + c * gi_host->gridDims_count,
-         //                     grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS)
+      if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count,
+                              gdata_d + c * gi_host->gridDims_count,
+                              grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS)
       {
         fprintf(stderr, "cufft has failed with err %i \n", err);
         showMemoryInfo(true, stderr);
@@ -1039,7 +1048,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 5: %s\n",
              cudaGetErrorString(cudaGetLastError()));
-    //performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host);
+    performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host);
 
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 6: %s\n",
@@ -1052,8 +1061,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
       startTiming();
 
     // convolution and resampling to non-standard trajectory
-    //forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d,
-     //                  sector_centers_d, gi_host);
+    forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d,
+                       sector_centers_d, gi_host);
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error at thread synchronization 7: %s\n",
              cudaGetErrorString(cudaGetLastError()));
@@ -1061,26 +1070,26 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
     if (debugTiming)
       printf("Forward Convolution: %.2f ms\n", stopTiming());
 
-   // performFFTScaling(data_d, gi_host->data_count, gi_host);
+    performFFTScaling(data_d, gi_host->data_count, gi_host);
     if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess))
       printf("error: at thread synchronization 8: %s\n",
              cudaGetErrorString(cudaGetLastError()));
     
     // write result in correct order back into output array
-  //  writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
-    //                (int)this->kSpaceTraj.count(), n_coils_cc);
+    writeOrderedGPU(data_sorted_d, data_indices_d, data_d,
+                    (int)this->kSpaceTraj.count(), n_coils_cc);
     if(coil_it > 1)
     {
       cudaStreamSynchronize(old_stream);
       cudaStreamDestroy(old_stream);
     }
-   // copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream);
+    copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream);
     old_stream = new_stream;
   }  // iterate over coils
 
   cudaStreamSynchronize(old_stream);
   cudaStreamDestroy(old_stream);
- // freeTotalDeviceMemory(imdata_d, NULL);
+  freeTotalDeviceMemory(imdata_d, NULL);
   this->freeDeviceMemory();
   if ((cudaDeviceSynchronize() != cudaSuccess))
     fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n",
@@ -1210,8 +1219,16 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(
                     (int)this->kSpaceTraj.count(), n_coils_cc);
         copyFromDeviceAsync(data_sorted_d, kspaceData.data + data_coil_offset,
                    data_count * n_coils_cc, new_stream);
+        if(coil_it > 1)
+        {
+            cudaStreamSynchronize(old_stream);
+            cudaStreamDestroy(old_stream);
+        }
+        old_stream = new_stream;
         if ((coil_it + n_coils_cc) < (n_coils))
             continue;
+        cudaStreamSynchronize(old_stream);
+        cudaStreamDestroy(old_stream);
         freeTotalDeviceMemory(data_d, imdata_d, NULL);
         this->freeDeviceMemory();
         return;
diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp
index 647c8840..86a74e1c 100644
--- a/CUDA/src/gpuNUFFT_operator_factory.cpp
+++ b/CUDA/src/gpuNUFFT_operator_factory.cpp
@@ -10,11 +10,6 @@
 #include <limits>
 #include <cstring>
 
-void gpuNUFFT::GpuNUFFTOperatorFactory::setUseTextures(bool useTextures)
-{
-  this->useTextures = useTextures;
-}
-
 void gpuNUFFT::GpuNUFFTOperatorFactory::setBalanceWorkload(bool balanceWorkload)
 {
   this->balanceWorkload = balanceWorkload;
@@ -120,9 +115,6 @@ void gpuNUFFT::GpuNUFFTOperatorFactory::computeProcessingOrder(
   if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED)
     static_cast<BalancedGpuNUFFTOperator *>(gpuNUFFTOp)
         ->setSectorProcessingOrder(sectorProcessingOrder);
-  else
-    static_cast<BalancedTextureGpuNUFFTOperator *>(gpuNUFFTOp)
-        ->setSectorProcessingOrder(sectorProcessingOrder);
 }
 
 gpuNUFFT::Array<IndType> gpuNUFFT::GpuNUFFTOperatorFactory::assignSectors(
@@ -327,33 +319,14 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createNewGpuNUFFTOperator(
 {
   if (balanceWorkload)
   {
-    if (useTextures)
-    {
-      debug("creating Balanced 2D TextureLookup Operator!\n");
-      return new gpuNUFFT::BalancedTextureGpuNUFFTOperator(
-          kernelWidth, sectorWidth, osf, imgDims, TEXTURE2D_LOOKUP,
-          this->matlabSharedMem);
-    }
-    else
-    {
-      debug("creating Balanced GpuNUFFT Operator!\n");
-      return new gpuNUFFT::BalancedGpuNUFFTOperator(kernelWidth, sectorWidth,
-        osf, imgDims, this->matlabSharedMem);
-    }
+    debug("creating Balanced GpuNUFFT Operator!\n");
+    return new gpuNUFFT::BalancedGpuNUFFTOperator(kernelWidth, sectorWidth,
+      osf, imgDims, this->matlabSharedMem);
   }
 
-  if (useTextures)
-  {
-    debug("creating 2D TextureLookup Operator!\n");
-    return new gpuNUFFT::TextureGpuNUFFTOperator(kernelWidth, sectorWidth, osf,
-      imgDims, TEXTURE2D_LOOKUP, this->matlabSharedMem);
-  }
-  else
-  {
-    debug("creating DEFAULT GpuNUFFT Operator!\n");
-    return new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf,
+  debug("creating DEFAULT GpuNUFFT Operator!\n");
+  return new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf,
                                           imgDims, true, DEFAULT, true);
-  }
 }
 
 gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFunction(
@@ -365,11 +338,7 @@ gpuNUFFT::Array<DType> gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu
   IndType sectorWidth = 8;
   gpuNUFFT::GpuNUFFTOperator *deapoGpuNUFFTOp;
 
-  if (useTextures)
-    deapoGpuNUFFTOp = new gpuNUFFT::TextureGpuNUFFTOperator(kernelWidth, sectorWidth, osf,
-    imgDims, TEXTURE2D_LOOKUP);
-  else
-    deapoGpuNUFFTOp = new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims);
+  deapoGpuNUFFTOp = new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims);
 
   // Data
   gpuNUFFT::Array<DType2> dataArray;
@@ -540,8 +509,8 @@ void gpuNUFFT::GpuNUFFTOperatorFactory::set_pts(
   gpuNUFFTOp->setSectorDataCount(
       computeSectorDataCount(gpuNUFFTOp, assignedSectors));
 
-  if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED ||
-    gpuNUFFTOp->getType() == gpuNUFFT::BALANCED_TEXTURE) {
+  if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED)
+  {
     computeProcessingOrder(gpuNUFFTOp);
   }
 
@@ -605,9 +574,6 @@ gpuNUFFT::GpuNUFFTOperatorFactory::loadPrecomputedGpuNUFFTOperator(
   if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED)
     static_cast<BalancedGpuNUFFTOperator *>(gpuNUFFTOp)
         ->setSectorProcessingOrder(sectorProcessingOrder);
-  else if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED_TEXTURE)
-    static_cast<BalancedTextureGpuNUFFTOperator *>(gpuNUFFTOp)
-        ->setSectorProcessingOrder(sectorProcessingOrder);
 
   gpuNUFFTOp->setSectorCenters(sectorCenters);
   gpuNUFFTOp->setSens(sensData);
diff --git a/CUDA/src/texture_gpuNUFFT_operator.cpp b/CUDA/src/texture_gpuNUFFT_operator.cpp
deleted file mode 100644
index 80bfff94..00000000
--- a/CUDA/src/texture_gpuNUFFT_operator.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-
-#include "texture_gpuNUFFT_operator.hpp"
-
-void gpuNUFFT::TextureGpuNUFFTOperator::initKernel()
-{
-  IndType kernelSize = (interpolationType > 1)
-                           ? calculateKernelSizeLinInt(osf, kernelWidth)
-                           : calculateGrid3KernelSize(osf, kernelWidth);
-  this->kernel.dim.width = kernelSize;
-  this->kernel.dim.height = interpolationType > 1 ? kernelSize : 1;
-  this->kernel.dim.depth = interpolationType > 2 ? kernelSize : 1;
-  if (this->kernel.data != NULL)
-    free(this->kernel.data);
-  this->kernel.data = (DType *)calloc(this->kernel.count(), sizeof(DType));
-
-  switch (interpolationType)
-  {
-  case TEXTURE_LOOKUP:
-    load1DKernel(this->kernel.data, (int)kernelSize, (int)kernelWidth, osf);
-    break;
-  case TEXTURE2D_LOOKUP:
-    load2DKernel(this->kernel.data, (int)kernelSize, (int)kernelWidth, osf);
-    break;
-  case TEXTURE3D_LOOKUP:
-    load3DKernel(this->kernel.data, (int)kernelSize, (int)kernelWidth, osf);
-    break;
-  default:
-    load1DKernel(this->kernel.data, (int)kernelSize, (int)kernelWidth, osf);
-  }
-}
-
-const char *gpuNUFFT::TextureGpuNUFFTOperator::getInterpolationTypeName()
-{
-  switch (interpolationType)
-  {
-  case TEXTURE_LOOKUP:
-    return "texKERNEL";
-  case TEXTURE2D_LOOKUP:
-    return "texKERNEL2D";
-  case TEXTURE3D_LOOKUP:
-    return "texKERNEL3D";
-  default:
-    return "KERNEL";
-  }
-}
-
-gpuNUFFT::GpuNUFFTInfo *
-gpuNUFFT::TextureGpuNUFFTOperator::initAndCopyGpuNUFFTInfo(int n_coils_cc)
-{
-  gpuNUFFT::GpuNUFFTInfo *gi_host = initGpuNUFFTInfo(n_coils_cc);
-
-  gi_host->interpolationType = interpolationType;
-  gi_host->sectorsToProcess = gi_host->sector_count;
-
-  if (DEBUG)
-    printf("copy GpuNUFFT Info to symbol memory... size = %lu \n",
-      (SizeType)sizeof(gpuNUFFT::GpuNUFFTInfo));
-
-  initConstSymbol("GI", gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo));
-
-  if (DEBUG)
-    printf("...done!\n");
-  return gi_host;
-}
-
-void gpuNUFFT::TextureGpuNUFFTOperator::adjConvolution(
-    DType2 *data_d, DType *crds_d, CufftType *gdata_d, DType *kernel_d,
-    IndType *sectors_d, IndType *sector_centers_d,
-    gpuNUFFT::GpuNUFFTInfo *gi_host)
-{
-  bindTo1DTexture("texDATA", data_d,
-                  this->kSpaceTraj.count() * gi_host->n_coils_cc);
-
-  performTextureConvolution(data_d, crds_d, gdata_d, kernel_d, sectors_d,
-                            sector_centers_d, gi_host);
-
-  unbindTexture("texDATA");
-}
-
-void gpuNUFFT::TextureGpuNUFFTOperator::forwardConvolution(
-    CufftType *data_d, DType *crds_d, CufftType *gdata_d, DType *kernel_d,
-    IndType *sectors_d, IndType *sector_centers_d,
-    gpuNUFFT::GpuNUFFTInfo *gi_host)
-{
-  bindTo1DTexture("texGDATA", gdata_d,
-                  gi_host->grid_width_dim * gi_host->n_coils_cc);
-
-  performTextureForwardConvolution(data_d, crds_d, gdata_d, kernel_d, sectors_d,
-                                   sector_centers_d, gi_host);
-
-  unbindTexture("texGDATA");
-}
-
-void gpuNUFFT::TextureGpuNUFFTOperator::initLookupTable()
-{
-  initTexture(getInterpolationTypeName(), &kernel_d, this->kernel);
-}
-
-void gpuNUFFT::TextureGpuNUFFTOperator::freeLookupTable()
-{
-  if (kernel_d != NULL)
-    freeTexture(getInterpolationTypeName(), kernel_d);
-}
diff --git a/setup.py b/setup.py
index e12094c9..e4573946 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@ def build_extension(self, ext):
                       "-DGEN_PYTHON_FILES=ON",
                       "-DGEN_MEX_FILES=OFF",
                       "-DPYBIND11_INCLUDE_DIR=" + self.pybind_path]
-        cfg = "Debug"# if self.debug else "Release"
+        cfg = "Debug" if self.debug else "Release"
         build_args = ["--config", cfg]
 
         if platform.system() == "Windows":
@@ -103,7 +103,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.8.1",
+    version="0.9.0",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),

From 5e267bba49959ae2a59613a0860e6d75c6392c74 Mon Sep 17 00:00:00 2001
From: Chaithya G R <chaithyagr@gmail.com>
Date: Thu, 29 Aug 2024 15:42:24 +0200
Subject: [PATCH 85/85] Fix memory leak

---
 CUDA/inc/gpuNUFFT_operator.hpp | 13 +++++++++++++
 setup.py                       |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp
index 2bbc6cd0..7ba55745 100644
--- a/CUDA/inc/gpuNUFFT_operator.hpp
+++ b/CUDA/inc/gpuNUFFT_operator.hpp
@@ -93,18 +93,27 @@ class GpuNUFFTOperator
 
   void setKSpaceTraj(Array<DType> kSpaceTraj)
   {
+    if (this->kSpaceTraj.data != NULL)
+      freeLocalMemberArray(this->kSpaceTraj.data);
     this->kSpaceTraj = kSpaceTraj;
   }
   void setSectorCenters(Array<IndType> sectorCenters)
   {
+    if(this->sectorCenters.data != NULL)
+      freeLocalMemberArray(this->sectorCenters.data);
     this->sectorCenters = sectorCenters;
   }
   void setSectorDataCount(Array<IndType> sectorDataCount)
   {
+    if(this->sectorDataCount.data != NULL)
+      freeLocalMemberArray(this->sectorDataCount.data);
+
     this->sectorDataCount = sectorDataCount;
   }
   void setDataIndices(Array<IndType> dataIndices)
   {
+    if (this->dataIndices.data != NULL)
+      freeLocalMemberArray(this->dataIndices.data);
     this->dataIndices = dataIndices;
   }
   void setSens(Array<DType2> sens)
@@ -113,10 +122,14 @@ class GpuNUFFTOperator
   }
   void setDens(Array<DType> dens)
   {
+    if (this->dens.data != NULL)
+      freeLocalMemberArray(this->dens.data);
     this->dens = dens;
   }
   void setDeapodizationFunction(Array<DType> deapo)
   {
+    if (this->deapo.data != NULL)
+      freeLocalMemberArray(this->deapo.data);
     this->deapo= deapo;
   }
 
diff --git a/setup.py b/setup.py
index e4573946..8be8c15e 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@ def build_extension(self, ext):
 
 setup(
     name="gpuNUFFT",
-    version="0.9.0",
+    version="0.10.0",
     description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT",
     ext_modules=[
         CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),