From c3acc581cc65ae43427453884fff25dd8f2ecc80 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 28 Jun 2021 09:41:51 +0200 Subject: [PATCH 01/85] Working with CUDA11 without warnings --- CUDA/CMakeLists.txt | 2 +- setup.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt index 693714d0..5f80d6b0 100644 --- a/CUDA/CMakeLists.txt +++ b/CUDA/CMakeLists.txt @@ -100,8 +100,8 @@ IF(FERMI_GPU) list(APPEND MY_NVCC_FLAGS -gencode arch=compute_50,code=sm_50) list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=sm_52) list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=compute_52) + list(APPEND MY_NVCC_FLAGS -gencode arch=compute_50,code=sm_50) ELSE(FERMI_GPU) - set(MY_NVCC_FLAGS -gencode arch=compute_50,code=sm_50) list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=sm_52) list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=compute_52) diff --git a/setup.py b/setup.py index 6b801034..a99c31ca 100644 --- a/setup.py +++ b/setup.py @@ -107,6 +107,8 @@ def build_extension(self, ext): setup( name="gpuNUFFT", + version="0.2.0", + description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), From f471098a8066e161a637aba3127d81215b2c9a04 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 29 Jun 2021 14:35:15 +0200 Subject: [PATCH 02/85] Free memory issues and update tests --- CUDA/src/gpuNUFFT_operator.cpp | 6 ++++-- python/test_file.py | 12 ++++++++++++ python/test_nufftOp.py | 12 ++++++------ 3 files changed, 22 insertions(+), 8 deletions(-) create mode 100644 python/test_file.py diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 1fd6c352..02a6062c 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -572,7 +572,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj( // move memory management into constructor/destructor of GpuNUFFT Operator!!! // freeTotalDeviceMemory(imdata_sum_d, NULL); - // this->freeDeviceMemory(); + this->freeDeviceMemory(); if ((cudaDeviceSynchronize() != cudaSuccess)) fprintf(stderr, "error in gpuNUFFT_gpu_adj function: %s\n", @@ -854,6 +854,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj( cudaGetErrorString(cudaGetLastError())); freeTotalDeviceMemory(data_d, imdata_d, imdata_sum_d, NULL); + this->freeDeviceMemory(); if ((cudaDeviceSynchronize() != cudaSuccess)) fprintf(stderr, "error in gpuNUFFT_gpu_adj function: %s\n", @@ -1051,7 +1052,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( } // iterate over coils freeTotalDeviceMemory(imdata_d, NULL); - // this->freeDeviceMemory(); + this->freeDeviceMemory(); if ((cudaDeviceSynchronize() != cudaSuccess)) fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n", @@ -1261,6 +1262,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( } // iterate over coils freeTotalDeviceMemory(data_d, imdata_d, NULL); + this->freeDeviceMemory(); if ((cudaDeviceSynchronize() != cudaSuccess)) fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n", diff --git a/python/test_file.py b/python/test_file.py new file mode 100644 index 00000000..9ada03eb --- /dev/null +++ b/python/test_file.py @@ -0,0 +1,12 @@ +import numpy as np +from mri.operators import NonCartesianFFT +from mri.operators.fourier.utils import estimate_density_compensation + + +traj = np.load('/volatile/temp_traj.npy') +#D = estimate_density_compensation(traj, (384, 384, 208), 2) +fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT') +K = fourier.op(np.zeros((384, 384, 208))) +print("Forward done") +I = fourier.adj_op(K) +print("Backward done") diff --git a/python/test_nufftOp.py b/python/test_nufftOp.py index b4d5eab1..4d7a1387 100644 --- a/python/test_nufftOp.py +++ b/python/test_nufftOp.py @@ -84,13 +84,13 @@ def setUp(self): def test_multicoil_with_sense(self): print('Apply forward op') operator = self.get_nufft_op(self.coil_maps) - x = operator.op(np.reshape(self.img.T, self.img.size)) + x = operator.op(np.reshape(self.img.T, self.img.size), False) y = np.random.random(x.shape) print('Output kdata shape is', x.shape) print('-------------------------------') print('Apply adjoint op') - img_adj = operator.adj_op(x) - adj_y = operator.adj_op(y) + img_adj = operator.adj_op(x, False) + adj_y = operator.adj_op(y, False) print('Output adjoint img shape is', img_adj.shape) img_adj = np.squeeze(img_adj).T adj_y = np.squeeze(adj_y).T @@ -110,18 +110,18 @@ def test_multicoil_without_sense(self): operator = self.get_nufft_op() x = operator.op(np.asarray( [np.reshape(image_ch.T, image_ch.size) for image_ch in self.multi_img] - ).T) + ).T, False) y = np.random.random(x.shape) print('Output kdata shape is', x.shape) print('-------------------------------') print('Apply adjoint op') - img_adj = operator.adj_op(x) + img_adj = operator.adj_op(x, False) print('Output adjoint img shape is', img_adj.shape) img_adj = np.squeeze(img_adj) img_adj = np.asarray( [image_ch.T for image_ch in img_adj] ) - adj_y = np.squeeze(operator.adj_op(y)) + adj_y = np.squeeze(operator.adj_op(y), False) adj_y = np.asarray( [image_ch.T for image_ch in adj_y] ) From 63d36642f4674a1253603d38c94f0b9789aba0e9 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 30 Jun 2021 09:41:32 +0200 Subject: [PATCH 03/85] Add Clear Memory --- CUDA/inc/gpuNUFFT_operator.hpp | 1 + .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++++ CUDA/src/gpuNUFFT_operator.cpp | 10 +++++++++- python/test_file.py | 4 ++-- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index 1999680a..61ad6f6c 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -344,6 +344,7 @@ class GpuNUFFTOperator Array performForwardGpuNUFFT(Array imgData, GpuNUFFTOutput gpuNUFFTOut); + void clean_memory(); /** \brief Check if density compensation data is available. */ bool applyDensComp() { diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index a77281f2..8d15631f 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -167,6 +167,10 @@ class GpuNUFFTPythonOperator cudaThreadSynchronize(); return out_result; } + void clean_memory() + { + gpuNUFFTOp->clean_memory(); + } ~GpuNUFFTPythonOperator() { delete gpuNUFFTOp; diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 02a6062c..9fecbff9 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -1178,12 +1178,15 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( performForwardDeapodization(imdata_d, deapo_d, gi_host); if(gpuNUFFTOut == DENSITY_ESTIMATION) { - forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d, + forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d, sector_centers_d, gi_host); writeOrderedGPU(data_sorted_d, data_indices_d, data_d, (int)this->kSpaceTraj.count(), n_coils_cc); copyFromDevice(data_sorted_d, kspaceData.data + data_coil_offset, data_count * n_coils_cc); + if ((coil_it + n_coils_cc) < (n_coils)) + continue; + freeTotalDeviceMemory(data_d, imdata_d, NULL); return; } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) @@ -1311,3 +1314,8 @@ float gpuNUFFT::GpuNUFFTOperator::stopTiming() HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop)); return time; } + +void gpuNUFFT::GpuNUFFTOperator::clean_memory() +{ + this->freeDeviceMemory(); +} \ No newline at end of file diff --git a/python/test_file.py b/python/test_file.py index 9ada03eb..10f8d355 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -4,8 +4,8 @@ traj = np.load('/volatile/temp_traj.npy') -#D = estimate_density_compensation(traj, (384, 384, 208), 2) -fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT') +D = estimate_density_compensation(traj, (384, 384, 208), 10) +fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', density_comp=D) K = fourier.op(np.zeros((384, 384, 208))) print("Forward done") I = fourier.adj_op(K) From ea1722e92dd277ea63b16f445563d5c468fdcb53 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 30 Jun 2021 11:13:39 +0200 Subject: [PATCH 04/85] Fix memory leaks for python operator --- CUDA/inc/gpuNUFFT_operator_factory.hpp | 2 +- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++-- CUDA/src/gpuNUFFT_operator.cpp | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp index 6df9af70..b4b9f461 100644 --- a/CUDA/inc/gpuNUFFT_operator_factory.hpp +++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp @@ -52,7 +52,7 @@ class GpuNUFFTOperatorFactory /** \brief Constructor overload * * @param useTextures Flag to indicate texture interpolation - * @param useGpu Flag to indicate gpu usage for precomputation + * @param useGpu Flag to indicat&GpuNUFFTPythonOperator::adj_op);e gpu usage for precomputation * @param balanceWorkload Flag to indicate load balancing */ GpuNUFFTOperatorFactory(const bool useTextures = true, const bool useGpu = true, diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 8d15631f..8f9dba42 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -173,7 +173,6 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { - delete gpuNUFFTOp; if(has_sense_data == true) free(sensArray.data); } @@ -182,6 +181,7 @@ PYBIND11_MODULE(gpuNUFFT, m) { py::class_(m, "NUFFTOp") .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool>()) .def("op", &GpuNUFFTPythonOperator::op) - .def("adj_op", &GpuNUFFTPythonOperator::adj_op); + .def("adj_op", &GpuNUFFTPythonOperator::adj_op) + .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory); } #endif // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 9fecbff9..8e375845 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -747,6 +747,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj( printf("test value at point zero: %f\n", (imgData.data)[0].x); freeTotalDeviceMemory(data_d, imdata_d, imdata_sum_d, NULL); + this->freeDeviceMemory(); return; } if ((cudaDeviceSynchronize() != cudaSuccess)) @@ -1187,6 +1188,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if ((coil_it + n_coils_cc) < (n_coils)) continue; freeTotalDeviceMemory(data_d, imdata_d, NULL); + this->freeDeviceMemory(); return; } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) From 283ed59841dd3c53ffa9a37eccfc9ac36890a2b9 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 5 Jul 2021 13:59:24 +0200 Subject: [PATCH 05/85] Rename and do right DC --- CUDA/src/gpu/std_gpuNUFFT_kernels.cu | 4 ++-- CUDA/src/gpuNUFFT_operator.cpp | 11 +---------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu index 3fb39be7..2331c2cf 100644 --- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu +++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu @@ -226,8 +226,8 @@ __global__ void densityCompensationKernel(DType2* data, DType* density_comp, int for (int c = threadIdx.z; c < GI.n_coils_cc; c+= blockDim.z) { DType2 data_p = data[t + c*N]; - data_p.x = data_p.x * sqrt(density_comp[t]); - data_p.y = data_p.y * sqrt(density_comp[t]); + data_p.x = data_p.x * density_comp[t]; + data_p.y = data_p.y * density_comp[t]; data[t + c*N] = data_p; } t = t+ blockDim.x*gridDim.x; diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 8e375845..03cd13f2 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -1041,10 +1041,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( printf("error: at thread synchronization 8: %s\n", cudaGetErrorString(cudaGetLastError())); - // Also apply density compensation here - if (this->applyDensComp()) - performDensityCompensation(data_d, density_comp_d, gi_host); - // write result in correct order back into output array writeOrderedGPU(data_sorted_d, data_indices_d, data_d, (int)this->kSpaceTraj.count(), n_coils_cc); @@ -1075,7 +1071,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( // // parameters: // * data : output kspace data -// * data_count : number of samples on trajectory // * n_coils : number of channels or coils // * crds : coordinates on trajectory, passed as SoA // * imdata : input image data @@ -1254,10 +1249,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( printf("error: at thread synchronization 8: %s\n", cudaGetErrorString(cudaGetLastError())); - // Also apply density compensation here - if (this->applyDensComp()) - performDensityCompensation(data_d, density_comp_d, gi_host); - // write result in correct order back into output array writeOrderedGPU(data_sorted_d, data_indices_d, data_d, (int)this->kSpaceTraj.count(), n_coils_cc); @@ -1320,4 +1311,4 @@ float gpuNUFFT::GpuNUFFTOperator::stopTiming() void gpuNUFFT::GpuNUFFTOperator::clean_memory() { this->freeDeviceMemory(); -} \ No newline at end of file +} From 4a5596ff9797b3d70cf52d3249b9e88040cece6c Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 5 Jul 2021 13:59:56 +0200 Subject: [PATCH 06/85] Minor version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a99c31ca..de582070 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.2.0", + version="0.2.1", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ From 71c24659bdb6713287b40b63a10d0f376bff4db5 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 9 Nov 2021 21:56:36 +0100 Subject: [PATCH 07/85] Fix memory --- CUDA/inc/balanced_gpuNUFFT_operator.hpp | 1 - .../balanced_texture_gpuNUFFT_operator.hpp | 3 +-- CUDA/inc/gpuNUFFT_operator.hpp | 20 ++++++++----------- CUDA/inc/gpuNUFFT_operator_factory.hpp | 10 +++++++++- CUDA/inc/texture_gpuNUFFT_operator.hpp | 1 - .../gpuNUFFT_operator_python_factory.cpp | 1 + CUDA/src/gpuNUFFT_operator_factory.cpp | 3 +-- python/test_file.py | 12 +++++------ 8 files changed, 26 insertions(+), 25 deletions(-) diff --git a/CUDA/inc/balanced_gpuNUFFT_operator.hpp b/CUDA/inc/balanced_gpuNUFFT_operator.hpp index 66246819..e4aa8248 100644 --- a/CUDA/inc/balanced_gpuNUFFT_operator.hpp +++ b/CUDA/inc/balanced_gpuNUFFT_operator.hpp @@ -29,7 +29,6 @@ class BalancedGpuNUFFTOperator : public GpuNUFFTOperator, ~BalancedGpuNUFFTOperator() { - if (!matlabSharedMem) freeLocalMemberArray(this->sectorProcessingOrder.data); } diff --git a/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp b/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp index 3eae6a7c..d7672f73 100644 --- a/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp +++ b/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp @@ -36,8 +36,7 @@ class BalancedTextureGpuNUFFTOperator : public TextureGpuNUFFTOperator, ~BalancedTextureGpuNUFFTOperator() { - if (!matlabSharedMem) - freeLocalMemberArray(this->sectorProcessingOrder.data); + freeLocalMemberArray(this->sectorProcessingOrder.data); } // OPERATIONS diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index 61ad6f6c..2965667c 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -73,17 +73,13 @@ class GpuNUFFTOperator virtual ~GpuNUFFTOperator() { freeLocalMemberArray(this->kernel.data); - - if (!matlabSharedMem) { - freeLocalMemberArray(this->dens.data); - freeLocalMemberArray(this->deapo.data); - freeLocalMemberArray(this->kSpaceTraj.data); - freeLocalMemberArray(this->sectorCenters.data); - freeLocalMemberArray(this->dataIndices.data); - freeLocalMemberArray(this->sectorDataCount.data); - } - - freeDeviceMemory(); + freeLocalMemberArray(this->dens.data); + freeLocalMemberArray(this->sens.data); + freeLocalMemberArray(this->deapo.data); + freeLocalMemberArray(this->kSpaceTraj.data); + freeLocalMemberArray(this->sectorCenters.data); + freeLocalMemberArray(this->dataIndices.data); + freeLocalMemberArray(this->sectorDataCount.data); } friend class GpuNUFFTOperatorFactory; @@ -373,7 +369,7 @@ class GpuNUFFTOperator dataPointer = NULL; } } - +gpuNUFFT::TextureGpuNUFFTOperator::~TextureGpuNUFFTOperator /** \brief gpuNUFFT::OperatorType classifier. Value according to sub-class * implementation. */ OperatorType operatorType; diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp index b4b9f461..cac4ad88 100644 --- a/CUDA/inc/gpuNUFFT_operator_factory.hpp +++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp @@ -176,7 +176,15 @@ class GpuNUFFTOperatorFactory void setBalanceWorkload(bool balanceWorkload); protected: - /** \brief Assign the samples on the k-space trajectory to its corresponding + template + void freeLocalMemberArray(T* dataPointer) + { + if (dataPointer != NULL) { + free(dataPointer); + dataPointer = NULL; + } + } + /** \brief Assign the samples on the k-space trajectory to its corresponding *sector * * @return array of indices of the assigned sector diff --git a/CUDA/inc/texture_gpuNUFFT_operator.hpp b/CUDA/inc/texture_gpuNUFFT_operator.hpp index c9a90eac..5d1bca98 100644 --- a/CUDA/inc/texture_gpuNUFFT_operator.hpp +++ b/CUDA/inc/texture_gpuNUFFT_operator.hpp @@ -35,7 +35,6 @@ class TextureGpuNUFFTOperator : public GpuNUFFTOperator ~TextureGpuNUFFTOperator() { - freeLookupTable(); } virtual OperatorType getType() diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 8f9dba42..ada90ba7 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -173,6 +173,7 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { + delete gpuNUFFTOp; if(has_sense_data == true) free(sensArray.data); } diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp index 28bf206e..8d3a24a9 100644 --- a/CUDA/src/gpuNUFFT_operator_factory.cpp +++ b/CUDA/src/gpuNUFFT_operator_factory.cpp @@ -536,8 +536,7 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator( gpuNUFFTOp->setSectorCenters(computeSectorCenters2D(gpuNUFFTOp)); // free temporary array - free(assignedSectors.data); - assignedSectors.data = NULL; + freeLocalMemberArray(assignedSectors.data); gpuNUFFTOp->setDeapodizationFunction( this->computeDeapodizationFunction(kernelWidth, osf, imgDims)); diff --git a/python/test_file.py b/python/test_file.py index 10f8d355..61d41688 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -4,9 +4,9 @@ traj = np.load('/volatile/temp_traj.npy') -D = estimate_density_compensation(traj, (384, 384, 208), 10) -fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', density_comp=D) -K = fourier.op(np.zeros((384, 384, 208))) -print("Forward done") -I = fourier.adj_op(K) -print("Backward done") +for i in range(1): + print(i) + fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT') + K = fourier.op(np.zeros((384, 384, 208))) + im = fourier.adj_op(K) + del fourier From 1bd12d980cb3ef85a9075e4986d46a7c4da8d235 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 9 Nov 2021 21:58:24 +0100 Subject: [PATCH 08/85] Compile issue --- CUDA/inc/gpuNUFFT_operator.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index 2965667c..38988ab9 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -369,7 +369,7 @@ class GpuNUFFTOperator dataPointer = NULL; } } -gpuNUFFT::TextureGpuNUFFTOperator::~TextureGpuNUFFTOperator + /** \brief gpuNUFFT::OperatorType classifier. Value according to sub-class * implementation. */ OperatorType operatorType; From 5c50bf426f7fa179e5be10963b9f801d54ec5bd2 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 9 Nov 2021 22:19:12 +0100 Subject: [PATCH 09/85] clean mem on gpu --- CUDA/inc/gpuNUFFT_operator.hpp | 1 + .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 12 +++++++++++- python/test_file.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index 38988ab9..a18c391a 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -72,6 +72,7 @@ class GpuNUFFTOperator virtual ~GpuNUFFTOperator() { + freeDeviceMemory(); freeLocalMemberArray(this->kernel.data); freeLocalMemberArray(this->dens.data); freeLocalMemberArray(this->sens.data); diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index ada90ba7..c18728e2 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -171,6 +171,15 @@ class GpuNUFFTPythonOperator { gpuNUFFTOp->clean_memory(); } + void set_smaps(py::array_t> sense_maps) + { + free(sensArray.data); + sensArray = copyNumpyArray(sense_maps, imgDims.count() * n_coils); + sensArray.dim = imgDims; + sensArray.dim.channels = n_coils; + has_sense_data = true; + gpuNUFFTOp->setSens(sensArray); + } ~GpuNUFFTPythonOperator() { delete gpuNUFFTOp; @@ -183,6 +192,7 @@ PYBIND11_MODULE(gpuNUFFT, m) { .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool>()) .def("op", &GpuNUFFTPythonOperator::op) .def("adj_op", &GpuNUFFTPythonOperator::adj_op) - .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory); + .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) + .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); } #endif // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED diff --git a/python/test_file.py b/python/test_file.py index 61d41688..265c0cc4 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -4,7 +4,7 @@ traj = np.load('/volatile/temp_traj.npy') -for i in range(1): +for i in range(5): print(i) fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT') K = fourier.op(np.zeros((384, 384, 208))) From 7ffefeb11627f4ce8aca930f33c4724e39a65c30 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 9 Nov 2021 23:30:57 +0100 Subject: [PATCH 10/85] Remove --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 -- python/test_file.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index c18728e2..675cae72 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -183,8 +183,6 @@ class GpuNUFFTPythonOperator ~GpuNUFFTPythonOperator() { delete gpuNUFFTOp; - if(has_sense_data == true) - free(sensArray.data); } }; PYBIND11_MODULE(gpuNUFFT, m) { diff --git a/python/test_file.py b/python/test_file.py index 265c0cc4..5aece6b3 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -6,7 +6,8 @@ traj = np.load('/volatile/temp_traj.npy') for i in range(5): print(i) - fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT') + fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=10, smaps=np.ones((10, 384, 384, 208))) + fourier.impl.operator.set_smaps(np.ones((10, 384, 384, 208))+1) K = fourier.op(np.zeros((384, 384, 208))) im = fourier.adj_op(K) del fourier From 432a706a065826b4c435592dd6085c3c291cb22b Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 9 Nov 2021 23:31:38 +0100 Subject: [PATCH 11/85] Version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index de582070..9108af3a 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.2.1", + version="0.3.0", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ From 2e59fbabce4b109b92194a1e9e406078047f99b0 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 15 Nov 2021 10:12:08 +0100 Subject: [PATCH 12/85] Fix 2D issues --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 675cae72..3a05cc8e 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -141,14 +141,13 @@ class GpuNUFFTPythonOperator gpuNUFFT::Dimensions myDims = imgDims; if(grid_data) myDims = myDims * gpuNUFFTOp->getOsf(); - int depth = myDims.depth; if(dimension==2) myDims.depth = 1; py::array_t> out_result; if(has_sense_data == false) - out_result.resize({n_coils, depth, (int)myDims.height, (int)myDims.width}); + out_result.resize({n_coils, (int)myDims.depth, (int)myDims.height, (int)myDims.width}); else - out_result.resize({depth, (int)myDims.height, (int)myDims.width}); + out_result.resize({(int)myDims.depth, (int)myDims.height, (int)myDims.width}); py::buffer_info out = out_result.request(); std::complex *t_data = (std::complex *) out.ptr; DType2 *new_data = reinterpret_cast(*t_data); From 471d4e85948af2450a7e1fed818a4e4e9f1cb79b Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 15 Nov 2021 13:54:16 +0100 Subject: [PATCH 13/85] Remove linking issues --- CUDA/src/gpu/python/CMakeLists.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt index 7eeaef6c..494255c2 100644 --- a/CUDA/src/gpu/python/CMakeLists.txt +++ b/CUDA/src/gpu/python/CMakeLists.txt @@ -9,7 +9,7 @@ include_directories( ${PYTHON_INCLUDE_DIR} ) cuda_include_directories(${GPUNUFFT_INC_DIR}) -cuda_add_library(gpuNUFFT ${GPU_CU_SOURCES} ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} gpuNUFFT_operator_python_factory.cpp SHARED) +cuda_add_library(gpuNUFFT ${GPU_CU_SOURCES} ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} gpuNUFFT_operator_python_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../atomic/atomic_gpuNUFFT.cu SHARED) set_target_properties(gpuNUFFT PROPERTIES PREFIX "") if(WIN32) @@ -18,7 +18,9 @@ if(WIN32) MESSAGE("Found ${PYTHON_LIBRARIES}") set_target_properties(gpuNUFFT PROPERTIES SUFFIX ".pyd") - TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${GRID_LIB_NAME} ${PYTHON_LIBRARIES}) + TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${PYTHON_LIBRARIES}) elseif(UNIX) - TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${GRID_LIB_NAME}) + TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES}) endif(WIN32) +CUDA_ADD_CUFFT_TO_TARGET(gpuNUFFT) +CUDA_ADD_CUBLAS_TO_TARGET(gpuNUFFT) From 9e2bd119ccf139b8544febb613932942531e536e Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 15 Nov 2021 13:56:10 +0100 Subject: [PATCH 14/85] Version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9108af3a..adde187e 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.3.0", + version="0.3.2", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ From a175ef8f108d843e56912d4b0685da69a339099e Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 16 Nov 2021 18:58:00 +0100 Subject: [PATCH 15/85] Update in code, added concurency and async copy --- CUDA/inc/cuda_utils.hpp | 30 +++++++++- CUDA/inc/gpuNUFFT_operator.hpp | 3 +- .../gpuNUFFT_operator_python_factory.cpp | 32 ++++++---- CUDA/src/gpu/std_gpuNUFFT_kernels.cu | 8 +-- CUDA/src/gpuNUFFT_operator.cpp | 60 +++++++++++-------- python/test_file.py | 10 +--- 6 files changed, 89 insertions(+), 54 deletions(-) diff --git a/CUDA/inc/cuda_utils.hpp b/CUDA/inc/cuda_utils.hpp index ab26d01f..aa0f06cc 100644 --- a/CUDA/inc/cuda_utils.hpp +++ b/CUDA/inc/cuda_utils.hpp @@ -59,7 +59,19 @@ inline void copyToDevice(TypeName *host_ptr, TypeName *device_ptr, HANDLE_ERROR(cudaMemcpy(device_ptr, host_ptr, num_elements * sizeof(TypeName), cudaMemcpyHostToDevice)); } - +/** \brief CUDA memcpy call to copy data from host to device + * + * @param host_ptr host data pointer + * @param device_ptr device pointer + * @param num_elements amount of elements of size TypeName + */ +template +inline void copyToDeviceAsync(TypeName *host_ptr, TypeName *device_ptr, + IndType num_elements, cudaStream_t stream=0) +{ + HANDLE_ERROR(cudaMemcpyAsync(device_ptr, host_ptr, num_elements * sizeof(TypeName), + cudaMemcpyHostToDevice, stream)); +} /** \brief CUDA memory allocation and memcpy call to copy data from host to *device * @@ -118,7 +130,19 @@ inline void copyFromDevice(TypeName *device_ptr, TypeName *host_ptr, HANDLE_ERROR(cudaMemcpy(host_ptr, device_ptr, num_elements * sizeof(TypeName), cudaMemcpyDeviceToHost)); } - +/** \brief Copy CUDA memory from device to host + * + * @param device_ptr device pointer + * @param host_ptr host pointer + * @param num_elements amount of elements of size TypeName + */ +template +inline void copyFromDeviceAsync(TypeName *device_ptr, TypeName *host_ptr, + IndType num_elements, cudaStream_t stream=0) +{ + HANDLE_ERROR(cudaMemcpyAsync(host_ptr, device_ptr, num_elements * sizeof(TypeName), + cudaMemcpyDeviceToHost, stream)); +} /** \brief Free variable list of device pointers. Use NULL as stopping element * * e.g.: freeTotalDeviceMemory(ptr1*, ptr2*,NULL); @@ -212,7 +236,7 @@ inline void showMemoryInfo() * * @param symbol Const symbol name */ -void initConstSymbol(const char *symbol, const void *src, IndType count); +void initConstSymbol(const char *symbol, const void *src, IndType count, cudaStream_t stream=0); /** \brief Initialize texture memory on device * diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index a18c391a..8e582243 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -75,7 +75,6 @@ class GpuNUFFTOperator freeDeviceMemory(); freeLocalMemberArray(this->kernel.data); freeLocalMemberArray(this->dens.data); - freeLocalMemberArray(this->sens.data); freeLocalMemberArray(this->deapo.data); freeLocalMemberArray(this->kSpaceTraj.data); freeLocalMemberArray(this->sectorCenters.data); @@ -574,7 +573,7 @@ class GpuNUFFTOperator /** \brief Update amount of concurrently computed coils */ - void updateConcurrentCoilCount(int coil_it, int n_coils, int &n_coils_cc); + void updateConcurrentCoilCount(int coil_it, int n_coils, int &n_coils_cc, cudaStream_t stream=0); /** \brief Compute amount of coils which can be computed at once. * diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 3a05cc8e..e3c8cab8 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -19,6 +19,8 @@ Carole Lazarus #include // std::sort #include // std::vector #include +#include + namespace py = pybind11; @@ -45,14 +47,15 @@ readNumpyArray(py::array_t> data) } gpuNUFFT::Array -copyNumpyArray(py::array_t> data, unsigned long alloc_size) +copyNumpyArray(py::array_t> data) { gpuNUFFT::Array dataArray; py::buffer_info myData = data.request(); std::complex *t_data = (std::complex *) myData.ptr; DType2 *my_data = reinterpret_cast(*t_data); - DType2 *copy_data = (DType2 *) malloc(alloc_size*sizeof(DType2)); - memcpy(copy_data, my_data, alloc_size*sizeof(DType2)); + DType2 *copy_data; + cudaMallocHost((void **)©_data, myData.size*sizeof(DType2)); + memcpy(copy_data, my_data, myData.size*sizeof(DType2)); dataArray.data = copy_data; return dataArray; } @@ -103,7 +106,7 @@ class GpuNUFFTPythonOperator } else { - sensArray = copyNumpyArray(sense_maps, imgDims.count() * n_coils); + sensArray = copyNumpyArray(sense_maps); sensArray.dim = imgDims; sensArray.dim.channels = n_coils; has_sense_data = true; @@ -117,16 +120,13 @@ class GpuNUFFTPythonOperator py::array_t> op(py::array_t> image, bool interpolate_data=false) { - py::array_t> out_result({n_coils, trajectory_length}); - py::buffer_info out = out_result.request(); - std::complex *t_data = (std::complex *) out.ptr; - DType2 *new_data = reinterpret_cast(*t_data); + DType2 *new_data; + cudaMallocHost((void **)&new_data, n_coils*trajectory_length*sizeof(DType2)); gpuNUFFT::Array dataArray; dataArray.data = new_data; dataArray.dim.length = trajectory_length; dataArray.dim.channels = n_coils; - - gpuNUFFT::Array imdataArray = readNumpyArray(image); + gpuNUFFT::Array imdataArray = copyNumpyArray(image); imdataArray.dim = imgDims; imdataArray.dim.channels = n_coils; if(interpolate_data) @@ -134,7 +134,15 @@ class GpuNUFFTPythonOperator else gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray); cudaThreadSynchronize(); - return out_result; + return py::array_t>(py::buffer_info( + new_data, /* Pointer to buffer */ + sizeof(std::complex), /* Size of one scalar */ + py::format_descriptor>::format(), /* Python struct-style format descriptor */ + 2, /* Number of dimensions */ + { n_coils, trajectory_length }, /* Buffer dimensions */ + { sizeof(float) * n_coils, /* Strides (in bytes) for each index */ + sizeof(float) } + )); } py::array_t> adj_op(py::array_t> kspace_data, bool grid_data=false) { @@ -173,7 +181,7 @@ class GpuNUFFTPythonOperator void set_smaps(py::array_t> sense_maps) { free(sensArray.data); - sensArray = copyNumpyArray(sense_maps, imgDims.count() * n_coils); + sensArray = copyNumpyArray(sense_maps); sensArray.dim = imgDims; sensArray.dim.channels = n_coils; has_sense_data = true; diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu index 2331c2cf..7d6956d8 100644 --- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu +++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu @@ -8,17 +8,17 @@ #include "precomp_utils.hpp" #include "cuda_utils.cuh" -// Method to initialize CONSTANT memory symbols. Needs to reside in *.cu file +// Method to initialize CONSTANT memory symbols. Needs to reside in *.cu file // to work properly // // -void initConstSymbol(const char* symbol, const void* src, IndType size) +void initConstSymbol(const char* symbol, const void* src, IndType size, cudaStream_t stream) { if (std::string("GI").compare(symbol)==0) - HANDLE_ERROR(cudaMemcpyToSymbol(GI, src,size)); + HANDLE_ERROR(cudaMemcpyToSymbolAsync(GI, src, size, 0, cudaMemcpyHostToDevice, stream)); if (std::string("KERNEL").compare(symbol)==0) - HANDLE_ERROR(cudaMemcpyToSymbol(KERNEL, src,size)); + HANDLE_ERROR(cudaMemcpyToSymbolAsync(KERNEL, src, size, 0, cudaMemcpyHostToDevice, stream)); } void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count) diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 03cd13f2..d07d517d 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -355,7 +355,8 @@ int gpuNUFFT::GpuNUFFTOperator::computePossibleConcurrentCoilCount( void gpuNUFFT::GpuNUFFTOperator::updateConcurrentCoilCount(int coil_it, int n_coils, - int &n_coils_cc) + int &n_coils_cc, + cudaStream_t stream) { if ((coil_it + n_coils_cc) >= n_coils) { @@ -363,7 +364,7 @@ void gpuNUFFT::GpuNUFFTOperator::updateConcurrentCoilCount(int coil_it, n_coils_cc = n_coils - coil_it; // Update Gridding Info struct gi_host->n_coils_cc = n_coils_cc; - initConstSymbol("GI", gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo)); + initConstSymbol("GI", gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo), stream); } } @@ -1135,38 +1136,39 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( printf("Memory allocation: %.2f ms\n", stopTiming()); int err; - + cudaStream_t new_stream, old_stream; // iterate over coils and compute result for (int coil_it = 0; coil_it < n_coils; coil_it += n_coils_cc) { + cudaStreamCreate(&new_stream); unsigned long int data_coil_offset = (long int) coil_it * data_count; unsigned long int im_coil_offset = coil_it * (long int)imdata_count; - this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc); + this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc, new_stream); if (this->applySensData()) // perform automatically "repeating" of input image in case // of existing sensitivity data for (int cnt = 0; cnt < n_coils_cc; cnt++) - copyToDevice(imgData.data, imdata_d + cnt * imdata_count, - imdata_count); + copyToDeviceAsync(imgData.data, imdata_d + cnt * imdata_count, + imdata_count, new_stream); else - copyToDevice(imgData.data + im_coil_offset, imdata_d, - imdata_count * n_coils_cc); + copyToDeviceAsync(imgData.data + im_coil_offset, imdata_d, + imdata_count * n_coils_cc, new_stream); // reset temp arrays - cudaMemset(gdata_d, 0, - sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc); - cudaMemset(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc); + cudaMemsetAsync(gdata_d, 0, + sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream); + cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 1: %s\n", cudaGetErrorString(cudaGetLastError())); if (this->applySensData()) { - copyToDevice(this->sens.data + im_coil_offset, sens_d, - imdata_count * n_coils_cc); + copyToDeviceAsync(this->sens.data + im_coil_offset, sens_d, + imdata_count * n_coils_cc, new_stream); performSensMul(imdata_d, sens_d, gi_host, false); } @@ -1174,12 +1176,12 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( performForwardDeapodization(imdata_d, deapo_d, gi_host); if(gpuNUFFTOut == DENSITY_ESTIMATION) { - forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d, + forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d, sector_centers_d, gi_host); writeOrderedGPU(data_sorted_d, data_indices_d, data_d, (int)this->kSpaceTraj.count(), n_coils_cc); - copyFromDevice(data_sorted_d, kspaceData.data + data_coil_offset, - data_count * n_coils_cc); + copyFromDeviceAsync(data_sorted_d, kspaceData.data + data_coil_offset, + data_count * n_coils_cc, new_stream); if ((coil_it + n_coils_cc) < (n_coils)) continue; freeTotalDeviceMemory(data_d, imdata_d, NULL); @@ -1195,13 +1197,13 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (debugTiming) startTiming(); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 3: %s\n", cudaGetErrorString(cudaGetLastError())); // shift image to get correct zero frequency position performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 4: %s\n", cudaGetErrorString(cudaGetLastError())); // eventually free imdata_d @@ -1219,12 +1221,12 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( c++; } - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 5: %s\n", cudaGetErrorString(cudaGetLastError())); performFFTShift(gdata_d, FORWARD, getGridDims(), gi_host); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 6: %s\n", cudaGetErrorString(cudaGetLastError())); @@ -1237,7 +1239,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( // convolution and resampling to non-standard trajectory forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d, sector_centers_d, gi_host); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 7: %s\n", cudaGetErrorString(cudaGetLastError())); @@ -1245,16 +1247,21 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( printf("Forward Convolution: %.2f ms\n", stopTiming()); performFFTScaling(data_d, gi_host->data_count, gi_host); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error: at thread synchronization 8: %s\n", cudaGetErrorString(cudaGetLastError())); // write result in correct order back into output array writeOrderedGPU(data_sorted_d, data_indices_d, data_d, (int)this->kSpaceTraj.count(), n_coils_cc); - - copyFromDevice(data_sorted_d, kspaceData.data + data_coil_offset, - data_count * n_coils_cc); + if(coil_it > 1) + { + cudaStreamSynchronize(old_stream); + cudaStreamDestroy(old_stream); + } + copyFromDeviceAsync(data_sorted_d, kspaceData.data + data_coil_offset, + data_count * n_coils_cc, new_stream); + old_stream = new_stream; } // iterate over coils freeTotalDeviceMemory(data_d, imdata_d, NULL); @@ -1263,6 +1270,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if ((cudaDeviceSynchronize() != cudaSuccess)) fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n", cudaGetErrorString(cudaGetLastError())); + cudaStreamDestroy(old_stream); } gpuNUFFT::Array diff --git a/python/test_file.py b/python/test_file.py index 5aece6b3..538770aa 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -1,13 +1,9 @@ import numpy as np from mri.operators import NonCartesianFFT -from mri.operators.fourier.utils import estimate_density_compensation - traj = np.load('/volatile/temp_traj.npy') -for i in range(5): + +for i in range(1): print(i) - fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=10, smaps=np.ones((10, 384, 384, 208))) - fourier.impl.operator.set_smaps(np.ones((10, 384, 384, 208))+1) + fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=4, smaps=np.ones((4, 384, 384, 208)), osf=1) K = fourier.op(np.zeros((384, 384, 208))) - im = fourier.adj_op(K) - del fourier From 91d8129c3924cddfd9682054a208f13001033749 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 17 Nov 2021 09:26:30 +0100 Subject: [PATCH 16/85] Update in code, added concurency and async copy --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index e3c8cab8..ed7be37c 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -133,7 +133,8 @@ class GpuNUFFTPythonOperator gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray, gpuNUFFT::DENSITY_ESTIMATION); else gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); + free(imdataArray.data); return py::array_t>(py::buffer_info( new_data, /* Pointer to buffer */ sizeof(std::complex), /* Size of one scalar */ @@ -171,7 +172,7 @@ class GpuNUFFTPythonOperator gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray, gpuNUFFT::DENSITY_ESTIMATION); else gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); return out_result; } void clean_memory() From b718f007a812cefd32bccedda4bbb6a45abe7304 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 17 Nov 2021 12:57:18 +0100 Subject: [PATCH 17/85] Concurency working codes --- CUDA/inc/gpuNUFFT_operator.hpp | 2 +- CUDA/inc/gpuNUFFT_operator_factory.hpp | 2 +- .../gpuNUFFT_operator_python_factory.cpp | 81 ++++++++++++++++--- CUDA/src/gpuNUFFT_operator.cpp | 6 +- CUDA/src/gpuNUFFT_operator_factory.cpp | 11 +-- 5 files changed, 81 insertions(+), 21 deletions(-) diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index 8e582243..2a9c0df3 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -365,7 +365,7 @@ class GpuNUFFTOperator void freeLocalMemberArray(T* dataPointer) { if (dataPointer != NULL) { - free(dataPointer); + cudaFree(dataPointer); dataPointer = NULL; } } diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp index cac4ad88..9e33d8d0 100644 --- a/CUDA/inc/gpuNUFFT_operator_factory.hpp +++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp @@ -180,7 +180,7 @@ class GpuNUFFTOperatorFactory void freeLocalMemberArray(T* dataPointer) { if (dataPointer != NULL) { - free(dataPointer); + cudaFree(dataPointer); dataPointer = NULL; } } diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index ed7be37c..58889890 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -59,6 +59,19 @@ copyNumpyArray(py::array_t> data) dataArray.data = copy_data; return dataArray; } +template +gpuNUFFT::Array +copyNumpyArray(py::array_t> data) +{ + gpuNUFFT::Array dataArray; + py::buffer_info myData = data.request(); + TType *my_data = (TType *) myData.ptr; + DType2 *copy_data; + cudaMallocHost((void **)©_data, myData.size*sizeof(TType)); + memcpy(copy_data, my_data, myData.size*sizeof(TType)); + dataArray.data = copy_data; + return dataArray; +} class GpuNUFFTPythonOperator { @@ -115,7 +128,7 @@ class GpuNUFFTPythonOperator gpuNUFFTOp = factory.createGpuNUFFTOperator( kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width, osr, imgDims); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } py::array_t> op(py::array_t> image, bool interpolate_data=false) @@ -126,6 +139,7 @@ class GpuNUFFTPythonOperator dataArray.data = new_data; dataArray.dim.length = trajectory_length; dataArray.dim.channels = n_coils; + // Copy array to pinned memory for better memory bandwidths! gpuNUFFT::Array imdataArray = copyNumpyArray(image); imdataArray.dim = imgDims; imdataArray.dim.channels = n_coils; @@ -134,15 +148,19 @@ class GpuNUFFTPythonOperator else gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray); cudaDeviceSynchronize(); - free(imdataArray.data); + // Free the Copied array + cudaFree(imdataArray.data); + imdataArray.data = NULL; return py::array_t>(py::buffer_info( new_data, /* Pointer to buffer */ sizeof(std::complex), /* Size of one scalar */ py::format_descriptor>::format(), /* Python struct-style format descriptor */ 2, /* Number of dimensions */ { n_coils, trajectory_length }, /* Buffer dimensions */ - { sizeof(float) * n_coils, /* Strides (in bytes) for each index */ - sizeof(float) } + { + sizeof(DType2) * trajectory_length, /* Strides (in bytes) for each index */ + sizeof(DType2) + } )); } py::array_t> adj_op(py::array_t> kspace_data, bool grid_data=false) @@ -152,20 +170,18 @@ class GpuNUFFTPythonOperator myDims = myDims * gpuNUFFTOp->getOsf(); if(dimension==2) myDims.depth = 1; - py::array_t> out_result; + DType2 *t_data; if(has_sense_data == false) - out_result.resize({n_coils, (int)myDims.depth, (int)myDims.height, (int)myDims.width}); + cudaMallocHost((void **)&t_data, n_coils*(int)myDims.depth*(int)myDims.height*(int)myDims.width*sizeof(DType2)); else - out_result.resize({(int)myDims.depth, (int)myDims.height, (int)myDims.width}); - py::buffer_info out = out_result.request(); - std::complex *t_data = (std::complex *) out.ptr; + cudaMallocHost((void **)&t_data, (int)myDims.depth*(int)myDims.height*(int)myDims.width*sizeof(DType2)); DType2 *new_data = reinterpret_cast(*t_data); gpuNUFFT::Array imdataArray; imdataArray.data = new_data; imdataArray.dim = myDims; if(has_sense_data == false) imdataArray.dim.channels = n_coils; - gpuNUFFT::Array dataArray = readNumpyArray(kspace_data); + gpuNUFFT::Array dataArray = copyNumpyArray(kspace_data); dataArray.dim.length = trajectory_length; dataArray.dim.channels = n_coils; if(grid_data) @@ -173,7 +189,46 @@ class GpuNUFFTPythonOperator else gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray); cudaDeviceSynchronize(); - return out_result; + // Free the Copied array + cudaFree(dataArray.data); + dataArray.data = NULL; + if(has_sense_data == false) + return py::array_t>(py::buffer_info( + new_data, /* Pointer to buffer */ + sizeof(std::complex), /* Size of one scalar */ + py::format_descriptor>::format(), /* Python struct-style format descriptor */ + 4, /* Number of dimensions */ + { + n_coils, + (int)myDims.depth, + (int)myDims.height, + (int)myDims.width + }, /* Buffer dimensions */ + { + sizeof(DType2) * (int)myDims.depth * (int)myDims.height * (int)myDims.width, + sizeof(DType2) * (int)myDims.height * (int)myDims.width, + sizeof(DType2) * (int)myDims.width, + sizeof(DType2), + } + )); + else + return py::array_t>(py::buffer_info( + new_data, /* Pointer to buffer */ + sizeof(std::complex), /* Size of one scalar */ + py::format_descriptor>::format(), /* Python struct-style format descriptor */ + 3, /* Number of dimensions */ + { + (int)myDims.depth, + (int)myDims.height, + (int)myDims.width + }, /* Buffer dimensions */ + { + sizeof(DType2) * (int)myDims.height * (int)myDims.width, + sizeof(DType2) * (int)myDims.width, + sizeof(DType2), + } + )); + } void clean_memory() { @@ -181,6 +236,10 @@ class GpuNUFFTPythonOperator } void set_smaps(py::array_t> sense_maps) { + py::buffer_info myData = sense_maps.request(); + std::complex *t_data = (std::complex *) myData.ptr; + DType2 *my_data = reinterpret_cast(*t_data); + memcpy(sensArray.data, my_data, myData.size*sizeof(DType2)); free(sensArray.data); sensArray = copyNumpyArray(sense_maps); sensArray.dim = imgDims; diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index d07d517d..27845579 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -53,8 +53,8 @@ void gpuNUFFT::GpuNUFFTOperator::initKernel() gpuNUFFT::GpuNUFFTInfo * gpuNUFFT::GpuNUFFTOperator::initGpuNUFFTInfo(int n_coils_cc) { - gpuNUFFT::GpuNUFFTInfo *gi_host = - (gpuNUFFT::GpuNUFFTInfo *)malloc(sizeof(gpuNUFFT::GpuNUFFTInfo)); + gpuNUFFT::GpuNUFFTInfo *gi_host; + cudaMallocHost((void **)&gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo)); gi_host->data_count = (int)this->kSpaceTraj.count(); gi_host->sector_count = (int)this->gridSectorDims.count(); @@ -300,7 +300,7 @@ void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory() if (!gpuMemAllocated) return; - free(gi_host); + cudaFree(gi_host); cufftDestroy(fft_plan); // Destroy the cuFFT plan. if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp index 8d3a24a9..79753844 100644 --- a/CUDA/src/gpuNUFFT_operator_factory.cpp +++ b/CUDA/src/gpuNUFFT_operator_factory.cpp @@ -31,7 +31,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::initLinArray(IndType arrCount) { gpuNUFFT::Array new_array; - new_array.data = (T *)malloc(arrCount * sizeof(T)); + cudaMallocHost((void **)&new_array.data, arrCount * sizeof(T)); new_array.dim.length = arrCount; return new_array; } @@ -132,7 +132,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::assignSectors( // create temporary array to store assigned values gpuNUFFT::Array assignedSectors; - assignedSectors.data = (IndType *)malloc(coordCnt * sizeof(IndType)); + cudaMallocHost((void **) &assignedSectors.data, coordCnt * sizeof(IndType)); assignedSectors.dim.length = coordCnt; if (useGpu) @@ -415,7 +415,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu // cleanup locally initialized arrays here free(dataArray.data); - free(assignedSectors.data); + cudaFree(assignedSectors.data); // Compute abs values of deapo function and compensate // FFT scaling sqrt(N) @@ -438,7 +438,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu // cleanup delete deapoGpuNUFFTOp; - free(deapoFunction.data); + cudaFree(deapoFunction.data); return deapoAbs; } @@ -536,7 +536,8 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator( gpuNUFFTOp->setSectorCenters(computeSectorCenters2D(gpuNUFFTOp)); // free temporary array - freeLocalMemberArray(assignedSectors.data); + cudaFree(assignedSectors.data); + assignedSectors.data = NULL; gpuNUFFTOp->setDeapodizationFunction( this->computeDeapodizationFunction(kernelWidth, osf, imgDims)); From 27abe64fc6c1ebbad148fda8bb8e52f168d27e19 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 17 Nov 2021 13:10:26 +0100 Subject: [PATCH 18/85] Fix leaks --- CUDA/inc/gpuNUFFT_operator.hpp | 2 +- CUDA/inc/gpuNUFFT_operator_factory.hpp | 2 +- .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 10 +++------- CUDA/src/gpuNUFFT_operator.cpp | 2 +- CUDA/src/gpuNUFFT_operator_factory.cpp | 6 +++--- python/test_file.py | 4 +++- 6 files changed, 12 insertions(+), 14 deletions(-) diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index 2a9c0df3..cb327e6f 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -365,7 +365,7 @@ class GpuNUFFTOperator void freeLocalMemberArray(T* dataPointer) { if (dataPointer != NULL) { - cudaFree(dataPointer); + cudaFreeHost(dataPointer); dataPointer = NULL; } } diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp index 9e33d8d0..3e1b7a2a 100644 --- a/CUDA/inc/gpuNUFFT_operator_factory.hpp +++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp @@ -180,7 +180,7 @@ class GpuNUFFTOperatorFactory void freeLocalMemberArray(T* dataPointer) { if (dataPointer != NULL) { - cudaFree(dataPointer); + cudaFreeHost(dataPointer); dataPointer = NULL; } } diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 58889890..3e851ca5 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -148,8 +148,8 @@ class GpuNUFFTPythonOperator else gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray); cudaDeviceSynchronize(); - // Free the Copied array - cudaFree(imdataArray.data); + // Free the Copied array + cudaFreeHost(imdataArray.data); imdataArray.data = NULL; return py::array_t>(py::buffer_info( new_data, /* Pointer to buffer */ @@ -190,7 +190,7 @@ class GpuNUFFTPythonOperator gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray); cudaDeviceSynchronize(); // Free the Copied array - cudaFree(dataArray.data); + cudaFreeHost(dataArray.data); dataArray.data = NULL; if(has_sense_data == false) return py::array_t>(py::buffer_info( @@ -240,10 +240,6 @@ class GpuNUFFTPythonOperator std::complex *t_data = (std::complex *) myData.ptr; DType2 *my_data = reinterpret_cast(*t_data); memcpy(sensArray.data, my_data, myData.size*sizeof(DType2)); - free(sensArray.data); - sensArray = copyNumpyArray(sense_maps); - sensArray.dim = imgDims; - sensArray.dim.channels = n_coils; has_sense_data = true; gpuNUFFTOp->setSens(sensArray); } diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 27845579..770e6333 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -300,7 +300,7 @@ void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory() if (!gpuMemAllocated) return; - cudaFree(gi_host); + cudaFreeHost(gi_host); cufftDestroy(fft_plan); // Destroy the cuFFT plan. if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp index 79753844..bb437bd9 100644 --- a/CUDA/src/gpuNUFFT_operator_factory.cpp +++ b/CUDA/src/gpuNUFFT_operator_factory.cpp @@ -415,7 +415,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu // cleanup locally initialized arrays here free(dataArray.data); - cudaFree(assignedSectors.data); + cudaFreeHost(assignedSectors.data); // Compute abs values of deapo function and compensate // FFT scaling sqrt(N) @@ -438,7 +438,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu // cleanup delete deapoGpuNUFFTOp; - cudaFree(deapoFunction.data); + cudaFreeHost(deapoFunction.data); return deapoAbs; } @@ -536,7 +536,7 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator( gpuNUFFTOp->setSectorCenters(computeSectorCenters2D(gpuNUFFTOp)); // free temporary array - cudaFree(assignedSectors.data); + cudaFreeHost(assignedSectors.data); assignedSectors.data = NULL; gpuNUFFTOp->setDeapodizationFunction( diff --git a/python/test_file.py b/python/test_file.py index 538770aa..ae547f27 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -3,7 +3,9 @@ traj = np.load('/volatile/temp_traj.npy') -for i in range(1): +for i in range(3): print(i) fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=4, smaps=np.ones((4, 384, 384, 208)), osf=1) K = fourier.op(np.zeros((384, 384, 208))) + I = fourier.adj_op(K) + del fourier \ No newline at end of file From 8dd394ac43baae8947f50f40c4dc4309ac37feb9 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Thu, 18 Nov 2021 10:52:01 +0100 Subject: [PATCH 19/85] Do a single cudaMalloc --- .../gpuNUFFT_operator_python_factory.cpp | 121 +++++++----------- python/test_file.py | 4 +- 2 files changed, 48 insertions(+), 77 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 3e851ca5..0ca842cc 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -46,31 +46,19 @@ readNumpyArray(py::array_t> data) return dataArray; } -gpuNUFFT::Array -copyNumpyArray(py::array_t> data) +void allocate_pinned_memory(gpuNUFFT::Array *lin_array, unsigned long int size) { - gpuNUFFT::Array dataArray; - py::buffer_info myData = data.request(); - std::complex *t_data = (std::complex *) myData.ptr; - DType2 *my_data = reinterpret_cast(*t_data); - DType2 *copy_data; - cudaMallocHost((void **)©_data, myData.size*sizeof(DType2)); - memcpy(copy_data, my_data, myData.size*sizeof(DType2)); - dataArray.data = copy_data; - return dataArray; + DType2 *new_data; + cudaMallocHost((void **)&new_data, size); + lin_array->data = new_data; } template -gpuNUFFT::Array -copyNumpyArray(py::array_t> data) +void copyNumpyArray(py::array_t> data, TType *copy_data) { - gpuNUFFT::Array dataArray; py::buffer_info myData = data.request(); - TType *my_data = (TType *) myData.ptr; - DType2 *copy_data; - cudaMallocHost((void **)©_data, myData.size*sizeof(TType)); + std::complex *t_data = (std::complex *) myData.ptr; + TType *my_data = reinterpret_cast(*t_data); memcpy(copy_data, my_data, myData.size*sizeof(TType)); - dataArray.data = copy_data; - return dataArray; } class GpuNUFFTPythonOperator @@ -81,7 +69,7 @@ class GpuNUFFTPythonOperator bool has_sense_data; gpuNUFFT::Dimensions imgDims; // sensitivity maps - gpuNUFFT::Array sensArray; + gpuNUFFT::Array sensArray, kspace_data, image; public: GpuNUFFTPythonOperator(py::array_t kspace_loc, py::array_t image_size, int num_coils, py::array_t> sense_maps, py::array_t density_comp, int kernel_width=3, @@ -119,40 +107,44 @@ class GpuNUFFTPythonOperator } else { - sensArray = copyNumpyArray(sense_maps); + allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2)); sensArray.dim = imgDims; sensArray.dim.channels = n_coils; + copyNumpyArray(sense_maps, sensArray.data); has_sense_data = true; } factory.setBalanceWorkload(balance_workload); gpuNUFFTOp = factory.createGpuNUFFTOperator( kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width, osr, imgDims); + allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2)); + kspace_data.dim.length = trajectory_length; + kspace_data.dim.channels = n_coils; + image.dim = imgDims; + if(has_sense_data == false) + { + allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2)); + image.dim.channels = n_coils; + } + else + { + allocate_pinned_memory(&image, imgDims.count() * sizeof(DType2)); + image.dim.channels = 1; + } cudaDeviceSynchronize(); } - py::array_t> op(py::array_t> image, bool interpolate_data=false) + py::array_t> op(py::array_t> input_image, bool interpolate_data=false) { - DType2 *new_data; - cudaMallocHost((void **)&new_data, n_coils*trajectory_length*sizeof(DType2)); - gpuNUFFT::Array dataArray; - dataArray.data = new_data; - dataArray.dim.length = trajectory_length; - dataArray.dim.channels = n_coils; // Copy array to pinned memory for better memory bandwidths! - gpuNUFFT::Array imdataArray = copyNumpyArray(image); - imdataArray.dim = imgDims; - imdataArray.dim.channels = n_coils; + copyNumpyArray(input_image, image.data); if(interpolate_data) - gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray, gpuNUFFT::DENSITY_ESTIMATION); + gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); else - gpuNUFFTOp->performForwardGpuNUFFT(imdataArray, dataArray); + gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data); cudaDeviceSynchronize(); - // Free the Copied array - cudaFreeHost(imdataArray.data); - imdataArray.data = NULL; return py::array_t>(py::buffer_info( - new_data, /* Pointer to buffer */ + kspace_data.data, /* Pointer to buffer */ sizeof(std::complex), /* Size of one scalar */ py::format_descriptor>::format(), /* Python struct-style format descriptor */ 2, /* Number of dimensions */ @@ -163,68 +155,47 @@ class GpuNUFFTPythonOperator } )); } - py::array_t> adj_op(py::array_t> kspace_data, bool grid_data=false) + py::array_t> adj_op(py::array_t> input_kspace_data, bool grid_data=false) { - gpuNUFFT::Dimensions myDims = imgDims; - if(grid_data) - myDims = myDims * gpuNUFFTOp->getOsf(); - if(dimension==2) - myDims.depth = 1; - DType2 *t_data; - if(has_sense_data == false) - cudaMallocHost((void **)&t_data, n_coils*(int)myDims.depth*(int)myDims.height*(int)myDims.width*sizeof(DType2)); - else - cudaMallocHost((void **)&t_data, (int)myDims.depth*(int)myDims.height*(int)myDims.width*sizeof(DType2)); - DType2 *new_data = reinterpret_cast(*t_data); - gpuNUFFT::Array imdataArray; - imdataArray.data = new_data; - imdataArray.dim = myDims; - if(has_sense_data == false) - imdataArray.dim.channels = n_coils; - gpuNUFFT::Array dataArray = copyNumpyArray(kspace_data); - dataArray.dim.length = trajectory_length; - dataArray.dim.channels = n_coils; + copyNumpyArray(input_kspace_data, kspace_data.data); if(grid_data) - gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray, gpuNUFFT::DENSITY_ESTIMATION); + gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION); else - gpuNUFFTOp->performGpuNUFFTAdj(dataArray, imdataArray); + gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image); cudaDeviceSynchronize(); - // Free the Copied array - cudaFreeHost(dataArray.data); - dataArray.data = NULL; if(has_sense_data == false) return py::array_t>(py::buffer_info( - new_data, /* Pointer to buffer */ + image.data, /* Pointer to buffer */ sizeof(std::complex), /* Size of one scalar */ py::format_descriptor>::format(), /* Python struct-style format descriptor */ 4, /* Number of dimensions */ { n_coils, - (int)myDims.depth, - (int)myDims.height, - (int)myDims.width + (int)image.dim.depth, + (int)image.dim.height, + (int)image.dim.width }, /* Buffer dimensions */ { - sizeof(DType2) * (int)myDims.depth * (int)myDims.height * (int)myDims.width, - sizeof(DType2) * (int)myDims.height * (int)myDims.width, - sizeof(DType2) * (int)myDims.width, + sizeof(DType2) * (int)image.dim.depth * (int)image.dim.height * (int)image.dim.width, + sizeof(DType2) * (int)image.dim.height * (int)image.dim.width, + sizeof(DType2) * (int)image.dim.width, sizeof(DType2), } )); else return py::array_t>(py::buffer_info( - new_data, /* Pointer to buffer */ + image.data, /* Pointer to buffer */ sizeof(std::complex), /* Size of one scalar */ py::format_descriptor>::format(), /* Python struct-style format descriptor */ 3, /* Number of dimensions */ { - (int)myDims.depth, - (int)myDims.height, - (int)myDims.width + (int)image.dim.depth, + (int)image.dim.height, + (int)image.dim.width }, /* Buffer dimensions */ { - sizeof(DType2) * (int)myDims.height * (int)myDims.width, - sizeof(DType2) * (int)myDims.width, + sizeof(DType2) * (int)image.dim.height * (int)image.dim.width, + sizeof(DType2) * (int)image.dim.width, sizeof(DType2), } )); diff --git a/python/test_file.py b/python/test_file.py index ae547f27..f1f26a5a 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -3,9 +3,9 @@ traj = np.load('/volatile/temp_traj.npy') +fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=2, smaps=np.ones((2, 384, 384, 208)), osf=1) + for i in range(3): print(i) - fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=4, smaps=np.ones((4, 384, 384, 208)), osf=1) K = fourier.op(np.zeros((384, 384, 208))) I = fourier.adj_op(K) - del fourier \ No newline at end of file From 07d0cab1431c2064afe59a81ef5f019c8390fa46 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Thu, 18 Nov 2021 14:46:19 +0100 Subject: [PATCH 20/85] New test --- python/test_file.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/python/test_file.py b/python/test_file.py index 5aece6b3..e6980cc9 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -1,13 +1,11 @@ import numpy as np from mri.operators import NonCartesianFFT -from mri.operators.fourier.utils import estimate_density_compensation - traj = np.load('/volatile/temp_traj.npy') -for i in range(5): + +fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=2) + +for i in range(10): print(i) - fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=10, smaps=np.ones((10, 384, 384, 208))) - fourier.impl.operator.set_smaps(np.ones((10, 384, 384, 208))+1) K = fourier.op(np.zeros((384, 384, 208))) - im = fourier.adj_op(K) - del fourier + I = fourier.adj_op(K) From 15db29dffefc8f3c6b686a2746ac84c93bb6aa27 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Thu, 18 Nov 2021 18:03:19 +0100 Subject: [PATCH 21/85] Fixed all issues, no copies --- .../gpuNUFFT_operator_python_factory.cpp | 58 +++++++++---------- python/test_file.py | 6 +- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 0ca842cc..5768a9be 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -137,68 +137,66 @@ class GpuNUFFTPythonOperator py::array_t> op(py::array_t> input_image, bool interpolate_data=false) { // Copy array to pinned memory for better memory bandwidths! - copyNumpyArray(input_image, image.data); + //copyNumpyArray(input_image, image.data); if(interpolate_data) gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); else gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data); cudaDeviceSynchronize(); - return py::array_t>(py::buffer_info( - kspace_data.data, /* Pointer to buffer */ - sizeof(std::complex), /* Size of one scalar */ - py::format_descriptor>::format(), /* Python struct-style format descriptor */ - 2, /* Number of dimensions */ - { n_coils, trajectory_length }, /* Buffer dimensions */ + std::complex *ptr = reinterpret_cast(&)[0]>(*kspace_data.data); + auto capsule = py::capsule(ptr, [](void *ptr) { return; }); + return py::array_t>( + { n_coils, trajectory_length }, { - sizeof(DType2) * trajectory_length, /* Strides (in bytes) for each index */ + sizeof(DType2) * trajectory_length, sizeof(DType2) - } - )); + }, + ptr, + capsule + ); } py::array_t> adj_op(py::array_t> input_kspace_data, bool grid_data=false) { - copyNumpyArray(input_kspace_data, kspace_data.data); + //copyNumpyArray(input_kspace_data, kspace_data.data); if(grid_data) gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION); else gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image); cudaDeviceSynchronize(); + std::complex *ptr = reinterpret_cast(&)[0]>(*image.data); + auto capsule = py::capsule(ptr, [](void *ptr) { return; }); if(has_sense_data == false) - return py::array_t>(py::buffer_info( - image.data, /* Pointer to buffer */ - sizeof(std::complex), /* Size of one scalar */ - py::format_descriptor>::format(), /* Python struct-style format descriptor */ - 4, /* Number of dimensions */ + return py::array_t>( { n_coils, (int)image.dim.depth, (int)image.dim.height, (int)image.dim.width - }, /* Buffer dimensions */ + }, { sizeof(DType2) * (int)image.dim.depth * (int)image.dim.height * (int)image.dim.width, sizeof(DType2) * (int)image.dim.height * (int)image.dim.width, sizeof(DType2) * (int)image.dim.width, sizeof(DType2), - } - )); + }, + ptr, + capsule + ); else - return py::array_t>(py::buffer_info( - image.data, /* Pointer to buffer */ - sizeof(std::complex), /* Size of one scalar */ - py::format_descriptor>::format(), /* Python struct-style format descriptor */ - 3, /* Number of dimensions */ + return py::array_t>( { (int)image.dim.depth, (int)image.dim.height, (int)image.dim.width - }, /* Buffer dimensions */ + }, { sizeof(DType2) * (int)image.dim.height * (int)image.dim.width, sizeof(DType2) * (int)image.dim.width, sizeof(DType2), - } - )); + }, + ptr, + capsule + ); } void clean_memory() @@ -216,14 +214,16 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { + cudaFree(kspace_data.data); + cudaFree(image.data); delete gpuNUFFTOp; } }; PYBIND11_MODULE(gpuNUFFT, m) { py::class_(m, "NUFFTOp") .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool>()) - .def("op", &GpuNUFFTPythonOperator::op) - .def("adj_op", &GpuNUFFTPythonOperator::adj_op) + .def("op", &GpuNUFFTPythonOperator::op, py::return_value_policy::reference) + .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::return_value_policy::reference) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); } diff --git a/python/test_file.py b/python/test_file.py index f1f26a5a..7e45fcfd 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -3,9 +3,9 @@ traj = np.load('/volatile/temp_traj.npy') -fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=2, smaps=np.ones((2, 384, 384, 208)), osf=1) +fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=1) -for i in range(3): +for i in range(10): print(i) K = fourier.op(np.zeros((384, 384, 208))) - I = fourier.adj_op(K) + I = fourier.adj_op(K) \ No newline at end of file From 99d03adf1d81b7cd303d396aac79b89214729ae8 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Fri, 19 Nov 2021 13:21:55 +0100 Subject: [PATCH 22/85] FIx leaks --- CUDA/inc/gpuNUFFT_operator.hpp | 1 + .../gpuNUFFT_operator_python_factory.cpp | 18 ++++++++++-------- python/test_file.py | 5 +++-- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index cb327e6f..4eb94478 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -75,6 +75,7 @@ class GpuNUFFTOperator freeDeviceMemory(); freeLocalMemberArray(this->kernel.data); freeLocalMemberArray(this->dens.data); + freeLocalMemberArray(this->sens.data); freeLocalMemberArray(this->deapo.data); freeLocalMemberArray(this->kSpaceTraj.data); freeLocalMemberArray(this->sectorCenters.data); diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 5768a9be..d017434e 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -137,14 +137,15 @@ class GpuNUFFTPythonOperator py::array_t> op(py::array_t> input_image, bool interpolate_data=false) { // Copy array to pinned memory for better memory bandwidths! - //copyNumpyArray(input_image, image.data); + copyNumpyArray(input_image, image.data); if(interpolate_data) gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); else gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data); cudaDeviceSynchronize(); std::complex *ptr = reinterpret_cast(&)[0]>(*kspace_data.data); - auto capsule = py::capsule(ptr, [](void *ptr) { return; }); + auto capsule = py::capsule(ptr, [](void *ptr) { return; + }); return py::array_t>( { n_coils, trajectory_length }, { @@ -157,14 +158,15 @@ class GpuNUFFTPythonOperator } py::array_t> adj_op(py::array_t> input_kspace_data, bool grid_data=false) { - //copyNumpyArray(input_kspace_data, kspace_data.data); + copyNumpyArray(input_kspace_data, kspace_data.data); if(grid_data) gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION); else gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image); cudaDeviceSynchronize(); std::complex *ptr = reinterpret_cast(&)[0]>(*image.data); - auto capsule = py::capsule(ptr, [](void *ptr) { return; }); + auto capsule = py::capsule(ptr, [](void *ptr) { return; + }); if(has_sense_data == false) return py::array_t>( { @@ -214,16 +216,16 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { - cudaFree(kspace_data.data); - cudaFree(image.data); + cudaFreeHost(kspace_data.data); + cudaFreeHost(image.data); delete gpuNUFFTOp; } }; PYBIND11_MODULE(gpuNUFFT, m) { py::class_(m, "NUFFTOp") .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool>()) - .def("op", &GpuNUFFTPythonOperator::op, py::return_value_policy::reference) - .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::return_value_policy::reference) + .def("op", &GpuNUFFTPythonOperator::op) + .def("adj_op", &GpuNUFFTPythonOperator::adj_op) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); } diff --git a/python/test_file.py b/python/test_file.py index 7e45fcfd..83dc2a98 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -3,9 +3,10 @@ traj = np.load('/volatile/temp_traj.npy') -fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=1) for i in range(10): + fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=2, smaps=np.ones((2, 384, 384, 208)), osf=1) print(i) K = fourier.op(np.zeros((384, 384, 208))) - I = fourier.adj_op(K) \ No newline at end of file + I = fourier.adj_op(K) + del fourier \ No newline at end of file From 8d8f24885ed9921bf2478da2496270f4eb4b2b96 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Fri, 19 Nov 2021 16:03:38 +0100 Subject: [PATCH 23/85] Fix mem leaks --- CUDA/src/gpuNUFFT_operator_factory.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp index bb437bd9..9fbae2e9 100644 --- a/CUDA/src/gpuNUFFT_operator_factory.cpp +++ b/CUDA/src/gpuNUFFT_operator_factory.cpp @@ -368,7 +368,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu // Data gpuNUFFT::Array dataArray; - dataArray.data = (DType2*)calloc(1, sizeof(DType2)); // re + im + cudaMallocHost((void **) &dataArray.data, sizeof(DType2)); dataArray.dim.length = 1; dataArray.data[0].x = 1; dataArray.data[0].y = 0; @@ -377,9 +377,9 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu // should result in k-space center (0,0,0) gpuNUFFT::Array kSpaceTraj; if (deapoGpuNUFFTOp->is3DProcessing()) - kSpaceTraj.data = (DType*)calloc(3, sizeof(DType)); // x,y,z + cudaMallocHost((void **) &kSpaceTraj.data, 3*sizeof(DType)); else - kSpaceTraj.data = (DType*)calloc(2, sizeof(DType)); // x,y + cudaMallocHost((void **) &kSpaceTraj.data, 2*sizeof(DType)); kSpaceTraj.dim.length = 1; deapoGpuNUFFTOp->setKSpaceTraj(kSpaceTraj); @@ -391,7 +391,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu // only one data entry, data index = 0 Array dataIndices; - dataIndices.data = (IndType*)calloc(1, sizeof(IndType)); + cudaMallocHost((void **) &dataIndices.data, 2*sizeof(IndType)); dataIndices.dim.length = 1; deapoGpuNUFFTOp->setDataIndices(dataIndices); @@ -414,7 +414,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu debug("finished deapo computation\n"); // cleanup locally initialized arrays here - free(dataArray.data); + cudaFreeHost(dataArray.data); cudaFreeHost(assignedSectors.data); // Compute abs values of deapo function and compensate @@ -438,7 +438,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu // cleanup delete deapoGpuNUFFTOp; - cudaFreeHost(deapoFunction.data); + free(deapoFunction.data); return deapoAbs; } From 10663bfc31f6535b7c2998da2b46f3eb6fa537d4 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Thu, 25 Nov 2021 10:16:25 +0100 Subject: [PATCH 24/85] test file --- python/test_file.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/test_file.py b/python/test_file.py index 83dc2a98..564212d5 100644 --- a/python/test_file.py +++ b/python/test_file.py @@ -1,11 +1,12 @@ import numpy as np from mri.operators import NonCartesianFFT - +from mri.operators.fourier.utils import estimate_density_compensation traj = np.load('/volatile/temp_traj.npy') -for i in range(10): - fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=2, smaps=np.ones((2, 384, 384, 208)), osf=1) +for i in range(1): + dens = estimate_density_compensation(traj, (384, 384, 208)) + fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=2, density_comp=dens) print(i) K = fourier.op(np.zeros((384, 384, 208))) I = fourier.adj_op(K) From d32497554ce9a4507f0abd4f70b3b48b28bec8af Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Thu, 25 Nov 2021 11:50:10 +0100 Subject: [PATCH 25/85] Fix minute issues --- .../gpuNUFFT_operator_python_factory.cpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index d017434e..2bb88f36 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -158,6 +158,9 @@ class GpuNUFFTPythonOperator } py::array_t> adj_op(py::array_t> input_kspace_data, bool grid_data=false) { + gpuNUFFT::Dimensions myDims = imgDims; + if(dimension==2) + myDims.depth = 1; copyNumpyArray(input_kspace_data, kspace_data.data); if(grid_data) gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION); @@ -171,14 +174,14 @@ class GpuNUFFTPythonOperator return py::array_t>( { n_coils, - (int)image.dim.depth, - (int)image.dim.height, - (int)image.dim.width + (int)myDims.depth, + (int)myDims.height, + (int)myDims.width }, { - sizeof(DType2) * (int)image.dim.depth * (int)image.dim.height * (int)image.dim.width, - sizeof(DType2) * (int)image.dim.height * (int)image.dim.width, - sizeof(DType2) * (int)image.dim.width, + sizeof(DType2) * (int)myDims.depth * (int)myDims.height * (int)myDims.width, + sizeof(DType2) * (int)myDims.height * (int)myDims.width, + sizeof(DType2) * (int)myDims.width, sizeof(DType2), }, ptr, @@ -187,13 +190,13 @@ class GpuNUFFTPythonOperator else return py::array_t>( { - (int)image.dim.depth, - (int)image.dim.height, - (int)image.dim.width + (int)myDims.depth, + (int)myDims.height, + (int)myDims.width }, { - sizeof(DType2) * (int)image.dim.height * (int)image.dim.width, - sizeof(DType2) * (int)image.dim.width, + sizeof(DType2) * (int)myDims.height * (int)myDims.width, + sizeof(DType2) * (int)myDims.width, sizeof(DType2), }, ptr, From ee3af688389a66eac118ba39aeb2661c16e45b6b Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 11 Jan 2022 18:22:02 +0100 Subject: [PATCH 26/85] Fix for 2D --- CUDA/src/gpuNUFFT_operator_factory.cpp | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp index 9fbae2e9..b922ca63 100644 --- a/CUDA/src/gpuNUFFT_operator_factory.cpp +++ b/CUDA/src/gpuNUFFT_operator_factory.cpp @@ -132,7 +132,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::assignSectors( // create temporary array to store assigned values gpuNUFFT::Array assignedSectors; - cudaMallocHost((void **) &assignedSectors.data, coordCnt * sizeof(IndType)); + assignedSectors.data = (IndType *)malloc(coordCnt * sizeof(IndType)); assignedSectors.dim.length = coordCnt; if (useGpu) @@ -355,46 +355,46 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu const IndType &kernelWidth, const DType &osf, gpuNUFFT::Dimensions &imgDims) { debug("compute deapodization function\n"); - + // Create simple gpuNUFFT Operator IndType sectorWidth = 8; gpuNUFFT::GpuNUFFTOperator *deapoGpuNUFFTOp; - + if (useTextures) deapoGpuNUFFTOp = new gpuNUFFT::TextureGpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims, TEXTURE2D_LOOKUP); else deapoGpuNUFFTOp = new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims); - + // Data gpuNUFFT::Array dataArray; - cudaMallocHost((void **) &dataArray.data, sizeof(DType2)); + dataArray.data = (DType2*)calloc(1, sizeof(DType2)); // re + im dataArray.dim.length = 1; dataArray.data[0].x = 1; dataArray.data[0].y = 0; - + // Coord triplet (x,y,z) // should result in k-space center (0,0,0) gpuNUFFT::Array kSpaceTraj; if (deapoGpuNUFFTOp->is3DProcessing()) - cudaMallocHost((void **) &kSpaceTraj.data, 3*sizeof(DType)); + kSpaceTraj.data = (DType*)calloc(3, sizeof(DType)); // x,y,z else - cudaMallocHost((void **) &kSpaceTraj.data, 2*sizeof(DType)); + kSpaceTraj.data = (DType*)calloc(2, sizeof(DType)); // x,y kSpaceTraj.dim.length = 1; deapoGpuNUFFTOp->setKSpaceTraj(kSpaceTraj); - + // assign according sector to k-Space position gpuNUFFT::Array assignedSectors = assignSectors(deapoGpuNUFFTOp, kSpaceTraj); deapoGpuNUFFTOp->setSectorDataCount( computeSectorDataCount(deapoGpuNUFFTOp, assignedSectors, true)); - + // only one data entry, data index = 0 Array dataIndices; - cudaMallocHost((void **) &dataIndices.data, 2*sizeof(IndType)); + dataIndices.data = (IndType*)calloc(1, sizeof(IndType)); dataIndices.dim.length = 1; deapoGpuNUFFTOp->setDataIndices(dataIndices); - + // sector centers if (deapoGpuNUFFTOp->is3DProcessing()) deapoGpuNUFFTOp->setSectorCenters(computeSectorCenters(deapoGpuNUFFTOp, true)); @@ -405,17 +405,17 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu debug("compute deapodization\n"); deapoGpuNUFFTOp->setDebugFunction(std::bind(&gpuNUFFT::GpuNUFFTOperatorFactory::debug, this, std::placeholders::_1)); - // Compute deapodization function by gridding of a single value positioned + // Compute deapodization function by gridding of a single value positioned // in the center of k-space and by using the intended oversampling factor // and interpolation kernel width gpuNUFFT::Array deapoFunction = deapoGpuNUFFTOp->performGpuNUFFTAdj(dataArray,FFT); - + debug("finished deapo computation\n"); // cleanup locally initialized arrays here - cudaFreeHost(dataArray.data); - cudaFreeHost(assignedSectors.data); + free(dataArray.data); + free(assignedSectors.data); // Compute abs values of deapo function and compensate // FFT scaling sqrt(N) @@ -423,7 +423,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu DType maxDeapoVal = 0; DType minDeapoVal = std::numeric_limits::max(); - double fft_scaling_factor = std::sqrt(deapoGpuNUFFTOp->getGridDims().count()); + double fft_scaling_factor = std::sqrt(deapoGpuNUFFTOp->getGridDims().count()); for (unsigned cnt = 0; cnt < deapoFunction.count(); cnt++) { @@ -536,7 +536,7 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator( gpuNUFFTOp->setSectorCenters(computeSectorCenters2D(gpuNUFFTOp)); // free temporary array - cudaFreeHost(assignedSectors.data); + free(assignedSectors.data); assignedSectors.data = NULL; gpuNUFFTOp->setDeapodizationFunction( From bc16ccfb2c496bab9db5d241bd96ccb01eae1809 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Fri, 22 Apr 2022 12:02:02 +0200 Subject: [PATCH 27/85] Fix --- CUDA/inc/config.hpp.cmake | 2 +- CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu | 12 ++++++------ CUDA/src/gpu/std_gpuNUFFT_kernels.cu | 4 ++-- setup.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CUDA/inc/config.hpp.cmake b/CUDA/inc/config.hpp.cmake index ff61b8a4..bb1965f1 100644 --- a/CUDA/inc/config.hpp.cmake +++ b/CUDA/inc/config.hpp.cmake @@ -30,7 +30,7 @@ #endif typedef unsigned int SizeType; -typedef unsigned int IndType; +typedef unsigned long int IndType; /** \brief Combined 2-tuple (x,y) of IndType */ typedef struct IndType2 diff --git a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu index 4734019e..5004a3f6 100644 --- a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu +++ b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu @@ -473,7 +473,7 @@ void performTextureConvolution(DType2 *data_d, DType *crds_d, // * N : number of threads __device__ void -textureForwardConvolutionFunction(int *sec, int sec_max, int sec_offset, +textureForwardConvolutionFunction(long int *sec, long int sec_max, long int sec_offset, DType2 *sdata, CufftType *gdata_cache, DType2 *data, DType *crds, CufftType *gdata, IndType *sectors, IndType *sector_centers) @@ -592,7 +592,7 @@ __global__ void textureForwardConvolutionKernel(CufftType *data, DType *crds, CufftType *shared_out_data = (CufftType *)&shared[0]; CufftType *gdata_cache = (CufftType *)&shared[blockDim.x]; - __shared__ int sec[THREAD_BLOCK_SIZE]; + __shared__ long int sec[THREAD_BLOCK_SIZE]; sec[threadIdx.x] = blockIdx.x; // init shared memory @@ -603,7 +603,7 @@ __global__ void textureForwardConvolutionKernel(CufftType *data, DType *crds, // start convolution while (sec[threadIdx.x] < N) { - __shared__ int data_max; + __shared__ long int data_max; data_max = sectors[sec[threadIdx.x] + 1]; textureForwardConvolutionFunction(sec, data_max, 0, shared_out_data, @@ -622,8 +622,8 @@ __global__ void balancedTextureForwardConvolutionKernel( CufftType *shared_out_data = (CufftType *)&shared[0]; CufftType *gdata_cache = (CufftType *)&shared[blockDim.x]; - int sec_cnt = blockIdx.x; - __shared__ int sec[THREAD_BLOCK_SIZE]; + long int sec_cnt = blockIdx.x; + __shared__ long int sec[THREAD_BLOCK_SIZE]; // init shared memory shared_out_data[threadIdx.x].x = (DType)0.0; // Re @@ -634,7 +634,7 @@ __global__ void balancedTextureForwardConvolutionKernel( while (sec_cnt < N) { sec[threadIdx.x] = sector_processing_order[sec_cnt].x; - __shared__ int data_max; + __shared__ long int data_max; data_max = min(sectors[sec[threadIdx.x] + 1], sectors[sec[threadIdx.x]] + sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD); diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu index 7d6956d8..5bdc706f 100644 --- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu +++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu @@ -25,11 +25,11 @@ void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count) { if (std::string("texDATA").compare(symbol)==0) { - HANDLE_ERROR (cudaBindTexture(NULL,texDATA, devicePtr,count*sizeof(float2))); + HANDLE_ERROR (cudaBindTexture(NULL,texDATA, devicePtr,(unsigned long)count*sizeof(float2))); } else if (std::string("texGDATA").compare(symbol)==0) { - HANDLE_ERROR (cudaBindTexture(NULL,texGDATA, devicePtr,count*sizeof(cufftComplex))); + cudaBindTexture(NULL,texGDATA, devicePtr,(unsigned long)count*sizeof(cufftComplex)); } } diff --git a/setup.py b/setup.py index adde187e..30d90731 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ def build_extension(self, ext): "-DGEN_PYTHON_FILES=ON", "-DGEN_MEX_FILES=OFF", "-DPYBIND11_INCLUDE_DIR=" + self.pybind_path] - cfg = "Debug" if self.debug else "Release" + cfg = "Debug" #if self.debug else "Release" build_args = ["--config", cfg] if platform.system() == "Windows": From 72bf53771202c7de0642b0e8bc4891108a2fb20d Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 25 Apr 2022 09:11:26 +0200 Subject: [PATCH 28/85] fixed --- CUDA/CMakeLists.txt | 2 +- CUDA/inc/gpuNUFFT_kernels.hpp | 2 +- CUDA/inc/gpuNUFFT_operator_factory.hpp | 2 +- CUDA/inc/gpuNUFFT_types.hpp | 32 +++++++++---------- .../gpu/atomic/texture_gpuNUFFT_kernels.cu | 8 ++--- CUDA/src/gpu/std_gpuNUFFT_kernels.cu | 8 ++--- CUDA/src/gpuNUFFT_operator.cpp | 4 +-- setup.py | 2 +- 8 files changed, 30 insertions(+), 30 deletions(-) diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt index 5f80d6b0..06883c6b 100644 --- a/CUDA/CMakeLists.txt +++ b/CUDA/CMakeLists.txt @@ -126,7 +126,7 @@ ENDIF(FERMI_GPU) IF(CMAKE_BUILD_TYPE MATCHES Debug) MESSAGE("debug mode") - list(APPEND CUDA_NVCC_FLAGS ${MY_NVCC_FLAGS} --ptxas-options=-v) + list(APPEND CUDA_NVCC_FLAGS ${MY_NVCC_FLAGS} --ptxas-options=-v -G) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -g -std=c++11") ELSE(CMAKE_BUILD_TYPE) list(APPEND CUDA_NVCC_FLAGS ${MY_NVCC_FLAGS}) diff --git a/CUDA/inc/gpuNUFFT_kernels.hpp b/CUDA/inc/gpuNUFFT_kernels.hpp index ec4dad08..9966becc 100644 --- a/CUDA/inc/gpuNUFFT_kernels.hpp +++ b/CUDA/inc/gpuNUFFT_kernels.hpp @@ -311,7 +311,7 @@ void performTextureForwardConvolution(CufftType *data_d, DType *crds_d, * @param N Problem size N * @param gi_host Info struct with meta information */ -void performFFTScaling(CufftType *data, int N, gpuNUFFT::GpuNUFFTInfo *gi_host); +void performFFTScaling(CufftType *data, long int N, gpuNUFFT::GpuNUFFTInfo *gi_host); /** \brief Scale each element of the input data by the value of the density *compensation function for the corresponding sample point. diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp index 3e1b7a2a..5658803f 100644 --- a/CUDA/inc/gpuNUFFT_operator_factory.hpp +++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp @@ -55,7 +55,7 @@ class GpuNUFFTOperatorFactory * @param useGpu Flag to indicat&GpuNUFFTPythonOperator::adj_op);e gpu usage for precomputation * @param balanceWorkload Flag to indicate load balancing */ - GpuNUFFTOperatorFactory(const bool useTextures = true, const bool useGpu = true, + GpuNUFFTOperatorFactory(const bool useTextures = false, const bool useGpu = true, bool balanceWorkload = true, bool matlabSharedMem = false) : useTextures(useTextures), useGpu(useGpu), balanceWorkload(balanceWorkload), matlabSharedMem(matlabSharedMem) diff --git a/CUDA/inc/gpuNUFFT_types.hpp b/CUDA/inc/gpuNUFFT_types.hpp index 697cbba8..857f8c45 100644 --- a/CUDA/inc/gpuNUFFT_types.hpp +++ b/CUDA/inc/gpuNUFFT_types.hpp @@ -230,27 +230,27 @@ enum OperatorType struct GpuNUFFTInfo { /**\brief Total amount of data samples.*/ - int data_count; + IndType data_count; /**\brief Width in grid units of gridding kernel.*/ - int kernel_width; + IndType kernel_width; /**\brief Squared kernel_width.*/ - int kernel_widthSquared; + IndType kernel_widthSquared; /**\brief Reciprocal value of kernel_widthSquared.*/ DType kernel_widthInvSquared; /**\brief Total amount of kernel entries.*/ - int kernel_count; + IndType kernel_count; /**\brief Radius of kernel relative to grid size.*/ DType kernel_radius; /**\brief Width of oversampled grid.*/ - int grid_width_dim; + IndType grid_width_dim; /**\brief .*/ - int grid_width_offset; + IndType grid_width_offset; /**\brief Reciprocal value of grid_width_dim.*/ DType3 grid_width_inv; /**\brief Total amount of image nodes.*/ - int im_width_dim; + IndType im_width_dim; /**\brief Image offset (imgDims / 2).*/ IndType3 im_width_offset; // used in deapodization @@ -258,22 +258,22 @@ struct GpuNUFFTInfo DType osr; /**\brief Total amount of sectors.*/ - int sector_count; + IndType sector_count; /**\brief Amount of sectors per dimension.*/ - int sector_width; + IndType sector_width; /**\brief Padded sector width (sector_width + kernel_width / 2).*/ - int sector_pad_width; + IndType sector_pad_width; /**\brief Maximum index per dimension of padded sector (sector_pad_width - * 1).*/ - int sector_pad_max; + IndType sector_pad_max; /**\brief Total amount of elements in one padded sector.*/ - int sector_dim; + IndType sector_dim; /**\brief Offset to zero position inside padded sector (sector_pad_width / 2). * Used in combination with the sector center in order to get to the starting * index (bottom left of the front slice) */ - int sector_offset; + IndType sector_offset; /**\brief Distance scale in x direction in case of anisotropic grids.*/ DType aniso_x_scale; @@ -302,12 +302,12 @@ struct GpuNUFFTInfo /**\brief Flag to indicate whether 2-d or 3-d data is processed.*/ bool is2Dprocessing; /**\brief Type used for texture interpolation.*/ - int interpolationType; + IndType interpolationType; /**\brief Total amount of sectors which have to be processed. * Depends on sector load balancing.*/ - int sectorsToProcess; + IndType sectorsToProcess; /**\brief Number of coils processed concurrently */ - int n_coils_cc; + IndType n_coils_cc; }; } diff --git a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu index 5004a3f6..1fa20f88 100644 --- a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu +++ b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu @@ -486,16 +486,16 @@ textureForwardConvolutionFunction(long int *sec, long int sec_max, long int sec_ center.y = sector_centers[sec[threadIdx.x] * 3 + 1]; center.z = sector_centers[sec[threadIdx.x] * 3 + 2]; - __shared__ int sector_ind_offset; + __shared__ long int sector_ind_offset; sector_ind_offset = computeXYZ2Lin(center.x - GI.sector_offset, center.y - GI.sector_offset, center.z - GI.sector_offset, GI.gridDims); // init sector cache // preload sector grid data into cache - for (int ind = threadIdx.x; ind < GI.sector_dim; ind += blockDim.x) + for (long int ind = threadIdx.x; ind < GI.sector_dim; ind += blockDim.x) { - int grid_index; + long int grid_index; getCoordsFromIndex(ind, &i, &j, &k, GI.sector_pad_width); if (isOutlier(i, j, k, center.x, center.y, center.z, GI.gridDims, @@ -516,7 +516,7 @@ textureForwardConvolutionFunction(long int *sec, long int sec_max, long int sec_ __syncthreads(); // Grid Points over Threads - int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset; + long int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset; while (data_cnt < sec_max) { diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu index 5bdc706f..b0fe0e8b 100644 --- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu +++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu @@ -29,7 +29,7 @@ void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count) } else if (std::string("texGDATA").compare(symbol)==0) { - cudaBindTexture(NULL,texGDATA, devicePtr,(unsigned long)count*sizeof(cufftComplex)); + HANDLE_ERROR (cudaBindTexture(NULL,texGDATA, devicePtr,(unsigned long)count*sizeof(cufftComplex))); } } @@ -111,9 +111,9 @@ void freeTexture(const char* symbol, cudaArray* devicePtr) HANDLE_ERROR(cudaFreeArray(devicePtr)); } -__global__ void fftScaleKernel(CufftType* data, DType scaling, int N) +__global__ void fftScaleKernel(CufftType* data, DType scaling, long int N) { - int t = threadIdx.x + blockIdx.x *blockDim.x; + long int t = threadIdx.x + blockIdx.x *blockDim.x; while (t < N) { @@ -129,7 +129,7 @@ __global__ void fftScaleKernel(CufftType* data, DType scaling, int N) } } -void performFFTScaling(CufftType* data,int N, gpuNUFFT::GpuNUFFTInfo* gi_host) +void performFFTScaling(CufftType* data,long int N, gpuNUFFT::GpuNUFFTInfo* gi_host) { dim3 block_dim(64, 1, 8); //dim3 block_dim(THREAD_BLOCK_SIZE); diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 770e6333..b1612d5f 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -1104,7 +1104,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (debugTiming) startTiming(); - int data_count = (int)this->kSpaceTraj.count(); + long int data_count = (int)this->kSpaceTraj.count(); int n_coils = (int)kspaceData.dim.channels; IndType imdata_count = this->imgDims.count(); @@ -1253,7 +1253,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( // write result in correct order back into output array writeOrderedGPU(data_sorted_d, data_indices_d, data_d, - (int)this->kSpaceTraj.count(), n_coils_cc); + (long int)this->kSpaceTraj.count(), n_coils_cc); if(coil_it > 1) { cudaStreamSynchronize(old_stream); diff --git a/setup.py b/setup.py index 30d90731..adde187e 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ def build_extension(self, ext): "-DGEN_PYTHON_FILES=ON", "-DGEN_MEX_FILES=OFF", "-DPYBIND11_INCLUDE_DIR=" + self.pybind_path] - cfg = "Debug" #if self.debug else "Release" + cfg = "Debug" if self.debug else "Release" build_args = ["--config", cfg] if platform.system() == "Windows": From f8b2ccb4cc86f165ff9d0e398477cf63e80a675d Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Thu, 20 Jul 2023 07:48:26 +0000 Subject: [PATCH 29/85] Fixes for missing directories, add CUDA DIR expliocitly --- CUDA/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt index 06883c6b..71260d30 100644 --- a/CUDA/CMakeLists.txt +++ b/CUDA/CMakeLists.txt @@ -94,9 +94,10 @@ endif(GPU_DOUBLE_PREC) SET(FERMI_GPU OFF CACHE BOOL "Enable build for (old) Fermi architectures (Compute capability 2.0)") - +set(MY_NVCC_FLAGS -I${CUDA_INCLUDE_DIRS}) +set(CMAKE_CXX_FLAGS -I${CUDA_INCLUDE_DIRS}) IF(FERMI_GPU) - set(MY_NVCC_FLAGS -gencode arch=compute_30,code=sm_30) + list(APPEND MY_NVCC_FLAGS -gencode arch=compute_30,code=sm_30) list(APPEND MY_NVCC_FLAGS -gencode arch=compute_50,code=sm_50) list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=sm_52) list(APPEND MY_NVCC_FLAGS -gencode=arch=compute_52,code=compute_52) @@ -150,6 +151,7 @@ CONFIGURE_FILE( ${CMAKE_SOURCE_DIR}/inc/cufft_config.hpp.cmake ${CMAKE_SOURCE_DI #Include dirs include_directories(inc) +message(CUDA_INCLUDE_DIRS : ${CUDA_INCLUDE_DIRS}) SET(GPUNUFFT_INC_DIR ${CMAKE_SOURCE_DIR}/inc) SET(GPUNUFFT_INCLUDE ${GPUNUFFT_INC_DIR}/cuda_utils.hpp ${GPUNUFFT_INC_DIR}/cuda_utils.cuh From 357e548874b075307b72c5f5de457541557e2b4c Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Thu, 20 Jul 2023 09:52:19 +0200 Subject: [PATCH 30/85] Update versioning to take concurrency into account --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index adde187e..dafe5ecc 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.3.2", + version="0.4.2", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ From 5ca38c9f693c483a65f3f2a2599d259ab30d4d11 Mon Sep 17 00:00:00 2001 From: Pierre-antoine Comby Date: Sun, 5 Nov 2023 15:47:39 +0100 Subject: [PATCH 31/85] fix: allow for non-integer osf. --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 6 +++--- python/test_nufftOp.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 2bb88f36..41dd48ab 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -73,7 +73,7 @@ class GpuNUFFTPythonOperator public: GpuNUFFTPythonOperator(py::array_t kspace_loc, py::array_t image_size, int num_coils, py::array_t> sense_maps, py::array_t density_comp, int kernel_width=3, - int sector_width=8, int osr=2, bool balance_workload=1) + int sector_width=8, float osf=2, bool balance_workload=1) { // k-space coordinates py::buffer_info sample_loc = kspace_loc.request(); @@ -116,7 +116,7 @@ class GpuNUFFTPythonOperator factory.setBalanceWorkload(balance_workload); gpuNUFFTOp = factory.createGpuNUFFTOperator( kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width, - osr, imgDims); + osf, imgDims); allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2)); kspace_data.dim.length = trajectory_length; kspace_data.dim.channels = n_coils; @@ -226,7 +226,7 @@ class GpuNUFFTPythonOperator }; PYBIND11_MODULE(gpuNUFFT, m) { py::class_(m, "NUFFTOp") - .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool>()) + .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, float, bool>()) .def("op", &GpuNUFFTPythonOperator::op) .def("adj_op", &GpuNUFFTPythonOperator::adj_op) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) diff --git a/python/test_nufftOp.py b/python/test_nufftOp.py index 4d7a1387..68284a15 100644 --- a/python/test_nufftOp.py +++ b/python/test_nufftOp.py @@ -24,7 +24,7 @@ def get_nufft_op(self, sens_maps=None): self.weights, 3, 8, - 2, + 2.0, True, ) From 417e1c6d83410558aff75fd629e0d79da34020be Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 15 Nov 2023 15:55:29 +0100 Subject: [PATCH 32/85] Add pinned memory stuff first code, with debug prints --- .../gpuNUFFT_operator_python_factory.cpp | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 2bb88f36..1f4afe4a 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -107,11 +107,22 @@ class GpuNUFFTPythonOperator } else { - allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2)); - sensArray.dim = imgDims; - sensArray.dim.channels = n_coils; - copyNumpyArray(sense_maps, sensArray.data); - has_sense_data = true; + printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d", cuPointerGetAttribute(sens_maps_buffer.ptr, CU_POINTER_ATTRIBUTE_IS_MANAGED)); + if(cuPointerGetAttribute(sens_maps_buffer.ptr, CU_POINTER_ATTRIBUTE_IS_MANAGED)) + { + printf("The smaps data is pinned!, skipping copies"); + std::complex *t_data = (std::complex *) myData.ptr; + sensArray.data = reinterpret_cast(*t_data); + } + else + { + printf("The smaps data is NOT pinned!, DOING copies"); + allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2)); + sensArray.dim = imgDims; + sensArray.dim.channels = n_coils; + copyNumpyArray(sense_maps, sensArray.data); + has_sense_data = true; + } } factory.setBalanceWorkload(balance_workload); gpuNUFFTOp = factory.createGpuNUFFTOperator( From e3cdb905083e6f49e7792993c6b3eb89759fa916 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 15 Nov 2023 16:01:21 +0100 Subject: [PATCH 33/85] All setup! --- .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 1f4afe4a..bafdb89f 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -107,11 +107,15 @@ class GpuNUFFTPythonOperator } else { - printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d", cuPointerGetAttribute(sens_maps_buffer.ptr, CU_POINTER_ATTRIBUTE_IS_MANAGED)); - if(cuPointerGetAttribute(sens_maps_buffer.ptr, CU_POINTER_ATTRIBUTE_IS_MANAGED)) + bool is_pinned_memory; + // FIXME, check for errors + cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) sense_maps_buffer.ptr); + printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d", is_pinned_memory); + if(is_pinned_memory) { printf("The smaps data is pinned!, skipping copies"); - std::complex *t_data = (std::complex *) myData.ptr; + // Just map the memory to sensArray! We dont need to make a copy if the memory is already pinned + std::complex *t_data = (std::complex *) sense_maps_buffer.ptr; sensArray.data = reinterpret_cast(*t_data); } else From 31e44c92740fe3a31645453dd9500ac02b1c8776 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 15 Nov 2023 16:13:44 +0100 Subject: [PATCH 34/85] Fix cmake link cuda --- CUDA/src/gpu/python/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt index 494255c2..0931003f 100644 --- a/CUDA/src/gpu/python/CMakeLists.txt +++ b/CUDA/src/gpu/python/CMakeLists.txt @@ -20,7 +20,7 @@ if(WIN32) TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${PYTHON_LIBRARIES}) elseif(UNIX) - TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES}) + TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} cuda) endif(WIN32) CUDA_ADD_CUFFT_TO_TARGET(gpuNUFFT) CUDA_ADD_CUBLAS_TO_TARGET(gpuNUFFT) From f524864be77488aefcffaf76734161dda06b563e Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 15 Nov 2023 16:17:57 +0100 Subject: [PATCH 35/85] \n --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index bafdb89f..fae45e08 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -110,17 +110,17 @@ class GpuNUFFTPythonOperator bool is_pinned_memory; // FIXME, check for errors cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) sense_maps_buffer.ptr); - printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d", is_pinned_memory); + printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d\n", is_pinned_memory); if(is_pinned_memory) { - printf("The smaps data is pinned!, skipping copies"); + printf("The smaps data is pinned!, skipping copies\n"); // Just map the memory to sensArray! We dont need to make a copy if the memory is already pinned std::complex *t_data = (std::complex *) sense_maps_buffer.ptr; sensArray.data = reinterpret_cast(*t_data); } else { - printf("The smaps data is NOT pinned!, DOING copies"); + printf("The smaps data is NOT pinned!, DOING copies\n"); allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2)); sensArray.dim = imgDims; sensArray.dim.channels = n_coils; From 4ff0c1a44fdaf974c66edf02b6b9c66c2cf48653 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 15 Nov 2023 16:25:35 +0100 Subject: [PATCH 36/85] mapped --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index fae45e08..c65a8a69 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -109,8 +109,8 @@ class GpuNUFFTPythonOperator { bool is_pinned_memory; // FIXME, check for errors - cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) sense_maps_buffer.ptr); - printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d\n", is_pinned_memory); + cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_MAPPED, (CUdeviceptr) sense_maps_buffer.ptr); + printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %b\n", is_pinned_memory); if(is_pinned_memory) { printf("The smaps data is pinned!, skipping copies\n"); From 1daf1c5be95d04d7c14c1efb83f04e88c786f63b Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 15 Nov 2023 16:25:48 +0100 Subject: [PATCH 37/85] mapped --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index c65a8a69..d1587a3a 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -110,7 +110,7 @@ class GpuNUFFTPythonOperator bool is_pinned_memory; // FIXME, check for errors cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_MAPPED, (CUdeviceptr) sense_maps_buffer.ptr); - printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %b\n", is_pinned_memory); + printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d\n", is_pinned_memory); if(is_pinned_memory) { printf("The smaps data is pinned!, skipping copies\n"); From caf48ac28b88fc871c7ebb1f1f2ae75a718befbd Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 15 Nov 2023 16:39:43 +0100 Subject: [PATCH 38/85] Fix for memory type --- CUDA/src/gpu/python/CMakeLists.txt | 2 +- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt index 0931003f..494255c2 100644 --- a/CUDA/src/gpu/python/CMakeLists.txt +++ b/CUDA/src/gpu/python/CMakeLists.txt @@ -20,7 +20,7 @@ if(WIN32) TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${PYTHON_LIBRARIES}) elseif(UNIX) - TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} cuda) + TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES}) endif(WIN32) CUDA_ADD_CUFFT_TO_TARGET(gpuNUFFT) CUDA_ADD_CUBLAS_TO_TARGET(gpuNUFFT) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index d1587a3a..da267082 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -107,10 +107,11 @@ class GpuNUFFTPythonOperator } else { - bool is_pinned_memory; + cudaPointerAttributes attr; // FIXME, check for errors - cuPointerGetAttribute(&is_pinned_memory, CU_POINTER_ATTRIBUTE_MAPPED, (CUdeviceptr) sense_maps_buffer.ptr); - printf("Value of CU_POINTER_ATTRIBUTE_IS_MANAGED = %d\n", is_pinned_memory); + cudaPointerGetAttributes(&attr, sense_maps_buffer.ptr); + printf("Value of attr.cudaMemoryType = %d\n", attr.type); + bool is_pinned_memory = attr.type == cudaMemoryTypeHost; if(is_pinned_memory) { printf("The smaps data is pinned!, skipping copies\n"); From 59545c959d573edeb84da9316af04e5853b0a0e2 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 15 Nov 2023 16:55:06 +0100 Subject: [PATCH 39/85] Fix pointers pointers --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index da267082..fc83acfd 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -109,7 +109,7 @@ class GpuNUFFTPythonOperator { cudaPointerAttributes attr; // FIXME, check for errors - cudaPointerGetAttributes(&attr, sense_maps_buffer.ptr); + cudaPointerGetAttributes(&attr, &sense_maps_buffer.ptr); printf("Value of attr.cudaMemoryType = %d\n", attr.type); bool is_pinned_memory = attr.type == cudaMemoryTypeHost; if(is_pinned_memory) From dbca20e784389e6875c9457e7d65655df4efeeb2 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Fri, 17 Nov 2023 09:44:49 +0100 Subject: [PATCH 40/85] Added pinned stuff --- .../python/gpuNUFFT_operator_python_factory.cpp | 14 +++++++++----- setup.py | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index fc83acfd..ee2d03fe 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -108,20 +108,24 @@ class GpuNUFFTPythonOperator else { cudaPointerAttributes attr; - // FIXME, check for errors - cudaPointerGetAttributes(&attr, &sense_maps_buffer.ptr); - printf("Value of attr.cudaMemoryType = %d\n", attr.type); + if(DEBUG) + printf("Value of sense_maps pointer == 0x%x or %d\n", sense_maps_buffer.ptr, sense_maps_buffer.ptr); + cudaPointerGetAttributes(&attr, sense_maps_buffer.ptr); + if(DEBUG) + printf("Value of attr.cudaMemoryType2 = %d\n", attr.type); bool is_pinned_memory = attr.type == cudaMemoryTypeHost; if(is_pinned_memory) { - printf("The smaps data is pinned!, skipping copies\n"); + if(DEBUG) + printf("The smaps data is pinned!, skipping copies\n"); // Just map the memory to sensArray! We dont need to make a copy if the memory is already pinned std::complex *t_data = (std::complex *) sense_maps_buffer.ptr; sensArray.data = reinterpret_cast(*t_data); } else { - printf("The smaps data is NOT pinned!, DOING copies\n"); + if(DEBUG) + printf("The smaps data is NOT pinned!, DOING copies\n"); allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2)); sensArray.dim = imgDims; sensArray.dim.channels = n_coils; diff --git a/setup.py b/setup.py index dafe5ecc..14cc18f1 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.4.2", + version="0.4.3", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ From 837bcd0c185dd6814b1f7e9996f4e691f25eb298 Mon Sep 17 00:00:00 2001 From: Pierre-antoine Comby Date: Mon, 20 Nov 2023 10:43:20 +0100 Subject: [PATCH 41/85] feat: add fully on-gpu density compensation estimation. --- .../gpuNUFFT_operator_python_factory.cpp | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index ee2d03fe..c4783f9b 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -237,6 +237,73 @@ class GpuNUFFTPythonOperator has_sense_data = true; gpuNUFFTOp->setSens(sensArray); } + + py::array_t estimate_density_comp(int num_iter = 10) + { + IndType n_samples = kspace_data.count(); + gpuNUFFT::Array densArray; + allocate_pinned_memory(&densArray, n_samples * sizeof(CufftType)); + densArray.dim.length = n_samples; + + // TODO: Allocate directly on device and set with kernel. + for (int cnt = 0; cnt < n_samples; cnt++) + { + densArray.data[cnt].x = 1.0; + densArray.data[cnt].y = 0.0; + } + + gpuNUFFT::GpuArray densArray_gpu; + densArray_gpu.dim.length = n_samples; + allocateDeviceMem(&densArray_gpu.data, n_samples); + + copyToDeviceAsync(densArray.data, densArray_gpu.data, n_samples); + + gpuNUFFT::GpuArray densEstimation_gpu; + densEstimation_gpu.dim.length = n_samples; + allocateDeviceMem(&densEstimation_gpu.data, n_samples); + + gpuNUFFT::GpuArray image_gpu; + image_gpu.dim = imgDims; + allocateDeviceMem(&image_gpu.data, imgDims.count()); + + if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + printf("error at adj thread synchronization a: %s\n", + cudaGetErrorString(cudaGetLastError())); + for (int cnt = 0; cnt < num_iter; cnt++) + { + if (DEBUG) + printf("### update %i\n", cnt); + gpuNUFFTOp->performGpuNUFFTAdj(densArray_gpu, image_gpu, + gpuNUFFT::DENSITY_ESTIMATION); + gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, densEstimation_gpu, + gpuNUFFT::DENSITY_ESTIMATION); + performUpdateDensityComp(densArray_gpu.data, densEstimation_gpu.data, + n_samples); + if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + printf("error at adj thread synchronization d: %s\n", + cudaGetErrorString(cudaGetLastError())); + } + freeDeviceMem(densEstimation_gpu.data); + freeDeviceMem(image_gpu.data); + + cudaDeviceSynchronize(); + // copy only the real part back to cpu + DType *tmp_d = (DType *)densArray_gpu.data; + + gpuNUFFT::Array final_densArray; + final_densArray.dim.length = n_samples; + allocate_pinned_memory(&final_densArray, n_samples * sizeof(DType)); + HANDLE_ERROR(cudaMemcpy2DAsync(final_densArray.data, sizeof(DType), + tmp_d, sizeof(DType2), sizeof(DType), + n_samples, cudaMemcpyDeviceToHost)); + cudaDeviceSynchronize(); + freeDeviceMem(densArray_gpu.data); + DType *ptr = reinterpret_cast(*final_densArray.data); + auto capsule = py::capsule(ptr, [](void *ptr) { return; }); + return py::array_t({ trajectory_length }, { sizeof(DType) }, ptr, + capsule); + } + ~GpuNUFFTPythonOperator() { cudaFreeHost(kspace_data.data); @@ -250,6 +317,7 @@ PYBIND11_MODULE(gpuNUFFT, m) { .def("op", &GpuNUFFTPythonOperator::op) .def("adj_op", &GpuNUFFTPythonOperator::adj_op) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) + .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); } #endif // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED From ae7a1e3fbb37c648853bb67bbac65dfe89ec70ee Mon Sep 17 00:00:00 2001 From: Pierre-antoine Comby Date: Mon, 20 Nov 2023 10:54:00 +0100 Subject: [PATCH 42/85] feat: add power method estimation of the spectral radius. --- .../gpuNUFFT_operator_python_factory.cpp | 62 ++++++++++++++++++- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index ee2d03fe..e08cac84 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -13,13 +13,13 @@ Carole Lazarus #include "cufft.h" #include "cuda_runtime.h" #include -#include +#include +#include #include "config.hpp" #include "gpuNUFFT_operator_factory.hpp" #include // std::sort #include // std::vector #include -#include namespace py = pybind11; @@ -237,6 +237,61 @@ class GpuNUFFTPythonOperator has_sense_data = true; gpuNUFFTOp->setSens(sensArray); } + + float get_spectral_radius(int max_iter = 20,float tolerance = 1e-6) + { + int im_size = image.count(); + + gpuNUFFT::GpuArray x_gpu; + x_gpu.dim = image.dim; + allocateDeviceMem(&x_gpu.data, im_size); + + gpuNUFFT::GpuArray tmp_kspace_gpu; + tmp_kspace_gpu.dim = kspace_data.dim; + allocateDeviceMem(&tmp_kspace_gpu.data, kspace_data.count()); + + cudaDeviceSynchronize(); + DType norm_old = 1.0; + DType norm_new = 1.0; + DType inv_norm = 1.0; + // initialisation: create a random complex image. + curandGenerator_t generator; + curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW); + curandSetPseudoRandomGeneratorSeed(generator, (int)time(NULL)); + + // complex value generator by giving twice the size. + curandGenerateUniform(generator, (DType *)x_gpu.data, 2 * im_size); + // xold = initialize random x of image size. + curandDestroyGenerator(generator); + // Create a handle + cublasHandle_t handle; + cublasCreate(&handle); + + cublasScnrm2(handle, im_size, x_gpu.data, 1, &norm_old); + inv_norm = 1.0 / norm_old; + cublasCsscal(handle, im_size, &inv_norm, x_gpu.data, 1); + + for (int i = 0; i < max_iter; i++) + { + // compute x_new = adj_op(op(x_old)) + gpuNUFFTOp->performForwardGpuNUFFT(x_gpu, tmp_kspace_gpu); + gpuNUFFTOp->performGpuNUFFTAdj(tmp_kspace_gpu, x_gpu); + // compute ||x_new|| + cublasScnrm2(handle, im_size, x_gpu.data, 1, &norm_new); + // x_new <- x_new/ ||x_new|| + inv_norm = 1.0 / norm_new; + + cublasCsscal(handle, im_size, &inv_norm, x_gpu.data, 1); + if (fabs(norm_new - norm_old) < tolerance) + { + break; + } + norm_old = norm_new; + } + freeTotalDeviceMemory(tmp_kspace_gpu.data, x_gpu.data, NULL); + cublasDestroy(handle); + return norm_new; + } ~GpuNUFFTPythonOperator() { cudaFreeHost(kspace_data.data); @@ -250,6 +305,7 @@ PYBIND11_MODULE(gpuNUFFT, m) { .def("op", &GpuNUFFTPythonOperator::op) .def("adj_op", &GpuNUFFTPythonOperator::adj_op) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) - .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); + .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps) + .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius); } #endif // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED From d2bcf7c806ee4925c4cd3c1a23a074299381e14d Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 11:42:11 +0100 Subject: [PATCH 43/85] Working codes for mem allocations --- .../gpuNUFFT_operator_python_factory.cpp | 97 ++++++++++++++++--- 1 file changed, 83 insertions(+), 14 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index ee2d03fe..b443ccba 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -52,6 +52,13 @@ void allocate_pinned_memory(gpuNUFFT::Array *lin_array, unsigned long in cudaMallocHost((void **)&new_data, size); lin_array->data = new_data; } + +void deallocate_pinned_memory(gpuNUFFT::Array *lin_array) +{ + cudaFreeHost(lin_array->data); + lin_array->data = NULL; +} + template void copyNumpyArray(py::array_t> data, TType *copy_data) { @@ -61,19 +68,62 @@ void copyNumpyArray(py::array_t> data, TType *copy_data) memcpy(copy_data, my_data, myData.size*sizeof(TType)); } +enum MemoryAllocationType{ + NEVER_ALLOCATE_MEMORY = 0, + ALLOCATE_MEMORY_IN_CONSTRUCTOR = 1, + ALLOCATE_MEMORY_IN_OP = 2 + }; + class GpuNUFFTPythonOperator { gpuNUFFT::GpuNUFFTOperatorFactory factory; gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp; int trajectory_length, n_coils, dimension; bool has_sense_data; + MemoryAllocationType when_allocate_memory; gpuNUFFT::Dimensions imgDims; // sensitivity maps gpuNUFFT::Array sensArray, kspace_data, image; + void allocate_memory_kspace() + { + allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2)); + kspace_data.dim.length = trajectory_length; + kspace_data.dim.channels = n_coils; + } + void deallocate_memory_kspace() + { + deallocate_pinned_memory(&kspace_data); + kspace_data.dim.length = 0; + kspace_data.dim.channels = 0; + } + + void allocate_memory_image() + { + image.dim = imgDims; + if(has_sense_data == false) + { + allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2)); + image.dim.channels = n_coils; + } + else + { + allocate_pinned_memory(&image, imgDims.count() * sizeof(DType2)); + image.dim.channels = 1; + } + } + void deallocate_memory_image() + { + deallocate_pinned_memory(&image); + image.dim.width = 0; + image.dim.depth = 0; + image.dim.height = 0; + image.dim.channels = 0; + } + public: GpuNUFFTPythonOperator(py::array_t kspace_loc, py::array_t image_size, int num_coils, py::array_t> sense_maps, py::array_t density_comp, int kernel_width=3, - int sector_width=8, int osr=2, bool balance_workload=1) + int sector_width=8, int osr=2, bool balance_workload=1, MemoryAllocationType when_allocate_memory=ALLOCATE_MEMORY_IN_CONSTRUCTOR) : when_allocate_memory(when_allocate_memory) { // k-space coordinates py::buffer_info sample_loc = kspace_loc.request(); @@ -137,25 +187,22 @@ class GpuNUFFTPythonOperator gpuNUFFTOp = factory.createGpuNUFFTOperator( kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width, osr, imgDims); - allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2)); - kspace_data.dim.length = trajectory_length; - kspace_data.dim.channels = n_coils; - image.dim = imgDims; - if(has_sense_data == false) - { - allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2)); - image.dim.channels = n_coils; - } - else + + if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) { - allocate_pinned_memory(&image, imgDims.count() * sizeof(DType2)); - image.dim.channels = 1; + allocate_memory_kspace(); + allocate_memory_image(); } cudaDeviceSynchronize(); } py::array_t> op(py::array_t> input_image, bool interpolate_data=false) { + if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP) + { + allocate_memory_kspace(); + allocate_memory_image(); + } // Copy array to pinned memory for better memory bandwidths! copyNumpyArray(input_image, image.data); if(interpolate_data) @@ -166,6 +213,11 @@ class GpuNUFFTPythonOperator std::complex *ptr = reinterpret_cast(&)[0]>(*kspace_data.data); auto capsule = py::capsule(ptr, [](void *ptr) { return; }); + if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP) + { + // Deallocate the memory (only image) to prevent memory leaks! + deallocate_memory_image(); + } return py::array_t>( { n_coils, trajectory_length }, { @@ -178,6 +230,11 @@ class GpuNUFFTPythonOperator } py::array_t> adj_op(py::array_t> input_kspace_data, bool grid_data=false) { + if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP) + { + allocate_memory_kspace(); + allocate_memory_image(); + } gpuNUFFT::Dimensions myDims = imgDims; if(dimension==2) myDims.depth = 1; @@ -190,6 +247,11 @@ class GpuNUFFTPythonOperator std::complex *ptr = reinterpret_cast(&)[0]>(*image.data); auto capsule = py::capsule(ptr, [](void *ptr) { return; }); + if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP) + { + // Deallocate the memory (only k-space) to prevent memory leaks! + deallocate_memory_kspace(); + } if(has_sense_data == false) return py::array_t>( { @@ -246,10 +308,17 @@ class GpuNUFFTPythonOperator }; PYBIND11_MODULE(gpuNUFFT, m) { py::class_(m, "NUFFTOp") - .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool>()) + .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool, MemoryAllocationType>()) // FIXME : Add defaul values! .def("op", &GpuNUFFTPythonOperator::op) .def("adj_op", &GpuNUFFTPythonOperator::adj_op) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); + + py::enum_(m, "MemoryAllocationType") + .value("NEVER_ALLOCATE_MEMORY", MemoryAllocationType::NEVER_ALLOCATE_MEMORY) + .value("ALLOCATE_MEMORY_IN_CONSTRUCTOR", MemoryAllocationType::ALLOCATE_MEMORY_IN_CONSTRUCTOR) + .value("ALLOCATE_MEMORY_IN_OP", MemoryAllocationType::ALLOCATE_MEMORY_IN_OP) + .export_values(); + } #endif // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED From 11ef27d1bbc3c7f70cd89a80a8610c81f9ebb8c2 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 13:25:39 +0100 Subject: [PATCH 44/85] Working added additional optional input --- .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 10 +++++----- setup.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index b443ccba..8f7e32fe 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -196,7 +196,7 @@ class GpuNUFFTPythonOperator cudaDeviceSynchronize(); } - py::array_t> op(py::array_t> input_image, bool interpolate_data=false) + py::array_t> op(py::array_t> input_image, bool interpolate_data, std::optional>> out_kspace) { if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP) { @@ -228,7 +228,7 @@ class GpuNUFFTPythonOperator capsule ); } - py::array_t> adj_op(py::array_t> input_kspace_data, bool grid_data=false) + py::array_t> adj_op(py::array_t> input_kspace, bool grid_data, std::optional>> out_image) { if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP) { @@ -238,7 +238,7 @@ class GpuNUFFTPythonOperator gpuNUFFT::Dimensions myDims = imgDims; if(dimension==2) myDims.depth = 1; - copyNumpyArray(input_kspace_data, kspace_data.data); + copyNumpyArray(input_kspace, kspace_data.data); if(grid_data) gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION); else @@ -309,8 +309,8 @@ class GpuNUFFTPythonOperator PYBIND11_MODULE(gpuNUFFT, m) { py::class_(m, "NUFFTOp") .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool, MemoryAllocationType>()) // FIXME : Add defaul values! - .def("op", &GpuNUFFTPythonOperator::op) - .def("adj_op", &GpuNUFFTPythonOperator::adj_op) + .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none()) + .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none()) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); diff --git a/setup.py b/setup.py index 14cc18f1..bd0f53e7 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.4.3", + version="0.5.0", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ From b7cf9b8c313d896415fe605db9b7f4ba5c71a057 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 13:40:29 +0100 Subject: [PATCH 45/85] Completed coding the entire end to end --- .../gpuNUFFT_operator_python_factory.cpp | 59 +++++++++++++------ 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 8f7e32fe..31253bbc 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -68,6 +68,15 @@ void copyNumpyArray(py::array_t> data, TType *copy_data) memcpy(copy_data, my_data, myData.size*sizeof(TType)); } +template +void cast_pointer(py::array_t> data, gpuNUFFT::Array ©_data) +{ + py::buffer_info myData = data.request(); + std::complex *t_data = (std::complex *) myData.ptr; + TType *my_data = reinterpret_cast(*t_data); + copy_data.data = my_data; +} + enum MemoryAllocationType{ NEVER_ALLOCATE_MEMORY = 0, ALLOCATE_MEMORY_IN_CONSTRUCTOR = 1, @@ -90,12 +99,6 @@ class GpuNUFFTPythonOperator kspace_data.dim.length = trajectory_length; kspace_data.dim.channels = n_coils; } - void deallocate_memory_kspace() - { - deallocate_pinned_memory(&kspace_data); - kspace_data.dim.length = 0; - kspace_data.dim.channels = 0; - } void allocate_memory_image() { @@ -111,14 +114,6 @@ class GpuNUFFTPythonOperator image.dim.channels = 1; } } - void deallocate_memory_image() - { - deallocate_pinned_memory(&image); - image.dim.width = 0; - image.dim.depth = 0; - image.dim.height = 0; - image.dim.channels = 0; - } public: GpuNUFFTPythonOperator(py::array_t kspace_loc, py::array_t image_size, int num_coils, @@ -202,9 +197,21 @@ class GpuNUFFTPythonOperator { allocate_memory_kspace(); allocate_memory_image(); + // Copy array to pinned memory for better memory bandwidths! + copyNumpyArray(input_image, image.data); + } + else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY) + { + cast_pointer(input_image, image); + if(out_kspace.has_value()) + cast_pointer(out_kspace.value(), kspace_data); + else + { + // We dont have out_kspace allocated. Warn and then allocate + py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!"); + allocate_memory_kspace(); + } } - // Copy array to pinned memory for better memory bandwidths! - copyNumpyArray(input_image, image.data); if(interpolate_data) gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); else @@ -216,7 +223,7 @@ class GpuNUFFTPythonOperator if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP) { // Deallocate the memory (only image) to prevent memory leaks! - deallocate_memory_image(); + deallocate_pinned_memory(&image); } return py::array_t>( { n_coils, trajectory_length }, @@ -234,11 +241,25 @@ class GpuNUFFTPythonOperator { allocate_memory_kspace(); allocate_memory_image(); + // Copy array to pinned memory for better memory bandwidths! + copyNumpyArray(input_kspace, kspace_data.data); + } + else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY) + { + cast_pointer(input_kspace, kspace_data); + // Check if we have out image allocated + if (out_image.has_value()) + cast_pointer(out_image.value(), image); + else + { + // We dont have out_image allocated. Warn and then allocate + py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!"); + allocate_memory_image(); + } } gpuNUFFT::Dimensions myDims = imgDims; if(dimension==2) myDims.depth = 1; - copyNumpyArray(input_kspace, kspace_data.data); if(grid_data) gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION); else @@ -250,7 +271,7 @@ class GpuNUFFTPythonOperator if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP) { // Deallocate the memory (only k-space) to prevent memory leaks! - deallocate_memory_kspace(); + deallocate_pinned_memory(&kspace_data); } if(has_sense_data == false) return py::array_t>( From 0d3dcf59195f3f3c1690bff9fa8de07161cfebd7 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 13:45:52 +0100 Subject: [PATCH 46/85] Completed coding the entire end to end --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 31253bbc..69fd8d41 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -329,7 +329,9 @@ class GpuNUFFTPythonOperator }; PYBIND11_MODULE(gpuNUFFT, m) { py::class_(m, "NUFFTOp") - .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool, MemoryAllocationType>()) // FIXME : Add defaul values! + .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool, MemoryAllocationType>(), + py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_CONSTRUCTOR + ) // FIXME : Add defaul values! .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none()) .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none()) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) From 3f73176c0b79e2859b3969fe2fbea48da58d8064 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 15:19:25 +0100 Subject: [PATCH 47/85] Added codes --- .../gpuNUFFT_operator_python_factory.cpp | 20 +++++++++++-------- python/test_file.py | 13 ------------ 2 files changed, 12 insertions(+), 21 deletions(-) delete mode 100644 python/test_file.py diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 69fd8d41..05b13bd8 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -9,6 +9,7 @@ Carole Lazarus #define GPUNUFFT_OPERATOR_PYTHON_FACTORY_H_INCLUDED #include #include +#include #include #include "cufft.h" #include "cuda_runtime.h" @@ -21,7 +22,6 @@ Carole Lazarus #include #include - namespace py = pybind11; template @@ -202,7 +202,9 @@ class GpuNUFFTPythonOperator } else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY) { + printf("Not allocating memory in op\n"); cast_pointer(input_image, image); + printf("Output kspace : %d\n", out_kspace.has_value()); if(out_kspace.has_value()) cast_pointer(out_kspace.value(), kspace_data); else @@ -210,6 +212,7 @@ class GpuNUFFTPythonOperator // We dont have out_kspace allocated. Warn and then allocate py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!"); allocate_memory_kspace(); + cudaDeviceSynchronize(); } } if(interpolate_data) @@ -327,21 +330,22 @@ class GpuNUFFTPythonOperator delete gpuNUFFTOp; } }; + PYBIND11_MODULE(gpuNUFFT, m) { + py::enum_(m, "MemoryAllocationType") + .value("NEVER_ALLOCATE_MEMORY", MemoryAllocationType::NEVER_ALLOCATE_MEMORY) + .value("ALLOCATE_MEMORY_IN_CONSTRUCTOR", MemoryAllocationType::ALLOCATE_MEMORY_IN_CONSTRUCTOR) + .value("ALLOCATE_MEMORY_IN_OP", MemoryAllocationType::ALLOCATE_MEMORY_IN_OP) + .export_values(); + py::class_(m, "NUFFTOp") .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool, MemoryAllocationType>(), py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_CONSTRUCTOR - ) // FIXME : Add defaul values! + ) .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none()) .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none()) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); - py::enum_(m, "MemoryAllocationType") - .value("NEVER_ALLOCATE_MEMORY", MemoryAllocationType::NEVER_ALLOCATE_MEMORY) - .value("ALLOCATE_MEMORY_IN_CONSTRUCTOR", MemoryAllocationType::ALLOCATE_MEMORY_IN_CONSTRUCTOR) - .value("ALLOCATE_MEMORY_IN_OP", MemoryAllocationType::ALLOCATE_MEMORY_IN_OP) - .export_values(); - } #endif // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED diff --git a/python/test_file.py b/python/test_file.py deleted file mode 100644 index e150184f..00000000 --- a/python/test_file.py +++ /dev/null @@ -1,13 +0,0 @@ -import numpy as np -from mri.operators import NonCartesianFFT -from mri.operators.fourier.utils import estimate_density_compensation -traj = np.load('/volatile/temp_traj.npy') - - -for i in range(1): - dens = estimate_density_compensation(traj, (384, 384, 208)) - fourier = NonCartesianFFT(traj, (384, 384, 208), 'gpuNUFFT', n_coils=20, smaps=np.ones((20, 384, 384, 208)), osf=2, density_comp=dens) - print(i) - K = fourier.op(np.zeros((384, 384, 208))) - I = fourier.adj_op(K) - del fourier From 04b420707112bd6917fb8039b1f793bdb66a3a3b Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 15:24:46 +0100 Subject: [PATCH 48/85] Fixes --- .../gpuNUFFT_operator_python_factory.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 05b13bd8..236546da 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -96,23 +96,16 @@ class GpuNUFFTPythonOperator void allocate_memory_kspace() { allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2)); - kspace_data.dim.length = trajectory_length; - kspace_data.dim.channels = n_coils; + } void allocate_memory_image() { image.dim = imgDims; if(has_sense_data == false) - { allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2)); - image.dim.channels = n_coils; - } else - { allocate_pinned_memory(&image, imgDims.count() * sizeof(DType2)); - image.dim.channels = 1; - } } public: @@ -142,7 +135,9 @@ class GpuNUFFTPythonOperator imgDims.depth = 0; n_coils = num_coils; - + kspace_data.dim.length = trajectory_length; + kspace_data.dim.channels = n_coils; + // sensitivity maps py::buffer_info sense_maps_buffer = sense_maps.request(); if (sense_maps_buffer.shape.size()==0) @@ -182,7 +177,11 @@ class GpuNUFFTPythonOperator gpuNUFFTOp = factory.createGpuNUFFTOperator( kSpaceTraj, density_compArray, sensArray, kernel_width, sector_width, osr, imgDims); - + + if(has_sense_data == false) + image.dim.channels = n_coils; + else + image.dim.channels = 1; if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) { allocate_memory_kspace(); From 7d17703e99cb13b0c3ffcbb261b1b6c2020eda47 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 15:29:29 +0100 Subject: [PATCH 49/85] fixes --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 236546da..46c4d4b5 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -96,7 +96,6 @@ class GpuNUFFTPythonOperator void allocate_memory_kspace() { allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2)); - } void allocate_memory_image() @@ -135,9 +134,12 @@ class GpuNUFFTPythonOperator imgDims.depth = 0; n_coils = num_coils; + + // Setup all the sizes kspace_data.dim.length = trajectory_length; - kspace_data.dim.channels = n_coils; - + kspace_data.dim.channels = num_coils; + image.dim = imgDims; + // sensitivity maps py::buffer_info sense_maps_buffer = sense_maps.request(); if (sense_maps_buffer.shape.size()==0) From 08059e7cd650fc0d8ca25df731726a49f3fccfed Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 15:36:02 +0100 Subject: [PATCH 50/85] major fixes --- .../python/gpuNUFFT_operator_python_factory.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 46c4d4b5..93fffc65 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -100,7 +100,6 @@ class GpuNUFFTPythonOperator void allocate_memory_image() { - image.dim = imgDims; if(has_sense_data == false) allocate_pinned_memory(&image, n_coils * imgDims.count() * sizeof(DType2)); else @@ -172,8 +171,8 @@ class GpuNUFFTPythonOperator sensArray.dim = imgDims; sensArray.dim.channels = n_coils; copyNumpyArray(sense_maps, sensArray.data); - has_sense_data = true; } + has_sense_data = true; } factory.setBalanceWorkload(balance_workload); gpuNUFFTOp = factory.createGpuNUFFTOperator( @@ -198,8 +197,6 @@ class GpuNUFFTPythonOperator { allocate_memory_kspace(); allocate_memory_image(); - // Copy array to pinned memory for better memory bandwidths! - copyNumpyArray(input_image, image.data); } else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY) { @@ -216,6 +213,11 @@ class GpuNUFFTPythonOperator cudaDeviceSynchronize(); } } + if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP) + { + // Copy array to pinned memory for better memory bandwidths! + copyNumpyArray(input_image, image.data); + } if(interpolate_data) gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); else @@ -245,8 +247,6 @@ class GpuNUFFTPythonOperator { allocate_memory_kspace(); allocate_memory_image(); - // Copy array to pinned memory for better memory bandwidths! - copyNumpyArray(input_kspace, kspace_data.data); } else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY) { @@ -261,6 +261,11 @@ class GpuNUFFTPythonOperator allocate_memory_image(); } } + if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP) + { + // Copy array to pinned memory for better memory bandwidths! + copyNumpyArray(input_kspace, kspace_data.data); + } gpuNUFFT::Dimensions myDims = imgDims; if(dimension==2) myDims.depth = 1; From 10d461822962b5b6f13117464b3ccd8c8bf19174 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 15:42:55 +0100 Subject: [PATCH 51/85] Fix density comp --- .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 93fffc65..ab248a31 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -108,7 +108,7 @@ class GpuNUFFTPythonOperator public: GpuNUFFTPythonOperator(py::array_t kspace_loc, py::array_t image_size, int num_coils, - py::array_t> sense_maps, py::array_t density_comp, int kernel_width=3, + py::array_t> sense_maps, std::optional> density_comp, int kernel_width=3, int sector_width=8, int osr=2, bool balance_workload=1, MemoryAllocationType when_allocate_memory=ALLOCATE_MEMORY_IN_CONSTRUCTOR) : when_allocate_memory(when_allocate_memory) { // k-space coordinates @@ -119,8 +119,13 @@ class GpuNUFFTPythonOperator kSpaceTraj.dim.length = trajectory_length; // density compensation weights - gpuNUFFT::Array density_compArray = readNumpyArray(density_comp); - density_compArray.dim.length = trajectory_length; + gpuNUFFT::Array density_compArray; + if(density_comp.has_value()) + { + density_compArray = readNumpyArray(density_comp.value()); + density_compArray.dim.length = trajectory_length; + // No need else as the init is by default with 0 length and density comp is not applied + } // image size py::buffer_info img_dim = image_size.request(); From e656fc40ba95393521264859012378d24bc04a6f Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 15:57:55 +0100 Subject: [PATCH 52/85] All fixes --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index ab248a31..beedd73b 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -108,7 +108,7 @@ class GpuNUFFTPythonOperator public: GpuNUFFTPythonOperator(py::array_t kspace_loc, py::array_t image_size, int num_coils, - py::array_t> sense_maps, std::optional> density_comp, int kernel_width=3, + py::array_t> sense_maps, std::optional> density_comp, int kernel_width=3, int sector_width=8, int osr=2, bool balance_workload=1, MemoryAllocationType when_allocate_memory=ALLOCATE_MEMORY_IN_CONSTRUCTOR) : when_allocate_memory(when_allocate_memory) { // k-space coordinates @@ -350,7 +350,7 @@ PYBIND11_MODULE(gpuNUFFT, m) { .export_values(); py::class_(m, "NUFFTOp") - .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, int, bool, MemoryAllocationType>(), + .def(py::init, py::array_t, int, py::array_t>, std::optional>, int, int, int, bool, MemoryAllocationType>(), py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_CONSTRUCTOR ) .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none()) From 4d993b1752713e2f8010cd0cbbcba78dfdd7ef15 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 16:03:42 +0100 Subject: [PATCH 53/85] Fix free --- .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index beedd73b..ac0c2c84 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -205,9 +205,7 @@ class GpuNUFFTPythonOperator } else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY) { - printf("Not allocating memory in op\n"); cast_pointer(input_image, image); - printf("Output kspace : %d\n", out_kspace.has_value()); if(out_kspace.has_value()) cast_pointer(out_kspace.value(), kspace_data); else @@ -336,8 +334,12 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { - cudaFreeHost(kspace_data.data); - cudaFreeHost(image.data); + if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) + { + cudaFreeHost(kspace_data.data); + cudaFreeHost(image.data); + + } delete gpuNUFFTOp; } }; From 57d0160f18dae672ce07e78331b83e6ba458e8de Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 20 Nov 2023 16:04:00 +0100 Subject: [PATCH 54/85] added tests --- python/test_mem.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 python/test_mem.py diff --git a/python/test_mem.py b/python/test_mem.py new file mode 100644 index 00000000..80109902 --- /dev/null +++ b/python/test_mem.py @@ -0,0 +1,31 @@ +"""Script to test gpuNUFFT wrapper. +Authors: +Chaithya G R +""" + +import numpy as np +from gpuNUFFT import NUFFTOp, MemoryAllocationType +import pytest + + +def test_memory_allocation_types(): + kspace_loc = np.random.random((5000, 3)) - 0.5 + img_size = [256, 256, 256] + n_coils = 1 + image = np.random.random(img_size) + 1j * np.random.random(img_size) + kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0])) + kspace_out = [] + images_out = [] + for mem_allocation_type in list(MemoryAllocationType.__members__.values()): + nufft_op = NUFFTOp( + kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32), + image_size=img_size, + num_coils=n_coils, + when_allocate_memory=mem_allocation_type, + ) + kspace_out.append(nufft_op.op(input_image=image)) + images_out.append(nufft_op.adj_op(kspace)) + kspace_out + images_out + images_out + \ No newline at end of file From 212469371d9b93071cbd3227b6eaaca8cb40dbd4 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 21 Nov 2023 10:09:42 +0100 Subject: [PATCH 55/85] Temp fixes --- .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 13 ++++++------- python/test_mem.py | 11 +++++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index ac0c2c84..85cb8573 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -213,7 +213,6 @@ class GpuNUFFTPythonOperator // We dont have out_kspace allocated. Warn and then allocate py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!"); allocate_memory_kspace(); - cudaDeviceSynchronize(); } } if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP) @@ -334,12 +333,12 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { - if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) - { - cudaFreeHost(kspace_data.data); - cudaFreeHost(image.data); - - } + py::print("Destructor called :: ", when_allocate_memory); + // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) + // { + // cudaFreeHost(kspace_data.data); + // cudaFreeHost(image.data); + // } delete gpuNUFFTOp; } }; diff --git a/python/test_mem.py b/python/test_mem.py index 80109902..17f9733f 100644 --- a/python/test_mem.py +++ b/python/test_mem.py @@ -16,15 +16,18 @@ def test_memory_allocation_types(): kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0])) kspace_out = [] images_out = [] + nufft_ops = [] for mem_allocation_type in list(MemoryAllocationType.__members__.values()): - nufft_op = NUFFTOp( + nufft_ops.append(NUFFTOp( kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32), image_size=img_size, num_coils=n_coils, when_allocate_memory=mem_allocation_type, - ) - kspace_out.append(nufft_op.op(input_image=image)) - images_out.append(nufft_op.adj_op(kspace)) + )) + kspace_out.append(nufft_ops[-1].op(input_image=image)) + images_out.append(nufft_ops[-1].adj_op(input_kspace=kspace)) + if len(nufft_ops) > 1: + del nufft_ops[-2] kspace_out images_out images_out From e9c9fef0c444ea68cbd9aca748c182a6077ec756 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 21 Nov 2023 10:20:36 +0100 Subject: [PATCH 56/85] Test --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 85cb8573..89bec316 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -334,10 +334,11 @@ class GpuNUFFTPythonOperator ~GpuNUFFTPythonOperator() { py::print("Destructor called :: ", when_allocate_memory); + // We cant deallocate as we could have passed the memory to python! // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) // { - // cudaFreeHost(kspace_data.data); - // cudaFreeHost(image.data); + // deallocate_pinned_memory(&kspace_data); + // deallocate_pinned_memory(&image); // } delete gpuNUFFTOp; } From 38cefe94bc3359b347731d141011367d70a5ea94 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 21 Nov 2023 10:44:28 +0100 Subject: [PATCH 57/85] Added memory warnings --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 89bec316..d6483950 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -190,6 +190,10 @@ class GpuNUFFTPythonOperator image.dim.channels = 1; if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) { + py::print(" + WARNING: Allocation in Memory will be deprecated in futurte due to memory handeling issues.\n + Please consider providing pinned memory yourself for speed and efficiency" + ); allocate_memory_kspace(); allocate_memory_image(); } @@ -335,6 +339,7 @@ class GpuNUFFTPythonOperator { py::print("Destructor called :: ", when_allocate_memory); // We cant deallocate as we could have passed the memory to python! + // FIXME, we will no longer support this! // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) // { // deallocate_pinned_memory(&kspace_data); From 56298f83b631d48cfa0b4f5aa0ffb7a5aefceda1 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 21 Nov 2023 11:15:46 +0100 Subject: [PATCH 58/85] Added additional tests, just before removing all options! --- .../gpuNUFFT_operator_python_factory.cpp | 11 ++--- python/test_mem.py | 48 +++++++++++++++---- python/test_nufftOp.py | 1 - 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index d6483950..a5317fa5 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -190,10 +190,10 @@ class GpuNUFFTPythonOperator image.dim.channels = 1; if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) { - py::print(" - WARNING: Allocation in Memory will be deprecated in futurte due to memory handeling issues.\n - Please consider providing pinned memory yourself for speed and efficiency" - ); + py::print(" \ + WARNING: Allocation in Memory will be deprecated in futurte due to memory handeling issues.\ + \nPlease consider providing pinned memory yourself for speed and efficiency\ + "); allocate_memory_kspace(); allocate_memory_image(); } @@ -337,7 +337,6 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { - py::print("Destructor called :: ", when_allocate_memory); // We cant deallocate as we could have passed the memory to python! // FIXME, we will no longer support this! // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) @@ -358,7 +357,7 @@ PYBIND11_MODULE(gpuNUFFT, m) { py::class_(m, "NUFFTOp") .def(py::init, py::array_t, int, py::array_t>, std::optional>, int, int, int, bool, MemoryAllocationType>(), - py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_CONSTRUCTOR + py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_OP ) .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none()) .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none()) diff --git a/python/test_mem.py b/python/test_mem.py index 17f9733f..93adb852 100644 --- a/python/test_mem.py +++ b/python/test_mem.py @@ -8,6 +8,8 @@ import pytest + + def test_memory_allocation_types(): kspace_loc = np.random.random((5000, 3)) - 0.5 img_size = [256, 256, 256] @@ -16,19 +18,49 @@ def test_memory_allocation_types(): kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0])) kspace_out = [] images_out = [] - nufft_ops = [] + nufft_op = [] for mem_allocation_type in list(MemoryAllocationType.__members__.values()): - nufft_ops.append(NUFFTOp( + nufft_op = NUFFTOp( kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32), image_size=img_size, num_coils=n_coils, when_allocate_memory=mem_allocation_type, - )) - kspace_out.append(nufft_ops[-1].op(input_image=image)) - images_out.append(nufft_ops[-1].adj_op(input_kspace=kspace)) - if len(nufft_ops) > 1: - del nufft_ops[-2] + ) + kspace_out.append(nufft_op.op(input_image=image)) + images_out.append(nufft_op.adj_op(input_kspace=kspace)) + del nufft_op kspace_out images_out images_out - \ No newline at end of file + + +def test_pinned_memory_provided(): + import cupyx as cpx + + kspace_loc = np.random.random((5000, 3)) - 0.5 + img_size = [256, 256, 256] + n_coils = 1 + image = np.random.random(img_size) + 1j * np.random.random(img_size) + kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0])) + + image_out = cpx.empty_like_pinned(image) + kspace_out = cpx.empty_like_pinned(kspace) + + nufft_ori = NUFFTOp( + kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32), + image_size=img_size, + num_coils=n_coils, + when_allocate_memory=MemoryAllocationType.ALLOCATE_MEMORY_IN_OP, + ) + ori_kspace_out = nufft_ori.op(input_image=image) + ori_image_out = nufft_ori.adj_op(input_kspace=kspace) + + nufft_op = NUFFTOp( + kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32), + image_size=img_size, + num_coils=n_coils, + when_allocate_memory=MemoryAllocationType.NEVER_ALLOCATE_MEMORY, + ) + out_ksp = nufft_op.op(input_image=image, out_kspace=kspace_out) + out_im = nufft_op.adj_op(input_kspace=kspace, out_image=image_out) + out_ksp \ No newline at end of file diff --git a/python/test_nufftOp.py b/python/test_nufftOp.py index 4d7a1387..8e272fdb 100644 --- a/python/test_nufftOp.py +++ b/python/test_nufftOp.py @@ -5,7 +5,6 @@ """ import numpy as np -import numpy.matlib import matplotlib.pyplot as plt from gpuNUFFT import NUFFTOp import unittest From 436a875152ef8478d58cc30220545a15219b9e9b Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 21 Nov 2023 11:21:18 +0100 Subject: [PATCH 59/85] Added additional tests, just before removing all options! --- .../gpuNUFFT_operator_python_factory.cpp | 105 +++--------------- 1 file changed, 13 insertions(+), 92 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index a5317fa5..4cfb4e1e 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -77,11 +77,6 @@ void cast_pointer(py::array_t> data, gpuNUFFT::Array copy_data.data = my_data; } -enum MemoryAllocationType{ - NEVER_ALLOCATE_MEMORY = 0, - ALLOCATE_MEMORY_IN_CONSTRUCTOR = 1, - ALLOCATE_MEMORY_IN_OP = 2 - }; class GpuNUFFTPythonOperator { @@ -89,7 +84,6 @@ class GpuNUFFTPythonOperator gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp; int trajectory_length, n_coils, dimension; bool has_sense_data; - MemoryAllocationType when_allocate_memory; gpuNUFFT::Dimensions imgDims; // sensitivity maps gpuNUFFT::Array sensArray, kspace_data, image; @@ -109,7 +103,7 @@ class GpuNUFFTPythonOperator public: GpuNUFFTPythonOperator(py::array_t kspace_loc, py::array_t image_size, int num_coils, py::array_t> sense_maps, std::optional> density_comp, int kernel_width=3, - int sector_width=8, int osr=2, bool balance_workload=1, MemoryAllocationType when_allocate_memory=ALLOCATE_MEMORY_IN_CONSTRUCTOR) : when_allocate_memory(when_allocate_memory) + int sector_width=8, int osr=2, bool balance_workload=1) { // k-space coordinates py::buffer_info sample_loc = kspace_loc.request(); @@ -188,42 +182,13 @@ class GpuNUFFTPythonOperator image.dim.channels = n_coils; else image.dim.channels = 1; - if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) - { - py::print(" \ - WARNING: Allocation in Memory will be deprecated in futurte due to memory handeling issues.\ - \nPlease consider providing pinned memory yourself for speed and efficiency\ - "); - allocate_memory_kspace(); - allocate_memory_image(); - } cudaDeviceSynchronize(); } - py::array_t> op(py::array_t> input_image, bool interpolate_data, std::optional>> out_kspace) + py::array_t> op(py::array_t> in_image, py::array_t> out_kspace, bool interpolate_data) { - if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP) - { - allocate_memory_kspace(); - allocate_memory_image(); - } - else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY) - { - cast_pointer(input_image, image); - if(out_kspace.has_value()) - cast_pointer(out_kspace.value(), kspace_data); - else - { - // We dont have out_kspace allocated. Warn and then allocate - py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!"); - allocate_memory_kspace(); - } - } - if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP) - { - // Copy array to pinned memory for better memory bandwidths! - copyNumpyArray(input_image, image.data); - } + cast_pointer(in_image, image); + cast_pointer(out_kspace.value(), kspace_data); if(interpolate_data) gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); else @@ -232,11 +197,6 @@ class GpuNUFFTPythonOperator std::complex *ptr = reinterpret_cast(&)[0]>(*kspace_data.data); auto capsule = py::capsule(ptr, [](void *ptr) { return; }); - if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP) - { - // Deallocate the memory (only image) to prevent memory leaks! - deallocate_pinned_memory(&image); - } return py::array_t>( { n_coils, trajectory_length }, { @@ -247,31 +207,10 @@ class GpuNUFFTPythonOperator capsule ); } - py::array_t> adj_op(py::array_t> input_kspace, bool grid_data, std::optional>> out_image) + py::array_t> adj_op(py::array_t> in_kspace, py::array_t> out_image, bool grid_data) { - if(when_allocate_memory == ALLOCATE_MEMORY_IN_OP) - { - allocate_memory_kspace(); - allocate_memory_image(); - } - else if(when_allocate_memory == NEVER_ALLOCATE_MEMORY) - { - cast_pointer(input_kspace, kspace_data); - // Check if we have out image allocated - if (out_image.has_value()) - cast_pointer(out_image.value(), image); - else - { - // We dont have out_image allocated. Warn and then allocate - py::print("WARNING: NEVER_ALLOCATE_MEMORY is chosen but no memory is specified, allocating for now!"); - allocate_memory_image(); - } - } - if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR || when_allocate_memory == ALLOCATE_MEMORY_IN_OP) - { - // Copy array to pinned memory for better memory bandwidths! - copyNumpyArray(input_kspace, kspace_data.data); - } + cast_pointer(in_kspace, kspace_data); + cast_pointer(out_image.value(), image); gpuNUFFT::Dimensions myDims = imgDims; if(dimension==2) myDims.depth = 1; @@ -283,11 +222,6 @@ class GpuNUFFTPythonOperator std::complex *ptr = reinterpret_cast(&)[0]>(*image.data); auto capsule = py::capsule(ptr, [](void *ptr) { return; }); - if (when_allocate_memory == ALLOCATE_MEMORY_IN_OP) - { - // Deallocate the memory (only k-space) to prevent memory leaks! - deallocate_pinned_memory(&kspace_data); - } if(has_sense_data == false) return py::array_t>( { @@ -320,12 +254,13 @@ class GpuNUFFTPythonOperator ptr, capsule ); - } + void clean_memory() { gpuNUFFTOp->clean_memory(); } + void set_smaps(py::array_t> sense_maps) { py::buffer_info myData = sense_maps.request(); @@ -337,30 +272,16 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { - // We cant deallocate as we could have passed the memory to python! - // FIXME, we will no longer support this! - // if(when_allocate_memory == ALLOCATE_MEMORY_IN_CONSTRUCTOR) - // { - // deallocate_pinned_memory(&kspace_data); - // deallocate_pinned_memory(&image); - // } delete gpuNUFFTOp; } }; PYBIND11_MODULE(gpuNUFFT, m) { - py::enum_(m, "MemoryAllocationType") - .value("NEVER_ALLOCATE_MEMORY", MemoryAllocationType::NEVER_ALLOCATE_MEMORY) - .value("ALLOCATE_MEMORY_IN_CONSTRUCTOR", MemoryAllocationType::ALLOCATE_MEMORY_IN_CONSTRUCTOR) - .value("ALLOCATE_MEMORY_IN_OP", MemoryAllocationType::ALLOCATE_MEMORY_IN_OP) - .export_values(); - py::class_(m, "NUFFTOp") - .def(py::init, py::array_t, int, py::array_t>, std::optional>, int, int, int, bool, MemoryAllocationType>(), - py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true, py::arg("when_allocate_memory") = ALLOCATE_MEMORY_IN_OP - ) - .def("op", &GpuNUFFTPythonOperator::op, py::arg("input_image"), py::arg("interpolate_data") = false, py::arg("out_kspace") = py::none()) - .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::arg("input_kspace"), py::arg("grid_data") = false, py::arg("out_image") = py::none()) + .def(py::init, py::array_t, int, py::array_t>, std::optional>, int, int, int, bool>(), + py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true) + .def("op", &GpuNUFFTPythonOperator::op, py::arg("in_image"), py::arg("out_kspace"), py::arg("interpolate_data") = false) + .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::arg("in_kspace"), py::arg("out_image"), py::arg("grid_data") = false) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); From a5549262436ecefceb17c31d21261f6a9f186b79 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 21 Nov 2023 11:34:10 +0100 Subject: [PATCH 60/85] Working with warnings --- .../gpuNUFFT_operator_python_factory.cpp | 50 +++++++++---------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 4cfb4e1e..ef4fee0f 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -46,6 +46,21 @@ readNumpyArray(py::array_t> data) return dataArray; } + +void warn_pinned_memory(py::array_t> array) +{ + py::buffer_info buffer = array.request(); + cudaPointerAttributes attr; + if(DEBUG) + printf("Value of sense_maps pointer == 0x%x\n", buffer.ptr); + cudaPointerGetAttributes(&attr, buffer.ptr); + if(DEBUG) + printf("Value of attr.cudaMemoryType2 = %d\n", attr.type); + bool is_pinned_memory = attr.type == cudaMemoryTypeHost; + if(!is_pinned_memory) + py::print("WARNING:: The data is NOT pinned! This will be slow, consider pinning\n"); +} + void allocate_pinned_memory(gpuNUFFT::Array *lin_array, unsigned long int size) { DType2 *new_data; @@ -69,15 +84,19 @@ void copyNumpyArray(py::array_t> data, TType *copy_data) } template -void cast_pointer(py::array_t> data, gpuNUFFT::Array ©_data) +void cast_pointer(py::array_t> data, gpuNUFFT::Array ©_data, bool warn=true) { py::buffer_info myData = data.request(); std::complex *t_data = (std::complex *) myData.ptr; TType *my_data = reinterpret_cast(*t_data); copy_data.data = my_data; + if (warn) + warn_pinned_memory(data); } + + class GpuNUFFTPythonOperator { gpuNUFFT::GpuNUFFTOperatorFactory factory; @@ -147,30 +166,7 @@ class GpuNUFFTPythonOperator } else { - cudaPointerAttributes attr; - if(DEBUG) - printf("Value of sense_maps pointer == 0x%x or %d\n", sense_maps_buffer.ptr, sense_maps_buffer.ptr); - cudaPointerGetAttributes(&attr, sense_maps_buffer.ptr); - if(DEBUG) - printf("Value of attr.cudaMemoryType2 = %d\n", attr.type); - bool is_pinned_memory = attr.type == cudaMemoryTypeHost; - if(is_pinned_memory) - { - if(DEBUG) - printf("The smaps data is pinned!, skipping copies\n"); - // Just map the memory to sensArray! We dont need to make a copy if the memory is already pinned - std::complex *t_data = (std::complex *) sense_maps_buffer.ptr; - sensArray.data = reinterpret_cast(*t_data); - } - else - { - if(DEBUG) - printf("The smaps data is NOT pinned!, DOING copies\n"); - allocate_pinned_memory(&sensArray, n_coils * imgDims.count() * sizeof(DType2)); - sensArray.dim = imgDims; - sensArray.dim.channels = n_coils; - copyNumpyArray(sense_maps, sensArray.data); - } + cast_pointer(sense_maps, sensArray); has_sense_data = true; } factory.setBalanceWorkload(balance_workload); @@ -188,7 +184,7 @@ class GpuNUFFTPythonOperator py::array_t> op(py::array_t> in_image, py::array_t> out_kspace, bool interpolate_data) { cast_pointer(in_image, image); - cast_pointer(out_kspace.value(), kspace_data); + cast_pointer(out_kspace, kspace_data); if(interpolate_data) gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); else @@ -210,7 +206,7 @@ class GpuNUFFTPythonOperator py::array_t> adj_op(py::array_t> in_kspace, py::array_t> out_image, bool grid_data) { cast_pointer(in_kspace, kspace_data); - cast_pointer(out_image.value(), image); + cast_pointer(out_image, image); gpuNUFFT::Dimensions myDims = imgDims; if(dimension==2) myDims.depth = 1; From c71792cd8e741ff2ff112b86486cd26a48779e6f Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 21 Nov 2023 12:51:26 +0100 Subject: [PATCH 61/85] Commited --- .../gpuNUFFT_operator_python_factory.cpp | 27 ++++++++++--------- python/test_mem.py | 19 ++++--------- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index ef4fee0f..8e8f9bbd 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -21,6 +21,7 @@ Carole Lazarus #include // std::vector #include #include +#define CAST_POINTER_VARNAME(x, y) cast_pointer(x, y, #x) namespace py = pybind11; @@ -47,18 +48,18 @@ readNumpyArray(py::array_t> data) } -void warn_pinned_memory(py::array_t> array) +void warn_pinned_memory(py::array_t> array, const char * name) { py::buffer_info buffer = array.request(); cudaPointerAttributes attr; - if(DEBUG) - printf("Value of sense_maps pointer == 0x%x\n", buffer.ptr); + // if(DEBUG) + printf("%s => Value of pointer == 0x%x\n", name, buffer.ptr); cudaPointerGetAttributes(&attr, buffer.ptr); - if(DEBUG) - printf("Value of attr.cudaMemoryType2 = %d\n", attr.type); + //if(DEBUG) + printf("%s => of attr.cudaMemoryType = %d\n", name, attr.type); bool is_pinned_memory = attr.type == cudaMemoryTypeHost; if(!is_pinned_memory) - py::print("WARNING:: The data is NOT pinned! This will be slow, consider pinning\n"); + py::print("WARNING:: The data", name , "is NOT pinned! This will be slow, consider pinning\n"); } void allocate_pinned_memory(gpuNUFFT::Array *lin_array, unsigned long int size) @@ -84,14 +85,14 @@ void copyNumpyArray(py::array_t> data, TType *copy_data) } template -void cast_pointer(py::array_t> data, gpuNUFFT::Array ©_data, bool warn=true) +void cast_pointer(py::array_t> data, gpuNUFFT::Array ©_data, const char * name , bool warn=true) { py::buffer_info myData = data.request(); std::complex *t_data = (std::complex *) myData.ptr; TType *my_data = reinterpret_cast(*t_data); copy_data.data = my_data; if (warn) - warn_pinned_memory(data); + warn_pinned_memory(data, name); } @@ -166,7 +167,7 @@ class GpuNUFFTPythonOperator } else { - cast_pointer(sense_maps, sensArray); + CAST_POINTER_VARNAME(sense_maps, sensArray); has_sense_data = true; } factory.setBalanceWorkload(balance_workload); @@ -183,8 +184,8 @@ class GpuNUFFTPythonOperator py::array_t> op(py::array_t> in_image, py::array_t> out_kspace, bool interpolate_data) { - cast_pointer(in_image, image); - cast_pointer(out_kspace, kspace_data); + CAST_POINTER_VARNAME(in_image, image); + CAST_POINTER_VARNAME(out_kspace, kspace_data); if(interpolate_data) gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); else @@ -205,8 +206,8 @@ class GpuNUFFTPythonOperator } py::array_t> adj_op(py::array_t> in_kspace, py::array_t> out_image, bool grid_data) { - cast_pointer(in_kspace, kspace_data); - cast_pointer(out_image, image); + CAST_POINTER_VARNAME(in_kspace, kspace_data); + CAST_POINTER_VARNAME(out_image, image); gpuNUFFT::Dimensions myDims = imgDims; if(dimension==2) myDims.depth = 1; diff --git a/python/test_mem.py b/python/test_mem.py index 93adb852..880f521e 100644 --- a/python/test_mem.py +++ b/python/test_mem.py @@ -4,7 +4,7 @@ """ import numpy as np -from gpuNUFFT import NUFFTOp, MemoryAllocationType +from gpuNUFFT import NUFFTOp import pytest @@ -43,24 +43,15 @@ def test_pinned_memory_provided(): image = np.random.random(img_size) + 1j * np.random.random(img_size) kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0])) - image_out = cpx.empty_like_pinned(image) - kspace_out = cpx.empty_like_pinned(kspace) + image_out = cpx.zeros_like_pinned(image) + kspace_out = cpx.zeros_like_pinned(kspace) - nufft_ori = NUFFTOp( - kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32), - image_size=img_size, - num_coils=n_coils, - when_allocate_memory=MemoryAllocationType.ALLOCATE_MEMORY_IN_OP, - ) - ori_kspace_out = nufft_ori.op(input_image=image) - ori_image_out = nufft_ori.adj_op(input_kspace=kspace) nufft_op = NUFFTOp( kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32), image_size=img_size, num_coils=n_coils, - when_allocate_memory=MemoryAllocationType.NEVER_ALLOCATE_MEMORY, ) - out_ksp = nufft_op.op(input_image=image, out_kspace=kspace_out) - out_im = nufft_op.adj_op(input_kspace=kspace, out_image=image_out) + out_ksp = nufft_op.op(in_image=image, out_kspace=kspace_out) + out_im = nufft_op.adj_op(in_kspace=kspace, out_image=image_out) out_ksp \ No newline at end of file From 7cdeaff6d577175f2548d6a2137d8b4b3e248dee Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 21 Nov 2023 16:00:25 +0100 Subject: [PATCH 62/85] Added codes --- .../gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++-- python/test_mem.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 8e8f9bbd..8a8992b6 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -52,10 +52,10 @@ void warn_pinned_memory(py::array_t> array, const char * nam { py::buffer_info buffer = array.request(); cudaPointerAttributes attr; - // if(DEBUG) + if(DEBUG) printf("%s => Value of pointer == 0x%x\n", name, buffer.ptr); cudaPointerGetAttributes(&attr, buffer.ptr); - //if(DEBUG) + if(DEBUG) printf("%s => of attr.cudaMemoryType = %d\n", name, attr.type); bool is_pinned_memory = attr.type == cudaMemoryTypeHost; if(!is_pinned_memory) diff --git a/python/test_mem.py b/python/test_mem.py index 880f521e..2c373ae7 100644 --- a/python/test_mem.py +++ b/python/test_mem.py @@ -40,18 +40,18 @@ def test_pinned_memory_provided(): kspace_loc = np.random.random((5000, 3)) - 0.5 img_size = [256, 256, 256] n_coils = 1 - image = np.random.random(img_size) + 1j * np.random.random(img_size) - kspace = np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0])) + image = (np.random.random(img_size) + 1j * np.random.random(img_size)).astype(np.complex64) + kspace = (np.random.random((n_coils, kspace_loc.shape[0])) + 1j * np.random.random((n_coils, kspace_loc.shape[0]))).astype(np.complex64) image_out = cpx.zeros_like_pinned(image) kspace_out = cpx.zeros_like_pinned(kspace) - - + print("Addresses: ", hex(kspace_out.ctypes.data), hex(image_out.ctypes.data)) + nufft_op = NUFFTOp( kspace_loc=np.reshape(kspace_loc, kspace_loc.shape[::-1], order='F').astype(np.float32), image_size=img_size, num_coils=n_coils, ) - out_ksp = nufft_op.op(in_image=image, out_kspace=kspace_out) + out_ksp = nufft_op.op(image, kspace_out) out_im = nufft_op.adj_op(in_kspace=kspace, out_image=image_out) out_ksp \ No newline at end of file From 8e3bb20d2ad5e1db6e53ea470d5dc91daa6d0f15 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 21 Nov 2023 16:11:08 +0100 Subject: [PATCH 63/85] Added to stderr --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 8a8992b6..aafcb60e 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -59,7 +59,7 @@ void warn_pinned_memory(py::array_t> array, const char * nam printf("%s => of attr.cudaMemoryType = %d\n", name, attr.type); bool is_pinned_memory = attr.type == cudaMemoryTypeHost; if(!is_pinned_memory) - py::print("WARNING:: The data", name , "is NOT pinned! This will be slow, consider pinning\n"); + std::cerr<<"WARNING:: The data"< *lin_array, unsigned long int size) From 6b2ef10037ab07d91adf80c3fced79780ce43096 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Tue, 28 Nov 2023 16:26:35 +0100 Subject: [PATCH 64/85] Fixes for smaps --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index aafcb60e..3c98ffd3 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -168,6 +168,8 @@ class GpuNUFFTPythonOperator else { CAST_POINTER_VARNAME(sense_maps, sensArray); + sensArray.dim = imgDims; + sensArray.dim.channels = n_coils; has_sense_data = true; } factory.setBalanceWorkload(balance_workload); From e31085f7929703b5150e780266187e8aab994cfe Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Wed, 29 Nov 2023 16:26:41 +0100 Subject: [PATCH 65/85] Update gpuNUFFT_operator_python_factory.cpp --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 3c98ffd3..d5f1e214 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -262,10 +262,7 @@ class GpuNUFFTPythonOperator void set_smaps(py::array_t> sense_maps) { - py::buffer_info myData = sense_maps.request(); - std::complex *t_data = (std::complex *) myData.ptr; - DType2 *my_data = reinterpret_cast(*t_data); - memcpy(sensArray.data, my_data, myData.size*sizeof(DType2)); + CAST_POINTER_VARNAME(sense_maps, sensArray); has_sense_data = true; gpuNUFFTOp->setSens(sensArray); } From 4517fcb5d1b70785656613627558a3f3baa19fc6 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Fri, 5 Jan 2024 09:14:28 +0100 Subject: [PATCH 66/85] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bd0f53e7..2f80798c 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.5.0", + version="0.6.0", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ From c0969138b29decec2bd875e5536cba02ce94df6f Mon Sep 17 00:00:00 2001 From: GILIYAR RADHAKRISHNA Chaithya Date: Fri, 5 Jan 2024 17:29:52 +0100 Subject: [PATCH 67/85] version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2f80798c..757a6775 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.6.0", + version="0.6.1", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ From a8e2b259f35f6be6c19eb77e46a9ca2b63873ace Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Wed, 10 Jan 2024 16:52:30 +0100 Subject: [PATCH 68/85] Update setup.py --- setup.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 757a6775..700d0635 100644 --- a/setup.py +++ b/setup.py @@ -6,10 +6,7 @@ import platform from pprint import pprint import subprocess -try: - from pip._internal.main import main as pip_main -except ImportError: - from pip._internal import main as pip_main + release_info = {} @@ -31,8 +28,9 @@ def _preinstall(package_list, options=[]): if not isinstance(package_list, list) or not isinstance(options, list): raise TypeError('preinstall inputs must be of type list.') - - pip_main(['install'] + options + package_list) + subprocess.check_call( + [sys.executable, '-m', 'pip', 'install', options + package_list] + ) def _set_pybind_path(self): @@ -107,7 +105,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.6.1", + version="0.6.2", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", package_dir={"": "CUDA/bin"}, ext_modules=[ From 611790a6643b1cf1fdcfc05f23353e0e1ff5d39c Mon Sep 17 00:00:00 2001 From: GILIYAR RADHAKRISHNA Chaithya Date: Wed, 10 Jan 2024 17:21:19 +0100 Subject: [PATCH 69/85] Fix setup --- setup.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 700d0635..3af7ebc9 100644 --- a/setup.py +++ b/setup.py @@ -22,14 +22,12 @@ class CMakeBuild(build_ext): """ @staticmethod - def _preinstall(package_list, options=[]): + def _preinstall(package): """ Pre-install PyPi packages before running cmake. """ - if not isinstance(package_list, list) or not isinstance(options, list): - raise TypeError('preinstall inputs must be of type list.') subprocess.check_call( - [sys.executable, '-m', 'pip', 'install', options + package_list] + [sys.executable, '-m', 'pip', 'install', package] ) @@ -42,10 +40,10 @@ def run(self): """ Redifine the run method. """ # Set preinstall requirements - preinstall_list = ["pybind11"] + preinstall = "pybind11" # Preinstall packages - self._preinstall(preinstall_list) + self._preinstall(preinstall) # Set Pybind11 path self._set_pybind_path() From f38ee69cfba661a231cdf9fded167fd2a5e6c57a Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 12 Feb 2024 13:30:59 +0100 Subject: [PATCH 70/85] Working with fixed python Lib --- CUDA/src/gpu/python/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt index 494255c2..200b884b 100644 --- a/CUDA/src/gpu/python/CMakeLists.txt +++ b/CUDA/src/gpu/python/CMakeLists.txt @@ -1,6 +1,5 @@ cmake_minimum_required(VERSION 3.15) -find_package(PythonInterp 3.5 REQUIRED) -find_package(PythonLibs 3.5 REQUIRED) +find_package(Python 3.8 REQUIRED) MESSAGE(STATUS "Building Python interface") include_directories( From 06e10b46c992a41cd881043ac9d8c2de5e32446b Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Mon, 12 Feb 2024 15:42:03 +0100 Subject: [PATCH 71/85] FIXED Dens --- CUDA/inc/gpuNUFFT_kernels.hpp | 2 ++ CUDA/src/gpu/python/CMakeLists.txt | 14 ++++---- .../gpuNUFFT_operator_python_factory.cpp | 33 ++++++++++++++----- CUDA/src/gpu/std_gpuNUFFT_kernels.cu | 19 +++++++++++ MANIFEST.in | 5 +++ setup.py | 3 +- 6 files changed, 60 insertions(+), 16 deletions(-) create mode 100644 MANIFEST.in diff --git a/CUDA/inc/gpuNUFFT_kernels.hpp b/CUDA/inc/gpuNUFFT_kernels.hpp index 9966becc..cd4861dd 100644 --- a/CUDA/inc/gpuNUFFT_kernels.hpp +++ b/CUDA/inc/gpuNUFFT_kernels.hpp @@ -415,4 +415,6 @@ void performPadding(DType2 *imdata_d, CufftType *gdata_d, */ void precomputeDeapodization(DType *deapo_d, gpuNUFFT::GpuNUFFTInfo *gi_host); +void performUpdateDensityComp(DType2* density_data, DType2* estimation_data, long int n_samples); + #endif diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt index 200b884b..722fab0c 100644 --- a/CUDA/src/gpu/python/CMakeLists.txt +++ b/CUDA/src/gpu/python/CMakeLists.txt @@ -1,23 +1,25 @@ cmake_minimum_required(VERSION 3.15) -find_package(Python 3.8 REQUIRED) +find_package(Python3 3.8 REQUIRED COMPONENTS Interpreter Development) + MESSAGE(STATUS "Building Python interface") +MESSAGE("Pybind11 include dir ${PYBIND11_INCLUDE_DIR}") +MESSAGE("Python include dir ${Python3_INCLUDE_DIRS}") +MESSAGE("Found ${Python3_LIBRARIES}") include_directories( ${GPUNUFFT_INC_DIR} ${PYBIND11_INCLUDE_DIR} - ${PYTHON_INCLUDE_DIR} + ${Python3_INCLUDE_DIRS} ) cuda_include_directories(${GPUNUFFT_INC_DIR}) cuda_add_library(gpuNUFFT ${GPU_CU_SOURCES} ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} gpuNUFFT_operator_python_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../atomic/atomic_gpuNUFFT.cu SHARED) set_target_properties(gpuNUFFT PROPERTIES PREFIX "") + if(WIN32) - MESSAGE("Pybind11 include dir ${PYBIND11_INCLUDE_DIR}") - MESSAGE("Python include dir ${PYTHON_INCLUDE_DIR}") - MESSAGE("Found ${PYTHON_LIBRARIES}") set_target_properties(gpuNUFFT PROPERTIES SUFFIX ".pyd") - TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${PYTHON_LIBRARIES}) + TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES} ${GRID_LIB_ATM_NAME} ${Python3_LIBRARIES}) elseif(UNIX) TARGET_LINK_LIBRARIES(gpuNUFFT ${CUDA_LIBRARIES}) endif(WIN32) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 2dbf0775..330c735e 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -22,6 +22,8 @@ Carole Lazarus #include // std::vector #include #include +#include + #define CAST_POINTER_VARNAME(x, y) cast_pointer(x, y, #x) namespace py = pybind11; @@ -63,9 +65,10 @@ void warn_pinned_memory(py::array_t> array, const char * nam std::cerr<<"WARNING:: The data"< *lin_array, unsigned long int size) +template +void allocate_pinned_memory(gpuNUFFT::Array *lin_array, unsigned long int size) { - DType2 *new_data; + TType *new_data; cudaMallocHost((void **)&new_data, size); lin_array->data = new_data; } @@ -206,6 +209,18 @@ class GpuNUFFTPythonOperator capsule ); } + + void op_direct(intptr_t in_image, intptr_t out_kspace, bool interpolate_data) + { + image.data = reinterpret_cast(in_image); + kspace_data.data = reinterpret_cast(out_kspace); + if(interpolate_data) + gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); + else + gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data); + cudaDeviceSynchronize(); + } + py::array_t> adj_op(py::array_t> in_kspace, py::array_t> out_image, bool grid_data) { CAST_POINTER_VARNAME(in_kspace, kspace_data); @@ -275,7 +290,7 @@ class GpuNUFFTPythonOperator densArray.dim.length = n_samples; // TODO: Allocate directly on device and set with kernel. - for (int cnt = 0; cnt < n_samples; cnt++) + for (long int cnt = 0; cnt < n_samples; cnt++) { densArray.data[cnt].x = 1.0; densArray.data[cnt].y = 0.0; @@ -306,8 +321,7 @@ class GpuNUFFTPythonOperator gpuNUFFT::DENSITY_ESTIMATION); gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, densEstimation_gpu, gpuNUFFT::DENSITY_ESTIMATION); - performUpdateDensityComp(densArray_gpu.data, densEstimation_gpu.data, - n_samples); + performUpdateDensityComp(densArray_gpu.data, densEstimation_gpu.data, n_samples); if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) printf("error at adj thread synchronization d: %s\n", cudaGetErrorString(cudaGetLastError())); @@ -333,6 +347,9 @@ class GpuNUFFTPythonOperator capsule); } + + + float get_spectral_radius(int max_iter = 20,float tolerance = 1e-6) { int im_size = image.count(); @@ -400,8 +417,8 @@ PYBIND11_MODULE(gpuNUFFT, m) { .def("op", &GpuNUFFTPythonOperator::op, py::arg("in_image"), py::arg("out_kspace"), py::arg("interpolate_data") = false) .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::arg("in_kspace"), py::arg("out_image"), py::arg("grid_data") = false) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) - .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py:arg("max_iter") = 10) - .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps); - .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py:arg("max_iter") = 20, py:arg("tolerance") = 1e-6); + .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py::arg("max_iter") = 10) + .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps) + .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6); } #endif // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu index b0fe0e8b..a48d5532 100644 --- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu +++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu @@ -33,6 +33,25 @@ void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count) } } +__global__ void updateDensityCompKernel(DType2* density_data, DType2* estimation_data, long int N) +{ + long int t = threadIdx.x + blockIdx.x * blockDim.x; + while (t < N) + { + DType2 data_p = density_data[t]; + DType2 esti_p = estimation_data[t]; + data_p.x *= rsqrtf(esti_p.x * esti_p.x + esti_p.y * esti_p.y); + density_data[t] = data_p; + t = t + blockDim.x*gridDim.x; + } +} + +void performUpdateDensityComp(DType2* density_data, DType2* estimation_data, long int n_samples) +{ + dim3 block_dim(64, 1, 8); + dim3 grid_dim(getOptimalGridDim(n_samples,THREAD_BLOCK_SIZE)); + updateDensityCompKernel<<>>(density_data, estimation_data, n_samples); +} void initTexture(const char* symbol, cudaArray** devicePtr, gpuNUFFT::Array hostTexture) { diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..af69ea9f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +recursive-include CUDA/src * +recursive-include CUDA/doc * +recursive-include CUDA/inc * +include CUDA/CMakeLists.txt + diff --git a/setup.py b/setup.py index 3af7ebc9..2d513628 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ import os import sys -from setuptools import setup, Extension, find_packages +from setuptools import setup, Extension, find_namespace_packages from setuptools.command.build_ext import build_ext from importlib import import_module import platform @@ -105,7 +105,6 @@ def build_extension(self, ext): name="gpuNUFFT", version="0.6.2", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", - package_dir={"": "CUDA/bin"}, ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), ], From 5351ac709321a44917c134d3383b385cea942e82 Mon Sep 17 00:00:00 2001 From: GILIYAR RADHAKRISHNA Chaithya Date: Tue, 13 Feb 2024 11:19:44 +0100 Subject: [PATCH 72/85] Working built: GPU and CPU both present --- .../gpuNUFFT_operator_python_factory.cpp | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 330c735e..a5f0de7f 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -110,7 +110,9 @@ class GpuNUFFTPythonOperator bool has_sense_data; gpuNUFFT::Dimensions imgDims; // sensitivity maps - gpuNUFFT::Array sensArray, kspace_data, image; + gpuNUFFT::GpuArray image_gpu; + gpuNUFFT::GpuArray kspace_data_gpu; + gpuNUFFT::Array sensArray, kspace_data, image; void allocate_memory_kspace() { allocate_pinned_memory(&kspace_data, n_coils*trajectory_length*sizeof(DType2)); @@ -126,7 +128,7 @@ class GpuNUFFTPythonOperator public: GpuNUFFTPythonOperator(py::array_t kspace_loc, py::array_t image_size, int num_coils, - py::array_t> sense_maps, std::optional> density_comp, int kernel_width=3, + py::array_t> sense_maps, py::array_t density_comp, int kernel_width=3, int sector_width=8, float osr=2, bool balance_workload=1) { // k-space coordinates @@ -138,12 +140,12 @@ class GpuNUFFTPythonOperator // density compensation weights gpuNUFFT::Array density_compArray; - if(density_comp.has_value()) - { - density_compArray = readNumpyArray(density_comp.value()); + //if(density_comp.has_value()) + //{ + density_compArray = readNumpyArray(density_comp); density_compArray.dim.length = trajectory_length; // No need else as the init is by default with 0 length and density comp is not applied - } + //} // image size py::buffer_info img_dim = image_size.request(); @@ -161,7 +163,10 @@ class GpuNUFFTPythonOperator kspace_data.dim.length = trajectory_length; kspace_data.dim.channels = num_coils; image.dim = imgDims; - + kspace_data_gpu.dim.length = trajectory_length; + kspace_data_gpu.dim.channels = num_coils; + image_gpu.dim = imgDims; + // sensitivity maps py::buffer_info sense_maps_buffer = sense_maps.request(); if (sense_maps_buffer.shape.size()==0) @@ -210,14 +215,25 @@ class GpuNUFFTPythonOperator ); } - void op_direct(intptr_t in_image, intptr_t out_kspace, bool interpolate_data) + void op_direct(uintptr_t in_image, uintptr_t out_kspace, bool interpolate_data) { - image.data = reinterpret_cast(in_image); - kspace_data.data = reinterpret_cast(out_kspace); + image_gpu.data = (DType2*) in_image; + kspace_data_gpu.data = (CufftType*) out_kspace; if(interpolate_data) - gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data, gpuNUFFT::DENSITY_ESTIMATION); + gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu, gpuNUFFT::DENSITY_ESTIMATION); else - gpuNUFFTOp->performForwardGpuNUFFT(image, kspace_data); + gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu); + cudaDeviceSynchronize(); + } + + void adj_op_direct(uintptr_t in_kspace, uintptr_t out_image, bool grid_data) + { + kspace_data_gpu.data = (CufftType*) in_kspace; + image_gpu.data = (DType2*) out_image; + if(grid_data) + gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION); + else + gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image); cudaDeviceSynchronize(); } @@ -412,13 +428,15 @@ class GpuNUFFTPythonOperator PYBIND11_MODULE(gpuNUFFT, m) { py::class_(m, "NUFFTOp") - .def(py::init, py::array_t, int, py::array_t>, std::optional>, int, int, float, bool>(), + .def(py::init, py::array_t, int, py::array_t>, py::array_t, int, int, float, bool>(), py::arg("kspace_loc"), py::arg("image_size"), py::arg("num_coils"), py::arg("sense_maps") = py::none(), py::arg("density_comp") = py::none(), py::arg("kernel_width") = 3, py::arg("sector_width") = 8, py::arg("osr") = 2, py::arg("balance_workload") = true) .def("op", &GpuNUFFTPythonOperator::op, py::arg("in_image"), py::arg("out_kspace"), py::arg("interpolate_data") = false) + .def("op_direct", &GpuNUFFTPythonOperator::op_direct, py::arg("in_image"), py::arg("out_kspace"), py::arg("interpolate_data") = false) + .def("adj_op_direct", &GpuNUFFTPythonOperator::adj_op_direct, py::arg("in_kspace"), py::arg("out_image"), py::arg("grid_data") = false) .def("adj_op", &GpuNUFFTPythonOperator::adj_op, py::arg("in_kspace"), py::arg("out_image"), py::arg("grid_data") = false) .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py::arg("max_iter") = 10) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps) .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6); } -#endif // GPUNUFFT_OPERATOR_MATLABFACTORY_H_INCLUDED +#endif // GPUNUFFT_OPERATOR_PYTHONFACTORY_H_INCLUDED From 9f31c7811c950440ca5ee9a2764e7012d653ca05 Mon Sep 17 00:00:00 2001 From: GILIYAR RADHAKRISHNA Chaithya Date: Thu, 15 Feb 2024 10:02:06 +0100 Subject: [PATCH 73/85] Final fixes --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index a5f0de7f..f29e1ea3 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -231,9 +231,9 @@ class GpuNUFFTPythonOperator kspace_data_gpu.data = (CufftType*) in_kspace; image_gpu.data = (DType2*) out_image; if(grid_data) - gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image, gpuNUFFT::DENSITY_ESTIMATION); + gpuNUFFTOp->performGpuNUFFTAdj(kspace_data_gpu, image_gpu, gpuNUFFT::DENSITY_ESTIMATION); else - gpuNUFFTOp->performGpuNUFFTAdj(kspace_data, image); + gpuNUFFTOp->performGpuNUFFTAdj(kspace_data_gpu, image_gpu); cudaDeviceSynchronize(); } From 0bb69c1441e4a35c74aee1545958000482933aa6 Mon Sep 17 00:00:00 2001 From: GILIYAR RADHAKRISHNA Chaithya Date: Thu, 15 Feb 2024 11:05:48 +0100 Subject: [PATCH 74/85] Add gpuNUFFT version pop[ --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2d513628..a075d919 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.6.2", + version="0.7.0", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), From 06dd0b69e8d5a267f9a5fbb6a1a2633eae1a0ba3 Mon Sep 17 00:00:00 2001 From: GILIYAR RADHAKRISHNA Chaithya Date: Thu, 15 Feb 2024 11:39:59 +0100 Subject: [PATCH 75/85] Fix cuRAND --- CUDA/src/gpu/atomic/CMakeLists.txt | 5 +++-- CUDA/src/gpu/python/CMakeLists.txt | 2 ++ setup.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CUDA/src/gpu/atomic/CMakeLists.txt b/CUDA/src/gpu/atomic/CMakeLists.txt index dffcdf15..061b9c12 100644 --- a/CUDA/src/gpu/atomic/CMakeLists.txt +++ b/CUDA/src/gpu/atomic/CMakeLists.txt @@ -11,10 +11,11 @@ set(GPU_CU_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/atomic_gpuNUFFT.cu #${CMAKE_CURRENT_SOURCE_DIR}/../std_gpuNUFFT_kernels.cu ) if(WIN32) - CUDA_ADD_LIBRARY(${GRID_LIB_ATM_NAME} ${GPU_CU_SOURCES} ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE}) + CUDA_ADD_LIBRARY(${GRID_LIB_ATM_NAME} ${GPU_CU_SOURCES} ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} ) else(WIN32) - CUDA_ADD_LIBRARY(${GRID_LIB_ATM_NAME} ${GPU_CU_SOURCES} ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} SHARED) + CUDA_ADD_LIBRARY(${GRID_LIB_ATM_NAME} ${GPU_CU_SOURCES} ${GPU_GPUNUFFT_SOURCES} ${GPUNUFFT_SOURCES} ${GPUNUFFT_INCLUDE} SHARED) endif(WIN32) CUDA_ADD_CUFFT_TO_TARGET(${GRID_LIB_ATM_NAME}) CUDA_ADD_CUBLAS_TO_TARGET(${GRID_LIB_ATM_NAME}) +target_link_libraries(${GRID_LIB_ATM_NAME} ${CUDA_curand_LIBRARY}) \ No newline at end of file diff --git a/CUDA/src/gpu/python/CMakeLists.txt b/CUDA/src/gpu/python/CMakeLists.txt index 722fab0c..fac39c03 100644 --- a/CUDA/src/gpu/python/CMakeLists.txt +++ b/CUDA/src/gpu/python/CMakeLists.txt @@ -25,3 +25,5 @@ elseif(UNIX) endif(WIN32) CUDA_ADD_CUFFT_TO_TARGET(gpuNUFFT) CUDA_ADD_CUBLAS_TO_TARGET(gpuNUFFT) +target_link_libraries(gpuNUFFT ${CUDA_curand_LIBRARY}) + diff --git a/setup.py b/setup.py index a075d919..f3bb04da 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.7.0", + version="0.7.1", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), From 611f56e09bfd68df08ee65ce41fb22e9746f32c4 Mon Sep 17 00:00:00 2001 From: GILIYAR RADHAKRISHNA Chaithya Date: Mon, 19 Feb 2024 17:05:17 +0100 Subject: [PATCH 76/85] Fix issues --- CUDA/inc/cuda_utils.hpp | 17 ++++++- CUDA/src/gpu/std_gpuNUFFT_kernels.cu | 1 + CUDA/src/gpuNUFFT_operator.cpp | 70 +++++++++++++++++++--------- 3 files changed, 65 insertions(+), 23 deletions(-) diff --git a/CUDA/inc/cuda_utils.hpp b/CUDA/inc/cuda_utils.hpp index aa0f06cc..8893383c 100644 --- a/CUDA/inc/cuda_utils.hpp +++ b/CUDA/inc/cuda_utils.hpp @@ -110,13 +110,28 @@ inline void allocateAndSetMem(TypeName **device_ptr, IndType num_elements, */ template inline void copyDeviceToDevice(TypeName *device_ptr_src, - TypeName *device_ptr_dest, IndType num_elements) + TypeName *device_ptr_dest, IndType num_elements + ) { HANDLE_ERROR(cudaMemcpy(device_ptr_dest, device_ptr_src, num_elements * sizeof(TypeName), cudaMemcpyDeviceToDevice)); } +/** \brief CUDA memcpy call to copy data from device ptr to device ptr + * + * @param device_ptr_src source device pointer + * @param device_ptr_dest destination device pointer + * @param num_elements amount of elements of size TypeName + */ +template +inline void copyDeviceToDeviceAsync(TypeName *device_ptr_src, + TypeName *device_ptr_dest, IndType num_elements, cudaStream_t stream=0) +{ + HANDLE_ERROR(cudaMemcpyAsync(device_ptr_dest, device_ptr_src, + num_elements * sizeof(TypeName), + cudaMemcpyDeviceToDevice, stream)); +} /** \brief Copy CUDA memory from device to host * * @param device_ptr device pointer diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu index a48d5532..bff9118c 100644 --- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu +++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu @@ -41,6 +41,7 @@ __global__ void updateDensityCompKernel(DType2* density_data, DType2* estimation DType2 data_p = density_data[t]; DType2 esti_p = estimation_data[t]; data_p.x *= rsqrtf(esti_p.x * esti_p.x + esti_p.y * esti_p.y); + data_p.y = 0; density_data[t] = data_p; t = t + blockDim.x*gridDim.x; } diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index b1612d5f..35a4f07a 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -457,7 +457,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj( if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) fprintf(stderr, "error at adj thread synchronization 2: %s\n", cudaGetErrorString(cudaGetLastError())); - if (gpuNUFFTOut == CONVOLUTION) + if (gpuNUFFTOut == CONVOLUTION || gpuNUFFTOut == DENSITY_ESTIMATION) { if (DEBUG) printf("stopping output after CONVOLUTION step\n"); @@ -939,10 +939,11 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( printf("Memory allocation: %.2f ms\n", stopTiming()); int err; - + cudaStream_t new_stream, old_stream; // iterate over coils and compute result for (int coil_it = 0; coil_it < n_coils; coil_it += n_coils_cc) { + cudaStreamCreate(&new_stream); unsigned long int data_coil_offset = (long int)coil_it * data_count; unsigned long int im_coil_offset = coil_it * (long int)imdata_count; @@ -954,32 +955,52 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( // perform automatically "repeating" of input image in case // of existing sensitivity data for (int cnt = 0; cnt < n_coils_cc; cnt++) - copyDeviceToDevice(imgData_gpu.data, - imdata_d + cnt * imdata_count, imdata_count); + copyDeviceToDeviceAsync(imgData_gpu.data, + imdata_d + cnt * imdata_count, imdata_count, new_stream); else - copyDeviceToDevice(imgData_gpu.data + im_coil_offset, imdata_d, - imdata_count * n_coils_cc); + copyDeviceToDeviceAsync(imgData_gpu.data + im_coil_offset, imdata_d, + imdata_count * n_coils_cc, new_stream); // reset temp arrays - cudaMemset(gdata_d, 0, - sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc); - cudaMemset(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc); + cudaMemsetAsync(gdata_d, 0, + sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream); + cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream)!= cudaSuccess)) printf("error at thread synchronization 1: %s\n", cudaGetErrorString(cudaGetLastError())); if (this->applySensData()) { - copyToDevice(this->sens.data + im_coil_offset, sens_d, - imdata_count * n_coils_cc); + copyToDeviceAsync(this->sens.data + im_coil_offset, sens_d, + imdata_count * n_coils_cc, new_stream); performSensMul(imdata_d, sens_d, gi_host, false); } // apodization Correction performForwardDeapodization(imdata_d, deapo_d, gi_host); - - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if(gpuNUFFTOut == DENSITY_ESTIMATION) + { + // convolution and resampling to non-standard trajectory + forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d, + sector_centers_d, gi_host); + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) + printf("error at thread synchronization 7: %s\n", + cudaGetErrorString(cudaGetLastError())); + + if (debugTiming) + printf("Forward Convolution: %.2f ms\n", stopTiming()); + // write result in correct order back into output array + writeOrderedGPU(data_sorted_d, data_indices_d, data_d, + (int)this->kSpaceTraj.count(), n_coils_cc); + copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream); + if ((coil_it + n_coils_cc) < (n_coils)) + continue; + freeTotalDeviceMemory(imdata_d, NULL); + this->freeDeviceMemory(); + return; + } + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 2: %s\n", cudaGetErrorString(cudaGetLastError())); // resize by oversampling factor and zero pad @@ -988,13 +1009,13 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (debugTiming) startTiming(); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 3: %s\n", cudaGetErrorString(cudaGetLastError())); // shift image to get correct zero frequency position performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 4: %s\n", cudaGetErrorString(cudaGetLastError())); // eventually free imdata_d @@ -1012,12 +1033,12 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( c++; } - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 5: %s\n", cudaGetErrorString(cudaGetLastError())); performFFTShift(gdata_d, FORWARD, getGridDims(), gi_host); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 6: %s\n", cudaGetErrorString(cudaGetLastError())); @@ -1030,7 +1051,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( // convolution and resampling to non-standard trajectory forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d, sector_centers_d, gi_host); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 7: %s\n", cudaGetErrorString(cudaGetLastError())); @@ -1038,15 +1059,20 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( printf("Forward Convolution: %.2f ms\n", stopTiming()); performFFTScaling(data_d, gi_host->data_count, gi_host); - if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) + if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error: at thread synchronization 8: %s\n", cudaGetErrorString(cudaGetLastError())); // write result in correct order back into output array writeOrderedGPU(data_sorted_d, data_indices_d, data_d, (int)this->kSpaceTraj.count(), n_coils_cc); - - copyDeviceToDevice(data_sorted_d, data_d, data_count * n_coils_cc); + if(coil_it > 1) + { + cudaStreamSynchronize(old_stream); + cudaStreamDestroy(old_stream); + } + copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream); + old_stream = new_stream; } // iterate over coils freeTotalDeviceMemory(imdata_d, NULL); From e86dbc6381824a19e5608aab2958180cba2c4aec Mon Sep 17 00:00:00 2001 From: GILIYAR RADHAKRISHNA Chaithya Date: Mon, 19 Feb 2024 17:05:37 +0100 Subject: [PATCH 77/85] Version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f3bb04da..d2de2b98 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.7.1", + version="0.7.2", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), From 7d097e3d9166d92d94c4c9a6f410ee6326aecaf3 Mon Sep 17 00:00:00 2001 From: GILIYAR RADHAKRISHNA Chaithya Date: Wed, 21 Feb 2024 10:50:04 +0100 Subject: [PATCH 78/85] Fixes added --- .../python/gpuNUFFT_operator_python_factory.cpp | 15 +++++++-------- CUDA/src/gpuNUFFT_operator.cpp | 11 ++++++----- setup.py | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index f29e1ea3..1e023a80 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -107,6 +107,7 @@ class GpuNUFFTPythonOperator gpuNUFFT::GpuNUFFTOperatorFactory factory; gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp; int trajectory_length, n_coils, dimension; + float osr; bool has_sense_data; gpuNUFFT::Dimensions imgDims; // sensitivity maps @@ -129,7 +130,7 @@ class GpuNUFFTPythonOperator public: GpuNUFFTPythonOperator(py::array_t kspace_loc, py::array_t image_size, int num_coils, py::array_t> sense_maps, py::array_t density_comp, int kernel_width=3, - int sector_width=8, float osr=2, bool balance_workload=1) + int sector_width=8, float osr=2, bool balance_workload=1) : osr(osr) { // k-space coordinates py::buffer_info sample_loc = kspace_loc.request(); @@ -140,12 +141,11 @@ class GpuNUFFTPythonOperator // density compensation weights gpuNUFFT::Array density_compArray; - //if(density_comp.has_value()) - //{ + if(density_comp == Py_None) + { density_compArray = readNumpyArray(density_comp); density_compArray.dim.length = trajectory_length; - // No need else as the init is by default with 0 length and density comp is not applied - //} + } // image size py::buffer_info img_dim = image_size.request(); @@ -165,7 +165,6 @@ class GpuNUFFTPythonOperator image.dim = imgDims; kspace_data_gpu.dim.length = trajectory_length; kspace_data_gpu.dim.channels = num_coils; - image_gpu.dim = imgDims; // sensitivity maps py::buffer_info sense_maps_buffer = sense_maps.request(); @@ -189,6 +188,7 @@ class GpuNUFFTPythonOperator image.dim.channels = n_coils; else image.dim.channels = 1; + image_gpu.dim = imgDims; cudaDeviceSynchronize(); } @@ -305,7 +305,6 @@ class GpuNUFFTPythonOperator allocate_pinned_memory(&densArray, n_samples * sizeof(CufftType)); densArray.dim.length = n_samples; - // TODO: Allocate directly on device and set with kernel. for (long int cnt = 0; cnt < n_samples; cnt++) { densArray.data[cnt].x = 1.0; @@ -324,7 +323,7 @@ class GpuNUFFTPythonOperator gpuNUFFT::GpuArray image_gpu; image_gpu.dim = imgDims; - allocateDeviceMem(&image_gpu.data, imgDims.count()); + allocateDeviceMem(&image_gpu.data, image_gpu.dim.count()); if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) printf("error at adj thread synchronization a: %s\n", diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 35a4f07a..50753f76 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -394,10 +394,11 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj( // more than 2 coil sets are not sensible to reconstruct in one // adjoint kernel call , since the used shared memory is limited + // FIXME: We now limit to 1 as 2 has errors right now int n_coils_cc = this->is2DProcessing() ? std::min(this->computePossibleConcurrentCoilCount( n_coils, kspaceData_gpu.dim), - 2) + 1) : 1; if (DEBUG) printf("Computing %d coils concurrently.\n", n_coils_cc); @@ -977,8 +978,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( performSensMul(imdata_d, sens_d, gi_host, false); } - // apodization Correction - performForwardDeapodization(imdata_d, deapo_d, gi_host); if(gpuNUFFTOut == DENSITY_ESTIMATION) { // convolution and resampling to non-standard trajectory @@ -1000,6 +999,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( this->freeDeviceMemory(); return; } + // apodization Correction + performForwardDeapodization(imdata_d, deapo_d, gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 2: %s\n", cudaGetErrorString(cudaGetLastError())); @@ -1198,8 +1199,6 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( performSensMul(imdata_d, sens_d, gi_host, false); } - // apodization Correction - performForwardDeapodization(imdata_d, deapo_d, gi_host); if(gpuNUFFTOut == DENSITY_ESTIMATION) { forwardConvolution(data_d, crds_d, imdata_d, NULL, sectors_d, @@ -1214,6 +1213,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( this->freeDeviceMemory(); return; } + // apodization Correction + performForwardDeapodization(imdata_d, deapo_d, gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 2: %s\n", cudaGetErrorString(cudaGetLastError())); diff --git a/setup.py b/setup.py index d2de2b98..0709e4f3 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.7.2", + version="0.7.4", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), From 0c34013ad599fafc857dbda863a4e90141bb6a18 Mon Sep 17 00:00:00 2001 From: chaithyagr Date: Wed, 21 Feb 2024 15:47:31 +0100 Subject: [PATCH 79/85] Update with final fixes, v0.7.5 --- CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 1e023a80..a91c3f60 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -141,7 +141,7 @@ class GpuNUFFTPythonOperator // density compensation weights gpuNUFFT::Array density_compArray; - if(density_comp == Py_None) + if(density_comp != Py_None) { density_compArray = readNumpyArray(density_comp); density_compArray.dim.length = trajectory_length; diff --git a/setup.py b/setup.py index 0709e4f3..29c340c2 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.7.4", + version="0.7.5", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), From 01b9cab8ca2c3a12019dea0da5acf0677cd9d72c Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 3 Jun 2024 09:22:27 +0200 Subject: [PATCH 80/85] Autograd support added --- CUDA/inc/gpuNUFFT_operator.hpp | 16 ++++++++++++++-- .../python/gpuNUFFT_operator_python_factory.cpp | 7 +++++++ CUDA/src/gpuNUFFT_operator.cpp | 8 ++++---- setup.py | 2 +- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index 4eb94478..2bbc6cd0 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -53,13 +53,13 @@ class GpuNUFFTOperator GpuNUFFTOperator(IndType kernelWidth, IndType sectorWidth, DType osf, Dimensions imgDims, bool loadKernel = true, OperatorType operatorType = DEFAULT, - bool matlabSharedMem = false) + bool matlabSharedMem = false, bool grad_mode = false) : operatorType(operatorType), osf(osf), kernelWidth(kernelWidth), sectorWidth(sectorWidth), imgDims(imgDims), gpuMemAllocated(false), debugTiming(DEBUG), sens_d(NULL), crds_d(NULL), density_comp_d(NULL), deapo_d(NULL), gdata_d(NULL), sector_centers_d(NULL), sectors_d(NULL), data_indices_d(NULL), data_sorted_d(NULL), allocatedCoils(0), - matlabSharedMem(matlabSharedMem) + matlabSharedMem(matlabSharedMem), grad_mode(grad_mode) { if (loadKernel) initKernel(); @@ -342,6 +342,14 @@ class GpuNUFFTOperator GpuNUFFTOutput gpuNUFFTOut); void clean_memory(); + + void setGradMode(bool grad_mode) { + this->grad_mode = grad_mode; + } + + bool getGradMode() { + return this->grad_mode; + } /** \brief Check if density compensation data is available. */ bool applyDensComp() { @@ -452,6 +460,10 @@ class GpuNUFFTOperator */ bool matlabSharedMem; + /** \brief Flag for changing the isign, mainly used for gradients + */ + bool grad_mode; + /** \brief Return Grid Width (ImageWidth * osf) */ IndType getGridWidth() { diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index a91c3f60..5d0fa888 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -291,6 +291,12 @@ class GpuNUFFTPythonOperator gpuNUFFTOp->clean_memory(); } + void toggle_grad_mode() + { + bool current_mode = gpuNUFFTOp->getGradMode(); + gpuNUFFTOp->setGradMode(!current_mode); + } + void set_smaps(py::array_t> sense_maps) { CAST_POINTER_VARNAME(sense_maps, sensArray); @@ -436,6 +442,7 @@ PYBIND11_MODULE(gpuNUFFT, m) { .def("clean_memory", &GpuNUFFTPythonOperator::clean_memory) .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py::arg("max_iter") = 10) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps) + .def("toggle_grad_mode", &GpuNUFFTPythonOperator::toggle_grad_mode) .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6); } #endif // GPUNUFFT_OPERATOR_PYTHONFACTORY_H_INCLUDED diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 50753f76..4f370e87 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -1026,7 +1026,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( { if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count, gdata_d + c * gi_host->gridDims_count, - CUFFT_FORWARD)) != CUFFT_SUCCESS) + grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS) { fprintf(stderr, "cufft has failed with err %i \n", err); showMemoryInfo(true, stderr); @@ -1037,7 +1037,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 5: %s\n", cudaGetErrorString(cudaGetLastError())); - performFFTShift(gdata_d, FORWARD, getGridDims(), gi_host); + performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 6: %s\n", @@ -1240,7 +1240,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( { if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count, gdata_d + c * gi_host->gridDims_count, - CUFFT_FORWARD)) != CUFFT_SUCCESS) + grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS) { fprintf(stderr, "cufft has failed with err %i \n", err); showMemoryInfo(true, stderr); @@ -1251,7 +1251,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 5: %s\n", cudaGetErrorString(cudaGetLastError())); - performFFTShift(gdata_d, FORWARD, getGridDims(), gi_host); + performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 6: %s\n", diff --git a/setup.py b/setup.py index 29c340c2..b96fa86c 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.7.5", + version="0.8.0", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), From 6b66ee000ac811ab52ba258646d875043a61a410 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Fri, 21 Jun 2024 10:31:51 +0200 Subject: [PATCH 81/85] Added support for set_pts --- CUDA/inc/gpuNUFFT_operator_factory.hpp | 11 ++++++ .../gpuNUFFT_operator_python_factory.cpp | 18 ++++++++- CUDA/src/gpuNUFFT_operator_factory.cpp | 37 ++++++++++++------- setup.py | 2 +- 4 files changed, 52 insertions(+), 16 deletions(-) diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp index 5658803f..1c4bd992 100644 --- a/CUDA/inc/gpuNUFFT_operator_factory.hpp +++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp @@ -174,6 +174,15 @@ class GpuNUFFTOperatorFactory void setUseTextures(bool useTextures); void setBalanceWorkload(bool balanceWorkload); + + /** + * \brief Set k-space locations and corresponding density. This can also be used + * to update them + * + */ + void set_pts( + gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp, gpuNUFFT::Array &kSpaceTraj, + gpuNUFFT::Array &densCompData); protected: template @@ -315,7 +324,9 @@ class GpuNUFFTOperatorFactory */ gpuNUFFT::Array computeDeapodizationFunction(const IndType &kernelWidth, const DType &osf, gpuNUFFT::Dimensions &imgDims); + + private: /** \brief Flag to indicate texture interpolation */ bool useTextures; diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 5d0fa888..2b3664f2 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -304,6 +304,21 @@ class GpuNUFFTPythonOperator gpuNUFFTOp->setSens(sensArray); } + void set_pts(py::array_t kspace_loc, py::array_t density_comp) + { + gpuNUFFT::Array kSpaceTraj = readNumpyArray(kspace_loc); + kSpaceTraj.dim.length = trajectory_length; + + // density compensation weights + gpuNUFFT::Array density_compArray; + if(density_comp != Py_None) + { + density_compArray = readNumpyArray(density_comp); + density_compArray.dim.length = trajectory_length; + } + factory.set_pts(gpuNUFFTOp, kSpaceTraj, density_compArray); + + } py::array_t estimate_density_comp(int max_iter = 10) { IndType n_samples = kspace_data.count(); @@ -443,6 +458,7 @@ PYBIND11_MODULE(gpuNUFFT, m) { .def("estimate_density_comp", &GpuNUFFTPythonOperator::estimate_density_comp, py::arg("max_iter") = 10) .def("set_smaps", &GpuNUFFTPythonOperator::set_smaps) .def("toggle_grad_mode", &GpuNUFFTPythonOperator::toggle_grad_mode) - .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6); + .def("get_spectral_radius", &GpuNUFFTPythonOperator::get_spectral_radius, py::arg("max_iter") = 20, py::arg("tolerance") = 1e-6) + .def("set_pts", &GpuNUFFTPythonOperator::set_pts, py::arg("kspace_loc"), py::arg("density_comp") = py::none()); } #endif // GPUNUFFT_OPERATOR_PYTHONFACTORY_H_INCLUDED diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp index b6c4d734..647c8840 100644 --- a/CUDA/src/gpuNUFFT_operator_factory.cpp +++ b/CUDA/src/gpuNUFFT_operator_factory.cpp @@ -457,10 +457,6 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator( checkMemoryConsumption(kSpaceTraj.dim, sectorWidth, osf, imgDims, densCompData.dim, sensData.dim); - if (kSpaceTraj.dim.channels > 1) - throw std::invalid_argument( - "Trajectory dimension must not contain a channel size greater than 1!"); - if (imgDims.channels > 1) throw std::invalid_argument( "Image dimensions must not contain a channel size greater than 1!"); @@ -470,6 +466,29 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator( gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp = createNewGpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims); + // Set points and density compensation + set_pts(gpuNUFFTOp, kSpaceTraj, densCompData); + + if (sensData.data != NULL) + gpuNUFFTOp->setSens(sensData); + + gpuNUFFTOp->setDeapodizationFunction( + this->computeDeapodizationFunction(kernelWidth, osf, imgDims)); + + debug("finished creation of gpuNUFFT operator\n"); + + return gpuNUFFTOp; +} + + +void gpuNUFFT::GpuNUFFTOperatorFactory::set_pts( + gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp, gpuNUFFT::Array &kSpaceTraj, + gpuNUFFT::Array &densCompData) +{ + if (kSpaceTraj.dim.channels > 1) + throw std::invalid_argument( + "Trajectory dimension must not contain a channel size greater than 1!"); + // assign according sector to k-Space position gpuNUFFT::Array assignedSectors = assignSectors(gpuNUFFTOp, kSpaceTraj); @@ -487,9 +506,6 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator( if (densCompData.data != NULL) densData = initDensData(gpuNUFFTOp, coordCnt); - if (sensData.data != NULL) - gpuNUFFTOp->setSens(sensData); - if (useGpu) { sortArrays(gpuNUFFTOp, assignedSectorsAndIndicesSorted, @@ -543,13 +559,6 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createGpuNUFFTOperator( // free temporary array free(assignedSectors.data); assignedSectors.data = NULL; - - gpuNUFFTOp->setDeapodizationFunction( - this->computeDeapodizationFunction(kernelWidth, osf, imgDims)); - - debug("finished creation of gpuNUFFT operator\n"); - - return gpuNUFFTOp; } gpuNUFFT::GpuNUFFTOperator * diff --git a/setup.py b/setup.py index b96fa86c..6f0dd605 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.8.0", + version="0.8.1", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), From 2fde394291457966228a58f1c594840538d9a93b Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Fri, 2 Aug 2024 17:18:42 +0200 Subject: [PATCH 82/85] commit --- CUDA/src/gpuNUFFT_operator.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 4f370e87..05387b74 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -1076,6 +1076,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( old_stream = new_stream; } // iterate over coils + cudaStreamSynchronize(old_stream); + cudaStreamDestroy(old_stream); freeTotalDeviceMemory(imdata_d, NULL); this->freeDeviceMemory(); @@ -1291,13 +1293,14 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( old_stream = new_stream; } // iterate over coils + cudaStreamSynchronize(old_stream); + cudaStreamDestroy(old_stream); freeTotalDeviceMemory(data_d, imdata_d, NULL); this->freeDeviceMemory(); if ((cudaDeviceSynchronize() != cudaSuccess)) fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n", cudaGetErrorString(cudaGetLastError())); - cudaStreamDestroy(old_stream); } gpuNUFFT::Array From 3d9809c79f96f1860c63f703fc839c9b330f5006 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Fri, 2 Aug 2024 20:56:46 +0200 Subject: [PATCH 83/85] WIP debug --- CUDA/CMakeLists.txt | 4 +- CUDA/inc/cuda_utils.hpp | 2 +- CUDA/src/balanced_gpuNUFFT_operator.cpp | 8 +-- .../gpuNUFFT_operator_python_factory.cpp | 9 ++- CUDA/src/gpuNUFFT_operator.cpp | 65 ++++++++++--------- setup.py | 2 +- 6 files changed, 49 insertions(+), 41 deletions(-) diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt index 71260d30..3235a868 100644 --- a/CUDA/CMakeLists.txt +++ b/CUDA/CMakeLists.txt @@ -70,8 +70,8 @@ endif(GEN_MEX_FILES) #Options #General DEBUG output -SET (DEBUG false) -OPTION(WITH_DEBUG "Enable DEBUG messages" OFF) +SET (DEBUG true) +OPTION(WITH_DEBUG "Enable DEBUG messages" ON) if (WITH_DEBUG) SET (DEBUG true) endif() diff --git a/CUDA/inc/cuda_utils.hpp b/CUDA/inc/cuda_utils.hpp index 8893383c..ee1c76d9 100644 --- a/CUDA/inc/cuda_utils.hpp +++ b/CUDA/inc/cuda_utils.hpp @@ -220,7 +220,7 @@ inline void showMemoryInfo(bool force, FILE *stream) size_t total_mem = 0; cudaMemGetInfo(&free_mem, &total_mem); if (DEBUG || force) - fprintf(stream, "memory usage, free: %lu total: %lu\n", (SizeType)(free_mem), + printf("memory usage, free: %lu total: %lu\n", (SizeType)(free_mem), (SizeType)(total_mem)); } diff --git a/CUDA/src/balanced_gpuNUFFT_operator.cpp b/CUDA/src/balanced_gpuNUFFT_operator.cpp index 3e621044..613ee1c8 100644 --- a/CUDA/src/balanced_gpuNUFFT_operator.cpp +++ b/CUDA/src/balanced_gpuNUFFT_operator.cpp @@ -99,12 +99,12 @@ void gpuNUFFT::BalancedGpuNUFFTOperator::performForwardGpuNUFFT( printf( "BGpuNUFFT: allocate and copy sector processing order of size %d...\n", this->sectorProcessingOrder.count()); - allocateAndCopyToDeviceMem(§or_processing_order_d, - this->sectorProcessingOrder.data, - this->sectorProcessingOrder.count()); + //allocateAndCopyToDeviceMem(§or_processing_order_d, + // this->sectorProcessingOrder.data, + // this->sectorProcessingOrder.count()); GpuNUFFTOperator::performForwardGpuNUFFT(imgData, kspaceData, gpuNUFFTOut); - freeTotalDeviceMemory(sector_processing_order_d, NULL); // NULL as stop token +// freeTotalDeviceMemory(sector_processing_order_d, NULL); // NULL as stop token } diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 2b3664f2..628538b7 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -222,7 +222,13 @@ class GpuNUFFTPythonOperator if(interpolate_data) gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu, gpuNUFFT::DENSITY_ESTIMATION); else - gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu); + { + for(long int i=0; i<100000; i++) + { + printf("i = %ld\n", i); + gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu); + } + } cudaDeviceSynchronize(); } @@ -442,6 +448,7 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { + printf("Destructor called\n"); delete gpuNUFFTOp; } }; diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 05387b74..78b22786 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -287,11 +287,11 @@ void gpuNUFFT::GpuNUFFTOperator::initDeviceMemory(int n_coils, int n_coils_cc) printf("creating cufft plan with %d,%d,%d dimensions\n", DEFAULT_VALUE(gi_host->gridDims.z), gi_host->gridDims.y, gi_host->gridDims.x); - cufftResult res = cufftPlan3d( - &fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z), - (int)gi_host->gridDims.y, (int)gi_host->gridDims.x, CufftTransformType); - if (res != CUFFT_SUCCESS) - fprintf(stderr, "error on CUFFT Plan creation!!! %d\n", res); + // cufftResult res = cufftPlan3d( + // &fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z), + // (int)gi_host->gridDims.y, (int)gi_host->gridDims.x, CufftTransformType); + // if (res != CUFFT_SUCCESS) + // fprintf(stderr, "error on CUFFT Plan creation!!! %d\n", res); gpuMemAllocated = true; } @@ -301,7 +301,9 @@ void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory() return; cudaFreeHost(gi_host); - cufftDestroy(fft_plan); + cufftResult res = cufftDestroy(fft_plan); + if (res != CUFFT_SUCCESS) + fprintf(stderr, "error on CUFFT Plan destruction!!! %d\n", res); // Destroy the cuFFT plan. if (DEBUG && (cudaDeviceSynchronize() != cudaSuccess)) printf("error at thread synchronization 9: %s\n", @@ -312,13 +314,13 @@ void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory() sectors_d, sector_centers_d, NULL); // NULL as stop if (deapo_d != NULL) - cudaFree(deapo_d); + freeDeviceMem((void *)deapo_d); if (this->applySensData()) - cudaFree(sens_d); + freeDeviceMem((void *)sens_d); if (this->applyDensComp()) - cudaFree(density_comp_d); + freeDeviceMem((void *)density_comp_d); showMemoryInfo(); gpuMemAllocated = false; @@ -934,7 +936,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (DEBUG) printf("allocate and copy imdata of size %d...\n", imdata_count * n_coils_cc); - allocateDeviceMem(&imdata_d, imdata_count * n_coils_cc); + //allocateDeviceMem(&imdata_d, imdata_count * n_coils_cc); if (debugTiming) printf("Memory allocation: %.2f ms\n", stopTiming()); @@ -948,9 +950,9 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( unsigned long int data_coil_offset = (long int)coil_it * data_count; unsigned long int im_coil_offset = coil_it * (long int)imdata_count; - data_d = kspaceData_gpu.data + data_coil_offset; + //data_d = kspaceData_gpu.data + data_coil_offset; - this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc); +// this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc); if (this->applySensData()) // perform automatically "repeating" of input image in case @@ -959,13 +961,13 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( copyDeviceToDeviceAsync(imgData_gpu.data, imdata_d + cnt * imdata_count, imdata_count, new_stream); else - copyDeviceToDeviceAsync(imgData_gpu.data + im_coil_offset, imdata_d, - imdata_count * n_coils_cc, new_stream); +// copyDeviceToDeviceAsync(imgData_gpu.data + im_coil_offset, imdata_d, + // imdata_count * n_coils_cc, new_stream); // reset temp arrays - cudaMemsetAsync(gdata_d, 0, - sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream); - cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream); +// cudaMemsetAsync(gdata_d, 0, + // sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream); + //cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream); if (DEBUG && (cudaStreamSynchronize(new_stream)!= cudaSuccess)) printf("error at thread synchronization 1: %s\n", @@ -1000,12 +1002,12 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( return; } // apodization Correction - performForwardDeapodization(imdata_d, deapo_d, gi_host); + //performForwardDeapodization(imdata_d, deapo_d, gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 2: %s\n", cudaGetErrorString(cudaGetLastError())); // resize by oversampling factor and zero pad - performPadding(imdata_d, gdata_d, gi_host); + //performPadding(imdata_d, gdata_d, gi_host); if (debugTiming) startTiming(); @@ -1014,7 +1016,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( printf("error at thread synchronization 3: %s\n", cudaGetErrorString(cudaGetLastError())); // shift image to get correct zero frequency position - performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host); + //performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 4: %s\n", @@ -1024,9 +1026,9 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( int c = 0; while (c < n_coils_cc) { - if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count, - gdata_d + c * gi_host->gridDims_count, - grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS) + // if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count, + // gdata_d + c * gi_host->gridDims_count, + // grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS) { fprintf(stderr, "cufft has failed with err %i \n", err); showMemoryInfo(true, stderr); @@ -1037,7 +1039,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 5: %s\n", cudaGetErrorString(cudaGetLastError())); - performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host); + //performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 6: %s\n", @@ -1050,8 +1052,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( startTiming(); // convolution and resampling to non-standard trajectory - forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d, - sector_centers_d, gi_host); + //forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d, + // sector_centers_d, gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 7: %s\n", cudaGetErrorString(cudaGetLastError())); @@ -1059,28 +1061,27 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (debugTiming) printf("Forward Convolution: %.2f ms\n", stopTiming()); - performFFTScaling(data_d, gi_host->data_count, gi_host); + // performFFTScaling(data_d, gi_host->data_count, gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error: at thread synchronization 8: %s\n", cudaGetErrorString(cudaGetLastError())); // write result in correct order back into output array - writeOrderedGPU(data_sorted_d, data_indices_d, data_d, - (int)this->kSpaceTraj.count(), n_coils_cc); + // writeOrderedGPU(data_sorted_d, data_indices_d, data_d, + // (int)this->kSpaceTraj.count(), n_coils_cc); if(coil_it > 1) { cudaStreamSynchronize(old_stream); cudaStreamDestroy(old_stream); } - copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream); + // copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream); old_stream = new_stream; } // iterate over coils cudaStreamSynchronize(old_stream); cudaStreamDestroy(old_stream); - freeTotalDeviceMemory(imdata_d, NULL); + // freeTotalDeviceMemory(imdata_d, NULL); this->freeDeviceMemory(); - if ((cudaDeviceSynchronize() != cudaSuccess)) fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n", cudaGetErrorString(cudaGetLastError())); diff --git a/setup.py b/setup.py index 6f0dd605..e12094c9 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def build_extension(self, ext): "-DGEN_PYTHON_FILES=ON", "-DGEN_MEX_FILES=OFF", "-DPYBIND11_INCLUDE_DIR=" + self.pybind_path] - cfg = "Debug" if self.debug else "Release" + cfg = "Debug"# if self.debug else "Release" build_args = ["--config", cfg] if platform.system() == "Windows": From 0dc96874a0cd0acde99b22dbbc3dfdd6cf9c5e4c Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 5 Aug 2024 10:05:31 +0200 Subject: [PATCH 84/85] A bunch of fixes to support CUDA12.0 --- CUDA/CMakeLists.txt | 8 +- .../balanced_texture_gpuNUFFT_operator.hpp | 91 -- CUDA/inc/cuda_utils.cuh | 64 - CUDA/inc/cuda_utils.hpp | 33 - CUDA/inc/gpuNUFFT_kernels.hpp | 149 --- CUDA/inc/gpuNUFFT_operator_factory.hpp | 16 +- CUDA/inc/texture_gpuNUFFT_operator.hpp | 70 - CUDA/src/CMakeLists.txt | 4 +- CUDA/src/balanced_gpuNUFFT_operator.cpp | 8 +- .../balanced_texture_gpuNUFFT_operator.cpp | 126 -- CUDA/src/gpu/atomic/CMakeLists.txt | 1 - CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu | 1 - .../gpu/atomic/texture_gpuNUFFT_kernels.cu | 1153 ----------------- .../gpuNUFFT_operator_python_factory.cpp | 9 +- CUDA/src/gpu/std_gpuNUFFT_kernels.cu | 89 -- CUDA/src/gpuNUFFT_operator.cpp | 73 +- CUDA/src/gpuNUFFT_operator_factory.cpp | 50 +- CUDA/src/texture_gpuNUFFT_operator.cpp | 103 -- setup.py | 4 +- 19 files changed, 66 insertions(+), 1986 deletions(-) delete mode 100644 CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp delete mode 100644 CUDA/inc/texture_gpuNUFFT_operator.hpp delete mode 100644 CUDA/src/balanced_texture_gpuNUFFT_operator.cpp delete mode 100644 CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu delete mode 100644 CUDA/src/texture_gpuNUFFT_operator.cpp diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt index 3235a868..10958e50 100644 --- a/CUDA/CMakeLists.txt +++ b/CUDA/CMakeLists.txt @@ -70,8 +70,8 @@ endif(GEN_MEX_FILES) #Options #General DEBUG output -SET (DEBUG true) -OPTION(WITH_DEBUG "Enable DEBUG messages" ON) +SET (DEBUG false) +OPTION(WITH_DEBUG "Enable DEBUG messages" OFF) if (WITH_DEBUG) SET (DEBUG true) endif() @@ -163,10 +163,8 @@ SET(GPUNUFFT_INCLUDE ${GPUNUFFT_INC_DIR}/cuda_utils.hpp ${GPUNUFFT_INC_DIR}/precomp_utils.hpp ${GPUNUFFT_INC_DIR}/gpuNUFFT_operator.hpp ${GPUNUFFT_INC_DIR}/balanced_operator.hpp - ${GPUNUFFT_INC_DIR}/texture_gpuNUFFT_operator.hpp ${GPUNUFFT_INC_DIR}/balanced_gpuNUFFT_operator.hpp - ${GPUNUFFT_INC_DIR}/gpuNUFFT_operator_factory.hpp - ${GPUNUFFT_INC_DIR}/balanced_texture_gpuNUFFT_operator.hpp) + ${GPUNUFFT_INC_DIR}/gpuNUFFT_operator_factory.hpp) SET(MATLAB_HELPER_INCLUDE ${GPUNUFFT_INC_DIR}/matlab_helper.h) SET(CONFIG_INCLUDE ${GPUNUFFT_INC_DIR}/config.hpp ${GPUNUFFT_INC_DIR}/cufft_config.hpp) diff --git a/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp b/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp deleted file mode 100644 index d7672f73..00000000 --- a/CUDA/inc/balanced_texture_gpuNUFFT_operator.hpp +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef BALANCED_TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED -#define BALANCED_TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED - -#include "gpuNUFFT_types.hpp" -#include "texture_gpuNUFFT_operator.hpp" -#include "balanced_operator.hpp" - -namespace gpuNUFFT -{ -/** - * \brief GpuNUFFTOperator with load balancing and texture memory lookup - * - * Changes the behaviour of the default GpuNUFFTOperator by balancing the - * work load by sector to a maximum amount of samples per sector - *(MAXIMUM_PAYLOAD). - * Thus, sectors with a high density of data points are split into multiple - *ones, - * which are processed in parallel. - * - * Furthermore, the kernel interpolation is performed by using gpu texture - *memory. - * - */ -class BalancedTextureGpuNUFFTOperator : public TextureGpuNUFFTOperator, - public BalancedOperator -{ - public: - BalancedTextureGpuNUFFTOperator(IndType kernelWidth, IndType sectorWidth, - DType osf, Dimensions imgDims, - InterpolationType interpolationType = TEXTURE2D_LOOKUP, - bool matlabSharedMem = false) - : TextureGpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims, - interpolationType, matlabSharedMem) - { - } - - ~BalancedTextureGpuNUFFTOperator() - { - freeLocalMemberArray(this->sectorProcessingOrder.data); - } - - // OPERATIONS - void performGpuNUFFTAdj(Array kspaceData, Array &imgData, - GpuNUFFTOutput gpuNUFFTOut = DEAPODIZATION); - void performGpuNUFFTAdj(GpuArray kspaceData_gpu, - GpuArray &imgData_gpu, - GpuNUFFTOutput gpuNUFFTOut = DEAPODIZATION); - - void performForwardGpuNUFFT(Array imgData, - Array &kspaceData, - GpuNUFFTOutput gpuNUFFTOut = DEAPODIZATION); - void performForwardGpuNUFFT(GpuArray imgData_gpu, - GpuArray &kspaceData, - GpuNUFFTOutput gpuNUFFTOut = DEAPODIZATION); - - // Getter and Setter for Processing Order - Array getSectorProcessingOrder() - { - return this->sectorProcessingOrder; - } - void setSectorProcessingOrder(Array sectorProcessingOrder) - { - this->sectorProcessingOrder = sectorProcessingOrder; - } - - OperatorType getType() - { - return gpuNUFFT::BALANCED_TEXTURE; - } - // OPERATIONS - private: - GpuNUFFTInfo *initAndCopyGpuNUFFTInfo(int n_coils_cc = 1); - - // sectorProcessingOrder - Array sectorProcessingOrder; - - IndType2 *sector_processing_order_d; - - void adjConvolution(DType2 *data_d, DType *crds_d, CufftType *gdata_d, - DType *kernel_d, IndType *sectors_d, - IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host); - - void forwardConvolution(CufftType *data_d, DType *crds_d, CufftType *gdata_d, - DType *kernel_d, IndType *sectors_d, - IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host); -}; -} - -#endif // BALANCED_TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED diff --git a/CUDA/inc/cuda_utils.cuh b/CUDA/inc/cuda_utils.cuh index 9a522e78..f57b44d6 100644 --- a/CUDA/inc/cuda_utils.cuh +++ b/CUDA/inc/cuda_utils.cuh @@ -7,70 +7,6 @@ __constant__ gpuNUFFT::GpuNUFFTInfo GI; __constant__ DType KERNEL[10000]; -texture texKERNEL; -texture texKERNEL2D; -texture texKERNEL3D; - -texture texDATA; -texture texGDATA; - -__inline__ __device__ float compute1DTextureLookup(float x, float y) -{ - return tex1D(texKERNEL, x) * tex1D(texKERNEL, y); -} - -__inline__ __device__ float compute1DTextureLookup(float x, float y, float z) -{ - return tex1D(texKERNEL, x) * tex1D(texKERNEL, y) * tex1D(texKERNEL, z); -} - -__inline__ __device__ float compute2DTextureLookup(float x, float y) -{ - return (float)tex2D(texKERNEL2D, (float)x, (float)y); -} - -__inline__ __device__ float compute2DTextureLookup(float x, float y, float z) -{ - return (float)tex2D(texKERNEL2D, (float)x, (float)y) * - tex2D(texKERNEL2D, (float)z, 0); -} - -__inline__ __device__ float compute3DTextureLookup(float x, float y) -{ - return tex3D(texKERNEL3D, x, y, 0); -} - -__inline__ __device__ float compute3DTextureLookup(float x, float y, float z) -{ - return tex3D(texKERNEL3D, x, y, z); -} - -__inline__ __device__ float computeTextureLookup(float x, float y) -{ - // wired to 2d - return compute2DTextureLookup((float)x, (float)y); - // switch(GI.interpolationType) - //{ - // case 1: return compute1DTextureLookup(x,y); - // case 2: return compute2DTextureLookup(x,y); - // case 3: return compute3DTextureLookup(x,y); - // default: return (float)0.0; - //} -} - -__inline__ __device__ float computeTextureLookup(float x, float y, float z) -{ - // wired to 2d - return compute2DTextureLookup(x, y, z); - // switch(GI.interpolationType) - //{ - // case 1: return compute1DTextureLookup(x,y,z); - // case 2: return compute2DTextureLookup(x,y,z); - // case 3: return compute3DTextureLookup(x,y,z); - // default: return (float)0.0; - //} -} - #if __CUDA_ARCH__ < 200 #define THREAD_BLOCK_SIZE 256 #else diff --git a/CUDA/inc/cuda_utils.hpp b/CUDA/inc/cuda_utils.hpp index ee1c76d9..23e76bc7 100644 --- a/CUDA/inc/cuda_utils.hpp +++ b/CUDA/inc/cuda_utils.hpp @@ -253,37 +253,4 @@ inline void showMemoryInfo() */ void initConstSymbol(const char *symbol, const void *src, IndType count, cudaStream_t stream=0); -/** \brief Initialize texture memory on device - * - * CUDA Kernel function prototype. - * - * @param symbol Texture symbol name - */ -void initTexture(const char *symbol, cudaArray **devicePtr, - gpuNUFFT::Array hostTexture); - -/** \brief Bind to 1-d texture on device - * - * CUDA Kernel function prototype. - * - * @param symbol Texture symbol name - */ -void bindTo1DTexture(const char *symbol, void *devicePtr, IndType count); - -/** \brief Unbind from device texture - * - * CUDA Kernel function prototype. - * - * @param symbol Texture symbol name - */ -void unbindTexture(const char *symbol); - -/** \brief Free texture memory on device - * - * CUDA Kernel function prototype. - * - * @param symbol Texture symbol name - */ -void freeTexture(const char *symbol, cudaArray *devicePtr); - #endif diff --git a/CUDA/inc/gpuNUFFT_kernels.hpp b/CUDA/inc/gpuNUFFT_kernels.hpp index cd4861dd..48439259 100644 --- a/CUDA/inc/gpuNUFFT_kernels.hpp +++ b/CUDA/inc/gpuNUFFT_kernels.hpp @@ -80,80 +80,6 @@ void performConvolution(DType2 *data_d, DType *crds_d, CufftType *gdata_d, IndType *sector_centers_d, gpuNUFFT::GpuNUFFTInfo *gi_host); -/** - * \brief Adjoint gridding convolution implementation on GPU using textures for - *kernel lookup. - * - * Performs the adjoint gridding convolution step on the GPU, thus the - *interpolation - * from non-uniform sampled k-space data onto the uniform oversampled grid. - * - * The distance from each sample to its neighboring grid positions is computed - *and the corresponding - * data value is weighted by the kernel function according to the distance. - * - * The kernel lookup is performed by the use of gpu textures. - * - * CUDA function prototype. - * - * - * @param data_d Input k-space sample data value, complex, sorted due - *to precomputation - * @param crds_d k-space sample coordinate (non-cartesian), - *linearized array (x1,x2,x3,...,xn,y1,y2,y3,...,yn,z1,z2,z3,...zn) - * @param gdata_d Outpu k-space grid (cartesian) - * @param kernel_d precomputed interpolation kernel - * @param sectors_d precomputed data-sector mapping, defines the range - *of data elements per sector, e.g. 0,3,4,4,10 -> maps data points 0..3 to - *sector id 0, 3..4 to sector 1, no data point to sector 2, 4..10 to sector 3 an - *so on - * @param sector_processing_order_d precomputed sector processing order - * @param sector_centers_d precomputed coordinates (x,y,z) of sector centers - * @param gi_host info struct with meta information - */ -void performTextureConvolution(DType2 *data_d, DType *crds_d, - CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host); - -/** - * \brief Adjoint gridding convolution implementation on GPU using textures and - *sector load balancing. - * - * Performs the adjoint gridding convolution step on the GPU, thus the - *interpolation - * from non-uniform sampled k-space data onto the uniform oversampled grid. - * - * The distance from each sample to its neighboring grid positions is computed - *and the corresponding - * data value is weighted by the kernel function according to the distance. - * - * The kernel lookup is performed by the use of gpu textures and the workload is - *balanced. - * - * CUDA function prototype. - * - * @param data_d Input k-space sample data value, complex, sorted due - *to precomputation - * @param crds_d k-space sample coordinate (non-cartesian), - *linearized array (x1,x2,x3,...,xn,y1,y2,y3,...,yn,z1,z2,z3,...zn) - * @param gdata_d Output k-space grid (cartesian) - * @param kernel_d precomputed interpolation kernel - * @param sectors_d precomputed data-sector mapping, defines the range - *of data elements per sector, e.g. 0,3,4,4,10 -> maps data points 0..3 to - *sector id 0, 3..4 to sector 1, no data point to sector 2, 4..10 to sector 3 an - *so on - * @param sector_processing_order_d precomputed sector processing order - * @param sector_centers_d precomputed coordinates (x,y,z) of sector centers - * @param gi_host info struct with meta information - */ -void performTextureConvolution(DType2 *data_d, DType *crds_d, - CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, - IndType2 *sector_processing_order_d, - IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host); - // FORWARD Operations /** @@ -228,81 +154,6 @@ void performForwardConvolution(CufftType *data_d, DType *crds_d, IndType *sector_centers_d, gpuNUFFT::GpuNUFFTInfo *gi_host); -/** - * \brief Forward gridding convolution implementation on GPU using textures . - * - * Performs the forward gridding convolution step on the GPU, thus the - *interpolation - * from uniform oversampled grid positions to non-uniform sampled k-space data - *points. - * - * The distance from each sample to its neighboring grid positions is computed - *and the corresponding - * data value is weighted by the kernel function according to the distance. - * - * The kernel lookup is performed by the use of gpu textures. - * - * CUDA function prototype. - * - * @param data_d Output k-space sample data value, complex, sorted - *due to precomputation - * @param crds_d k-space sample coordinate (non-cartesian), - *linearized array (x1,x2,x3,...,xn,y1,y2,y3,...,yn,z1,z2,z3,...zn) - * @param gdata_d Input k-space grid (cartesian) - * @param kernel_d precomputed interpolation kernel - * @param sectors_d precomputed data-sector mapping, defines the range - *of data elements per sector, e.g. 0,3,4,4,10 -> maps data points 0..3 to - *sector id 0, 3..4 to sector 1, no data point to sector 2, 4..10 to sector 3 an - *so on - * @param sector_centers_d precomputed coordinates (x,y,z) of sector centers - * @param gi_host info struct with meta information - */ -void performTextureForwardConvolution(CufftType *data_d, DType *crds_d, - CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, - IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host); - -/** - * \brief Forward gridding convolution implementation on GPU using sector load - *balancing and textures. - * - * Performs the forward gridding convolution step on the GPU, thus the - *interpolation - * from uniform oversampled grid positions to non-uniform sampled k-space data - *points. - * - * The distance from each sample to its neighboring grid positions is computed - *and the corresponding - * data value is weighted by the kernel function according to the distance. - * - * The kernel lookup is performed by the use of gpu textures. - * In order to balance the work load per thread block a sector processing order - *is precomputed. - * - * CUDA function prototype. - * - * @param data_d Output k-space sample data value, complex, sorted - *due to precomputation - * @param crds_d k-space sample coordinate (non-cartesian), - *linearized array (x1,x2,x3,...,xn,y1,y2,y3,...,yn,z1,z2,z3,...zn) - * @param gdata_d Input k-space grid (cartesian) - * @param kernel_d precomputed interpolation kernel - * @param sectors_d precomputed data-sector mapping, defines the range - *of data elements per sector, e.g. 0,3,4,4,10 -> maps data points 0..3 to - *sector id 0, 3..4 to sector 1, no data point to sector 2, 4..10 to sector 3 an - *so on - * @param sector_processing_order_d precomputed sector processing order - * @param sector_centers_d precomputed coordinates (x,y,z) of sector centers - * @param gi_host info struct with meta information - */ -void performTextureForwardConvolution(CufftType *data_d, DType *crds_d, - CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, - IndType2 *sector_processing_order_d, - IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host); - // UTIL Functions /** \brief Scale each element by the total number of elements. * diff --git a/CUDA/inc/gpuNUFFT_operator_factory.hpp b/CUDA/inc/gpuNUFFT_operator_factory.hpp index 1c4bd992..24b937e7 100644 --- a/CUDA/inc/gpuNUFFT_operator_factory.hpp +++ b/CUDA/inc/gpuNUFFT_operator_factory.hpp @@ -4,8 +4,6 @@ #include "config.hpp" #include "gpuNUFFT_operator.hpp" #include "balanced_gpuNUFFT_operator.hpp" -#include "texture_gpuNUFFT_operator.hpp" -#include "balanced_texture_gpuNUFFT_operator.hpp" #include // std::sort #include // std::vector #include @@ -27,8 +25,6 @@ namespace gpuNUFFT * operator like from subsequent matlab calls * * The factory defines how the operator is going to process (load balancing - *and/or - * texture interpolation). * * Sector mapping: * @@ -51,13 +47,12 @@ class GpuNUFFTOperatorFactory /** \brief Constructor overload * - * @param useTextures Flag to indicate texture interpolation * @param useGpu Flag to indicat&GpuNUFFTPythonOperator::adj_op);e gpu usage for precomputation * @param balanceWorkload Flag to indicate load balancing */ - GpuNUFFTOperatorFactory(const bool useTextures = false, const bool useGpu = true, + GpuNUFFTOperatorFactory(const bool useGpu = true, bool balanceWorkload = true, bool matlabSharedMem = false) - : useTextures(useTextures), useGpu(useGpu), balanceWorkload(balanceWorkload), + : useGpu(useGpu), balanceWorkload(balanceWorkload), matlabSharedMem(matlabSharedMem) { } @@ -171,8 +166,6 @@ class GpuNUFFTOperatorFactory Array &deapoData, const IndType &kernelWidth, const IndType §orWidth, const DType &osf, Dimensions &imgDims); - void setUseTextures(bool useTextures); - void setBalanceWorkload(bool balanceWorkload); /** @@ -298,8 +291,6 @@ class GpuNUFFTOperatorFactory * * - default: GpuNUFFTOperator * - balanceWorkload = true: BalancedGpuNUFFTOperator - * - useTextures = true: TextureGpuNUFFTOperator - * - balanceWorkload + useTextures = true: BalancedTextureGpuNUFFTOperator * * @return New allocated GpuNUFFTOperator or sub class */ @@ -328,9 +319,6 @@ class GpuNUFFTOperatorFactory private: - /** \brief Flag to indicate texture interpolation */ - bool useTextures; - /** \brief Flag to indicate gpu usage for precomputation */ bool useGpu; diff --git a/CUDA/inc/texture_gpuNUFFT_operator.hpp b/CUDA/inc/texture_gpuNUFFT_operator.hpp deleted file mode 100644 index 5d1bca98..00000000 --- a/CUDA/inc/texture_gpuNUFFT_operator.hpp +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED -#define TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED - -#include -#include -#include "gpuNUFFT_types.hpp" -#include "gpuNUFFT_operator.hpp" - -namespace gpuNUFFT -{ -/** -* \brief GpuNUFFTOperator with texture memory lookup -* -* Changes the behaviour of the default GpuNUFFTOperator by using gpu texture -*memory -* in the kernel interpolation step. -* -*/ -class TextureGpuNUFFTOperator : public GpuNUFFTOperator -{ - public: - TextureGpuNUFFTOperator(IndType kernelWidth, IndType sectorWidth, DType osf, - Dimensions imgDims, - InterpolationType interpolationType = TEXTURE2D_LOOKUP, - bool matlabSharedMem = false) - : GpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims, false, TEXTURE, matlabSharedMem), - interpolationType(interpolationType), kernel_d(NULL) - { - if (typeid(DType) == typeid(double)) - throw std::runtime_error( - "Double precision textures are not supported yet!"); - - initKernel(); - } - - ~TextureGpuNUFFTOperator() - { - } - - virtual OperatorType getType() - { - return gpuNUFFT::TEXTURE; - } - - protected: - void initKernel(); - - cudaArray *kernel_d; - InterpolationType interpolationType; - const char *getInterpolationTypeName(); - - // OPERATIONS - private: - GpuNUFFTInfo *initAndCopyGpuNUFFTInfo(int n_coils_cc = 1); - - virtual void adjConvolution(DType2 *data_d, DType *crds_d, CufftType *gdata_d, - DType *kernel_d, IndType *sectors_d, - IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host); - virtual void forwardConvolution(CufftType *data_d, DType *crds_d, - CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host); - - void initLookupTable(); - void freeLookupTable(); -}; -} - -#endif // TEXTURE_GPUNUFFT_OPERATOR_H_INCLUDED diff --git a/CUDA/src/CMakeLists.txt b/CUDA/src/CMakeLists.txt index b3b9a21b..8fb4f83c 100644 --- a/CUDA/src/CMakeLists.txt +++ b/CUDA/src/CMakeLists.txt @@ -4,9 +4,7 @@ SET(GPUNUFFT_SRC_DIR ${CMAKE_SOURCE_DIR}/src) SET(GPUNUFFT_SOURCES ${GPUNUFFT_SRC_DIR}/gpuNUFFT_utils.cpp ${GPUNUFFT_SRC_DIR}/gpuNUFFT_operator_factory.cpp ${GPUNUFFT_SRC_DIR}/gpuNUFFT_operator.cpp - ${GPUNUFFT_SRC_DIR}/texture_gpuNUFFT_operator.cpp - ${GPUNUFFT_SRC_DIR}/balanced_gpuNUFFT_operator.cpp - ${GPUNUFFT_SRC_DIR}/balanced_texture_gpuNUFFT_operator.cpp) + ${GPUNUFFT_SRC_DIR}/balanced_gpuNUFFT_operator.cpp) ADD_SUBDIRECTORY(gpu) diff --git a/CUDA/src/balanced_gpuNUFFT_operator.cpp b/CUDA/src/balanced_gpuNUFFT_operator.cpp index 613ee1c8..3e621044 100644 --- a/CUDA/src/balanced_gpuNUFFT_operator.cpp +++ b/CUDA/src/balanced_gpuNUFFT_operator.cpp @@ -99,12 +99,12 @@ void gpuNUFFT::BalancedGpuNUFFTOperator::performForwardGpuNUFFT( printf( "BGpuNUFFT: allocate and copy sector processing order of size %d...\n", this->sectorProcessingOrder.count()); - //allocateAndCopyToDeviceMem(§or_processing_order_d, - // this->sectorProcessingOrder.data, - // this->sectorProcessingOrder.count()); + allocateAndCopyToDeviceMem(§or_processing_order_d, + this->sectorProcessingOrder.data, + this->sectorProcessingOrder.count()); GpuNUFFTOperator::performForwardGpuNUFFT(imgData, kspaceData, gpuNUFFTOut); -// freeTotalDeviceMemory(sector_processing_order_d, NULL); // NULL as stop token + freeTotalDeviceMemory(sector_processing_order_d, NULL); // NULL as stop token } diff --git a/CUDA/src/balanced_texture_gpuNUFFT_operator.cpp b/CUDA/src/balanced_texture_gpuNUFFT_operator.cpp deleted file mode 100644 index 1ad519c6..00000000 --- a/CUDA/src/balanced_texture_gpuNUFFT_operator.cpp +++ /dev/null @@ -1,126 +0,0 @@ -#include "balanced_texture_gpuNUFFT_operator.hpp" - -gpuNUFFT::GpuNUFFTInfo * -gpuNUFFT::BalancedTextureGpuNUFFTOperator::initAndCopyGpuNUFFTInfo( - int n_coils_cc) -{ - gpuNUFFT::GpuNUFFTInfo *gi_host = initGpuNUFFTInfo(n_coils_cc); - - gi_host->sectorsToProcess = sectorProcessingOrder.count(); - gi_host->interpolationType = interpolationType; - - if (DEBUG) - printf("copy GpuNUFFT Info to symbol memory... size = %lu \n", - (SizeType)sizeof(gpuNUFFT::GpuNUFFTInfo)); - - initConstSymbol("GI", gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo)); - - if (DEBUG) - printf("...done!\n"); - return gi_host; -} - -void gpuNUFFT::BalancedTextureGpuNUFFTOperator::adjConvolution( - DType2 *data_d, DType *crds_d, CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host) -{ - bindTo1DTexture("texDATA", data_d, - this->kSpaceTraj.count() * gi_host->n_coils_cc); - - // call balanced texture kernel - performTextureConvolution(data_d, crds_d, gdata_d, kernel_d, sectors_d, - sector_processing_order_d, sector_centers_d, - gi_host); - - unbindTexture("texDATA"); -} - -void gpuNUFFT::BalancedTextureGpuNUFFTOperator::forwardConvolution( - CufftType *data_d, DType *crds_d, CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host) -{ - bindTo1DTexture("texGDATA", gdata_d, - gi_host->grid_width_dim * gi_host->n_coils_cc); - - // call balanced texture kernel - performTextureForwardConvolution(data_d, crds_d, gdata_d, kernel_d, sectors_d, - sector_processing_order_d, sector_centers_d, - gi_host); - - unbindTexture("texGDATA"); -} - -// Adds behaviour of GpuNUFFTOperator by -// adding a sector processing order -void gpuNUFFT::BalancedTextureGpuNUFFTOperator::performGpuNUFFTAdj( - gpuNUFFT::Array kspaceData, gpuNUFFT::Array &imgData, - GpuNUFFTOutput gpuNUFFTOut) -{ - if (DEBUG) - printf( - "BTGpuNUFFT: allocate and copy sector processing order of size %d...\n", - this->sectorProcessingOrder.count()); - allocateAndCopyToDeviceMem(§or_processing_order_d, - this->sectorProcessingOrder.data, - this->sectorProcessingOrder.count()); - - TextureGpuNUFFTOperator::performGpuNUFFTAdj(kspaceData, imgData, gpuNUFFTOut); - - freeTotalDeviceMemory(sector_processing_order_d, NULL); // NULL as stop token -} - -void gpuNUFFT::BalancedTextureGpuNUFFTOperator::performGpuNUFFTAdj( - GpuArray kspaceData_gpu, GpuArray &imgData_gpu, - GpuNUFFTOutput gpuNUFFTOut) -{ - if (DEBUG) - printf( - "BTGpuNUFFT: allocate and copy sector processing order of size %d...\n", - this->sectorProcessingOrder.count()); - allocateAndCopyToDeviceMem(§or_processing_order_d, - this->sectorProcessingOrder.data, - this->sectorProcessingOrder.count()); - - TextureGpuNUFFTOperator::performGpuNUFFTAdj(kspaceData_gpu, imgData_gpu, - gpuNUFFTOut); - - freeTotalDeviceMemory(sector_processing_order_d, NULL); // NULL as stop token -} - -void gpuNUFFT::BalancedTextureGpuNUFFTOperator::performForwardGpuNUFFT( - gpuNUFFT::Array imgData, gpuNUFFT::Array &kspaceData, - GpuNUFFTOutput gpuNUFFTOut) -{ - if (DEBUG) - printf( - "BTGpuNUFFT: allocate and copy sector processing order of size %d...\n", - this->sectorProcessingOrder.count()); - allocateAndCopyToDeviceMem(§or_processing_order_d, - this->sectorProcessingOrder.data, - this->sectorProcessingOrder.count()); - - TextureGpuNUFFTOperator::performForwardGpuNUFFT(imgData, kspaceData, - gpuNUFFTOut); - - freeTotalDeviceMemory(sector_processing_order_d, NULL); // NULL as stop token -} - -void gpuNUFFT::BalancedTextureGpuNUFFTOperator::performForwardGpuNUFFT( - gpuNUFFT::GpuArray imgData, - gpuNUFFT::GpuArray &kspaceData, GpuNUFFTOutput gpuNUFFTOut) -{ - if (DEBUG) - printf( - "BTGpuNUFFT: allocate and copy sector processing order of size %d...\n", - this->sectorProcessingOrder.count()); - allocateAndCopyToDeviceMem(§or_processing_order_d, - this->sectorProcessingOrder.data, - this->sectorProcessingOrder.count()); - - TextureGpuNUFFTOperator::performForwardGpuNUFFT(imgData, kspaceData, - gpuNUFFTOut); - - freeTotalDeviceMemory(sector_processing_order_d, NULL); // NULL as stop token -} diff --git a/CUDA/src/gpu/atomic/CMakeLists.txt b/CUDA/src/gpu/atomic/CMakeLists.txt index 061b9c12..3dd73742 100644 --- a/CUDA/src/gpu/atomic/CMakeLists.txt +++ b/CUDA/src/gpu/atomic/CMakeLists.txt @@ -7,7 +7,6 @@ cuda_include_directories(${GPUNUFFT_INC_DIR}) set(GPU_CU_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/atomic_gpuNUFFT.cu #${CMAKE_CURRENT_SOURCE_DIR}/atomic_gpuNUFFT_kernels.cu - #${CMAKE_CURRENT_SOURCE_DIR}/texture_gpuNUFFT_kernels.cu #${CMAKE_CURRENT_SOURCE_DIR}/../std_gpuNUFFT_kernels.cu ) if(WIN32) diff --git a/CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu b/CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu index bf2e9622..87df6de0 100644 --- a/CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu +++ b/CUDA/src/gpu/atomic/atomic_gpuNUFFT.cu @@ -1,2 +1 @@ #include "atomic_gpuNUFFT_kernels.cu" -#include "texture_gpuNUFFT_kernels.cu" diff --git a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu b/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu deleted file mode 100644 index e143ddcb..00000000 --- a/CUDA/src/gpu/atomic/texture_gpuNUFFT_kernels.cu +++ /dev/null @@ -1,1153 +0,0 @@ -#ifndef TEXTURE_GPUNUFFT_KERNELS_H -#define TEXTURE_GPUNUFFT_KERNELS_H -#include "gpuNUFFT_kernels.hpp" -#include "../std_gpuNUFFT_kernels.cu" -#include "cuda_utils.cuh" - -// ---------------------------------------------------------------------------- -// convolutionKernel: NUFFT^H kernel -// -// Performs the gpuNUFFT step by convolution of sample points with -// interpolation function and resampling onto grid. Basic concept based on Zwart -// et al. -// -// parameters: -// * data : complex input sample points -// * crds : coordinates of data points (x,y,z) -// * gdata : output grid data -// * sectors : mapping of sample indices according to each sector -// * sector_centers : coordinates (x,y,z) of sector centers -// * temp_gdata : temporary grid data -// * N : number of threads -__device__ void textureConvolutionFunction(int *sec, int sec_max, - int sec_offset, DType2 *sdata, - DType2 *data, DType *crds, - CufftType *gdata, IndType *sectors, - IndType *sector_centers) -{ - // start convolution - int ind, x, y, z; - int imin, imax, jmin, jmax, kmin, kmax; - - DType dx_sqr, dy_sqr, dz_sqr, val, ix, jy, kz; - - __shared__ IndType3 center; - center.x = sector_centers[sec[threadIdx.x] * 3]; - center.y = sector_centers[sec[threadIdx.x] * 3 + 1]; - center.z = sector_centers[sec[threadIdx.x] * 3 + 2]; - - // Grid Points over Threads - int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset; - // loop over all data points of the current sector, and check if grid position - // lies inside - // affected region, if so, add data point weighted to grid position value - while (data_cnt < sec_max) - { - DType3 data_point; // datapoint per thread - data_point.x = crds[data_cnt]; - data_point.y = crds[data_cnt + GI.data_count]; - data_point.z = crds[data_cnt + 2 * GI.data_count]; - - // set the boundaries of final dataset for gpuNUFFT this point - ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x, - GI.sector_offset); - set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius); - jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y, - GI.sector_offset); - set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius); - kz = mapKSpaceToGrid(data_point.z, GI.gridDims.z, center.z, - GI.sector_offset); - set_minmax(&kz, &kmin, &kmax, GI.sector_pad_max, GI.kernel_radius); - - // grid this point onto its cartesian points neighbors - for (int k = kmin; k <= kmax; k++) - { - kz = mapGridToKSpace(k, GI.gridDims.z, center.z, GI.sector_offset); - dz_sqr = (kz - data_point.z) * GI.aniso_z_scale; - dz_sqr *= dz_sqr; - for (int j = jmin; j <= jmax; j++) - { - jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset); - dy_sqr = (jy - data_point.y) * GI.aniso_y_scale; - dy_sqr *= dy_sqr; - - for (int i = imin; i <= imax; i++) - { - ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset); - dx_sqr = (ix - data_point.x) * GI.aniso_x_scale; - dx_sqr *= dx_sqr; - // get kernel value - val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv, - dy_sqr * GI.radiusSquared_inv, - dz_sqr * GI.radiusSquared_inv); - - ind = getIndex(i, j, k, GI.sector_pad_width); - - // multiply data by current kernel val - // grid complex or scalar - atomicAdd(&(sdata[ind].x), - val * - tex1Dfetch(texDATA, data_cnt).x); - - atomicAdd(&(sdata[ind].y), - val * - tex1Dfetch(texDATA, data_cnt).y); - } // x - } // y - } // z - data_cnt = data_cnt + blockDim.x; - } // grid points per sector - - // write shared data to output grid - __syncthreads(); - // int sector_ind_offset = sec * GI.sector_dim; - __shared__ int sector_ind_offset; - sector_ind_offset = - computeXYZ2Lin(center.x - GI.sector_offset, center.y - GI.sector_offset, - center.z - GI.sector_offset, GI.gridDims); - - // each thread writes one position from shared mem to global mem - for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x) - { - getCoordsFromIndex(s_ind, &x, &y, &z, GI.sector_pad_width); - - if (isOutlier(x, y, z, center.x, center.y, center.z, GI.gridDims, - GI.sector_offset)) - // calculate opposite index - ind = computeXYZ2Lin( - calculateOppositeIndex(x, center.x, GI.gridDims.x, GI.sector_offset), - calculateOppositeIndex(y, center.y, GI.gridDims.y, GI.sector_offset), - calculateOppositeIndex(z, center.z, GI.gridDims.z, GI.sector_offset), - GI.gridDims); - else - ind = sector_ind_offset + - computeXYZ2Lin(x, y, z, GI.gridDims); // index in output grid - - atomicAdd(&(gdata[ind].x), sdata[s_ind].x); // Re - atomicAdd(&(gdata[ind].y), sdata[s_ind].y); // Im - // reset shared mem - sdata[s_ind].x = (DType)0.0; - sdata[s_ind].y = (DType)0.0; - } - __syncthreads(); -} - -__global__ void textureConvolutionKernel(DType2 *data, DType *crds, - CufftType *gdata, IndType *sectors, - IndType *sector_centers, int N) -{ - extern __shared__ DType2 sdata[]; // externally managed shared memory - - // init shared memory - for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x) - { - sdata[s_ind].x = (DType)0.0; // Re - sdata[s_ind].y = (DType)0.0; // Im - } - __syncthreads(); - - __shared__ int sec[THREAD_BLOCK_SIZE]; - sec[threadIdx.x] = blockIdx.x; - while (sec[threadIdx.x] < N) - { - __shared__ int data_max; - data_max = sectors[sec[threadIdx.x] + 1]; - textureConvolutionFunction(sec, data_max, 0, sdata, data, crds, gdata, - sectors, sector_centers); - __syncthreads(); - sec[threadIdx.x] = sec[threadIdx.x] + gridDim.x; - } // sec < sector_count -} - -__global__ void balancedTextureConvolutionKernel( - DType2 *data, DType *crds, CufftType *gdata, IndType *sectors, - IndType2 *sector_processing_order, IndType *sector_centers, int N) -{ - extern __shared__ DType2 sdata[]; // externally managed shared memory - - // init shared memory - for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x) - { - sdata[s_ind].x = (DType)0.0; // Re - sdata[s_ind].y = (DType)0.0; // Im - } - __syncthreads(); - - int sec_cnt = blockIdx.x; - __shared__ int sec[THREAD_BLOCK_SIZE]; - - while (sec_cnt < N) - { - sec[threadIdx.x] = sector_processing_order[sec_cnt].x; - __shared__ int data_max; - data_max = min(sectors[sec[threadIdx.x] + 1], - sectors[sec[threadIdx.x]] + - sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD); - textureConvolutionFunction(sec, data_max, - sector_processing_order[sec_cnt].y, sdata, data, - crds, gdata, sectors, sector_centers); - __syncthreads(); - sec_cnt = sec_cnt + gridDim.x; - } // sec < sector_count -} - -// ---------------------------------------------------------------------------- -// convolutionKernel: NUFFT^H kernel -// -// Performs the gpuNUFFT step by convolution of sample points with -// interpolation function and resampling onto grid. Basic concept based on Zwart -// et al. -// -// parameters: -// * data : complex input sample points -// * crds : coordinates of data points (x,y,z) -// * gdata : output grid data -// * sectors : mapping of sample indices according to each sector -// * sector_centers : coordinates (x,y,z) of sector centers -// * temp_gdata : temporary grid data -// * N : number of threads -__device__ void textureConvolutionFunction2D(DType2 *sdata, int *sec, - int sec_max, int sec_offset, - DType2 *data, DType *crds, - CufftType *gdata, IndType *sectors, - IndType *sector_centers) -{ - // start convolution - int ind, x, y; - int imin, imax, jmin, jmax; - - DType dx_sqr, dy_sqr, val, ix, jy; - - __shared__ IndType2 center; - center.x = sector_centers[sec[threadIdx.x] * 2]; - center.y = sector_centers[sec[threadIdx.x] * 2 + 1]; - - // Grid Points over Threads - int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset; - // loop over all data points of the current sector, and check if grid position - // lies inside - // affected region, if so, add data point weighted to grid position value - while (data_cnt < sec_max) - { - DType2 data_point; // datapoint per thread - data_point.x = crds[data_cnt]; - data_point.y = crds[data_cnt + GI.data_count]; - - // set the boundaries of final dataset for gpuNUFFT this point - ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x, - GI.sector_offset); - set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius); - jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y, - GI.sector_offset); - set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius); - - // grid this point onto its cartesian points neighbors - for (int j = jmin; j <= jmax; j++) - { - jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset); - dy_sqr = (jy - data_point.y) * GI.aniso_y_scale; - dy_sqr *= dy_sqr; - - for (int i = imin; i <= imax; i++) - { - ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset); - dx_sqr = (ix - data_point.x) * GI.aniso_x_scale; - dx_sqr *= dx_sqr; - // get kernel value - // Calculate Separable Filters - val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv, - dy_sqr * GI.radiusSquared_inv); - - ind = getIndex2D(i, j, GI.sector_pad_width); - - // multiply data by current kernel val - // grid complex or scalar - for (int c = threadIdx.z; c < GI.n_coils_cc; c += blockDim.z) - { - atomicAdd(&(sdata[ind + c * GI.sector_dim].x), - val * tex1Dfetch(texDATA, data_cnt + c * GI.data_count).x); - atomicAdd(&(sdata[ind + c * GI.sector_dim].y), - val * tex1Dfetch(texDATA, data_cnt + c * GI.data_count).y); - } - } // x - } // y - data_cnt = data_cnt + blockDim.x; - } // grid points per sector - - // write shared data to output grid - __syncthreads(); - // int sector_ind_offset = sec * GI.sector_dim; - __shared__ int sector_ind_offset; - sector_ind_offset = computeXY2Lin(center.x - GI.sector_offset, - center.y - GI.sector_offset, GI.gridDims); - - // each thread writes one position from shared mem to global mem - for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x) - { - getCoordsFromIndex2D(s_ind, &x, &y, GI.sector_pad_width); - - if (isOutlier2D(x, y, center.x, center.y, GI.gridDims, GI.sector_offset)) - // calculate opposite index - ind = computeXY2Lin( - calculateOppositeIndex(x, center.x, GI.gridDims.x, GI.sector_offset), - calculateOppositeIndex(y, center.y, GI.gridDims.y, GI.sector_offset), - GI.gridDims); - else - ind = sector_ind_offset + - computeXY2Lin(x, y, GI.gridDims); // index in output grid - - for (int c = threadIdx.z; c < GI.n_coils_cc; c += blockDim.z) - { - atomicAdd(&(gdata[ind + c * GI.gridDims_count].x), - sdata[s_ind + c * GI.sector_dim].x); // Re - atomicAdd(&(gdata[ind + c * GI.gridDims_count].y), - sdata[s_ind + c * GI.sector_dim].y); // Im - - // reset shared mem - sdata[s_ind + c * GI.sector_dim].x = (DType)0.0; - sdata[s_ind + c * GI.sector_dim].y = (DType)0.0; - } - } -} - -__global__ void textureConvolutionKernel2D(DType2 *data, DType *crds, - CufftType *gdata, IndType *sectors, - IndType *sector_centers, int N) -{ - extern __shared__ DType2 sdata[]; // externally managed shared memory - - // init shared memory - for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x) - { - for (int c = threadIdx.z; c < GI.n_coils_cc; c += blockDim.z) - { - sdata[s_ind + c * GI.sector_dim].x = 0.0f; // Re - sdata[s_ind + c * GI.sector_dim].y = 0.0f; // Im - } - } - __syncthreads(); - - __shared__ int sec[THREAD_BLOCK_SIZE]; - sec[threadIdx.x] = blockIdx.x; - while (sec[threadIdx.x] < N) - { - __shared__ int data_max; - data_max = sectors[sec[threadIdx.x] + 1]; - textureConvolutionFunction2D(sdata, sec, data_max, 0, data, crds, gdata, - sectors, sector_centers); - __syncthreads(); - sec[threadIdx.x] = sec[threadIdx.x] + gridDim.x; - } // sec < sector_count -} - -__global__ void balancedTextureConvolutionKernel2D( - DType2 *data, DType *crds, CufftType *gdata, IndType *sectors, - IndType2 *sector_processing_order, IndType *sector_centers, int N) -{ - extern __shared__ DType2 sdata[]; // externally managed shared memory - - // init shared memory - for (int s_ind = threadIdx.x; s_ind < GI.sector_dim; s_ind += blockDim.x) - { - for (int c = threadIdx.z; c < GI.n_coils_cc; c += blockDim.z) - { - sdata[s_ind + c * GI.sector_dim].x = 0.0f; // Re - sdata[s_ind + c * GI.sector_dim].y = 0.0f; // Im - } - } - __syncthreads(); - - int sec_cnt = blockIdx.x; - __shared__ int sec[THREAD_BLOCK_SIZE]; - - while (sec_cnt < N) - { - sec[threadIdx.x] = sector_processing_order[sec_cnt].x; - __shared__ int data_max; - data_max = min(sectors[sec[threadIdx.x] + 1], - sectors[sec[threadIdx.x]] - + sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD); - textureConvolutionFunction2D(sdata, sec, data_max, - sector_processing_order[sec_cnt].y, data, crds, - gdata, sectors, sector_centers); - __syncthreads(); - sec_cnt = sec_cnt + gridDim.x; - } // sec < sector_count -} - -void performTextureConvolution(DType2 *data_d, DType *crds_d, - CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host) -{ - long shared_mem_size = - (gi_host->sector_dim) * sizeof(DType2) * gi_host->n_coils_cc; - int thread_size = THREAD_BLOCK_SIZE; - - dim3 block_dim(thread_size); - dim3 grid_dim(getOptimalGridDim(gi_host->sector_count, 1)); - if (DEBUG) - { - printf("adjoint texture convolution requires %ld bytes of shared memory!\n", - shared_mem_size); - printf("grid dim %u, block dim %u \n", grid_dim.x, block_dim.x); - } - if (gi_host->is2Dprocessing) - { - dim3 block_dim( - 64, 1, - DEFAULT_VALUE(gi_host->n_coils_cc > 4 ? 4 : gi_host->n_coils_cc)); - textureConvolutionKernel2D <<>> - (data_d, crds_d, gdata_d, sectors_d, sector_centers_d, - gi_host->sector_count); - } - else - textureConvolutionKernel <<>> - (data_d, crds_d, gdata_d, sectors_d, sector_centers_d, - gi_host->sector_count); - - if (DEBUG) - printf("...finished with: %s\n", cudaGetErrorString(cudaGetLastError())); -} - -void performTextureConvolution(DType2 *data_d, DType *crds_d, - CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, - IndType2 *sector_processing_order_d, - IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host) -{ - long shared_mem_size = - (gi_host->sector_dim) * sizeof(DType2) * gi_host->n_coils_cc; - int thread_size = THREAD_BLOCK_SIZE; - - dim3 block_dim(thread_size); - dim3 grid_dim(getOptimalGridDim(gi_host->sector_count, 1)); - if (DEBUG) - { - printf("adjoint balanced texture convolution requires %ld bytes of shared " - "memory!\n", - shared_mem_size); - printf("grid dim %u, block dim %u \n", grid_dim.x, block_dim.x); - } - if (gi_host->is2Dprocessing) - { - dim3 block_dim( - 64, 1, - DEFAULT_VALUE(gi_host->n_coils_cc > 4 ? 4 : gi_host->n_coils_cc)); - //printf("block dims: %u %u %u!\n", block_dim.x, block_dim.y, block_dim.z); - balancedTextureConvolutionKernel2D - <<>> - (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d, - sector_centers_d, gi_host->sectorsToProcess); - } - else - balancedTextureConvolutionKernel <<>> - (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d, - sector_centers_d, gi_host->sectorsToProcess); - - if (DEBUG) - printf("...finished with: %s\n", cudaGetErrorString(cudaGetLastError())); -} - -// ---------------------------------------------------------------------------- -// forwardConvolutionKernel: NUFFT kernel -// -// Performs the inverse gpuNUFFT step by convolution of grid points with -// interpolation function and resampling onto trajectory. -// -// parameters: -// * data : complex output sample points -// * crds : coordinates of data points (x,y,z) -// * gdata : input grid data -// * sectors : mapping of sample indices according to each sector -// * sector_centers : coordinates (x,y,z) of sector centers -// * N : number of threads - -__device__ void -textureForwardConvolutionFunction(long int *sec, long int sec_max, long int sec_offset, - DType2 *sdata, CufftType *gdata_cache, - DType2 *data, DType *crds, CufftType *gdata, - IndType *sectors, IndType *sector_centers) -{ - int ind, imin, imax, jmin, jmax, kmin, kmax, ii, jj, kk; - DType dx_sqr, dy_sqr, dz_sqr, val, ix, jy, kz; - - __shared__ IndType3 center; - center.x = sector_centers[sec[threadIdx.x] * 3]; - center.y = sector_centers[sec[threadIdx.x] * 3 + 1]; - center.z = sector_centers[sec[threadIdx.x] * 3 + 2]; - - __shared__ long int sector_ind_offset; - sector_ind_offset = - computeXYZ2Lin(center.x - GI.sector_offset, center.y - GI.sector_offset, - center.z - GI.sector_offset, GI.gridDims); - - // init sector cache - // preload sector grid data into cache - for (long int ind = threadIdx.x; ind < GI.sector_dim; ind += blockDim.x) - { - long int grid_index; - getCoordsFromIndex(ind, &ii, &jj, &kk, GI.sector_pad_width); - - if (isOutlier(ii, jj, kk, center.x, center.y, center.z, GI.gridDims, - GI.sector_offset)) - // calculate opposite index - grid_index = computeXYZ2Lin( - calculateOppositeIndex(ii, center.x, GI.gridDims.x, GI.sector_offset), - calculateOppositeIndex(jj, center.y, GI.gridDims.y, GI.sector_offset), - calculateOppositeIndex(kk, center.z, GI.gridDims.z, GI.sector_offset), - GI.gridDims); - else - grid_index = (sector_ind_offset + computeXYZ2Lin(ii, jj, kk, GI.gridDims)); - - gdata_cache[ind].x = tex1Dfetch(texGDATA, grid_index).x; - gdata_cache[ind].y = tex1Dfetch(texGDATA, grid_index).y; - } - - __syncthreads(); - - // Grid Points over Threads - long int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset; - - while (data_cnt < sec_max) - { - DType3 data_point; // datapoint per thread - data_point.x = crds[data_cnt]; - data_point.y = crds[data_cnt + GI.data_count]; - data_point.z = crds[data_cnt + 2 * GI.data_count]; - - // set the boundaries of final dataset for gpuNUFFT this point - ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x, - GI.sector_offset); - set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius); - jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y, - GI.sector_offset); - set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius); - kz = mapKSpaceToGrid(data_point.z, GI.gridDims.z, center.z, - GI.sector_offset); - set_minmax(&kz, &kmin, &kmax, GI.sector_pad_max, GI.kernel_radius); - - // convolve neighboring cartesian points to this data point - for (int k = kmin; k <= kmax; k++) - { - kz = mapGridToKSpace(k, GI.gridDims.z, center.z, GI.sector_offset); - dz_sqr = (kz - data_point.z) * GI.aniso_z_scale; - dz_sqr *= dz_sqr; - - for (int j = jmin; j <= jmax; j++) - { - jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset); - dy_sqr = (jy - data_point.y) * GI.aniso_y_scale; - dy_sqr *= dy_sqr; - - for (int i = imin; i <= imax; i++) - { - ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset); - dx_sqr = (ix - data_point.x) * GI.aniso_x_scale; - dx_sqr *= dx_sqr; - - // get kernel value - val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv, - dy_sqr * GI.radiusSquared_inv, - dz_sqr * GI.radiusSquared_inv); - - ind = getIndex(i, j, k, GI.sector_pad_width); - - sdata[threadIdx.x].x += gdata_cache[ind].x * val; - sdata[threadIdx.x].y += gdata_cache[ind].y * val; - } // x loop - } // y loop - } // z loop - atomicAdd(&(data[data_cnt].x), sdata[threadIdx.x].x); - atomicAdd(&(data[data_cnt].y), sdata[threadIdx.x].y); - - data_cnt = data_cnt + blockDim.x; - - sdata[threadIdx.x].x = (DType)0.0; // Re - sdata[threadIdx.x].y = (DType)0.0; // Im - } // data points per sector -} - -__global__ void textureForwardConvolutionKernel(CufftType *data, DType *crds, - CufftType *gdata, - IndType *sectors, - IndType *sector_centers, int N) -{ - extern __shared__ CufftType shared[]; // externally managed shared memory - CufftType *shared_out_data = (CufftType *)&shared[0]; - CufftType *gdata_cache = (CufftType *)&shared[blockDim.x]; - - __shared__ long int sec[THREAD_BLOCK_SIZE]; - sec[threadIdx.x] = blockIdx.x; - - // init shared memory - shared_out_data[threadIdx.x].x = (DType)0.0; // Re - shared_out_data[threadIdx.x].y = (DType)0.0; // Im - - __syncthreads(); - // start convolution - while (sec[threadIdx.x] < N) - { - __shared__ long int data_max; - data_max = sectors[sec[threadIdx.x] + 1]; - - textureForwardConvolutionFunction(sec, data_max, 0, shared_out_data, - gdata_cache, data, crds, gdata, sectors, - sector_centers); - __syncthreads(); - sec[threadIdx.x] = sec[threadIdx.x] + gridDim.x; - } // sector check -} - -__global__ void balancedTextureForwardConvolutionKernel( - CufftType *data, DType *crds, CufftType *gdata, IndType *sectors, - IndType2 *sector_processing_order, IndType *sector_centers, int N) -{ - extern __shared__ CufftType shared[]; // externally managed shared memory - CufftType *shared_out_data = (CufftType *)&shared[0]; - CufftType *gdata_cache = (CufftType *)&shared[blockDim.x]; - - long int sec_cnt = blockIdx.x; - __shared__ long int sec[THREAD_BLOCK_SIZE]; - - // init shared memory - shared_out_data[threadIdx.x].x = (DType)0.0; // Re - shared_out_data[threadIdx.x].y = (DType)0.0; // Im - - __syncthreads(); - // start convolution - while (sec_cnt < N) - { - sec[threadIdx.x] = sector_processing_order[sec_cnt].x; - __shared__ long int data_max; - data_max = min(sectors[sec[threadIdx.x] + 1], - sectors[sec[threadIdx.x]] + - sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD); - - textureForwardConvolutionFunction( - sec, data_max, sector_processing_order[sec_cnt].y, shared_out_data, - gdata_cache, data, crds, gdata, sectors, sector_centers); - __syncthreads(); - sec_cnt = sec_cnt + gridDim.x; - } // sector check -} - -__device__ void -textureForwardConvolutionFunction2D(int *sec, int sec_max, int sec_offset, - DType2 *sdata, CufftType *gdata_cache, - DType2 *data, DType *crds, CufftType *gdata, - IndType *sectors, IndType *sector_centers) -{ - int ind, imin, imax, jmin, jmax, ii, jj; - DType val, ix, jy; - - __shared__ IndType2 center; - center.x = sector_centers[sec[threadIdx.x] * 2]; - center.y = sector_centers[sec[threadIdx.x] * 2 + 1]; - - __shared__ int sector_ind_offset; - sector_ind_offset = computeXY2Lin(center.x - GI.sector_offset, - center.y - GI.sector_offset, GI.gridDims); - - // init sector cache - // preload sector grid data into cache - for (int ind = threadIdx.x; ind < GI.sector_dim; ind += blockDim.x) - { - int grid_index; - getCoordsFromIndex2D(ind, &ii, &jj, GI.sector_pad_width); - - // multiply data by current kernel val - // grid complex or scalar - if (isOutlier2D(ii, jj, center.x, center.y, GI.gridDims, GI.sector_offset)) - // calculate opposite index - grid_index = getIndex2D( - calculateOppositeIndex(ii, center.x, GI.gridDims.x, GI.sector_offset), - calculateOppositeIndex(jj, center.y, GI.gridDims.y, GI.sector_offset), - GI.gridDims.x); - else - grid_index = (sector_ind_offset + getIndex2D(ii, jj, GI.gridDims.x)); - - for (int c = 0; c < GI.n_coils_cc; c++) - { - gdata_cache[ind + c * GI.sector_dim].x = - tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).x; - gdata_cache[ind + c * GI.sector_dim].y = - tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).y; - } - } - __syncthreads(); - - // Grid Points over Threads - int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset; - - while (data_cnt < sec_max) - { - DType2 data_point; // datapoint per thread - data_point.x = crds[data_cnt]; - data_point.y = crds[data_cnt + GI.data_count]; - - // set the boundaries of final dataset for gpuNUFFT this point - ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x, - GI.sector_offset); - set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius); - jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y, - GI.sector_offset); - set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius); - - // convolve neighboring cartesian points to this data point - for (int j = jmin; j <= jmax; j++) - { - jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset); - DType dy_sqr = (jy - data_point.y) * GI.aniso_y_scale; - dy_sqr *= dy_sqr; - - for (int i = imin; i <= imax; i++) - { - ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset); - DType dx_sqr = (ix - data_point.x) * GI.aniso_x_scale; - dx_sqr *= dx_sqr; - // get kernel value - // calc as separable filter - val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv, - dy_sqr * GI.radiusSquared_inv); - - ind = getIndex2D(i, j, GI.sector_pad_width); - - for (int c = 0; c < GI.n_coils_cc; c++) - { - sdata[threadIdx.x + c * blockDim.x].x += - gdata_cache[ind + c * GI.sector_dim].x * val; - sdata[threadIdx.x + c * blockDim.x].y += - gdata_cache[ind + c * GI.sector_dim].y * val; - } - } // x loop - } // y loop - - for (int c = 0; c < GI.n_coils_cc; c++) - { - atomicAdd(&(data[data_cnt + c * GI.data_count].x), - sdata[threadIdx.x + c * blockDim.x].x); - atomicAdd(&(data[data_cnt + c * GI.data_count].y), - sdata[threadIdx.x + c * blockDim.x].y); - sdata[threadIdx.x + c * blockDim.x].x = (DType)0.0; // Re - sdata[threadIdx.x + c * blockDim.x].y = (DType)0.0; // Im - } - - data_cnt = data_cnt + blockDim.x; - } // data points per sector -} - -__device__ void textureForwardConvolutionFunction22D( - int *sec, int sec_max, int sec_offset, DType2 *data, - DType *crds, CufftType *gdata, IndType *sectors, IndType *sector_centers) -{ - int imin, imax, jmin, jmax, i, j; - DType val, ix, jy; - - IndType2 center; - int sector_ind_offset; - center.x = sector_centers[sec[threadIdx.x] * 2]; - center.y = sector_centers[sec[threadIdx.x] * 2 + 1]; - - sector_ind_offset = computeXY2Lin(center.x - GI.sector_offset, - center.y - GI.sector_offset, GI.gridDims); - - // Grid Points over Threads - int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset; - __syncthreads(); - - while (data_cnt < sec_max) - { - DType2 data_point; // datapoint per thread - data_point.x = crds[data_cnt]; - data_point.y = crds[data_cnt + GI.data_count]; - - // set the boundaries of final dataset for gpuNUFFT this point - ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x, - GI.sector_offset); - set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius); - jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y, - GI.sector_offset); - set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius); - - // convolve neighboring cartesian points to this data point - int rangeX = imax - imin + 1; - int rangeY = jmax - jmin + 1; - int idx = threadIdx.y; - int grid_index; - - while (idx < (rangeX * rangeY)) - { - getCoordsFromIndex2D(idx, &i, &j, rangeX, rangeY); - i += imin; - j += jmin; - if (j <= jmax && j >= jmin) - { - jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset); - DType dy_sqr = (jy - data_point.y) * GI.aniso_y_scale; - dy_sqr *= dy_sqr; - if (i <= imax && i >= imin) - { - ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset); - DType dx_sqr = (ix - data_point.x) * GI.aniso_x_scale; - dx_sqr *= dx_sqr; - // get kernel value - // calc as separable filter - val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv, - dy_sqr * GI.radiusSquared_inv); - - if (isOutlier2D(i, j, center.x, center.y, GI.gridDims, - GI.sector_offset)) - // calculate opposite index - grid_index = - getIndex2D(calculateOppositeIndex(i, center.x, GI.gridDims.x, - GI.sector_offset), - calculateOppositeIndex(j, center.y, GI.gridDims.y, - GI.sector_offset), - GI.gridDims.x); - else - grid_index = (sector_ind_offset + getIndex2D(i, j, GI.gridDims.x)); - - for (int c = 0; c < GI.n_coils_cc; c++) - { - atomicAdd(&(data[data_cnt + c * GI.data_count].x), tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).x * val); - atomicAdd(&(data[data_cnt + c * GI.data_count].y), tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).y * val); - } - } // x if - } // y if - idx = idx + blockDim.y; - } - data_cnt = data_cnt + blockDim.x; - } // data points per sector -} - -__device__ void textureForwardConvolutionFunction32D( - int *sec, int sec_max, int sec_offset, DType *cache, DType2 *data, - DType *crds, CufftType *gdata, IndType *sectors, IndType *sector_centers) -{ - int imin, imax, jmin, jmax, i, j; - DType val, ix, jy; - - __shared__ IndType2 center; - center.x = sector_centers[sec[threadIdx.x] * 2]; - center.y = sector_centers[sec[threadIdx.x] * 2 + 1]; - - __shared__ int sector_ind_offset; - sector_ind_offset = computeXY2Lin(center.x - GI.sector_offset, - center.y - GI.sector_offset, GI.gridDims); - int grid_index; - - // Grid Points over Threads - int data_cnt = sectors[sec[threadIdx.x]] + threadIdx.x + sec_offset; - - while (data_cnt < sec_max) - { - DType2 data_point; // datapoint per thread - data_point.x = crds[data_cnt]; - data_point.y = crds[data_cnt + GI.data_count]; - - // set the boundaries of final dataset for gpuNUFFT this point - ix = mapKSpaceToGrid(data_point.x, GI.gridDims.x, center.x, - GI.sector_offset); - set_minmax(&ix, &imin, &imax, GI.sector_pad_max, GI.kernel_radius); - jy = mapKSpaceToGrid(data_point.y, GI.gridDims.y, center.y, - GI.sector_offset); - set_minmax(&jy, &jmin, &jmax, GI.sector_pad_max, GI.kernel_radius); - - // convolve neighboring cartesian points to this data point - int idx = threadIdx.y; - getCoordsFromIndex2D(idx, &i, &j, GI.kernel_width + 1, GI.kernel_width + 1); - i += imin; - j += jmin; - if (j <= jmax && j >= jmin) - { - jy = mapGridToKSpace(j, GI.gridDims.y, center.y, GI.sector_offset); - DType dy_sqr = (jy - data_point.y) * GI.aniso_y_scale; - dy_sqr *= dy_sqr; - if (i <= imax && i >= imin) - { - ix = mapGridToKSpace(i, GI.gridDims.x, center.x, GI.sector_offset); - DType dx_sqr = (ix - data_point.x) * GI.aniso_x_scale; - dx_sqr *= dx_sqr; - // get kernel value - // calc as separable filter - val = computeTextureLookup(dx_sqr * GI.radiusSquared_inv, - dy_sqr * GI.radiusSquared_inv); - cache[GI.kernel_widthSquared * threadIdx.x + threadIdx.y] = val; - - if (isOutlier2D(i, j, center.x, center.y, GI.gridDims, - GI.sector_offset)) - // calculate opposite index - grid_index = - getIndex2D(calculateOppositeIndex(i, center.x, GI.gridDims.x, - GI.sector_offset), - calculateOppositeIndex(j, center.y, GI.gridDims.y, - GI.sector_offset), - GI.gridDims.x); - else - grid_index = (sector_ind_offset + getIndex2D(i, j, GI.gridDims.x)); - - for (int c = 0; c < GI.n_coils_cc; c++) - { - atomicAdd( - &(data[data_cnt + c * GI.data_count].x), - cache[GI.kernel_widthSquared * threadIdx.x + threadIdx.y] * - tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).x); - atomicAdd( - &(data[data_cnt + c * GI.data_count].y), - cache[GI.kernel_widthSquared * threadIdx.x + threadIdx.y] * - tex1Dfetch(texGDATA, grid_index + c * GI.gridDims_count).y); - } - } // x if - } // y if - - cache[GI.kernel_widthSquared * threadIdx.x + threadIdx.y] = 0; - data_cnt = data_cnt + blockDim.x; - } // data points per sector -} - -__global__ void textureForwardConvolutionKernel2D(CufftType *data, DType *crds, - CufftType *gdata, - IndType *sectors, - IndType *sector_centers, - int N) -{ - extern __shared__ CufftType shared[]; // externally managed shared memory - CufftType *shared_out_data = (CufftType *)&shared[0]; - CufftType *gdata_cache = (CufftType *)&shared[blockDim.x * GI.n_coils_cc]; - - __shared__ int sec[THREAD_BLOCK_SIZE]; - sec[threadIdx.x] = blockIdx.x; - - // init shared memory - for (int c = 0; c < GI.n_coils_cc; c++) - { - shared_out_data[threadIdx.x + c * blockDim.x].x = 0.0f; // Re - shared_out_data[threadIdx.x + c * blockDim.x].y = 0.0f; // Im - } - __syncthreads(); - // start convolution - while (sec[threadIdx.x] < N) - { - __shared__ int data_max; - data_max = sectors[sec[threadIdx.x] + 1]; - - textureForwardConvolutionFunction2D(sec, data_max, 0, shared_out_data, - gdata_cache, data, crds, gdata, sectors, - sector_centers); - - __syncthreads(); - sec[threadIdx.x] = sec[threadIdx.x] + gridDim.x; - } // sector check -} - -__global__ void balancedTextureForwardConvolutionKernel2D( - CufftType *data, DType *crds, CufftType *gdata, IndType *sectors, - IndType2 *sector_processing_order, IndType *sector_centers, int N) -{ - extern __shared__ CufftType shared[]; // externally managed shared memory - CufftType *shared_out_data = (CufftType *)&shared[0]; - CufftType *gdata_cache = (CufftType *)&shared[blockDim.x * GI.n_coils_cc]; - - __shared__ int sec[THREAD_BLOCK_SIZE]; - - // init shared memory - for (int c = 0; c < GI.n_coils_cc; c++) - { - shared_out_data[threadIdx.x + c * blockDim.x].x = 0.0f; // Re - shared_out_data[threadIdx.x + c * blockDim.x].y = 0.0f; // Im - } - __syncthreads(); - // start convolution - for (int sec_cnt = blockIdx.x; sec_cnt < N; sec_cnt += gridDim.x) - { - sec[threadIdx.x] = sector_processing_order[sec_cnt].x; - __shared__ int data_max; - data_max = min(sectors[sec[threadIdx.x] + 1], - sectors[sec[threadIdx.x]] + - sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD); - - textureForwardConvolutionFunction2D( - sec, data_max, sector_processing_order[sec_cnt].y, shared_out_data, - gdata_cache, data, crds, gdata, sectors, sector_centers); - - __syncthreads(); - } // sector check -} - -__global__ void balancedTextureForwardConvolutionKernel22D( - CufftType *data, DType *crds, CufftType *gdata, IndType *sectors, - IndType2 *sector_processing_order, IndType *sector_centers, int N) -{ - int sec_cnt = blockIdx.x; - __shared__ int sec[THREAD_BLOCK_SIZE]; - - // init shared memory - // start convolution - while (sec_cnt < N) - { - int data_max; - if (threadIdx.y == 0) - { - sec[threadIdx.x] = sector_processing_order[sec_cnt].x; - } - __syncthreads(); - - data_max = min(sectors[sec[threadIdx.x] + 1], - sectors[sec[threadIdx.x]] - + sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD); - - textureForwardConvolutionFunction22D( - sec, data_max, sector_processing_order[sec_cnt].y, data, crds, - gdata, sectors, sector_centers); - - sec_cnt = sec_cnt + gridDim.x; - __syncthreads(); - } // sector check -} - -__global__ void balancedTextureForwardConvolutionKernel32D( - CufftType *data, DType *crds, CufftType *gdata, IndType *sectors, - IndType2 *sector_processing_order, IndType *sector_centers, int N) -{ - extern __shared__ DType shared_cache[]; // externally managed shared memory - DType *cache = (DType *)&shared_cache[0]; - - int sec_cnt = blockIdx.x; - __shared__ int sec[THREAD_BLOCK_SIZE]; - - // init shared memory - cache[threadIdx.x * blockDim.y + threadIdx.y] = (DType)0.0; - __syncthreads(); - // start convolution - while (sec_cnt < N) - { - sec[threadIdx.x] = sector_processing_order[sec_cnt].x; - __shared__ int data_max; - data_max = min(sectors[sec[threadIdx.x] + 1], - sectors[sec[threadIdx.x]] + - sector_processing_order[sec_cnt].y + MAXIMUM_PAYLOAD); - - textureForwardConvolutionFunction32D( - sec, data_max, sector_processing_order[sec_cnt].y, cache, data, crds, - gdata, sectors, sector_centers); - - __syncthreads(); - sec_cnt = sec_cnt + gridDim.x; - } // sector check -} - -void performTextureForwardConvolution(CufftType *data_d, DType *crds_d, - CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, - IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host) -{ - int thread_size = 192; - long shared_mem_size = (thread_size + gi_host->sector_dim) * - gi_host->n_coils_cc * sizeof(CufftType); - - dim3 block_dim(thread_size); - dim3 grid_dim(getOptimalGridDim(gi_host->sector_count, thread_size)); - - if (DEBUG) - printf("texture forward convolution requires %ld bytes of shared memory!\n", - shared_mem_size); - if (gi_host->is2Dprocessing) - { - // dim3 block_dim(thread_size, 1, DEFAULT_VALUE(gi_host->n_coils_cc > 4 ? 1 - // : gi_host->n_coils_cc)); - dim3 block_dim(thread_size, 1, 1); // DEFAULT_VALUE(gi_host->n_coils_cc > 4 - // ? 1 : gi_host->n_coils_cc)); - textureForwardConvolutionKernel2D - <<>> - (data_d, crds_d, gdata_d, sectors_d, sector_centers_d, - gi_host->sector_count); - } - else - textureForwardConvolutionKernel <<>> - (data_d, crds_d, gdata_d, sectors_d, sector_centers_d, - gi_host->sector_count); -} - -void performTextureForwardConvolution(CufftType *data_d, DType *crds_d, - CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, - IndType2 *sector_processing_order_d, - IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host) -{ - int thread_size = THREAD_BLOCK_SIZE; - long shared_mem_size = (thread_size + gi_host->sector_dim) * - gi_host->n_coils_cc * sizeof(CufftType); - - dim3 block_dim(thread_size); - dim3 grid_dim(getOptimalGridDim(gi_host->sector_count, thread_size)); - - if (DEBUG) - printf("balanced texture forward convolution requires %ld bytes of shared " - "memory!\n", - shared_mem_size); - if (gi_host->is2Dprocessing) - { - bool useV2cached = false; - - if (useV2cached) - { - int thread_size = 32; - int threadY = (gi_host->kernel_width + 1) * (gi_host->kernel_width + 1); - - long shared_mem_size = - (threadY * thread_size) * sizeof(DType); - - grid_dim = dim3(getOptimalGridDim(gi_host->sector_count, 1)); - - block_dim = getOptimal2DBlockDim(thread_size, threadY); - - if (DEBUG) - { - printf("balanced texture forward convolution 2 (2d) requires %ld bytes " - "of shared memory!\n", - shared_mem_size); - printf("block dims: %u %u %u!\n", block_dim.x, block_dim.y, block_dim.z); - printf("grid dims: %u %u %u!\n", grid_dim.x, grid_dim.y, grid_dim.z); - } - - balancedTextureForwardConvolutionKernel32D<<>> - (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d, sector_centers_d, gi_host->sectorsToProcess); - } - else - { - int thread_size = 32; - long shared_mem_size = - (gi_host->kernel_widthSquared * thread_size) * sizeof(DType); - - grid_dim = dim3(getOptimalGridDim(gi_host->sector_count, 1)); - - //TODO maybe it's better to round kwSqrd to the next multiple of 2 - block_dim = getOptimal2DBlockDim(thread_size, gi_host->kernel_widthSquared); - - if (DEBUG) - { - printf("balanced texture forward convolution 2 (2d) requires %ld bytes " - "of shared memory!\n", - shared_mem_size); - printf("grid dims: %u %u %u!\n", grid_dim.x, grid_dim.y, grid_dim.z); - printf("block dims: %u %u %u!\n", block_dim.x, block_dim.y, block_dim.z); - } - - balancedTextureForwardConvolutionKernel22D<<>> - (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d, sector_centers_d, gi_host->sectorsToProcess); - } - } - else - { - balancedTextureForwardConvolutionKernel - <<>> - (data_d, crds_d, gdata_d, sectors_d, sector_processing_order_d, - sector_centers_d, gi_host->sectorsToProcess); - } -} - -#endif diff --git a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp index 628538b7..2b3664f2 100644 --- a/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp +++ b/CUDA/src/gpu/python/gpuNUFFT_operator_python_factory.cpp @@ -222,13 +222,7 @@ class GpuNUFFTPythonOperator if(interpolate_data) gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu, gpuNUFFT::DENSITY_ESTIMATION); else - { - for(long int i=0; i<100000; i++) - { - printf("i = %ld\n", i); - gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu); - } - } + gpuNUFFTOp->performForwardGpuNUFFT(image_gpu, kspace_data_gpu); cudaDeviceSynchronize(); } @@ -448,7 +442,6 @@ class GpuNUFFTPythonOperator } ~GpuNUFFTPythonOperator() { - printf("Destructor called\n"); delete gpuNUFFTOp; } }; diff --git a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu index bff9118c..3eb8c192 100644 --- a/CUDA/src/gpu/std_gpuNUFFT_kernels.cu +++ b/CUDA/src/gpu/std_gpuNUFFT_kernels.cu @@ -21,18 +21,6 @@ void initConstSymbol(const char* symbol, const void* src, IndType size, cudaStre HANDLE_ERROR(cudaMemcpyToSymbolAsync(KERNEL, src, size, 0, cudaMemcpyHostToDevice, stream)); } -void bindTo1DTexture(const char* symbol, void* devicePtr, IndType count) -{ - if (std::string("texDATA").compare(symbol)==0) - { - HANDLE_ERROR (cudaBindTexture(NULL,texDATA, devicePtr,(unsigned long)count*sizeof(float2))); - } - else if (std::string("texGDATA").compare(symbol)==0) - { - HANDLE_ERROR (cudaBindTexture(NULL,texGDATA, devicePtr,(unsigned long)count*sizeof(cufftComplex))); - } -} - __global__ void updateDensityCompKernel(DType2* density_data, DType2* estimation_data, long int N) { long int t = threadIdx.x + blockIdx.x * blockDim.x; @@ -54,83 +42,6 @@ void performUpdateDensityComp(DType2* density_data, DType2* estimation_data, lon updateDensityCompKernel<<>>(density_data, estimation_data, n_samples); } -void initTexture(const char* symbol, cudaArray** devicePtr, gpuNUFFT::Array hostTexture) -{ - if (std::string("texKERNEL").compare(symbol)==0) - { - HANDLE_ERROR(cudaMallocArray (devicePtr, &texKERNEL.channelDesc, hostTexture.dim.width, 1)); - HANDLE_ERROR(cudaBindTextureToArray(texKERNEL, *devicePtr)); - HANDLE_ERROR(cudaMemcpyToArray(*devicePtr, 0, 0, hostTexture.data, sizeof(float)*hostTexture.count(), cudaMemcpyHostToDevice)); - - texKERNEL.filterMode = cudaFilterModePoint; - texKERNEL.normalized = true; - texKERNEL.addressMode[0] = cudaAddressModeClamp; - } - else if (std::string("texKERNEL2D").compare(symbol)==0) - { - HANDLE_ERROR(cudaMallocArray (devicePtr, &texKERNEL2D.channelDesc, hostTexture.dim.width, hostTexture.dim.height)); - - HANDLE_ERROR(cudaBindTextureToArray(texKERNEL2D, *devicePtr)); - HANDLE_ERROR(cudaMemcpyToArray(*devicePtr, 0, 0, hostTexture.data, sizeof(float)*hostTexture.count(), cudaMemcpyHostToDevice)); - - texKERNEL2D.filterMode = cudaFilterModeLinear; - texKERNEL2D.normalized = true; - texKERNEL2D.addressMode[0] = cudaAddressModeClamp; - texKERNEL2D.addressMode[1] = cudaAddressModeClamp; - } - else if (std::string("texKERNEL3D").compare(symbol)==0) - { - cudaExtent volumesize=make_cudaExtent(hostTexture.dim.width, hostTexture.dim.height, hostTexture.dim.depth); - cudaMalloc3DArray(devicePtr,&texKERNEL3D.channelDesc,volumesize); - - cudaMemcpy3DParms copyparams = {0}; - copyparams.extent=volumesize; - copyparams.dstArray=*devicePtr; - copyparams.kind=cudaMemcpyHostToDevice; - copyparams.srcPtr= make_cudaPitchedPtr((void*)hostTexture.data,sizeof(float)*hostTexture.dim.width,hostTexture.dim.height,hostTexture.dim.depth); - - HANDLE_ERROR(cudaMemcpy3D(©params)); - HANDLE_ERROR(cudaBindTextureToArray(texKERNEL3D, *devicePtr)); - - texKERNEL3D.filterMode = cudaFilterModeLinear; - texKERNEL3D.normalized = true; - texKERNEL3D.addressMode[0] = cudaAddressModeClamp; - texKERNEL3D.addressMode[1] = cudaAddressModeClamp; - texKERNEL3D.addressMode[2] = cudaAddressModeClamp; - } -} - -void unbindTexture(const char* symbol) -{ - if (std::string("texKERNEL").compare(symbol)==0) - { - HANDLE_ERROR(cudaUnbindTexture(texKERNEL)); - } - else if (std::string("texKERNEL2D").compare(symbol)==0) - { - HANDLE_ERROR(cudaUnbindTexture(texKERNEL2D)); - } - else if (std::string("texKERNEL3D").compare(symbol)==0) - { - HANDLE_ERROR(cudaUnbindTexture(texKERNEL3D)); - } - else if (std::string("texDATA").compare(symbol)==0) - { - HANDLE_ERROR(cudaUnbindTexture(texDATA)); - } - else if (std::string("texGDATA").compare(symbol)==0) - { - HANDLE_ERROR(cudaUnbindTexture(texGDATA)); - } -} - - -void freeTexture(const char* symbol, cudaArray* devicePtr) -{ - unbindTexture(symbol); - HANDLE_ERROR(cudaFreeArray(devicePtr)); -} - __global__ void fftScaleKernel(CufftType* data, DType scaling, long int N) { long int t = threadIdx.x + blockIdx.x *blockDim.x; diff --git a/CUDA/src/gpuNUFFT_operator.cpp b/CUDA/src/gpuNUFFT_operator.cpp index 78b22786..bda4e983 100644 --- a/CUDA/src/gpuNUFFT_operator.cpp +++ b/CUDA/src/gpuNUFFT_operator.cpp @@ -243,7 +243,6 @@ void gpuNUFFT::GpuNUFFTOperator::initDeviceMemory(int n_coils, int n_coils_cc) initLookupTable(); - // allocateAndCopyToDeviceMem(&kernel_d,kernel,kernel_count); if (DEBUG) printf("allocate and copy sectors of size %d...\n", sector_count + 1); allocateAndCopyToDeviceMem(§ors_d, this->sectorDataCount.data, @@ -287,11 +286,11 @@ void gpuNUFFT::GpuNUFFTOperator::initDeviceMemory(int n_coils, int n_coils_cc) printf("creating cufft plan with %d,%d,%d dimensions\n", DEFAULT_VALUE(gi_host->gridDims.z), gi_host->gridDims.y, gi_host->gridDims.x); - // cufftResult res = cufftPlan3d( - // &fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z), - // (int)gi_host->gridDims.y, (int)gi_host->gridDims.x, CufftTransformType); - // if (res != CUFFT_SUCCESS) - // fprintf(stderr, "error on CUFFT Plan creation!!! %d\n", res); + cufftResult res = cufftPlan3d( + &fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z), + (int)gi_host->gridDims.y, (int)gi_host->gridDims.x, CufftTransformType); + if (res != CUFFT_SUCCESS) + fprintf(stderr, "error on CUFFT Plan creation!!! %d\n", res); gpuMemAllocated = true; } @@ -471,6 +470,7 @@ void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj( continue; freeTotalDeviceMemory(imdata_sum_d, NULL); + this->freeDeviceMemory(); return; } if ((cudaDeviceSynchronize() != cudaSuccess)) @@ -936,7 +936,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (DEBUG) printf("allocate and copy imdata of size %d...\n", imdata_count * n_coils_cc); - //allocateDeviceMem(&imdata_d, imdata_count * n_coils_cc); + allocateDeviceMem(&imdata_d, imdata_count * n_coils_cc); if (debugTiming) printf("Memory allocation: %.2f ms\n", stopTiming()); @@ -950,9 +950,9 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( unsigned long int data_coil_offset = (long int)coil_it * data_count; unsigned long int im_coil_offset = coil_it * (long int)imdata_count; - //data_d = kspaceData_gpu.data + data_coil_offset; + data_d = kspaceData_gpu.data + data_coil_offset; -// this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc); + this->updateConcurrentCoilCount(coil_it, n_coils, n_coils_cc); if (this->applySensData()) // perform automatically "repeating" of input image in case @@ -961,13 +961,13 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( copyDeviceToDeviceAsync(imgData_gpu.data, imdata_d + cnt * imdata_count, imdata_count, new_stream); else -// copyDeviceToDeviceAsync(imgData_gpu.data + im_coil_offset, imdata_d, - // imdata_count * n_coils_cc, new_stream); + copyDeviceToDeviceAsync(imgData_gpu.data + im_coil_offset, imdata_d, + imdata_count * n_coils_cc, new_stream); // reset temp arrays -// cudaMemsetAsync(gdata_d, 0, - // sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream); - //cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream); + cudaMemsetAsync(gdata_d, 0, + sizeof(CufftType) * gi_host->grid_width_dim * n_coils_cc, new_stream); + cudaMemsetAsync(data_d, 0, sizeof(CufftType) * data_count * n_coils_cc, new_stream); if (DEBUG && (cudaStreamSynchronize(new_stream)!= cudaSuccess)) printf("error at thread synchronization 1: %s\n", @@ -995,19 +995,28 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( writeOrderedGPU(data_sorted_d, data_indices_d, data_d, (int)this->kSpaceTraj.count(), n_coils_cc); copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream); + if(coil_it > 1) + { + cudaStreamSynchronize(old_stream); + cudaStreamDestroy(old_stream); + } + old_stream = new_stream; if ((coil_it + n_coils_cc) < (n_coils)) continue; + + cudaStreamSynchronize(old_stream); + cudaStreamDestroy(old_stream); freeTotalDeviceMemory(imdata_d, NULL); this->freeDeviceMemory(); return; } // apodization Correction - //performForwardDeapodization(imdata_d, deapo_d, gi_host); + performForwardDeapodization(imdata_d, deapo_d, gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 2: %s\n", cudaGetErrorString(cudaGetLastError())); // resize by oversampling factor and zero pad - //performPadding(imdata_d, gdata_d, gi_host); + performPadding(imdata_d, gdata_d, gi_host); if (debugTiming) startTiming(); @@ -1016,7 +1025,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( printf("error at thread synchronization 3: %s\n", cudaGetErrorString(cudaGetLastError())); // shift image to get correct zero frequency position - //performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host); + performFFTShift(gdata_d, INVERSE, getGridDims(), gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 4: %s\n", @@ -1026,9 +1035,9 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( int c = 0; while (c < n_coils_cc) { - // if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count, - // gdata_d + c * gi_host->gridDims_count, - // grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS) + if ((err = pt2CufftExec(fft_plan, gdata_d + c * gi_host->gridDims_count, + gdata_d + c * gi_host->gridDims_count, + grad_mode?CUFFT_INVERSE:CUFFT_FORWARD)) != CUFFT_SUCCESS) { fprintf(stderr, "cufft has failed with err %i \n", err); showMemoryInfo(true, stderr); @@ -1039,7 +1048,7 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 5: %s\n", cudaGetErrorString(cudaGetLastError())); - //performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host); + performFFTShift(gdata_d, grad_mode?INVERSE:FORWARD, getGridDims(), gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 6: %s\n", @@ -1052,8 +1061,8 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( startTiming(); // convolution and resampling to non-standard trajectory - //forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d, - // sector_centers_d, gi_host); + forwardConvolution(data_d, crds_d, gdata_d, NULL, sectors_d, + sector_centers_d, gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error at thread synchronization 7: %s\n", cudaGetErrorString(cudaGetLastError())); @@ -1061,26 +1070,26 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( if (debugTiming) printf("Forward Convolution: %.2f ms\n", stopTiming()); - // performFFTScaling(data_d, gi_host->data_count, gi_host); + performFFTScaling(data_d, gi_host->data_count, gi_host); if (DEBUG && (cudaStreamSynchronize(new_stream) != cudaSuccess)) printf("error: at thread synchronization 8: %s\n", cudaGetErrorString(cudaGetLastError())); // write result in correct order back into output array - // writeOrderedGPU(data_sorted_d, data_indices_d, data_d, - // (int)this->kSpaceTraj.count(), n_coils_cc); + writeOrderedGPU(data_sorted_d, data_indices_d, data_d, + (int)this->kSpaceTraj.count(), n_coils_cc); if(coil_it > 1) { cudaStreamSynchronize(old_stream); cudaStreamDestroy(old_stream); } - // copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream); + copyDeviceToDeviceAsync(data_sorted_d, data_d, data_count * n_coils_cc, new_stream); old_stream = new_stream; } // iterate over coils cudaStreamSynchronize(old_stream); cudaStreamDestroy(old_stream); - // freeTotalDeviceMemory(imdata_d, NULL); + freeTotalDeviceMemory(imdata_d, NULL); this->freeDeviceMemory(); if ((cudaDeviceSynchronize() != cudaSuccess)) fprintf(stderr, "error in performForwardGpuNUFFT function: %s\n", @@ -1210,8 +1219,16 @@ void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT( (int)this->kSpaceTraj.count(), n_coils_cc); copyFromDeviceAsync(data_sorted_d, kspaceData.data + data_coil_offset, data_count * n_coils_cc, new_stream); + if(coil_it > 1) + { + cudaStreamSynchronize(old_stream); + cudaStreamDestroy(old_stream); + } + old_stream = new_stream; if ((coil_it + n_coils_cc) < (n_coils)) continue; + cudaStreamSynchronize(old_stream); + cudaStreamDestroy(old_stream); freeTotalDeviceMemory(data_d, imdata_d, NULL); this->freeDeviceMemory(); return; diff --git a/CUDA/src/gpuNUFFT_operator_factory.cpp b/CUDA/src/gpuNUFFT_operator_factory.cpp index 647c8840..86a74e1c 100644 --- a/CUDA/src/gpuNUFFT_operator_factory.cpp +++ b/CUDA/src/gpuNUFFT_operator_factory.cpp @@ -10,11 +10,6 @@ #include #include -void gpuNUFFT::GpuNUFFTOperatorFactory::setUseTextures(bool useTextures) -{ - this->useTextures = useTextures; -} - void gpuNUFFT::GpuNUFFTOperatorFactory::setBalanceWorkload(bool balanceWorkload) { this->balanceWorkload = balanceWorkload; @@ -120,9 +115,6 @@ void gpuNUFFT::GpuNUFFTOperatorFactory::computeProcessingOrder( if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED) static_cast(gpuNUFFTOp) ->setSectorProcessingOrder(sectorProcessingOrder); - else - static_cast(gpuNUFFTOp) - ->setSectorProcessingOrder(sectorProcessingOrder); } gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::assignSectors( @@ -327,33 +319,14 @@ gpuNUFFT::GpuNUFFTOperatorFactory::createNewGpuNUFFTOperator( { if (balanceWorkload) { - if (useTextures) - { - debug("creating Balanced 2D TextureLookup Operator!\n"); - return new gpuNUFFT::BalancedTextureGpuNUFFTOperator( - kernelWidth, sectorWidth, osf, imgDims, TEXTURE2D_LOOKUP, - this->matlabSharedMem); - } - else - { - debug("creating Balanced GpuNUFFT Operator!\n"); - return new gpuNUFFT::BalancedGpuNUFFTOperator(kernelWidth, sectorWidth, - osf, imgDims, this->matlabSharedMem); - } + debug("creating Balanced GpuNUFFT Operator!\n"); + return new gpuNUFFT::BalancedGpuNUFFTOperator(kernelWidth, sectorWidth, + osf, imgDims, this->matlabSharedMem); } - if (useTextures) - { - debug("creating 2D TextureLookup Operator!\n"); - return new gpuNUFFT::TextureGpuNUFFTOperator(kernelWidth, sectorWidth, osf, - imgDims, TEXTURE2D_LOOKUP, this->matlabSharedMem); - } - else - { - debug("creating DEFAULT GpuNUFFT Operator!\n"); - return new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf, + debug("creating DEFAULT GpuNUFFT Operator!\n"); + return new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims, true, DEFAULT, true); - } } gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFunction( @@ -365,11 +338,7 @@ gpuNUFFT::Array gpuNUFFT::GpuNUFFTOperatorFactory::computeDeapodizationFu IndType sectorWidth = 8; gpuNUFFT::GpuNUFFTOperator *deapoGpuNUFFTOp; - if (useTextures) - deapoGpuNUFFTOp = new gpuNUFFT::TextureGpuNUFFTOperator(kernelWidth, sectorWidth, osf, - imgDims, TEXTURE2D_LOOKUP); - else - deapoGpuNUFFTOp = new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims); + deapoGpuNUFFTOp = new gpuNUFFT::GpuNUFFTOperator(kernelWidth, sectorWidth, osf, imgDims); // Data gpuNUFFT::Array dataArray; @@ -540,8 +509,8 @@ void gpuNUFFT::GpuNUFFTOperatorFactory::set_pts( gpuNUFFTOp->setSectorDataCount( computeSectorDataCount(gpuNUFFTOp, assignedSectors)); - if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED || - gpuNUFFTOp->getType() == gpuNUFFT::BALANCED_TEXTURE) { + if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED) + { computeProcessingOrder(gpuNUFFTOp); } @@ -605,9 +574,6 @@ gpuNUFFT::GpuNUFFTOperatorFactory::loadPrecomputedGpuNUFFTOperator( if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED) static_cast(gpuNUFFTOp) ->setSectorProcessingOrder(sectorProcessingOrder); - else if (gpuNUFFTOp->getType() == gpuNUFFT::BALANCED_TEXTURE) - static_cast(gpuNUFFTOp) - ->setSectorProcessingOrder(sectorProcessingOrder); gpuNUFFTOp->setSectorCenters(sectorCenters); gpuNUFFTOp->setSens(sensData); diff --git a/CUDA/src/texture_gpuNUFFT_operator.cpp b/CUDA/src/texture_gpuNUFFT_operator.cpp deleted file mode 100644 index 80bfff94..00000000 --- a/CUDA/src/texture_gpuNUFFT_operator.cpp +++ /dev/null @@ -1,103 +0,0 @@ - -#include "texture_gpuNUFFT_operator.hpp" - -void gpuNUFFT::TextureGpuNUFFTOperator::initKernel() -{ - IndType kernelSize = (interpolationType > 1) - ? calculateKernelSizeLinInt(osf, kernelWidth) - : calculateGrid3KernelSize(osf, kernelWidth); - this->kernel.dim.width = kernelSize; - this->kernel.dim.height = interpolationType > 1 ? kernelSize : 1; - this->kernel.dim.depth = interpolationType > 2 ? kernelSize : 1; - if (this->kernel.data != NULL) - free(this->kernel.data); - this->kernel.data = (DType *)calloc(this->kernel.count(), sizeof(DType)); - - switch (interpolationType) - { - case TEXTURE_LOOKUP: - load1DKernel(this->kernel.data, (int)kernelSize, (int)kernelWidth, osf); - break; - case TEXTURE2D_LOOKUP: - load2DKernel(this->kernel.data, (int)kernelSize, (int)kernelWidth, osf); - break; - case TEXTURE3D_LOOKUP: - load3DKernel(this->kernel.data, (int)kernelSize, (int)kernelWidth, osf); - break; - default: - load1DKernel(this->kernel.data, (int)kernelSize, (int)kernelWidth, osf); - } -} - -const char *gpuNUFFT::TextureGpuNUFFTOperator::getInterpolationTypeName() -{ - switch (interpolationType) - { - case TEXTURE_LOOKUP: - return "texKERNEL"; - case TEXTURE2D_LOOKUP: - return "texKERNEL2D"; - case TEXTURE3D_LOOKUP: - return "texKERNEL3D"; - default: - return "KERNEL"; - } -} - -gpuNUFFT::GpuNUFFTInfo * -gpuNUFFT::TextureGpuNUFFTOperator::initAndCopyGpuNUFFTInfo(int n_coils_cc) -{ - gpuNUFFT::GpuNUFFTInfo *gi_host = initGpuNUFFTInfo(n_coils_cc); - - gi_host->interpolationType = interpolationType; - gi_host->sectorsToProcess = gi_host->sector_count; - - if (DEBUG) - printf("copy GpuNUFFT Info to symbol memory... size = %lu \n", - (SizeType)sizeof(gpuNUFFT::GpuNUFFTInfo)); - - initConstSymbol("GI", gi_host, sizeof(gpuNUFFT::GpuNUFFTInfo)); - - if (DEBUG) - printf("...done!\n"); - return gi_host; -} - -void gpuNUFFT::TextureGpuNUFFTOperator::adjConvolution( - DType2 *data_d, DType *crds_d, CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host) -{ - bindTo1DTexture("texDATA", data_d, - this->kSpaceTraj.count() * gi_host->n_coils_cc); - - performTextureConvolution(data_d, crds_d, gdata_d, kernel_d, sectors_d, - sector_centers_d, gi_host); - - unbindTexture("texDATA"); -} - -void gpuNUFFT::TextureGpuNUFFTOperator::forwardConvolution( - CufftType *data_d, DType *crds_d, CufftType *gdata_d, DType *kernel_d, - IndType *sectors_d, IndType *sector_centers_d, - gpuNUFFT::GpuNUFFTInfo *gi_host) -{ - bindTo1DTexture("texGDATA", gdata_d, - gi_host->grid_width_dim * gi_host->n_coils_cc); - - performTextureForwardConvolution(data_d, crds_d, gdata_d, kernel_d, sectors_d, - sector_centers_d, gi_host); - - unbindTexture("texGDATA"); -} - -void gpuNUFFT::TextureGpuNUFFTOperator::initLookupTable() -{ - initTexture(getInterpolationTypeName(), &kernel_d, this->kernel); -} - -void gpuNUFFT::TextureGpuNUFFTOperator::freeLookupTable() -{ - if (kernel_d != NULL) - freeTexture(getInterpolationTypeName(), kernel_d); -} diff --git a/setup.py b/setup.py index e12094c9..e4573946 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def build_extension(self, ext): "-DGEN_PYTHON_FILES=ON", "-DGEN_MEX_FILES=OFF", "-DPYBIND11_INCLUDE_DIR=" + self.pybind_path] - cfg = "Debug"# if self.debug else "Release" + cfg = "Debug" if self.debug else "Release" build_args = ["--config", cfg] if platform.system() == "Windows": @@ -103,7 +103,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.8.1", + version="0.9.0", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")), From 5e267bba49959ae2a59613a0860e6d75c6392c74 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Thu, 29 Aug 2024 15:42:24 +0200 Subject: [PATCH 85/85] Fix memory leak --- CUDA/inc/gpuNUFFT_operator.hpp | 13 +++++++++++++ setup.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CUDA/inc/gpuNUFFT_operator.hpp b/CUDA/inc/gpuNUFFT_operator.hpp index 2bbc6cd0..7ba55745 100644 --- a/CUDA/inc/gpuNUFFT_operator.hpp +++ b/CUDA/inc/gpuNUFFT_operator.hpp @@ -93,18 +93,27 @@ class GpuNUFFTOperator void setKSpaceTraj(Array kSpaceTraj) { + if (this->kSpaceTraj.data != NULL) + freeLocalMemberArray(this->kSpaceTraj.data); this->kSpaceTraj = kSpaceTraj; } void setSectorCenters(Array sectorCenters) { + if(this->sectorCenters.data != NULL) + freeLocalMemberArray(this->sectorCenters.data); this->sectorCenters = sectorCenters; } void setSectorDataCount(Array sectorDataCount) { + if(this->sectorDataCount.data != NULL) + freeLocalMemberArray(this->sectorDataCount.data); + this->sectorDataCount = sectorDataCount; } void setDataIndices(Array dataIndices) { + if (this->dataIndices.data != NULL) + freeLocalMemberArray(this->dataIndices.data); this->dataIndices = dataIndices; } void setSens(Array sens) @@ -113,10 +122,14 @@ class GpuNUFFTOperator } void setDens(Array dens) { + if (this->dens.data != NULL) + freeLocalMemberArray(this->dens.data); this->dens = dens; } void setDeapodizationFunction(Array deapo) { + if (this->deapo.data != NULL) + freeLocalMemberArray(this->deapo.data); this->deapo= deapo; } diff --git a/setup.py b/setup.py index e4573946..8be8c15e 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def build_extension(self, ext): setup( name="gpuNUFFT", - version="0.9.0", + version="0.10.0", description="gpuNUFFT - An open source GPU Library for 3D Gridding and NUFFT", ext_modules=[ CMakeExtension("gpuNUFFT", sourcedir=os.path.join("CUDA")),