diff --git a/CMakeLists.txt b/CMakeLists.txt
index cfeea6ee9c..9664237ab3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,6 +97,24 @@ set(DEBUG_MEMORY "Off" CACHE STRING "Sanitizers")
 # emscripten
 option(EMSCRIPTEN_HTML "Emscripten HTML output" OFF)
 
+option(CUPDLP_GPU "Build pdlp with GPU" OFF)
+message(STATUS "Build pdlp with GPU: ${CUPDLP_GPU}")
+
+if (NOT LINUX)
+  set (CUPDLP_GPU OFF)
+  message(STATUS "cuPDLP with NVIDIA GPU is only supported on Linux at the moment. Using CPU version.")
+endif()
+
+if (CUPDLP_GPU)
+  set (CUPDLP_CPU OFF)
+  list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
+  message(NOTICE "Building cuPDLP with CUDA")
+  include(FindCUDAConf.cmake)
+else()
+  set (CUPDLP_CPU ON)
+  set(CUDA_LIBRARY-NOTFOUND true)
+endif()
+
 if (BUILD_CXX)
   # Default Build Type to be Release
   get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
@@ -248,7 +266,11 @@ if (BUILD_CXX)
     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
     message(STATUS "IPO / LTO: enabled")
   endif()
-endif()
+  if (CUPDLP_GPU AND CMAKE_INTERPROCEDURAL_OPTIMIZATION)
+    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)
+    message(STATUS "IPO / LTO is not currently supported when PDLP uses the GPU: LTO disabled.")
+  endif()
+
 
 include(CheckCXXSourceCompiles)
 check_cxx_source_compiles(
diff --git a/FindCUDAConf.cmake b/FindCUDAConf.cmake
new file mode 100644
index 0000000000..b6697918da
--- /dev/null
+++ b/FindCUDAConf.cmake
@@ -0,0 +1,33 @@
+
+set(CUDA_LIBRARY-NOTFOUND OFF)
+message(NOTICE "Finding CUDA environment")
+message(NOTICE " - CUDA Home detected at $ENV{CUDA_HOME}")
+set(CMAKE_CUDA_ARCHITECTURES "all")
+set(CMAKE_CUDA_PATH "$ENV{CUDA_HOME}")
+set(CMAKE_CUDA_COMPILER "${CMAKE_CUDA_PATH}/bin/nvcc")
+
+enable_language(CUDA)
+
+find_library(CUDA_LIBRARY_ART
+  NAMES cudart
+  HINTS "${CMAKE_CUDA_PATH}/lib64/"
+  REQUIRED
+)
+find_library(CUDA_LIBRARY_SPS
+  NAMES cusparse
+  HINTS "${CMAKE_CUDA_PATH}/lib64/"
+  REQUIRED
+)
+find_library(CUDA_LIBRARY_BLS
+  NAMES cublas
+  HINTS "${CMAKE_CUDA_PATH}/lib64/"
+  REQUIRED
+)
+if (${CUDA_LIBRARY-NOTFOUND})
+  message(WARNING " - CUDA Libraries not detected at $ENV{CUDA_HOME}")
+else ()
+  message(NOTICE " - CUDA Libraries detected at $ENV{CUDA_HOME}")
+  set(CUDA_LIBRARY ${CUDA_LIBRARY_ART} ${CUDA_LIBRARY_SPS} ${CUDA_LIBRARY_BLS})
+  message(NOTICE " - CUDA Libraries: ${CUDA_LIBRARY}")
+endif ()
+
diff --git a/check/CMakeLists.txt b/check/CMakeLists.txt
index 0fac3894aa..cd228a86c6 100644
--- a/check/CMakeLists.txt
+++ b/check/CMakeLists.txt
@@ -13,6 +13,19 @@ if (FORTRAN)
     ${HIGHS_SOURCE_DIR}/check)
 endif()
 
+if (CUPDLP_GPU)
+  # add a test
+  add_executable(testcudalin ${HIGHS_SOURCE_DIR}/src/pdlp/cupdlp/cuda/test_cuda_linalg.c)
+  add_executable(testcublas ${HIGHS_SOURCE_DIR}/src/pdlp/cupdlp/cuda/test_cublas.c)
+
+  set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  #target_include_directories(cudalinalg PRIVATE ${CUPDLP_INCLUDE_DIR}/cuda)
+  target_link_libraries(testcudalin PRIVATE highs ${CUDA_LIBRARY})
+
+  set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  target_link_libraries(testcublas PRIVATE highs ${CUDA_LIBRARY})
+endif()
+
 if (NOT FAST_BUILD OR ALL_TESTS)
   # prepare Catch library
   set(CATCH_INCLUDE_DIR ${HIGHS_SOURCE_DIR}/extern)
diff --git a/cmake/cpp-highs.cmake b/cmake/cpp-highs.cmake
index 88a529a6f3..56e6a03d3a 100644
--- a/cmake/cpp-highs.cmake
+++ b/cmake/cpp-highs.cmake
@@ -48,18 +48,35 @@ install(FILES
   ${PROJECT_BINARY_DIR}/highs_export.h
 
 string (TOLOWER ${PROJECT_NAME} lower)
-install(TARGETS highs
-  EXPORT ${lower}-targets
-  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs)
-
-# Add library targets to the build-tree export set
-export(TARGETS highs
-  NAMESPACE ${PROJECT_NAMESPACE}::highs
-  FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
+# if (NOT CUPDLP_GPU)
+  install(TARGETS highs
+    EXPORT ${lower}-targets
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs)
+
+  # Add library targets to the build-tree export set
+  export(TARGETS highs
+    NAMESPACE ${PROJECT_NAMESPACE}::highs
+    FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
+# else()
+
+#   install(TARGETS highs cudalin
+#     EXPORT ${lower}-targets
+#     INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs
+#     ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+#     LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+#     RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+#     PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs)
+
+#   # Add library targets to the build-tree export set
+#   export(TARGETS highs cudalin
+#     NAMESPACE ${PROJECT_NAMESPACE}::highs
+#     FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
+# endif()
+
 install(EXPORT ${lower}-targets
   NAMESPACE ${PROJECT_NAMESPACE}::
diff --git a/cmake/python-highs.cmake b/cmake/python-highs.cmake
index 4a429cb110..4217a4e15a 100644
--- a/cmake/python-highs.cmake
+++ b/cmake/python-highs.cmake
@@ -45,6 +45,15 @@ target_link_libraries(_core PRIVATE pybind11::headers)
 # sources for python
 target_sources(_core PUBLIC ${sources_python} ${headers_python})
 
+if (CUPDLP_GPU)
+  enable_language(CXX CUDA)
+  target_sources(_core PRIVATE ${cuda_sources_python})
+
+  target_include_directories(_core PUBLIC "/usr/local/cuda/include")
+  set_target_properties(_core PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+endif()
+
+
 # include directories for python
 target_include_directories(_core PUBLIC ${include_dirs_python})
 
diff --git a/cmake/sources-python.cmake b/cmake/sources-python.cmake
index df2ff5a223..a6c65608f1 100644
--- a/cmake/sources-python.cmake
+++ b/cmake/sources-python.cmake
@@ -43,6 +43,12 @@ set(cupdlp_headers_python
   src/pdlp/cupdlp/cupdlp_step.h
   src/pdlp/cupdlp/cupdlp_utils.c)
 
+set(cuda_sources_python
+  src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
+  src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
+  src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
+  src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu)
+
 set(basiclu_sources_python
   src/ipm/basiclu/basiclu_factorize.c
   src/ipm/basiclu/basiclu_get_factors.c
diff --git a/cmake/sources.cmake b/cmake/sources.cmake
index 96e84b7f5b..e1e8e248f6 100644
--- a/cmake/sources.cmake
+++ b/cmake/sources.cmake
@@ -43,6 +43,13 @@ set(cupdlp_headers
   pdlp/cupdlp/cupdlp_step.h
   pdlp/cupdlp/cupdlp_utils.c)
 
+set(cuda_sources
+  pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
+  pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
+  pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
+  pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu)
+
+
 set(basiclu_sources
   ipm/basiclu/basiclu_factorize.c
   ipm/basiclu/basiclu_get_factors.c
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 69a78869db..915917818d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -177,6 +177,17 @@ else()
   # $)
 
   target_sources(highs PRIVATE ${sources} ${headers} ${win_version_file})
+
+  # Optional CUDA
+  if (CUPDLP_GPU)
+    enable_language(CXX CUDA)
+    target_sources(highs PRIVATE ${cuda_sources})
+
+    set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+    target_link_libraries(highs ${CUDA_LIBRARY} m)
+  endif()
+
   target_include_directories(highs PRIVATE ${include_dirs})
 
   if(MSVC)
diff --git a/src/HConfig.h.in b/src/HConfig.h.in
index 198e07b0d3..b4e6d6cb5f 100644
--- a/src/HConfig.h.in
+++ b/src/HConfig.h.in
@@ -4,6 +4,7 @@
 #cmakedefine FAST_BUILD
 #cmakedefine ZLIB_FOUND
 #cmakedefine CUPDLP_CPU
+#cmakedefine CUPDLP_GPU
 #cmakedefine CMAKE_BUILD_TYPE "@CMAKE_BUILD_TYPE@"
 #cmakedefine CMAKE_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@"
 #cmakedefine HIGHSINT64
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
new file mode 100644
index 0000000000..519408b7c0
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
@@ -0,0 +1,121 @@
+#include "cupdlp_cuda_kernels.cuh"
+
+dim3 cuda_gridsize(cupdlp_int n) {
+  cupdlp_int k = (n - 1) / CUPDLP_BLOCK_SIZE + 1;
+  cupdlp_int x = k;
+  cupdlp_int y = 1;
+  if (x > 65535) {
+    x = ceil(sqrt(k));
+    y = (n - 1) / (x * CUPDLP_BLOCK_SIZE) + 1;
+  }
+  dim3 d = {(unsigned int)x, (unsigned int)y, 1};
+  return d;
+}
+
+__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] *= y[i];
+}
+
+__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] /= y[i];
+}
+
+__global__ void element_wise_projlb_kernel(cupdlp_float *x,
+                                           const cupdlp_float *lb,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] < lb[i] ? lb[i] : x[i];
+}
+
+__global__ void element_wise_projub_kernel(cupdlp_float *x,
+                                           const cupdlp_float *ub,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] > ub[i] ? ub[i] : x[i];
+}
+
+__global__ void element_wise_projSamelb_kernel(cupdlp_float *x,
+                                               const cupdlp_float lb,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] <= lb ? lb : x[i];
+}
+
+__global__ void element_wise_projSameub_kernel(cupdlp_float *x,
+                                               const cupdlp_float ub,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] >= ub ? ub : x[i];
+}
+
+__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb,
+                                              const cupdlp_float *lb,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) haslb[i] = lb[i] > bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub,
+                                              const cupdlp_float *ub,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) hasub[i] = ub[i] < bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_filterlb_kernal(cupdlp_float *x,
+                                             const cupdlp_float *lb,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = lb[i] > bound ? lb[i] : 0.0;
+}
+
+__global__ void element_wise_filterub_kernal(cupdlp_float *x,
+                                             const cupdlp_float *ub,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = ub[i] < bound ? ub[i] : 0.0;
+}
+
+__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val,
+                                     const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = val;
+}
+
+// xUpdate = x - dPrimalStep * (cost - ATy)
+__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) xUpdate[i] = x[i] - dPrimalStep * (cost[i] - ATy[i]);
+}
+
+// yUpdate = y + dDualStep * (b - 2 * AxUpdate + Ax)
+__global__ void dual_grad_step_kernal(cupdlp_float *yUpdate,
+                                      const cupdlp_float *y,
+                                      const cupdlp_float *b,
+                                      const cupdlp_float *Ax,
+                                      const cupdlp_float *AxUpdate,
+                                      const cupdlp_float dDualStep,
+                                      const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) yUpdate[i] = y[i] + dDualStep * (b[i] - 2 * AxUpdate[i] + Ax[i]);
+}
+
+// z = x - y
+__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
+                                 const cupdlp_float *y, const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) z[i] = x[i] - y[i];
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
new file mode 100644
index 0000000000..c4240abd98
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
@@ -0,0 +1,181 @@
+#ifndef CUPDLP_CUDA_KERNELS_H
+#define CUPDLP_CUDA_KERNELS_H
+
+#include "cuda_runtime.h"
+#define CUPDLP_BLOCK_SIZE 512
+
+#ifndef SFLOAT
+#ifdef DLONG
+typedef long long cupdlp_int;
+#else
+typedef int cupdlp_int;
+#endif
+typedef double cupdlp_float;
+#define CudaComputeType CUDA_R_64F
+#else
+#define CudaComputeType CUDA_R_32F
+#endif
+
+#define CHECK_CUDA(func)                                               \
+  {                                                                    \
+    cudaError_t status = (func);                                       \
+    if (status != cudaSuccess) {                                       \
+      printf("CUDA API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cudaGetErrorString(status), status);  \
+      return EXIT_FAILURE;                                             \
+    }                                                                  \
+  }
+
+#define CHECK_CUSPARSE(func)                                               \
+  {                                                                        \
+    cusparseStatus_t status = (func);                                      \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                               \
+      printf("CUSPARSE API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cusparseGetErrorString(status), status);  \
+      return EXIT_FAILURE;                                                 \
+    }                                                                      \
+  }
+
+#define CHECK_CUBLAS(func)                                               \
+  {                                                                      \
+    cublasStatus_t status = (func);                                      \
+    if (status != CUBLAS_STATUS_SUCCESS) {                               \
+      printf("CUBLAS API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cublasGetStatusString(status), status); \
+      return EXIT_FAILURE;                                               \
+    }                                                                    \
+  }
+
+#define CUPDLP_FREE_VEC(x) \
+  {                        \
+    cudaFree(x);           \
+    x = cupdlp_NULL;       \
+  }
+
+#define CUPDLP_COPY_VEC(dst, src, type, size) \
+  cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDefault)
+
+#define CUPDLP_INIT_VEC(var, size)                                         \
+  {                                                                        \
+    cudaError_t status =                                                   \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));          \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+  }
+#define CUPDLP_INIT_ZERO_VEC(var, size)                                    \
+  {                                                                        \
+    cudaError_t status =                                                   \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));          \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+    status = cudaMemset(var, 0, (size) * sizeof(typeof(*var)));            \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+  }
+#define CUPDLP_INIT_ZERO_INT_VEC(var, size)                                \
+  {                                                                        \
+    cudaError_t status =                                                   \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));          \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+    status = cudaMemset(var, 0, (size) * sizeof(typeof(*var)));            \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+  }
+
+#define CUPDLP_INIT_ZERO_DOUBLE_VEC(var, size)                             \
+  {                                                                        \
+    cudaError_t status =                                                   \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));          \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+    status = cudaMemset(var, 0, (size) * sizeof(typeof(*var)));            \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+  }
+#define CUPDLP_ZERO_VEC(var, type, size) \
+  cudaMemset(var, 0, sizeof(type) * (size))
+
+dim3 cuda_gridsize(cupdlp_int n);
+
+__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len);
+
+__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len);
+
+__global__ void element_wise_projlb_kernel(cupdlp_float *x,
+                                           const cupdlp_float *lb,
+                                           const cupdlp_int len);
+
+__global__ void element_wise_projub_kernel(cupdlp_float *x,
+                                           const cupdlp_float *ub,
+                                           const cupdlp_int len);
+
+__global__ void element_wise_projSamelb_kernel(cupdlp_float *x,
+                                               const cupdlp_float lb,
+                                               const cupdlp_int len);
+
+__global__ void element_wise_projSameub_kernel(cupdlp_float *x,
+                                               const cupdlp_float ub,
+                                               const cupdlp_int len);
+
+__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb,
+                                              const cupdlp_float *lb,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len);
+
+__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub,
+                                              const cupdlp_float *ub,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len);
+
+__global__ void element_wise_filterlb_kernal(cupdlp_float *x,
+                                             const cupdlp_float *lb,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len);
+
+__global__ void element_wise_filterub_kernal(cupdlp_float *x,
+                                             const cupdlp_float *ub,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len);
+
+__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val,
+                                     const cupdlp_int len);
+
+__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len);
+
+__global__ void dual_grad_step_kernal(
+    cupdlp_float *yUpdate, const cupdlp_float *y, const cupdlp_float *b,
+    const cupdlp_float *Ax, const cupdlp_float *AxUpdate,
+    const cupdlp_float dDualStep, const cupdlp_int len);
+
+__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
+                                 const cupdlp_float *y, const cupdlp_int len);
+#endif
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu
new file mode 100644
index 0000000000..332b417899
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu
@@ -0,0 +1,209 @@
+#include "cupdlp_cudalinalg.cuh"
+
+extern "C" cupdlp_int cuda_alloc_MVbuffer(
+    cusparseHandle_t handle, cusparseSpMatDescr_t cuda_csc,
+    cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecAx,
+    cusparseSpMatDescr_t cuda_csr, cusparseDnVecDescr_t vecY,
+    cusparseDnVecDescr_t vecATy, void **dBuffer) {
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  size_t AxBufferSize = 0;
+  size_t ATyBufferSize = 0;
+  cupdlp_float alpha = 1.0;
+  cupdlp_float beta = 0.0;
+  // cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
+  cusparseSpMVAlg_t alg = CUSPARSE_SPMV_CSR_ALG2;  // deterministic
+
+  // get the buffer size needed by csr Ax
+  CHECK_CUSPARSE(cusparseSpMV_bufferSize(
+      handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, cuda_csr, vecX, &beta,
+      vecAx, computeType, alg, &AxBufferSize))
+
+  // get the buffer size needed by csc ATy
+  CHECK_CUSPARSE(cusparseSpMV_bufferSize(
+      handle, CUSPARSE_OPERATION_TRANSPOSE, &alpha, cuda_csc, vecY, &beta,
+      vecATy, computeType, alg, &ATyBufferSize))
+
+  size_t bufferSize =
+      (AxBufferSize > ATyBufferSize) ? AxBufferSize : ATyBufferSize;
+
+  // allocate an external buffer if needed
+  CHECK_CUDA(cudaMalloc(dBuffer, bufferSize))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csc_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csc,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta) {
+  // hAx = alpha * Acsc * hX + beta * hAx
+
+  cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csc, vecX, &beta, vecAx,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csr_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csr,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta) {
+  // hAx = alpha * Acsr * hX + beta * hAx
+
+  cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csr, vecX, &beta, vecAx,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csc_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csc,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta) {
+  // hATy = alpha * Acsc^T * hY + beta * hATy
+  cusparseOperation_t op = CUSPARSE_OPERATION_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csc, vecY, &beta, vecATy,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csr_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csr,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta) {
+  // hATy = alpha * Acsr^T * hY + beta * hATy
+  cusparseOperation_t op = CUSPARSE_OPERATION_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csr, vecY, &beta, vecATy,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" void cupdlp_projSameub_cuda(cupdlp_float *x, const cupdlp_float ub,
+                                       const cupdlp_int len) {
+  element_wise_projSameub_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, ub, len);
+}
+
+extern "C" void cupdlp_projSamelb_cuda(cupdlp_float *x, const cupdlp_float lb,
+                                       const cupdlp_int len) {
+  element_wise_projSamelb_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, lb, len);
+}
+
+extern "C" void cupdlp_projub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                   const cupdlp_int len) {
+  element_wise_projub_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, ub,
+                                                                        len);
+}
+
+extern "C" void cupdlp_projlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                   const cupdlp_int len) {
+  element_wise_projlb_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, lb,
+                                                                        len);
+}
+
+extern "C" void cupdlp_ediv_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len) {
+  element_wise_div_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, y, len);
+}
+
+extern "C" void cupdlp_edot_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len) {
+  element_wise_dot_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, y, len);
+}
+
+extern "C" void cupdlp_haslb_cuda(cupdlp_float *haslb, const cupdlp_float *lb,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len) {
+  element_wise_initHaslb_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      haslb, lb, bound, len);
+}
+
+extern "C" void cupdlp_hasub_cuda(cupdlp_float *hasub, const cupdlp_float *ub,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len) {
+  element_wise_initHasub_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      hasub, ub, bound, len);
+}
+
+extern "C" void cupdlp_filterlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len) {
+  element_wise_filterlb_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, lb, bound, len);
+}
+
+extern "C" void cupdlp_filterub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len) {
+  element_wise_filterub_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, ub, bound, len);
+}
+
+extern "C" void cupdlp_initvec_cuda(cupdlp_float *x, const cupdlp_float val,
+                                    const cupdlp_int len) {
+  init_cuda_vec_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, val, len);
+}
+
+extern "C" void cupdlp_pgrad_cuda(cupdlp_float *xUpdate,
+                                  const cupdlp_float *x,
+                                  const cupdlp_float *cost,
+                                  const cupdlp_float *ATy,
+                                  const cupdlp_float dPrimalStep,
+                                  const cupdlp_int len) {
+  primal_grad_step_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      xUpdate, x, cost, ATy, dPrimalStep, len);
+}
+
+extern "C" void cupdlp_dgrad_cuda(cupdlp_float *yUpdate, const cupdlp_float *y,
+                                  const cupdlp_float *b, const cupdlp_float *Ax,
+                                  const cupdlp_float *AxUpdate,
+                                  const cupdlp_float dDualStep,
+                                  const cupdlp_int len) {
+  dual_grad_step_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      yUpdate, y, b, Ax, AxUpdate, dDualStep, len);
+}
+
+extern "C" void cupdlp_sub_cuda(cupdlp_float *z, const cupdlp_float *x,
+                                const cupdlp_float *y, const cupdlp_int len)
+{
+  naive_sub_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(z, x, y, len);
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
new file mode 100644
index 0000000000..e0103b9228
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
@@ -0,0 +1,98 @@
+#ifndef CUPDLP_CUDA_LINALG_H
+#define CUPDLP_CUDA_LINALG_H
+
+#include <cublas_v2.h>     // cublas
+#include <cuda_runtime.h>  // cudaMalloc, cudaMemcpy, etc.
+#include <cusparse.h>      // cusparseSpMV
+
+#include "cupdlp_cuda_kernels.cuh"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>   // printf
+#include <stdlib.h>  // EXIT_FAILURE
+
+// #include "../cupdlp_defs.h"
+// #include "../glbopts.h"
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+extern "C" cupdlp_int cuda_alloc_MVbuffer(
+    cusparseHandle_t handle, cusparseSpMatDescr_t cuda_csc,
+    cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecAx,
+    cusparseSpMatDescr_t cuda_csr, cusparseDnVecDescr_t vecY,
+    cusparseDnVecDescr_t vecATy, void **dBuffer);
+
+extern "C" cupdlp_int cuda_csc_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csc,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta);
+extern "C" cupdlp_int cuda_csr_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csr,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta);
+extern "C" cupdlp_int cuda_csc_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csc,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta);
+extern "C" cupdlp_int cuda_csr_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csr,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta);
+
+extern "C" void cupdlp_projSameub_cuda(cupdlp_float *x, const cupdlp_float ub,
+                                       const cupdlp_int len);
+extern "C" void cupdlp_projSamelb_cuda(cupdlp_float *x, const cupdlp_float lb,
+                                       const cupdlp_int len);
+extern "C" void cupdlp_projub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                   const cupdlp_int len);
+extern "C" void cupdlp_projlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                   const cupdlp_int len);
+extern "C" void cupdlp_ediv_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len);
+extern "C" void cupdlp_edot_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len);
+extern "C" void cupdlp_haslb_cuda(cupdlp_float *haslb, const cupdlp_float *lb,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len);
+extern "C" void cupdlp_hasub_cuda(cupdlp_float *hasub, const cupdlp_float *ub,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len);
+extern "C" void cupdlp_filterlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len);
+extern "C" void cupdlp_filterub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len);
+extern "C" void cupdlp_initvec_cuda(cupdlp_float *x, const cupdlp_float val,
+                                    const cupdlp_int len);
+
+extern "C" void cupdlp_pgrad_cuda(cupdlp_float *xUpdate, const cupdlp_float *x,
+                                  const cupdlp_float *cost,
+                                  const cupdlp_float *ATy,
+                                  const cupdlp_float dPrimalStep,
+                                  const cupdlp_int len);
+
+extern "C" void cupdlp_dgrad_cuda(cupdlp_float *yUpdate, const cupdlp_float *y,
+                                  const cupdlp_float *b, const cupdlp_float *Ax,
+                                  const cupdlp_float *AxUpdate,
+                                  const cupdlp_float dDualStep,
+                                  const cupdlp_int len);
+
+extern "C" void cupdlp_sub_cuda(cupdlp_float *z, const cupdlp_float *x,
+                                const cupdlp_float *y, const cupdlp_int len);
+#endif
+#endif
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/test_cublas.c b/src/pdlp/cupdlp/cuda/test_cublas.c
new file mode 100644
index 0000000000..4e1a52b85e
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/test_cublas.c
@@ -0,0 +1,154 @@
+#include "cupdlp_cuda_kernels.cuh"
+#include "cupdlp_cudalinalg.cuh"
+
+int use_cublas(cublasHandle_t cublashandle) {
+  cupdlp_int len = 10;
+  // cupdlp_int len = 1<<10;
+
+  // int N = 1<<20;
+
+  // alloc and init host vec memory
+  cupdlp_float *h_vec1 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  cupdlp_float *h_vec2 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  for (cupdlp_int i = 0; i < len; i++) {
+    h_vec1[i] = 1.0;
+    h_vec2[i] = i;
+    // h_vec1[i] = 1.0;
+    // h_vec2[i] = 2.0;
+  }
+
+  // alloc and init device vec memory
+  cupdlp_float *d_vec1;
+  cupdlp_float *d_vec2;
+  cudaMalloc((void **)&d_vec1, len * sizeof(cupdlp_float));
+  cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+
+  cudaMalloc((void **)&d_vec2, len * sizeof(cupdlp_float));
+  cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+
+  // init cublas handle
+  // cublasHandle_t cublashandle;
+  // CHECK_CUBLAS(cublasCreate(&cublashandle));
+
+  cupdlp_float result;
+  // call nrm2
+  CHECK_CUBLAS(cublasDnrm2(cublashandle, len, d_vec1, 1, &result));
+
+  // print result
+  printf("2-norm is :%f\n", result);
+
+  // copy result back to host
+  // cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaError_t errSync = cudaGetLastError();
+  // cudaError_t errAsync = cudaDeviceSynchronize();
+  // if (errSync != cudaSuccess)
+  //   printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
+  // if (errAsync != cudaSuccess)
+  //   printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
+
+  // // print result
+  // for (cupdlp_int i = 0; i < len; i++) {
+  //   printf("%f\n", h_vec1[i]);
+  //   // printf("%f\n", h_vec2[i]);
+  // }
+
+  // destroy cublas handle
+  // CHECK_CUBLAS(cublasDestroy(cublashandle));
+
+  // free memory
+  free(h_vec1);
+  free(h_vec2);
+  cudaFree(d_vec1);
+  cudaFree(d_vec2);
+
+  return EXIT_SUCCESS;
+}
+int main() {
+  // try cupdlp_edot_cuda
+
+  // int nDevices;
+
+  // cudaGetDeviceCount(&nDevices);
+  // for (int i = 0; i < nDevices; i++) {
+  //   cudaDeviceProp prop;
+  //   cudaGetDeviceProperties(&prop, i);
+  //   printf("Device Number: %d\n", i);
+  //   printf("  Device name: %s\n", prop.name);
+  //   printf("  Memory Clock Rate (KHz): %d\n",
+  //          prop.memoryClockRate);
+  //   printf("  Memory Bus Width (bits): %d\n",
+  //          prop.memoryBusWidth);
+  //   printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
+  //          2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8)
+  //          / 1.0e6);
+  // }
+
+  // cupdlp_int len = 10;
+  // // cupdlp_int len = 1<<10;
+
+  // // int N = 1<<20;
+
+  // // alloc and init host vec memory
+  // cupdlp_float *h_vec1 = (cupdlp_float *) malloc(len * sizeof(cupdlp_float));
+  // cupdlp_float *h_vec2 = (cupdlp_float *) malloc(len * sizeof(cupdlp_float));
+  // for (cupdlp_int i = 0; i < len; i++) {
+  //   h_vec1[i] = 1.0;
+  //   h_vec2[i] = i;
+  //   // h_vec1[i] = 1.0;
+  //   // h_vec2[i] = 2.0;
+  // }
+
+  // // alloc and init device vec memory
+  // cupdlp_float *d_vec1;
+  // cupdlp_float *d_vec2;
+  // cudaMalloc((void **) &d_vec1, len * sizeof(cupdlp_float));
+  // // cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float),
+  // //            cudaMemcpyHostToDevice);
+
+  // cudaMalloc((void **) &d_vec2, len * sizeof(cupdlp_float));
+  // cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float),
+  //            cudaMemcpyHostToDevice);
+
+  // init cublas handle
+  cublasHandle_t cublashandle;
+  CHECK_CUBLAS(cublasCreate(&cublashandle));
+  use_cublas(cublashandle);
+
+  // cupdlp_float result;
+  // call nrm2
+  // CHECK_CUBLAS(cublasDnrm2(cublashandle, len, d_vec1, 1, &result));
+
+  // print result
+  // printf("2-norm is :%f\n", result);
+
+  // copy result back to host
+  // cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaError_t errSync = cudaGetLastError();
+  // cudaError_t errAsync = cudaDeviceSynchronize();
+  // if (errSync != cudaSuccess)
+  //   printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
+  // if (errAsync != cudaSuccess)
+  //   printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
+
+  // // print result
+  // for (cupdlp_int i = 0; i < len; i++) {
+  //   printf("%f\n", h_vec1[i]);
+  //   // printf("%f\n", h_vec2[i]);
+  // }
+
+  // destroy cublas handle
+  CHECK_CUBLAS(cublasDestroy(cublashandle));
+
+  // // free memory
+  // free(h_vec1);
+  // free(h_vec2);
+  // cudaFree(d_vec1);
+  // cudaFree(d_vec2);
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/test_cuda_linalg.c b/src/pdlp/cupdlp/cuda/test_cuda_linalg.c
new file mode 100644
index 0000000000..ad19bbe701
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/test_cuda_linalg.c
@@ -0,0 +1,79 @@
+#include "cupdlp_cuda_kernels.cuh"
+#include "cupdlp_cudalinalg.cuh"
+
+int main() {
+  // try cupdlp_edot_cuda
+
+  int nDevices;
+
+  cudaGetDeviceCount(&nDevices);
+  // for (int i = 0; i < nDevices; i++) {
+  //   cudaDeviceProp prop;
+  //   cudaGetDeviceProperties(&prop, i);
+  //   printf("Device Number: %d\n", i);
+  //   printf("  Device name: %s\n", prop.name);
+  //   printf("  Memory Clock Rate (KHz): %d\n",
+  //          prop.memoryClockRate);
+  //   printf("  Memory Bus Width (bits): %d\n",
+  //          prop.memoryBusWidth);
+  //   printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
+  //          2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8)
+  //          / 1.0e6);
+  // }
+
+  cupdlp_int len = 10;
+  // cupdlp_int len = 1<<10;
+
+  // int N = 1<<20;
+
+  // alloc and init host vec memory
+  cupdlp_float *h_vec1 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  cupdlp_float *h_vec2 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  for (cupdlp_int i = 0; i < len; i++) {
+    h_vec1[i] = i;
+    h_vec2[i] = i;
+    // h_vec1[i] = 1.0;
+    // h_vec2[i] = 2.0;
+  }
+
+  // alloc and init device vec memory
+  cupdlp_float *d_vec1;
+  cupdlp_float *d_vec2;
+  cudaMalloc((void **)&d_vec1, len * sizeof(cupdlp_float));
+  cudaMalloc((void **)&d_vec2, len * sizeof(cupdlp_float));
+  cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+
+  // call cupdlp_edot_cuda
+  cupdlp_edot_cuda(d_vec1, d_vec2, len);
+  // element_wise_dot_kernel<<<(len+255)/256, 256>>>(d_vec1, d_vec2, len);
+  // saxpy<<<(len+255)/256, 256>>>(len, 2.0f, d_vec1, d_vec2);
+  // element_wise_dot<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(d_vec1, d_vec2,
+  //                                                             len);
+
+  // copy result back to host
+  cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float),
+             cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float),
+             cudaMemcpyDeviceToHost);
+  cudaError_t errSync = cudaGetLastError();
+  cudaError_t errAsync = cudaDeviceSynchronize();
+  if (errSync != cudaSuccess)
+    printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
+  if (errAsync != cudaSuccess)
+    printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
+
+  // print result
+  for (cupdlp_int i = 0; i < len; i++) {
+    printf("%f\n", h_vec1[i]);
+    // printf("%f\n", h_vec2[i]);
+  }
+
+  // free memory
+  free(h_vec1);
+  free(h_vec2);
+  cudaFree(d_vec1);
+  cudaFree(d_vec2);
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cupdlp_defs.h b/src/pdlp/cupdlp/cupdlp_defs.h
index 068c150935..f1d7ecfd27 100644
--- a/src/pdlp/cupdlp/cupdlp_defs.h
+++ b/src/pdlp/cupdlp/cupdlp_defs.h
@@ -1,7 +1,7 @@
 #ifndef CUPDLP_H_GUARD
 #define CUPDLP_H_GUARD
 
-#define CUPDLP_CPU
+// #define CUPDLP_CPU
 #define CUPDLP_DEBUG (0)
 #define CUPDLP_TIMER
 
@@ -9,6 +9,7 @@
 #include "cuda/cupdlp_cuda_kernels.cuh"
 #include "cuda/cupdlp_cudalinalg.cuh"
 #endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/pdlp/cupdlp/cupdlp_utils.c b/src/pdlp/cupdlp/cupdlp_utils.c
index 58b14b2ac3..1d9823a276 100644
--- a/src/pdlp/cupdlp/cupdlp_utils.c
+++ b/src/pdlp/cupdlp/cupdlp_utils.c
@@ -384,6 +384,15 @@ cupdlp_int PDHG_Clear(CUPDLPwork *w) {
   if (scaling) {
     // scaling_clear(scaling);
+    if (scaling->colScale) {
+      // cupdlp_free(scaling->colScale);
+      CUPDLP_FREE_VEC(scaling->colScale);  // now on gpu
+    }
+    if (scaling->rowScale) {
+      // cupdlp_free(scaling->rowScale);
+      CUPDLP_FREE_VEC(scaling->rowScale);  // now on gpu
+    }
+
     // cupdlp_free(scaling);
     scaling = cupdlp_NULL;
   }
   cupdlp_free(w);
@@ -1318,7 +1327,15 @@ cupdlp_retcode csr_create(CUPDLPcsr **csr) {
 cupdlp_retcode csc_create(CUPDLPcsc **csc) {
   cupdlp_retcode retcode = RETCODE_OK;
 
+#ifdef CUPDLP_CPU
   CUPDLP_INIT_CSC_MATRIX(*csc, 1);
+#else
+  (*csc) = (typeof(*csc))malloc(sizeof(typeof(**csc)));
+  if ((*csc) == 0) {
+    retcode = 1;
+    goto exit_cleanup;
+  }
+#endif
 
 exit_cleanup:
   return retcode;
@@ -1405,9 +1422,22 @@ cupdlp_retcode csc_alloc_matrix(CUPDLPcsc *csc, cupdlp_int nRows,
     default:
       break;
   }
+
+#ifndef CUPDLP_CPU
+  cudaError_t status = cudaMalloc((void **)&(csc->colMatBeg), (nCols + 1) * sizeof(int));
+  cudaError_t status2 = cudaMalloc((void **)&(csc->colMatIdx), (nnz) * sizeof(int));
+  cudaError_t status3 = cudaMalloc((void **)&(csc->colMatElem), (nnz) * sizeof(double));
+  if (status || status2 || status3) return 2;
+
+  status = cudaMemset(csc->colMatBeg, 0, (nCols + 1) * sizeof(int));
+  status2 = cudaMemset(csc->colMatIdx, 0, (nnz) * sizeof(int));
+  status3 = cudaMemset(csc->colMatElem, 0, (nnz) * sizeof(double));
+  if (status || status2 || status3) return 2;
+#else
   CUPDLP_INIT_ZERO_INT_VEC(csc->colMatBeg, nCols + 1);
   CUPDLP_INIT_ZERO_INT_VEC(csc->colMatIdx, nnz);
   CUPDLP_INIT_ZERO_DOUBLE_VEC(csc->colMatElem, nnz);
+#endif
 
   switch (src_matrix_format) {
     case DENSE:
diff --git a/src/pdlp/cupdlp/glbopts.h b/src/pdlp/cupdlp/glbopts.h
index ff6599c7ca..ddf3f8e2c0 100644
--- a/src/pdlp/cupdlp/glbopts.h
+++ b/src/pdlp/cupdlp/glbopts.h
@@ -7,6 +7,8 @@
 // #include <cusparse.h>  // cusparseSpMV
 // #endif
 
+#include "HConfig.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -70,8 +72,8 @@ extern "C" {
 //     x = cupdlp_NULL; \
 //   }
 
-// #define CUPDLP_COPY_VEC(dst, src, type, size) \
-//   cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDefault)
+#define CUPDLP_COPY_VEC(dst, src, type, size) \
+  cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDefault)
 
 // #define CUPDLP_INIT_VEC(var, size) \
 //   {                               \
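Note on the kernels above: cuda_gridsize() falls back to a two-dimensional grid once the element count exceeds 65535 * CUPDLP_BLOCK_SIZE, but every kernel derives its element index from blockIdx.x alone. For very large vectors, blocks with blockIdx.y > 0 would recompute the same leading elements while the tail of the vector is never touched. Below is a minimal sketch of one way to close that gap; cupdlp_flat_index is an illustrative name, not part of this patch, and the same one-line rewrite would apply to each kernel.

// Hypothetical device helper (not in the patch): flatten the 1D-or-2D grid
// produced by cuda_gridsize() into a single element index.
__device__ __forceinline__ cupdlp_int cupdlp_flat_index() {
  // blocks are laid out row-major in (x, y); each block covers blockDim.x elements
  cupdlp_int block = blockIdx.y * gridDim.x + blockIdx.x;
  return block * blockDim.x + threadIdx.x;
}

// Example rewrite of one kernel using the helper.
__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
                                 const cupdlp_float *y, const cupdlp_int len) {
  cupdlp_int i = cupdlp_flat_index();
  if (i < len) z[i] = x[i] - y[i];
}

For reference, the GPU path in this patch is enabled by configuring with -DCUPDLP_GPU=ON on Linux, with the CUDA_HOME environment variable pointing at the toolkit root; FindCUDAConf.cmake reads it to locate nvcc and the cudart, cusparse, and cublas libraries.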