some updates added GPU

ERGO-Code · Nov 8, 2024 · ac36f78 · ac36f78
1 parent 7aede2a
commit ac36f78
Show file tree

Hide file tree

Showing 18 changed files with 1,010 additions and 16 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -97,6 +97,27 @@ set(DEBUG_MEMORY "Off" CACHE STRING "Sanitizers")
 # emscripten
 option(EMSCRIPTEN_HTML "Emscripten HTML output" OFF)
 
+# option(CUPDLP_GPU "Build pdlp with CPU" ON)
+# message(STATUS "Build pdlp with CPU: ${CUPDLP_CPU}")
+
+option(CUPDLP_GPU "Build pdlp with GPU" OFF)
+message(STATUS "Build pdlp with GPU: ${CUPDLP_GPU}")
+
+if (NOT LINUX)
+  set (CUPDLP_GPU OFF)
+  message(STATUS "CUPLDP with Nvidia is only supported on Linux at the moment. Using CPU version.")
+endif()
+
+if (CUPDLP_GPU)
+  set (CUPDLP_CPU OFF)
+    list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
+    message(NOTICE "Set build cuPDLP with CUDA")
+    include(FindCUDAConf.cmake)
+else()
+  set (CUPDLP_CPU ON)
+    set(CUDA_LIBRARY-NOTFOUND true)
+endif()
+
 if (BUILD_CXX)
   # Default Build Type to be Release
   get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
@@ -248,7 +269,11 @@ if (BUILD_CXX)
     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
     message(STATUS "IPO / LTO: enabled")
   endif()
-endif()
+  if (CUPDLP_GPU AND CMAKE_INTERPROCEDURAL_OPTIMIZATION)
+      set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)
+      message(STATUS "IPO / LTO is not supported at the moment when PDLP is using GPU: LTO disabled.")
+  endif()
+
 
 include(CheckCXXSourceCompiles)
 check_cxx_source_compiles(

diff --git a/FindCUDAConf.cmake b/FindCUDAConf.cmake
@@ -0,0 +1,33 @@
+
+set(CUDA_LIBRARY-NOTFOUND, OFF)
+message(NOTICE "Finding CUDA environment")
+message(NOTICE "    - CUDA Home detected at $ENV{CUDA_HOME}")
+set(CMAKE_CUDA_ARCHITECTURES "all")
+set(CMAKE_CUDA_PATH "$ENV{CUDA_HOME}")
+set(CMAKE_CUDA_COMPILER "${CMAKE_CUDA_PATH}/bin/nvcc")
+
+enable_language(CUDA)
+
+find_library(CUDA_LIBRARY_ART
+        NAMES cudart
+        HINTS "${CMAKE_CUDA_PATH}/lib64/"
+        REQUIRED
+)
+find_library(CUDA_LIBRARY_SPS
+        NAMES cusparse
+        HINTS "${CMAKE_CUDA_PATH}/lib64/"
+        REQUIRED
+)
+find_library(CUDA_LIBRARY_BLS
+        NAMES cublas
+        HINTS "${CMAKE_CUDA_PATH}/lib64/"
+        REQUIRED
+)
+if (${CUDA_LIBRARY-NOTFOUND})
+    message(WARNING "    - CUDA Libraries not detected at $ENV{CUDA_HOME}")
+else ()
+    message(NOTICE "    - CUDA Libraries detected at $ENV{CUDA_HOME}")
+    set(CUDA_LIBRARY ${CUDA_LIBRARY_ART} ${CUDA_LIBRARY_SPS} ${CUDA_LIBRARY_BLS})
+    message(NOTICE "    -   :${CUDA_LIBRARY}")
+endif ()
+
diff --git a/check/CMakeLists.txt b/check/CMakeLists.txt
@@ -13,6 +13,19 @@ if (FORTRAN)
         ${HIGHS_SOURCE_DIR}/check)
 endif()
 
+if (CUPLDP_GPU)
+    # add a test
+    add_executable(testcudalin ${HIGHS_SOURCE_DIR}/src/pdlp/cupdlp/cuda/test_cuda_linalg.c)
+    add_executable(testcublas ${HIGHS_SOURCE_DIR}/src/pdlp/cupdlp/cuda/test_cublas.c)
+
+    set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    #target_include_directories(cudalinalg PRIVATE ${CUPDLP_INCLUDE_DIR}/cuda)
+    target_link_libraries(testcudalin PRIVATE highs ${CUDA_LIBRARY})
+
+    set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    target_link_libraries(testcublas PRIVATE highs ${CUDA_LIBRARY})
+endif()
+
 if (NOT FAST_BUILD OR ALL_TESTS)
   # prepare Catch library
   set(CATCH_INCLUDE_DIR ${HIGHS_SOURCE_DIR}/extern)

diff --git a/cmake/cpp-highs.cmake b/cmake/cpp-highs.cmake
@@ -48,18 +48,35 @@ install(FILES ${PROJECT_BINARY_DIR}/highs_export.h
 
 string (TOLOWER ${PROJECT_NAME} lower)
 
-install(TARGETS highs
-   EXPORT ${lower}-targets
-   INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs
-   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-   PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs)
-
-# Add library targets to the build-tree export set
-export(TARGETS highs
-  NAMESPACE ${PROJECT_NAMESPACE}::highs
-  FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
+# if (NOT CUPDLP_GPU)
+    install(TARGETS highs
+      EXPORT ${lower}-targets
+      INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs
+      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+      LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+      RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+      PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs)
+
+    # Add library targets to the build-tree export set
+    export(TARGETS highs
+      NAMESPACE ${PROJECT_NAMESPACE}::highs
+      FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
+# else()
+
+#     install(TARGETS highs cudalin
+#       EXPORT ${lower}-targets
+#       INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs
+#       ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+#       LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+#       RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+#       PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs)
+
+#     # Add library targets to the build-tree export set
+#     export(TARGETS highs cudalin
+#       NAMESPACE ${PROJECT_NAMESPACE}::highs
+#       FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
+# endif()
+
 
 install(EXPORT ${lower}-targets
   NAMESPACE ${PROJECT_NAMESPACE}::

diff --git a/cmake/python-highs.cmake b/cmake/python-highs.cmake
@@ -45,6 +45,15 @@ target_link_libraries(_core PRIVATE pybind11::headers)
 # sources for python 
 target_sources(_core PUBLIC ${sources_python} ${headers_python})
 
+if (CUPDLP_GPU)
+    enable_language(CXX CUDA)
+    target_sources(_core PRIVATE ${cuda_sources_python})
+
+    target_include_directories(_core PUBLIC "/usr/local/cuda/include")
+    set_target_properties(_core PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+endif()
+
+
 # include directories for python 
 target_include_directories(_core PUBLIC ${include_dirs_python})
 

diff --git a/cmake/sources-python.cmake b/cmake/sources-python.cmake
@@ -43,6 +43,12 @@ set(cupdlp_headers_python
   src/pdlp/cupdlp/cupdlp_step.h
   src/pdlp/cupdlp/cupdlp_utils.c)
 
+set(cuda_sources_python
+  pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
+  pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
+  pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
+  pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu)
+
 set(basiclu_sources_python
   src/ipm/basiclu/basiclu_factorize.c
   src/ipm/basiclu/basiclu_get_factors.c

diff --git a/cmake/sources.cmake b/cmake/sources.cmake
@@ -43,6 +43,13 @@ set(cupdlp_headers
   pdlp/cupdlp/cupdlp_step.h
   pdlp/cupdlp/cupdlp_utils.c)
 
+set(cuda_sources
+  pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
+  pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
+  pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
+  pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu)
+
+
 set(basiclu_sources
   ipm/basiclu/basiclu_factorize.c
   ipm/basiclu/basiclu_get_factors.c

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -177,6 +177,17 @@ else()
   #   $<BUILD_INTERFACE:${HIGHS_SOURCE_DIR}/extern/pdqsort>)
 
   target_sources(highs PRIVATE ${sources} ${headers} ${win_version_file})
+
+  # Optional Cuda 
+  if (CUPDLP_GPU)
+    enable_language(CXX CUDA)
+    target_sources(highs PRIVATE ${cuda_sources})
+
+    set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+    target_link_libraries(highs ${CUDA_LIBRARY} m)
+  endif()
+
   target_include_directories(highs PRIVATE ${include_dirs})
 
   if(MSVC)

diff --git a/src/HConfig.h.in b/src/HConfig.h.in
@@ -4,6 +4,7 @@
 #cmakedefine FAST_BUILD
 #cmakedefine ZLIB_FOUND
 #cmakedefine CUPDLP_CPU
+#cmakedefine CUPDLP_GPU
 #cmakedefine CMAKE_BUILD_TYPE "@CMAKE_BUILD_TYPE@"
 #cmakedefine CMAKE_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@"
 #cmakedefine HIGHSINT64

diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
@@ -0,0 +1,121 @@
+#include "cupdlp_cuda_kernels.cuh"
+
+dim3 cuda_gridsize(cupdlp_int n) {
+  cupdlp_int k = (n - 1) / CUPDLP_BLOCK_SIZE + 1;
+  cupdlp_int x = k;
+  cupdlp_int y = 1;
+  if (x > 65535) {
+    x = ceil(sqrt(k));
+    y = (n - 1) / (x * CUPDLP_BLOCK_SIZE) + 1;
+  }
+  dim3 d = {x, y, 1};
+  return d;
+}
+
+__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] *= y[i];
+}
+
+__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] /= y[i];
+}
+
+__global__ void element_wise_projlb_kernel(cupdlp_float *x,
+                                           const cupdlp_float *lb,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] < lb[i] ? lb[i] : x[i];
+}
+
+__global__ void element_wise_projub_kernel(cupdlp_float *x,
+                                           const cupdlp_float *ub,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] > ub[i] ? ub[i] : x[i];
+}
+
+__global__ void element_wise_projSamelb_kernel(cupdlp_float *x,
+                                               const cupdlp_float lb,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] <= lb ? lb : x[i];
+}
+
+__global__ void element_wise_projSameub_kernel(cupdlp_float *x,
+                                               const cupdlp_float ub,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] >= ub ? ub : x[i];
+}
+
+__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb,
+                                              const cupdlp_float *lb,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) haslb[i] = lb[i] > bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub,
+                                              const cupdlp_float *ub,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) hasub[i] = ub[i] < bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_filterlb_kernal(cupdlp_float *x,
+                                             const cupdlp_float *lb,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = lb[i] > bound ? lb[i] : 0.0;
+}
+
+__global__ void element_wise_filterub_kernal(cupdlp_float *x,
+                                             const cupdlp_float *ub,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = ub[i] < bound ? ub[i] : 0.0;
+}
+
+__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val,
+                                     const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = val;
+}
+
+//xUpdate = x - dPrimalStep * (cost - ATy)
+__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) xUpdate[i] = x[i] - dPrimalStep * (cost[i] - ATy[i]);
+}
+
+//yUpdate = y + dDualStep * (b -2AxUpdate + Ax)
+__global__ void dual_grad_step_kernal(cupdlp_float *yUpdate,
+                                      const cupdlp_float *y,
+                                      const cupdlp_float *b,
+                                      const cupdlp_float *Ax,
+                                      const cupdlp_float *AxUpdate,
+                                      const cupdlp_float dDualStep,
+                                      const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; 
+  if (i < len) yUpdate[i] = y[i] + dDualStep * (b[i] - 2 * AxUpdate[i] + Ax[i]);
+}
+
+// z = x - y
+__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
+                                  const cupdlp_float *y, const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) z[i] = x[i] - y[i];
+}