diff --git a/CMakeLists.txt b/CMakeLists.txt
index cfeea6ee9c..9664237ab3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,6 +97,24 @@ set(DEBUG_MEMORY "Off" CACHE STRING "Sanitizers")
 # emscripten
 option(EMSCRIPTEN_HTML "Emscripten HTML output" OFF)
 
+option(CUPDLP_GPU "Build pdlp with GPU" OFF)
+message(STATUS "Build pdlp with GPU: ${CUPDLP_GPU}")
+
+if (NOT LINUX)
+  set (CUPDLP_GPU OFF)
+  message(STATUS "cuPDLP with NVIDIA GPU is only supported on Linux at the moment. Using CPU version.")
+endif()
+
+if (CUPDLP_GPU)
+  set (CUPDLP_CPU OFF)
+  list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
+  message(NOTICE "Building cuPDLP with CUDA")
+  include(FindCUDAConf.cmake)
+else()
+  set (CUPDLP_CPU ON)
+  set(CUDA_LIBRARY-NOTFOUND true)
+endif()
+
 if (BUILD_CXX)
   # Default Build Type to be Release
   get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
@@ -248,7 +266,11 @@ if (BUILD_CXX)
     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
     message(STATUS "IPO / LTO: enabled")
   endif()
-endif()
+  if (CUPDLP_GPU AND CMAKE_INTERPROCEDURAL_OPTIMIZATION)
+    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)
+    message(STATUS "IPO / LTO is not currently supported when PDLP uses the GPU: LTO disabled.")
+  endif()
+
 
 include(CheckCXXSourceCompiles)
 check_cxx_source_compiles(
diff --git a/FindCUDAConf.cmake b/FindCUDAConf.cmake
new file mode 100644
index 0000000000..b6697918da
--- /dev/null
+++ b/FindCUDAConf.cmake
@@ -0,0 +1,33 @@
+
+set(CUDA_LIBRARY-NOTFOUND OFF)
+message(NOTICE "Finding CUDA environment")
+message(NOTICE " - CUDA Home detected at $ENV{CUDA_HOME}")
+set(CMAKE_CUDA_ARCHITECTURES "all")
+set(CMAKE_CUDA_PATH "$ENV{CUDA_HOME}")
+set(CMAKE_CUDA_COMPILER "${CMAKE_CUDA_PATH}/bin/nvcc")
+
+enable_language(CUDA)
+
+find_library(CUDA_LIBRARY_ART
+  NAMES cudart
+  HINTS "${CMAKE_CUDA_PATH}/lib64/"
+  REQUIRED
+)
+find_library(CUDA_LIBRARY_SPS
+  NAMES cusparse
+  HINTS "${CMAKE_CUDA_PATH}/lib64/"
+  REQUIRED
+)
+find_library(CUDA_LIBRARY_BLS
+  NAMES cublas
+  HINTS "${CMAKE_CUDA_PATH}/lib64/"
+  REQUIRED
+)
+if (${CUDA_LIBRARY-NOTFOUND})
+  message(WARNING " - CUDA Libraries not detected at $ENV{CUDA_HOME}")
+else ()
+  message(NOTICE " - CUDA Libraries detected at $ENV{CUDA_HOME}")
+  set(CUDA_LIBRARY ${CUDA_LIBRARY_ART} ${CUDA_LIBRARY_SPS} ${CUDA_LIBRARY_BLS})
+  message(NOTICE " - CUDA Libraries: ${CUDA_LIBRARY}")
+endif ()
+
diff --git a/check/CMakeLists.txt b/check/CMakeLists.txt
index 0fac3894aa..cd228a86c6 100644
--- a/check/CMakeLists.txt
+++ b/check/CMakeLists.txt
@@ -13,6 +13,19 @@ if (FORTRAN)
     ${HIGHS_SOURCE_DIR}/check)
 endif()
 
+if (CUPDLP_GPU)
+  # add a test
+  add_executable(testcudalin ${HIGHS_SOURCE_DIR}/src/pdlp/cupdlp/cuda/test_cuda_linalg.c)
+  add_executable(testcublas ${HIGHS_SOURCE_DIR}/src/pdlp/cupdlp/cuda/test_cublas.c)
+
+  set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  #target_include_directories(cudalinalg PRIVATE ${CUPDLP_INCLUDE_DIR}/cuda)
+  target_link_libraries(testcudalin PRIVATE highs ${CUDA_LIBRARY})
+
+  set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  target_link_libraries(testcublas PRIVATE highs ${CUDA_LIBRARY})
+endif()
+
 if (NOT FAST_BUILD OR ALL_TESTS)
   # prepare Catch library
   set(CATCH_INCLUDE_DIR ${HIGHS_SOURCE_DIR}/extern)
diff --git a/cmake/cpp-highs.cmake b/cmake/cpp-highs.cmake
index 88a529a6f3..56e6a03d3a 100644
--- a/cmake/cpp-highs.cmake
+++ b/cmake/cpp-highs.cmake
@@ -48,18 +48,35 @@ install(FILES
   ${PROJECT_BINARY_DIR}/highs_export.h
 
 string (TOLOWER ${PROJECT_NAME} lower)
-install(TARGETS highs
-  EXPORT ${lower}-targets
-  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs)
-
-# Add library targets to the build-tree export set
-export(TARGETS highs
-  NAMESPACE ${PROJECT_NAMESPACE}::highs
-  FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
+# if (NOT CUPDLP_GPU)
+  install(TARGETS highs
+    EXPORT ${lower}-targets
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs)
+
+  # Add library targets to the build-tree export set
+  export(TARGETS highs
+    NAMESPACE ${PROJECT_NAMESPACE}::highs
+    FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
+# else()
+
+#   install(TARGETS highs cudalin
+#     EXPORT ${lower}-targets
+#     INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs
+#     ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+#     LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+#     RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+#     PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/highs)
+
+#   # Add library targets to the build-tree export set
+#   export(TARGETS highs cudalin
+#     NAMESPACE ${PROJECT_NAMESPACE}::highs
+#     FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
+# endif()
+
 install(EXPORT ${lower}-targets
   NAMESPACE ${PROJECT_NAMESPACE}::
diff --git a/cmake/python-highs.cmake b/cmake/python-highs.cmake
index 4a429cb110..4217a4e15a 100644
--- a/cmake/python-highs.cmake
+++ b/cmake/python-highs.cmake
@@ -45,6 +45,15 @@ target_link_libraries(_core PRIVATE pybind11::headers)
 # sources for python
 target_sources(_core PUBLIC ${sources_python} ${headers_python})
 
+if (CUPDLP_GPU)
+  enable_language(CXX CUDA)
+  target_sources(_core PRIVATE ${cuda_sources_python})
+
+  target_include_directories(_core PUBLIC "/usr/local/cuda/include")
+  set_target_properties(_core PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+endif()
+
+
 # include directories for python
 target_include_directories(_core PUBLIC ${include_dirs_python})
 
diff --git a/cmake/sources-python.cmake b/cmake/sources-python.cmake
index df2ff5a223..a6c65608f1 100644
--- a/cmake/sources-python.cmake
+++ b/cmake/sources-python.cmake
@@ -43,6 +43,12 @@ set(cupdlp_headers_python
   src/pdlp/cupdlp/cupdlp_step.h
   src/pdlp/cupdlp/cupdlp_utils.c)
 
+set(cuda_sources_python
+  src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
+  src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
+  src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
+  src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu)
+
 set(basiclu_sources_python
   src/ipm/basiclu/basiclu_factorize.c
   src/ipm/basiclu/basiclu_get_factors.c
diff --git a/cmake/sources.cmake b/cmake/sources.cmake
index 96e84b7f5b..e1e8e248f6 100644
--- a/cmake/sources.cmake
+++ b/cmake/sources.cmake
@@ -43,6 +43,13 @@ set(cupdlp_headers
   pdlp/cupdlp/cupdlp_step.h
   pdlp/cupdlp/cupdlp_utils.c)
 
+set(cuda_sources
+  pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
+  pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
+  pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
+  pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu)
+
+
 set(basiclu_sources
   ipm/basiclu/basiclu_factorize.c
   ipm/basiclu/basiclu_get_factors.c
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 69a78869db..915917818d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -177,6 +177,17 @@ else()
   # $)
 
   target_sources(highs PRIVATE ${sources} ${headers} ${win_version_file})
+
+  # Optional CUDA
+  if (CUPDLP_GPU)
+    enable_language(CXX CUDA)
+    target_sources(highs PRIVATE ${cuda_sources})
+
+    set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+    target_link_libraries(highs ${CUDA_LIBRARY} m)
+  endif()
+
   target_include_directories(highs PRIVATE ${include_dirs})
 
   if(MSVC)
diff --git a/src/HConfig.h.in b/src/HConfig.h.in
index 198e07b0d3..b4e6d6cb5f 100644
--- a/src/HConfig.h.in
+++ b/src/HConfig.h.in
@@ -4,6 +4,7 @@
 #cmakedefine FAST_BUILD
 #cmakedefine ZLIB_FOUND
 #cmakedefine CUPDLP_CPU
+#cmakedefine CUPDLP_GPU
 #cmakedefine CMAKE_BUILD_TYPE "@CMAKE_BUILD_TYPE@"
 #cmakedefine CMAKE_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@"
 #cmakedefine HIGHSINT64
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
new file mode 100644
index 0000000000..519408b7c0
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
@@ -0,0 +1,121 @@
+#include "cupdlp_cuda_kernels.cuh"
+
+dim3 cuda_gridsize(cupdlp_int n) {
+  cupdlp_int k = (n - 1) / CUPDLP_BLOCK_SIZE + 1;
+  cupdlp_int x = k;
+  cupdlp_int y = 1;
+  if (x > 65535) {
+    x = ceil(sqrt(k));
+    y = (n - 1) / (x * CUPDLP_BLOCK_SIZE) + 1;
+  }
+  dim3 d = {(unsigned int)x, (unsigned int)y, 1};
+  return d;
+}
+
+__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] *= y[i];
+}
+
+__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] /= y[i];
+}
+
+__global__ void element_wise_projlb_kernel(cupdlp_float *x,
+                                           const cupdlp_float *lb,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] < lb[i] ? lb[i] : x[i];
+}
+
+__global__ void element_wise_projub_kernel(cupdlp_float *x,
+                                           const cupdlp_float *ub,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] > ub[i] ? ub[i] : x[i];
+}
+
+__global__ void element_wise_projSamelb_kernel(cupdlp_float *x,
+                                               const cupdlp_float lb,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] <= lb ? lb : x[i];
+}
+
+__global__ void element_wise_projSameub_kernel(cupdlp_float *x,
+                                               const cupdlp_float ub,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] >= ub ? ub : x[i];
+}
+
+__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb,
+                                              const cupdlp_float *lb,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) haslb[i] = lb[i] > bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub,
+                                              const cupdlp_float *ub,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) hasub[i] = ub[i] < bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_filterlb_kernal(cupdlp_float *x,
+                                             const cupdlp_float *lb,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = lb[i] > bound ? lb[i] : 0.0;
+}
+
+__global__ void element_wise_filterub_kernal(cupdlp_float *x,
+                                             const cupdlp_float *ub,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = ub[i] < bound ? ub[i] : 0.0;
+}
+
+__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val,
+                                     const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = val;
+}
+
+// xUpdate = x - dPrimalStep * (cost - ATy)
+__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) xUpdate[i] = x[i] - dPrimalStep * (cost[i] - ATy[i]);
+}
+
+// yUpdate = y + dDualStep * (b - 2 * AxUpdate + Ax)
+__global__ void dual_grad_step_kernal(cupdlp_float *yUpdate,
+                                      const cupdlp_float *y,
+                                      const cupdlp_float *b,
+                                      const cupdlp_float *Ax,
+                                      const cupdlp_float *AxUpdate,
+                                      const cupdlp_float dDualStep,
+                                      const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) yUpdate[i] = y[i] + dDualStep * (b[i] - 2 * AxUpdate[i] + Ax[i]);
+}
+
+// z = x - y
+__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
+                                 const cupdlp_float *y, const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) z[i] = x[i] - y[i];
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
new file mode 100644
index 0000000000..c4240abd98
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
@@ -0,0 +1,181 @@
+#ifndef CUPDLP_CUDA_KERNELS_H
+#define CUPDLP_CUDA_KERNELS_H
+
+#include "cuda_runtime.h"
+#define CUPDLP_BLOCK_SIZE 512
+
+#ifndef SFLOAT
+#ifdef DLONG
+typedef long long cupdlp_int;
+#else
+typedef int cupdlp_int;
+#endif
+typedef double cupdlp_float;
+#define CudaComputeType CUDA_R_64F
+#else
+#define CudaComputeType CUDA_R_32F
+#endif
+
+#define CHECK_CUDA(func)                                               \
+  {                                                                    \
+    cudaError_t status = (func);                                       \
+    if (status != cudaSuccess) {                                       \
+      printf("CUDA API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cudaGetErrorString(status), status);  \
+      return EXIT_FAILURE;                                             \
+    }                                                                  \
+  }
+
+#define CHECK_CUSPARSE(func)                                               \
+  {                                                                        \
+    cusparseStatus_t status = (func);                                      \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                               \
+      printf("CUSPARSE API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cusparseGetErrorString(status), status);  \
+      return EXIT_FAILURE;                                                 \
+    }                                                                      \
+  }
+
+#define CHECK_CUBLAS(func)                                               \
+  {                                                                      \
+    cublasStatus_t status = (func);                                      \
+    if (status != CUBLAS_STATUS_SUCCESS) {                               \
+      printf("CUBLAS API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cublasGetStatusString(status), status); \
+      return EXIT_FAILURE;                                               \
+    }                                                                    \
+  }
+
+#define CUPDLP_FREE_VEC(x) \
+  {                        \
+    cudaFree(x);           \
+    x = cupdlp_NULL;       \
+  }
+
+#define CUPDLP_COPY_VEC(dst, src, type, size) \
+  cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDefault)
+
+#define CUPDLP_INIT_VEC(var, size)                                         \
+  {                                                                        \
+    cudaError_t status =                                                   \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));          \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+  }
+#define CUPDLP_INIT_ZERO_VEC(var, size)                                    \
+  {                                                                        \
+    cudaError_t status =                                                   \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));          \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+    status = cudaMemset(var, 0, (size) * sizeof(typeof(*var)));            \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+  }
+#define CUPDLP_INIT_ZERO_INT_VEC(var, size)                                \
+  {                                                                        \
+    cudaError_t status =                                                   \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));          \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+    status = cudaMemset(var, 0, (size) * sizeof(typeof(*var)));            \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+  }
+
+#define CUPDLP_INIT_ZERO_DOUBLE_VEC(var, size)                             \
+  {                                                                        \
+    cudaError_t status =                                                   \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));          \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+    status = cudaMemset(var, 0, (size) * sizeof(typeof(*var)));            \
+    if (status != cudaSuccess) {                                           \
+      printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cudaGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                   \
+    }                                                                      \
+  }
+#define CUPDLP_ZERO_VEC(var, type, size) \
+  cudaMemset(var, 0, sizeof(type) * (size))
+
+dim3 cuda_gridsize(cupdlp_int n);
+
+__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len);
+
+__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len);
+
+__global__ void element_wise_projlb_kernel(cupdlp_float *x,
+                                           const cupdlp_float *lb,
+                                           const cupdlp_int len);
+
+__global__ void element_wise_projub_kernel(cupdlp_float *x,
+                                           const cupdlp_float *ub,
+                                           const cupdlp_int len);
+
+__global__ void element_wise_projSamelb_kernel(cupdlp_float *x,
+                                               const cupdlp_float lb,
+                                               const cupdlp_int len);
+
+__global__ void element_wise_projSameub_kernel(cupdlp_float *x,
+                                               const cupdlp_float ub,
+                                               const cupdlp_int len);
+
+__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb,
+                                              const cupdlp_float *lb,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len);
+
+__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub,
+                                              const cupdlp_float *ub,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len);
+
+__global__ void element_wise_filterlb_kernal(cupdlp_float *x,
+                                             const cupdlp_float *lb,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len);
+
+__global__ void element_wise_filterub_kernal(cupdlp_float *x,
+                                             const cupdlp_float *ub,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len);
+
+__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val,
+                                     const cupdlp_int len);
+
+__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len);
+
+__global__ void dual_grad_step_kernal(
+    cupdlp_float *yUpdate, const cupdlp_float *y, const cupdlp_float *b,
+    const cupdlp_float *Ax, const cupdlp_float *AxUpdate,
+    const cupdlp_float dDualStep, const cupdlp_int len);
+
+__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
+                                 const cupdlp_float *y, const cupdlp_int len);
+#endif
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu
new file mode 100644
index 0000000000..332b417899
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu
@@ -0,0 +1,209 @@
+#include "cupdlp_cudalinalg.cuh"
+
+extern "C" cupdlp_int cuda_alloc_MVbuffer(
+    cusparseHandle_t handle, cusparseSpMatDescr_t cuda_csc,
+    cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecAx,
+    cusparseSpMatDescr_t cuda_csr, cusparseDnVecDescr_t vecY,
+    cusparseDnVecDescr_t vecATy, void **dBuffer) {
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  size_t AxBufferSize = 0;
+  size_t ATyBufferSize = 0;
+  cupdlp_float alpha = 1.0;
+  cupdlp_float beta = 0.0;
+  // cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
+  cusparseSpMVAlg_t alg = CUSPARSE_SPMV_CSR_ALG2;  // deterministic
+
+  // get the buffer size needed by csr Ax
+  CHECK_CUSPARSE(cusparseSpMV_bufferSize(
+      handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, cuda_csr, vecX, &beta,
+      vecAx, computeType, alg, &AxBufferSize))
+
+  // get the buffer size needed by csc ATy
+  CHECK_CUSPARSE(cusparseSpMV_bufferSize(
+      handle, CUSPARSE_OPERATION_TRANSPOSE, &alpha, cuda_csc, vecY, &beta,
+      vecATy, computeType, alg, &ATyBufferSize))
+
+  size_t bufferSize =
+      (AxBufferSize > ATyBufferSize) ? AxBufferSize : ATyBufferSize;
+
+  // allocate an external buffer if needed
+  CHECK_CUDA(cudaMalloc(dBuffer, bufferSize))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csc_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csc,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta) {
+  // hAx = alpha * Acsc * hX + beta * hAx
+
+  cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csc, vecX, &beta, vecAx,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csr_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csr,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta) {
+  // hAx = alpha * Acsr * hX + beta * hAx
+
+  cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csr, vecX, &beta, vecAx,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csc_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csc,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta) {
+  // hATy = alpha * Acsc^T * hY + beta * hATy
+  cusparseOperation_t op = CUSPARSE_OPERATION_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csc, vecY, &beta, vecATy,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csr_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csr,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta) {
+  // hATy = alpha * Acsr^T * hY + beta * hATy
+  cusparseOperation_t op = CUSPARSE_OPERATION_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csr, vecY, &beta, vecATy,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" void cupdlp_projSameub_cuda(cupdlp_float *x, const cupdlp_float ub,
+                                       const cupdlp_int len) {
+  element_wise_projSameub_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, ub, len);
+}
+
+extern "C" void cupdlp_projSamelb_cuda(cupdlp_float *x, const cupdlp_float lb,
+                                       const cupdlp_int len) {
+  element_wise_projSamelb_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, lb, len);
+}
+
+extern "C" void cupdlp_projub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                   const cupdlp_int len) {
+  element_wise_projub_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, ub,
+                                                                        len);
+}
+
+extern "C" void cupdlp_projlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                   const cupdlp_int len) {
+  element_wise_projlb_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, lb,
+                                                                        len);
+}
+
+extern "C" void cupdlp_ediv_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len) {
+  element_wise_div_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, y, len);
+}
+
+extern "C" void cupdlp_edot_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len) {
+  element_wise_dot_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, y, len);
+}
+
+extern "C" void cupdlp_haslb_cuda(cupdlp_float *haslb, const cupdlp_float *lb,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len) {
+  element_wise_initHaslb_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      haslb, lb, bound, len);
+}
+
+extern "C" void cupdlp_hasub_cuda(cupdlp_float *hasub, const cupdlp_float *ub,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len) {
+  element_wise_initHasub_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      hasub, ub, bound, len);
+}
+
+extern "C" void cupdlp_filterlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len) {
+  element_wise_filterlb_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, lb, bound, len);
+}
+
+extern "C" void cupdlp_filterub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len) {
+  element_wise_filterub_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, ub, bound, len);
+}
+
+extern "C" void cupdlp_initvec_cuda(cupdlp_float *x, const cupdlp_float val,
+                                    const cupdlp_int len) {
+  init_cuda_vec_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, val, len);
+}
+
+extern "C" void cupdlp_pgrad_cuda(cupdlp_float *xUpdate,
+                                  const cupdlp_float *x,
+                                  const cupdlp_float *cost,
+                                  const cupdlp_float *ATy,
+                                  const cupdlp_float dPrimalStep,
+                                  const cupdlp_int len) {
+  primal_grad_step_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      xUpdate, x, cost, ATy, dPrimalStep, len);
+}
+
+extern "C" void cupdlp_dgrad_cuda(cupdlp_float *yUpdate, const cupdlp_float *y,
+                                  const cupdlp_float *b, const cupdlp_float *Ax,
+                                  const cupdlp_float *AxUpdate,
+                                  const cupdlp_float dDualStep,
+                                  const cupdlp_int len) {
+  dual_grad_step_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      yUpdate, y, b, Ax, AxUpdate, dDualStep, len);
+}
+
+extern "C" void cupdlp_sub_cuda(cupdlp_float *z, const cupdlp_float *x,
+                                const cupdlp_float *y, const cupdlp_int len)
+{
+  naive_sub_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(z, x, y, len);
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
new file mode 100644
index 0000000000..e0103b9228
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
@@ -0,0 +1,98 @@
+#ifndef CUPDLP_CUDA_LINALG_H
+#define CUPDLP_CUDA_LINALG_H
+
+#include <cublas_v2.h>     // cublas
+#include <cuda_runtime.h>  // cudaMalloc, cudaMemcpy, etc.
+#include <cusparse.h>      // cusparseSpMV
+
+#include "cupdlp_cuda_kernels.cuh"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>   // printf
+#include <stdlib.h>  // EXIT_FAILURE
+
+// #include "../cupdlp_defs.h"
+// #include "../glbopts.h"
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+extern "C" cupdlp_int cuda_alloc_MVbuffer(
+    cusparseHandle_t handle, cusparseSpMatDescr_t cuda_csc,
+    cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecAx,
+    cusparseSpMatDescr_t cuda_csr, cusparseDnVecDescr_t vecY,
+    cusparseDnVecDescr_t vecATy, void **dBuffer);
+
+extern "C" cupdlp_int cuda_csc_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csc,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta);
+extern "C" cupdlp_int cuda_csr_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csr,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta);
+extern "C" cupdlp_int cuda_csc_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csc,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta);
+extern "C" cupdlp_int cuda_csr_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csr,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta);
+
+extern "C" void cupdlp_projSameub_cuda(cupdlp_float *x, const cupdlp_float ub,
+                                       const cupdlp_int len);
+extern "C" void cupdlp_projSamelb_cuda(cupdlp_float *x, const cupdlp_float lb,
+                                       const cupdlp_int len);
+extern "C" void cupdlp_projub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                   const cupdlp_int len);
+extern "C" void cupdlp_projlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                   const cupdlp_int len);
+extern "C" void cupdlp_ediv_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len);
+extern "C" void cupdlp_edot_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len);
+extern "C" void cupdlp_haslb_cuda(cupdlp_float *haslb, const cupdlp_float *lb,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len);
+extern "C" void cupdlp_hasub_cuda(cupdlp_float *hasub, const cupdlp_float *ub,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len);
+extern "C" void cupdlp_filterlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len);
+extern "C" void cupdlp_filterub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len);
+extern "C" void cupdlp_initvec_cuda(cupdlp_float *x, const cupdlp_float val,
+                                    const cupdlp_int len);
+
+extern "C" void cupdlp_pgrad_cuda(cupdlp_float *xUpdate, const cupdlp_float *x,
+                                  const cupdlp_float *cost,
+                                  const cupdlp_float *ATy,
+                                  const cupdlp_float dPrimalStep,
+                                  const cupdlp_int len);
+
+extern "C" void cupdlp_dgrad_cuda(cupdlp_float *yUpdate, const cupdlp_float *y,
+                                  const cupdlp_float *b, const cupdlp_float *Ax,
+                                  const cupdlp_float *AxUpdate,
+                                  const cupdlp_float dDualStep,
+                                  const cupdlp_int len);
+
+extern "C" void cupdlp_sub_cuda(cupdlp_float *z, const cupdlp_float *x,
+                                const cupdlp_float *y, const cupdlp_int len);
+#endif
+#endif
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/test_cublas.c b/src/pdlp/cupdlp/cuda/test_cublas.c
new file mode 100644
index 0000000000..4e1a52b85e
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/test_cublas.c
@@ -0,0 +1,154 @@
+#include "cupdlp_cuda_kernels.cuh"
+#include "cupdlp_cudalinalg.cuh"
+
+int use_cublas(cublasHandle_t cublashandle) {
+  cupdlp_int len = 10;
+  // cupdlp_int len = 1<<10;
+
+  // int N = 1<<20;
+
+  // alloc and init host vec memory
+  cupdlp_float *h_vec1 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  cupdlp_float *h_vec2 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  for (cupdlp_int i = 0; i < len; i++) {
+    h_vec1[i] = 1.0;
+    h_vec2[i] = i;
+    // h_vec1[i] = 1.0;
+    // h_vec2[i] = 2.0;
+  }
+
+  // alloc and init device vec memory
+  cupdlp_float *d_vec1;
+  cupdlp_float *d_vec2;
+  cudaMalloc((void **)&d_vec1, len * sizeof(cupdlp_float));
+  cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+
+  cudaMalloc((void **)&d_vec2, len * sizeof(cupdlp_float));
+  cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+
+  // init cublas handle
+  // cublasHandle_t cublashandle;
+  // CHECK_CUBLAS(cublasCreate(&cublashandle));
+
+  cupdlp_float result;
+  // call nrm2
+  CHECK_CUBLAS(cublasDnrm2(cublashandle, len, d_vec1, 1, &result));
+
+  // print result
+  printf("2-norm is :%f\n", result);
+
+  // copy result back to host
+  // cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaError_t errSync = cudaGetLastError();
+  // cudaError_t errAsync = cudaDeviceSynchronize();
+  // if (errSync != cudaSuccess)
+  //   printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
+  // if (errAsync != cudaSuccess)
+  //   printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
+
+  // // print result
+  // for (cupdlp_int i = 0; i < len; i++) {
+  //   printf("%f\n", h_vec1[i]);
+  //   // printf("%f\n", h_vec2[i]);
+  // }
+
+  // destroy cublas handle
+  // CHECK_CUBLAS(cublasDestroy(cublashandle));
+
+  // free memory
+  free(h_vec1);
+  free(h_vec2);
+  cudaFree(d_vec1);
+  cudaFree(d_vec2);
+
+  return EXIT_SUCCESS;
+}
+int main() {
+  // try cupdlp_edot_cuda
+
+  // int nDevices;
+
+  // cudaGetDeviceCount(&nDevices);
+  // for (int i = 0; i < nDevices; i++) {
+  //   cudaDeviceProp prop;
+  //   cudaGetDeviceProperties(&prop, i);
+  //   printf("Device Number: %d\n", i);
+  //   printf("  Device name: %s\n", prop.name);
+  //   printf("  Memory Clock Rate (KHz): %d\n",
+  //          prop.memoryClockRate);
+  //   printf("  Memory Bus Width (bits): %d\n",
+  //          prop.memoryBusWidth);
+  //   printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
+  //          2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8)
+  //          / 1.0e6);
+  // }
+
+  // cupdlp_int len = 10;
+  // // cupdlp_int len = 1<<10;
+
+  // // int N = 1<<20;
+
+  // // alloc and init host vec memory
+  // cupdlp_float *h_vec1 = (cupdlp_float *) malloc(len * sizeof(cupdlp_float));
+  // cupdlp_float *h_vec2 = (cupdlp_float *) malloc(len * sizeof(cupdlp_float));
+  // for (cupdlp_int i = 0; i < len; i++) {
+  //   h_vec1[i] = 1.0;
+  //   h_vec2[i] = i;
+  //   // h_vec1[i] = 1.0;
+  //   // h_vec2[i] = 2.0;
+  // }
+
+  // // alloc and init device vec memory
+  // cupdlp_float *d_vec1;
+  // cupdlp_float *d_vec2;
+  // cudaMalloc((void **) &d_vec1, len * sizeof(cupdlp_float));
+  // // cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float),
+  // //            cudaMemcpyHostToDevice);
+
+  // cudaMalloc((void **) &d_vec2, len * sizeof(cupdlp_float));
+  // cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float),
+  //            cudaMemcpyHostToDevice);
+
+  // init cublas handle
+  cublasHandle_t cublashandle;
+  CHECK_CUBLAS(cublasCreate(&cublashandle));
+  use_cublas(cublashandle);
+
+  // cupdlp_float result;
+  // call nrm2
+  // CHECK_CUBLAS(cublasDnrm2(cublashandle, len, d_vec1, 1, &result));
+
+  // print result
+  // printf("2-norm is :%f\n", result);
+
+  // copy result back to host
+  // cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaError_t errSync = cudaGetLastError();
+  // cudaError_t errAsync = cudaDeviceSynchronize();
+  // if (errSync != cudaSuccess)
+  //   printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
+  // if (errAsync != cudaSuccess)
+  //   printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
+
+  // // print result
+  // for (cupdlp_int i = 0; i < len; i++) {
+  //   printf("%f\n", h_vec1[i]);
+  //   // printf("%f\n", h_vec2[i]);
+  // }
+
+  // destroy cublas handle
+  CHECK_CUBLAS(cublasDestroy(cublashandle));
+
+  // // free memory
+  // free(h_vec1);
+  // free(h_vec2);
+  // cudaFree(d_vec1);
+  // cudaFree(d_vec2);
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/test_cuda_linalg.c b/src/pdlp/cupdlp/cuda/test_cuda_linalg.c
new file mode 100644
index 0000000000..ad19bbe701
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/test_cuda_linalg.c
@@ -0,0 +1,79 @@
+#include "cupdlp_cuda_kernels.cuh"
+#include "cupdlp_cudalinalg.cuh"
+
+int main() {
+  // try cupdlp_edot_cuda
+
+  int nDevices;
+
+  cudaGetDeviceCount(&nDevices);
+  // for (int i = 0; i < nDevices; i++) {
+  //   cudaDeviceProp prop;
+  //   cudaGetDeviceProperties(&prop, i);
+  //   printf("Device Number: %d\n", i);
+  //   printf("  Device name: %s\n", prop.name);
+  //   printf("  Memory Clock Rate (KHz): %d\n",
+  //          prop.memoryClockRate);
+  //   printf("  Memory Bus Width (bits): %d\n",
+  //          prop.memoryBusWidth);
+  //   printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
+  //          2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8)
+  //          / 1.0e6);
+  // }
+
+  cupdlp_int len = 10;
+  // cupdlp_int len = 1<<10;
+
+  // int N = 1<<20;
+
+  // alloc and init host vec memory
+  cupdlp_float *h_vec1 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  cupdlp_float *h_vec2 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  for (cupdlp_int i = 0; i < len; i++) {
+    h_vec1[i] = i;
+    h_vec2[i] = i;
+    // h_vec1[i] = 1.0;
+    // h_vec2[i] = 2.0;
+  }
+
+  // alloc and init device vec memory
+  cupdlp_float *d_vec1;
+  cupdlp_float *d_vec2;
+  cudaMalloc((void **)&d_vec1, len * sizeof(cupdlp_float));
+  cudaMalloc((void **)&d_vec2, len * sizeof(cupdlp_float));
+  cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+
+  // call cupdlp_edot_cuda
+  cupdlp_edot_cuda(d_vec1, d_vec2, len);
+  // element_wise_dot_kernel<<<(len+255)/256, 256>>>(d_vec1, d_vec2, len);
+  // saxpy<<<(len+255)/256, 256>>>(len, 2.0f, d_vec1, d_vec2);
+  // element_wise_dot<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(d_vec1, d_vec2,
+  //                                                             len);
+
+  // copy result back to host
+  cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float),
+             cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float),
+             cudaMemcpyDeviceToHost);
+  cudaError_t errSync = cudaGetLastError();
+  cudaError_t errAsync = cudaDeviceSynchronize();
+  if (errSync != cudaSuccess)
+    printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
+  if (errAsync != cudaSuccess)
+    printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
+
+  // print result
+  for (cupdlp_int i = 0; i < len; i++) {
+    printf("%f\n", h_vec1[i]);
+    // printf("%f\n", h_vec2[i]);
+  }
+
+  // free memory
+  free(h_vec1);
+  free(h_vec2);
+  cudaFree(d_vec1);
+  cudaFree(d_vec2);
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cupdlp_defs.h b/src/pdlp/cupdlp/cupdlp_defs.h
index 068c150935..f1d7ecfd27 100644
--- a/src/pdlp/cupdlp/cupdlp_defs.h
+++ b/src/pdlp/cupdlp/cupdlp_defs.h
@@ -1,7 +1,7 @@
 #ifndef CUPDLP_H_GUARD
 #define CUPDLP_H_GUARD
 
-#define CUPDLP_CPU
+// #define CUPDLP_CPU
 #define CUPDLP_DEBUG (0)
 #define CUPDLP_TIMER
 
@@ -9,6 +9,7 @@
 #include "cuda/cupdlp_cuda_kernels.cuh"
 #include "cuda/cupdlp_cudalinalg.cuh"
 #endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/pdlp/cupdlp/cupdlp_utils.c b/src/pdlp/cupdlp/cupdlp_utils.c
index 58b14b2ac3..1d9823a276 100644
--- a/src/pdlp/cupdlp/cupdlp_utils.c
+++ b/src/pdlp/cupdlp/cupdlp_utils.c
@@ -384,6 +384,15 @@ cupdlp_int PDHG_Clear(CUPDLPwork *w) {
   if (scaling) {
     // scaling_clear(scaling);
+    if (scaling->colScale) {
+      // cupdlp_free(scaling->colScale);
+      CUPDLP_FREE_VEC(scaling->colScale);  // now on gpu
+    }
+    if (scaling->rowScale) {
+      // cupdlp_free(scaling->rowScale);
+      CUPDLP_FREE_VEC(scaling->rowScale);  // now on gpu
+    }
+
     // cupdlp_free(scaling);
     scaling = cupdlp_NULL;
   }
   cupdlp_free(w);
@@ -1318,7 +1327,15 @@ cupdlp_retcode csr_create(CUPDLPcsr **csr) {
 cupdlp_retcode csc_create(CUPDLPcsc **csc) {
   cupdlp_retcode retcode = RETCODE_OK;
 
+#ifdef CUPDLP_CPU
   CUPDLP_INIT_CSC_MATRIX(*csc, 1);
+#else
+  (*csc) = (typeof(*csc))malloc(sizeof(typeof(**csc)));
+  if ((*csc) == 0) {
+    retcode = 1;
+    goto exit_cleanup;
+  }
+#endif
 
 exit_cleanup:
   return retcode;
@@ -1405,9 +1422,22 @@ cupdlp_retcode csc_alloc_matrix(CUPDLPcsc *csc, cupdlp_int nRows,
     default:
       break;
   }
+
+#ifndef CUPDLP_CPU
+  cudaError_t status = cudaMalloc((void **)&(csc->colMatBeg), (nCols + 1) * sizeof(int));
+  cudaError_t status2 = cudaMalloc((void **)&(csc->colMatIdx), (nnz) * sizeof(int));
+  cudaError_t status3 = cudaMalloc((void **)&(csc->colMatElem), (nnz) * sizeof(double));
+  if (status || status2 || status3) return 2;
+
+  status = cudaMemset(csc->colMatBeg, 0, (nCols + 1) * sizeof(int));
+  status2 = cudaMemset(csc->colMatIdx, 0, (nnz) * sizeof(int));
+  status3 = cudaMemset(csc->colMatElem, 0, (nnz) * sizeof(double));
+  if (status || status2 || status3) return 2;
+#else
   CUPDLP_INIT_ZERO_INT_VEC(csc->colMatBeg, nCols + 1);
   CUPDLP_INIT_ZERO_INT_VEC(csc->colMatIdx, nnz);
   CUPDLP_INIT_ZERO_DOUBLE_VEC(csc->colMatElem, nnz);
+#endif
 
   switch (src_matrix_format) {
     case DENSE:
diff --git a/src/pdlp/cupdlp/glbopts.h b/src/pdlp/cupdlp/glbopts.h
index ff6599c7ca..ddf3f8e2c0 100644
--- a/src/pdlp/cupdlp/glbopts.h
+++ b/src/pdlp/cupdlp/glbopts.h
@@ -7,6 +7,8 @@
 // #include <cusparse.h>  // cusparseSpMV
 // #endif
 
+#include "HConfig.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -70,8 +72,8 @@ extern "C" {
 //     x = cupdlp_NULL; \
 //   }
 
-// #define CUPDLP_COPY_VEC(dst, src, type, size) \
-//   cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDefault)
+#define CUPDLP_COPY_VEC(dst, src, type, size) \
+  cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDefault)
 
 // #define CUPDLP_INIT_VEC(var, size) \
 //   {                               \
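Note on the kernels above: cuda_gridsize() falls back to a two-dimensional grid once the element count exceeds 65535 * CUPDLP_BLOCK_SIZE, but every kernel derives its element index from blockIdx.x alone. For very large vectors, blocks with blockIdx.y > 0 would recompute the same leading elements while the tail of the vector is never touched. Below is a minimal sketch of one way to close that gap; cupdlp_flat_index is an illustrative name, not part of this patch, and the same one-line rewrite would apply to each kernel.

// Hypothetical device helper (not in the patch): flatten the 1D-or-2D grid
// produced by cuda_gridsize() into a single element index.
__device__ __forceinline__ cupdlp_int cupdlp_flat_index() {
  // blocks are laid out row-major in (x, y); each block covers blockDim.x elements
  cupdlp_int block = blockIdx.y * gridDim.x + blockIdx.x;
  return block * blockDim.x + threadIdx.x;
}

// Example rewrite of one kernel using the helper.
__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
                                 const cupdlp_float *y, const cupdlp_int len) {
  cupdlp_int i = cupdlp_flat_index();
  if (i < len) z[i] = x[i] - y[i];
}

For reference, the GPU path in this patch is enabled by configuring with -DCUPDLP_GPU=ON on Linux, with the CUDA_HOME environment variable pointing at the toolkit root; FindCUDAConf.cmake reads it to locate nvcc and the cudart, cusparse, and cublas libraries.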