Brodey/demo #64

Merged · 21 commits · Dec 20, 2024

Commits (all by brodeynewman):
350c7d8  chore: bm (Oct 9, 2024)
1ad672a  Merge branch 'main' of github.com:kevmo314/scuda (Oct 9, 2024)
59150ee  chore: merge (Oct 9, 2024)
c7d0b7d  Merge branch 'main' of github.com:kevmo314/scuda (Oct 11, 2024)
29a919e  Merge branch 'main' of github.com:kevmo314/scuda (Oct 14, 2024)
233b8e9  Merge branch 'main' of github.com:kevmo314/scuda (Oct 14, 2024)
fc00189  Merge branch 'main' of github.com:kevmo314/scuda (Oct 17, 2024)
79ccd26  Merge branch 'main' of github.com:kevmo314/scuda (Oct 23, 2024)
38a351c  Merge branch 'main' of github.com:kevmo314/scuda (Oct 29, 2024)
ab2e209  Merge branch 'main' of github.com:kevmo314/scuda (Nov 6, 2024)
ccd7c31  Merge branch 'main' of github.com:kevmo314/scuda (Nov 8, 2024)
25cad41  Merge branch 'main' of github.com:kevmo314/scuda (Nov 9, 2024)
11f8e43  Merge branch 'main' of github.com:kevmo314/scuda (Nov 11, 2024)
8e3d836  Merge branch 'main' of github.com:kevmo314/scuda (Nov 18, 2024)
e20c750  Merge branch 'main' of github.com:kevmo314/scuda (Nov 28, 2024)
8f56379  Merge branch 'main' of github.com:kevmo314/scuda (Dec 2, 2024)
e5592dc  Merge branch 'main' of github.com:kevmo314/scuda (Dec 3, 2024)
aeef059  Merge branch 'main' of github.com:kevmo314/scuda (Dec 7, 2024)
83d1ebc  Merge branch 'main' of github.com:kevmo314/scuda (Dec 16, 2024)
5fb85af  Merge branch 'main' of github.com:kevmo314/scuda (Dec 20, 2024)
1eca2d2  chore: unified demo (Dec 20, 2024)
README.md (22 changes: 4 additions & 18 deletions)

@@ -3,30 +3,16 @@
 SCUDA is a GPU over IP bridge allowing GPUs on remote machines to be attached
 to CPU-only machines.
 
-## Demos
+## Demo
 
-### CUBLAS Matrix Multiplication
+### CUBLAS Matrix Multiplication using Unified Memory
 
 The below demo displays an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
 The left pane is a Mac running a docker container with nvidia utils installed.
 
-The docker container runs this [matrixMulCUBLAS](https://github.com/zchee/cuda-sample/blob/master/0_Simple/matrixMulCUBLAS/matrixMulCUBLAS.cpp) example.
-
-You can view the docker image used [here](./deploy/Dockerfile.cublas-test).
-
-https://github.com/user-attachments/assets/4bf130c5-5544-442f-b1a5-6216255ab499
-
-### Simple torch example
-
-The below demo displays an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
-The left pane is a Mac running a docker container with nvidia utils installed.
-
-The docker container runs `python3 -c "import torch; print(torch.cuda.is_available())"` to check if cuda is available.
-
-You can view the docker image used [here](./deploy/Dockerfile.torch-test).
-
-https://github.com/user-attachments/assets/035950bb-3cc1-4c73-9ad5-b00871a159ec
+The docker container runs this [matrixMulCUBLAS](./deploy/cublas_unified.o) example, which not only uses cuBLAS but also takes advantage of unified memory.
 
+You can view the docker image used [here](./deploy/Dockerfile.unified).
 
 ## Local development
 
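For readers new to unified memory: cudaMallocManaged hands back one pointer that both the host and the GPU can dereference, so the demo never issues an explicit cudaMemcpy. A minimal, scuda-agnostic sketch of that pattern, using only the standard CUDA runtime API (the kernel and values here are illustrative, not part of the demo):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(double *x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0;  // device writes the same buffer the host reads
}

int main() {
    double *x = nullptr;
    cudaMallocManaged(&x, 4 * sizeof(double));   // one pointer, host + device
    for (int i = 0; i < 4; i++) x[i] = i + 1.0;  // host initializes in place
    scale<<<1, 4>>>(x, 4);
    cudaDeviceSynchronize();                     // wait before the host reads back
    printf("%.1f %.1f %.1f %.1f\n", x[0], x[1], x[2], x[3]);  // 2.0 4.0 6.0 8.0
    cudaFree(x);
    return 0;
}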
codegen/gen_client.cpp (3 changes: 3 additions & 0 deletions)

@@ -19,6 +19,7 @@ extern int rpc_end_request(const int index);
 extern int rpc_wait_for_response(const int index);
 extern int rpc_read(const int index, void *data, const std::size_t size);
 extern int rpc_end_response(const int index, void *return_value);
+void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
 extern int rpc_close();
 
 nvmlReturn_t nvmlInit_v2()
@@ -18581,6 +18582,7 @@ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr
 
 cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount)
 {
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);
     cublasStatus_t return_value;
     if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 ||
         rpc_write(0, &batchCount, sizeof(int)) < 0 ||
@@ -18603,6 +18605,7 @@ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t trans
         rpc_wait_for_response(0) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return CUBLAS_STATUS_NOT_INITIALIZED;
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);
     return return_value;
 }
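The pattern here: because the GEMM arguments live in unified memory on the client, the generated stub flushes every managed allocation to the remote GPU before issuing the batched call, then pulls the buffers back once the response arrives. The definition of cuda_memcpy_unified_ptrs is not part of this diff; a minimal sketch of the idea, assuming the client tracks its cudaMallocManaged allocations in a registry (the registry name, the rpc_write signature mirroring rpc_read above, and the one-transfer-per-allocation scheme are illustrative assumptions, not scuda's actual implementation):

#include <cstddef>
#include <map>
#include <cuda_runtime.h>

extern int rpc_write(const int index, const void *data, const std::size_t size);
extern int rpc_read(const int index, void *data, const std::size_t size);

// Hypothetical registry, populated by the client's cudaMallocManaged wrapper.
static std::map<void *, std::size_t> managed_allocations;

void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
    // Glosses over request framing and error handling; the real client
    // would wrap these transfers in its rpc request/response protocol.
    for (const auto &[ptr, size] : managed_allocations) {
        if (kind == cudaMemcpyHostToDevice)
            rpc_write(index, ptr, size); // push host-side pages to the server
        else
            rpc_read(index, ptr, size);  // pull device results back
    }
}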
deploy/Dockerfile.unified (1 change: 1 addition & 0 deletions)

@@ -29,6 +29,7 @@ ENV libscuda_path=/usr/local/lib/libscuda.so
 COPY ./libscuda.so /usr/local/lib/libscuda.so
 COPY unified.o unified.o
 COPY unified_pointer.o unified_pointer.o
+COPY cublas_unified.o cublas_unified.o
 
 COPY start.sh /start.sh
 RUN chmod +x /start.sh
deploy/start.sh (2 changes: 1 addition & 1 deletion)

@@ -13,7 +13,7 @@ elif [[ "$1" == "cublas" ]]; then
 elif [[ "$1" == "unified" ]]; then
   echo "Running cublas example..."
 
-  LD_PRELOAD="$libscuda_path" /unified_pointer.o
+  LD_PRELOAD="$libscuda_path" /cublas_unified.o
 else
   echo "Unknown option: $1. Please specify one of: torch | cublas | unified."
 fi
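Note that the demo binary is interposed rather than relinked: LD_PRELOAD makes the dynamic linker resolve the CUDA, NVML, and cuBLAS symbols against libscuda.so before the real libraries, so the unmodified executable's GPU calls are transparently forwarded to the remote server.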
local.sh (22 changes: 15 additions & 7 deletions)

@@ -24,13 +24,6 @@ build() {
 
   echo "building vector file for test..."
 
-  nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
-
   if [ ! -f "$libscuda_path" ]; then
     echo "libscuda.so not found. build may have failed."
     exit 1
@@ -231,6 +224,18 @@ test() {
   done
 }
 
+build_tests() {
+  build
+
+  nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_unified.cu -o cublas_unified.o
+}
+
 run() {
   build
 
@@ -244,6 +249,9 @@ case "$1" in
   build)
     build
     ;;
+  build_tests)
+    build_tests
+    ;;
   run)
     run
     ;;
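Two details worth noting in this refactor. First, the test binaries move out of build() into the new build_tests subcommand (invoked as ./local.sh build_tests), so a plain build no longer recompiles every .cu test. Second, each test is compiled with --cudart=shared, which links the CUDA runtime dynamically; that dynamic linkage is what lets the LD_PRELOADed libscuda.so interpose the runtime calls, since a statically linked cudart could not be intercepted.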
test/cublas_unified.cu (116 changes: 116 additions & 0 deletions)

@@ -0,0 +1,116 @@
#include <algorithm>  // for std::copy
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cublas_v2.h>
#include <cuda_runtime.h>

#include "cublas_utils.h"

using data_type = double;

int main(int argc, char *argv[])
{
    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    const int m = 2;
    const int n = 2;
    const int k = 2;
    const int lda = 2;
    const int ldb = 2;
    const int ldc = 2;
    const int batch_count = 2;

    const std::vector<std::vector<data_type>> A_array = {{1.0, 3.0, 2.0, 4.0},
                                                         {5.0, 7.0, 6.0, 8.0}};
    const std::vector<std::vector<data_type>> B_array = {{5.0, 7.0, 6.0, 8.0},
                                                         {9.0, 11.0, 10.0, 12.0}};
    std::vector<std::vector<data_type>> C_array(batch_count, std::vector<data_type>(m * n));

    const data_type alpha = 1.0;
    const data_type beta = 0.0;

    data_type **d_A_array = nullptr;
    data_type **d_B_array = nullptr;
    data_type **d_C_array = nullptr;

    std::vector<data_type *> d_A(batch_count, nullptr);
    std::vector<data_type *> d_B(batch_count, nullptr);
    std::vector<data_type *> d_C(batch_count, nullptr);

    cublasOperation_t transa = CUBLAS_OP_N;
    cublasOperation_t transb = CUBLAS_OP_N;

    printf("A[0]\n");
    print_matrix(m, k, A_array[0].data(), lda);
    printf("=====\n");

    printf("A[1]\n");
    print_matrix(m, k, A_array[1].data(), lda);
    printf("=====\n");

    printf("B[0]\n");
    print_matrix(k, n, B_array[0].data(), ldb);
    printf("=====\n");

    printf("B[1]\n");
    print_matrix(k, n, B_array[1].data(), ldb);
    printf("=====\n");

    /* Step 1: Create cuBLAS handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* Step 2: Allocate unified memory */
    CUDA_CHECK(cudaMallocManaged(&d_A_array, sizeof(data_type *) * batch_count));
    CUDA_CHECK(cudaMallocManaged(&d_B_array, sizeof(data_type *) * batch_count));
    CUDA_CHECK(cudaMallocManaged(&d_C_array, sizeof(data_type *) * batch_count));

    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaMallocManaged(&d_A[i], sizeof(data_type) * A_array[i].size()));
        CUDA_CHECK(cudaMallocManaged(&d_B[i], sizeof(data_type) * B_array[i].size()));
        CUDA_CHECK(cudaMallocManaged(&d_C[i], sizeof(data_type) * C_array[i].size()));

        // Copy data to unified memory (host-side initialization is sufficient)
        std::copy(A_array[i].begin(), A_array[i].end(), d_A[i]);
        std::copy(B_array[i].begin(), B_array[i].end(), d_B[i]);

        d_A_array[i] = d_A[i];
        d_B_array[i] = d_B[i];
        d_C_array[i] = d_C[i];
    }

    /* Step 3: Compute */
    CUBLAS_CHECK(cublasDgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda,
                                    d_B_array, ldb, &beta, d_C_array, ldc, batch_count));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* Step 4: Verify results */
    printf("C[0]\n");
    print_matrix(m, n, d_C[0], ldc);
    printf("=====\n");

    printf("C[1]\n");
    print_matrix(m, n, d_C[1], ldc);
    printf("=====\n");

    /* Free resources */
    CUDA_CHECK(cudaFree(d_A_array));
    CUDA_CHECK(cudaFree(d_B_array));
    CUDA_CHECK(cudaFree(d_C_array));
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));
        CUDA_CHECK(cudaFree(d_B[i]));
        CUDA_CHECK(cudaFree(d_C[i]));
    }

    CUBLAS_CHECK(cublasDestroy(cublasH));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}
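For reference, with alpha = 1 and beta = 0 the batched call computes C_i = A_i B_i for each batch entry. Reading each flat array as a column-major 2x2 matrix (lda = ldb = ldc = 2), the printed results should be:

$$
C_0 = \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}
      \begin{pmatrix} 5 & 6 \\ 7 & 8 \end{pmatrix}
    = \begin{pmatrix} 19 & 22 \\ 43 & 50 \end{pmatrix},
\qquad
C_1 = \begin{pmatrix} 5 & 6 \\ 7 & 8 \end{pmatrix}
      \begin{pmatrix} 9 & 10 \\ 11 & 12 \end{pmatrix}
    = \begin{pmatrix} 111 & 122 \\ 151 & 166 \end{pmatrix}.
$$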