Brodey/demo #64

Merged · 21 commits · Dec 20, 2024

Commits (all by brodeynewman):
350c7d8  chore: bm (Oct 9, 2024)
1ad672a  Merge branch 'main' of github.com:kevmo314/scuda (Oct 9, 2024)
59150ee  chore: merge (Oct 9, 2024)
c7d0b7d  Merge branch 'main' of github.com:kevmo314/scuda (Oct 11, 2024)
29a919e  Merge branch 'main' of github.com:kevmo314/scuda (Oct 14, 2024)
233b8e9  Merge branch 'main' of github.com:kevmo314/scuda (Oct 14, 2024)
fc00189  Merge branch 'main' of github.com:kevmo314/scuda (Oct 17, 2024)
79ccd26  Merge branch 'main' of github.com:kevmo314/scuda (Oct 23, 2024)
38a351c  Merge branch 'main' of github.com:kevmo314/scuda (Oct 29, 2024)
ab2e209  Merge branch 'main' of github.com:kevmo314/scuda (Nov 6, 2024)
ccd7c31  Merge branch 'main' of github.com:kevmo314/scuda (Nov 8, 2024)
25cad41  Merge branch 'main' of github.com:kevmo314/scuda (Nov 9, 2024)
11f8e43  Merge branch 'main' of github.com:kevmo314/scuda (Nov 11, 2024)
8e3d836  Merge branch 'main' of github.com:kevmo314/scuda (Nov 18, 2024)
e20c750  Merge branch 'main' of github.com:kevmo314/scuda (Nov 28, 2024)
8f56379  Merge branch 'main' of github.com:kevmo314/scuda (Dec 2, 2024)
e5592dc  Merge branch 'main' of github.com:kevmo314/scuda (Dec 3, 2024)
aeef059  Merge branch 'main' of github.com:kevmo314/scuda (Dec 7, 2024)
83d1ebc  Merge branch 'main' of github.com:kevmo314/scuda (Dec 16, 2024)
5fb85af  Merge branch 'main' of github.com:kevmo314/scuda (Dec 20, 2024)
1eca2d2  chore: unified demo (Dec 20, 2024)
README.md (22 changes: 4 additions & 18 deletions)

@@ -3,30 +3,16 @@
 SCUDA is a GPU over IP bridge allowing GPUs on remote machines to be attached
 to CPU-only machines.
 
-## Demos
+## Demo
 
-### CUBLAS Matrix Multiplication
+### CUBLAS Matrix Multiplication using Unified Memory
 
 The below demo displays an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
 The left pane is a Mac running a docker container with nvidia utils installed.
 
-The docker container runs this [matrixMulCUBLAS](https://github.com/zchee/cuda-sample/blob/master/0_Simple/matrixMulCUBLAS/matrixMulCUBLAS.cpp) example.
-
-You can view the docker image used [here](./deploy/Dockerfile.cublas-test).
-
-https://github.com/user-attachments/assets/4bf130c5-5544-442f-b1a5-6216255ab499
-
-### Simple torch example
-
-The below demo displays an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
-The left pane is a Mac running a docker container with nvidia utils installed.
-
-The docker container runs `python3 -c "import torch; print(torch.cuda.is_available())"` to check if cuda is available.
-
-You can view the docker image used [here](./deploy/Dockerfile.torch-test).
-
-https://github.com/user-attachments/assets/035950bb-3cc1-4c73-9ad5-b00871a159ec
+The docker container runs this [matrixMulCUBLAS](./deploy/cublas_unified.o) example, which not only uses cuBLAS but also takes advantage of unified memory.
 
+You can view the docker image used [here](./deploy/Dockerfile.unified).
 
 ## Local development
 
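For readers new to unified memory: cudaMallocManaged hands back one pointer that both the host and the GPU can dereference, so the demo never issues an explicit cudaMemcpy. A minimal, scuda-agnostic sketch of that pattern, using only the standard CUDA runtime API (the kernel and values here are illustrative, not part of the demo):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(double *x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0;  // device writes the same buffer the host reads
}

int main() {
    double *x = nullptr;
    cudaMallocManaged(&x, 4 * sizeof(double));   // one pointer, host + device
    for (int i = 0; i < 4; i++) x[i] = i + 1.0;  // host initializes in place
    scale<<<1, 4>>>(x, 4);
    cudaDeviceSynchronize();                     // wait before the host reads back
    printf("%.1f %.1f %.1f %.1f\n", x[0], x[1], x[2], x[3]);  // 2.0 4.0 6.0 8.0
    cudaFree(x);
    return 0;
}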
codegen/gen_client.cpp (3 changes: 3 additions & 0 deletions)

@@ -19,6 +19,7 @@ extern int rpc_end_request(const int index);
 extern int rpc_wait_for_response(const int index);
 extern int rpc_read(const int index, void *data, const std::size_t size);
 extern int rpc_end_response(const int index, void *return_value);
+void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
 extern int rpc_close();
 
 nvmlReturn_t nvmlInit_v2()
@@ -18581,6 +18582,7 @@ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr
 
 cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount)
 {
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);
     cublasStatus_t return_value;
     if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 ||
         rpc_write(0, &batchCount, sizeof(int)) < 0 ||
@@ -18603,6 +18605,7 @@ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t trans
         rpc_wait_for_response(0) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return CUBLAS_STATUS_NOT_INITIALIZED;
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);
     return return_value;
 }
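The pattern here: because the GEMM arguments live in unified memory on the client, the generated stub flushes every managed allocation to the remote GPU before issuing the batched call, then pulls the buffers back once the response arrives. The definition of cuda_memcpy_unified_ptrs is not part of this diff; a minimal sketch of the idea, assuming the client tracks its cudaMallocManaged allocations in a registry (the registry name, the rpc_write signature mirroring rpc_read above, and the one-transfer-per-allocation scheme are illustrative assumptions, not scuda's actual implementation):

#include <cstddef>
#include <map>
#include <cuda_runtime.h>

extern int rpc_write(const int index, const void *data, const std::size_t size);
extern int rpc_read(const int index, void *data, const std::size_t size);

// Hypothetical registry, populated by the client's cudaMallocManaged wrapper.
static std::map<void *, std::size_t> managed_allocations;

void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
    // Glosses over request framing and error handling; the real client
    // would wrap these transfers in its rpc request/response protocol.
    for (const auto &[ptr, size] : managed_allocations) {
        if (kind == cudaMemcpyHostToDevice)
            rpc_write(index, ptr, size); // push host-side pages to the server
        else
            rpc_read(index, ptr, size);  // pull device results back
    }
}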
deploy/Dockerfile.unified (1 change: 1 addition & 0 deletions)

@@ -29,6 +29,7 @@ ENV libscuda_path=/usr/local/lib/libscuda.so
 COPY ./libscuda.so /usr/local/lib/libscuda.so
 COPY unified.o unified.o
 COPY unified_pointer.o unified_pointer.o
+COPY cublas_unified.o cublas_unified.o
 
 COPY start.sh /start.sh
 RUN chmod +x /start.sh
deploy/start.sh (2 changes: 1 addition & 1 deletion)

@@ -13,7 +13,7 @@ elif [[ "$1" == "cublas" ]]; then
 elif [[ "$1" == "unified" ]]; then
   echo "Running cublas example..."
 
-  LD_PRELOAD="$libscuda_path" /unified_pointer.o
+  LD_PRELOAD="$libscuda_path" /cublas_unified.o
 else
   echo "Unknown option: $1. Please specify one of: torch | cublas | unified."
 fi
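Note that the demo binary is interposed rather than relinked: LD_PRELOAD makes the dynamic linker resolve the CUDA, NVML, and cuBLAS symbols against libscuda.so before the real libraries, so the unmodified executable's GPU calls are transparently forwarded to the remote server.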
local.sh (22 changes: 15 additions & 7 deletions)

@@ -24,13 +24,6 @@ build() {
 
   echo "building vector file for test..."
 
-  nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
-
   if [ ! -f "$libscuda_path" ]; then
     echo "libscuda.so not found. build may have failed."
     exit 1
@@ -231,6 +224,18 @@ test() {
   done
 }
 
+build_tests() {
+  build
+
+  nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_unified.cu -o cublas_unified.o
+}
+
 run() {
   build
 
@@ -244,6 +249,9 @@ case "$1" in
   build)
     build
     ;;
+  build_tests)
+    build_tests
+    ;;
   run)
     run
     ;;
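Two details worth noting in this refactor. First, the test binaries move out of build() into the new build_tests subcommand (invoked as ./local.sh build_tests), so a plain build no longer recompiles every .cu test. Second, each test is compiled with --cudart=shared, which links the CUDA runtime dynamically; that dynamic linkage is what lets the LD_PRELOADed libscuda.so interpose the runtime calls, since a statically linked cudart could not be intercepted.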
test/cublas_unified.cu (116 changes: 116 additions & 0 deletions)

@@ -0,0 +1,116 @@
#include <algorithm>  // for std::copy
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cublas_v2.h>
#include <cuda_runtime.h>

#include "cublas_utils.h"

using data_type = double;

int main(int argc, char *argv[])
{
    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    const int m = 2;
    const int n = 2;
    const int k = 2;
    const int lda = 2;
    const int ldb = 2;
    const int ldc = 2;
    const int batch_count = 2;

    const std::vector<std::vector<data_type>> A_array = {{1.0, 3.0, 2.0, 4.0},
                                                         {5.0, 7.0, 6.0, 8.0}};
    const std::vector<std::vector<data_type>> B_array = {{5.0, 7.0, 6.0, 8.0},
                                                         {9.0, 11.0, 10.0, 12.0}};
    std::vector<std::vector<data_type>> C_array(batch_count, std::vector<data_type>(m * n));

    const data_type alpha = 1.0;
    const data_type beta = 0.0;

    data_type **d_A_array = nullptr;
    data_type **d_B_array = nullptr;
    data_type **d_C_array = nullptr;

    std::vector<data_type *> d_A(batch_count, nullptr);
    std::vector<data_type *> d_B(batch_count, nullptr);
    std::vector<data_type *> d_C(batch_count, nullptr);

    cublasOperation_t transa = CUBLAS_OP_N;
    cublasOperation_t transb = CUBLAS_OP_N;

    printf("A[0]\n");
    print_matrix(m, k, A_array[0].data(), lda);
    printf("=====\n");

    printf("A[1]\n");
    print_matrix(m, k, A_array[1].data(), lda);
    printf("=====\n");

    printf("B[0]\n");
    print_matrix(k, n, B_array[0].data(), ldb);
    printf("=====\n");

    printf("B[1]\n");
    print_matrix(k, n, B_array[1].data(), ldb);
    printf("=====\n");

    /* Step 1: Create cuBLAS handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* Step 2: Allocate unified memory */
    CUDA_CHECK(cudaMallocManaged(&d_A_array, sizeof(data_type *) * batch_count));
    CUDA_CHECK(cudaMallocManaged(&d_B_array, sizeof(data_type *) * batch_count));
    CUDA_CHECK(cudaMallocManaged(&d_C_array, sizeof(data_type *) * batch_count));

    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaMallocManaged(&d_A[i], sizeof(data_type) * A_array[i].size()));
        CUDA_CHECK(cudaMallocManaged(&d_B[i], sizeof(data_type) * B_array[i].size()));
        CUDA_CHECK(cudaMallocManaged(&d_C[i], sizeof(data_type) * C_array[i].size()));

        // Copy data to unified memory (host-side initialization is sufficient)
        std::copy(A_array[i].begin(), A_array[i].end(), d_A[i]);
        std::copy(B_array[i].begin(), B_array[i].end(), d_B[i]);

        d_A_array[i] = d_A[i];
        d_B_array[i] = d_B[i];
        d_C_array[i] = d_C[i];
    }

    /* Step 3: Compute */
    CUBLAS_CHECK(cublasDgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda,
                                    d_B_array, ldb, &beta, d_C_array, ldc, batch_count));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* Step 4: Verify results */
    printf("C[0]\n");
    print_matrix(m, n, d_C[0], ldc);
    printf("=====\n");

    printf("C[1]\n");
    print_matrix(m, n, d_C[1], ldc);
    printf("=====\n");

    /* Free resources */
    CUDA_CHECK(cudaFree(d_A_array));
    CUDA_CHECK(cudaFree(d_B_array));
    CUDA_CHECK(cudaFree(d_C_array));
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));
        CUDA_CHECK(cudaFree(d_B[i]));
        CUDA_CHECK(cudaFree(d_C[i]));
    }

    CUBLAS_CHECK(cublasDestroy(cublasH));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}
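For reference, with alpha = 1 and beta = 0 the batched call computes C_i = A_i B_i for each batch entry. Reading each flat array as a column-major 2x2 matrix (lda = ldb = ldc = 2), the printed results should be:

$$
C_0 = \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}
      \begin{pmatrix} 5 & 6 \\ 7 & 8 \end{pmatrix}
    = \begin{pmatrix} 19 & 22 \\ 43 & 50 \end{pmatrix},
\qquad
C_1 = \begin{pmatrix} 5 & 6 \\ 7 & 8 \end{pmatrix}
      \begin{pmatrix} 9 & 10 \\ 11 & 12 \end{pmatrix}
    = \begin{pmatrix} 111 & 122 \\ 151 & 166 \end{pmatrix}.
$$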