-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Christophe Van Gysel
committed
Feb 23, 2018
0 parents
commit 102a383
Showing
20 changed files
with
2,782 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# CMake out-of-source build directory.
build/
# Compiled Python bytecode.
*.pyc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
cmake_minimum_required(VERSION 3.5)
project(device_matrix)

# Make the bundled find modules (e.g. FindGlog) visible to find_package().
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party")

find_package(CUDA REQUIRED)
find_package(Glog REQUIRED)

message(STATUS "CUDA: ${CUDA_LIBRARIES} ${CUDA_INCLUDE_DIRS}")
message(STATUS "GLog: ${GLOG_LIBRARIES}")

# Download and build googletest at configure time.
# Adapted from http://crascit.com/2015/07/25/cmake-gtest/.
configure_file(third_party/googletest-CMakeLists.txt
               googletest-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
                WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/googletest-download")
execute_process(COMMAND ${CMAKE_COMMAND} --build .
                WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/googletest-download")

# Prevent GoogleTest from overriding our compiler/linker options
# when building with Visual Studio.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

# Add googletest directly to our build. This adds the following
# targets: gtest, gtest_main, gmock and gmock_main.
add_subdirectory("${CMAKE_BINARY_DIR}/googletest-src"
                 "${CMAKE_BINARY_DIR}/googletest-build"
                 EXCLUDE_FROM_ALL)

# Libraries linked into every binary.
# NOTE(review): the explicit -l flags overlap with ${CUDA_LIBRARIES} and
# ${GLOG_LIBRARIES} and assume these libraries live on the default linker
# search path; consider relying on the find_package results alone.
set(LIBRARIES
    -lglog
    -lcublas -lcudart -lnvToolsExt -lcnmem
    ${CUDA_LIBRARIES} ${GLOG_LIBRARIES})

# Additional libraries required by the test binaries.
set(TEST_LIBRARIES
    gtest_main gmock ${LIBRARIES})

# Host compiler flags. Append to the existing value instead of
# overwriting it, so user-supplied flags (-DCMAKE_CXX_FLAGS=...) survive.
# NOTE(review): -march=native ties binaries to the build machine's CPU;
# consider guarding it behind an option for distributable builds.
set(CMAKE_CXX_FLAGS
    "${CMAKE_CXX_FLAGS} -std=c++11 -march=native -O3 -funroll-loops")

# Device (NVCC) compiler flags.
set(CUDA_NVCC_FLAGS
    ${CUDA_NVCC_FLAGS};
    -arch=sm_52 --relocatable-device-code=true --cudart=shared
    -use_fast_math -O3 -default-stream per-thread)
set(CUDA_SEPARABLE_COMPILATION ON)

# Circumvent CMake here (include_directories) as it seems unable to pass
# SYSTEM include paths to NVCC.
set(INCLUDE_DIRS
    "${gtest_SOURCE_DIR}/include"
    "${gmock_SOURCE_DIR}/include"
    ${GLOG_INCLUDE_DIRS}
    "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc"
    ${GTEST_INCLUDE_DIRS})

# Feed every include directory to both the host compiler and NVCC as a
# SYSTEM (-isystem) path so warnings from third-party headers are muted.
foreach(INCLUDE_DIR ${INCLUDE_DIRS})
  set(CUDA_NVCC_FLAGS
      ${CUDA_NVCC_FLAGS};
      -isystem ${INCLUDE_DIR})

  set(CMAKE_CXX_FLAGS
      "${CMAKE_CXX_FLAGS} -isystem ${INCLUDE_DIR}")
endforeach()

enable_testing()

# subdirs() is deprecated since CMake 3.0; use add_subdirectory() instead.
add_subdirectory(cpp)
add_subdirectory(examples)

# Public headers to install.
# NOTE: file(GLOB_RECURSE) is evaluated at configure time only — re-run
# CMake after adding new headers so they are picked up.
file(GLOB_RECURSE DEVICE_MATRIX_HEADER_FILES
    "include/*.h")

# Installation rules for header files.
install(FILES ${DEVICE_MATRIX_HEADER_FILES}
        DESTINATION include/device_matrix)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright (c) 2018 Christophe Van Gysel | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
The `device_matrix` library | ||
=========================== | ||
|
||
`device_matrix` is a lightweight, transparent, object-oriented and templated C++ library that encapsulates CUDA memory objects (i.e., tensors) and defines common operations on them. | ||
|
||
Requirements & installation | ||
--------------------------- | ||
|
||
To build the library and manage dependencies, we use [CMake](https://cmake.org/) (version 3.5 and higher). In addition, we rely on the following libraries: | ||
|
||
* [CUDA](https://developer.nvidia.com/cuda-zone) (version 8 and higher preferred), | ||
* [glog](https://github.com/google/glog) (version 0.3.4 and higher), and | ||
* [cnmem](https://github.com/NVIDIA/cnmem). | ||
|
||
The tests are implemented using the [googletest and googlemock](https://github.com/google/googletest) frameworks, which CMake will fetch and compile automatically as part of the build pipeline. Finally, you need a CUDA-compatible GPU in order to perform any computations.
|
||
To install `device_matrix`, the following instructions should get you started. | ||
|
||
git clone https://github.com/cvangysel/device_matrix | ||
cd device_matrix | ||
mkdir build | ||
cd build | ||
cmake .. | ||
make | ||
make test | ||
make install | ||
|
||
Please refer to the [CMake documentation](https://cmake.org/documentation) for advanced options. | ||
|
||
Examples | ||
-------- | ||
|
||
The following examples can also be found in the [examples](examples/) sub-directory of this repository. These examples will also be compiled as part of the build process. | ||
|
||
### Matrix multiplication | ||
|
||
``` cpp | ||
#include <device_matrix/device_matrix.h> | ||
|
||
#include <glog/logging.h> | ||
#include <memory> | ||
|
||
using namespace cuda; | ||
|
||
int main(int argc, char* argv[]) { | ||
google::InitGoogleLogging(argv[0]); | ||
|
||
const cudaStream_t stream = 0; // default CUDA stream. | ||
|
||
std::unique_ptr<device_matrix<float32>> a( | ||
device_matrix<float32>::create( | ||
stream, | ||
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, | ||
2 /* num_rows */, 3 /* num_columns */)); | ||
|
||
std::unique_ptr<device_matrix<float32>> b( | ||
device_matrix<float32>::create( | ||
stream, | ||
{7.0, 8.0, 9.0, 10.0, 11.0, 12.0}, | ||
3 /* num_rows */, 2 /* num_columns */)); | ||
|
||
device_matrix<float32> c( | ||
2 /* num_rows */, 2 /* num_columns */, stream); | ||
|
||
matrix_mult(stream, | ||
*a, CUBLAS_OP_N, | ||
*b, CUBLAS_OP_N, | ||
&c); | ||
|
||
cudaDeviceSynchronize(); | ||
|
||
print_matrix(c); | ||
} | ||
``` | ||
### Custom CUDA kernels | ||
``` cpp | ||
#include <device_matrix/device_matrix.h> | ||
#include <glog/logging.h> | ||
#include <memory> | ||
using namespace cuda; | ||
template <typename FloatT> | ||
__global__ | ||
void inverse_kernel(FloatT* const input) { | ||
size_t offset = threadIdx.y * blockDim.x + threadIdx.x; | ||
input[offset] = -input[offset]; | ||
} | ||
int main(int argc, char* argv[]) { | ||
google::InitGoogleLogging(argv[0]); | ||
const cudaStream_t stream = 0; // default CUDA stream. | ||
std::unique_ptr<device_matrix<float32>> a( | ||
device_matrix<float32>::create( | ||
stream, | ||
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, | ||
2 /* num_rows */, 3 /* num_columns */)); | ||
LAUNCH_KERNEL( | ||
inverse_kernel | ||
<<<1, /* a single block */ | ||
dim3(a->getRows(), a->getCols()), /* one thread per component */ | ||
0, | ||
stream>>>( | ||
a->getData())); | ||
cudaDeviceSynchronize(); | ||
print_matrix(*a); | ||
} | ||
``` | ||
|
||
Design principles | ||
----------------- | ||
|
||
`device_matrix` was explicitly designed to be inflexible with regards to variable passing/assignment as the lifetime of a `device_matrix` instance directly corresponds to the lifetime of the CUDA memory region it has allocated. That means that CUDA memory remains allocated as long as its underlying `device_matrix` exists and that `device_matrix` instances can only be passed as pointers or references. This gives total control of the CUDA memory allocation to the programmer, as it avoids garbage collection (e.g., Torch) or reference counting (e.g., `shared_ptr`), and allows for optimized CUDA memory usage. It uses [cnmem](https://github.com/NVIDIA/cnmem) for its memory management in order to avoid performance issues that occur due to the recurrent re-allocation of memory blocks of a particular size. | ||
|
||
To avoid the implicit allocation of on-device memory, any operation resulting in a new allocation needs to be explicit in this. Most operations that return a new result will therefore reuse one of its inputs as destination memory space (in the process, the original input values will be overwritten!). As a result of this, C++ operators that imply value modification were deliberately omitted. | ||
|
||
The underlying CUDA memory space can easily be accessed by the library user. This allows the user to write arbitrary CUDA kernels that perform non-standard operations on CUDA objects in-place. | ||
|
||
License | ||
------- | ||
|
||
`device_matrix` is licensed under the [MIT license](LICENSE). CUDA is a licensed trademark of NVIDIA. Please note that [CUDA](https://developer.nvidia.com/cuda-zone) is licensed separately. | ||
|
||
If you modify `device_matrix` in any way, please link back to this repository. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Source files shared by the release and debug variants of the library.
set(DEVICE_MATRIX_SRC_FILES device_matrix.cu runtime.cu)

include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR})
include_directories(BEFORE ${device_matrix_SOURCE_DIR}/include)

# Main library: single-precision floats, assertions disabled (NDEBUG).
# The OPTIONS clause covers NVCC; target_compile_definitions covers the
# host compiler and propagates (PUBLIC) to consumers.
cuda_add_library(device_matrix
    ${DEVICE_MATRIX_SRC_FILES}
    OPTIONS -DFLOATING_POINT_TYPE=float32 -DNDEBUG=1)
target_compile_definitions(device_matrix PUBLIC
    FLOATING_POINT_TYPE=float32 NDEBUG=1)

# Debug variant for tests: double-precision floats, assertions enabled.
cuda_add_library(device_matrix_debug
    ${DEVICE_MATRIX_SRC_FILES}
    OPTIONS -DFLOATING_POINT_TYPE=float64)
target_compile_definitions(device_matrix_debug PUBLIC
    FLOATING_POINT_TYPE=float64)

# Test executable, built against the double-precision debug library.
cuda_add_executable(device_matrix_tests device_matrix_tests.cu
    OPTIONS -DFLOATING_POINT_TYPE=float64)
target_link_libraries(device_matrix_tests device_matrix_debug ${TEST_LIBRARIES})

# Make the test visible to CTest (enable_testing() is called at top level).
add_test(NAME device_matrix_tests COMMAND device_matrix_tests)

# Installation rules for the library binary.
install(TARGETS device_matrix DESTINATION lib)
Oops, something went wrong.