-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Christophe Van Gysel
committed
Feb 23, 2018
0 parents
commit 102a383
Showing
20 changed files
with
2,782 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# CMake out-of-source build directory.
build/
# Compiled Python bytecode.
*.pyc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
cmake_minimum_required(VERSION 3.5)
project(device_matrix)

# Make the bundled find modules (e.g. FindGlog) visible to find_package().
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party")

find_package(CUDA REQUIRED)
find_package(Glog REQUIRED)

message(STATUS "CUDA: ${CUDA_LIBRARIES} ${CUDA_INCLUDE_DIRS}")
message(STATUS "GLog: ${GLOG_LIBRARIES}")

# Download and build googletest at configure time.
# Adapted from http://crascit.com/2015/07/25/cmake-gtest/.
configure_file(third_party/googletest-CMakeLists.txt
               googletest-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
                WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/googletest-download")
execute_process(COMMAND ${CMAKE_COMMAND} --build .
                WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/googletest-download")

# Prevent GoogleTest from overriding our compiler/linker options
# when building with Visual Studio.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

# Add googletest directly to our build. This adds the following
# targets: gtest, gtest_main, gmock and gmock_main.
add_subdirectory("${CMAKE_BINARY_DIR}/googletest-src"
                 "${CMAKE_BINARY_DIR}/googletest-build"
                 EXCLUDE_FROM_ALL)

# Libraries linked into every binary.
# NOTE(review): the explicit -l flags overlap with ${CUDA_LIBRARIES} and
# ${GLOG_LIBRARIES} and assume these libraries live on the default linker
# search path; consider relying on the find_package results alone.
set(LIBRARIES
    -lglog
    -lcublas -lcudart -lnvToolsExt -lcnmem
    ${CUDA_LIBRARIES} ${GLOG_LIBRARIES})

# Additional libraries required by the test binaries.
set(TEST_LIBRARIES
    gtest_main gmock ${LIBRARIES})

# Host compiler flags. Append to the existing value instead of
# overwriting it, so user-supplied flags (-DCMAKE_CXX_FLAGS=...) survive.
# NOTE(review): -march=native ties binaries to the build machine's CPU;
# consider guarding it behind an option for distributable builds.
set(CMAKE_CXX_FLAGS
    "${CMAKE_CXX_FLAGS} -std=c++11 -march=native -O3 -funroll-loops")

# Device (NVCC) compiler flags.
set(CUDA_NVCC_FLAGS
    ${CUDA_NVCC_FLAGS};
    -arch=sm_52 --relocatable-device-code=true --cudart=shared
    -use_fast_math -O3 -default-stream per-thread)
set(CUDA_SEPARABLE_COMPILATION ON)

# Circumvent CMake here (include_directories) as it seems unable to pass
# SYSTEM include paths to NVCC.
set(INCLUDE_DIRS
    "${gtest_SOURCE_DIR}/include"
    "${gmock_SOURCE_DIR}/include"
    ${GLOG_INCLUDE_DIRS}
    "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc"
    ${GTEST_INCLUDE_DIRS})

# Feed every include directory to both the host compiler and NVCC as a
# SYSTEM (-isystem) path so warnings from third-party headers are muted.
foreach(INCLUDE_DIR ${INCLUDE_DIRS})
  set(CUDA_NVCC_FLAGS
      ${CUDA_NVCC_FLAGS};
      -isystem ${INCLUDE_DIR})

  set(CMAKE_CXX_FLAGS
      "${CMAKE_CXX_FLAGS} -isystem ${INCLUDE_DIR}")
endforeach()

enable_testing()

# subdirs() is deprecated since CMake 3.0; use add_subdirectory() instead.
add_subdirectory(cpp)
add_subdirectory(examples)

# Public headers to install.
# NOTE: file(GLOB_RECURSE) is evaluated at configure time only — re-run
# CMake after adding new headers so they are picked up.
file(GLOB_RECURSE DEVICE_MATRIX_HEADER_FILES
    "include/*.h")

# Installation rules for header files.
install(FILES ${DEVICE_MATRIX_HEADER_FILES}
        DESTINATION include/device_matrix)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright (c) 2018 Christophe Van Gysel | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
The `device_matrix` library | ||
=========================== | ||
|
||
`device_matrix` is a lightweight, transparent, object-oriented and templated C++ library that encapsulates CUDA memory objects (i.e., tensors) and defines common operations on them. | ||
|
||
Requirements & installation | ||
--------------------------- | ||
|
||
To build the library and manage dependencies, we use [CMake](https://cmake.org/) (version 3.5 and higher). In addition, we rely on the following libraries: | ||
|
||
* [CUDA](https://developer.nvidia.com/cuda-zone) (version 8 and higher preferred), | ||
* [glog](https://github.com/google/glog) (version 0.3.4 and higher), and | ||
* [cnmem](https://github.com/NVIDIA/cnmem). | ||
|
||
The tests are implemented using the [googletest and googlemock](https://github.com/google/googletest) frameworks, which CMake will fetch and compile automatically as part of the build pipeline. Finally, you need a CUDA-compatible GPU in order to perform any computations.
|
||
To install `device_matrix`, the following instructions should get you started. | ||
|
||
git clone https://github.com/cvangysel/device_matrix | ||
cd device_matrix | ||
mkdir build | ||
cd build | ||
cmake .. | ||
make | ||
make test | ||
make install | ||
|
||
Please refer to the [CMake documentation](https://cmake.org/documentation) for advanced options. | ||
|
||
Examples | ||
-------- | ||
|
||
The following examples can also be found in the [examples](examples/) sub-directory of this repository. These examples will also be compiled as part of the build process. | ||
|
||
### Matrix multiplication | ||
|
||
``` cpp | ||
#include <device_matrix/device_matrix.h> | ||
|
||
#include <glog/logging.h> | ||
#include <memory> | ||
|
||
using namespace cuda; | ||
|
||
int main(int argc, char* argv[]) { | ||
google::InitGoogleLogging(argv[0]); | ||
|
||
const cudaStream_t stream = 0; // default CUDA stream. | ||
|
||
std::unique_ptr<device_matrix<float32>> a( | ||
device_matrix<float32>::create( | ||
stream, | ||
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, | ||
2 /* num_rows */, 3 /* num_columns */)); | ||
|
||
std::unique_ptr<device_matrix<float32>> b( | ||
device_matrix<float32>::create( | ||
stream, | ||
{7.0, 8.0, 9.0, 10.0, 11.0, 12.0}, | ||
3 /* num_rows */, 2 /* num_columns */)); | ||
|
||
device_matrix<float32> c( | ||
2 /* num_rows */, 2 /* num_columns */, stream); | ||
|
||
matrix_mult(stream, | ||
*a, CUBLAS_OP_N, | ||
*b, CUBLAS_OP_N, | ||
&c); | ||
|
||
cudaDeviceSynchronize(); | ||
|
||
print_matrix(c); | ||
} | ||
``` | ||
### Custom CUDA kernels | ||
``` cpp | ||
#include <device_matrix/device_matrix.h> | ||
#include <glog/logging.h> | ||
#include <memory> | ||
using namespace cuda; | ||
template <typename FloatT> | ||
__global__ | ||
void inverse_kernel(FloatT* const input) { | ||
size_t offset = threadIdx.y * blockDim.x + threadIdx.x; | ||
input[offset] = -input[offset]; | ||
} | ||
int main(int argc, char* argv[]) { | ||
google::InitGoogleLogging(argv[0]); | ||
const cudaStream_t stream = 0; // default CUDA stream. | ||
std::unique_ptr<device_matrix<float32>> a( | ||
device_matrix<float32>::create( | ||
stream, | ||
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, | ||
2 /* num_rows */, 3 /* num_columns */)); | ||
LAUNCH_KERNEL( | ||
inverse_kernel | ||
<<<1, /* a single block */ | ||
dim3(a->getRows(), a->getCols()), /* one thread per component */ | ||
0, | ||
stream>>>( | ||
a->getData())); | ||
cudaDeviceSynchronize(); | ||
print_matrix(*a); | ||
} | ||
``` | ||
|
||
Design principles | ||
----------------- | ||
|
||
`device_matrix` was explicitly designed to be inflexible with regards to variable passing/assignment as the lifetime of a `device_matrix` instance directly corresponds to the lifetime of the CUDA memory region it has allocated. That means that CUDA memory remains allocated as long as its underlying `device_matrix` exists and that `device_matrix` instances can only be passed as pointers or references. This gives total control of the CUDA memory allocation to the programmer, as it avoids garbage collection (e.g., Torch) or reference counting (e.g., `shared_ptr`), and allows for optimized CUDA memory usage. It uses [cnmem](https://github.com/NVIDIA/cnmem) for its memory management in order to avoid performance issues that occur due to the recurrent re-allocation of memory blocks of a particular size. | ||
|
||
To avoid the implicit allocation of on-device memory, any operation resulting in a new allocation needs to be explicit in this. Most operations that return a new result will therefore reuse one of its inputs as destination memory space (in the process, the original input values will be overwritten!). As a result of this, C++ operators that imply value modification were deliberately omitted. | ||
|
||
The underlying CUDA memory space can easily be accessed by the library user. This allows the user to write arbitrary CUDA kernels that perform non-standard operations on CUDA objects in-place. | ||
|
||
License | ||
------- | ||
|
||
`device_matrix` is licensed under the [MIT license](LICENSE). CUDA is a licensed trademark of NVIDIA. Please note that [CUDA](https://developer.nvidia.com/cuda-zone) is licensed separately. | ||
|
||
If you modify `device_matrix` in any way, please link back to this repository. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Source files shared by the release and debug variants of the library.
set(DEVICE_MATRIX_SRC_FILES device_matrix.cu runtime.cu)

include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR})
include_directories(BEFORE ${device_matrix_SOURCE_DIR}/include)

# Main library: single-precision floats, assertions disabled (NDEBUG).
# The OPTIONS clause covers NVCC; target_compile_definitions covers the
# host compiler and propagates (PUBLIC) to consumers.
cuda_add_library(device_matrix
    ${DEVICE_MATRIX_SRC_FILES}
    OPTIONS -DFLOATING_POINT_TYPE=float32 -DNDEBUG=1)
target_compile_definitions(device_matrix PUBLIC
    FLOATING_POINT_TYPE=float32 NDEBUG=1)

# Debug variant for tests: double-precision floats, assertions enabled.
cuda_add_library(device_matrix_debug
    ${DEVICE_MATRIX_SRC_FILES}
    OPTIONS -DFLOATING_POINT_TYPE=float64)
target_compile_definitions(device_matrix_debug PUBLIC
    FLOATING_POINT_TYPE=float64)

# Test executable, built against the double-precision debug library.
cuda_add_executable(device_matrix_tests device_matrix_tests.cu
    OPTIONS -DFLOATING_POINT_TYPE=float64)
target_link_libraries(device_matrix_tests device_matrix_debug ${TEST_LIBRARIES})

# Make the test visible to CTest (enable_testing() is called at top level).
add_test(NAME device_matrix_tests COMMAND device_matrix_tests)

# Installation rules for the library binary.
install(TARGETS device_matrix DESTINATION lib)
Oops, something went wrong.