Fix segfaults and memory errors with CUDA 11
PyTorch no longer preserves the default cuBLAS pointer mode, so we
need to set it explicitly before GEMM calls. This change in
behavior was the source of some illegal memory accesses.
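For context, a minimal sketch of what "setting the pointer mode explicitly" means. This is not code from this repository and the helper name is made up: haste passes alpha and beta as host-side scalars, so the handle must be in CUBLAS_POINTER_MODE_HOST before the GEMM; in device pointer mode, cuBLAS would dereference &alpha and &beta as device addresses, which is one way to get exactly these illegal memory accesses.

    // Sketch only: host-resident alpha/beta require CUBLAS_POINTER_MODE_HOST.
    #include <cublas_v2.h>

    void gemm_host_scalars(cublasHandle_t handle, int m, int n, int k,
                           const float* A, const float* B, float* C) {
      const float alpha = 1.0f;   // scalars live on the host
      const float beta = 0.0f;
      // Set the pointer mode explicitly; don't rely on the handle's previous state.
      cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                  m, n, k,
                  &alpha, A, /*lda=*/m, B, /*ldb=*/k, &beta, C, /*ldc=*/m);
    }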

In addition, building haste_pytorch with CppExtension instead of
CUDAExtension causes it to be linked against CUDA 10.2 instead of
CUDA 11 even though the rest of PyTorch is linked against CUDA 11.
As a result, a cuBLAS context created by PyTorch and used by
haste_pytorch can occasionally fail (e.g. on a GEMM with an inner
dimension > 4096) with a segfault.

Issue: #30
sharvil committed May 2, 2021
1 parent 9da2454 commit 459608c
Showing 14 changed files with 45 additions and 3 deletions.
6 changes: 3 additions & 3 deletions build/setup.pytorch.py
@@ -40,13 +40,13 @@ def run(self):
  with open(f'frameworks/pytorch/_version.py', 'wt') as f:
    f.write(f'__version__ = "{VERSION}"')

-extension = cpp_extension.CppExtension(
+extension = cpp_extension.CUDAExtension(
    'haste_pytorch_lib',
    sources = glob('frameworks/pytorch/*.cc'),
    extra_compile_args = extra_args,
    include_dirs = [os.path.join(base_path, 'lib'), os.path.join(CUDA_HOME, 'include')],
-   libraries = ['haste', 'cublas', 'cudart'],
-   library_dirs = ['.', os.path.join(CUDA_HOME, 'lib64'), os.path.join(CUDA_HOME, 'lib', 'x64')])
+   libraries = ['haste'],
+   library_dirs = ['.'])

setup(name = 'haste_pytorch',
      version = VERSION,
12 changes: 12 additions & 0 deletions lib/blas.h
@@ -19,6 +19,18 @@

template<typename T>
struct blas {
+  struct set_pointer_mode {
+    set_pointer_mode(cublasHandle_t handle) : handle_(handle) {
+      cublasGetPointerMode(handle_, &old_mode_);
+      cublasSetPointerMode(handle_, CUBLAS_POINTER_MODE_HOST);
+    }
+    ~set_pointer_mode() {
+      cublasSetPointerMode(handle_, old_mode_);
+    }
+   private:
+    cublasHandle_t handle_;
+    cublasPointerMode_t old_mode_;
+  };
  struct enable_tensor_cores {
    enable_tensor_cores(cublasHandle_t handle) : handle_(handle) {
      cublasGetMathMode(handle_, &old_mode_);
3 changes: 3 additions & 0 deletions lib/gru_backward_gpu.cu.cc
@@ -183,6 +183,8 @@ void BackwardPass<T>::Iterate(
T* dp, // [N,H*3]
T* dq, // [N,H*3]
const T* zoneout_mask) { // [N,H]
+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const T alpha = static_cast<T>(1.0);
const T beta_sum = static_cast<T>(1.0);
const T beta_assign = static_cast<T>(0.0);
@@ -337,6 +339,7 @@ void BackwardPass<T>::Run(
T* dq,
const T* zoneout_mask) {
const blas<void>::enable_tensor_cores scoped0(data_->blas_handle);
+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const T alpha = static_cast<T>(1.0);
const T beta_sum = static_cast<T>(1.0);
3 changes: 3 additions & 0 deletions lib/gru_forward_gpu.cu.cc
@@ -173,6 +173,8 @@ void ForwardPass<T>::Iterate(
static const T alpha = static_cast<T>(1.0);
static const T beta = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
@@ -327,6 +329,7 @@ void ForwardPass<T>::Run(
static const T beta = static_cast<T>(0.0);

const blas<void>::enable_tensor_cores scoped0(data_->blas_handle);
+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
2 changes: 2 additions & 0 deletions lib/indrnn_backward_gpu.cu.cc
@@ -138,6 +138,8 @@ void BackwardPass<T>::Run(
const T alpha = static_cast<T>(1.0);
const T beta = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
2 changes: 2 additions & 0 deletions lib/indrnn_forward_gpu.cu.cc
@@ -125,6 +125,8 @@ void ForwardPass<T>::Run(
static const T alpha = static_cast<T>(1.0);
static const T beta = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const bool training = data_->training;
const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
2 changes: 2 additions & 0 deletions lib/layer_norm_gru_backward_gpu.cu.cc
@@ -242,6 +242,8 @@ void BackwardPass<T>::Run(
const T beta_sum = static_cast<T>(1.0);
const T beta_assign = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
2 changes: 2 additions & 0 deletions lib/layer_norm_gru_forward_gpu.cu.cc
@@ -260,6 +260,8 @@ void ForwardPass<T>::Run(
static const T alpha = static_cast<T>(1.0);
static const T beta = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
2 changes: 2 additions & 0 deletions lib/layer_norm_indrnn_backward_gpu.cu.cc
@@ -139,6 +139,8 @@ void BackwardPass<T>::Run(
const T alpha = static_cast<T>(1.0);
const T beta = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
2 changes: 2 additions & 0 deletions lib/layer_norm_indrnn_forward_gpu.cu.cc
@@ -127,6 +127,8 @@ void ForwardPass<T>::Run(
static const T alpha = static_cast<T>(1.0);
static const T beta = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const bool training = data_->training;
const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
2 changes: 2 additions & 0 deletions lib/layer_norm_lstm_backward_gpu.cu.cc
@@ -281,6 +281,8 @@ void BackwardPass<T>::Run(
const T beta_sum = static_cast<T>(1.0); // Accumulate into output matrix!
const T beta_assign = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
2 changes: 2 additions & 0 deletions lib/layer_norm_lstm_forward_gpu.cu.cc
@@ -295,6 +295,8 @@ void ForwardPass<T>::Run(
static const T alpha = static_cast<T>(1.0);
static const T beta = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
4 changes: 4 additions & 0 deletions lib/lstm_backward_gpu.cu.cc
@@ -164,6 +164,8 @@ void BackwardPass<T>::Iterate(
const T beta_sum = static_cast<T>(1.0); // Accumulate into output matrix!
const T beta_assign = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
@@ -337,6 +339,8 @@ void BackwardPass<T>::Run(
const T beta_sum = static_cast<T>(1.0); // Accumulate into output matrix!
const T beta_assign = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
4 changes: 4 additions & 0 deletions lib/lstm_forward_gpu.cu.cc
@@ -156,6 +156,8 @@ void ForwardPass<T>::Iterate(
static const T alpha = static_cast<T>(1.0);
static const T beta = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
@@ -324,6 +326,8 @@ void ForwardPass<T>::Run(
static const T alpha = static_cast<T>(1.0);
static const T beta = static_cast<T>(0.0);

+  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

const int batch_size = data_->batch_size;
const int input_size = data_->input_size;
const int hidden_size = data_->hidden_size;
