Merge pull request #502 from abergeron/cuda9
Changes for CUDA 9.0 float16 support.
nouiz authored Aug 25, 2017
2 parents 351f359 + 8ac2448 commit 9885f08
Showing 8 changed files with 328 additions and 359 deletions.
35 changes: 0 additions & 35 deletions pygpu/collectives.pyx
@@ -38,41 +38,6 @@ cdef class GpuCommCliqueId:
         if comm_id is not None:
             self.comm_id = comm_id

-    def __getbuffer__(self, Py_buffer* buffer, int flags):
-        if buffer == NULL:
-            raise BufferError, "NULL buffer view in getbuffer"
-
-        buffer.buf = <char*>self.c_comm_id.internal
-        buffer.obj = self
-        buffer.len = GA_COMM_ID_BYTES * sizeof(char)
-        buffer.readonly = 0
-        buffer.itemsize = sizeof(char)
-        if flags & PyBUF_FORMAT == PyBUF_FORMAT:
-            buffer.format = 'b'
-        else:
-            buffer.format = NULL
-        buffer.ndim = 1
-        if flags & PyBUF_ND == PyBUF_ND:
-            buffer.shape = <Py_ssize_t*>calloc(1, sizeof(Py_ssize_t))
-            buffer.shape[0] = GA_COMM_ID_BYTES
-        else:
-            buffer.shape = NULL
-        if flags & PyBUF_STRIDES == PyBUF_STRIDES:
-            buffer.strides = &buffer.itemsize
-        else:
-            buffer.strides = NULL
-        buffer.suboffsets = NULL
-        buffer.internal = NULL
-        Py_INCREF(self)
-
-    def __releasebuffer__(self, Py_buffer* buffer):
-        if buffer == NULL:
-            raise BufferError, "NULL buffer view in releasebuffer"
-
-        if buffer.shape != NULL:
-            free(buffer.shape)
-        Py_DECREF(self)
-
     def __richcmp__(this, that, int op):
         if type(this) != type(that):
             raise TypeError, "Cannot compare %s with %s" % (type(this), type(that))
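
The two methods deleted above implemented CPython's buffer protocol for GpuCommCliqueId, which is what let np.asarray() view the GA_COMM_ID_BYTES-long clique id as a byte array (the test_as_buffer test removed below relied on it). As a reminder of what consumers of that protocol do, here is a minimal C sketch; the dump_buffer name and the hex printing are illustrative, not part of the project:

#include <Python.h>
#include <stdio.h>

/* Sketch: read an exporter's bytes through the buffer protocol.
   `obj` stands in for any exporter, e.g. the old GpuCommCliqueId. */
static int dump_buffer(PyObject *obj) {
    Py_buffer view;
    Py_ssize_t i;
    /* Invokes the exporter's __getbuffer__; fails once the protocol is gone. */
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0)
        return -1;
    for (i = 0; i < view.len; i++)
        printf("%02x", ((unsigned char *)view.buf)[i]);
    printf("\n");
    PyBuffer_Release(&view);  /* pairs with __releasebuffer__ */
    return 0;
}
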
13 changes: 3 additions & 10 deletions pygpu/tests/test_collectives.py
@@ -88,13 +88,6 @@ def test_richcmp(self):
         with self.assertRaises(TypeError):
             a = cid2 > "asdfasfa"

-    def test_as_buffer(self):
-        a = np.asarray(self.cid)
-        assert np.allclose(a, self.cid.comm_id)
-        a[:] = [ord(b'a')] * COMM_ID_BYTES
-        assert np.allclose(a, self.cid.comm_id)
-
-
 @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module")
 @unittest.skipIf(get_user_gpu_rank() == -1, "Collective operations supported on CUDA devices only")
 class TestGpuComm(unittest.TestCase):
@@ -293,19 +286,19 @@ def test_all_gather(self):

         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2 * self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
         check_all(resgpu, exp)

         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2, self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
         check_all(resgpu, exp)

         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2, 1, 1, self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
         check_all(resgpu, exp)
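
The order='F' fix matters because cpu.reshape(..., order='F') hands asarray a Fortran-ordered source, and without an explicit order the device copy is not guaranteed to keep that layout, so the gathered bytes no longer line up with the Fortran-built expected arrays even though the shapes agree. A rough plain-C illustration of how the two layouts place the same (i, j) element at different byte offsets (the 5x2 int shape mirrors the test; everything else is illustrative):

#include <stddef.h>
#include <stdio.h>

int main(void) {
    size_t rows = 5, cols = 2, item = sizeof(int);
    /* C (row-major) strides vs. Fortran (column-major) strides, in bytes. */
    size_t c_strides[2] = {cols * item, item};   /* {8, 4}  */
    size_t f_strides[2] = {item, rows * item};   /* {4, 20} */
    size_t i = 3, j = 1;
    printf("C order offset of (3,1): %zu\n", i * c_strides[0] + j * c_strides[1]);
    printf("F order offset of (3,1): %zu\n", i * f_strides[0] + j * f_strides[1]);
    return 0;
}
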
12 changes: 8 additions & 4 deletions src/cluda_cuda.h
@@ -65,10 +65,14 @@ struct ga_half {
   ga_ushort data;
 };

-#define ga_half2float(p) __half2float((p).data)
-__device__ static inline ga_half ga_float2half(float f) {
+static __device__ inline float ga_half2float(ga_half h) {
+  float r;
+  asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(r) : "h"(h.data));
+  return r;
+}
+static __device__ inline ga_half ga_float2half(float f) {
   ga_half r;
-  r.data = __float2half_rn(f);
+  asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(r.data) : "f"(f));
   return r;
 }

@@ -142,7 +146,7 @@ __device__ ga_half atom_add_eg(ga_half *addr, ga_half val) {
   do {
     assumed = old;
     tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410);
-    sum = __float2half_rn(__half2float(val.data) + __half2float(tmp.data));
+    sum = ga_float2half(ga_half2float(val) + ga_half2float(tmp)).data;
     new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
     old = atomicCAS(base, assumed, new_);
   } while (assumed != old);
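
Background for these hunks: CUDA 9.0 reworked __half into an opaque C++ class, so the conversion intrinsics no longer accept the raw ga_ushort payload that ga_half carries; the replacement converts through inline PTX (cvt.f32.f16 and cvt.rn.f16.f32) directly on the 16-bit value. The atom_add_eg loop keeps emulating a 16-bit atomic add by splicing one half of an aligned 32-bit word with __byte_perm and retrying via atomicCAS, now routed through the new conversion helpers. A self-contained CUDA sketch of the same conversion trick, with a hypothetical kernel that is not part of the patch:

#include <cstdio>

// Same PTX-based conversions as cluda_cuda.h, on a bare 16-bit payload.
__device__ static inline float h2f(unsigned short h) {
  float r;
  asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(r) : "h"(h));
  return r;
}
__device__ static inline unsigned short f2h(float f) {
  unsigned short r;
  asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(r) : "f"(f));
  return r;
}

// Hypothetical kernel: add 1.0f to every half-precision element.
__global__ void add_one(unsigned short *data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] = f2h(h2f(data[i]) + 1.0f);
}
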
607 changes: 309 additions & 298 deletions src/cluda_cuda.h.c

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/gpuarray/ext_cuda.h
@@ -2,6 +2,7 @@
 #define LIBGPU_EXT_CUDA

 #include <cuda.h>
+#include <cuda_fp16.h>

 #include <gpuarray/config.h>
 #include <gpuarray/buffer.h>
13 changes: 5 additions & 8 deletions src/gpuarray_buffer_cuda.c
@@ -2,6 +2,7 @@

 #include "private.h"
 #include "private_cuda.h"
+
 #include "loaders/libnvrtc.h"
 #include "loaders/libcublas.h"

@@ -1087,9 +1088,11 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
   size_t buflen;
   const char *heads[1] = {"cluda.h"};
   const char *hsrc[1];
-  const char *opts[4] = {
+  const char *opts[] = {
     "-arch", ""
+#ifdef DEBUG
     , "-G", "-lineinfo"
+#endif
   };
   nvrtcResult err;

@@ -1100,13 +1103,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
   if (err != NVRTC_SUCCESS)
     return error_nvrtc(ctx->err, "nvrtcCreateProgram", err);

-  err = nvrtcCompileProgram(prog,
-#ifdef DEBUG
-                            4,
-#else
-                            2,
-#endif
-                            opts);
+  err = nvrtcCompileProgram(prog, sizeof(opts)/sizeof(char *), opts);

   /* Get the log before handling the error */
   if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) {
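
The nvrtcCompileProgram change replaces the hard-coded 4-versus-2 count with sizeof(opts)/sizeof(char *), so the option count can never drift out of sync with the array now that the "-G"/"-lineinfo" entries exist only under DEBUG. A minimal standalone NVRTC invocation using the same idiom might look as follows; the kernel source, program name, and architecture value are made up (mirroring the two-slot "-arch" form above), and error checking is trimmed to the log fetch the patch also performs:

#include <nvrtc.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const char *src = "__global__ void k(float *x) { x[0] = 42.0f; }";
  const char *opts[] = {
    "-arch", "compute_30"
#ifdef DEBUG
    , "-G", "-lineinfo"
#endif
  };
  nvrtcProgram prog;
  size_t logsize;

  nvrtcCreateProgram(&prog, src, "k.cu", 0, NULL, NULL);
  /* The count follows the array: 2 normally, 4 when DEBUG is defined. */
  nvrtcResult err = nvrtcCompileProgram(prog, sizeof(opts)/sizeof(char *), opts);
  if (nvrtcGetProgramLogSize(prog, &logsize) == NVRTC_SUCCESS && logsize > 1) {
    char *log = malloc(logsize);
    nvrtcGetProgramLog(prog, log);
    fprintf(stderr, "%s", log);
    free(log);
  }
  nvrtcDestroyProgram(&prog);
  return err != NVRTC_SUCCESS;
}
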
5 changes: 2 additions & 3 deletions src/gpuarray_elemwise.c
@@ -547,7 +547,6 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
 static int check_contig(GpuElemwise *ge, void **args,
                         size_t *_n, int *contig) {
   GpuArray *a = NULL, *v;
-  gpucontext *ctx = GpuKernel_context(&ge->k_contig);
   size_t n = 1;
   unsigned int i, j;
   int c_contig = 1, f_contig = 1;
@@ -563,10 +562,10 @@
     f_contig &= GpuArray_IS_F_CONTIGUOUS(v);
     if (a != v) {
       if (a->nd != v->nd)
-        return error_fmt(ctx->err, GA_INVALID_ERROR, "Mismatched nd for input %u (expected %u, got %u)", i, a->nd, v->nd);
+        return -1; /* We don't check the value of the error code */
       for (j = 0; j < a->nd; j++) {
         if (v->dimensions[j] != a->dimensions[j])
-          return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u (expected %" SPREFIX "u, got %" SPREFIX "u)", j, a->dimensions[j], v->dimensions[j]);
+          return -1; /* We don't check the value of the error code */
       }
     }
   }
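
Dropping the error_fmt() calls works because, as the new inline comments say, no caller inspects the error code: check_contig is a yes/no probe whose failure just routes execution to the general strided kernel, so building a formatted error string was wasted work on a hot path (and the removed ctx variable existed only to feed it). A generic C sketch of that dispatch pattern, with hypothetical names throughout:

#include <stddef.h>

static int run_contig(size_t n) { (void)n; return 0; }  /* fast path stub */
static int run_strided(void)    { return 0; }           /* fallback stub  */

/* Like check_contig after the patch: -1 only means "take the slow path";
   no caller ever inspects a detailed error code. */
static int probe(const size_t *da, unsigned nda,
                 const size_t *db, unsigned ndb, size_t *n) {
  unsigned j;
  if (nda != ndb)
    return -1;
  *n = 1;
  for (j = 0; j < nda; j++) {
    if (da[j] != db[j])
      return -1;
    *n *= da[j];
  }
  return 0;
}

int dispatch(const size_t *da, const size_t *db, unsigned nd) {
  size_t n;
  if (probe(da, nd, db, nd, &n) == 0)
    return run_contig(n);   /* contiguous kernel */
  return run_strided();     /* general fallback  */
}
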
1 change: 0 additions & 1 deletion tests/check_collectives.c
@@ -49,7 +49,6 @@ extern void teardown_comm(void);
   int(*EXP)[(outcols)]; \
   size_t indims[ND]; \
   size_t outdims[ND]; \
-  const ssize_t instrds[ND] = {sizeof(*A), sizeof(int)}; \
   const ssize_t outstrds[ND] = {sizeof(*RES), sizeof(int)}; \
   int err; \
   size_t i, j, outsize; \
