-
Notifications
You must be signed in to change notification settings - Fork 0
/
first.py
36 lines (26 loc) · 796 Bytes
/
first.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import numpy as np
from numba import cuda
@cuda.jit
def square_array(array):
# Get the thread ID
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
# Get the block size
block_size = cuda.blockDim.x
# Get the grid size
grid_size = cuda.gridDim.x
start = tx + ty * block_size
stride = block_size * grid_size
for i in range(start, array.size, stride):
array[i] *= array[i]
# Create an array of 100 elements
arr = np.array([1, 2, 3, 4, 5], dtype=np.float32)
array = cuda.to_device(arr)
threadsperblock = 32
blockspergrid = (array.size + (threadsperblock - 1)) // threadsperblock
# Launch the kernel
square_array[blockspergrid, threadsperblock](array)
# Copy the result back to the host
result = array.copy_to_host()
# Print the result
print(result)