Parallel implementation using dask.delayed
import numpy as np
from numba import cuda
import dask
# Launch configuration for the element-wise increment demo.
n = 2**30          # total number of array elements
n_gpu = 2          # number of GPUs the work is split across
n_thread = 2**10   # threads per block (second launch parameter of the kernel)
# n_thread must be defined before it is used here; the total thread count
# n_block * n_thread equals n, i.e. one thread per element.
n_block = n // n_thread
# Host-side working array: n elements of int8, all initialised to zero.
a = np.zeros(shape=n, dtype=np.int8)
@dask.delayed
def plus(idx_gpu, a, n_block, n_thread):
    """Run the ``PLUS`` kernel over ``a`` on one GPU.

    Because of ``@dask.delayed``, calling this function only builds a lazy
    task; the body executes when the task is handed to ``dask.compute``.

    Args:
        idx_gpu: Index of the CUDA device to select for this task.
        a: Array passed to the kernel; it is incremented in place.
        n_block: Number of blocks in the launch grid.
        n_thread: Number of threads per block.
    """
    cuda.select_device(idx_gpu)
    try:
        PLUS[n_block, n_thread](a)
    finally:
        # Always release the context, even if the launch raises, so a failed
        # task does not leave the device bound to this worker thread.
        cuda.close()
@cuda.jit
def PLUS(a):
    """CUDA kernel: add one to each element of the 1-D array ``a`` in place.

    Each thread handles the single element at its own global 1-D index.
    """
    tid = cuda.grid(1)
    # Threads whose index falls past the end of the array must do nothing;
    # without this guard a launch with more threads than elements would
    # write out of bounds.
    if tid < a.size:
        a[tid] += 1
def taskGenerator(a, n_gpu, n_block, n_thread):
    """Build one delayed ``plus`` task per GPU over contiguous slices of ``a``.

    The array is cut into ``n_gpu`` equal slices of ``len(a) // n_gpu``
    elements, and each task receives ``n_block // n_gpu`` blocks. The split
    follows ``n_gpu`` rather than a fixed factor of two, so it stays correct
    for any device count (and is unchanged for ``n_gpu == 2``).

    Note: when ``len(a)`` is not a multiple of ``n_gpu`` the trailing
    remainder elements are not assigned to any task.

    Args:
        a: Array to process; slices of it are handed to the tasks.
        n_gpu: Number of GPUs (and therefore tasks).
        n_block: Total number of blocks across all GPUs.
        n_thread: Threads per block, forwarded unchanged to every task.

    Returns:
        A list of ``n_gpu`` delayed tasks (empty when ``n_gpu < 1``).
    """
    if n_gpu < 1:
        # Mirrors the empty result of iterating over range(n_gpu) and avoids
        # dividing by zero below.
        return []

    chunk = len(a) // n_gpu
    blocks_per_gpu = n_block // n_gpu
    return [
        plus(
            idx_gpu,
            a[idx_gpu * chunk:(idx_gpu + 1) * chunk],
            blocks_per_gpu,
            n_thread,
        )
        for idx_gpu in range(n_gpu)
    ]
print(a)
task = taskGenerator(a, n_gpu, n_block, n_thread)
%time dask.compute(*task)
print(a)
if a.all():
print("\nOK!")
else:
print("\nNO!")
[0 0 0 ... 0 0 0]
CPU times: user 615 ms, sys: 906 ms, total: 1.52 s
Wall time: 924 ms
[1 1 1 ... 1 1 1]
OK!
PREVIOUSEtc