Dask example code

Parallel implementation using dask.delayed

import numpy as np
from numba import cuda
import dask

# Problem size and CUDA launch geometry.
n = 2**30         # total number of elements in `a`
n_gpu = 2         # number of GPUs the work is split across
n_thread = 2**10  # threads per block
# n_thread must be assigned before this line (it was previously used before
# its definition, which raises NameError on a fresh run).
# Blocks needed so that n_block * n_thread == n.
n_block = n // n_thread
a = np.zeros(n, dtype=np.int8)


@dask.delayed
def plus(idx_gpu, a, n_block, n_thread):
    """Run the ``PLUS`` kernel on ``a`` using GPU ``idx_gpu``.

    The function is wrapped in ``dask.delayed``, so calling it only creates
    a lazy task; the body executes when that task is computed.

    Args:
        idx_gpu: Index of the CUDA device to select for this task.
        a: Array passed to the ``PLUS`` kernel.
        n_block: Number of blocks in the launch grid.
        n_thread: Number of threads per block.
    """
    cuda.select_device(idx_gpu)
    try:
        PLUS[n_block, n_thread](a)
    finally:
        # Always release the context opened by select_device, even when the
        # kernel launch raises, so a failed task does not leak it.
        cuda.close()
    
@cuda.jit
def PLUS(a):
    """CUDA kernel: add 1 to the element of ``a`` at this thread's index.

    Each thread handles the single element whose index equals its absolute
    position in the 1-D launch grid.
    """
    tid = cuda.grid(1)
    # The launch grid may hold more threads than `a` has elements; threads
    # past the end must not write out of bounds.
    if tid < a.size:
        a[tid] += 1

def taskGenerator(a, n_gpu, n_block, n_thread):
    """Build one delayed ``plus`` task per GPU, each on its own slice of ``a``.

    ``a`` is cut into ``n_gpu`` contiguous, equally sized slices and the
    block count is divided the same way. The split previously assumed
    exactly two GPUs; it now follows ``n_gpu`` (results for ``n_gpu == 2``
    are unchanged).

    Args:
        a: Array to process; it is sliced, not copied, before being passed on.
        n_gpu: Number of GPUs, i.e. number of tasks to create.
        n_block: Total number of blocks across all GPUs.
        n_thread: Number of threads per block.

    Returns:
        A list with ``n_gpu`` tasks, in GPU-index order. Empty when
        ``n_gpu`` is not positive.

    Note:
        When ``len(a)`` is not a multiple of ``n_gpu`` the trailing
        ``len(a) % n_gpu`` elements are not assigned to any task.
    """
    if n_gpu <= 0:
        # range(n_gpu) would be empty anyway; also avoids dividing by zero.
        return []

    chunk = len(a) // n_gpu
    blocks_per_gpu = n_block // n_gpu

    return [plus(idx_gpu,
                 a[idx_gpu * chunk:(idx_gpu + 1) * chunk],
                 blocks_per_gpu, n_thread)
            for idx_gpu in range(n_gpu)]
    
print(a)
task = taskGenerator(a, n_gpu, n_block, n_thread)
%time dask.compute(*task)
print(a)

if a.all():
    print("\nOK!")
else:
    print("\nNO!")

[0 0 0 ... 0 0 0]
CPU times: user 615 ms, sys: 906 ms, total: 1.52 s
Wall time: 924 ms
[1 1 1 ... 1 1 1]

OK!

PREVIOUS: Etc