Reduction on SYCL-supported Devices#
Numba-dpex does not yet provide a specific decorator for implementing
reduction kernels. However, a reduction can be written explicitly. This
section presents two approaches for writing a reduction kernel as a
numba_dpex.kernel function.
Example 1#
This example demonstrates a summation reduction on a one-dimensional array.
The full example can be found at numba_dpex/examples/sum_reduction.py.
To reduce the array, the kernel is invoked multiple times, halving the number
of elements to be summed on each pass.
import math

import numpy as np
import numba_dpex as ndpx


@ndpx.kernel
def sum_reduction_kernel(A, R, stride):
    i = ndpx.get_global_id(0)
    # sum two elements
    R[i] = A[i] + A[i + stride]
    # store the sum to be used in the next iteration
    A[i] = R[i]


def sum_reduce(A):
    """Size of A should be a power of two."""
    total = len(A)
    # at most half the size of A is needed to store the partial sums
    R = np.array(np.random.random(math.floor(total / 2)), dtype=A.dtype)

    while total > 1:
        global_size = total // 2
        total = total - global_size
        sum_reduction_kernel[ndpx.Range(global_size)](A, R, total)

    return R[0]
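
A minimal driver for the code above might look like the following sketch. It
is not part of the original example: the array size and dtype are arbitrary
choices, and it assumes the kernel accepts NumPy arrays as shown in the
listing.

# Hypothetical driver (not part of the original example); the input length
# must be a power of two, as sum_reduce requires.
import numpy as np

N = 1024  # arbitrary power-of-two size chosen for illustration
A = np.random.random(N).astype(np.float32)
expected = A.sum()  # compute before the call: sum_reduce mutates A in place
result = sum_reduce(A)
print("Result:", result, "Expected:", expected)
assert np.isclose(result, expected, rtol=1e-4)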
Example 2#
This example performs a tree reduction in local (work-group) memory and then
recursively reduces the per-group partial sums. The full example can be found
at numba_dpex/examples/sum_reduction_recursive_ocl.py.
from numba import int32

import dpctl.tensor as dpt
import numba_dpex as ndpx


@ndpx.kernel
def sum_reduction_kernel(A, input_size, partial_sums):
    local_id = ndpx.get_local_id(0)
    global_id = ndpx.get_global_id(0)
    group_size = ndpx.get_local_size(0)
    group_id = ndpx.get_group_id(0)

    local_sums = ndpx.local.array(64, int32)

    # copy this work item's element into local memory; out-of-range
    # work items contribute zero
    local_sums[local_id] = 0
    if global_id < input_size:
        local_sums[local_id] = A[global_id]

    # tree reduction inside the work group: halve the stride each step
    stride = group_size // 2
    while stride > 0:
        # wait for all work items to finish the previous addition step
        ndpx.barrier(ndpx.LOCAL_MEM_FENCE)
        # add pairs of elements that are `stride` apart
        if local_id < stride:
            local_sums[local_id] += local_sums[local_id + stride]
        stride >>= 1

    # the first work item writes the group's partial sum
    if local_id == 0:
        partial_sums[group_id] = local_sums[0]


def sum_recursive_reduction(size, group_size, Dinp, Dpartial_sums):
    result = 0
    nb_work_groups = 0
    passed_size = size

    if size <= group_size:
        nb_work_groups = 1
    else:
        nb_work_groups = size // group_size
        if size % group_size != 0:
            nb_work_groups += 1
            passed_size = nb_work_groups * group_size

    gr = ndpx.Range(passed_size)
    lr = ndpx.Range(group_size)

    sum_reduction_kernel[ndpx.NdRange(gr, lr)](Dinp, size, Dpartial_sums)

    if nb_work_groups <= group_size:
        # a single additional pass reduces the partial sums
        sum_reduction_kernel[ndpx.NdRange(lr, lr)](
            Dpartial_sums, nb_work_groups, Dinp
        )
        result = int(Dinp[0])
    else:
        # too many partial sums remain: recurse with the roles of the
        # input and partial-sum arrays swapped
        result = sum_recursive_reduction(
            nb_work_groups, group_size, Dpartial_sums, Dinp
        )

    return result


def sum_reduce(A):
    global_size = len(A)
    work_group_size = 64
    # each work group produces one partial sum
    nb_work_groups = global_size // work_group_size
    if (global_size % work_group_size) != 0:
        nb_work_groups += 1

    partial_sums = dpt.zeros(nb_work_groups, dtype=A.dtype, device=A.device)

    result = sum_recursive_reduction(
        global_size, work_group_size, A, partial_sums
    )

    return result
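
The snippet below is one possible way to exercise this version end to end. It
is a sketch rather than part of the original example: the array size, the
int32 dtype, and the use of dpctl.tensor to allocate a USM-backed input are
illustrative assumptions.

# Hypothetical driver for the recursive reduction above; the input is
# allocated as a USM-backed dpctl.tensor array so the kernel can access it.
import dpctl.tensor as dpt
import numpy as np

N = 100_000  # arbitrary size chosen for illustration
host_data = np.ones(N, dtype=np.int32)
A = dpt.asarray(host_data)  # copy to the default SYCL device

result = sum_reduce(A)
print("Result:", result)  # expected: 100000
assert result == int(host_data.sum())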
Note
Numba-dpex does not yet provide an analogue to the numba.cuda.reduce
decorator for writing reduction kernels. Such a decorator will be added in a
future release.
Full examples#
numba_dpex/examples/sum_reduction_recursive_ocl.py
numba_dpex/examples/sum_reduction_ocl.py
numba_dpex/examples/sum_reduction.py