DPNP C++ backend kernel library 0.20.0dev0
Data Parallel Extension for NumPy*
common.hpp
1//*****************************************************************************
2// Copyright (c) 2025, Intel Corporation
3// All rights reserved.
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7// - Redistributions of source code must retain the above copyright notice,
8// this list of conditions and the following disclaimer.
9// - Redistributions in binary form must reproduce the above copyright notice,
10// this list of conditions and the following disclaimer in the documentation
11// and/or other materials provided with the distribution.
12// - Neither the name of the copyright holder nor the names of its contributors
13// may be used to endorse or promote products derived from this software
14// without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
20// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26// THE POSSIBILITY OF SUCH DAMAGE.
27//*****************************************************************************
28
29#pragma once
30
31#include <cstddef>
32#include <cstdint>
33#include <utility>
34#include <vector>
35
36#include <sycl/sycl.hpp>
37
38// dpctl tensor headers
39#include "kernels/alignment.hpp"
40#include "kernels/elementwise_functions/common.hpp"
41#include "utils/sycl_utils.hpp"
42
43namespace dpnp::extensions::py_internal::elementwise_common
44{
45using dpctl::tensor::kernels::alignment_utils::
46 disabled_sg_loadstore_wrapper_krn;
47using dpctl::tensor::kernels::alignment_utils::is_aligned;
48using dpctl::tensor::kernels::alignment_utils::required_alignment;
49
50using dpctl::tensor::kernels::elementwise_common::select_lws;
51
52using dpctl::tensor::sycl_utils::sub_group_load;
53using dpctl::tensor::sycl_utils::sub_group_store;
54
62template <typename argT,
63 typename resT1,
64 typename resT2,
65 typename UnaryTwoOutputsOpT,
66 std::uint8_t vec_sz = 4u,
67 std::uint8_t n_vecs = 2u,
68 bool enable_sg_loadstore = true>
69struct UnaryTwoOutputsContigFunctor
70{
71private:
72 const argT *in = nullptr;
73 resT1 *out1 = nullptr;
74 resT2 *out2 = nullptr;
75 std::size_t nelems_;
76
77public:
78 UnaryTwoOutputsContigFunctor(const argT *inp,
79 resT1 *res1,
80 resT2 *res2,
81 const std::size_t n_elems)
82 : in(inp), out1(res1), out2(res2), nelems_(n_elems)
83 {
84 }
85
86 void operator()(sycl::nd_item<1> ndit) const
87 {
88 static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
89 UnaryTwoOutputsOpT op{};
90 /* Each work-item processes vec_sz elements, contiguous in memory */
91 /* NOTE: work-group size must be divisible by sub-group size */
92
93 if constexpr (enable_sg_loadstore &&
94 UnaryTwoOutputsOpT::is_constant::value) {
95            // value of the operator is a known constant
96 constexpr resT1 const_val1 = UnaryTwoOutputsOpT::constant_value1;
97 constexpr resT2 const_val2 = UnaryTwoOutputsOpT::constant_value2;
98
99 auto sg = ndit.get_sub_group();
100 const std::uint16_t sgSize = sg.get_max_local_range()[0];
101
102 const std::size_t base =
103 elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
104 sg.get_group_id()[0] * sgSize);
105 if (base + elems_per_wi * sgSize < nelems_) {
106 static constexpr sycl::vec<resT1, vec_sz> res1_vec(const_val1);
107 static constexpr sycl::vec<resT2, vec_sz> res2_vec(const_val2);
108#pragma unroll
109 for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
110 const std::size_t offset = base + it * sgSize;
111 auto out1_multi_ptr = sycl::address_space_cast<
112 sycl::access::address_space::global_space,
113 sycl::access::decorated::yes>(&out1[offset]);
114 auto out2_multi_ptr = sycl::address_space_cast<
115 sycl::access::address_space::global_space,
116 sycl::access::decorated::yes>(&out2[offset]);
117
118 sub_group_store<vec_sz>(sg, res1_vec, out1_multi_ptr);
119 sub_group_store<vec_sz>(sg, res2_vec, out2_multi_ptr);
120 }
121 }
122 else {
123 const std::size_t lane_id = sg.get_local_id()[0];
124 for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
125 out1[k] = const_val1;
126 out2[k] = const_val2;
127 }
128 }
129 }
130 else if constexpr (enable_sg_loadstore &&
131 UnaryTwoOutputsOpT::supports_sg_loadstore::value &&
132 UnaryTwoOutputsOpT::supports_vec::value &&
133 (vec_sz > 1))
134 {
135 auto sg = ndit.get_sub_group();
136 const std::uint16_t sgSize = sg.get_max_local_range()[0];
137
138 const std::size_t base =
139 elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
140 sg.get_group_id()[0] * sgSize);
141 if (base + elems_per_wi * sgSize < nelems_) {
142#pragma unroll
143 for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
144 const std::size_t offset = base + it * sgSize;
145 auto in_multi_ptr = sycl::address_space_cast<
146 sycl::access::address_space::global_space,
147 sycl::access::decorated::yes>(&in[offset]);
148 auto out1_multi_ptr = sycl::address_space_cast<
149 sycl::access::address_space::global_space,
150 sycl::access::decorated::yes>(&out1[offset]);
151 auto out2_multi_ptr = sycl::address_space_cast<
152 sycl::access::address_space::global_space,
153 sycl::access::decorated::yes>(&out2[offset]);
154
155 const sycl::vec<argT, vec_sz> x =
156 sub_group_load<vec_sz>(sg, in_multi_ptr);
157 sycl::vec<resT2, vec_sz> res2_vec = {};
158 const sycl::vec<resT1, vec_sz> res1_vec = op(x, res2_vec);
159 sub_group_store<vec_sz>(sg, res1_vec, out1_multi_ptr);
160 sub_group_store<vec_sz>(sg, res2_vec, out2_multi_ptr);
161 }
162 }
163 else {
164 const std::size_t lane_id = sg.get_local_id()[0];
165 for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
166 // scalar call
167 out1[k] = op(in[k], out2[k]);
168 }
169 }
170 }
171 else if constexpr (enable_sg_loadstore &&
172 UnaryTwoOutputsOpT::supports_sg_loadstore::value &&
173 std::is_same_v<resT1, argT>)
174 {
175 // default: use scalar-value function
176
177 auto sg = ndit.get_sub_group();
178 const std::uint16_t sgSize = sg.get_max_local_range()[0];
179 const std::size_t base =
180 elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
181 sg.get_group_id()[0] * sgSize);
182
183 if (base + elems_per_wi * sgSize < nelems_) {
184#pragma unroll
185 for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
186 const std::size_t offset = base + it * sgSize;
187 auto in_multi_ptr = sycl::address_space_cast<
188 sycl::access::address_space::global_space,
189 sycl::access::decorated::yes>(&in[offset]);
190 auto out1_multi_ptr = sycl::address_space_cast<
191 sycl::access::address_space::global_space,
192 sycl::access::decorated::yes>(&out1[offset]);
193 auto out2_multi_ptr = sycl::address_space_cast<
194 sycl::access::address_space::global_space,
195 sycl::access::decorated::yes>(&out2[offset]);
196
197 sycl::vec<argT, vec_sz> arg_vec =
198 sub_group_load<vec_sz>(sg, in_multi_ptr);
199 sycl::vec<resT2, vec_sz> res2_vec = {};
200#pragma unroll
201 for (std::uint32_t k = 0; k < vec_sz; ++k) {
202 arg_vec[k] = op(arg_vec[k], res2_vec[k]);
203 }
204 sub_group_store<vec_sz>(sg, arg_vec, out1_multi_ptr);
205 sub_group_store<vec_sz>(sg, res2_vec, out2_multi_ptr);
206 }
207 }
208 else {
209 const std::size_t lane_id = sg.get_local_id()[0];
210 for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
211 out1[k] = op(in[k], out2[k]);
212 }
213 }
214 }
215 else if constexpr (enable_sg_loadstore &&
216 UnaryTwoOutputsOpT::supports_sg_loadstore::value)
217 {
218 // default: use scalar-value function
219
220 auto sg = ndit.get_sub_group();
221 const std::uint16_t sgSize = sg.get_max_local_range()[0];
222 const std::size_t base =
223 elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
224 sg.get_group_id()[0] * sgSize);
225
226 if (base + elems_per_wi * sgSize < nelems_) {
227#pragma unroll
228 for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
229 const std::size_t offset = base + it * sgSize;
230 auto in_multi_ptr = sycl::address_space_cast<
231 sycl::access::address_space::global_space,
232 sycl::access::decorated::yes>(&in[offset]);
233 auto out1_multi_ptr = sycl::address_space_cast<
234 sycl::access::address_space::global_space,
235 sycl::access::decorated::yes>(&out1[offset]);
236 auto out2_multi_ptr = sycl::address_space_cast<
237 sycl::access::address_space::global_space,
238 sycl::access::decorated::yes>(&out2[offset]);
239
240 const sycl::vec<argT, vec_sz> arg_vec =
241 sub_group_load<vec_sz>(sg, in_multi_ptr);
242 sycl::vec<resT1, vec_sz> res1_vec = {};
243 sycl::vec<resT2, vec_sz> res2_vec = {};
244#pragma unroll
245 for (std::uint8_t k = 0; k < vec_sz; ++k) {
246 res1_vec[k] = op(arg_vec[k], res2_vec[k]);
247 }
248 sub_group_store<vec_sz>(sg, res1_vec, out1_multi_ptr);
249 sub_group_store<vec_sz>(sg, res2_vec, out2_multi_ptr);
250 }
251 }
252 else {
253 const std::size_t lane_id = sg.get_local_id()[0];
254 for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
255 out1[k] = op(in[k], out2[k]);
256 }
257 }
258 }
259 else {
260 const std::uint16_t sgSize =
261 ndit.get_sub_group().get_local_range()[0];
262 const std::size_t gid = ndit.get_global_linear_id();
263 const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
264
265 const std::size_t start =
266 (gid / sgSize) * (elems_per_sg - sgSize) + gid;
267 const std::size_t end = std::min(nelems_, start + elems_per_sg);
268 for (std::size_t offset = start; offset < end; offset += sgSize) {
269 out1[offset] = op(in[offset], out2[offset]);
270 }
271 }
272 }
273};
274
282template <typename argT,
283 typename resT1,
284 typename resT2,
285 typename IndexerT,
286 typename UnaryTwoOutputsOpT>
287struct UnaryTwoOutputsStridedFunctor
288{
289private:
290 const argT *inp_ = nullptr;
291 resT1 *res1_ = nullptr;
292 resT2 *res2_ = nullptr;
293 IndexerT inp_out_indexer_;
294
295public:
296 UnaryTwoOutputsStridedFunctor(const argT *inp_p,
297 resT1 *res1_p,
298 resT2 *res2_p,
299 const IndexerT &inp_out_indexer)
300 : inp_(inp_p), res1_(res1_p), res2_(res2_p),
301 inp_out_indexer_(inp_out_indexer)
302 {
303 }
304
305 void operator()(sycl::id<1> wid) const
306 {
307 const auto &offsets_ = inp_out_indexer_(wid.get(0));
308 const ssize_t &inp_offset = offsets_.get_first_offset();
309 const ssize_t &res1_offset = offsets_.get_second_offset();
310 const ssize_t &res2_offset = offsets_.get_third_offset();
311
312 UnaryTwoOutputsOpT op{};
313
314 res1_[res1_offset] = op(inp_[inp_offset], res2_[res2_offset]);
315 }
316};
317
325template <typename argTy,
326 template <typename T>
327 class UnaryTwoOutputsType,
328 template <typename A,
329 typename R1,
330 typename R2,
331 std::uint8_t vs,
332 std::uint8_t nv,
333 bool enable>
334 class UnaryTwoOutputsContigFunctorT,
335 template <typename A,
336 typename R1,
337 typename R2,
338 std::uint8_t vs,
339 std::uint8_t nv>
340 class kernel_name,
341 std::uint8_t vec_sz = 4u,
342 std::uint8_t n_vecs = 2u>
343sycl::event
344 unary_two_outputs_contig_impl(sycl::queue &exec_q,
345 std::size_t nelems,
346 const char *arg_p,
347 char *res1_p,
348 char *res2_p,
349 const std::vector<sycl::event> &depends = {})
350{
351 static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
352 const std::size_t n_work_items_needed = nelems / elems_per_wi;
353 const std::size_t lws =
354 select_lws(exec_q.get_device(), n_work_items_needed);
355
356 const std::size_t n_groups =
357 ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi));
358 const auto gws_range = sycl::range<1>(n_groups * lws);
359 const auto lws_range = sycl::range<1>(lws);
360
361 using resTy1 = typename UnaryTwoOutputsType<argTy>::value_type1;
362 using resTy2 = typename UnaryTwoOutputsType<argTy>::value_type2;
363 using BaseKernelName = kernel_name<argTy, resTy1, resTy2, vec_sz, n_vecs>;
364
365 const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
366 resTy1 *res1_tp = reinterpret_cast<resTy1 *>(res1_p);
367 resTy2 *res2_tp = reinterpret_cast<resTy2 *>(res2_p);
368
369 sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
370 cgh.depends_on(depends);
371
372 if (is_aligned<required_alignment>(arg_p) &&
373 is_aligned<required_alignment>(res1_p) &&
374 is_aligned<required_alignment>(res2_p))
375 {
376 static constexpr bool enable_sg_loadstore = true;
377 using KernelName = BaseKernelName;
378 using Impl =
379 UnaryTwoOutputsContigFunctorT<argTy, resTy1, resTy2, vec_sz,
380 n_vecs, enable_sg_loadstore>;
381
382 cgh.parallel_for<KernelName>(
383 sycl::nd_range<1>(gws_range, lws_range),
384 Impl(arg_tp, res1_tp, res2_tp, nelems));
385 }
386 else {
387 static constexpr bool disable_sg_loadstore = false;
388 using KernelName =
389 disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
390 using Impl =
391 UnaryTwoOutputsContigFunctorT<argTy, resTy1, resTy2, vec_sz,
392 n_vecs, disable_sg_loadstore>;
393
394 cgh.parallel_for<KernelName>(
395 sycl::nd_range<1>(gws_range, lws_range),
396 Impl(arg_tp, res1_tp, res2_tp, nelems));
397 }
398 });
399
400 return comp_ev;
401}
402
410template <typename argTy,
411 template <typename T>
412 class UnaryTwoOutputsType,
413 template <typename A, typename R1, typename R2, typename I>
414 class UnaryTwoOutputsStridedFunctorT,
415 template <typename A, typename R1, typename R2, typename I>
416 class kernel_name>
417sycl::event unary_two_outputs_strided_impl(
418 sycl::queue &exec_q,
419 std::size_t nelems,
420 int nd,
421 const ssize_t *shape_and_strides,
422 const char *arg_p,
423 ssize_t arg_offset,
424 char *res1_p,
425 ssize_t res1_offset,
426 char *res2_p,
427 ssize_t res2_offset,
428 const std::vector<sycl::event> &depends,
429 const std::vector<sycl::event> &additional_depends)
430{
431 sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
432 cgh.depends_on(depends);
433 cgh.depends_on(additional_depends);
434
435 using res1Ty = typename UnaryTwoOutputsType<argTy>::value_type1;
436 using res2Ty = typename UnaryTwoOutputsType<argTy>::value_type2;
437 using IndexerT =
438 typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
439
440 const IndexerT indexer{nd, arg_offset, res1_offset, res2_offset,
441 shape_and_strides};
442
443 const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
444 res1Ty *res1_tp = reinterpret_cast<res1Ty *>(res1_p);
445 res2Ty *res2_tp = reinterpret_cast<res2Ty *>(res2_p);
446
447 using Impl =
448 UnaryTwoOutputsStridedFunctorT<argTy, res1Ty, res2Ty, IndexerT>;
449
450 cgh.parallel_for<kernel_name<argTy, res1Ty, res2Ty, IndexerT>>(
451 {nelems}, Impl(arg_tp, res1_tp, res2_tp, indexer));
452 });
453 return comp_ev;
454}
455
456// Typedefs for function pointers
457
458typedef sycl::event (*unary_two_outputs_contig_impl_fn_ptr_t)(
459 sycl::queue &,
460 std::size_t,
461 const char *,
462 char *,
463 char *,
464 const std::vector<sycl::event> &);
465
466typedef sycl::event (*unary_two_outputs_strided_impl_fn_ptr_t)(
467 sycl::queue &,
468 std::size_t,
469 int,
470 const ssize_t *,
471 const char *,
472 ssize_t,
473 char *,
474 ssize_t,
475 char *,
476 ssize_t,
477 const std::vector<sycl::event> &,
478 const std::vector<sycl::event> &);
479
480} // namespace dpnp::extensions::py_internal::elementwise_common
Functor for evaluation of a unary function with two output arrays on contiguous arrays.
Definition common.hpp:70
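A type passed as UnaryTwoOutputsOpT is expected to provide the trait types is_constant, supports_sg_loadstore and supports_vec (plus constant_value1/constant_value2 when is_constant is std::true_type), a scalar operator() that returns the first result and writes the second one through a reference, and, if supports_vec is true, a sycl::vec overload. The following is a minimal sketch of such an operator, assuming a sincos-style computation where argT, resT1 and resT2 are the same real floating-point type; SinCosOp is an illustrative name, not part of this header.

#include <type_traits>

#include <sycl/sycl.hpp>

// Hypothetical operator sketch: computes sin(x) as the first result and
// cos(x) as the second one.
template <typename argT, typename resT1, typename resT2>
struct SinCosOp
{
    // traits queried by UnaryTwoOutputsContigFunctor
    using is_constant = std::false_type; // results are not compile-time constants
    using supports_sg_loadstore = std::true_type;
    using supports_vec = std::true_type;

    // scalar call: returns the first result, writes the second through res2
    resT1 operator()(const argT &x, resT2 &res2) const
    {
        res2 = sycl::cos(x);
        return sycl::sin(x);
    }

    // vector call, used when supports_vec is true and vec_sz > 1
    template <int vec_sz>
    sycl::vec<resT1, vec_sz> operator()(const sycl::vec<argT, vec_sz> &x,
                                        sycl::vec<resT2, vec_sz> &res2) const
    {
        res2 = sycl::cos(x);
        return sycl::sin(x);
    }
};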
Functor for evaluation of a unary function with two output arrays on strided data.
Definition common.hpp:288
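Host code instantiates unary_two_outputs_contig_impl (and the strided variant) with a type-map that selects the two result types for each input type, a functor alias that binds a concrete operator into the functor template, and a kernel-name template. A usage sketch under the same assumptions as above; SinCosOutputType, SinCosContigFunctor and sincos_contig_kernel are illustrative names, and the names from dpnp::extensions::py_internal::elementwise_common are assumed to be in scope.

#include <cstdint>

// type-map consumed as UnaryTwoOutputsType<argTy>
template <typename T>
struct SinCosOutputType
{
    using value_type1 = T; // element type of the first result array
    using value_type2 = T; // element type of the second result array
};

// bind the operator into the contiguous functor so the alias matches the
// UnaryTwoOutputsContigFunctorT template template parameter
template <typename argTy,
          typename resTy1,
          typename resTy2,
          std::uint8_t vec_sz,
          std::uint8_t n_vecs,
          bool enable_sg_loadstore>
using SinCosContigFunctor =
    UnaryTwoOutputsContigFunctor<argTy, resTy1, resTy2,
                                 SinCosOp<argTy, resTy1, resTy2>, vec_sz,
                                 n_vecs, enable_sg_loadstore>;

// unique kernel-name template for this operation
template <typename A, typename R1, typename R2, std::uint8_t vs, std::uint8_t nv>
class sincos_contig_kernel;

// host-side dispatch, assuming exec_q, nelems, the USM pointers and depends
// are prepared; the instantiation can also be stored as a
// unary_two_outputs_contig_impl_fn_ptr_t in a per-type dispatch table:
//
// sycl::event ev =
//     unary_two_outputs_contig_impl<float, SinCosOutputType,
//                                   SinCosContigFunctor, sincos_contig_kernel>(
//         exec_q, nelems, arg_p, res1_p, res2_p, depends);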