DPNP C++ backend kernel library 0.18.0dev1
Data Parallel Extension for NumPy*
sliding_window1d.hpp
//*****************************************************************************
// Copyright (c) 2024-2025, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include "utils/math_utils.hpp"
#include <sycl/sycl.hpp>
#include <type_traits>

#include <stdio.h>

#include "ext/common.hpp"

using dpctl::tensor::usm_ndarray;

using ext::common::Align;
using ext::common::CeilDiv;

namespace statistics::sliding_window1d
{

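// _RegistryDataStorage keeps Size elements per work-item in private (register)
// storage together with the owning sub-group, so values can be exchanged
// between work-items via broadcast()/shift_left()/shift_right() without going
// through local or global memory. Logically the storage forms a
// Size x sub-group-width tile: size_y() is the per-item lane count, size_x()
// the sub-group width, and x() the calling work-item's position within it.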
template <typename T, uint32_t Size>
class _RegistryDataStorage
{
public:
    using ncT = typename std::remove_const_t<T>;
    using SizeT = decltype(Size);
    static constexpr SizeT _size = Size;

    _RegistryDataStorage(const sycl::nd_item<1> &item)
        : sbgroup(item.get_sub_group())
    {
    }

    template <typename yT>
    T &operator[](const yT &idx)
    {
        static_assert(std::is_integral_v<yT>,
                      "idx must be of an integral type");
        return data[idx];
    }

    template <typename yT>
    const T &operator[](const yT &idx) const
    {
        static_assert(std::is_integral_v<yT>,
                      "idx must be of an integral type");
        return data[idx];
    }

    T &value()
    {
        static_assert(Size == 1,
                      "Size is not equal to 1. Use value(idx) instead");
        return data[0];
    }

    const T &value() const
    {
        static_assert(Size == 1,
                      "Size is not equal to 1. Use value(idx) instead");
        return data[0];
    }

    template <typename yT, typename xT>
    T broadcast(const yT &y, const xT &x) const
    {
        static_assert(std::is_integral_v<std::remove_reference_t<yT>>,
                      "y must be of an integral type");
        static_assert(std::is_integral_v<std::remove_reference_t<xT>>,
                      "x must be of an integral type");

        return sycl::select_from_group(sbgroup, data[y], x);
    }

    template <typename iT>
    T broadcast(const iT &idx) const
    {
        if constexpr (Size == 1) {
            return broadcast(0, idx);
        }
        else {
            return broadcast(idx / size_x(), idx % size_x());
        }
    }

    template <typename yT, typename xT>
    T shift_left(const yT &y, const xT &x) const
    {
        static_assert(std::is_integral_v<yT>, "y must be of an integral type");
        static_assert(std::is_integral_v<xT>, "x must be of an integral type");

        return sycl::shift_group_left(sbgroup, data[y], x);
    }

    template <typename yT, typename xT>
    T shift_right(const yT &y, const xT &x) const
    {
        static_assert(std::is_integral_v<yT>, "y must be of an integral type");
        static_assert(std::is_integral_v<xT>, "x must be of an integral type");

        return sycl::shift_group_right(sbgroup, data[y], x);
    }

    constexpr SizeT size_y() const
    {
        return _size;
    }

    SizeT size_x() const
    {
        return sbgroup.get_max_local_range()[0];
    }

    SizeT total_size() const
    {
        return size_x() * size_y();
    }

    ncT *ptr()
    {
        return data;
    }

    SizeT x() const
    {
        return sbgroup.get_local_linear_id();
    }

protected:
    const sycl::sub_group sbgroup;
    ncT data[Size];
};

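// RegistryData adds masked fill/load/store primitives on top of the registry
// storage: load()/store() read or write one element per work-item per lane,
// advancing by size_x() between lanes, so a sub-group whose pointers are
// offset by x() covers Size * sub-group-width contiguous elements, and the
// Condition/mask overloads let out-of-range lanes fall back to default_v
// instead of touching memory.
//
// Illustrative device-side sketch (item, src and src_end are placeholders,
// not part of this header):
//
//   auto reg = RegistryData<const float, 4>(item);  // 4 lanes per work-item
//   const float *next = reg.load(
//       src, [src_end](auto &&p) { return p < src_end; }, 0.0f);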
template <typename T, uint32_t Size = 1>
struct RegistryData : public _RegistryDataStorage<T, Size>
{
    using SizeT = typename _RegistryDataStorage<T, Size>::SizeT;

    using _RegistryDataStorage<T, Size>::_RegistryDataStorage;

    template <typename LaneIdT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, SizeT>>>
    void fill_lane(const LaneIdT &lane_id, const T &value, Condition &&mask)
    {
        static_assert(std::is_integral_v<LaneIdT>,
                      "lane_id must be of an integral type");
        if (mask(this->x())) {
            this->data[lane_id] = value;
        }
    }

    template <typename LaneIdT>
    void fill_lane(const LaneIdT &lane_id, const T &value, const bool &mask)
    {
        fill_lane(lane_id, value, [mask](auto &&) { return mask; });
    }

    template <typename LaneIdT>
    void fill_lane(const LaneIdT &lane_id, const T &value)
    {
        fill_lane(lane_id, value, true);
    }

    template <typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, SizeT, SizeT>>>
    void fill(const T &value, Condition &&mask)
    {
        for (SizeT i = 0; i < Size; ++i) {
            fill_lane(i, value, mask(i, this->x()));
        }
    }

    void fill(const T &value)
    {
        fill(value, [](auto &&, auto &&) { return true; });
    }

    template <typename LaneIdT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *load_lane(const LaneIdT &lane_id,
                 const T *const data,
                 Condition &&mask,
                 const T &default_v)
    {
        static_assert(std::is_integral_v<LaneIdT>,
                      "lane_id must be of an integral type");
        this->data[lane_id] = mask(data) ? data[0] : default_v;

        return data + this->size_x();
    }

    template <typename LaneIdT>
    T *load_lane(const LaneIdT &lane_id,
                 const T *const data,
                 const bool &mask,
                 const T &default_v)
    {
        return load_lane(
            lane_id, data, [mask](auto &&) { return mask; }, default_v);
    }

    template <typename LaneIdT>
    T *load_lane(const LaneIdT &lane_id, const T *const data)
    {
        constexpr T default_v = 0;
        return load_lane(lane_id, data, true, default_v);
    }

    template <typename yStrideT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *load(const T *const data,
            const yStrideT &y_stride,
            Condition &&mask,
            const T &default_v)
    {
        auto *it = data;
        for (SizeT i = 0; i < Size; ++i) {
            load_lane(i, it, mask, default_v);
            it += y_stride;
        }

        return it;
    }

    template <typename yStrideT>
    T *load(const T *const data,
            const yStrideT &y_stride,
            const bool &mask,
            const T &default_v)
    {
        return load(
            data, y_stride, [mask](auto &&) { return mask; }, default_v);
    }

    template <typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *load(const T *const data, Condition &&mask, const T &default_v)
    {
        return load(data, this->size_x(), mask, default_v);
    }

    T *load(const T *const data, const bool &mask, const T &default_v)
    {
        return load(
            data, [mask](auto &&) { return mask; }, default_v);
    }

    T *load(const T *const data)
    {
        constexpr T default_v = 0;
        return load(data, true, default_v);
    }

    template <typename LaneIdT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *store_lane(const LaneIdT &lane_id, T *const data, Condition &&mask)
    {
        static_assert(std::is_integral_v<LaneIdT>,
                      "lane_id must be of an integral type");

        if (mask(data)) {
            data[0] = this->data[lane_id];
        }

        return data + this->size_x();
    }

    template <typename LaneIdT>
    T *store_lane(const LaneIdT &lane_id, T *const data, const bool &mask)
    {
        return store_lane(lane_id, data, [mask](auto &&) { return mask; });
    }

    template <typename LaneIdT>
    T *store_lane(const LaneIdT &lane_id, T *const data)
    {
        return store_lane(lane_id, data, true);
    }

    template <typename yStrideT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *store(T *const data, const yStrideT &y_stride, Condition &&condition)
    {
        auto *it = data;
        for (SizeT i = 0; i < Size; ++i) {
            store_lane(i, it, condition);
            it += y_stride;
        }

        return it;
    }

    template <typename yStrideT>
    T *store(T *const data, const yStrideT &y_stride, const bool &mask)
    {
        return store(data, y_stride, [mask](auto &&) { return mask; });
    }

    template <typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *store(T *const data, Condition &&condition)
    {
        return store(data, this->size_x(), condition);
    }

    T *store(T *const data, const bool &mask)
    {
        return store(data, [mask](auto &&) { return mask; });
    }

    T *store(T *const data)
    {
        return store(data, true);
    }
};

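// RegistryWindow treats the registry tile as a sliding window over the input:
// advance_left(shift, fill_value) moves every element shift positions to the
// left across the whole sub-group, pulling the border elements of each lane
// from the next lane via shift_right() and filling the vacated tail of the
// last lane with fill_value.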
template <typename T, uint32_t Size>
struct RegistryWindow : public RegistryData<T, Size>
{
    using SizeT = typename RegistryData<T, Size>::SizeT;

    using RegistryData<T, Size>::RegistryData;

    template <typename shT>
    void advance_left(const shT &shift, const T &fill_value)
    {
        static_assert(std::is_integral_v<shT>,
                      "shift must be of an integral type");

        uint32_t shift_r = this->size_x() - shift;
        for (SizeT i = 0; i < Size; ++i) {
            this->data[i] = this->shift_left(i, shift);
            auto border =
                i < Size - 1 ? this->shift_right(i + 1, shift_r) : fill_value;
            if (this->x() >= shift_r) {
                this->data[i] = border;
            }
        }
    }

    void advance_left(const T &fill_value)
    {
        advance_left(1, fill_value);
    }

    void advance_left()
    {
        constexpr T fill_value = 0;
        advance_left(fill_value);
    }
};

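// Span is a minimal non-owning view over contiguous device-accessible data,
// similar in spirit to std::span: a raw pointer plus an element count, with
// begin()/end() for iteration inside kernels.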
template <typename T, typename SizeT = size_t>
class Span
{
public:
    using value_type = T;
    using size_type = SizeT;

    Span(T *const data, const SizeT size) : data_(data), size_(size) {}

    T *begin() const
    {
        return data();
    }

    T *end() const
    {
        return data() + size();
    }

    SizeT size() const
    {
        return size_;
    }

    T *data() const
    {
        return data_;
    }

protected:
    T *const data_;
    const SizeT size_;
};

template <typename T, typename SizeT = size_t>
Span<T, SizeT> make_span(T *const data, const SizeT size)
{
    return Span<T, SizeT>(data, size);
}

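// PaddedSpan extends Span with a left padding: padded_begin() points pad()
// elements before begin(), so a kernel can start reading ahead of the logical
// origin (e.g. the left boundary region of a correlation) while the load
// masks keep out-of-range lanes at their default value.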
template <typename T, typename SizeT = size_t>
class PaddedSpan : public Span<T, SizeT>
{
public:
    using value_type = T;
    using size_type = SizeT;

    PaddedSpan(T *const data, const SizeT size, const SizeT pad)
        : Span<T, SizeT>(data, size), pad_(pad)
    {
    }

    T *padded_begin() const
    {
        return this->begin() - pad();
    }

    SizeT pad() const
    {
        return pad_;
    }

protected:
    const SizeT pad_;
};

template <typename T, typename SizeT = size_t>
PaddedSpan<T, SizeT>
    make_padded_span(T *const data, const SizeT size, const SizeT offset)
{
    return PaddedSpan<T, SizeT>(data, size, offset);
}

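// process_block accumulates one chunk of the sliding-window reduction: for
// each of the block_size values in v_data it broadcasts that value across the
// sub-group, combines it with the matching a_data lane through op() for every
// pending result, folds it in with red(), and then advances the a-window one
// element to the left.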
template <typename Results,
          typename AData,
          typename VData,
          typename Op,
          typename Red>
void process_block(Results &results,
                   uint32_t r_size,
                   AData &a_data,
                   VData &v_data,
                   uint32_t block_size,
                   Op op,
                   Red red)
{
    for (uint32_t i = 0; i < block_size; ++i) {
        auto v_val = v_data.broadcast(i);
        for (uint32_t r = 0; r < r_size; ++r) {
            results[r] = red(results[r], op(a_data[r], v_val));
        }
        a_data.advance_left();
    }
}

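// get_global_linear_id maps a work-item to the first output element it owns
// when every work-item produces wpi results interleaved at sub-group stride.
// For example, with full 16-wide sub-groups and wpi = 4, the work-item with
// global id 17 (lane 1 of the second sub-group) starts at 4 * 16 + 1 = 65,
// and its results land at offsets 65, 81, 97 and 113.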
template <typename SizeT>
SizeT get_global_linear_id(const uint32_t wpi, const sycl::nd_item<1> &item)
{
    auto sbgroup = item.get_sub_group();
    const auto sg_loc_id = sbgroup.get_local_linear_id();

    const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id);
    const SizeT id = sg_base_id + sg_loc_id;

    return id;
}

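// get_results_num clamps the per-work-item result count near the end of the
// output: it returns at most wpi, reduced to the number of sub-group strides
// that still fit between global_id and size (sub_sat keeps the difference at
// zero when global_id is already past the end).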
template <typename SizeT>
uint32_t get_results_num(const uint32_t wpi,
                         const SizeT size,
                         const SizeT global_id,
                         const sycl::nd_item<1> &item)
{
    auto sbgroup = item.get_sub_group();

    const auto sbg_size = sbgroup.get_max_local_range()[0];
    const auto size_ = sycl::sub_sat(size, global_id);
    return std::min(SizeT(wpi), CeilDiv(size_, sbg_size));
}

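// submit_sliding_window1d records the general sliding-window kernel into an
// existing command group: each work-item accumulates WorkPI outputs while the
// v operand is streamed through the sub-group in chunks of the sub-group
// width, so v may be longer than a single registry load.
//
// Illustrative host-side sketch (q, the USM pointers, sizes, left_pad and
// nd_range below are placeholders, not part of this header):
//
//   q.submit([&](sycl::handler &cgh) {
//       auto a = make_padded_span(a_ptr, a_size, left_pad);  // const float *
//       auto v = make_span(v_ptr, v_size);                   // const float *
//       auto out = make_span(out_ptr, out_size);             // float *
//       submit_sliding_window1d<4>(
//           a, v, [](float x, float y) { return x * y; },
//           [](float x, float y) { return x + y; }, out, nd_range, cgh);
//   });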
template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
class sliding_window1d_kernel;

template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
void submit_sliding_window1d(const PaddedSpan<const T, SizeT> &a,
                             const Span<const T, SizeT> &v,
                             const Op &op,
                             const Red &red,
                             Span<T, SizeT> &out,
                             sycl::nd_range<1> nd_range,
                             sycl::handler &cgh)
{
    cgh.parallel_for<sliding_window1d_kernel<WorkPI, T, SizeT, Op, Red>>(
        nd_range, [=](sycl::nd_item<1> item) {
            auto glid = get_global_linear_id<SizeT>(WorkPI, item);

            auto results = RegistryData<T, WorkPI>(item);
            results.fill(0);

            auto results_num = get_results_num(WorkPI, out.size(), glid, item);

            const auto *a_begin = a.begin();
            const auto *a_end = a.end();

            auto sbgroup = item.get_sub_group();

            const auto chunks_count =
                CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);

            const auto *a_ptr = &a.padded_begin()[glid];

            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
                return ptr >= a_begin && ptr < a_end;
            };

            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
            a_ptr = a_data.load(a_ptr, _a_load_cond, 0);

            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
            auto v_size = v.size();

            for (uint32_t b = 0; b < chunks_count; ++b) {
                auto v_data = RegistryData<const T>(item);
                v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);

                uint32_t chunk_size_ =
                    std::min(v_size, SizeT(v_data.total_size()));
                process_block(results, results_num, a_data, v_data,
                              chunk_size_, op, red);

                if (b != chunks_count - 1) {
                    a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
                                             _a_load_cond, 0);
                    v_size -= v_data.total_size();
                }
            }

            auto *const out_ptr = out.begin();
            // auto *const out_end = out.end();

            auto y_start = glid;
            auto y_stop =
                std::min(y_start + WorkPI * results.size_x(), out.size());
            uint32_t i = 0;
            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
                out_ptr[y] = results[i++];
            }
            // While the code below seems to be valid, inside the correlate
            // kernel it results in memory corruption. Further investigation
            // is needed. SAT-7693
            // results.store(&out_ptr[glid],
            //               [out_end](auto &&ptr) { return ptr < out_end; });
        });
}

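// submit_sliding_window1d_small_kernel is the variant for a short v operand:
// the whole of v is loaded into a single registry chunk up front and handled
// by one process_block() call, avoiding the chunk loop of the general kernel.
// It therefore assumes v fits into one sub-group-wide registry load.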
template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
class sliding_window1d_small_kernel;

template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
void submit_sliding_window1d_small_kernel(const PaddedSpan<const T, SizeT> &a,
                                          const Span<const T, SizeT> &v,
                                          const Op &op,
                                          const Red &red,
                                          Span<T, SizeT> &out,
                                          sycl::nd_range<1> nd_range,
                                          sycl::handler &cgh)
{
    cgh.parallel_for<
        sliding_window1d_small_kernel<WorkPI, T, SizeT, Op, Red>>(
        nd_range, [=](sycl::nd_item<1> item) {
            auto glid = get_global_linear_id<SizeT>(WorkPI, item);

            auto results = RegistryData<T, WorkPI>(item);
            results.fill(0);

            auto sbgroup = item.get_sub_group();
            auto sg_size = sbgroup.get_max_local_range()[0];

            const uint32_t to_read = WorkPI * sg_size + v.size();
            const auto *a_begin = a.begin();

            const auto *a_ptr = &a.padded_begin()[glid];
            const auto *a_end = std::min(a_ptr + to_read, a.end());

            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
                return ptr >= a_begin && ptr < a_end;
            };

            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
            a_data.load(a_ptr, _a_load_cond, 0);

            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
            auto v_size = v.size();

            auto v_data = RegistryData<const T>(item);
            v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);

            auto results_num = get_results_num(WorkPI, out.size(), glid, item);

            process_block(results, results_num, a_data, v_data, v_size, op,
                          red);

            auto *const out_ptr = out.begin();
            // auto *const out_end = out.end();

            auto y_start = glid;
            auto y_stop =
                std::min(y_start + WorkPI * results.size_x(), out.size());
            uint32_t i = 0;
            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
                out_ptr[y] = results[i++];
            }
            // While the code below seems to be valid, inside the correlate
            // kernel it results in memory corruption. Further investigation
            // is needed. SAT-7693
            // results.store(&out_ptr[glid],
            //               [out_end](auto &&ptr) { return ptr < out_end; });
        });
}

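// validate is the host-side argument check for the entry points above. It is
// implemented in the accompanying source file and is expected to verify that
// the a, v and out arrays and the requested left/right padding are mutually
// consistent before a kernel is submitted.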
void validate(const usm_ndarray &a,
              const usm_ndarray &v,
              const usm_ndarray &out,
              const size_t l_pad,
              const size_t r_pad);
} // namespace statistics::sliding_window1d