DPNP C++ backend kernel library 0.18.0dev0
Data Parallel Extension for NumPy*
sliding_window1d.hpp
//*****************************************************************************
// Copyright (c) 2024-2025, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include "utils/math_utils.hpp"
#include <sycl/sycl.hpp>
#include <type_traits>

#include <stdio.h>

#include "common.hpp"

using dpctl::tensor::usm_ndarray;

namespace statistics
{
using common::Align;
using common::CeilDiv;

namespace sliding_window1d
{

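// Private "registry" storage shared by the sliding-window kernels: each
// work-item keeps Size elements (the "y" dimension) in the data[] array, while
// the lanes of its sub-group form the "x" dimension. Cross-lane access goes
// through SYCL sub-group shuffles: broadcast() wraps select_from_group, and
// shift_left()/shift_right() wrap shift_group_left/shift_group_right.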
template <typename T, uint32_t Size>
class _RegistryDataStorage
{
public:
    using ncT = typename std::remove_const_t<T>;
    using SizeT = decltype(Size);
    static constexpr SizeT _size = Size;

    _RegistryDataStorage(const sycl::nd_item<1> &item)
        : sbgroup(item.get_sub_group())
    {
    }

    template <typename yT>
    T &operator[](const yT &idx)
    {
        static_assert(std::is_integral_v<yT>,
                      "idx must be of an integral type");
        return data[idx];
    }

    template <typename yT>
    const T &operator[](const yT &idx) const
    {
        static_assert(std::is_integral_v<yT>,
                      "idx must be of an integral type");
        return data[idx];
    }

    T &value()
    {
        static_assert(Size == 1,
                      "Size is not equal to 1. Use value(idx) instead");
        return data[0];
    }

    const T &value() const
    {
        static_assert(Size == 1,
                      "Size is not equal to 1. Use value(idx) instead");
        return data[0];
    }

    template <typename yT, typename xT>
    T broadcast(const yT &y, const xT &x) const
    {
        static_assert(std::is_integral_v<std::remove_reference_t<yT>>,
                      "y must be of an integral type");
        static_assert(std::is_integral_v<std::remove_reference_t<xT>>,
                      "x must be of an integral type");

        return sycl::select_from_group(sbgroup, data[y], x);
    }

    template <typename iT>
    T broadcast(const iT &idx) const
    {
        if constexpr (Size == 1) {
            return broadcast(0, idx);
        }
        else {
            return broadcast(idx / size_x(), idx % size_x());
        }
    }

    template <typename yT, typename xT>
    T shift_left(const yT &y, const xT &x) const
    {
        static_assert(std::is_integral_v<yT>, "y must be of an integral type");
        static_assert(std::is_integral_v<xT>, "x must be of an integral type");

        return sycl::shift_group_left(sbgroup, data[y], x);
    }

    template <typename yT, typename xT>
    T shift_right(const yT &y, const xT &x) const
    {
        static_assert(std::is_integral_v<yT>, "y must be of an integral type");
        static_assert(std::is_integral_v<xT>, "x must be of an integral type");

        return sycl::shift_group_right(sbgroup, data[y], x);
    }

    constexpr SizeT size_y() const
    {
        return _size;
    }

    SizeT size_x() const
    {
        return sbgroup.get_max_local_range()[0];
    }

    SizeT total_size() const
    {
        return size_x() * size_y();
    }

    ncT *ptr()
    {
        return data;
    }

    SizeT x() const
    {
        return sbgroup.get_local_linear_id();
    }

protected:
    const sycl::sub_group sbgroup;
    ncT data[Size];
};

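// RegistryData adds optionally masked fill/load/store helpers on top of the
// raw storage. Loads and stores walk memory with a stride of size_x() (one
// element per sub-group lane), so consecutive lanes access consecutive
// addresses; the load/store helpers return the pointer advanced past the data
// they consumed.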
template <typename T, uint32_t Size = 1>
struct RegistryData : public _RegistryDataStorage<T, Size>
{
    using SizeT = typename _RegistryDataStorage<T, Size>::SizeT;

    using _RegistryDataStorage<T, Size>::_RegistryDataStorage;

    template <typename LaneIdT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, SizeT>>>
    void fill_lane(const LaneIdT &lane_id, const T &value, Condition &&mask)
    {
        static_assert(std::is_integral_v<LaneIdT>,
                      "lane_id must be of an integral type");
        if (mask(this->x())) {
            this->data[lane_id] = value;
        }
    }

    template <typename LaneIdT>
    void fill_lane(const LaneIdT &lane_id, const T &value, const bool &mask)
    {
        fill_lane(lane_id, value, [mask](auto &&) { return mask; });
    }

    template <typename LaneIdT>
    void fill_lane(const LaneIdT &lane_id, const T &value)
    {
        fill_lane(lane_id, value, true);
    }

    template <typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, SizeT, SizeT>>>
    void fill(const T &value, Condition &&mask)
    {
        for (SizeT i = 0; i < Size; ++i) {
            fill_lane(i, value, mask(i, this->x()));
        }
    }

    void fill(const T &value)
    {
        fill(value, [](auto &&, auto &&) { return true; });
    }

    template <typename LaneIdT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *load_lane(const LaneIdT &lane_id,
                 const T *const data,
                 Condition &&mask,
                 const T &default_v)
    {
        static_assert(std::is_integral_v<LaneIdT>,
                      "lane_id must be of an integral type");
        this->data[lane_id] = mask(data) ? data[0] : default_v;

        return data + this->size_x();
    }

    template <typename LaneIdT>
    T *load_lane(const LaneIdT &lane_id,
                 const T *const data,
                 const bool &mask,
                 const T &default_v)
    {
        return load_lane(
            lane_id, data, [mask](auto &&) { return mask; }, default_v);
    }

    template <typename LaneIdT>
    T *load_lane(const LaneIdT &lane_id, const T *const data)
    {
        constexpr T default_v = 0;
        return load_lane(lane_id, data, true, default_v);
    }

    template <typename yStrideT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *load(const T *const data,
            const yStrideT &y_stride,
            Condition &&mask,
            const T &default_v)
    {
        auto *it = data;
        for (SizeT i = 0; i < Size; ++i) {
            load_lane(i, it, mask, default_v);
            it += y_stride;
        }

        return it;
    }

    template <typename yStrideT>
    T *load(const T *const data,
            const yStrideT &y_stride,
            const bool &mask,
            const T &default_v)
    {
        return load(
            data, y_stride, [mask](auto &&) { return mask; }, default_v);
    }

    template <typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *load(const T *const data, Condition &&mask, const T &default_v)
    {
        return load(data, this->size_x(), mask, default_v);
    }

    T *load(const T *const data, const bool &mask, const T &default_v)
    {
        return load(
            data, [mask](auto &&) { return mask; }, default_v);
    }

    T *load(const T *const data)
    {
        constexpr T default_v = 0;
        return load(data, true, default_v);
    }

    template <typename LaneIdT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *store_lane(const LaneIdT &lane_id, T *const data, Condition &&mask)
    {
        static_assert(std::is_integral_v<LaneIdT>,
                      "lane_id must be of an integral type");

        if (mask(data)) {
            data[0] = this->data[lane_id];
        }

        return data + this->size_x();
    }

    template <typename LaneIdT>
    T *store_lane(const LaneIdT &lane_id, T *const data, const bool &mask)
    {
        return store_lane(lane_id, data, [mask](auto &&) { return mask; });
    }

    template <typename LaneIdT>
    T *store_lane(const LaneIdT &lane_id, T *const data)
    {
        return store_lane(lane_id, data, true);
    }

    template <typename yStrideT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *store(T *const data, const yStrideT &y_stride, Condition &&condition)
    {
        auto *it = data;
        for (SizeT i = 0; i < Size; ++i) {
            store_lane(i, it, condition);
            it += y_stride;
        }

        return it;
    }

    template <typename yStrideT>
    T *store(T *const data, const yStrideT &y_stride, const bool &mask)
    {
        return store(data, y_stride, [mask](auto &&) { return mask; });
    }

    template <typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *store(T *const data, Condition &&condition)
    {
        return store(data, this->size_x(), condition);
    }

    T *store(T *const data, const bool &mask)
    {
        return store(data, [mask](auto &&) { return mask; });
    }

    T *store(T *const data)
    {
        return store(data, true);
    }
};

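// RegistryWindow treats the registry content as a sliding window over the
// input: advance_left() shifts every row `shift` lanes to the left across the
// sub-group, refilling the vacated upper lanes from the next row (or with
// fill_value for the last row).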
template <typename T, uint32_t Size>
struct RegistryWindow : public RegistryData<T, Size>
{
    using SizeT = typename RegistryData<T, Size>::SizeT;

    using RegistryData<T, Size>::RegistryData;

    template <typename shT>
    void advance_left(const shT &shift, const T &fill_value)
    {
        static_assert(std::is_integral_v<shT>,
                      "shift must be of an integral type");

        uint32_t shift_r = this->size_x() - shift;
        for (SizeT i = 0; i < Size; ++i) {
            this->data[i] = this->shift_left(i, shift);
            auto border =
                i < Size - 1 ? this->shift_right(i + 1, shift_r) : fill_value;
            if (this->x() >= shift_r) {
                this->data[i] = border;
            }
        }
    }

    void advance_left(const T &fill_value)
    {
        advance_left(1, fill_value);
    }

    void advance_left()
    {
        constexpr T fill_value = 0;
        advance_left(fill_value);
    }
};

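// Minimal non-owning view over a contiguous block of memory, similar in spirit
// to std::span; it is captured by value inside the SYCL kernels below.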
template <typename T, typename SizeT = size_t>
class Span
{
public:
    using value_type = T;
    using size_type = SizeT;

    Span(T *const data, const SizeT size) : data_(data), size_(size) {}

    T *begin() const
    {
        return data();
    }

    T *end() const
    {
        return data() + size();
    }

    SizeT size() const
    {
        return size_;
    }

    T *data() const
    {
        return data_;
    }

protected:
    T *const data_;
    const SizeT size_;
};

template <typename T, typename SizeT = size_t>
Span<T, SizeT> make_span(T *const data, const SizeT size)
{
    return Span<T, SizeT>(data, size);
}

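// Span with `pad` extra elements in front of begin(): padded_begin() points
// pad elements before the first real element, which lets the kernels start
// reading to the left of an output position and mask out-of-range accesses.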
template <typename T, typename SizeT = size_t>
class PaddedSpan : public Span<T, SizeT>
{
public:
    using value_type = T;
    using size_type = SizeT;

    PaddedSpan(T *const data, const SizeT size, const SizeT pad)
        : Span<T, SizeT>(data, size), pad_(pad)
    {
    }

    T *padded_begin() const
    {
        return this->begin() - pad();
    }

    SizeT pad() const
    {
        return pad_;
    }

protected:
    const SizeT pad_;
};

template <typename T, typename SizeT = size_t>
PaddedSpan<T, SizeT>
    make_padded_span(T *const data, const SizeT size, const SizeT offset)
{
    return PaddedSpan<T, SizeT>(data, size, offset);
}

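// Core accumulation step shared by both kernels: each of the block_size
// elements of v_data is broadcast across the sub-group, combined with the
// r_size partial results via op and red, and then the a-window is advanced
// one position to the left.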
template <typename Results,
          typename AData,
          typename VData,
          typename Op,
          typename Red>
void process_block(Results &results,
                   uint32_t r_size,
                   AData &a_data,
                   VData &v_data,
                   uint32_t block_size,
                   Op op,
                   Red red)
{
    for (uint32_t i = 0; i < block_size; ++i) {
        auto v_val = v_data.broadcast(i);
        for (uint32_t r = 0; r < r_size; ++r) {
            results[r] = red(results[r], op(a_data[r], v_val));
        }
        a_data.advance_left();
    }
}

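// Maps a work-item to the index of its first output element when every
// work-item produces up to wpi results: each sub-group owns a consecutive
// block of wpi * sub_group_size outputs, and within that block a lane's
// results are interleaved with a stride of the sub-group size.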
template <typename SizeT>
SizeT get_global_linear_id(const uint32_t wpi, const sycl::nd_item<1> &item)
{
    auto sbgroup = item.get_sub_group();
    const auto sg_loc_id = sbgroup.get_local_linear_id();

    const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id);
    const SizeT id = sg_base_id + sg_loc_id;

    return id;
}

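// Number of results this work-item actually has to produce: at most wpi, but
// clamped (saturating subtraction plus ceiling division) so the final
// sub-group does not run past `size` outputs.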
template <typename SizeT>
uint32_t get_results_num(const uint32_t wpi,
                         const SizeT size,
                         const SizeT global_id,
                         const sycl::nd_item<1> &item)
{
    auto sbgroup = item.get_sub_group();

    const auto sbg_size = sbgroup.get_max_local_range()[0];
    const auto size_ = sycl::sub_sat(size, global_id);
    return std::min(SizeT(wpi), CeilDiv(size_, sbg_size));
}

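// General sliding-window kernel: for every output index i it accumulates
// out[i] = red(..., op(a[i - pad + j], v[j])) over j, starting from 0, with
// out-of-range reads of `a` replaced by 0. Each work-item produces up to
// WorkPI results; `v` is streamed through the sub-group in chunks, so its
// length is not limited by the sub-group size. The function only records the
// kernel on the given handler, so it is meant to be called from inside a
// sycl::queue::submit() lambda.
//
// A minimal, illustrative host-side sketch (not part of this header): `q`,
// the USM pointers a_ptr/v_ptr (pointers to const elements), out_ptr, their
// sizes, l_pad and an nd_range consistent with WorkPI results per work-item
// are all assumed to exist.
//
//   constexpr uint32_t WorkPI = 4;
//   q.submit([&](sycl::handler &cgh) {
//       auto a = make_padded_span(a_ptr, a_size, l_pad);
//       auto v = make_span(v_ptr, v_size);
//       auto out = make_span(out_ptr, out_size);
//       submit_sliding_window1d<WorkPI>(a, v, std::multiplies<>{},
//                                       std::plus<>{}, out, nd_range, cgh);
//   });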
template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
class sliding_window1d_kernel;

template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
void submit_sliding_window1d(const PaddedSpan<const T, SizeT> &a,
                             const Span<const T, SizeT> &v,
                             const Op &op,
                             const Red &red,
                             Span<T, SizeT> &out,
                             sycl::nd_range<1> nd_range,
                             sycl::handler &cgh)
{
    cgh.parallel_for<sliding_window1d_kernel<WorkPI, T, SizeT, Op, Red>>(
        nd_range, [=](sycl::nd_item<1> item) {
            auto glid = get_global_linear_id<SizeT>(WorkPI, item);

            auto results = RegistryData<T, WorkPI>(item);
            results.fill(0);

            auto results_num = get_results_num(WorkPI, out.size(), glid, item);

            const auto *a_begin = a.begin();
            const auto *a_end = a.end();

            auto sbgroup = item.get_sub_group();

            const auto chunks_count =
                CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);

            const auto *a_ptr = &a.padded_begin()[glid];

            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
                return ptr >= a_begin && ptr < a_end;
            };

            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
            a_ptr = a_data.load(a_ptr, _a_load_cond, 0);

            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
            auto v_size = v.size();

            for (uint32_t b = 0; b < chunks_count; ++b) {
                auto v_data = RegistryData<const T>(item);
                v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);

                uint32_t chunk_size_ =
                    std::min(v_size, SizeT(v_data.total_size()));
                process_block(results, results_num, a_data, v_data,
                              chunk_size_, op, red);

                if (b != chunks_count - 1) {
                    a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
                                             _a_load_cond, 0);
                    v_size -= v_data.total_size();
                }
            }

            auto *const out_ptr = out.begin();
            // auto *const out_end = out.end();

            auto y_start = glid;
            auto y_stop =
                std::min(y_start + WorkPI * results.size_x(), out.size());
            uint32_t i = 0;
            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
                out_ptr[y] = results[i++];
            }
            // while the code itself seems to be valid, inside correlate
            // kernel it results in memory corruption. Further investigation
            // is needed. SAT-7693
            // results.store(&out_ptr[glid],
            //               [out_end](auto &&ptr) { return ptr < out_end; });
        });
}

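// Specialized variant for small `v`: the whole vector is loaded into a single
// sub-group-sized registry (so it is assumed that v.size() does not exceed the
// sub-group size), and process_block runs in one pass without the chunking
// loop of submit_sliding_window1d.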
template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
class sliding_window1d_small_kernel;

template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
void submit_sliding_window1d_small_kernel(const PaddedSpan<const T, SizeT> &a,
                                          const Span<const T, SizeT> &v,
                                          const Op &op,
                                          const Red &red,
                                          Span<T, SizeT> &out,
                                          sycl::nd_range<1> nd_range,
                                          sycl::handler &cgh)
{
    cgh.parallel_for<
        sliding_window1d_small_kernel<WorkPI, T, SizeT, Op, Red>>(
        nd_range, [=](sycl::nd_item<1> item) {
            auto glid = get_global_linear_id<SizeT>(WorkPI, item);

            auto results = RegistryData<T, WorkPI>(item);
            results.fill(0);

            auto sbgroup = item.get_sub_group();
            auto sg_size = sbgroup.get_max_local_range()[0];

            const uint32_t to_read = WorkPI * sg_size + v.size();
            const auto *a_begin = a.begin();

            const auto *a_ptr = &a.padded_begin()[glid];
            const auto *a_end = std::min(a_ptr + to_read, a.end());

            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
                return ptr >= a_begin && ptr < a_end;
            };

            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
            a_data.load(a_ptr, _a_load_cond, 0);

            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
            auto v_size = v.size();

            auto v_data = RegistryData<const T>(item);
            v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);

            auto results_num = get_results_num(WorkPI, out.size(), glid, item);

            process_block(results, results_num, a_data, v_data, v_size, op,
                          red);

            auto *const out_ptr = out.begin();
            // auto *const out_end = out.end();

            auto y_start = glid;
            auto y_stop =
                std::min(y_start + WorkPI * results.size_x(), out.size());
            uint32_t i = 0;
            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
                out_ptr[y] = results[i++];
            }
            // while the code itself seems to be valid, inside correlate
            // kernel it results in memory corruption. Further investigation
            // is needed. SAT-7693
            // results.store(&out_ptr[glid],
            //               [out_end](auto &&ptr) { return ptr < out_end; });
        });
}

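// Host-side validation of the usm_ndarray arguments and the requested
// left/right padding before a sliding-window kernel is submitted; only the
// declaration lives in this header.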
void validate(const usm_ndarray &a,
              const usm_ndarray &v,
              const usm_ndarray &out,
              const size_t l_pad,
              const size_t r_pad);
} // namespace sliding_window1d
} // namespace statistics