// sliding_window1d.hpp
// DPNP C++ backend kernel library 0.20.0dev0
// Data Parallel Extension for NumPy*
//*****************************************************************************
// Copyright (c) 2024, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// - Neither the name of the copyright holder nor the names of its contributors
//   may be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************
28
#pragma once

#include <stdio.h>

#include <algorithm>
#include <type_traits>

#include <sycl/sycl.hpp>

#include "ext/common.hpp"
#include "utils/math_utils.hpp"

using dpctl::tensor::usm_ndarray;

using ext::common::Align;
using ext::common::CeilDiv;
43
44namespace statistics::sliding_window1d
45{
46
47template <typename T, uint32_t Size>
49{
50public:
51 using ncT = typename std::remove_const_t<T>;
52 using SizeT = decltype(Size);
53 static constexpr SizeT _size = Size;
54
55 _RegistryDataStorage(const sycl::nd_item<1> &item)
56 : sbgroup(item.get_sub_group())
57 {
58 }
59
60 template <typename yT>
61 T &operator[](const yT &idx)
62 {
63 static_assert(std::is_integral_v<yT>,
64 "idx must be of an integral type");
65 return data[idx];
66 }
67
68 template <typename yT>
69 const T &operator[](const yT &idx) const
70 {
71 static_assert(std::is_integral_v<yT>,
72 "idx must be of an integral type");
73 return data[idx];
74 }
75
76 T &value()
77 {
78 static_assert(Size == 1,
79 "Size is not equal to 1. Use value(idx) instead");
80 return data[0];
81 }
82
83 const T &value() const
84 {
85 static_assert(Size == 1,
86 "Size is not equal to 1. Use value(idx) instead");
87 return data[0];
88 }
89
90 template <typename yT, typename xT>
91 T broadcast(const yT &y, const xT &x) const
92 {
93 static_assert(std::is_integral_v<std::remove_reference_t<yT>>,
94 "y must be of an integral type");
95 static_assert(std::is_integral_v<std::remove_reference_t<xT>>,
96 "x must be of an integral type");
97
98 return sycl::select_from_group(sbgroup, data[y], x);
99 }
100
101 template <typename iT>
102 T broadcast(const iT &idx) const
103 {
104 if constexpr (Size == 1) {
105 return broadcast(0, idx);
106 }
107 else {
108 return broadcast(idx / size_x(), idx % size_x());
109 }
110 }
111
112 template <typename yT, typename xT>
113 T shift_left(const yT &y, const xT &x) const
114 {
115 static_assert(std::is_integral_v<yT>, "y must be of an integral type");
116 static_assert(std::is_integral_v<xT>, "x must be of an integral type");
117
118 return sycl::shift_group_left(sbgroup, data[y], x);
119 }
120
121 template <typename yT, typename xT>
122 T shift_right(const yT &y, const xT &x) const
123 {
124 static_assert(std::is_integral_v<yT>, "y must be of an integral type");
125 static_assert(std::is_integral_v<xT>, "x must be of an integral type");
126
127 return sycl::shift_group_right(sbgroup, data[y], x);
128 }
129
130 constexpr SizeT size_y() const
131 {
132 return _size;
133 }
134
135 SizeT size_x() const
136 {
137 return sbgroup.get_max_local_range()[0];
138 }
139
140 SizeT total_size() const
141 {
142 return size_x() * size_y();
143 }
144
145 ncT *ptr()
146 {
147 return data;
148 }
149
150 SizeT x() const
151 {
152 return sbgroup.get_local_linear_id();
153 }
154
155protected:
156 const sycl::sub_group sbgroup;
157 ncT data[Size];
158};
159
160template <typename T, uint32_t Size = 1>
161struct RegistryData : public _RegistryDataStorage<T, Size>
162{
163 using SizeT = typename _RegistryDataStorage<T, Size>::SizeT;
164
165 using _RegistryDataStorage<T, Size>::_RegistryDataStorage;
166
167 template <typename LaneIdT,
168 typename Condition,
169 typename = std::enable_if_t<
170 std::is_invocable_r_v<bool, Condition, SizeT>>>
171 void fill_lane(const LaneIdT &lane_id, const T &value, Condition &&mask)
172 {
173 static_assert(std::is_integral_v<LaneIdT>,
174 "lane_id must be of an integral type");
175 if (mask(this->x())) {
176 this->data[lane_id] = value;
177 }
178 }
179
180 template <typename LaneIdT>
181 void fill_lane(const LaneIdT &lane_id, const T &value, const bool &mask)
182 {
183 fill_lane(lane_id, value, [mask](auto &&) { return mask; });
184 }
185
186 template <typename LaneIdT>
187 void fill_lane(const LaneIdT &lane_id, const T &value)
188 {
189 fill_lane(lane_id, value, true);
190 }
191
192 template <typename Condition,
193 typename = std::enable_if_t<
194 std::is_invocable_r_v<bool, Condition, SizeT, SizeT>>>
195 void fill(const T &value, Condition &&mask)
196 {
197 for (SizeT i = 0; i < Size; ++i) {
198 fill_lane(i, value, mask(i, this->x()));
199 }
200 }
201
202 void fill(const T &value)
203 {
204 fill(value, [](auto &&, auto &&) { return true; });
205 }
206
207 template <typename LaneIdT,
208 typename Condition,
209 typename = std::enable_if_t<
210 std::is_invocable_r_v<bool, Condition, const T *const>>>
211 T *load_lane(const LaneIdT &lane_id,
212 const T *const data,
213 Condition &&mask,
214 const T &default_v)
215 {
216 static_assert(std::is_integral_v<LaneIdT>,
217 "lane_id must be of an integral type");
218 this->data[lane_id] = mask(data) ? data[0] : default_v;
219
220 return data + this->size_x();
221 }
222
223 template <typename LaneIdT>
224 T *load_lane(const LaneIdT &laned_id,
225 const T *const data,
226 const bool &mask,
227 const T &default_v)
228 {
229 return load_lane(
230 laned_id, data, [mask](auto &&) { return mask; }, default_v);
231 }
232
233 template <typename LaneIdT>
234 T *load_lane(const LaneIdT &laned_id, const T *const data)
235 {
236 constexpr T default_v = 0;
237 return load_lane(laned_id, data, true, default_v);
238 }
239
240 template <typename yStrideT,
241 typename Condition,
242 typename = std::enable_if_t<
243 std::is_invocable_r_v<bool, Condition, const T *const>>>
244 T *load(const T *const data,
245 const yStrideT &y_stride,
246 Condition &&mask,
247 const T &default_v)
248 {
249 auto *it = data;
250 for (SizeT i = 0; i < Size; ++i) {
251 load_lane(i, it, mask, default_v);
252 it += y_stride;
253 }
254
255 return it;
256 }
257
258 template <typename yStrideT>
259 T *load(const T *const data,
260 const yStrideT &y_stride,
261 const bool &mask,
262 const T &default_v)
263 {
264 return load(
265 data, y_stride, [mask](auto &&) { return mask; }, default_v);
266 }
267
268 template <typename Condition,
269 typename = std::enable_if_t<
270 std::is_invocable_r_v<bool, Condition, const T *const>>>
271 T *load(const T *const data, Condition &&mask, const T &default_v)
272 {
273 return load(data, this->size_x(), mask, default_v);
274 }
275
276 T *load(const T *const data, const bool &mask, const T &default_v)
277 {
278 return load(
279 data, [mask](auto &&) { return mask; }, default_v);
280 }
281
282 T *load(const T *const data)
283 {
284 constexpr T default_v = 0;
285 return load(data, true, default_v);
286 }
287
288 template <typename LaneIdT,
289 typename Condition,
290 typename = std::enable_if_t<
291 std::is_invocable_r_v<bool, Condition, const T *const>>>
292 T *store_lane(const LaneIdT &lane_id, T *const data, Condition &&mask)
293 {
294 static_assert(std::is_integral_v<LaneIdT>,
295 "lane_id must be of an integral type");
296
297 if (mask(data)) {
298 data[0] = this->data[lane_id];
299 }
300
301 return data + this->size_x();
302 }
303
304 template <typename LaneIdT>
305 T *store_lane(const LaneIdT &lane_id, T *const data, const bool &mask)
306 {
307 return store_lane(lane_id, data, [mask](auto &&) { return mask; });
308 }
309
310 template <typename LaneIdT>
311 T *store_lane(const LaneIdT &lane_id, T *const data)
312 {
313 return store_lane(lane_id, data, true);
314 }
315
316 template <typename yStrideT,
317 typename Condition,
318 typename = std::enable_if_t<
319 std::is_invocable_r_v<bool, Condition, const T *const>>>
320 T *store(T *const data, const yStrideT &y_stride, Condition &&condition)
321 {
322 auto *it = data;
323 for (SizeT i = 0; i < Size; ++i) {
324 store_lane(i, it, condition);
325 it += y_stride;
326 }
327
328 return it;
329 }
330
331 template <typename yStrideT>
332 T *store(T *const data, const yStrideT &y_stride, const bool &mask)
333 {
334 return store(data, y_stride, [mask](auto &&) { return mask; });
335 }
336
337 template <typename Condition,
338 typename = std::enable_if_t<
339 std::is_invocable_r_v<bool, Condition, const T *const>>>
340 T *store(T *const data, Condition &&condition)
341 {
342 return store(data, this->size_x(), condition);
343 }
344
345 T *store(T *const data, const bool &mask)
346 {
347 return store(data, [mask](auto &&) { return mask; });
348 }
349
350 T *store(T *const data)
351 {
352 return store(data, true);
353 }
354};
355
356template <typename T, uint32_t Size>
357struct RegistryWindow : public RegistryData<T, Size>
358{
359 using SizeT = typename RegistryData<T, Size>::SizeT;
360
361 using RegistryData<T, Size>::RegistryData;
362
363 template <typename shT>
364 void advance_left(const shT &shift, const T &fill_value)
365 {
366 static_assert(std::is_integral_v<shT>,
367 "shift must be of an integral type");
368
369 uint32_t shift_r = this->size_x() - shift;
370 for (SizeT i = 0; i < Size; ++i) {
371 this->data[i] = this->shift_left(i, shift);
372 auto border =
373 i < Size - 1 ? this->shift_right(i + 1, shift_r) : fill_value;
374 if (this->x() >= shift_r) {
375 this->data[i] = border;
376 }
377 }
378 }
379
380 void advance_left(const T &fill_value)
381 {
382 advance_left(1, fill_value);
383 }
384
385 void advance_left()
386 {
387 constexpr T fill_value = 0;
388 advance_left(fill_value);
389 }
390};
391
// Minimal non-owning view over `size` contiguous elements starting at
// `data`. Copyable view semantics; the caller keeps ownership and must
// keep the memory alive for the view's lifetime.
template <typename T, typename SizeT = size_t>
class Span
{
public:
    using value_type = T;
    using size_type = SizeT;

    Span(T *const data, const SizeT size) : data_(data), size_(size) {}

    T *begin() const
    {
        return data();
    }

    T *end() const
    {
        return data() + size();
    }

    SizeT size() const
    {
        return size_;
    }

    T *data() const
    {
        return data_;
    }

protected:
    T *const data_;
    const SizeT size_;
};
425
426template <typename T, typename SizeT = size_t>
427Span<T, SizeT> make_span(T *const data, const SizeT size)
428{
429 return Span<T, SizeT>(data, size);
430}
431
432template <typename T, typename SizeT = size_t>
433class PaddedSpan : public Span<T, SizeT>
434{
435public:
436 using value_type = T;
437 using size_type = SizeT;
438
439 PaddedSpan(T *const data, const SizeT size, const SizeT pad)
440 : Span<T, SizeT>(data, size), pad_(pad)
441 {
442 }
443
444 T *padded_begin() const
445 {
446 return this->begin() - pad();
447 }
448
449 SizeT pad() const
450 {
451 return pad_;
452 }
453
454protected:
455 const SizeT pad_;
456};
457
458template <typename T, typename SizeT = size_t>
460 make_padded_span(T *const data, const SizeT size, const SizeT offset)
461{
462 return PaddedSpan<T, SizeT>(data, size, offset);
463}
464
// Core accumulation step shared by both kernels: for each of `block_size`
// window positions, broadcast the i-th v element to all items, combine it
// with the first r_size entries of the a-window via `op`, reduce into
// `results` via `red`, then slide the a-window one element left.
template <typename Results,
          typename AData,
          typename VData,
          typename Op,
          typename Red>
void process_block(Results &results,
                   uint32_t r_size,
                   AData &a_data,
                   VData &v_data,
                   uint32_t block_size,
                   Op op,
                   Red red)
{
    for (uint32_t i = 0; i < block_size; ++i) {
        auto v_val = v_data.broadcast(i);
        for (uint32_t r = 0; r < r_size; ++r) {
            results[r] = red(results[r], op(a_data[r], v_val));
        }
        a_data.advance_left();
    }
}
486
487template <typename SizeT>
488SizeT get_global_linear_id(const uint32_t wpi, const sycl::nd_item<1> &item)
489{
490 auto sbgroup = item.get_sub_group();
491 const auto sg_loc_id = sbgroup.get_local_linear_id();
492
493 const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id);
494 const SizeT id = sg_base_id + sg_loc_id;
495
496 return id;
497}
498
499template <typename SizeT>
500uint32_t get_results_num(const uint32_t wpi,
501 const SizeT size,
502 const SizeT global_id,
503 const sycl::nd_item<1> &item)
504{
505 auto sbgroup = item.get_sub_group();
506
507 const auto sbg_size = sbgroup.get_max_local_range()[0];
508 const auto size_ = sycl::sub_sat(size, global_id);
509 return std::min(SizeT(wpi), CeilDiv(size_, sbg_size));
510}
511
512template <uint32_t WorkPI,
513 typename T,
514 typename SizeT,
515 typename Op,
516 typename Red>
518
519template <uint32_t WorkPI,
520 typename T,
521 typename SizeT,
522 typename Op,
523 typename Red>
524void submit_sliding_window1d(const PaddedSpan<const T, SizeT> &a,
525 const Span<const T, SizeT> &v,
526 const Op &op,
527 const Red &red,
528 Span<T, SizeT> &out,
529 sycl::nd_range<1> nd_range,
530 sycl::handler &cgh)
531{
533 nd_range, [=](sycl::nd_item<1> item) {
534 auto glid = get_global_linear_id<SizeT>(WorkPI, item);
535
536 auto results = RegistryData<T, WorkPI>(item);
537 results.fill(0);
538
539 auto results_num = get_results_num(WorkPI, out.size(), glid, item);
540
541 const auto *a_begin = a.begin();
542 const auto *a_end = a.end();
543
544 auto sbgroup = item.get_sub_group();
545
546 const auto chunks_count =
547 CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);
548
549 const auto *a_ptr = &a.padded_begin()[glid];
550
551 auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
552 return ptr >= a_begin && ptr < a_end;
553 };
554
555 auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
556 a_ptr = a_data.load(a_ptr, _a_load_cond, 0);
557
558 const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
559 auto v_size = v.size();
560
561 for (uint32_t b = 0; b < chunks_count; ++b) {
562 auto v_data = RegistryData<const T>(item);
563 v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
564
565 uint32_t chunk_size_ =
566 std::min(v_size, SizeT(v_data.total_size()));
567 process_block(results, results_num, a_data, v_data, chunk_size_,
568 op, red);
569
570 if (b != chunks_count - 1) {
571 a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
572 _a_load_cond, 0);
573 v_size -= v_data.total_size();
574 }
575 }
576
577 auto *const out_ptr = out.begin();
578 // auto *const out_end = out.end();
579
580 auto y_start = glid;
581 auto y_stop =
582 std::min(y_start + WorkPI * results.size_x(), out.size());
583 uint32_t i = 0;
584 for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
585 out_ptr[y] = results[i++];
586 }
587 // while the code itself seems to be valid, inside correlate
588 // kernel it results in memory corruption. Further investigation
589 // is needed. SAT-7693
590 // corruption results.store(&out_ptr[glid],
591 // [out_end](auto &&ptr) { return ptr < out_end; });
592 });
593}
594
595template <uint32_t WorkPI,
596 typename T,
597 typename SizeT,
598 typename Op,
599 typename Red>
601
602template <uint32_t WorkPI,
603 typename T,
604 typename SizeT,
605 typename Op,
606 typename Red>
607void submit_sliding_window1d_small_kernel(const PaddedSpan<const T, SizeT> &a,
608 const Span<const T, SizeT> &v,
609 const Op &op,
610 const Red &red,
611 Span<T, SizeT> &out,
612 sycl::nd_range<1> nd_range,
613 sycl::handler &cgh)
614{
616 nd_range, [=](sycl::nd_item<1> item) {
617 auto glid = get_global_linear_id<SizeT>(WorkPI, item);
618
619 auto results = RegistryData<T, WorkPI>(item);
620 results.fill(0);
621
622 auto sbgroup = item.get_sub_group();
623 auto sg_size = sbgroup.get_max_local_range()[0];
624
625 const uint32_t to_read = WorkPI * sg_size + v.size();
626 const auto *a_begin = a.begin();
627
628 const auto *a_ptr = &a.padded_begin()[glid];
629 const auto *a_end = std::min(a_ptr + to_read, a.end());
630
631 auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
632 return ptr >= a_begin && ptr < a_end;
633 };
634
635 auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
636 a_data.load(a_ptr, _a_load_cond, 0);
637
638 const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
639 auto v_size = v.size();
640
641 auto v_data = RegistryData<const T>(item);
642 v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
643
644 auto results_num = get_results_num(WorkPI, out.size(), glid, item);
645
646 process_block(results, results_num, a_data, v_data, v_size, op,
647 red);
648
649 auto *const out_ptr = out.begin();
650 // auto *const out_end = out.end();
651
652 auto y_start = glid;
653 auto y_stop =
654 std::min(y_start + WorkPI * results.size_x(), out.size());
655 uint32_t i = 0;
656 for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
657 out_ptr[y] = results[i++];
658 }
659 // while the code itself seems to be valid, inside correlate
660 // kernel it results in memory corruption. Further investigation
661 // is needed. SAT-7693
662 // corruption results.store(&out_ptr[glid],
663 // [out_end](auto &&ptr) { return ptr < out_end; });
664 });
665}
666
667void validate(const usm_ndarray &a,
668 const usm_ndarray &v,
669 const usm_ndarray &out,
670 const size_t l_pad,
671 const size_t r_pad);
672} // namespace statistics::sliding_window1d