DPNP C++ backend kernel library 0.20.0dev0
Data Parallel Extension for NumPy*
sliding_window1d.hpp
//*****************************************************************************
// Copyright (c) 2024, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// - Neither the name of the copyright holder nor the names of its contributors
//   may be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include <algorithm>

#include "utils/math_utils.hpp"
#include <sycl/sycl.hpp>
#include <type_traits>

#include <stdio.h>

#include "ext/common.hpp"

using dpctl::tensor::usm_ndarray;

using ext::common::Align;
using ext::common::CeilDiv;

namespace statistics::sliding_window1d
{

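// Register-tile storage used by the helpers below: each work-item keeps Size
// private elements (the "y" dimension), and the sub-group lanes form the "x"
// dimension, so a sub-group collectively holds Size * sub_group_size values.
// broadcast()/shift_left()/shift_right() move values between lanes with SYCL
// sub-group shuffles.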
template <typename T, uint32_t Size>
class _RegistryDataStorage
{
public:
    using ncT = typename std::remove_const_t<T>;
    using SizeT = decltype(Size);
    static constexpr SizeT _size = Size;

    _RegistryDataStorage(const sycl::nd_item<1> &item)
        : sbgroup(item.get_sub_group())
    {
    }

    template <typename yT>
    T &operator[](const yT &idx)
    {
        static_assert(std::is_integral_v<yT>,
                      "idx must be of an integral type");
        return data[idx];
    }

    template <typename yT>
    const T &operator[](const yT &idx) const
    {
        static_assert(std::is_integral_v<yT>,
                      "idx must be of an integral type");
        return data[idx];
    }

    T &value()
    {
        static_assert(Size == 1,
                      "Size is not equal to 1. Use value(idx) instead");
        return data[0];
    }

    const T &value() const
    {
        static_assert(Size == 1,
                      "Size is not equal to 1. Use value(idx) instead");
        return data[0];
    }

    template <typename yT, typename xT>
    T broadcast(const yT &y, const xT &x) const
    {
        static_assert(std::is_integral_v<std::remove_reference_t<yT>>,
                      "y must be of an integral type");
        static_assert(std::is_integral_v<std::remove_reference_t<xT>>,
                      "x must be of an integral type");

        return sycl::select_from_group(sbgroup, data[y], x);
    }

    template <typename iT>
    T broadcast(const iT &idx) const
    {
        if constexpr (Size == 1) {
            return broadcast(0, idx);
        }
        else {
            return broadcast(idx / size_x(), idx % size_x());
        }
    }

    template <typename yT, typename xT>
    T shift_left(const yT &y, const xT &x) const
    {
        static_assert(std::is_integral_v<yT>, "y must be of an integral type");
        static_assert(std::is_integral_v<xT>, "x must be of an integral type");

        return sycl::shift_group_left(sbgroup, data[y], x);
    }

    template <typename yT, typename xT>
    T shift_right(const yT &y, const xT &x) const
    {
        static_assert(std::is_integral_v<yT>, "y must be of an integral type");
        static_assert(std::is_integral_v<xT>, "x must be of an integral type");

        return sycl::shift_group_right(sbgroup, data[y], x);
    }

    constexpr SizeT size_y() const
    {
        return _size;
    }

    SizeT size_x() const
    {
        return sbgroup.get_max_local_range()[0];
    }

    SizeT total_size() const
    {
        return size_x() * size_y();
    }

    ncT *ptr()
    {
        return data;
    }

    SizeT x() const
    {
        return sbgroup.get_local_linear_id();
    }

protected:
    const sycl::sub_group sbgroup;
    ncT data[Size];
};

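// Predicated fill/load/store helpers on top of _RegistryDataStorage. Each
// lane passes its own base pointer to load()/store(); row i is transferred at
// base + i * y_stride (y_stride defaults to the sub-group size), the optional
// predicates let callers guard out-of-bounds accesses, and masked-off loads
// substitute a default value.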
template <typename T, uint32_t Size = 1>
struct RegistryData : public _RegistryDataStorage<T, Size>
{
    using SizeT = typename _RegistryDataStorage<T, Size>::SizeT;

    using _RegistryDataStorage<T, Size>::_RegistryDataStorage;

    template <typename LaneIdT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, SizeT>>>
    void fill_lane(const LaneIdT &lane_id, const T &value, Condition &&mask)
    {
        static_assert(std::is_integral_v<LaneIdT>,
                      "lane_id must be of an integral type");
        if (mask(this->x())) {
            this->data[lane_id] = value;
        }
    }

    template <typename LaneIdT>
    void fill_lane(const LaneIdT &lane_id, const T &value, const bool &mask)
    {
        fill_lane(lane_id, value, [mask](auto &&) { return mask; });
    }

    template <typename LaneIdT>
    void fill_lane(const LaneIdT &lane_id, const T &value)
    {
        fill_lane(lane_id, value, true);
    }

    template <typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, SizeT, SizeT>>>
    void fill(const T &value, Condition &&mask)
    {
        for (SizeT i = 0; i < Size; ++i) {
            fill_lane(i, value, mask(i, this->x()));
        }
    }

    void fill(const T &value)
    {
        fill(value, [](auto &&, auto &&) { return true; });
    }

    template <typename LaneIdT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *load_lane(const LaneIdT &lane_id,
                 const T *const data,
                 Condition &&mask,
                 const T &default_v)
    {
        static_assert(std::is_integral_v<LaneIdT>,
                      "lane_id must be of an integral type");
        this->data[lane_id] = mask(data) ? data[0] : default_v;

        return data + this->size_x();
    }

    template <typename LaneIdT>
    T *load_lane(const LaneIdT &laned_id,
                 const T *const data,
                 const bool &mask,
                 const T &default_v)
    {
        return load_lane(
            laned_id, data, [mask](auto &&) { return mask; }, default_v);
    }

    template <typename LaneIdT>
    T *load_lane(const LaneIdT &laned_id, const T *const data)
    {
        constexpr T default_v = 0;
        return load_lane(laned_id, data, true, default_v);
    }

    template <typename yStrideT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *load(const T *const data,
            const yStrideT &y_stride,
            Condition &&mask,
            const T &default_v)
    {
        auto *it = data;
        for (SizeT i = 0; i < Size; ++i) {
            load_lane(i, it, mask, default_v);
            it += y_stride;
        }

        return it;
    }

    template <typename yStrideT>
    T *load(const T *const data,
            const yStrideT &y_stride,
            const bool &mask,
            const T &default_v)
    {
        return load(
            data, y_stride, [mask](auto &&) { return mask; }, default_v);
    }

    template <typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *load(const T *const data, Condition &&mask, const T &default_v)
    {
        return load(data, this->size_x(), mask, default_v);
    }

    T *load(const T *const data, const bool &mask, const T &default_v)
    {
        return load(
            data, [mask](auto &&) { return mask; }, default_v);
    }

    T *load(const T *const data)
    {
        constexpr T default_v = 0;
        return load(data, true, default_v);
    }

    template <typename LaneIdT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *store_lane(const LaneIdT &lane_id, T *const data, Condition &&mask)
    {
        static_assert(std::is_integral_v<LaneIdT>,
                      "lane_id must be of an integral type");

        if (mask(data)) {
            data[0] = this->data[lane_id];
        }

        return data + this->size_x();
    }

    template <typename LaneIdT>
    T *store_lane(const LaneIdT &lane_id, T *const data, const bool &mask)
    {
        return store_lane(lane_id, data, [mask](auto &&) { return mask; });
    }

    template <typename LaneIdT>
    T *store_lane(const LaneIdT &lane_id, T *const data)
    {
        return store_lane(lane_id, data, true);
    }

    template <typename yStrideT,
              typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *store(T *const data, const yStrideT &y_stride, Condition &&condition)
    {
        auto *it = data;
        for (SizeT i = 0; i < Size; ++i) {
            store_lane(i, it, condition);
            it += y_stride;
        }

        return it;
    }

    template <typename yStrideT>
    T *store(T *const data, const yStrideT &y_stride, const bool &mask)
    {
        return store(data, y_stride, [mask](auto &&) { return mask; });
    }

    template <typename Condition,
              typename = std::enable_if_t<
                  std::is_invocable_r_v<bool, Condition, const T *const>>>
    T *store(T *const data, Condition &&condition)
    {
        return store(data, this->size_x(), condition);
    }

    T *store(T *const data, const bool &mask)
    {
        return store(data, [mask](auto &&) { return mask; });
    }

    T *store(T *const data)
    {
        return store(data, true);
    }
};

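// RegistryData that can slide: advance_left() shifts the whole tile by
// `shift` positions across the sub-group, pulling elements in from the next
// row and filling the freed tail of the last row with fill_value.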
template <typename T, uint32_t Size>
struct RegistryWindow : public RegistryData<T, Size>
{
    using SizeT = typename RegistryData<T, Size>::SizeT;

    using RegistryData<T, Size>::RegistryData;

    template <typename shT>
    void advance_left(const shT &shift, const T &fill_value)
    {
        static_assert(std::is_integral_v<shT>,
                      "shift must be of an integral type");

        uint32_t shift_r = this->size_x() - shift;
        for (SizeT i = 0; i < Size; ++i) {
            this->data[i] = this->shift_left(i, shift);
            auto border =
                i < Size - 1 ? this->shift_right(i + 1, shift_r) : fill_value;
            if (this->x() >= shift_r) {
                this->data[i] = border;
            }
        }
    }

    void advance_left(const T &fill_value)
    {
        advance_left(1, fill_value);
    }

    void advance_left()
    {
        constexpr T fill_value = 0;
        advance_left(fill_value);
    }
};

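// Minimal non-owning view over contiguous device memory; PaddedSpan below
// additionally records how many padding elements precede begin() and exposes
// them through padded_begin().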
template <typename T, typename SizeT = size_t>
class Span
{
public:
    using value_type = T;
    using size_type = SizeT;

    Span(T *const data, const SizeT size) : data_(data), size_(size) {}

    T *begin() const
    {
        return data();
    }

    T *end() const
    {
        return data() + size();
    }

    SizeT size() const
    {
        return size_;
    }

    T *data() const
    {
        return data_;
    }

protected:
    T *const data_;
    const SizeT size_;
};

template <typename T, typename SizeT = size_t>
Span<T, SizeT> make_span(T *const data, const SizeT size)
{
    return Span<T, SizeT>(data, size);
}

template <typename T, typename SizeT = size_t>
class PaddedSpan : public Span<T, SizeT>
{
public:
    using value_type = T;
    using size_type = SizeT;

    PaddedSpan(T *const data, const SizeT size, const SizeT pad)
        : Span<T, SizeT>(data, size), pad_(pad)
    {
    }

    T *padded_begin() const
    {
        return this->begin() - pad();
    }

    SizeT pad() const
    {
        return pad_;
    }

protected:
    const SizeT pad_;
};

template <typename T, typename SizeT = size_t>
PaddedSpan<T, SizeT>
    make_padded_span(T *const data, const SizeT size, const SizeT offset)
{
    return PaddedSpan<T, SizeT>(data, size, offset);
}

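// Inner loop of the sliding-window computation: for every element of the
// current v chunk, broadcast it to all lanes, combine it with the matching
// window element via `op`, accumulate with `red`, then slide the window one
// position to the left.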
template <typename Results,
          typename AData,
          typename VData,
          typename Op,
          typename Red>
void process_block(Results &results,
                   uint32_t r_size,
                   AData &a_data,
                   VData &v_data,
                   uint32_t block_size,
                   Op op,
                   Red red)
{
    for (uint32_t i = 0; i < block_size; ++i) {
        auto v_val = v_data.broadcast(i);
        for (uint32_t r = 0; r < r_size; ++r) {
            results[r] = red(results[r], op(a_data[r], v_val));
        }
        a_data.advance_left();
    }
}

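// Index of the first output element produced by this work-item, given that
// every work-item computes `wpi` results and consecutive results of a
// work-item are strided by the sub-group size.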
template <typename SizeT>
SizeT get_global_linear_id(const uint32_t wpi, const sycl::nd_item<1> &item)
{
    auto sbgroup = item.get_sub_group();
    const auto sg_loc_id = sbgroup.get_local_linear_id();

    const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id);
    const SizeT id = sg_base_id + sg_loc_id;

    return id;
}

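// Number of results this work-item actually produces: at most `wpi`, clipped
// against the remaining output size (sub_sat avoids unsigned underflow for
// work-items past the end of the output).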
template <typename SizeT>
uint32_t get_results_num(const uint32_t wpi,
                         const SizeT size,
                         const SizeT global_id,
                         const sycl::nd_item<1> &item)
{
    auto sbgroup = item.get_sub_group();

    const auto sbg_size = sbgroup.get_max_local_range()[0];
    const auto size_ = sycl::sub_sat(size, global_id);
    return std::min(SizeT(wpi), CeilDiv(size_, sbg_size));
}

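// General sliding-window kernel: `v` is consumed in sub-group-sized chunks,
// while a register window of (WorkPI + 1) rows of `a` slides along with it
// and has its last row reloaded after each chunk.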
template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
class sliding_window1d_kernel;

template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
void submit_sliding_window1d(const PaddedSpan<const T, SizeT> &a,
                             const Span<const T, SizeT> &v,
                             const Op &op,
                             const Red &red,
                             Span<T, SizeT> &out,
                             sycl::nd_range<1> nd_range,
                             sycl::handler &cgh)
{
    cgh.parallel_for<sliding_window1d_kernel<WorkPI, T, SizeT, Op, Red>>(
        nd_range, [=](sycl::nd_item<1> item) {
            auto glid = get_global_linear_id<SizeT>(WorkPI, item);

            auto results = RegistryData<T, WorkPI>(item);
            results.fill(0);

            auto results_num = get_results_num(WorkPI, out.size(), glid, item);

            const auto *a_begin = a.begin();
            const auto *a_end = a.end();

            auto sbgroup = item.get_sub_group();

            const auto chunks_count =
                CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);

            const auto *a_ptr = &a.padded_begin()[glid];

            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
                return ptr >= a_begin && ptr < a_end;
            };

            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
            a_ptr = a_data.load(a_ptr, _a_load_cond, 0);

            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
            auto v_size = v.size();

            for (uint32_t b = 0; b < chunks_count; ++b) {
                auto v_data = RegistryData<const T>(item);
                v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);

                uint32_t chunk_size_ =
                    std::min(v_size, SizeT(v_data.total_size()));
                process_block(results, results_num, a_data, v_data,
                              chunk_size_, op, red);

                if (b != chunks_count - 1) {
                    a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
                                             _a_load_cond, 0);
                    v_size -= v_data.total_size();
                }
            }

            auto *const out_ptr = out.begin();
            // auto *const out_end = out.end();

            auto y_start = glid;
            auto y_stop =
                std::min(y_start + WorkPI * results.size_x(), out.size());
            uint32_t i = 0;
            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
                out_ptr[y] = results[i++];
            }
            // While the code itself seems to be valid, inside the correlate
            // kernel it results in memory corruption. Further investigation
            // is needed. SAT-7693
            // results.store(&out_ptr[glid],
            //               [out_end](auto &&ptr) { return ptr < out_end; });
        });
}

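// Variant for the case where `v` fits into a single sub-group-sized chunk:
// `v` is loaded once and process_block runs a single time over the register
// window.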
template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
class sliding_window1d_small_kernel;

template <uint32_t WorkPI,
          typename T,
          typename SizeT,
          typename Op,
          typename Red>
void submit_sliding_window1d_small_kernel(const PaddedSpan<const T, SizeT> &a,
                                          const Span<const T, SizeT> &v,
                                          const Op &op,
                                          const Red &red,
                                          Span<T, SizeT> &out,
                                          sycl::nd_range<1> nd_range,
                                          sycl::handler &cgh)
{
    cgh.parallel_for<
        sliding_window1d_small_kernel<WorkPI, T, SizeT, Op, Red>>(
        nd_range, [=](sycl::nd_item<1> item) {
            auto glid = get_global_linear_id<SizeT>(WorkPI, item);

            auto results = RegistryData<T, WorkPI>(item);
            results.fill(0);

            auto sbgroup = item.get_sub_group();
            auto sg_size = sbgroup.get_max_local_range()[0];

            const uint32_t to_read = WorkPI * sg_size + v.size();
            const auto *a_begin = a.begin();

            const auto *a_ptr = &a.padded_begin()[glid];
            const auto *a_end = std::min(a_ptr + to_read, a.end());

            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
                return ptr >= a_begin && ptr < a_end;
            };

            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
            a_data.load(a_ptr, _a_load_cond, 0);

            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
            auto v_size = v.size();

            auto v_data = RegistryData<const T>(item);
            v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);

            auto results_num = get_results_num(WorkPI, out.size(), glid, item);

            process_block(results, results_num, a_data, v_data, v_size, op,
                          red);

            auto *const out_ptr = out.begin();
            // auto *const out_end = out.end();

            auto y_start = glid;
            auto y_stop =
                std::min(y_start + WorkPI * results.size_x(), out.size());
            uint32_t i = 0;
            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
                out_ptr[y] = results[i++];
            }
            // While the code itself seems to be valid, inside the correlate
            // kernel it results in memory corruption. Further investigation
            // is needed. SAT-7693
            // results.store(&out_ptr[glid],
            //               [out_end](auto &&ptr) { return ptr < out_end; });
        });
}

void validate(const usm_ndarray &a,
              const usm_ndarray &v,
              const usm_ndarray &out,
              const size_t l_pad,
              const size_t r_pad);
} // namespace statistics::sliding_window1d
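A minimal host-side sketch of how these entry points might be wired together for a correlate-like operation (multiply as `op`, plus as `red`). It is illustrative only and not part of the header: the WorkPI value, work-group size, buffer names, and range calculation below are assumptions, and a real caller (such as dpnp's correlate implementation) would also run validate() and choose between the general and small-kernel variants.

#include <sycl/sycl.hpp>

#include "sliding_window1d.hpp" // adjust the include path to your build

namespace sw = statistics::sliding_window1d;

// d_a points at the first "real" input element, with l_pad accessible
// elements allocated in front of it; d_v and d_out are USM device pointers.
sycl::event correlate_like(sycl::queue &q,
                           const float *d_a, size_t a_size, size_t l_pad,
                           const float *d_v, size_t v_size,
                           float *d_out, size_t out_size)
{
    constexpr uint32_t WorkPI = 4; // results per work-item (assumed)
    constexpr size_t local = 128;  // work-group size (assumed)

    // One work-item per WorkPI outputs, rounded up to a full work-group
    // (roughly Align(CeilDiv(out_size, WorkPI), local) from ext/common).
    const size_t wi_needed = (out_size + WorkPI - 1) / WorkPI;
    const size_t global = ((wi_needed + local - 1) / local) * local;

    return q.submit([&](sycl::handler &cgh) {
        auto a = sw::make_padded_span(d_a, a_size, l_pad);
        auto v = sw::make_span(d_v, v_size);
        auto out = sw::make_span(d_out, out_size);

        sw::submit_sliding_window1d<WorkPI>(
            a, v, sycl::multiplies<float>{}, sycl::plus<float>{}, out,
            sycl::nd_range<1>{global, local}, cgh);
    });
}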