DPNP C++ backend kernel library 0.20.0dev4
Data Parallel Extension for NumPy*
Loading...
Searching...
No Matches
sliding_window1d.hpp
1//*****************************************************************************
2// Copyright (c) 2024, Intel Corporation
3// All rights reserved.
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7// - Redistributions of source code must retain the above copyright notice,
8// this list of conditions and the following disclaimer.
9// - Redistributions in binary form must reproduce the above copyright notice,
10// this list of conditions and the following disclaimer in the documentation
11// and/or other materials provided with the distribution.
12// - Neither the name of the copyright holder nor the names of its contributors
13// may be used to endorse or promote products derived from this software
14// without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
20// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26// THE POSSIBILITY OF SUCH DAMAGE.
27//*****************************************************************************
28
29#pragma once
30
31#include <algorithm>
32
33#include "utils/math_utils.hpp"
34#include <sycl/sycl.hpp>
35#include <type_traits>
36
37#include <stdio.h>
38
39#include "ext/common.hpp"
40
41using dpctl::tensor::usm_ndarray;
42
43using ext::common::Align;
44using ext::common::CeilDiv;
45
46namespace statistics::sliding_window1d
47{
48
49template <typename T, uint32_t Size>
51{
52public:
53 using ncT = typename std::remove_const_t<T>;
54 using SizeT = decltype(Size);
55 static constexpr SizeT _size = Size;
56
57 _RegistryDataStorage(const sycl::nd_item<1> &item)
58 : sbgroup(item.get_sub_group())
59 {
60 }
61
62 template <typename yT>
63 T &operator[](const yT &idx)
64 {
65 static_assert(std::is_integral_v<yT>,
66 "idx must be of an integral type");
67 return data[idx];
68 }
69
70 template <typename yT>
71 const T &operator[](const yT &idx) const
72 {
73 static_assert(std::is_integral_v<yT>,
74 "idx must be of an integral type");
75 return data[idx];
76 }
77
78 T &value()
79 {
80 static_assert(Size == 1,
81 "Size is not equal to 1. Use value(idx) instead");
82 return data[0];
83 }
84
85 const T &value() const
86 {
87 static_assert(Size == 1,
88 "Size is not equal to 1. Use value(idx) instead");
89 return data[0];
90 }
91
92 template <typename yT, typename xT>
93 T broadcast(const yT &y, const xT &x) const
94 {
95 static_assert(std::is_integral_v<std::remove_reference_t<yT>>,
96 "y must be of an integral type");
97 static_assert(std::is_integral_v<std::remove_reference_t<xT>>,
98 "x must be of an integral type");
99
100 return sycl::select_from_group(sbgroup, data[y], x);
101 }
102
103 template <typename iT>
104 T broadcast(const iT &idx) const
105 {
106 if constexpr (Size == 1) {
107 return broadcast(0, idx);
108 }
109 else {
110 return broadcast(idx / size_x(), idx % size_x());
111 }
112 }
113
114 template <typename yT, typename xT>
115 T shift_left(const yT &y, const xT &x) const
116 {
117 static_assert(std::is_integral_v<yT>, "y must be of an integral type");
118 static_assert(std::is_integral_v<xT>, "x must be of an integral type");
119
120 return sycl::shift_group_left(sbgroup, data[y], x);
121 }
122
123 template <typename yT, typename xT>
124 T shift_right(const yT &y, const xT &x) const
125 {
126 static_assert(std::is_integral_v<yT>, "y must be of an integral type");
127 static_assert(std::is_integral_v<xT>, "x must be of an integral type");
128
129 return sycl::shift_group_right(sbgroup, data[y], x);
130 }
131
132 constexpr SizeT size_y() const { return _size; }
133
134 SizeT size_x() const { return sbgroup.get_max_local_range()[0]; }
135
136 SizeT total_size() const { return size_x() * size_y(); }
137
138 ncT *ptr() { return data; }
139
140 SizeT x() const { return sbgroup.get_local_linear_id(); }
141
142protected:
143 const sycl::sub_group sbgroup;
144 ncT data[Size];
145};
146
147template <typename T, uint32_t Size = 1>
148struct RegistryData : public _RegistryDataStorage<T, Size>
149{
150 using SizeT = typename _RegistryDataStorage<T, Size>::SizeT;
151
152 using _RegistryDataStorage<T, Size>::_RegistryDataStorage;
153
154 template <typename LaneIdT,
155 typename Condition,
156 typename = std::enable_if_t<
157 std::is_invocable_r_v<bool, Condition, SizeT>>>
158 void fill_lane(const LaneIdT &lane_id, const T &value, Condition &&mask)
159 {
160 static_assert(std::is_integral_v<LaneIdT>,
161 "lane_id must be of an integral type");
162 if (mask(this->x())) {
163 this->data[lane_id] = value;
164 }
165 }
166
167 template <typename LaneIdT>
168 void fill_lane(const LaneIdT &lane_id, const T &value, const bool &mask)
169 {
170 fill_lane(lane_id, value, [mask](auto &&) { return mask; });
171 }
172
173 template <typename LaneIdT>
174 void fill_lane(const LaneIdT &lane_id, const T &value)
175 {
176 fill_lane(lane_id, value, true);
177 }
178
179 template <typename Condition,
180 typename = std::enable_if_t<
181 std::is_invocable_r_v<bool, Condition, SizeT, SizeT>>>
182 void fill(const T &value, Condition &&mask)
183 {
184 for (SizeT i = 0; i < Size; ++i) {
185 fill_lane(i, value, mask(i, this->x()));
186 }
187 }
188
189 void fill(const T &value)
190 {
191 fill(value, [](auto &&, auto &&) { return true; });
192 }
193
194 template <typename LaneIdT,
195 typename Condition,
196 typename = std::enable_if_t<
197 std::is_invocable_r_v<bool, Condition, const T *const>>>
198 T *load_lane(const LaneIdT &lane_id,
199 const T *const data,
200 Condition &&mask,
201 const T &default_v)
202 {
203 static_assert(std::is_integral_v<LaneIdT>,
204 "lane_id must be of an integral type");
205 this->data[lane_id] = mask(data) ? data[0] : default_v;
206
207 return data + this->size_x();
208 }
209
210 template <typename LaneIdT>
211 T *load_lane(const LaneIdT &laned_id,
212 const T *const data,
213 const bool &mask,
214 const T &default_v)
215 {
216 return load_lane(
217 laned_id, data, [mask](auto &&) { return mask; }, default_v);
218 }
219
220 template <typename LaneIdT>
221 T *load_lane(const LaneIdT &laned_id, const T *const data)
222 {
223 constexpr T default_v = 0;
224 return load_lane(laned_id, data, true, default_v);
225 }
226
227 template <typename yStrideT,
228 typename Condition,
229 typename = std::enable_if_t<
230 std::is_invocable_r_v<bool, Condition, const T *const>>>
231 T *load(const T *const data,
232 const yStrideT &y_stride,
233 Condition &&mask,
234 const T &default_v)
235 {
236 auto *it = data;
237 for (SizeT i = 0; i < Size; ++i) {
238 load_lane(i, it, mask, default_v);
239 it += y_stride;
240 }
241
242 return it;
243 }
244
245 template <typename yStrideT>
246 T *load(const T *const data,
247 const yStrideT &y_stride,
248 const bool &mask,
249 const T &default_v)
250 {
251 return load(
252 data, y_stride, [mask](auto &&) { return mask; }, default_v);
253 }
254
255 template <typename Condition,
256 typename = std::enable_if_t<
257 std::is_invocable_r_v<bool, Condition, const T *const>>>
258 T *load(const T *const data, Condition &&mask, const T &default_v)
259 {
260 return load(data, this->size_x(), mask, default_v);
261 }
262
263 T *load(const T *const data, const bool &mask, const T &default_v)
264 {
265 return load(data, [mask](auto &&) { return mask; }, default_v);
266 }
267
268 T *load(const T *const data)
269 {
270 constexpr T default_v = 0;
271 return load(data, true, default_v);
272 }
273
274 template <typename LaneIdT,
275 typename Condition,
276 typename = std::enable_if_t<
277 std::is_invocable_r_v<bool, Condition, const T *const>>>
278 T *store_lane(const LaneIdT &lane_id, T *const data, Condition &&mask)
279 {
280 static_assert(std::is_integral_v<LaneIdT>,
281 "lane_id must be of an integral type");
282
283 if (mask(data)) {
284 data[0] = this->data[lane_id];
285 }
286
287 return data + this->size_x();
288 }
289
290 template <typename LaneIdT>
291 T *store_lane(const LaneIdT &lane_id, T *const data, const bool &mask)
292 {
293 return store_lane(lane_id, data, [mask](auto &&) { return mask; });
294 }
295
296 template <typename LaneIdT>
297 T *store_lane(const LaneIdT &lane_id, T *const data)
298 {
299 return store_lane(lane_id, data, true);
300 }
301
302 template <typename yStrideT,
303 typename Condition,
304 typename = std::enable_if_t<
305 std::is_invocable_r_v<bool, Condition, const T *const>>>
306 T *store(T *const data, const yStrideT &y_stride, Condition &&condition)
307 {
308 auto *it = data;
309 for (SizeT i = 0; i < Size; ++i) {
310 store_lane(i, it, condition);
311 it += y_stride;
312 }
313
314 return it;
315 }
316
317 template <typename yStrideT>
318 T *store(T *const data, const yStrideT &y_stride, const bool &mask)
319 {
320 return store(data, y_stride, [mask](auto &&) { return mask; });
321 }
322
323 template <typename Condition,
324 typename = std::enable_if_t<
325 std::is_invocable_r_v<bool, Condition, const T *const>>>
326 T *store(T *const data, Condition &&condition)
327 {
328 return store(data, this->size_x(), condition);
329 }
330
331 T *store(T *const data, const bool &mask)
332 {
333 return store(data, [mask](auto &&) { return mask; });
334 }
335
336 T *store(T *const data) { return store(data, true); }
337};
338
339template <typename T, uint32_t Size>
340struct RegistryWindow : public RegistryData<T, Size>
341{
342 using SizeT = typename RegistryData<T, Size>::SizeT;
343
344 using RegistryData<T, Size>::RegistryData;
345
346 template <typename shT>
347 void advance_left(const shT &shift, const T &fill_value)
348 {
349 static_assert(std::is_integral_v<shT>,
350 "shift must be of an integral type");
351
352 uint32_t shift_r = this->size_x() - shift;
353 for (SizeT i = 0; i < Size; ++i) {
354 this->data[i] = this->shift_left(i, shift);
355 auto border =
356 i < Size - 1 ? this->shift_right(i + 1, shift_r) : fill_value;
357 if (this->x() >= shift_r) {
358 this->data[i] = border;
359 }
360 }
361 }
362
363 void advance_left(const T &fill_value) { advance_left(1, fill_value); }
364
365 void advance_left()
366 {
367 constexpr T fill_value = 0;
368 advance_left(fill_value);
369 }
370};
371
372template <typename T, typename SizeT = size_t>
373class Span
374{
375public:
376 using value_type = T;
377 using size_type = SizeT;
378
379 Span(T *const data, const SizeT size) : data_(data), size_(size) {}
380
381 T *begin() const { return data(); }
382
383 T *end() const { return data() + size(); }
384
385 SizeT size() const { return size_; }
386
387 T *data() const { return data_; }
388
389protected:
390 T *const data_;
391 const SizeT size_;
392};
393
394template <typename T, typename SizeT = size_t>
395Span<T, SizeT> make_span(T *const data, const SizeT size)
396{
397 return Span<T, SizeT>(data, size);
398}
399
400template <typename T, typename SizeT = size_t>
401class PaddedSpan : public Span<T, SizeT>
402{
403public:
404 using value_type = T;
405 using size_type = SizeT;
406
407 PaddedSpan(T *const data, const SizeT size, const SizeT pad)
408 : Span<T, SizeT>(data, size), pad_(pad)
409 {
410 }
411
412 T *padded_begin() const { return this->begin() - pad(); }
413
414 SizeT pad() const { return pad_; }
415
416protected:
417 const SizeT pad_;
418};
419
420template <typename T, typename SizeT = size_t>
422 make_padded_span(T *const data, const SizeT size, const SizeT offset)
423{
424 return PaddedSpan<T, SizeT>(data, size, offset);
425}
426
427template <typename Results,
428 typename AData,
429 typename VData,
430 typename Op,
431 typename Red>
432void process_block(Results &results,
433 uint32_t r_size,
434 AData &a_data,
435 VData &v_data,
436 uint32_t block_size,
437 Op op,
438 Red red)
439{
440 for (uint32_t i = 0; i < block_size; ++i) {
441 auto v_val = v_data.broadcast(i);
442 for (uint32_t r = 0; r < r_size; ++r) {
443 results[r] = red(results[r], op(a_data[r], v_val));
444 }
445 a_data.advance_left();
446 }
447}
448
449template <typename SizeT>
450SizeT get_global_linear_id(const uint32_t wpi, const sycl::nd_item<1> &item)
451{
452 auto sbgroup = item.get_sub_group();
453 const auto sg_loc_id = sbgroup.get_local_linear_id();
454
455 const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id);
456 const SizeT id = sg_base_id + sg_loc_id;
457
458 return id;
459}
460
461template <typename SizeT>
462uint32_t get_results_num(const uint32_t wpi,
463 const SizeT size,
464 const SizeT global_id,
465 const sycl::nd_item<1> &item)
466{
467 auto sbgroup = item.get_sub_group();
468
469 const auto sbg_size = sbgroup.get_max_local_range()[0];
470 const auto size_ = sycl::sub_sat(size, global_id);
471 return std::min(SizeT(wpi), CeilDiv(size_, sbg_size));
472}
473
474template <uint32_t WorkPI,
475 typename T,
476 typename SizeT,
477 typename Op,
478 typename Red>
480
481template <uint32_t WorkPI,
482 typename T,
483 typename SizeT,
484 typename Op,
485 typename Red>
486void submit_sliding_window1d(const PaddedSpan<const T, SizeT> &a,
487 const Span<const T, SizeT> &v,
488 const Op &op,
489 const Red &red,
490 Span<T, SizeT> &out,
491 sycl::nd_range<1> nd_range,
492 sycl::handler &cgh)
493{
495 nd_range, [=](sycl::nd_item<1> item) {
496 auto glid = get_global_linear_id<SizeT>(WorkPI, item);
497
498 auto results = RegistryData<T, WorkPI>(item);
499 results.fill(0);
500
501 auto results_num = get_results_num(WorkPI, out.size(), glid, item);
502
503 const auto *a_begin = a.begin();
504 const auto *a_end = a.end();
505
506 auto sbgroup = item.get_sub_group();
507
508 const auto chunks_count =
509 CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);
510
511 const auto *a_ptr = &a.padded_begin()[glid];
512
513 auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
514 return ptr >= a_begin && ptr < a_end;
515 };
516
517 auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
518 a_ptr = a_data.load(a_ptr, _a_load_cond, 0);
519
520 const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
521 auto v_size = v.size();
522
523 for (uint32_t b = 0; b < chunks_count; ++b) {
524 auto v_data = RegistryData<const T>(item);
525 v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
526
527 uint32_t chunk_size_ =
528 std::min(v_size, SizeT(v_data.total_size()));
529 process_block(results, results_num, a_data, v_data, chunk_size_,
530 op, red);
531
532 if (b != chunks_count - 1) {
533 a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
534 _a_load_cond, 0);
535 v_size -= v_data.total_size();
536 }
537 }
538
539 auto *const out_ptr = out.begin();
540 // auto *const out_end = out.end();
541
542 auto y_start = glid;
543 auto y_stop =
544 std::min(y_start + WorkPI * results.size_x(), out.size());
545 uint32_t i = 0;
546 for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
547 out_ptr[y] = results[i++];
548 }
549 // while the code itself seems to be valid, inside correlate
550 // kernel it results in memory corruption. Further investigation
551 // is needed. SAT-7693
552 // corruption results.store(&out_ptr[glid],
553 // [out_end](auto &&ptr) { return ptr < out_end; });
554 });
555}
556
557template <uint32_t WorkPI,
558 typename T,
559 typename SizeT,
560 typename Op,
561 typename Red>
563
564template <uint32_t WorkPI,
565 typename T,
566 typename SizeT,
567 typename Op,
568 typename Red>
569void submit_sliding_window1d_small_kernel(const PaddedSpan<const T, SizeT> &a,
570 const Span<const T, SizeT> &v,
571 const Op &op,
572 const Red &red,
573 Span<T, SizeT> &out,
574 sycl::nd_range<1> nd_range,
575 sycl::handler &cgh)
576{
578 nd_range, [=](sycl::nd_item<1> item) {
579 auto glid = get_global_linear_id<SizeT>(WorkPI, item);
580
581 auto results = RegistryData<T, WorkPI>(item);
582 results.fill(0);
583
584 auto sbgroup = item.get_sub_group();
585 auto sg_size = sbgroup.get_max_local_range()[0];
586
587 const uint32_t to_read = WorkPI * sg_size + v.size();
588 const auto *a_begin = a.begin();
589
590 const auto *a_ptr = &a.padded_begin()[glid];
591 const auto *a_end = std::min(a_ptr + to_read, a.end());
592
593 auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
594 return ptr >= a_begin && ptr < a_end;
595 };
596
597 auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
598 a_data.load(a_ptr, _a_load_cond, 0);
599
600 const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
601 auto v_size = v.size();
602
603 auto v_data = RegistryData<const T>(item);
604 v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
605
606 auto results_num = get_results_num(WorkPI, out.size(), glid, item);
607
608 process_block(results, results_num, a_data, v_data, v_size, op,
609 red);
610
611 auto *const out_ptr = out.begin();
612 // auto *const out_end = out.end();
613
614 auto y_start = glid;
615 auto y_stop =
616 std::min(y_start + WorkPI * results.size_x(), out.size());
617 uint32_t i = 0;
618 for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
619 out_ptr[y] = results[i++];
620 }
621 // while the code itself seems to be valid, inside correlate
622 // kernel it results in memory corruption. Further investigation
623 // is needed. SAT-7693
624 // corruption results.store(&out_ptr[glid],
625 // [out_end](auto &&ptr) { return ptr < out_end; });
626 });
627}
628
629void validate(const usm_ndarray &a,
630 const usm_ndarray &v,
631 const usm_ndarray &out,
632 const size_t l_pad,
633 const size_t r_pad);
634} // namespace statistics::sliding_window1d