DPNP C++ backend kernel library 0.20.0dev4
Data Parallel Extension for NumPy*
Loading...
Searching...
No Matches
sliding_window1d.hpp
1//*****************************************************************************
2// Copyright (c) 2026, Intel Corporation
3// All rights reserved.
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7// - Redistributions of source code must retain the above copyright notice,
8// this list of conditions and the following disclaimer.
9// - Redistributions in binary form must reproduce the above copyright notice,
10// this list of conditions and the following disclaimer in the documentation
11// and/or other materials provided with the distribution.
12// - Neither the name of the copyright holder nor the names of its contributors
13// may be used to endorse or promote products derived from this software
14// without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
20// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26// THE POSSIBILITY OF SUCH DAMAGE.
27//*****************************************************************************
28
29#pragma once
30
31#include <algorithm>
32#include <cstdint>
33
34#include <sycl/sycl.hpp>
35
36#include "ext/common.hpp"
37
38namespace dpnp::kernels::sliding_window1d
39{
40using ext::common::CeilDiv;
41
42namespace detail
43{
44template <typename SizeT>
45SizeT get_global_linear_id(const std::uint32_t wpi,
46 const sycl::nd_item<1> &item)
47{
48 auto sbgroup = item.get_sub_group();
49 const auto sg_loc_id = sbgroup.get_local_linear_id();
50
51 const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id);
52 const SizeT id = sg_base_id + sg_loc_id;
53
54 return id;
55}
56
57template <typename SizeT>
58std::uint32_t get_results_num(const std::uint32_t wpi,
59 const SizeT size,
60 const SizeT global_id,
61 const sycl::nd_item<1> &item)
62{
63 auto sbgroup = item.get_sub_group();
64
65 const auto sbg_size = sbgroup.get_max_local_range()[0];
66 const auto size_ = sycl::sub_sat(size, global_id);
67 return std::min(SizeT(wpi), CeilDiv(size_, sbg_size));
68}
69
70template <typename Results,
71 typename AData,
72 typename VData,
73 typename Op,
74 typename Red>
75void process_block(Results &results,
76 std::uint32_t r_size,
77 AData &a_data,
78 VData &v_data,
79 std::uint32_t block_size,
80 Op op,
81 Red red)
82{
83 for (std::uint32_t i = 0; i < block_size; ++i) {
84 auto v_val = v_data.broadcast(i);
85 for (std::uint32_t r = 0; r < r_size; ++r) {
86 results[r] = red(results[r], op(a_data[r], v_val));
87 }
88 a_data.advance_left();
89 }
90}
91} // namespace detail
92
93template <std::uint32_t WorkPI,
94 typename SpanT,
95 typename KernelT,
96 typename OpT,
97 typename RedT,
98 typename ResultT,
99 template <typename, std::uint32_t> class RegistryDataT,
100 template <typename, std::uint32_t> class RegistryWindowT>
102{
103private:
104 const SpanT a;
105 const KernelT v;
106 const OpT op;
107 const RedT red;
108 ResultT out;
109
110 static constexpr std::uint32_t default_reg_data_size = 1;
111 using SizeT = typename SpanT::size_type;
112
113public:
114 SlidingWindow1dFunctor(const SpanT &a_,
115 const KernelT &v_,
116 const OpT &op_,
117 const RedT &red_,
118 ResultT &out_)
119 : a(a_), v(v_), op(op_), red(red_), out(out_)
120 {
121 }
122
123 void operator()(sycl::nd_item<1> item) const
124 {
125 auto glid = detail::get_global_linear_id<SizeT>(WorkPI, item);
126
127 auto results =
128 RegistryDataT<typename ResultT::value_type, WorkPI>(item);
129 results.fill(0);
130
131 auto results_num =
132 detail::get_results_num<SizeT>(WorkPI, out.size(), glid, item);
133
134 const auto *a_begin = a.begin();
135 const auto *a_end = a.end();
136
137 auto sbgroup = item.get_sub_group();
138
139 const auto chunks_count =
140 CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);
141
142 const auto *a_ptr = &a.padded_begin()[glid];
143
144 auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
145 return ptr >= a_begin && ptr < a_end;
146 };
147
148 auto a_data =
149 RegistryWindowT<typename SpanT::value_type, WorkPI + 1>(item);
150 a_ptr = a_data.load(a_ptr, _a_load_cond, 0);
151
152 const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
153 auto v_size = v.size();
154
155 for (std::uint32_t b = 0; b < chunks_count; ++b) {
156 auto v_data = RegistryDataT<typename KernelT::value_type,
157 default_reg_data_size>(item);
158 v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
159
160 std::uint32_t chunk_size_ =
161 std::min(v_size, SizeT(v_data.total_size()));
162 detail::process_block(results, results_num, a_data, v_data,
163 chunk_size_, op, red);
164
165 if (b != chunks_count - 1) {
166 a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
167 _a_load_cond, 0);
168 v_size -= v_data.total_size();
169 }
170 }
171
172 auto *const out_ptr = out.begin();
173 // auto *const out_end = out.end();
174
175 auto y_start = glid;
176 auto y_stop = std::min(y_start + WorkPI * results.size_x(), out.size());
177 std::uint32_t i = 0;
178 for (std::uint32_t y = y_start; y < y_stop; y += results.size_x()) {
179 out_ptr[y] = results[i++];
180 }
181 // while the code itself seems to be valid, inside correlate
182 // kernel it results in memory corruption. Further investigation
183 // is needed. SAT-7693
184 // corruption results.store(&out_ptr[glid],
185 // [out_end](auto &&ptr) { return ptr < out_end; });
186 }
187};
188
189template <std::uint32_t WorkPI,
190 typename SpanT,
191 typename KernelT,
192 typename OpT,
193 typename RedT,
194 typename ResultT,
195 template <typename, std::uint32_t> class RegistryDataT,
196 template <typename, std::uint32_t> class RegistryWindowT>
198{
199private:
200 const SpanT a;
201 const KernelT v;
202 const OpT op;
203 const RedT red;
204 ResultT out;
205
206 static constexpr std::uint32_t default_reg_data_size = 1;
207 using SizeT = typename SpanT::size_type;
208
209public:
210 SlidingWindow1dSmallFunctor(const SpanT &a_,
211 const KernelT &v_,
212 const OpT &op_,
213 const RedT &red_,
214 ResultT &out_)
215 : a(a_), v(v_), op(op_), red(red_), out(out_)
216 {
217 }
218
219 void operator()(sycl::nd_item<1> item) const
220 {
221 auto glid = detail::get_global_linear_id<SizeT>(WorkPI, item);
222
223 auto results =
224 RegistryDataT<typename ResultT::value_type, WorkPI>(item);
225 results.fill(0);
226
227 auto sbgroup = item.get_sub_group();
228 auto sg_size = sbgroup.get_max_local_range()[0];
229
230 const std::uint32_t to_read = WorkPI * sg_size + v.size();
231 const auto *a_begin = a.begin();
232
233 const auto *a_ptr = &a.padded_begin()[glid];
234 const auto *a_end = std::min(a_ptr + to_read, a.end());
235
236 auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
237 return ptr >= a_begin && ptr < a_end;
238 };
239
240 auto a_data =
241 RegistryWindowT<typename SpanT::value_type, WorkPI + 1>(item);
242 a_data.load(a_ptr, _a_load_cond, 0);
243
244 const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
245 auto v_size = v.size();
246
247 auto v_data =
248 RegistryDataT<typename KernelT::value_type, default_reg_data_size>(
249 item);
250 v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
251
252 auto results_num =
253 detail::get_results_num<SizeT>(WorkPI, out.size(), glid, item);
254
255 detail::process_block(results, results_num, a_data, v_data, v_size, op,
256 red);
257
258 auto *const out_ptr = out.begin();
259 // auto *const out_end = out.end();
260
261 auto y_start = glid;
262 auto y_stop = std::min(y_start + WorkPI * results.size_x(), out.size());
263 std::uint32_t i = 0;
264 for (std::uint32_t y = y_start; y < y_stop; y += results.size_x()) {
265 out_ptr[y] = results[i++];
266 }
267 // while the code itself seems to be valid, inside correlate
268 // kernel it results in memory corruption. Further investigation
269 // is needed. SAT-7693
270 // corruption results.store(&out_ptr[glid],
271 // [out_end](auto &&ptr) { return ptr < out_end; });
272 }
273};
274} // namespace dpnp::kernels::sliding_window1d