110 static constexpr std::uint32_t default_reg_data_size = 1;
111 using SizeT =
typename SpanT::size_type;
119 : a(a_), v(v_), op(op_), red(red_), out(out_)
123 void operator()(sycl::nd_item<1> item)
const
125 auto glid = detail::get_global_linear_id<SizeT>(WorkPI, item);
128 RegistryDataT<typename ResultT::value_type, WorkPI>(item);
132 detail::get_results_num<SizeT>(WorkPI, out.size(), glid, item);
134 const auto *a_begin = a.begin();
135 const auto *a_end = a.end();
137 auto sbgroup = item.get_sub_group();
139 const auto chunks_count =
140 CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);
142 const auto *a_ptr = &a.padded_begin()[glid];
144 auto _a_load_cond = [a_begin, a_end](
auto &&ptr) {
145 return ptr >= a_begin && ptr < a_end;
149 RegistryWindowT<typename SpanT::value_type, WorkPI + 1>(item);
150 a_ptr = a_data.load(a_ptr, _a_load_cond, 0);
152 const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
153 auto v_size = v.size();
155 for (std::uint32_t b = 0; b < chunks_count; ++b) {
156 auto v_data = RegistryDataT<
typename KernelT::value_type,
157 default_reg_data_size>(item);
158 v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
160 std::uint32_t chunk_size_ =
161 std::min(v_size, SizeT(v_data.total_size()));
162 detail::process_block(results, results_num, a_data, v_data,
163 chunk_size_, op, red);
165 if (b != chunks_count - 1) {
166 a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
168 v_size -= v_data.total_size();
172 auto *
const out_ptr = out.begin();
176 auto y_stop = std::min(y_start + WorkPI * results.size_x(), out.size());
178 for (std::uint32_t y = y_start; y < y_stop; y += results.size_x()) {
179 out_ptr[y] = results[i++];
206 static constexpr std::uint32_t default_reg_data_size = 1;
207 using SizeT =
typename SpanT::size_type;
215 : a(a_), v(v_), op(op_), red(red_), out(out_)
219 void operator()(sycl::nd_item<1> item)
const
221 auto glid = detail::get_global_linear_id<SizeT>(WorkPI, item);
224 RegistryDataT<typename ResultT::value_type, WorkPI>(item);
227 auto sbgroup = item.get_sub_group();
228 auto sg_size = sbgroup.get_max_local_range()[0];
230 const std::uint32_t to_read = WorkPI * sg_size + v.size();
231 const auto *a_begin = a.begin();
233 const auto *a_ptr = &a.padded_begin()[glid];
234 const auto *a_end = std::min(a_ptr + to_read, a.end());
236 auto _a_load_cond = [a_begin, a_end](
auto &&ptr) {
237 return ptr >= a_begin && ptr < a_end;
241 RegistryWindowT<typename SpanT::value_type, WorkPI + 1>(item);
242 a_data.load(a_ptr, _a_load_cond, 0);
244 const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
245 auto v_size = v.size();
248 RegistryDataT<typename KernelT::value_type, default_reg_data_size>(
250 v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
253 detail::get_results_num<SizeT>(WorkPI, out.size(), glid, item);
255 detail::process_block(results, results_num, a_data, v_data, v_size, op,
258 auto *
const out_ptr = out.begin();
262 auto y_stop = std::min(y_start + WorkPI * results.size_x(), out.size());
264 for (std::uint32_t y = y_start; y < y_stop; y += results.size_x()) {
265 out_ptr[y] = results[i++];