// DPNP C++ backend kernel library 0.18.0dev0
// Data Parallel Extension for NumPy*
// histogram_common.hpp
//*****************************************************************************
// Copyright (c) 2024-2025, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include <cstdint>
#include <optional>

#include <sycl/sycl.hpp>

#include "common.hpp"

32namespace dpctl::tensor
33{
34class usm_ndarray;
35}
36
37using dpctl::tensor::usm_ndarray;
38
39namespace statistics
40{
41using common::AtomicOp;
42using common::IsNan;
43using common::Less;
44
45namespace histogram
46{
47
48template <typename T, int Dims>
50{
51 static constexpr bool const sync_after_init = true;
52 using Shape = sycl::range<Dims>;
53 using value_type = T;
54 using pointer_type = value_type *;
55 static constexpr auto dims = Dims;
56
57 using ncT = typename std::remove_const<value_type>::type;
58 using LocalData = sycl::local_accessor<ncT, Dims>;
59
60 CachedData(T *global_data, Shape shape, sycl::handler &cgh)
61 {
62 this->global_data = global_data;
63 local_data = LocalData(shape, cgh);
64 }
65
66 T *get_ptr() const
67 {
68 return &local_data[0];
69 }
70
71 template <int _Dims>
72 void init(const sycl::nd_item<_Dims> &item) const
73 {
74 uint32_t llid = item.get_local_linear_id();
75 auto local_ptr = &local_data[0];
76 uint32_t size = local_data.size();
77 auto group = item.get_group();
78 uint32_t local_size = group.get_local_linear_range();
79
80 for (uint32_t i = llid; i < size; i += local_size) {
81 local_ptr[i] = global_data[i];
82 }
83 }
84
85 size_t size() const
86 {
87 return local_data.size();
88 }
89
90 T &operator[](const sycl::id<Dims> &id) const
91 {
92 return local_data[id];
93 }
94
95 template <typename = std::enable_if_t<Dims == 1>>
96 T &operator[](const size_t id) const
97 {
98 return local_data[id];
99 }
100
101private:
102 LocalData local_data;
103 value_type *global_data = nullptr;
104};
105
106template <typename T, int Dims>
108{
109 static constexpr bool const sync_after_init = false;
110 using Shape = sycl::range<Dims>;
111 using value_type = T;
112 using pointer_type = value_type *;
113 static constexpr auto dims = Dims;
114
115 UncachedData(T *global_data, const Shape &shape, sycl::handler &)
116 {
117 this->global_data = global_data;
118 _shape = shape;
119 }
120
121 T *get_ptr() const
122 {
123 return global_data;
124 }
125
126 template <int _Dims>
127 void init(const sycl::nd_item<_Dims> &) const
128 {
129 }
130
131 size_t size() const
132 {
133 return _shape.size();
134 }
135
136 T &operator[](const sycl::id<Dims> &id) const
137 {
138 return global_data[id];
139 }
140
141 template <typename = std::enable_if_t<Dims == 1>>
142 T &operator[](const size_t id) const
143 {
144 return global_data[id];
145 }
146
147private:
148 T *global_data = nullptr;
149 Shape _shape;
150};
151
// Maps a histogram value type to the counter type used in local memory.
// 64-bit integer bins are narrowed to 32-bit local counters — presumably
// local copies are flushed to the global histogram before they can
// overflow, and 32-bit local atomics are cheaper (TODO confirm against the
// kernels that use this trait).
template <typename T>
struct HistLocalType
{
    using type = T;
};

template <>
struct HistLocalType<uint64_t>
{
    using type = uint32_t;
};

template <>
struct HistLocalType<int64_t>
{
    using type = int32_t;
};

170template <typename T, typename localT = typename HistLocalType<T>::type>
172{
173 static constexpr bool const sync_after_init = true;
174 static constexpr bool const sync_before_finalize = true;
175
176 using LocalHist = sycl::local_accessor<localT, 2>;
177
178 HistWithLocalCopies(T *global_data,
179 size_t bins_count,
180 int32_t copies_count,
181 sycl::handler &cgh)
182 {
183 local_hist = LocalHist(sycl::range<2>(copies_count, bins_count), cgh);
184 global_hist = global_data;
185 }
186
187 template <int _Dims>
188 void init(const sycl::nd_item<_Dims> &item, localT val = 0) const
189 {
190 uint32_t llid = item.get_local_linear_id();
191 auto *local_ptr = &local_hist[0][0];
192 uint32_t size = local_hist.size();
193 auto group = item.get_group();
194 uint32_t local_size = group.get_local_linear_range();
195
196 for (uint32_t i = llid; i < size; i += local_size) {
197 local_ptr[i] = val;
198 }
199 }
200
201 template <int _Dims>
202 void add(const sycl::nd_item<_Dims> &item, int32_t bin, localT value) const
203 {
204 int32_t llid = item.get_local_linear_id();
205 int32_t local_hist_count = local_hist.get_range().get(0);
206 int32_t local_copy_id =
207 local_hist_count == 1 ? 0 : llid % local_hist_count;
208
209 AtomicOp<localT, sycl::memory_order::relaxed,
210 sycl::memory_scope::work_group>::add(local_hist[local_copy_id]
211 [bin],
212 value);
213 }
214
215 template <int _Dims>
216 void finalize(const sycl::nd_item<_Dims> &item) const
217 {
218 uint32_t llid = item.get_local_linear_id();
219 uint32_t bins_count = local_hist.get_range().get(1);
220 uint32_t local_hist_count = local_hist.get_range().get(0);
221 auto group = item.get_group();
222 uint32_t local_size = group.get_local_linear_range();
223
224 for (uint32_t i = llid; i < bins_count; i += local_size) {
225 auto value = local_hist[0][i];
226 for (uint32_t lhc = 1; lhc < local_hist_count; ++lhc) {
227 value += local_hist[lhc][i];
228 }
229 if (value != T(0)) {
230 AtomicOp<T, sycl::memory_order::relaxed,
231 sycl::memory_scope::device>::add(global_hist[i],
232 value);
233 }
234 }
235 }
236
237 uint32_t size() const
238 {
239 return local_hist.size();
240 }
241
242private:
243 LocalHist local_hist;
244 T *global_hist = nullptr;
245};
246
247template <typename T>
249{
250 static constexpr bool const sync_after_init = false;
251 static constexpr bool const sync_before_finalize = false;
252
253 HistGlobalMemory(T *global_data)
254 {
255 global_hist = global_data;
256 }
257
258 template <int _Dims>
259 void init(const sycl::nd_item<_Dims> &) const
260 {
261 }
262
263 template <int _Dims>
264 void add(const sycl::nd_item<_Dims> &, int32_t bin, T value) const
265 {
266 AtomicOp<T, sycl::memory_order::relaxed,
267 sycl::memory_scope::device>::add(global_hist[bin], value);
268 }
269
270 template <int _Dims>
271 void finalize(const sycl::nd_item<_Dims> &) const
272 {
273 }
274
275private:
276 T *global_hist = nullptr;
277};
278
// Weight provider for the unweighted case: every sample contributes 1.
template <typename T = uint32_t>
struct NoWeights
{
    constexpr T get(size_t) const
    {
        return 1;
    }
};

// Weight provider backed by a caller-supplied array: sample i contributes
// data[i]. Non-owning — `weights` must outlive this object.
template <typename T>
struct Weights
{
    Weights(T *weights)
    {
        data = weights;
    }

    T get(size_t id) const
    {
        return data[id];
    }

private:
    T *data = nullptr;
};

305template <typename dT>
306bool check_in_bounds(const dT &val, const dT &min, const dT &max)
307{
308 Less<dT> _less;
309 return !_less(val, min) && !_less(max, val) && !IsNan<dT>::isnan(val);
310}
311
312template <typename T, typename HistImpl, typename Edges, typename Weights>
314
315template <typename T, typename HistImpl, typename Edges, typename Weights>
316void submit_histogram(const T *in,
317 const size_t size,
318 const size_t dims,
319 const uint32_t WorkPI,
320 const HistImpl &hist,
321 const Edges &edges,
322 const Weights &weights,
323 sycl::nd_range<1> nd_range,
324 sycl::handler &cgh)
325{
327 nd_range, [=](sycl::nd_item<1> item) {
328 auto id = item.get_group_linear_id();
329 auto lid = item.get_local_linear_id();
330 auto group = item.get_group();
331 auto local_size = item.get_local_range(0);
332
333 hist.init(item);
334 edges.init(item);
335
336 if constexpr (HistImpl::sync_after_init || Edges::sync_after_init) {
337 sycl::group_barrier(group, sycl::memory_scope::work_group);
338 }
339
340 auto bounds = edges.get_bounds();
341
342 for (uint32_t i = 0; i < WorkPI; ++i) {
343 auto data_idx = id * WorkPI * local_size + i * local_size + lid;
344 if (data_idx < size) {
345 auto *d = &in[data_idx * dims];
346
347 if (edges.in_bounds(d, bounds)) {
348 auto bin = edges.get_bin(item, d, bounds);
349 auto weight = weights.get(data_idx);
350 hist.add(item, bin, weight);
351 }
352 }
353 }
354
355 if constexpr (HistImpl::sync_before_finalize) {
356 sycl::group_barrier(group, sycl::memory_scope::work_group);
357 }
358
359 hist.finalize(item);
360 });
361}
362
363void validate(const usm_ndarray &sample,
364 const std::optional<const dpctl::tensor::usm_ndarray> &bins,
365 const std::optional<const dpctl::tensor::usm_ndarray> &weights,
366 const usm_ndarray &histogram);
367
368uint32_t get_local_hist_copies_count(uint32_t loc_mem_size_in_items,
369 uint32_t local_size,
370 uint32_t hist_size_in_items);
371
372} // namespace histogram
373} // namespace statistics