DPNP C++ backend kernel library 0.19.0dev2
Data Parallel Extension for NumPy*
isclose.hpp
//*****************************************************************************
// Copyright (c) 2025, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include <algorithm>
#include <complex>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <vector>

#include <sycl/sycl.hpp>
// dpctl tensor headers
#include "kernels/alignment.hpp"
#include "kernels/dpctl_tensor_types.hpp"
#include "kernels/elementwise_functions/sycl_complex.hpp"
#include "utils/offset_utils.hpp"
#include "utils/sycl_utils.hpp"
#include "utils/type_utils.hpp"

namespace dpnp::kernels::isclose
{

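// Element-wise closeness check for real floating-point values, following the
// NumPy convention: finite values are close when |a - b| <= atol + rtol * |b|,
// NaNs compare as close only when `equal_nan` is true, and non-finite values
// must compare equal.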
template <typename T>
inline bool isclose(const T a,
                    const T b,
                    const T rtol,
                    const T atol,
                    const bool equal_nan)
{
    static_assert(std::is_floating_point_v<T> || std::is_same_v<T, sycl::half>);

    if (sycl::isfinite(a) && sycl::isfinite(b)) {
        return sycl::fabs(a - b) <= atol + rtol * sycl::fabs(b);
    }

    if (sycl::isnan(a) && sycl::isnan(b)) {
        return equal_nan;
    }

    return a == b;
}

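// Overload for std::complex values: the same tolerance test is applied to the
// modulus of the complex difference; the equal_nan branch is taken only when
// both the real and imaginary parts of both operands are NaN.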
template <typename T>
inline bool isclose(const std::complex<T> a,
                    const std::complex<T> b,
                    const T rtol,
                    const T atol,
                    const bool equal_nan)
{
    const bool a_finite = sycl::isfinite(a.real()) && sycl::isfinite(a.imag());
    const bool b_finite = sycl::isfinite(b.real()) && sycl::isfinite(b.imag());

    if (a_finite && b_finite) {
        return exprm_ns::abs(exprm_ns::complex<T>(a - b)) <=
               atol + rtol * exprm_ns::abs(exprm_ns::complex<T>(b));
    }

    if (sycl::isnan(a.real()) && sycl::isnan(a.imag()) &&
        sycl::isnan(b.real()) && sycl::isnan(b.imag()))
    {
        return equal_nan;
    }

    return a == b;
}

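// Functor for the strided kernel: resolves the input and output offsets of a
// work-item through a three-offset indexer and writes the boolean result of
// isclose() for the corresponding element pair.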
template <typename T,
          typename scT,
          typename resTy,
          typename ThreeOffsets_IndexerT>
struct IsCloseStridedScalarFunctor
{
private:
    const T *a_ = nullptr;
    const T *b_ = nullptr;
    resTy *out_ = nullptr;
    const ThreeOffsets_IndexerT three_offsets_indexer_;
    const scT rtol_;
    const scT atol_;
    const bool equal_nan_;

public:
    IsCloseStridedScalarFunctor(const T *a,
                                const T *b,
                                resTy *out,
                                const ThreeOffsets_IndexerT &inps_res_indexer,
                                const scT rtol,
                                const scT atol,
                                const bool equal_nan)
        : a_(a), b_(b), out_(out), three_offsets_indexer_(inps_res_indexer),
          rtol_(rtol), atol_(atol), equal_nan_(equal_nan)
    {
    }

    void operator()(sycl::id<1> wid) const
    {
        const auto &three_offsets_ = three_offsets_indexer_(wid.get(0));
        const dpctl::tensor::ssize_t &inp1_offset =
            three_offsets_.get_first_offset();
        const dpctl::tensor::ssize_t &inp2_offset =
            three_offsets_.get_second_offset();
        const dpctl::tensor::ssize_t &out_offset =
            three_offsets_.get_third_offset();

        out_[out_offset] =
            isclose(a_[inp1_offset], b_[inp2_offset], rtol_, atol_, equal_nan_);
    }
};

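// Functor for the contiguous kernel: each work-item handles n_vecs * vec_sz
// elements. When sub-group load/store is enabled and the type is not complex,
// vectorized sub-group loads/stores are used; otherwise the work falls back to
// a scalar loop strided by the sub-group size.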
template <typename T,
          typename scT,
          typename resTy,
          std::uint8_t vec_sz = 4u,
          std::uint8_t n_vecs = 2u,
          bool enable_sg_loadstore = true>
struct IsCloseContigScalarFunctor
{
private:
    const T *a_ = nullptr;
    const T *b_ = nullptr;
    resTy *out_ = nullptr;
    std::size_t nelems_;
    const scT rtol_;
    const scT atol_;
    const bool equal_nan_;

public:
    IsCloseContigScalarFunctor(const T *a,
                               const T *b,
                               resTy *out,
                               const std::size_t n_elems,
                               const scT rtol,
                               const scT atol,
                               const bool equal_nan)
        : a_(a), b_(b), out_(out), nelems_(n_elems), rtol_(rtol), atol_(atol),
          equal_nan_(equal_nan)
    {
    }

    void operator()(sycl::nd_item<1> ndit) const
    {
        constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
        /* Each work-item processes n_vecs * vec_sz elements, with each chunk
           of vec_sz elements contiguous in memory */
        /* NOTE: work-group size must be divisible by sub-group size */

        using dpctl::tensor::type_utils::is_complex_v;
        if constexpr (enable_sg_loadstore && !is_complex_v<T>) {
            auto sg = ndit.get_sub_group();
            const std::uint16_t sgSize = sg.get_max_local_range()[0];
            const std::size_t base =
                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
                                sg.get_group_id()[0] * sgSize);

            if (base + elems_per_wi * sgSize < nelems_) {
                using dpctl::tensor::sycl_utils::sub_group_load;
                using dpctl::tensor::sycl_utils::sub_group_store;
#pragma unroll
                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
                    const std::size_t offset = base + it * sgSize;
                    auto a_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&a_[offset]);
                    auto b_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&b_[offset]);
                    auto out_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&out_[offset]);

                    const sycl::vec<T, vec_sz> a_vec =
                        sub_group_load<vec_sz>(sg, a_multi_ptr);
                    const sycl::vec<T, vec_sz> b_vec =
                        sub_group_load<vec_sz>(sg, b_multi_ptr);

                    sycl::vec<resTy, vec_sz> res_vec;
#pragma unroll
                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
                        res_vec[vec_id] = isclose(a_vec[vec_id], b_vec[vec_id],
                                                  rtol_, atol_, equal_nan_);
                    }
                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
                }
            }
            else {
                const std::size_t lane_id = sg.get_local_id()[0];
                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
                    out_[k] = isclose(a_[k], b_[k], rtol_, atol_, equal_nan_);
                }
            }
        }
        else {
            const std::uint16_t sgSize =
                ndit.get_sub_group().get_local_range()[0];
            const std::size_t gid = ndit.get_global_linear_id();
            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;

            const std::size_t start =
                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
            const std::size_t end = std::min(nelems_, start + elems_per_sg);
            for (std::size_t offset = start; offset < end; offset += sgSize) {
                out_[offset] =
                    isclose(a_[offset], b_[offset], rtol_, atol_, equal_nan_);
            }
        }
    }
};

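// Submits the strided kernel: inputs and output are addressed through a
// ThreeOffsets_StridedIndexer built from the packed shape/strides array, and
// one work-item is launched per element.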
template <typename T, typename scT>
sycl::event
    isclose_strided_scalar_impl(sycl::queue &exec_q,
                                const int nd,
                                std::size_t nelems,
                                const dpctl::tensor::ssize_t *shape_strides,
                                const scT rtol,
                                const scT atol,
                                const bool equal_nan,
                                const char *a_cp,
                                const dpctl::tensor::ssize_t a_offset,
                                const char *b_cp,
                                const dpctl::tensor::ssize_t b_offset,
                                char *out_cp,
                                const dpctl::tensor::ssize_t out_offset,
                                const std::vector<sycl::event> &depends)
{
    dpctl::tensor::type_utils::validate_type_for_device<T>(exec_q);

    const T *a_tp = reinterpret_cast<const T *>(a_cp);
    const T *b_tp = reinterpret_cast<const T *>(b_cp);

    using resTy = bool;
    resTy *out_tp = reinterpret_cast<resTy *>(out_cp);

    using IndexerT =
        typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
    const IndexerT indexer{nd, a_offset, b_offset, out_offset, shape_strides};

    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        using IsCloseFunc =
            IsCloseStridedScalarFunctor<T, scT, resTy, IndexerT>;
        cgh.parallel_for<IsCloseFunc>(
            {nelems},
            IsCloseFunc(a_tp, b_tp, out_tp, indexer, rtol, atol, equal_nan));
    });
    return comp_ev;
}

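// Submits the contiguous kernel: picks a work-group size based on an empirical
// problem-size threshold and dispatches to the vectorized functor when all
// three pointers satisfy the required alignment, falling back to the
// non-sub-group variant otherwise.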
template <typename T,
          typename scT,
          std::uint8_t vec_sz = 4u,
          std::uint8_t n_vecs = 2u>
sycl::event
    isclose_contig_scalar_impl(sycl::queue &exec_q,
                               std::size_t nelems,
                               const scT rtol,
                               const scT atol,
                               const bool equal_nan,
                               const char *a_cp,
                               const char *b_cp,
                               char *out_cp,
                               const std::vector<sycl::event> &depends = {})
{
    constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
    const std::size_t n_work_items_needed = nelems / elems_per_wi;
    const std::size_t empirical_threshold = std::size_t(1) << 21;
    const std::size_t lws = (n_work_items_needed <= empirical_threshold)
                                ? std::size_t(128)
                                : std::size_t(256);

    const std::size_t n_groups =
        ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi));
    const auto gws_range = sycl::range<1>(n_groups * lws);
    const auto lws_range = sycl::range<1>(lws);

    const T *a_tp = reinterpret_cast<const T *>(a_cp);
    const T *b_tp = reinterpret_cast<const T *>(b_cp);

    using resTy = bool;
    resTy *out_tp = reinterpret_cast<resTy *>(out_cp);

    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        using dpctl::tensor::kernels::alignment_utils::is_aligned;
        using dpctl::tensor::kernels::alignment_utils::required_alignment;
        if (is_aligned<required_alignment>(a_tp) &&
            is_aligned<required_alignment>(b_tp) &&
            is_aligned<required_alignment>(out_tp))
        {
            constexpr bool enable_sg_loadstore = true;
            using IsCloseFunc =
                IsCloseContigScalarFunctor<T, scT, resTy, vec_sz, n_vecs,
                                           enable_sg_loadstore>;

            cgh.parallel_for<IsCloseFunc>(
                sycl::nd_range<1>(gws_range, lws_range),
                IsCloseFunc(a_tp, b_tp, out_tp, nelems, rtol, atol, equal_nan));
        }
        else {
            constexpr bool disable_sg_loadstore = false;
            using IsCloseFunc =
                IsCloseContigScalarFunctor<T, scT, resTy, vec_sz, n_vecs,
                                           disable_sg_loadstore>;

            cgh.parallel_for<IsCloseFunc>(
                sycl::nd_range<1>(gws_range, lws_range),
                IsCloseFunc(a_tp, b_tp, out_tp, nelems, rtol, atol, equal_nan));
        }
    });

    return comp_ev;
}

} // namespace dpnp::kernels::isclose
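
A minimal host-side usage sketch (not part of the header), assuming USM device allocations of float and a bool output of the same length; the fill step is elided and the chosen tolerances are illustrative:

    #include "isclose.hpp"

    #include <cstddef>
    #include <sycl/sycl.hpp>

    int main()
    {
        sycl::queue q;
        const std::size_t n = 1024;

        // USM device allocations for the two inputs and the boolean result
        float *a = sycl::malloc_device<float>(n, q);
        float *b = sycl::malloc_device<float>(n, q);
        bool *out = sycl::malloc_device<bool>(n, q);
        // ... fill a and b ...

        // T = float, scT = float; vec_sz and n_vecs keep their defaults
        sycl::event ev =
            dpnp::kernels::isclose::isclose_contig_scalar_impl<float, float>(
                q, n, /*rtol=*/1e-5f, /*atol=*/1e-8f, /*equal_nan=*/false,
                reinterpret_cast<const char *>(a),
                reinterpret_cast<const char *>(b),
                reinterpret_cast<char *>(out));
        ev.wait();

        sycl::free(a, q);
        sycl::free(b, q);
        sycl::free(out, q);
        return 0;
    }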