DPNP C++ backend kernel library 0.20.0dev0
Data Parallel Extension for NumPy*
isclose.hpp
//*****************************************************************************
// Copyright (c) 2025, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// - Neither the name of the copyright holder nor the names of its contributors
//   may be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include <algorithm>
#include <complex>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <vector>

#include <sycl/sycl.hpp>
// dpctl tensor headers
#include "kernels/alignment.hpp"
#include "kernels/dpctl_tensor_types.hpp"
#include "kernels/elementwise_functions/sycl_complex.hpp"
#include "utils/offset_utils.hpp"
#include "utils/sycl_utils.hpp"
#include "utils/type_utils.hpp"

namespace dpnp::kernels::isclose
{

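// Scalar element-wise comparison following the NumPy isclose() convention:
// two finite values are close when |a - b| <= atol + rtol * |b|. NaNs compare
// equal only when equal_nan is true; infinities are close only when equal.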
template <typename T>
inline bool isclose(const T a,
                    const T b,
                    const T rtol,
                    const T atol,
                    const bool equal_nan)
{
    static_assert(std::is_floating_point_v<T> || std::is_same_v<T, sycl::half>);

    if (sycl::isfinite(a) && sycl::isfinite(b)) {
        return sycl::fabs(a - b) <= atol + rtol * sycl::fabs(b);
    }

    if (sycl::isnan(a) && sycl::isnan(b)) {
        return equal_nan;
    }

    return a == b;
}

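// Complex overload: when both operands are finite in their real and imaginary
// parts, the magnitude-based check |a - b| <= atol + rtol * |b| is used. For
// the equal_nan branch an operand counts as NaN only when both of its
// components are NaN; otherwise the operands must compare equal.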
template <typename T>
inline bool isclose(const std::complex<T> a,
                    const std::complex<T> b,
                    const T rtol,
                    const T atol,
                    const bool equal_nan)
{
    const bool a_finite = sycl::isfinite(a.real()) && sycl::isfinite(a.imag());
    const bool b_finite = sycl::isfinite(b.real()) && sycl::isfinite(b.imag());

    if (a_finite && b_finite) {
        return exprm_ns::abs(exprm_ns::complex<T>(a - b)) <=
               atol + rtol * exprm_ns::abs(exprm_ns::complex<T>(b));
    }

    if (sycl::isnan(a.real()) && sycl::isnan(a.imag()) &&
        sycl::isnan(b.real()) && sycl::isnan(b.imag()))
    {
        return equal_nan;
    }

    return a == b;
}

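// Functor for strided (non-contiguous) arrays: the ThreeOffsets indexer maps a
// flat work-item id to the offsets of the two inputs and the output, and each
// work-item writes one boolean result.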
template <typename T,
          typename scT,
          typename resTy,
          typename ThreeOffsets_IndexerT>
struct IsCloseStridedScalarFunctor
{
private:
    const T *a_ = nullptr;
    const T *b_ = nullptr;
    resTy *out_ = nullptr;
    const ThreeOffsets_IndexerT three_offsets_indexer_;
    const scT rtol_;
    const scT atol_;
    const bool equal_nan_;

public:
    IsCloseStridedScalarFunctor(const T *a,
                                const T *b,
                                resTy *out,
                                const ThreeOffsets_IndexerT &inps_res_indexer,
                                const scT rtol,
                                const scT atol,
                                const bool equal_nan)
        : a_(a), b_(b), out_(out), three_offsets_indexer_(inps_res_indexer),
          rtol_(rtol), atol_(atol), equal_nan_(equal_nan)
    {
    }

    void operator()(sycl::id<1> wid) const
    {
        const auto &three_offsets_ = three_offsets_indexer_(wid.get(0));
        const dpctl::tensor::ssize_t &inp1_offset =
            three_offsets_.get_first_offset();
        const dpctl::tensor::ssize_t &inp2_offset =
            three_offsets_.get_second_offset();
        const dpctl::tensor::ssize_t &out_offset =
            three_offsets_.get_third_offset();

        out_[out_offset] =
            isclose(a_[inp1_offset], b_[inp2_offset], rtol_, atol_, equal_nan_);
    }
};

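// Functor for contiguous arrays: each work-item handles n_vecs * vec_sz
// consecutive elements. When enable_sg_loadstore is set (the caller has
// verified alignment) and T is not complex, sub-group block loads/stores are
// used; otherwise the functor falls back to a plain sub-group-strided loop.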
template <typename T,
          typename scT,
          typename resTy,
          std::uint8_t vec_sz = 4u,
          std::uint8_t n_vecs = 2u,
          bool enable_sg_loadstore = true>
struct IsCloseContigScalarFunctor
{
private:
    const T *a_ = nullptr;
    const T *b_ = nullptr;
    resTy *out_ = nullptr;
    std::size_t nelems_;
    const scT rtol_;
    const scT atol_;
    const bool equal_nan_;

public:
    IsCloseContigScalarFunctor(const T *a,
                               const T *b,
                               resTy *out,
                               const std::size_t n_elems,
                               const scT rtol,
                               const scT atol,
                               const bool equal_nan)
        : a_(a), b_(b), out_(out), nelems_(n_elems), rtol_(rtol), atol_(atol),
          equal_nan_(equal_nan)
    {
    }

    void operator()(sycl::nd_item<1> ndit) const
    {
        constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
        /* Each work-item processes vec_sz elements, contiguous in memory */
        /* NOTE: work-group size must be divisible by sub-group size */

        using dpctl::tensor::type_utils::is_complex_v;
        if constexpr (enable_sg_loadstore && !is_complex_v<T>) {
            auto sg = ndit.get_sub_group();
            const std::uint16_t sgSize = sg.get_max_local_range()[0];
            const std::size_t base =
                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
                                sg.get_group_id()[0] * sgSize);

            if (base + elems_per_wi * sgSize < nelems_) {
                using dpctl::tensor::sycl_utils::sub_group_load;
                using dpctl::tensor::sycl_utils::sub_group_store;
#pragma unroll
                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
                    const std::size_t offset = base + it * sgSize;
                    auto a_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&a_[offset]);
                    auto b_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&b_[offset]);
                    auto out_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&out_[offset]);

                    const sycl::vec<T, vec_sz> a_vec =
                        sub_group_load<vec_sz>(sg, a_multi_ptr);
                    const sycl::vec<T, vec_sz> b_vec =
                        sub_group_load<vec_sz>(sg, b_multi_ptr);

                    sycl::vec<resTy, vec_sz> res_vec;
#pragma unroll
                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
                        res_vec[vec_id] = isclose(a_vec[vec_id], b_vec[vec_id],
                                                  rtol_, atol_, equal_nan_);
                    }
                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
                }
            }
            else {
                const std::size_t lane_id = sg.get_local_id()[0];
                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
                    out_[k] = isclose(a_[k], b_[k], rtol_, atol_, equal_nan_);
                }
            }
        }
        else {
            const std::uint16_t sgSize =
                ndit.get_sub_group().get_local_range()[0];
            const std::size_t gid = ndit.get_global_linear_id();
            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;

            const std::size_t start =
                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
            const std::size_t end = std::min(nelems_, start + elems_per_sg);
            for (std::size_t offset = start; offset < end; offset += sgSize) {
                out_[offset] =
                    isclose(a_[offset], b_[offset], rtol_, atol_, equal_nan_);
            }
        }
    }
};

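// Submits the strided kernel over a sycl::range of nelems work-items; the
// ThreeOffsets_StridedIndexer translates each flat index into the strided
// offsets of the two inputs and the boolean output.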
template <typename T, typename scT>
sycl::event
    isclose_strided_scalar_impl(sycl::queue &exec_q,
                                const int nd,
                                std::size_t nelems,
                                const dpctl::tensor::ssize_t *shape_strides,
                                const scT rtol,
                                const scT atol,
                                const bool equal_nan,
                                const char *a_cp,
                                const dpctl::tensor::ssize_t a_offset,
                                const char *b_cp,
                                const dpctl::tensor::ssize_t b_offset,
                                char *out_cp,
                                const dpctl::tensor::ssize_t out_offset,
                                const std::vector<sycl::event> &depends)
{
    dpctl::tensor::type_utils::validate_type_for_device<T>(exec_q);

    const T *a_tp = reinterpret_cast<const T *>(a_cp);
    const T *b_tp = reinterpret_cast<const T *>(b_cp);

    using resTy = bool;
    resTy *out_tp = reinterpret_cast<resTy *>(out_cp);

    using IndexerT =
        typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
    const IndexerT indexer{nd, a_offset, b_offset, out_offset, shape_strides};

    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        using IsCloseFunc =
            IsCloseStridedScalarFunctor<T, scT, resTy, IndexerT>;
        cgh.parallel_for<IsCloseFunc>(
            {nelems},
            IsCloseFunc(a_tp, b_tp, out_tp, indexer, rtol, atol, equal_nan));
    });
    return comp_ev;
}

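// Submits the contiguous kernel: picks a work-group size of 128 or 256 based
// on the number of work-items needed, and dispatches the sub-group optimized
// functor only when all three pointers satisfy the required alignment.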
template <typename T,
          typename scT,
          std::uint8_t vec_sz = 4u,
          std::uint8_t n_vecs = 2u>
sycl::event
    isclose_contig_scalar_impl(sycl::queue &exec_q,
                               std::size_t nelems,
                               const scT rtol,
                               const scT atol,
                               const bool equal_nan,
                               const char *a_cp,
                               const char *b_cp,
                               char *out_cp,
                               const std::vector<sycl::event> &depends = {})
{
    constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
    const std::size_t n_work_items_needed = nelems / elems_per_wi;
    const std::size_t empirical_threshold = std::size_t(1) << 21;
    const std::size_t lws = (n_work_items_needed <= empirical_threshold)
                                ? std::size_t(128)
                                : std::size_t(256);

    const std::size_t n_groups =
        ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi));
    const auto gws_range = sycl::range<1>(n_groups * lws);
    const auto lws_range = sycl::range<1>(lws);

    const T *a_tp = reinterpret_cast<const T *>(a_cp);
    const T *b_tp = reinterpret_cast<const T *>(b_cp);

    using resTy = bool;
    resTy *out_tp = reinterpret_cast<resTy *>(out_cp);

    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        using dpctl::tensor::kernels::alignment_utils::is_aligned;
        using dpctl::tensor::kernels::alignment_utils::required_alignment;
        if (is_aligned<required_alignment>(a_tp) &&
            is_aligned<required_alignment>(b_tp) &&
            is_aligned<required_alignment>(out_tp))
        {
            constexpr bool enable_sg_loadstore = true;
            using IsCloseFunc =
                IsCloseContigScalarFunctor<T, scT, resTy, vec_sz, n_vecs,
                                           enable_sg_loadstore>;

            cgh.parallel_for<IsCloseFunc>(
                sycl::nd_range<1>(gws_range, lws_range),
                IsCloseFunc(a_tp, b_tp, out_tp, nelems, rtol, atol, equal_nan));
        }
        else {
            constexpr bool disable_sg_loadstore = false;
            using IsCloseFunc =
                IsCloseContigScalarFunctor<T, scT, resTy, vec_sz, n_vecs,
                                           disable_sg_loadstore>;

            cgh.parallel_for<IsCloseFunc>(
                sycl::nd_range<1>(gws_range, lws_range),
                IsCloseFunc(a_tp, b_tp, out_tp, nelems, rtol, atol, equal_nan));
        }
    });

    return comp_ev;
}

} // namespace dpnp::kernels::isclose
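
// ----------------------------------------------------------------------------
// Illustrative usage sketch (not part of this header). It assumes two
// contiguous float arrays and a bool output array in USM device memory on the
// same queue; the variable names below are hypothetical.
//
//     sycl::queue q;
//     const std::size_t n = 1024;
//     float *a = sycl::malloc_device<float>(n, q);
//     float *b = sycl::malloc_device<float>(n, q);
//     bool *res = sycl::malloc_device<bool>(n, q);
//     // ... fill a and b ...
//     sycl::event ev =
//         dpnp::kernels::isclose::isclose_contig_scalar_impl<float, float>(
//             q, n, /*rtol=*/1e-5f, /*atol=*/1e-8f, /*equal_nan=*/false,
//             reinterpret_cast<const char *>(a),
//             reinterpret_cast<const char *>(b),
//             reinterpret_cast<char *>(res));
//     ev.wait();
//     sycl::free(a, q);
//     sycl::free(b, q);
//     sycl::free(res, q);
// ----------------------------------------------------------------------------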