DPNP C++ backend kernel library 0.20.0dev0
Data Parallel Extension for NumPy*
nan_to_num.hpp
//*****************************************************************************
// Copyright (c) 2024, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// - Neither the name of the copyright holder nor the names of its contributors
//   may be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include <algorithm>
#include <complex>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <vector>

#include <sycl/sycl.hpp>
// dpctl tensor headers
#include "kernels/alignment.hpp"
#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
#include "utils/sycl_utils.hpp"
#include "utils/type_utils.hpp"
namespace dpnp::kernels::nan_to_num
{

template <typename T>
inline T to_num(const T v, const T nan, const T posinf, const T neginf)
{
    return (sycl::isnan(v))   ? nan
           : (sycl::isinf(v)) ? ((v > 0) ? posinf : neginf)
                              : v;
}
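
// Illustrative behavior of the helper above: NaN maps to `nan`, +inf to
// `posinf`, -inf to `neginf`, and finite values pass through unchanged.
// For example, to_num(0.5f, 0.0f, 1.0e30f, -1.0e30f) returns 0.5f, and
// to_num(std::numeric_limits<float>::infinity(), 0.0f, 1.0e30f, -1.0e30f)
// returns 1.0e30f.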

template <typename T, typename scT, typename InOutIndexerT>
struct NanToNumFunctor
{
private:
    const T *inp_ = nullptr;
    T *out_ = nullptr;
    const InOutIndexerT inp_out_indexer_;
    const scT nan_;
    const scT posinf_;
    const scT neginf_;

public:
    NanToNumFunctor(const T *inp,
                    T *out,
                    const InOutIndexerT &inp_out_indexer,
                    const scT nan,
                    const scT posinf,
                    const scT neginf)
        : inp_(inp), out_(out), inp_out_indexer_(inp_out_indexer), nan_(nan),
          posinf_(posinf), neginf_(neginf)
    {
    }

    void operator()(sycl::id<1> wid) const
    {
        const auto &offsets_ = inp_out_indexer_(wid.get(0));
        const dpctl::tensor::ssize_t &inp_offset = offsets_.get_first_offset();
        const dpctl::tensor::ssize_t &out_offset = offsets_.get_second_offset();

        using dpctl::tensor::type_utils::is_complex_v;
        if constexpr (is_complex_v<T>) {
            // sanitize real and imaginary parts independently
            using realT = typename T::value_type;
            static_assert(std::is_same_v<realT, scT>);
            T z = inp_[inp_offset];
            realT x = to_num(z.real(), nan_, posinf_, neginf_);
            realT y = to_num(z.imag(), nan_, posinf_, neginf_);
            out_[out_offset] = T{x, y};
        }
        else {
            out_[out_offset] = to_num(inp_[inp_offset], nan_, posinf_, neginf_);
        }
    }
};
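
// NanToNumFunctor above is the per-element kernel body used by
// nan_to_num_strided_impl below: the indexer maps a flat work-item id to the
// (input, output) memory offsets of arbitrarily strided arrays, and complex
// values are sanitized component-wise with the same scalar to_num helper.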

template <typename T,
          typename scT,
          std::uint8_t vec_sz = 4u,
          std::uint8_t n_vecs = 2u,
          bool enable_sg_loadstore = true>
struct NanToNumContigFunctor
{
private:
    const T *in_ = nullptr;
    T *out_ = nullptr;
    std::size_t nelems_;
    const scT nan_;
    const scT posinf_;
    const scT neginf_;

public:
    NanToNumContigFunctor(const T *in,
                          T *out,
                          const std::size_t n_elems,
                          const scT nan,
                          const scT posinf,
                          const scT neginf)
        : in_(in), out_(out), nelems_(n_elems), nan_(nan), posinf_(posinf),
          neginf_(neginf)
    {
    }

    void operator()(sycl::nd_item<1> ndit) const
    {
        constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
        /* Each work-item processes n_vecs * vec_sz elements; within a
         * sub-group the loads/stores cover contiguous chunks of memory */
        /* NOTE: work-group size must be divisible by sub-group size */

        using dpctl::tensor::type_utils::is_complex_v;
        if constexpr (enable_sg_loadstore && !is_complex_v<T>) {
            auto sg = ndit.get_sub_group();
            const std::uint16_t sgSize = sg.get_max_local_range()[0];
            const std::size_t base =
                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
                                sg.get_group_id()[0] * sgSize);

            if (base + elems_per_wi * sgSize < nelems_) {
                using dpctl::tensor::sycl_utils::sub_group_load;
                using dpctl::tensor::sycl_utils::sub_group_store;
#pragma unroll
                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
                    const std::size_t offset = base + it * sgSize;
                    auto in_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&in_[offset]);
                    auto out_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&out_[offset]);

                    sycl::vec<T, vec_sz> arg_vec =
                        sub_group_load<vec_sz>(sg, in_multi_ptr);
#pragma unroll
                    for (std::uint32_t k = 0; k < vec_sz; ++k) {
                        arg_vec[k] = to_num(arg_vec[k], nan_, posinf_, neginf_);
                    }
                    sub_group_store<vec_sz>(sg, arg_vec, out_multi_ptr);
                }
            }
            else {
                const std::size_t lane_id = sg.get_local_id()[0];
                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
                    out_[k] = to_num(in_[k], nan_, posinf_, neginf_);
                }
            }
        }
        else {
            const std::uint16_t sgSize =
                ndit.get_sub_group().get_local_range()[0];
            const std::size_t gid = ndit.get_global_linear_id();
            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;

            const std::size_t start =
                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
            const std::size_t end = std::min(nelems_, start + elems_per_sg);
            for (std::size_t offset = start; offset < end; offset += sgSize) {
                if constexpr (is_complex_v<T>) {
                    using realT = typename T::value_type;
                    static_assert(std::is_same_v<realT, scT>);

                    T z = in_[offset];
                    realT x = to_num(z.real(), nan_, posinf_, neginf_);
                    realT y = to_num(z.imag(), nan_, posinf_, neginf_);
                    out_[offset] = T{x, y};
                }
                else {
                    out_[offset] = to_num(in_[offset], nan_, posinf_, neginf_);
                }
            }
        }
    }
};
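
// NanToNumContigFunctor above has two code paths. When sub-group load/store
// is enabled and T is not complex, each sub-group reads contiguous chunks via
// sub_group_load, applies to_num lane by lane, and writes the result back via
// sub_group_store. Otherwise (complex T, or load/store disabled because of
// insufficient alignment), and for the tail block that does not fill a whole
// sub-group stride, a scalar loop applies to_num element by element.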

template <typename T, typename scT>
sycl::event nan_to_num_strided_impl(sycl::queue &q,
                                    const size_t nelems,
                                    const int nd,
                                    const dpctl::tensor::ssize_t *shape_strides,
                                    const scT nan,
                                    const scT posinf,
                                    const scT neginf,
                                    const char *in_cp,
                                    const dpctl::tensor::ssize_t in_offset,
                                    char *out_cp,
                                    const dpctl::tensor::ssize_t out_offset,
                                    const std::vector<sycl::event> &depends)
{
    dpctl::tensor::type_utils::validate_type_for_device<T>(q);

    const T *in_tp = reinterpret_cast<const T *>(in_cp);
    T *out_tp = reinterpret_cast<T *>(out_cp);

    using InOutIndexerT =
        typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
    const InOutIndexerT indexer{nd, in_offset, out_offset, shape_strides};

    sycl::event comp_ev = q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        using NanToNumFunc = NanToNumFunctor<T, scT, InOutIndexerT>;
        cgh.parallel_for<NanToNumFunc>(
            {nelems},
            NanToNumFunc(in_tp, out_tp, indexer, nan, posinf, neginf));
    });
    return comp_ev;
}
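
// Note: shape_strides is presumably packed in the usual dpctl layout (the
// common shape followed by the strides of the input and output arrays) and
// must be device-accessible, since TwoOffsets_StridedIndexer dereferences it
// inside the kernel to turn a flat index into the two memory offsets.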

template <typename T,
          typename scT,
          std::uint8_t vec_sz = 4u,
          std::uint8_t n_vecs = 2u>
sycl::event nan_to_num_contig_impl(sycl::queue &exec_q,
                                   std::size_t nelems,
                                   const scT nan,
                                   const scT posinf,
                                   const scT neginf,
                                   const char *in_cp,
                                   char *out_cp,
                                   const std::vector<sycl::event> &depends = {})
{
    constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
    const std::size_t n_work_items_needed = nelems / elems_per_wi;
    const std::size_t empirical_threshold = std::size_t(1) << 21;
    const std::size_t lws = (n_work_items_needed <= empirical_threshold)
                                ? std::size_t(128)
                                : std::size_t(256);

    const std::size_t n_groups =
        ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi));
    const auto gws_range = sycl::range<1>(n_groups * lws);
    const auto lws_range = sycl::range<1>(lws);

    const T *in_tp = reinterpret_cast<const T *>(in_cp);
    T *out_tp = reinterpret_cast<T *>(out_cp);

    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        using dpctl::tensor::kernels::alignment_utils::is_aligned;
        using dpctl::tensor::kernels::alignment_utils::required_alignment;
        if (is_aligned<required_alignment>(in_tp) &&
            is_aligned<required_alignment>(out_tp))
        {
            constexpr bool enable_sg_loadstore = true;
            using NanToNumFunc = NanToNumContigFunctor<T, scT, vec_sz, n_vecs,
                                                       enable_sg_loadstore>;

            cgh.parallel_for<NanToNumFunc>(
                sycl::nd_range<1>(gws_range, lws_range),
                NanToNumFunc(in_tp, out_tp, nelems, nan, posinf, neginf));
        }
        else {
            constexpr bool disable_sg_loadstore = false;
            using NanToNumFunc = NanToNumContigFunctor<T, scT, vec_sz, n_vecs,
                                                       disable_sg_loadstore>;

            cgh.parallel_for<NanToNumFunc>(
                sycl::nd_range<1>(gws_range, lws_range),
                NanToNumFunc(in_tp, out_tp, nelems, nan, posinf, neginf));
        }
    });

    return comp_ev;
}
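
// Work sizing in nan_to_num_contig_impl above: with the default vec_sz = 4
// and n_vecs = 2, each work-item covers 8 elements, so ceil(nelems / (lws * 8))
// work-groups of lws work-items are launched. The sub-group load/store path is
// selected only when both pointers satisfy required_alignment; otherwise the
// functor is instantiated with the scalar fallback path.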

} // namespace dpnp::kernels::nan_to_num
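
// A minimal usage sketch (illustrative only; `q`, `n`, `in_ptr`, and `out_ptr`
// are hypothetical and not part of this header). Replacing NaN with 0 and
// clamping infinities for a contiguous float array could look like:
//
//   sycl::event ev =
//       dpnp::kernels::nan_to_num::nan_to_num_contig_impl<float, float>(
//           q, n, 0.0f, std::numeric_limits<float>::max(),
//           std::numeric_limits<float>::lowest(),
//           reinterpret_cast<const char *>(in_ptr),
//           reinterpret_cast<char *>(out_ptr));
//   ev.wait();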