DPNP C++ backend kernel library 0.18.0dev0
Data Parallel Extension for NumPy*
nan_to_num.hpp
//*****************************************************************************
// Copyright (c) 2024, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include <algorithm>
#include <complex>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <vector>

#include <sycl/sycl.hpp>
// dpctl tensor headers
#include "kernels/alignment.hpp"
#include "kernels/dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
#include "utils/sycl_utils.hpp"
#include "utils/type_utils.hpp"

namespace dpnp::kernels::nan_to_num
{

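// Scalar helper: maps NaN to nan, +inf to posinf, -inf to neginf, and passes
// finite values through unchanged.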
template <typename T>
inline T to_num(const T v, const T nan, const T posinf, const T neginf)
{
    return (sycl::isnan(v)) ? nan
           : (sycl::isinf(v)) ? (v > 0) ? posinf : neginf
                              : v;
}

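// Strided kernel functor: one work-item per element, with input and output
// offsets resolved by the indexer. For complex types the real and imaginary
// parts are sanitized independently.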
template <typename T, typename scT, typename InOutIndexerT>
struct NanToNumFunctor
{
private:
    const T *inp_ = nullptr;
    T *out_ = nullptr;
    const InOutIndexerT inp_out_indexer_;
    const scT nan_;
    const scT posinf_;
    const scT neginf_;

public:
    NanToNumFunctor(const T *inp,
                    T *out,
                    const InOutIndexerT &inp_out_indexer,
                    const scT nan,
                    const scT posinf,
                    const scT neginf)
        : inp_(inp), out_(out), inp_out_indexer_(inp_out_indexer), nan_(nan),
          posinf_(posinf), neginf_(neginf)
    {
    }

    void operator()(sycl::id<1> wid) const
    {
        const auto &offsets_ = inp_out_indexer_(wid.get(0));
        const dpctl::tensor::ssize_t &inp_offset = offsets_.get_first_offset();
        const dpctl::tensor::ssize_t &out_offset = offsets_.get_second_offset();

        using dpctl::tensor::type_utils::is_complex_v;
        if constexpr (is_complex_v<T>) {
            using realT = typename T::value_type;
            static_assert(std::is_same_v<realT, scT>);
            T z = inp_[inp_offset];
            realT x = to_num(z.real(), nan_, posinf_, neginf_);
            realT y = to_num(z.imag(), nan_, posinf_, neginf_);
            out_[out_offset] = T{x, y};
        }
        else {
            out_[out_offset] = to_num(inp_[inp_offset], nan_, posinf_, neginf_);
        }
    }
};

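// Contiguous kernel functor: each work-item covers n_vecs * vec_sz elements.
// With enable_sg_loadstore and a non-complex T, full sub-groups use vectorized
// sub_group_load/sub_group_store; otherwise elements are processed one by one.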
template <typename T,
          typename scT,
          std::uint8_t vec_sz = 4u,
          std::uint8_t n_vecs = 2u,
          bool enable_sg_loadstore = true>
struct NanToNumContigFunctor
{
private:
    const T *in_ = nullptr;
    T *out_ = nullptr;
    std::size_t nelems_;
    const scT nan_;
    const scT posinf_;
    const scT neginf_;

public:
    NanToNumContigFunctor(const T *in,
                          T *out,
                          const std::size_t n_elems,
                          const scT nan,
                          const scT posinf,
                          const scT neginf)
        : in_(in), out_(out), nelems_(n_elems), nan_(nan), posinf_(posinf),
          neginf_(neginf)
    {
    }

    void operator()(sycl::nd_item<1> ndit) const
    {
        constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
        /* Each work-item processes vec_sz elements, contiguous in memory */
        /* NOTE: work-group size must be divisible by sub-group size */

        using dpctl::tensor::type_utils::is_complex_v;
        if constexpr (enable_sg_loadstore && !is_complex_v<T>) {
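            // Sub-group path for real-valued types: sanitize vec_sz elements
            // at a time using vectorized sub-group loads and stores.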
            auto sg = ndit.get_sub_group();
            const std::uint16_t sgSize = sg.get_max_local_range()[0];
            const std::size_t base =
                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
                                sg.get_group_id()[0] * sgSize);

            if (base + elems_per_wi * sgSize < nelems_) {
                using dpctl::tensor::sycl_utils::sub_group_load;
                using dpctl::tensor::sycl_utils::sub_group_store;
#pragma unroll
                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
                    const std::size_t offset = base + it * sgSize;
                    auto in_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&in_[offset]);
                    auto out_multi_ptr = sycl::address_space_cast<
                        sycl::access::address_space::global_space,
                        sycl::access::decorated::yes>(&out_[offset]);

                    sycl::vec<T, vec_sz> arg_vec =
                        sub_group_load<vec_sz>(sg, in_multi_ptr);
#pragma unroll
                    for (std::uint32_t k = 0; k < vec_sz; ++k) {
                        arg_vec[k] = to_num(arg_vec[k], nan_, posinf_, neginf_);
                    }
                    sub_group_store<vec_sz>(sg, arg_vec, out_multi_ptr);
                }
            }
            else {
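                // Tail of the array: not enough elements remain for a full
                // vectorized pass, so fall back to a strided scalar loop.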
                const std::size_t lane_id = sg.get_local_id()[0];
                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
                    out_[k] = to_num(in_[k], nan_, posinf_, neginf_);
                }
            }
        }
        else {
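            // Scalar path: used for complex types or when sub-group
            // load/store is disabled (e.g. for insufficiently aligned data).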
            const std::uint16_t sgSize =
                ndit.get_sub_group().get_local_range()[0];
            const std::size_t gid = ndit.get_global_linear_id();
            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;

            const std::size_t start =
                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
            const std::size_t end = std::min(nelems_, start + elems_per_sg);
            for (std::size_t offset = start; offset < end; offset += sgSize) {
                if constexpr (is_complex_v<T>) {
                    using realT = typename T::value_type;
                    static_assert(std::is_same_v<realT, scT>);

                    T z = in_[offset];
                    realT x = to_num(z.real(), nan_, posinf_, neginf_);
                    realT y = to_num(z.imag(), nan_, posinf_, neginf_);
                    out_[offset] = T{x, y};
                }
                else {
                    out_[offset] = to_num(in_[offset], nan_, posinf_, neginf_);
                }
            }
        }
    }
};

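// Submits a NanToNumFunctor kernel over strided input and output arrays
// described by shape_strides and the given offsets; returns the completion
// event.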
template <typename T, typename scT>
sycl::event nan_to_num_strided_impl(sycl::queue &q,
                                    const size_t nelems,
                                    const int nd,
                                    const dpctl::tensor::ssize_t *shape_strides,
                                    const scT nan,
                                    const scT posinf,
                                    const scT neginf,
                                    const char *in_cp,
                                    const dpctl::tensor::ssize_t in_offset,
                                    char *out_cp,
                                    const dpctl::tensor::ssize_t out_offset,
                                    const std::vector<sycl::event> &depends)
{
    dpctl::tensor::type_utils::validate_type_for_device<T>(q);

    const T *in_tp = reinterpret_cast<const T *>(in_cp);
    T *out_tp = reinterpret_cast<T *>(out_cp);

    using InOutIndexerT =
        typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
    const InOutIndexerT indexer{nd, in_offset, out_offset, shape_strides};

    sycl::event comp_ev = q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        using NanToNumFunc = NanToNumFunctor<T, scT, InOutIndexerT>;
        cgh.parallel_for<NanToNumFunc>(
            {nelems},
            NanToNumFunc(in_tp, out_tp, indexer, nan, posinf, neginf));
    });
    return comp_ev;
}

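// Submits a NanToNumContigFunctor kernel over contiguous arrays; the
// vectorized sub-group path is used only when both pointers satisfy the
// required alignment.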
template <typename T,
          typename scT,
          std::uint8_t vec_sz = 4u,
          std::uint8_t n_vecs = 2u>
sycl::event nan_to_num_contig_impl(sycl::queue &exec_q,
                                   std::size_t nelems,
                                   const scT nan,
                                   const scT posinf,
                                   const scT neginf,
                                   const char *in_cp,
                                   char *out_cp,
                                   const std::vector<sycl::event> &depends = {})
{
    constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
    const std::size_t n_work_items_needed = nelems / elems_per_wi;
    const std::size_t empirical_threshold = std::size_t(1) << 21;
    const std::size_t lws = (n_work_items_needed <= empirical_threshold)
                                ? std::size_t(128)
                                : std::size_t(256);

    const std::size_t n_groups =
        ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi));
    const auto gws_range = sycl::range<1>(n_groups * lws);
    const auto lws_range = sycl::range<1>(lws);

    const T *in_tp = reinterpret_cast<const T *>(in_cp);
    T *out_tp = reinterpret_cast<T *>(out_cp);

    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        using dpctl::tensor::kernels::alignment_utils::is_aligned;
        using dpctl::tensor::kernels::alignment_utils::required_alignment;
        if (is_aligned<required_alignment>(in_tp) &&
            is_aligned<required_alignment>(out_tp))
        {
            constexpr bool enable_sg_loadstore = true;
            using NanToNumFunc = NanToNumContigFunctor<T, scT, vec_sz, n_vecs,
                                                       enable_sg_loadstore>;

            cgh.parallel_for<NanToNumFunc>(
                sycl::nd_range<1>(gws_range, lws_range),
                NanToNumFunc(in_tp, out_tp, nelems, nan, posinf, neginf));
        }
        else {
            constexpr bool disable_sg_loadstore = false;
            using NanToNumFunc = NanToNumContigFunctor<T, scT, vec_sz, n_vecs,
                                                       disable_sg_loadstore>;

            cgh.parallel_for<NanToNumFunc>(
                sycl::nd_range<1>(gws_range, lws_range),
                NanToNumFunc(in_tp, out_tp, nelems, nan, posinf, neginf));
        }
    });

    return comp_ev;
}

} // namespace dpnp::kernels::nan_to_num
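For illustration, a minimal host-side sketch of how nan_to_num_contig_impl might be invoked on a contiguous array of doubles is shown below; the queue, the sanitize_in_place wrapper, and the replacement values are hypothetical and not part of this header.

// Hypothetical usage sketch (not part of this header): sanitize a contiguous
// USM array of doubles in place, replacing NaN with 0.0 and +/-inf with
// large finite values.
#include <cstddef>
#include <sycl/sycl.hpp>

#include "nan_to_num.hpp"

void sanitize_in_place(sycl::queue &q, double *data, std::size_t n)
{
    using dpnp::kernels::nan_to_num::nan_to_num_contig_impl;

    // T = double, scT = double; in and out may alias since each element is
    // read and written at the same offset.
    sycl::event ev = nan_to_num_contig_impl<double, double>(
        q, n, 0.0, 1.0e308, -1.0e308,
        reinterpret_cast<const char *>(data),
        reinterpret_cast<char *>(data));
    ev.wait();
}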