// DPNP C++ backend kernel library 0.18.0rc1
// Data Parallel Extension for NumPy*
//
// partitioning_one_pivot_kernel_cpu.hpp
//*****************************************************************************
// Copyright (c) 2024-2025, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include "utils/math_utils.hpp"
#include <sycl/sycl.hpp>
#include <type_traits>

#include <stdio.h>

#include "ext/common.hpp"

#include "partitioning.hpp"

using dpctl::tensor::usm_ndarray;

using ext::common::make_ndrange;

namespace statistics::partitioning
{

// Kernel name tag used by parallel_for below; a declaration is sufficient —
// the type only identifies the kernel, it is never instantiated.
template <typename T, uint32_t WorkPI>
class partition_one_pivot_kernel_cpu;
// Builds the kernel functor for one partitioning pass over the active data
// window, splitting it around a single pivot value.
//
// Resulting layout: elements with value < pivot are packed at the front of
// `out` (after the running global less-count), while elements that are
// neither less nor NaN (i.e. >= pivot, equals included) are packed at the
// back, growing downwards from index state.n. NaNs are only counted, never
// written. Per-pass totals go into state.iteration_counters via device-scope
// atomics so the host / the next pass can read them.
//
// Each work-item handles up to WorkPI elements; within a sub-group, lanes
// interleave their reads with stride sbg_size over a contiguous span of
// WorkPI * sbg_size input elements.
//
// Parameters:
//   cgh   - command-group handler; needed to create the local accessor.
//   in    - source buffer; the active window inside it is selected below
//           from state.left / state.n / state.num_elems.
//   out   - destination buffer, written from both ends.
//   state - shared partitioning bookkeeping (pivot, stop flag, counters).
template <typename T, uint32_t WorkPI>
auto partition_one_pivot_func_cpu(sycl::handler &cgh,
                                  T *in,
                                  T *out,
                                  PartitionState<T> &state)
{
    // Work-group local counters: [0] = less, [1] = equal, [2] = greater-or-
    // equal. Slot [3] is allocated but not touched by this kernel.
    auto loc_counters =
        sycl::local_accessor<uint32_t, 1>(sycl::range<1>(4), cgh);

    return [=](sycl::nd_item<1> item) {
        // A previous pass signalled completion: nothing left to do.
        if (state.stop[0])
            return;

        auto group = item.get_group();
        uint64_t items_per_group = group.get_local_range(0) * WorkPI;
        uint64_t num_elems = state.num_elems[0];

        // This whole group lies past the end of the active window.
        if (group.get_group_id(0) * items_per_group >= num_elems)
            return;

        // Active window: either the first num_elems elements of the buffer
        // (state.left set) or its last num_elems elements.
        T *_in = nullptr;
        if (state.left[0]) {
            _in = in;
        }
        else {
            _in = in + state.n - num_elems;
        }

        auto value = state.pivot[0];

        auto sbg = item.get_sub_group();
        uint32_t sbg_size = sbg.get_max_local_range()[0];

        // First input index covered by this sub-group: the sub-group
        // leader's global id times WorkPI.
        uint64_t i_base =
            (item.get_global_linear_id() - sbg.get_local_linear_id()) * WorkPI;

        if (group.leader()) {
            loc_counters[0] = 0;
            loc_counters[1] = 0;
            loc_counters[2] = 0;
        }

        // Make the zeroed local counters visible to the whole group.
        sycl::group_barrier(group);

        uint32_t less_count = 0;
        uint32_t equal_count = 0;
        uint32_t greater_equal_count = 0;
        uint32_t nan_count = 0;

        // Cache loads in registers so pass 2 below does not re-read global
        // memory.
        T values[WorkPI];
        uint32_t actual_count = 0;
        uint64_t local_i_base = i_base + sbg.get_local_linear_id();

        // Pass 1: load and classify this lane's elements (lane stride is
        // sbg_size, so consecutive lanes read consecutive addresses).
        for (uint32_t _i = 0; _i < WorkPI; ++_i) {
            auto i = local_i_base + _i * sbg_size;
            if (i < num_elems) {
                values[_i] = _in[i];
                auto is_nan = IsNan<T>::isnan(values[_i]);
                less_count += (Less<T>{}(values[_i], value) && !is_nan);
                equal_count += (values[_i] == value && !is_nan);
                nan_count += is_nan;
                actual_count++;
            }
        }

        // Everything neither less nor NaN belongs to the back region
        // (equal elements included).
        greater_equal_count = actual_count - less_count - nan_count;

        // Sub-group totals of the per-lane counts. (NOTE: despite its name,
        // sbg_less_equal holds the strictly-less total.)
        auto sbg_less_equal =
            sycl::reduce_over_group(sbg, less_count, sycl::plus<>());
        auto sbg_equal =
            sycl::reduce_over_group(sbg, equal_count, sycl::plus<>());
        auto sbg_greater =
            sycl::reduce_over_group(sbg, greater_equal_count, sycl::plus<>());

        // The sub-group leader reserves this sub-group's slice inside the
        // work-group counters; fetch_add returns the slice's base offset.
        uint32_t local_less_offset = 0;
        uint32_t local_gr_offset = 0;
        if (sbg.leader()) {
            sycl::atomic_ref<uint32_t, sycl::memory_order::relaxed,
                             sycl::memory_scope::work_group>
                gr_less_eq(loc_counters[0]);
            local_less_offset = gr_less_eq.fetch_add(sbg_less_equal);

            sycl::atomic_ref<uint32_t, sycl::memory_order::relaxed,
                             sycl::memory_scope::work_group>
                gr_eq(loc_counters[1]);
            gr_eq += sbg_equal;

            sycl::atomic_ref<uint32_t, sycl::memory_order::relaxed,
                             sycl::memory_scope::work_group>
                gr_greater(loc_counters[2]);
            local_gr_offset = gr_greater.fetch_add(sbg_greater);
        }

        // Broadcast the leader's (lane 0) reserved offsets to all lanes.
        local_less_offset = sycl::select_from_group(sbg, local_less_offset, 0);
        local_gr_offset = sycl::select_from_group(sbg, local_gr_offset, 0);

        // Wait until every sub-group has added its totals.
        sycl::group_barrier(group);

        if (group.leader()) {
            // Reserve this work-group's slice in the global counters, then
            // stash the returned base offsets back into local memory
            // ([0] = front base, [2] = back base) for every item to read.
            sycl::atomic_ref<uint64_t, sycl::memory_order::relaxed,
                             sycl::memory_scope::device>
                glbl_less_eq(state.iteration_counters.less_count[0]);
            auto global_less_eq_offset =
                glbl_less_eq.fetch_add(loc_counters[0]);

            sycl::atomic_ref<uint64_t, sycl::memory_order::relaxed,
                             sycl::memory_scope::device>
                glbl_eq(state.iteration_counters.equal_count[0]);
            glbl_eq += loc_counters[1];

            sycl::atomic_ref<uint64_t, sycl::memory_order::relaxed,
                             sycl::memory_scope::device>
                glbl_greater(state.iteration_counters.greater_equal_count[0]);
            auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]);

            loc_counters[0] = global_less_eq_offset;
            loc_counters[2] = global_gr_offset;
        }

        sycl::group_barrier(group);

        // Final bases: the front region grows upwards from loc_counters[0],
        // the back region grows downwards from state.n.
        auto sbg_less_offset = loc_counters[0] + local_less_offset;
        auto sbg_gr_offset =
            state.n - (loc_counters[2] + local_gr_offset + sbg_greater);

        uint32_t le_item_offset = 0;
        uint32_t gr_item_offset = 0;

        // Pass 2: scatter the cached values. Every lane must take part in
        // the sub-group collectives, so the predicates are evaluated
        // unconditionally; the `_i < actual_count` guard discards results of
        // lanes past the end of the data.
        // NOTE(review): for _i >= actual_count values[_i] is read
        // uninitialized here; the results are discarded by the guard, but
        // the read itself looks worth confirming upstream.
        for (uint32_t _i = 0; _i < WorkPI; ++_i) {
            uint32_t is_nan = IsNan<T>::isnan(values[_i]);
            uint32_t less = (!is_nan && Less<T>{}(values[_i], value));
            // le_pos: this lane's rank among the "less" lanes of the
            // sub-group; ge_pos: its rank among the remaining lanes
            // (NaN lanes included in the count).
            auto le_pos =
                sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>());
            auto ge_pos = sbg.get_local_linear_id() - le_pos;

            auto total_le = sycl::reduce_over_group(sbg, less, sycl::plus<>());
            auto total_nan =
                sycl::reduce_over_group(sbg, is_nan, sycl::plus<>());
            auto total_gr = sbg_size - total_le - total_nan;

            if (_i < actual_count) {
                if (less) {
                    out[sbg_less_offset + le_item_offset + le_pos] = values[_i];
                }
                else if (!is_nan) {
                    out[sbg_gr_offset + gr_item_offset + ge_pos] = values[_i];
                }
                le_item_offset += total_le;
                gr_item_offset += total_gr;
            }
        }
    };
}
205template <typename T, uint32_t WorkPI>
206sycl::event run_partition_one_pivot_cpu(sycl::queue &exec_q,
207 T *in,
208 T *out,
209 PartitionState<T> &state,
210 const std::vector<sycl::event> &deps,
211 uint32_t group_size)
212{
213 auto e = exec_q.submit([&](sycl::handler &cgh) {
214 cgh.depends_on(deps);
215
216 auto work_range = make_ndrange(state.n, group_size, WorkPI);
217
218 cgh.parallel_for<partition_one_pivot_kernel_cpu<T, WorkPI>>(
219 work_range,
220 partition_one_pivot_func_cpu<T, WorkPI>(cgh, in, out, state));
221 });
222
223 return e;
224}
} // namespace statistics::partitioning