38#include <pybind11/numpy.h>
39#include <pybind11/pybind11.h>
41#include <sycl/sycl.hpp>
43#include "dpnp4pybind11.hpp"
45#include "elementwise_functions_type_utils.hpp"
46#include "simplify_iteration_space.hpp"
49#include "kernels/alignment.hpp"
50#include "utils/memory_overlap.hpp"
51#include "utils/offset_utils.hpp"
52#include "utils/output_validation.hpp"
53#include "utils/sycl_alloc_utils.hpp"
54#include "utils/type_dispatch.hpp"
56static_assert(std::is_same_v<py::ssize_t, dpnp::tensor::ssize_t>);
58namespace dpnp::extensions::py_internal
60namespace py = pybind11;
61namespace td_ns = dpnp::tensor::type_dispatch;
63using dpnp::tensor::kernels::alignment_utils::is_aligned;
64using dpnp::tensor::kernels::alignment_utils::required_alignment;
66using type_utils::_result_typeid;
69template <
typename output_typesT,
70 typename contig_dispatchT,
71 typename strided_dispatchT>
72std::pair<sycl::event, sycl::event>
76 const std::vector<sycl::event> &depends,
78 const output_typesT &output_type_vec,
79 const contig_dispatchT &contig_dispatch_vector,
80 const strided_dispatchT &strided_dispatch_vector)
82 int src_typenum = src.get_typenum();
83 int dst_typenum = dst.get_typenum();
85 const auto &array_types = td_ns::usm_ndarray_types();
86 int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
87 int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
89 int func_output_typeid = output_type_vec[src_typeid];
92 if (dst_typeid != func_output_typeid) {
93 throw py::value_error(
94 "Destination array has unexpected elemental data type.");
98 if (!dpnp::utils::queues_are_compatible(q, {src, dst})) {
99 throw py::value_error(
100 "Execution queue is not compatible with allocation queues");
103 dpnp::tensor::validation::CheckWritable::throw_if_not_writable(dst);
106 int src_nd = src.get_ndim();
107 if (src_nd != dst.get_ndim()) {
108 throw py::value_error(
"Array dimensions are not the same.");
112 const py::ssize_t *src_shape = src.get_shape_raw();
113 const py::ssize_t *dst_shape = dst.get_shape_raw();
114 bool shapes_equal(
true);
115 std::size_t src_nelems(1);
117 for (
int i = 0; i < src_nd; ++i) {
118 src_nelems *=
static_cast<std::size_t
>(src_shape[i]);
119 shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
122 throw py::value_error(
"Array shapes are not the same.");
126 if (src_nelems == 0) {
127 return std::make_pair(sycl::event(), sycl::event());
130 dpnp::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
133 auto const &overlap = dpnp::tensor::overlap::MemoryOverlap();
134 auto const &same_logical_tensors =
135 dpnp::tensor::overlap::SameLogicalTensors();
136 if (overlap(src, dst) && !same_logical_tensors(src, dst)) {
137 throw py::value_error(
"Arrays index overlapping segments of memory");
140 const char *src_data = src.get_data();
141 char *dst_data = dst.get_data();
144 bool is_src_c_contig = src.is_c_contiguous();
145 bool is_src_f_contig = src.is_f_contiguous();
147 bool is_dst_c_contig = dst.is_c_contiguous();
148 bool is_dst_f_contig = dst.is_f_contiguous();
150 bool both_c_contig = (is_src_c_contig && is_dst_c_contig);
151 bool both_f_contig = (is_src_f_contig && is_dst_f_contig);
153 if (both_c_contig || both_f_contig) {
154 auto contig_fn = contig_dispatch_vector[src_typeid];
156 if (contig_fn ==
nullptr) {
157 throw std::runtime_error(
158 "Contiguous implementation is missing for src_typeid=" +
159 std::to_string(src_typeid));
162 auto comp_ev = contig_fn(q, src_nelems, src_data, dst_data, depends);
164 dpnp::utils::keep_args_alive(q, {src, dst}, {comp_ev});
166 return std::make_pair(ht_ev, comp_ev);
173 auto const &src_strides = src.get_strides_vector();
174 auto const &dst_strides = dst.get_strides_vector();
176 using shT = std::vector<py::ssize_t>;
177 shT simplified_shape;
178 shT simplified_src_strides;
179 shT simplified_dst_strides;
180 py::ssize_t src_offset(0);
181 py::ssize_t dst_offset(0);
184 const py::ssize_t *shape = src_shape;
186 simplify_iteration_space(nd, shape, src_strides, dst_strides,
188 simplified_shape, simplified_src_strides,
189 simplified_dst_strides, src_offset, dst_offset);
191 if (nd == 1 && simplified_src_strides[0] == 1 &&
192 simplified_dst_strides[0] == 1) {
194 auto contig_fn = contig_dispatch_vector[src_typeid];
196 if (contig_fn ==
nullptr) {
197 throw std::runtime_error(
198 "Contiguous implementation is missing for src_typeid=" +
199 std::to_string(src_typeid));
202 int src_elem_size = src.get_elemsize();
203 int dst_elem_size = dst.get_elemsize();
205 contig_fn(q, src_nelems, src_data + src_elem_size * src_offset,
206 dst_data + dst_elem_size * dst_offset, depends);
209 dpnp::utils::keep_args_alive(q, {src, dst}, {comp_ev});
211 return std::make_pair(ht_ev, comp_ev);
215 auto strided_fn = strided_dispatch_vector[src_typeid];
217 if (strided_fn ==
nullptr) {
218 throw std::runtime_error(
219 "Strided implementation is missing for src_typeid=" +
220 std::to_string(src_typeid));
223 using dpnp::tensor::offset_utils::device_allocate_and_pack;
225 std::vector<sycl::event> host_tasks{};
226 host_tasks.reserve(2);
228 auto ptr_size_event_triple_ = device_allocate_and_pack<py::ssize_t>(
229 q, host_tasks, simplified_shape, simplified_src_strides,
230 simplified_dst_strides);
231 auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_triple_));
232 const auto ©_shape_ev = std::get<2>(ptr_size_event_triple_);
233 const py::ssize_t *shape_strides = shape_strides_owner.get();
235 sycl::event strided_fn_ev =
236 strided_fn(q, src_nelems, nd, shape_strides, src_data, src_offset,
237 dst_data, dst_offset, depends, {copy_shape_ev});
240 sycl::event tmp_cleanup_ev = dpnp::tensor::alloc_utils::async_smart_free(
241 q, {strided_fn_ev}, shape_strides_owner);
243 host_tasks.push_back(tmp_cleanup_ev);
245 return std::make_pair(
246 dpnp::utils::keep_args_alive(q, {src, dst}, host_tasks), strided_fn_ev);
251template <
typename output_typesT>
252py::object py_unary_ufunc_result_type(
const py::dtype &input_dtype,
253 const output_typesT &output_types)
255 int tn = input_dtype.num();
258 auto array_types = td_ns::usm_ndarray_types();
261 src_typeid = array_types.typenum_to_lookup_id(tn);
262 }
catch (
const std::exception &e) {
263 throw py::value_error(e.what());
266 int dst_typeid = _result_typeid(src_typeid, output_types);
267 if (dst_typeid < 0) {
268 auto res = py::none();
269 return py::cast<py::object>(res);
272 using type_utils::_dtype_from_typenum;
274 auto dst_typenum_t =
static_cast<td_ns::typenum_t
>(dst_typeid);
275 auto dt = _dtype_from_typenum(dst_typenum_t);
277 return py::cast<py::object>(dt);
285template <
typename output_typesT,
286 typename contig_dispatchT,
287 typename strided_dispatchT>
288std::pair<sycl::event, sycl::event>
293 const std::vector<sycl::event> &depends,
295 const output_typesT &output_type_vec,
296 const contig_dispatchT &contig_dispatch_vector,
297 const strided_dispatchT &strided_dispatch_vector)
299 int src_typenum = src.get_typenum();
300 int dst1_typenum = dst1.get_typenum();
301 int dst2_typenum = dst2.get_typenum();
303 const auto &array_types = td_ns::usm_ndarray_types();
304 int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
305 int dst1_typeid = array_types.typenum_to_lookup_id(dst1_typenum);
306 int dst2_typeid = array_types.typenum_to_lookup_id(dst2_typenum);
308 std::pair<int, int> func_output_typeids = output_type_vec[src_typeid];
311 if (dst1_typeid != func_output_typeids.first ||
312 dst2_typeid != func_output_typeids.second) {
313 throw py::value_error(
314 "One of destination arrays has unexpected elemental data type.");
318 if (!dpnp::utils::queues_are_compatible(q, {src, dst1, dst2})) {
319 throw py::value_error(
320 "Execution queue is not compatible with allocation queues");
323 dpnp::tensor::validation::CheckWritable::throw_if_not_writable(dst1);
324 dpnp::tensor::validation::CheckWritable::throw_if_not_writable(dst2);
327 int src_nd = src.get_ndim();
328 if (src_nd != dst1.get_ndim() || src_nd != dst2.get_ndim()) {
329 throw py::value_error(
"Array dimensions are not the same.");
333 const py::ssize_t *src_shape = src.get_shape_raw();
334 const py::ssize_t *dst1_shape = dst1.get_shape_raw();
335 const py::ssize_t *dst2_shape = dst2.get_shape_raw();
336 bool shapes_equal(
true);
337 std::size_t src_nelems(1);
339 for (
int i = 0; i < src_nd; ++i) {
340 src_nelems *=
static_cast<std::size_t
>(src_shape[i]);
341 shapes_equal = shapes_equal && (src_shape[i] == dst1_shape[i]) &&
342 (src_shape[i] == dst2_shape[i]);
345 throw py::value_error(
"Array shapes are not the same.");
349 if (src_nelems == 0) {
350 return std::make_pair(sycl::event(), sycl::event());
353 dpnp::tensor::validation::AmpleMemory::throw_if_not_ample(dst1, src_nelems);
354 dpnp::tensor::validation::AmpleMemory::throw_if_not_ample(dst2, src_nelems);
357 auto const &overlap = dpnp::tensor::overlap::MemoryOverlap();
358 auto const &same_logical_tensors =
359 dpnp::tensor::overlap::SameLogicalTensors();
360 if ((overlap(src, dst1) && !same_logical_tensors(src, dst1)) ||
361 (overlap(src, dst2) && !same_logical_tensors(src, dst2)) ||
362 (overlap(dst1, dst2) && !same_logical_tensors(dst1, dst2))) {
363 throw py::value_error(
"Arrays index overlapping segments of memory");
366 const char *src_data = src.get_data();
367 char *dst1_data = dst1.get_data();
368 char *dst2_data = dst2.get_data();
371 bool is_src_c_contig = src.is_c_contiguous();
372 bool is_src_f_contig = src.is_f_contiguous();
374 bool is_dst1_c_contig = dst1.is_c_contiguous();
375 bool is_dst1_f_contig = dst1.is_f_contiguous();
377 bool is_dst2_c_contig = dst2.is_c_contiguous();
378 bool is_dst2_f_contig = dst2.is_f_contiguous();
381 (is_src_c_contig && is_dst1_c_contig && is_dst2_c_contig);
383 (is_src_f_contig && is_dst1_f_contig && is_dst2_f_contig);
385 if (all_c_contig || all_f_contig) {
386 auto contig_fn = contig_dispatch_vector[src_typeid];
388 if (contig_fn ==
nullptr) {
389 throw std::runtime_error(
390 "Contiguous implementation is missing for src_typeid=" +
391 std::to_string(src_typeid));
395 contig_fn(q, src_nelems, src_data, dst1_data, dst2_data, depends);
397 dpnp::utils::keep_args_alive(q, {src, dst1, dst2}, {comp_ev});
399 return std::make_pair(ht_ev, comp_ev);
406 auto const &src_strides = src.get_strides_vector();
407 auto const &dst1_strides = dst1.get_strides_vector();
408 auto const &dst2_strides = dst2.get_strides_vector();
410 using shT = std::vector<py::ssize_t>;
411 shT simplified_shape;
412 shT simplified_src_strides;
413 shT simplified_dst1_strides;
414 shT simplified_dst2_strides;
415 py::ssize_t src_offset(0);
416 py::ssize_t dst1_offset(0);
417 py::ssize_t dst2_offset(0);
420 const py::ssize_t *shape = src_shape;
422 simplify_iteration_space_3(
423 nd, shape, src_strides, dst1_strides, dst2_strides,
425 simplified_shape, simplified_src_strides, simplified_dst1_strides,
426 simplified_dst2_strides, src_offset, dst1_offset, dst2_offset);
428 if (nd == 1 && simplified_src_strides[0] == 1 &&
429 simplified_dst1_strides[0] == 1 && simplified_dst2_strides[0] == 1) {
431 auto contig_fn = contig_dispatch_vector[src_typeid];
433 if (contig_fn ==
nullptr) {
434 throw std::runtime_error(
435 "Contiguous implementation is missing for src_typeid=" +
436 std::to_string(src_typeid));
439 int src_elem_size = src.get_elemsize();
440 int dst1_elem_size = dst1.get_elemsize();
441 int dst2_elem_size = dst2.get_elemsize();
443 contig_fn(q, src_nelems, src_data + src_elem_size * src_offset,
444 dst1_data + dst1_elem_size * dst1_offset,
445 dst2_data + dst2_elem_size * dst2_offset, depends);
448 dpnp::utils::keep_args_alive(q, {src, dst1, dst2}, {comp_ev});
450 return std::make_pair(ht_ev, comp_ev);
454 auto strided_fn = strided_dispatch_vector[src_typeid];
456 if (strided_fn ==
nullptr) {
457 throw std::runtime_error(
458 "Strided implementation is missing for src_typeid=" +
459 std::to_string(src_typeid));
462 using dpnp::tensor::offset_utils::device_allocate_and_pack;
464 std::vector<sycl::event> host_tasks{};
465 host_tasks.reserve(2);
467 auto ptr_size_event_triple_ = device_allocate_and_pack<py::ssize_t>(
468 q, host_tasks, simplified_shape, simplified_src_strides,
469 simplified_dst1_strides, simplified_dst2_strides);
470 auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_triple_));
471 const auto ©_shape_ev = std::get<2>(ptr_size_event_triple_);
472 const py::ssize_t *shape_strides = shape_strides_owner.get();
474 sycl::event strided_fn_ev = strided_fn(
475 q, src_nelems, nd, shape_strides, src_data, src_offset, dst1_data,
476 dst1_offset, dst2_data, dst2_offset, depends, {copy_shape_ev});
479 sycl::event tmp_cleanup_ev = dpnp::tensor::alloc_utils::async_smart_free(
480 q, {strided_fn_ev}, shape_strides_owner);
482 host_tasks.push_back(tmp_cleanup_ev);
484 return std::make_pair(
485 dpnp::utils::keep_args_alive(q, {src, dst1, dst2}, host_tasks),
493template <
typename output_typesT>
494std::pair<py::object, py::object>
495 py_unary_two_outputs_ufunc_result_type(
const py::dtype &input_dtype,
496 const output_typesT &output_types)
498 int tn = input_dtype.num();
501 auto array_types = td_ns::usm_ndarray_types();
504 src_typeid = array_types.typenum_to_lookup_id(tn);
505 }
catch (
const std::exception &e) {
506 throw py::value_error(e.what());
509 std::pair<int, int> dst_typeids = _result_typeid(src_typeid, output_types);
510 int dst1_typeid = dst_typeids.first;
511 int dst2_typeid = dst_typeids.second;
513 if (dst1_typeid < 0 || dst2_typeid < 0) {
514 auto res = py::none();
515 auto py_res = py::cast<py::object>(res);
516 return std::make_pair(py_res, py_res);
519 using type_utils::_dtype_from_typenum;
521 auto dst1_typenum_t =
static_cast<td_ns::typenum_t
>(dst1_typeid);
522 auto dst2_typenum_t =
static_cast<td_ns::typenum_t
>(dst2_typeid);
523 auto dt1 = _dtype_from_typenum(dst1_typenum_t);
524 auto dt2 = _dtype_from_typenum(dst2_typenum_t);
526 return std::make_pair(py::cast<py::object>(dt1),
527 py::cast<py::object>(dt2));
535template <
class Container,
class T>
536bool isEqual(Container
const &c, std::initializer_list<T>
const &l)
538 return std::equal(std::begin(c), std::end(c), std::begin(l), std::end(l));
544template <
typename output_typesT,
545 typename contig_dispatchT,
546 typename strided_dispatchT,
547 typename contig_matrix_row_dispatchT,
548 typename contig_row_matrix_dispatchT>
549std::pair<sycl::event, sycl::event> py_binary_ufunc(
554 const std::vector<sycl::event> &depends,
556 const output_typesT &output_type_table,
557 const contig_dispatchT &contig_dispatch_table,
558 const strided_dispatchT &strided_dispatch_table,
559 const contig_matrix_row_dispatchT
560 &contig_matrix_row_broadcast_dispatch_table,
561 const contig_row_matrix_dispatchT
562 &contig_row_matrix_broadcast_dispatch_table)
565 int src1_typenum = src1.get_typenum();
566 int src2_typenum = src2.get_typenum();
567 int dst_typenum = dst.get_typenum();
569 auto array_types = td_ns::usm_ndarray_types();
570 int src1_typeid = array_types.typenum_to_lookup_id(src1_typenum);
571 int src2_typeid = array_types.typenum_to_lookup_id(src2_typenum);
572 int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
574 int output_typeid = output_type_table[src1_typeid][src2_typeid];
576 if (output_typeid != dst_typeid) {
577 throw py::value_error(
578 "Destination array has unexpected elemental data type.");
582 if (!dpnp::utils::queues_are_compatible(exec_q, {src1, src2, dst})) {
583 throw py::value_error(
584 "Execution queue is not compatible with allocation queues");
587 dpnp::tensor::validation::CheckWritable::throw_if_not_writable(dst);
591 int dst_nd = dst.get_ndim();
592 if (dst_nd != src1.get_ndim() || dst_nd != src2.get_ndim()) {
593 throw py::value_error(
"Array dimensions are not the same.");
597 const py::ssize_t *src1_shape = src1.get_shape_raw();
598 const py::ssize_t *src2_shape = src2.get_shape_raw();
599 const py::ssize_t *dst_shape = dst.get_shape_raw();
600 bool shapes_equal(
true);
601 std::size_t src_nelems(1);
603 for (
int i = 0; i < dst_nd; ++i) {
604 src_nelems *=
static_cast<std::size_t
>(src1_shape[i]);
605 shapes_equal = shapes_equal && (src1_shape[i] == dst_shape[i] &&
606 src2_shape[i] == dst_shape[i]);
609 throw py::value_error(
"Array shapes are not the same.");
613 if (src_nelems == 0) {
614 return std::make_pair(sycl::event(), sycl::event());
617 dpnp::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
619 auto const &overlap = dpnp::tensor::overlap::MemoryOverlap();
620 auto const &same_logical_tensors =
621 dpnp::tensor::overlap::SameLogicalTensors();
622 if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) ||
623 (overlap(src2, dst) && !same_logical_tensors(src2, dst))) {
624 throw py::value_error(
"Arrays index overlapping segments of memory");
627 const char *src1_data = src1.get_data();
628 const char *src2_data = src2.get_data();
629 char *dst_data = dst.get_data();
632 bool is_src1_c_contig = src1.is_c_contiguous();
633 bool is_src1_f_contig = src1.is_f_contiguous();
635 bool is_src2_c_contig = src2.is_c_contiguous();
636 bool is_src2_f_contig = src2.is_f_contiguous();
638 bool is_dst_c_contig = dst.is_c_contiguous();
639 bool is_dst_f_contig = dst.is_f_contiguous();
642 (is_src1_c_contig && is_src2_c_contig && is_dst_c_contig);
644 (is_src1_f_contig && is_src2_f_contig && is_dst_f_contig);
647 if (all_c_contig || all_f_contig) {
648 auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
650 if (contig_fn !=
nullptr) {
651 auto comp_ev = contig_fn(exec_q, src_nelems, src1_data, 0,
652 src2_data, 0, dst_data, 0, depends);
653 sycl::event ht_ev = dpnp::utils::keep_args_alive(
654 exec_q, {src1, src2, dst}, {comp_ev});
656 return std::make_pair(ht_ev, comp_ev);
661 auto const &src1_strides = src1.get_strides_vector();
662 auto const &src2_strides = src2.get_strides_vector();
663 auto const &dst_strides = dst.get_strides_vector();
665 using shT = std::vector<py::ssize_t>;
666 shT simplified_shape;
667 shT simplified_src1_strides;
668 shT simplified_src2_strides;
669 shT simplified_dst_strides;
670 py::ssize_t src1_offset(0);
671 py::ssize_t src2_offset(0);
672 py::ssize_t dst_offset(0);
675 const py::ssize_t *shape = src1_shape;
677 simplify_iteration_space_3(
678 nd, shape, src1_strides, src2_strides, dst_strides,
680 simplified_shape, simplified_src1_strides, simplified_src2_strides,
681 simplified_dst_strides, src1_offset, src2_offset, dst_offset);
683 std::vector<sycl::event> host_tasks{};
685 static constexpr auto unit_stride =
686 std::initializer_list<py::ssize_t>{1};
688 if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) &&
689 isEqual(simplified_src2_strides, unit_stride) &&
690 isEqual(simplified_dst_strides, unit_stride)) {
691 auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
693 if (contig_fn !=
nullptr) {
694 auto comp_ev = contig_fn(exec_q, src_nelems, src1_data,
695 src1_offset, src2_data, src2_offset,
696 dst_data, dst_offset, depends);
697 sycl::event ht_ev = dpnp::utils::keep_args_alive(
698 exec_q, {src1, src2, dst}, {comp_ev});
700 return std::make_pair(ht_ev, comp_ev);
704 static constexpr auto zero_one_strides =
705 std::initializer_list<py::ssize_t>{0, 1};
706 static constexpr auto one_zero_strides =
707 std::initializer_list<py::ssize_t>{1, 0};
708 static constexpr py::ssize_t one{1};
710 if (isEqual(simplified_src2_strides, zero_one_strides) &&
711 isEqual(simplified_src1_strides, {simplified_shape[1], one}) &&
712 isEqual(simplified_dst_strides, {simplified_shape[1], one})) {
713 auto matrix_row_broadcast_fn =
714 contig_matrix_row_broadcast_dispatch_table[src1_typeid]
716 if (matrix_row_broadcast_fn !=
nullptr) {
717 int src1_itemsize = src1.get_elemsize();
718 int src2_itemsize = src2.get_elemsize();
719 int dst_itemsize = dst.get_elemsize();
721 if (is_aligned<required_alignment>(
722 src1_data + src1_offset * src1_itemsize) &&
723 is_aligned<required_alignment>(
724 src2_data + src2_offset * src2_itemsize) &&
725 is_aligned<required_alignment>(
726 dst_data + dst_offset * dst_itemsize)) {
727 std::size_t n0 = simplified_shape[0];
728 std::size_t n1 = simplified_shape[1];
729 sycl::event comp_ev = matrix_row_broadcast_fn(
730 exec_q, host_tasks, n0, n1, src1_data, src1_offset,
731 src2_data, src2_offset, dst_data, dst_offset,
734 return std::make_pair(
735 dpnp::utils::keep_args_alive(
736 exec_q, {src1, src2, dst}, host_tasks),
741 if (isEqual(simplified_src1_strides, one_zero_strides) &&
742 isEqual(simplified_src2_strides, {one, simplified_shape[0]}) &&
743 isEqual(simplified_dst_strides, {one, simplified_shape[0]})) {
744 auto row_matrix_broadcast_fn =
745 contig_row_matrix_broadcast_dispatch_table[src1_typeid]
747 if (row_matrix_broadcast_fn !=
nullptr) {
749 int src1_itemsize = src1.get_elemsize();
750 int src2_itemsize = src2.get_elemsize();
751 int dst_itemsize = dst.get_elemsize();
753 if (is_aligned<required_alignment>(
754 src1_data + src1_offset * src1_itemsize) &&
755 is_aligned<required_alignment>(
756 src2_data + src2_offset * src2_itemsize) &&
757 is_aligned<required_alignment>(
758 dst_data + dst_offset * dst_itemsize)) {
759 std::size_t n0 = simplified_shape[1];
760 std::size_t n1 = simplified_shape[0];
761 sycl::event comp_ev = row_matrix_broadcast_fn(
762 exec_q, host_tasks, n0, n1, src1_data, src1_offset,
763 src2_data, src2_offset, dst_data, dst_offset,
766 return std::make_pair(
767 dpnp::utils::keep_args_alive(
768 exec_q, {src1, src2, dst}, host_tasks),
777 auto strided_fn = strided_dispatch_table[src1_typeid][src2_typeid];
779 if (strided_fn ==
nullptr) {
780 throw std::runtime_error(
781 "Strided implementation is missing for src1_typeid=" +
782 std::to_string(src1_typeid) +
783 " and src2_typeid=" + std::to_string(src2_typeid));
786 using dpnp::tensor::offset_utils::device_allocate_and_pack;
787 auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
788 exec_q, host_tasks, simplified_shape, simplified_src1_strides,
789 simplified_src2_strides, simplified_dst_strides);
790 auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
791 auto ©_shape_ev = std::get<2>(ptr_sz_event_triple_);
793 const py::ssize_t *shape_strides = shape_strides_owner.get();
795 sycl::event strided_fn_ev = strided_fn(
796 exec_q, src_nelems, nd, shape_strides, src1_data, src1_offset,
797 src2_data, src2_offset, dst_data, dst_offset, depends, {copy_shape_ev});
800 sycl::event tmp_cleanup_ev = dpnp::tensor::alloc_utils::async_smart_free(
801 exec_q, {strided_fn_ev}, shape_strides_owner);
803 host_tasks.push_back(tmp_cleanup_ev);
805 return std::make_pair(
806 dpnp::utils::keep_args_alive(exec_q, {src1, src2, dst}, host_tasks),
811template <
typename output_typesT>
812py::object py_binary_ufunc_result_type(
const py::dtype &input1_dtype,
813 const py::dtype &input2_dtype,
814 const output_typesT &output_types_table)
816 int tn1 = input1_dtype.num();
817 int tn2 = input2_dtype.num();
818 int src1_typeid = -1;
819 int src2_typeid = -1;
821 auto array_types = td_ns::usm_ndarray_types();
824 src1_typeid = array_types.typenum_to_lookup_id(tn1);
825 src2_typeid = array_types.typenum_to_lookup_id(tn2);
826 }
catch (
const std::exception &e) {
827 throw py::value_error(e.what());
830 if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
831 src2_typeid >= td_ns::num_types) {
832 throw std::runtime_error(
"binary output type lookup failed");
834 int dst_typeid = output_types_table[src1_typeid][src2_typeid];
836 if (dst_typeid < 0) {
837 auto res = py::none();
838 return py::cast<py::object>(res);
841 using type_utils::_dtype_from_typenum;
843 auto dst_typenum_t =
static_cast<td_ns::typenum_t
>(dst_typeid);
844 auto dt = _dtype_from_typenum(dst_typenum_t);
846 return py::cast<py::object>(dt);
852template <
typename output_typesT,
853 typename contig_dispatchT,
854 typename strided_dispatchT>
855std::pair<sycl::event, sycl::event>
861 const std::vector<sycl::event> &depends,
863 const output_typesT &output_types_table,
864 const contig_dispatchT &contig_dispatch_table,
865 const strided_dispatchT &strided_dispatch_table)
868 int src1_typenum = src1.get_typenum();
869 int src2_typenum = src2.get_typenum();
870 int dst1_typenum = dst1.get_typenum();
871 int dst2_typenum = dst2.get_typenum();
873 auto array_types = td_ns::usm_ndarray_types();
874 int src1_typeid = array_types.typenum_to_lookup_id(src1_typenum);
875 int src2_typeid = array_types.typenum_to_lookup_id(src2_typenum);
876 int dst1_typeid = array_types.typenum_to_lookup_id(dst1_typenum);
877 int dst2_typeid = array_types.typenum_to_lookup_id(dst2_typenum);
879 std::pair<int, int> output_typeids =
880 output_types_table[src1_typeid][src2_typeid];
882 if (dst1_typeid != output_typeids.first ||
883 dst2_typeid != output_typeids.second) {
884 throw py::value_error(
885 "One of destination arrays has unexpected elemental data type.");
889 if (!dpnp::utils::queues_are_compatible(exec_q, {src1, src2, dst1, dst2})) {
890 throw py::value_error(
891 "Execution queue is not compatible with allocation queues");
894 dpnp::tensor::validation::CheckWritable::throw_if_not_writable(dst1);
895 dpnp::tensor::validation::CheckWritable::throw_if_not_writable(dst2);
899 int src1_nd = src1.get_ndim();
900 int src2_nd = src2.get_ndim();
901 int dst1_nd = dst1.get_ndim();
902 int dst2_nd = dst2.get_ndim();
904 if (dst1_nd != src1_nd || dst1_nd != src2_nd || dst1_nd != dst2_nd) {
905 throw py::value_error(
"Array dimensions are not the same.");
909 const py::ssize_t *src1_shape = src1.get_shape_raw();
910 const py::ssize_t *src2_shape = src2.get_shape_raw();
911 const py::ssize_t *dst1_shape = dst1.get_shape_raw();
912 const py::ssize_t *dst2_shape = dst2.get_shape_raw();
913 bool shapes_equal(
true);
914 std::size_t src_nelems(1);
916 for (
int i = 0; i < dst1_nd; ++i) {
917 const auto &sh_i = dst1_shape[i];
918 src_nelems *=
static_cast<std::size_t
>(src1_shape[i]);
920 shapes_equal && (src1_shape[i] == sh_i && src2_shape[i] == sh_i &&
921 dst2_shape[i] == sh_i);
924 throw py::value_error(
"Array shapes are not the same.");
928 if (src_nelems == 0) {
929 return std::make_pair(sycl::event(), sycl::event());
932 dpnp::tensor::validation::AmpleMemory::throw_if_not_ample(dst1, src_nelems);
933 dpnp::tensor::validation::AmpleMemory::throw_if_not_ample(dst2, src_nelems);
936 auto const &overlap = dpnp::tensor::overlap::MemoryOverlap();
937 auto const &same_logical_tensors =
938 dpnp::tensor::overlap::SameLogicalTensors();
939 if ((overlap(src1, dst1) && !same_logical_tensors(src1, dst1)) ||
940 (overlap(src1, dst2) && !same_logical_tensors(src1, dst2)) ||
941 (overlap(src2, dst1) && !same_logical_tensors(src2, dst1)) ||
942 (overlap(src2, dst2) && !same_logical_tensors(src2, dst2)) ||
943 (overlap(dst1, dst2))) {
944 throw py::value_error(
"Arrays index overlapping segments of memory");
947 const char *src1_data = src1.get_data();
948 const char *src2_data = src2.get_data();
949 char *dst1_data = dst1.get_data();
950 char *dst2_data = dst2.get_data();
953 bool is_src1_c_contig = src1.is_c_contiguous();
954 bool is_src1_f_contig = src1.is_f_contiguous();
956 bool is_src2_c_contig = src2.is_c_contiguous();
957 bool is_src2_f_contig = src2.is_f_contiguous();
959 bool is_dst1_c_contig = dst1.is_c_contiguous();
960 bool is_dst1_f_contig = dst1.is_f_contiguous();
962 bool is_dst2_c_contig = dst2.is_c_contiguous();
963 bool is_dst2_f_contig = dst2.is_f_contiguous();
965 bool all_c_contig = (is_src1_c_contig && is_src2_c_contig &&
966 is_dst1_c_contig && is_dst2_c_contig);
967 bool all_f_contig = (is_src1_f_contig && is_src2_f_contig &&
968 is_dst1_f_contig && is_dst2_f_contig);
971 if (all_c_contig || all_f_contig) {
972 auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
974 if (contig_fn !=
nullptr) {
976 contig_fn(exec_q, src_nelems, src1_data, 0, src2_data, 0,
977 dst1_data, 0, dst2_data, 0, depends);
978 sycl::event ht_ev = dpnp::utils::keep_args_alive(
979 exec_q, {src1, src2, dst1, dst2}, {comp_ev});
981 return std::make_pair(ht_ev, comp_ev);
986 auto const &src1_strides = src1.get_strides_vector();
987 auto const &src2_strides = src2.get_strides_vector();
988 auto const &dst1_strides = dst1.get_strides_vector();
989 auto const &dst2_strides = dst2.get_strides_vector();
991 using shT = std::vector<py::ssize_t>;
992 shT simplified_shape;
993 shT simplified_src1_strides;
994 shT simplified_src2_strides;
995 shT simplified_dst1_strides;
996 shT simplified_dst2_strides;
997 py::ssize_t src1_offset(0);
998 py::ssize_t src2_offset(0);
999 py::ssize_t dst1_offset(0);
1000 py::ssize_t dst2_offset(0);
1003 const py::ssize_t *shape = src1_shape;
1005 simplify_iteration_space_4(
1006 nd, shape, src1_strides, src2_strides, dst1_strides, dst2_strides,
1008 simplified_shape, simplified_src1_strides, simplified_src2_strides,
1009 simplified_dst1_strides, simplified_dst2_strides, src1_offset,
1010 src2_offset, dst1_offset, dst2_offset);
1012 std::vector<sycl::event> host_tasks{};
1013 static constexpr auto unit_stride = std::initializer_list<py::ssize_t>{1};
1015 if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) &&
1016 isEqual(simplified_src2_strides, unit_stride) &&
1017 isEqual(simplified_dst1_strides, unit_stride) &&
1018 isEqual(simplified_dst2_strides, unit_stride)) {
1019 auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
1021 if (contig_fn !=
nullptr) {
1023 contig_fn(exec_q, src_nelems, src1_data, src1_offset, src2_data,
1024 src2_offset, dst1_data, dst1_offset, dst2_data,
1025 dst2_offset, depends);
1026 sycl::event ht_ev = dpnp::utils::keep_args_alive(
1027 exec_q, {src1, src2, dst1, dst2}, {comp_ev});
1029 return std::make_pair(ht_ev, comp_ev);
1034 auto strided_fn = strided_dispatch_table[src1_typeid][src2_typeid];
1036 if (strided_fn ==
nullptr) {
1037 throw std::runtime_error(
1038 "Strided implementation is missing for src1_typeid=" +
1039 std::to_string(src1_typeid) +
1040 " and src2_typeid=" + std::to_string(src2_typeid));
1043 using dpnp::tensor::offset_utils::device_allocate_and_pack;
1044 auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
1045 exec_q, host_tasks, simplified_shape, simplified_src1_strides,
1046 simplified_src2_strides, simplified_dst1_strides,
1047 simplified_dst2_strides);
1048 auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
1049 auto ©_shape_ev = std::get<2>(ptr_sz_event_triple_);
1051 const py::ssize_t *shape_strides = shape_strides_owner.get();
1053 sycl::event strided_fn_ev =
1054 strided_fn(exec_q, src_nelems, nd, shape_strides, src1_data,
1055 src1_offset, src2_data, src2_offset, dst1_data, dst1_offset,
1056 dst2_data, dst2_offset, depends, {copy_shape_ev});
1059 sycl::event tmp_cleanup_ev = dpnp::tensor::alloc_utils::async_smart_free(
1060 exec_q, {strided_fn_ev}, shape_strides_owner);
1061 host_tasks.push_back(tmp_cleanup_ev);
1063 return std::make_pair(dpnp::utils::keep_args_alive(
1064 exec_q, {src1, src2, dst1, dst2}, host_tasks),
1072template <
typename output_typesT>
1073std::pair<py::object, py::object> py_binary_two_outputs_ufunc_result_type(
1074 const py::dtype &input1_dtype,
1075 const py::dtype &input2_dtype,
1076 const output_typesT &output_types_table)
1078 int tn1 = input1_dtype.num();
1079 int tn2 = input2_dtype.num();
1080 int src1_typeid = -1;
1081 int src2_typeid = -1;
1083 auto array_types = td_ns::usm_ndarray_types();
1086 src1_typeid = array_types.typenum_to_lookup_id(tn1);
1087 src2_typeid = array_types.typenum_to_lookup_id(tn2);
1088 }
catch (
const std::exception &e) {
1089 throw py::value_error(e.what());
1092 if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
1093 src2_typeid >= td_ns::num_types) {
1094 throw std::runtime_error(
"binary output type lookup failed");
1096 std::pair<int, int> dst_typeids =
1097 output_types_table[src1_typeid][src2_typeid];
1098 int dst1_typeid = dst_typeids.first;
1099 int dst2_typeid = dst_typeids.second;
1101 if (dst1_typeid < 0 || dst2_typeid < 0) {
1102 auto res = py::none();
1103 auto py_res = py::cast<py::object>(res);
1104 return std::make_pair(py_res, py_res);
1107 using type_utils::_dtype_from_typenum;
1109 auto dst1_typenum_t =
static_cast<td_ns::typenum_t
>(dst1_typeid);
1110 auto dst2_typenum_t =
static_cast<td_ns::typenum_t
>(dst2_typeid);
1111 auto dt1 = _dtype_from_typenum(dst1_typenum_t);
1112 auto dt2 = _dtype_from_typenum(dst2_typenum_t);
1114 return std::make_pair(py::cast<py::object>(dt1),
1115 py::cast<py::object>(dt2));
1121template <
typename output_typesT,
1122 typename contig_dispatchT,
1123 typename strided_dispatchT,
1124 typename contig_row_matrix_dispatchT>
1125std::pair<sycl::event, sycl::event>
1128 sycl::queue &exec_q,
1129 const std::vector<sycl::event> &depends,
1131 const output_typesT &output_type_table,
1132 const contig_dispatchT &contig_dispatch_table,
1133 const strided_dispatchT &strided_dispatch_table,
1134 const contig_row_matrix_dispatchT
1135 &contig_row_matrix_broadcast_dispatch_table)
1137 dpnp::tensor::validation::CheckWritable::throw_if_not_writable(lhs);
1140 int rhs_typenum = rhs.get_typenum();
1141 int lhs_typenum = lhs.get_typenum();
1143 auto array_types = td_ns::usm_ndarray_types();
1144 int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum);
1145 int lhs_typeid = array_types.typenum_to_lookup_id(lhs_typenum);
1147 int output_typeid = output_type_table[rhs_typeid][lhs_typeid];
1149 if (output_typeid != lhs_typeid) {
1150 throw py::value_error(
1151 "Left-hand side array has unexpected elemental data type.");
1155 if (!dpnp::utils::queues_are_compatible(exec_q, {rhs, lhs})) {
1156 throw py::value_error(
1157 "Execution queue is not compatible with allocation queues");
1162 int lhs_nd = lhs.get_ndim();
1163 if (lhs_nd != rhs.get_ndim()) {
1164 throw py::value_error(
"Array dimensions are not the same.");
1168 const py::ssize_t *rhs_shape = rhs.get_shape_raw();
1169 const py::ssize_t *lhs_shape = lhs.get_shape_raw();
1170 bool shapes_equal(
true);
1171 std::size_t rhs_nelems(1);
1173 for (
int i = 0; i < lhs_nd; ++i) {
1174 rhs_nelems *=
static_cast<std::size_t
>(rhs_shape[i]);
1175 shapes_equal = shapes_equal && (rhs_shape[i] == lhs_shape[i]);
1177 if (!shapes_equal) {
1178 throw py::value_error(
"Array shapes are not the same.");
1182 if (rhs_nelems == 0) {
1183 return std::make_pair(sycl::event(), sycl::event());
1186 dpnp::tensor::validation::AmpleMemory::throw_if_not_ample(lhs, rhs_nelems);
1189 auto const &same_logical_tensors =
1190 dpnp::tensor::overlap::SameLogicalTensors();
1191 auto const &overlap = dpnp::tensor::overlap::MemoryOverlap();
1192 if (overlap(rhs, lhs) && !same_logical_tensors(rhs, lhs)) {
1193 throw py::value_error(
"Arrays index overlapping segments of memory");
1196 const char *rhs_data = rhs.get_data();
1197 char *lhs_data = lhs.get_data();
1200 bool is_rhs_c_contig = rhs.is_c_contiguous();
1201 bool is_rhs_f_contig = rhs.is_f_contiguous();
1203 bool is_lhs_c_contig = lhs.is_c_contiguous();
1204 bool is_lhs_f_contig = lhs.is_f_contiguous();
1206 bool both_c_contig = (is_rhs_c_contig && is_lhs_c_contig);
1207 bool both_f_contig = (is_rhs_f_contig && is_lhs_f_contig);
1210 if (both_c_contig || both_f_contig) {
1211 auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid];
1213 if (contig_fn !=
nullptr) {
1214 auto comp_ev = contig_fn(exec_q, rhs_nelems, rhs_data, 0, lhs_data,
1217 dpnp::utils::keep_args_alive(exec_q, {rhs, lhs}, {comp_ev});
1219 return std::make_pair(ht_ev, comp_ev);
1224 auto const &rhs_strides = rhs.get_strides_vector();
1225 auto const &lhs_strides = lhs.get_strides_vector();
1227 using shT = std::vector<py::ssize_t>;
1228 shT simplified_shape;
1229 shT simplified_rhs_strides;
1230 shT simplified_lhs_strides;
1231 py::ssize_t rhs_offset(0);
1232 py::ssize_t lhs_offset(0);
1235 const py::ssize_t *shape = rhs_shape;
1237 simplify_iteration_space(nd, shape, rhs_strides, lhs_strides,
1239 simplified_shape, simplified_rhs_strides,
1240 simplified_lhs_strides, rhs_offset, lhs_offset);
1242 std::vector<sycl::event> host_tasks{};
1244 static constexpr auto unit_stride =
1245 std::initializer_list<py::ssize_t>{1};
1247 if ((nd == 1) && isEqual(simplified_rhs_strides, unit_stride) &&
1248 isEqual(simplified_lhs_strides, unit_stride)) {
1249 auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid];
1251 if (contig_fn !=
nullptr) {
1253 contig_fn(exec_q, rhs_nelems, rhs_data, rhs_offset,
1254 lhs_data, lhs_offset, depends);
1256 dpnp::utils::keep_args_alive(exec_q, {rhs, lhs}, {comp_ev});
1258 return std::make_pair(ht_ev, comp_ev);
1262 static constexpr auto one_zero_strides =
1263 std::initializer_list<py::ssize_t>{1, 0};
1264 static constexpr py::ssize_t one{1};
1266 if (isEqual(simplified_rhs_strides, one_zero_strides) &&
1267 isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) {
1268 auto row_matrix_broadcast_fn =
1269 contig_row_matrix_broadcast_dispatch_table[rhs_typeid]
1271 if (row_matrix_broadcast_fn !=
nullptr) {
1272 std::size_t n0 = simplified_shape[1];
1273 std::size_t n1 = simplified_shape[0];
1274 sycl::event comp_ev = row_matrix_broadcast_fn(
1275 exec_q, host_tasks, n0, n1, rhs_data, rhs_offset,
1276 lhs_data, lhs_offset, depends);
1278 return std::make_pair(dpnp::utils::keep_args_alive(
1279 exec_q, {lhs, rhs}, host_tasks),
1287 auto strided_fn = strided_dispatch_table[rhs_typeid][lhs_typeid];
1289 if (strided_fn ==
nullptr) {
1290 throw std::runtime_error(
1291 "Strided implementation is missing for rhs_typeid=" +
1292 std::to_string(rhs_typeid) +
1293 " and lhs_typeid=" + std::to_string(lhs_typeid));
1296 using dpnp::tensor::offset_utils::device_allocate_and_pack;
1297 auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
1298 exec_q, host_tasks, simplified_shape, simplified_rhs_strides,
1299 simplified_lhs_strides);
1300 auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
1301 auto copy_shape_ev = std::get<2>(ptr_sz_event_triple_);
1303 const py::ssize_t *shape_strides = shape_strides_owner.get();
1305 sycl::event strided_fn_ev =
1306 strided_fn(exec_q, rhs_nelems, nd, shape_strides, rhs_data, rhs_offset,
1307 lhs_data, lhs_offset, depends, {copy_shape_ev});
1310 sycl::event tmp_cleanup_ev = dpnp::tensor::alloc_utils::async_smart_free(
1311 exec_q, {strided_fn_ev}, shape_strides_owner);
1313 host_tasks.push_back(tmp_cleanup_ev);
1315 return std::make_pair(
1316 dpnp::utils::keep_args_alive(exec_q, {rhs, lhs}, host_tasks),