Source code for dpnp.dpnp_iface_histograms

# -*- coding: utf-8 -*-
# *****************************************************************************
# Copyright (c) 2024-2025, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# - Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

"""
Interface of histogram-related DPNP functions

Notes
-----
This module is a face or public interface file for the library
it contains:
 - Interface functions
 - documentation for the functions
 - The functions parameters check

"""

import operator
from collections.abc import Iterable

import dpctl.utils as dpu
import numpy

import dpnp

# pylint: disable=no-name-in-module
import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext
from dpnp.dpnp_utils.dpnp_utils_common import (
    result_type_for_device,
    to_supported_dtypes,
)

# pylint: disable=no-name-in-module
from .dpnp_utils import get_usm_allocations

__all__ = [
    "bincount",
    "digitize",
    "histogram",
    "histogram_bin_edges",
    "histogram2d",
    "histogramdd",
]

# range is a keyword argument to many functions, so save the builtin so they can
# use it.
_range = range


def _align_dtypes(a_dtype, bins_dtype, ntype, supported_types, device):
    a_bin_dtype = result_type_for_device([a_dtype, bins_dtype], device)

    # histogram implementation doesn't support uint64 as histogram type
    # we can use int64 instead. Result would be correct even in case of overflow
    if ntype == numpy.uint64:
        ntype = dpnp.int64

    return to_supported_dtypes([a_bin_dtype, ntype], supported_types, device)


def _ravel_check_a_and_weights(a, weights):
    """
    Check input `a` and `weights` arrays, and ravel both.
    The returned array have :class:`dpnp.ndarray` type always.

    """

    # ensure that `a` array has supported type
    dpnp.check_supported_arrays_type(a)
    usm_type = a.usm_type

    if weights is not None:
        # check that `weights` array has supported type
        dpnp.check_supported_arrays_type(weights)
        usm_type = dpu.get_coerced_usm_type([usm_type, weights.usm_type])

        # check that arrays have the same allocation queue
        if dpu.get_execution_queue([a.sycl_queue, weights.sycl_queue]) is None:
            raise ValueError(
                "a and weights must be allocated on the same SYCL queue"
            )

        if weights.shape != a.shape:
            raise ValueError("weights should have the same shape as a.")
        weights = dpnp.ravel(weights)

    a = dpnp.ravel(a)
    return a, weights, usm_type


def _get_outer_edges(a, range):
    """
    Determine the outer bin edges to use, from either the data or the range
    argument.

    """

    def _is_finite(a):
        if dpnp.is_supported_array_type(a):
            return dpnp.isfinite(a)

        return numpy.isfinite(a)

    if range is not None:
        if len(range) != 2:
            raise ValueError("range argument must consist of 2 elements.")

        first_edge, last_edge = range
        if first_edge > last_edge:
            raise ValueError("max must be larger than min in range parameter.")

        if not (_is_finite(first_edge) and _is_finite(last_edge)):
            raise ValueError(
                f"supplied range of [{first_edge}, {last_edge}] is not finite"
            )

    elif a.size == 0:
        # handle empty arrays. Can't determine range, so use 0-1.
        first_edge, last_edge = 0, 1

    else:
        first_edge, last_edge = a.min(), a.max()
        if not (_is_finite(first_edge) and _is_finite(last_edge)):
            raise ValueError(
                f"autodetected range of [{first_edge}, {last_edge}] "
                "is not finite"
            )

    # expand empty range to avoid divide by zero
    if first_edge == last_edge:
        first_edge = first_edge - 0.5
        last_edge = last_edge + 0.5

    return first_edge, last_edge


def _get_bin_edges(a, bins, range, usm_type):
    """Computes the bins used internally by `histogram`."""

    # parse the overloaded bins argument
    n_equal_bins = None
    bin_edges = None
    sycl_queue = a.sycl_queue

    if isinstance(bins, str):
        # TODO: implement support of string bins
        raise NotImplementedError("only integer and array bins are implemented")

    if numpy.ndim(bins) == 0:
        try:
            n_equal_bins = operator.index(bins)
        except TypeError as e:
            raise TypeError("`bins` must be an integer or an array") from e
        if n_equal_bins < 1:
            raise ValueError("`bins` must be positive, when an integer")

        first_edge, last_edge = _get_outer_edges(a, range)

    elif numpy.ndim(bins) == 1:
        if dpnp.is_supported_array_type(bins):
            if dpu.get_execution_queue([a.sycl_queue, bins.sycl_queue]) is None:
                raise ValueError(
                    "a and bins must be allocated on the same SYCL queue"
                )

            bin_edges = bins
        else:
            bin_edges = dpnp.asarray(
                bins, sycl_queue=sycl_queue, usm_type=usm_type
            )

        if dpnp.any(bin_edges[:-1] > bin_edges[1:]):
            raise ValueError(
                "`bins` must increase monotonically, when an array"
            )

    else:
        raise ValueError("`bins` must be 1d, when an array")

    if n_equal_bins is not None:
        # numpy's gh-10322 means that type resolution rules are dependent on
        # array shapes. To avoid this causing problems, we pick a type now and
        # stick with it throughout.
        # pylint: disable=possibly-used-before-assignment
        bin_type = dpnp.result_type(first_edge, last_edge, a)
        if dpnp.issubdtype(bin_type, dpnp.integer):
            bin_type = dpnp.result_type(
                bin_type, dpnp.default_float_type(sycl_queue=sycl_queue), a
            )

        # bin edges must be computed
        bin_edges = dpnp.linspace(
            first_edge,
            last_edge,
            n_equal_bins + 1,
            endpoint=True,
            dtype=bin_type,
            sycl_queue=sycl_queue,
            usm_type=usm_type,
        )
        return bin_edges, (first_edge, last_edge, n_equal_bins)
    return bin_edges, None


def _bincount_validate(x, weights, minlength):
    if x.ndim > 1:
        raise ValueError("object too deep for desired array")
    if x.ndim < 1:
        raise ValueError("object of too small depth for desired array")
    if not dpnp.issubdtype(x.dtype, dpnp.integer) and not dpnp.issubdtype(
        x.dtype, dpnp.bool
    ):
        raise TypeError("x must be an integer array")
    if weights is not None:
        if x.shape != weights.shape:
            raise ValueError("The weights and x don't have the same length.")
        if not (
            dpnp.issubdtype(weights.dtype, dpnp.integer)
            or dpnp.issubdtype(weights.dtype, dpnp.floating)
            or dpnp.issubdtype(weights.dtype, dpnp.bool)
        ):
            raise ValueError(
                f"Weights must be integer or float. Got {weights.dtype}"
            )

    if minlength is not None:
        minlength = int(minlength)
        if minlength < 0:
            raise ValueError("minlength must be non-negative")


def _bincount_run_native(
    x_casted, weights_casted, minlength, n_dtype, usm_type
):
    queue = x_casted.sycl_queue

    max_v = dpnp.max(x_casted)
    min_v = dpnp.min(x_casted)

    if min_v < 0:
        raise ValueError("x argument must have no negative arguments")

    size = int(dpnp.max(max_v)) + 1
    if minlength is not None:
        size = max(size, minlength)

    # bincount implementation uses atomics, but atomics doesn't work with
    # host usm memory
    n_usm_type = "device" if usm_type == "host" else usm_type

    # bincount implementation requires output array to be filled with zeros
    n_casted = dpnp.zeros(
        size, dtype=n_dtype, usm_type=n_usm_type, sycl_queue=queue
    )

    _manager = dpu.SequentialOrderManager[queue]

    x_usm = dpnp.get_usm_ndarray(x_casted)
    weights_usm = (
        dpnp.get_usm_ndarray(weights_casted)
        if weights_casted is not None
        else None
    )
    n_usm = dpnp.get_usm_ndarray(n_casted)

    mem_ev, bc_ev = statistics_ext.bincount(
        x_usm,
        min_v,
        max_v,
        weights_usm,
        n_usm,
        depends=_manager.submitted_events,
    )

    _manager.add_event_pair(mem_ev, bc_ev)

    return n_casted


[docs] def bincount(x, weights=None, minlength=None): """ bincount(x, /, weights=None, minlength=None) Count number of occurrences of each value in array of non-negative ints. For full documentation refer to :obj:`numpy.bincount`. Parameters ---------- x : {dpnp.ndarray, usm_ndarray} Input 1-dimensional array with non-negative integer values. weights : {None, dpnp.ndarray, usm_ndarray}, optional Weights, array of the same shape as `x`. Default: ``None`` minlength : {None, int}, optional A minimum number of bins for the output array. Default: ``None`` Returns ------- out : dpnp.ndarray of ints The result of binning the input array. The length of `out` is equal to ``dpnp.max(x) + 1``. See Also -------- :obj:`dpnp.histogram` : Compute the histogram of a data set. :obj:`dpnp.digitize` : Return the indices of the bins to which each value :obj:`dpnp.unique` : Find the unique elements of an array. Examples -------- >>> import dpnp as np >>> np.bincount(np.arange(5)) array([1, 1, 1, 1, 1]) >>> np.bincount(np.array([0, 1, 1, 3, 2, 1, 7])) array([1, 3, 1, 1, 0, 0, 0, 1]) >>> x = np.array([0, 1, 1, 3, 2, 1, 7, 23]) >>> np.bincount(x).size == np.amax(x) + 1 array(True) The input array needs to be of integer dtype, otherwise a TypeError is raised: >>> np.bincount(np.arange(5, dtype=np.float32)) Traceback (most recent call last): ... TypeError: x must be an integer array A possible use of :obj:`dpnp.bincount` is to perform sums over variable-size chunks of an array, using the `weights` keyword. >>> w = np.array([0.3, 0.5, 0.2, 0.7, 1., -0.6], dtype=np.float32) # weights >>> x = np.array([0, 1, 1, 2, 2, 2]) >>> np.bincount(x, weights=w) array([0.3, 0.7, 1.1], dtype=float32) """ _bincount_validate(x, weights, minlength) x, weights, usm_type = _ravel_check_a_and_weights(x, weights) queue = x.sycl_queue device = queue.sycl_device if weights is None: ntype = dpnp.dtype(dpnp.intp) else: # unlike in case of histogram result type is integer if no weights # provided and float if weights are provided even if weights are integer ntype = dpnp.default_float_type(sycl_queue=queue) weights_casted = None supported_types = statistics_ext.bincount_dtypes() x_casted_dtype, ntype_casted = _align_dtypes( x.dtype, x.dtype, ntype, supported_types, device ) if x_casted_dtype is None or ntype_casted is None: # pragma: no cover raise ValueError( f"function '{bincount}' does not support input types " f"({x.dtype}, {ntype}), " "and the inputs could not be coerced to any " "supported types" ) x_casted = dpnp.asarray(x, dtype=x_casted_dtype, order="C") if weights is not None: weights_casted = dpnp.asarray(weights, dtype=ntype_casted, order="C") n_casted = _bincount_run_native( x_casted, weights_casted, minlength, ntype_casted, usm_type ) return dpnp.asarray(n_casted, dtype=ntype, usm_type=usm_type)
[docs] def digitize(x, bins, right=False): """ Return the indices of the bins to which each value in input array belongs. For full documentation refer to :obj:`numpy.digitize`. Parameters ---------- a : {dpnp.ndarray, usm_ndarray} Input array to be binned. bins : {dpnp.ndarray, usm_ndarray} Array of bins. It has to be 1-dimensional and monotonic increasing or decreasing. right : bool, optional Indicates whether the intervals include the right or the left bin edge. Default: ``False``. Returns ------- indices : dpnp.ndarray Array of indices with the same shape as `x`. Notes ----- This will not raise an exception when the input array is not monotonic. See Also -------- :obj:`dpnp.bincount` : Count number of occurrences of each value in array of non-negative integers. :obj:`dpnp.histogram` : Compute the histogram of a data set. :obj:`dpnp.unique` : Find the unique elements of an array. :obj:`dpnp.searchsorted` : Find indices where elements should be inserted to maintain order. Examples -------- >>> import dpnp as np >>> x = np.array([0.2, 6.4, 3.0, 1.6]) >>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0]) >>> inds = np.digitize(x, bins) >>> inds array([1, 4, 3, 2]) >>> for n in range(x.size): ... print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]]) ... 0. <= 0.2 < 1. 4. <= 6.4 < 10. 2.5 <= 3. < 4. 1. <= 1.6 < 2.5 >>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.]) >>> bins = np.array([0, 5, 10, 15, 20]) >>> np.digitize(x, bins, right=True) array([1, 2, 3, 4, 4]) >>> np.digitize(x, bins, right=False) array([1, 3, 3, 4, 5]) """ dpnp.check_supported_arrays_type(x, bins) if dpnp.issubdtype(x.dtype, dpnp.complexfloating): raise TypeError("x may not be complex") if bins.ndim > 1: raise ValueError("object too deep for desired array") if bins.ndim < 1: raise ValueError("object of too small depth for desired array") # This is backwards because the arguments below are swapped side = "left" if right else "right" # Check if bins are monotonically increasing. # If bins is empty, the array is considered to be increasing. # If all bins are NaN, the array is considered to be decreasing. if bins.size == 0: bins_increasing = True else: bins_increasing = bins[0] <= bins[-1] or ( not dpnp.isnan(bins[0]) and dpnp.isnan(bins[-1]) ) if bins_increasing: # Use dpnp.searchsorted directly if bins are increasing return dpnp.searchsorted(bins, x, side=side) # Reverse bins and adjust indices if bins are decreasing return bins.size - dpnp.searchsorted(bins[::-1], x, side=side)
[docs] def histogram(a, bins=10, range=None, density=None, weights=None): """ Compute the histogram of a data set. For full documentation refer to :obj:`numpy.histogram`. Parameters ---------- a : {dpnp.ndarray, usm_ndarray} Input data. The histogram is computed over the flattened array. bins : {int, dpnp.ndarray, usm_ndarray, sequence of scalars}, optional If `bins` is an int, it defines the number of equal-width bins in the given range. If `bins` is a sequence, it defines a monotonically increasing array of bin edges, including the rightmost edge, allowing for non-uniform bin widths. Default: ``10``. range : {None, 2-tuple of float}, optional The lower and upper range of the bins. If not provided, range is simply ``(a.min(), a.max())``. Values outside the range are ignored. The first element of the range must be less than or equal to the second. `range` affects the automatic bin computation as well. While bin width is computed to be optimal based on the actual data within `range`, the bin count will fill the entire range including portions containing no data. Default: ``None``. density : {None, bool}, optional If ``False`` or ``None``, the result will contain the number of samples in each bin. If ``True``, the result is the value of the probability *density* function at the bin, normalized such that the *integral* over the range is ``1``. Note that the sum of the histogram values will not be equal to ``1`` unless bins of unity width are chosen; it is not a probability *mass* function. Default: ``None``. weights : {None, dpnp.ndarray, usm_ndarray}, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count (instead of 1). If `density` is ``True``, the weights are normalized, so that the integral of the density over the range remains ``1``. Please note that the ``dtype`` of `weights` will also become the ``dtype`` of the returned accumulator (`hist`), so it must be large enough to hold accumulated values as well. Default: ``None``. Returns ------- hist : {dpnp.ndarray} The values of the histogram. See `density` and `weights` for a description of the possible semantics. If `weights` are given, ``hist.dtype`` will be taken from `weights`. bin_edges : {dpnp.ndarray of floating data type} Return the bin edges ``(length(hist) + 1)``. See Also -------- :obj:`dpnp.histogramdd` : Compute the multidimensional histogram. :obj:`dpnp.bincount` : Count number of occurrences of each value in array of non-negative integers. :obj:`dpnp.searchsorted` : Find indices where elements should be inserted to maintain order. :obj:`dpnp.digitize` : Return the indices of the bins to which each value in input array belongs. :obj:`dpnp.histogram_bin_edges` : Return only the edges of the bins used by the obj:`dpnp.histogram` function. Examples -------- >>> import dpnp as np >>> np.histogram(np.array([1, 2, 1]), bins=[0, 1, 2, 3]) (array([0, 2, 1]), array([0, 1, 2, 3])) >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) >>> np.histogram(np.array([[1, 2, 1], [1, 0, 1]]), bins=[0, 1, 2, 3]) (array([1, 4, 1]), array([0, 1, 2, 3])) >>> a = np.arange(5) >>> hist, bin_edges = np.histogram(a, density=True) >>> hist array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) >>> hist.sum() array(2.5) >>> np.sum(hist * np.diff(bin_edges)) array(1.) """ a, weights, usm_type = _ravel_check_a_and_weights(a, weights) bin_edges, _ = _get_bin_edges(a, bins, range, usm_type) # Histogram is an integer or a float array depending on the weights. if weights is None: ntype = dpnp.dtype(dpnp.intp) else: ntype = weights.dtype queue = a.sycl_queue device = queue.sycl_device supported_types = statistics_ext.histogram_dtypes() a_bin_dtype, hist_dtype = _align_dtypes( a.dtype, bin_edges.dtype, ntype, supported_types, device ) if a_bin_dtype is None or hist_dtype is None: # pragma: no cover raise ValueError( f"function '{histogram}' does not support input types " f"({a.dtype}, {bin_edges.dtype}, {ntype}), " "and the inputs could not be coerced to any " "supported types" ) a_casted = dpnp.asarray(a, dtype=a_bin_dtype, order="C") bin_edges_casted = dpnp.asarray(bin_edges, dtype=a_bin_dtype, order="C") weights_casted = ( dpnp.asarray(weights, dtype=hist_dtype, order="C") if weights is not None else None ) # histogram implementation uses atomics, but atomics doesn't work with # host usm memory n_usm_type = "device" if usm_type == "host" else usm_type # histogram implementation requires output array to be filled with zeros n_casted = dpnp.zeros( bin_edges.size - 1, dtype=hist_dtype, sycl_queue=a.sycl_queue, usm_type=n_usm_type, ) _manager = dpu.SequentialOrderManager[queue] a_usm = dpnp.get_usm_ndarray(a_casted) bins_usm = dpnp.get_usm_ndarray(bin_edges_casted) weights_usm = ( dpnp.get_usm_ndarray(weights_casted) if weights_casted is not None else None ) n_usm = dpnp.get_usm_ndarray(n_casted) mem_ev, ht_ev = statistics_ext.histogram( a_usm, bins_usm, weights_usm, n_usm, depends=_manager.submitted_events, ) _manager.add_event_pair(mem_ev, ht_ev) n = dpnp.asarray(n_casted, dtype=ntype, usm_type=usm_type) if density: db = dpnp.astype( dpnp.diff(bin_edges), dpnp.default_float_type(sycl_queue=queue) ) return n / db / dpnp.sum(n), bin_edges return n, bin_edges
[docs] def histogram_bin_edges(a, bins=10, range=None, weights=None): """ Function to calculate only the edges of the bins used by the :obj:`dpnp.histogram` function. For full documentation refer to :obj:`numpy.histogram_bin_edges`. Parameters ---------- a : {dpnp.ndarray, usm_ndarray} Input data. The histogram is computed over the flattened array. bins : {int, dpnp.ndarray, usm_ndarray, sequence of scalars}, optional If `bins` is an int, it defines the number of equal-width bins in the given range. If `bins` is a sequence, it defines the bin edges, including the rightmost edge, allowing for non-uniform bin widths. Default: ``10``. range : {None, 2-tuple of float}, optional The lower and upper range of the bins. If not provided, range is simply ``(a.min(), a.max())``. Values outside the range are ignored. The first element of the range must be less than or equal to the second. `range` affects the automatic bin computation as well. While bin width is computed to be optimal based on the actual data within `range`, the bin count will fill the entire range including portions containing no data. Default: ``None``. weights : {None, dpnp.ndarray, usm_ndarray}, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count (instead of 1). This is currently not used by any of the bin estimators, but may be in the future. Default: ``None``. Returns ------- bin_edges : {dpnp.ndarray of floating data type} The edges to pass into :obj:`dpnp.histogram`. See Also -------- :obj:`dpnp.histogram` : Compute the histogram of a data set. Examples -------- >>> import dpnp as np >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5]) >>> np.histogram_bin_edges(arr, bins=2) array([0. , 2.5, 5. ]) For consistency with histogram, an array of pre-computed bins is passed through unmodified: >>> np.histogram_bin_edges(arr, [1, 2]) array([1, 2]) This function allows one set of bins to be computed, and reused across multiple histograms: >>> shared_bins = np.histogram_bin_edges(arr, bins=5) >>> shared_bins array([0., 1., 2., 3., 4., 5.]) >>> gid = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1]) >>> hist_0, _ = np.histogram(arr[gid == 0], bins=shared_bins) >>> hist_1, _ = np.histogram(arr[gid == 1], bins=shared_bins) >>> hist_0, hist_1 (array([1, 1, 0, 1, 0]), array([2, 0, 1, 1, 2])) Which gives more easily comparable results than using separate bins for each histogram: >>> hist_0, bins_0 = np.histogram(arr[gid == 0], bins=3) >>> hist_1, bins_1 = np.histogram(arr[gid == 1], bins=4) >>> hist_0, hist_1 (array([1, 1, 1]), array([2, 1, 1, 2])) >>> bins_0, bins_1 (array([0., 1., 2., 3.]), array([0. , 1.25, 2.5 , 3.75, 5. ])) """ a, weights, usm_type = _ravel_check_a_and_weights(a, weights) bin_edges, _ = _get_bin_edges(a, bins, range, usm_type) return bin_edges
[docs] def histogram2d(x, y, bins=10, range=None, density=None, weights=None): """ Compute the bi-dimensional histogram of two data samples. Parameters ---------- x : {dpnp.ndarray, usm_ndarray} of shape (N,) An array containing the `x` coordinates of the points to be histogrammed. y : {dpnp.ndarray, usm_ndarray} of shape (N,) An array containing the `y` coordinates of the points to be histogrammed. bins : {int, dpnp.ndarray, usm_ndarray, [int, int], [array, array], \ [int, array], [array, int]}, optional The bins specification: * If int, the number of bins for the two dimensions (nx=ny=bins). * If array, the bin edges for the two dimensions (x_edges=y_edges=bins). * If [int, int], the number of bins in each dimension (nx, ny = bins). * If [array, array], the bin edges in each dimension (x_edges, y_edges = bins). * A combination [int, array] or [array, int], where int is the number of bins and array is the bin edges. Default: ``10``. range : {None, dpnp.ndarray, usm_ndarray} of shape (2,2), optional The leftmost and rightmost edges of the bins along each dimension If ``None`` the ranges are ``[[x.min(), x.max()], [y.min(), y.max()]]``. All values outside of this range will be considered outliers and not tallied in the histogram. Default: ``None``. density : {None, bool}, optional If ``False`` or ``None``, the default, returns the number of samples in each bin. If ``True``, returns the probability *density* function at the bin, ``bin_count / sample_count / bin_volume``. Default: ``None``. weights : {None, dpnp.ndarray, usm_ndarray} of shape (N,), optional An array of values ``w_i`` weighing each sample ``(x_i, y_i)``. Weights are normalized to ``1`` if `density` is ``True``. If `density` is ``False``, the values of the returned histogram are equal to the sum of the weights belonging to the samples falling into each bin. If ``None`` all samples are assigned a weight of ``1``. Default: ``None``. Returns ------- H : dpnp.ndarray of shape (nx, ny) The bi-dimensional histogram of samples `x` and `y`. Values in `x` are histogrammed along the first dimension and values in `y` are histogrammed along the second dimension. xedges : dpnp.ndarray of shape (nx+1,) The bin edges along the first dimension. yedges : dpnp.ndarray of shape (ny+1,) The bin edges along the second dimension. See Also -------- :obj:`dpnp.histogram` : 1D histogram :obj:`dpnp.histogramdd` : Multidimensional histogram Notes ----- When `density` is ``True``, then the returned histogram is the sample density, defined such that the sum over bins of the product ``bin_value * bin_area`` is 1. Please note that the histogram does not follow the Cartesian convention where `x` values are on the abscissa and `y` values on the ordinate axis. Rather, `x` is histogrammed along the first dimension of the array (vertical), and `y` along the second dimension of the array (horizontal). This ensures compatibility with `histogramdd`. Examples -------- >>> import dpnp as np >>> x = np.random.randn(20).astype("float32") >>> y = np.random.randn(20).astype("float32") >>> hist, edges_x, edges_y = np.histogram2d(x, y, bins=(4, 3)) >>> hist.shape (4, 3) >>> hist array([[1., 2., 0.], [0., 3., 1.], [1., 4., 1.], [1., 3., 3.]], dtype=float32) >>> edges_x.shape (5,) >>> edges_x array([-1.7516936 , -0.96109843, -0.17050326, 0.62009203, 1.4106871 ], dtype=float32) >>> edges_y.shape (4,) >>> edges_y array([-2.6604428 , -0.94615364, 0.76813555, 2.4824247 ], dtype=float32) Please note, that resulting values of histogram and edges may vary. """ dpnp.check_supported_arrays_type(x, y) if weights is not None: dpnp.check_supported_arrays_type(weights) if x.ndim != 1 or y.ndim != 1: raise ValueError( f"x and y must be 1-dimensional arrays." f"Got {x.ndim} and {y.ndim} respectively" ) if len(x) != len(y): raise ValueError( f"x and y must have the same length." f"Got {len(x)} and {len(y)} respectively" ) usm_type, exec_q = get_usm_allocations([x, y, bins, range, weights]) device = exec_q.sycl_device sample_dtype = result_type_for_device([x.dtype, y.dtype], device) # Unlike histogramdd histogram2d accepts 1d bins and # apply it to both dimensions # at the same moment two elements bins should be interpreted as # number of bins in each dimension and array-like bins with one element # is not allowed if isinstance(bins, Iterable) and len(bins) > 2: bins = [bins] * 2 bins = _histdd_normalize_bins(bins, 2) bins_dtypes = [sample_dtype] bins_dtypes += [b.dtype for b in bins if hasattr(b, "dtype")] bins_dtype = result_type_for_device(bins_dtypes, device) hist_dtype = _histdd_hist_dtype(exec_q, weights) supported_types = statistics_ext.histogramdd_dtypes() sample_dtype, _ = _align_dtypes( sample_dtype, bins_dtype, hist_dtype, supported_types, device ) sample = dpnp.empty_like( x, shape=x.shape + (2,), dtype=sample_dtype, usm_type=usm_type ) sample[:, 0] = x sample[:, 1] = y hist, edges = histogramdd( sample, bins=bins, range=range, density=density, weights=weights ) return hist, edges[0], edges[1]
def _histdd_validate_bins(bins): for i, b in enumerate(bins): if numpy.ndim(b) == 0: if b < 1: raise ValueError( f"'bins[{i}' must be positive, when an integer" ) elif numpy.ndim(b) == 1: # will check for monotonicity later pass else: raise ValueError( f"'bins[{i}]' must be either scalar or 1d array-like," + f" but it is {type(b)}" ) def _histdd_normalize_bins(bins, ndims): if not isinstance(bins, Iterable): if not dpnp.issubdtype(type(bins), dpnp.integer): raise ValueError("'bins' must be an integer, when a scalar") bins = [bins] * ndims if len(bins) != ndims: raise ValueError( f"The dimension of bins ({len(bins)}) must be equal" + f" to the dimension of the sample ({ndims})." ) _histdd_validate_bins(bins) return bins def _histdd_normalize_range(range, ndims): if range is None: range = [None] * ndims if len(range) != ndims: raise ValueError( f"range argument length ({len(range)}) must match" + f" number of dimensions ({ndims})" ) return range def _histdd_make_edges(sample, bins, range, usm_type): bedges_list = [] for i, (r, _bins) in enumerate(zip(range, bins)): bedges, _ = _get_bin_edges(sample[:, i], _bins, r, usm_type) bedges_list.append(bedges) return bedges_list def _histdd_flatten_binedges(bedges_list, edges_count_list, dtype): total_edges_size = numpy.sum(edges_count_list) bin_edges_flat = dpnp.empty_like( bedges_list[0], shape=total_edges_size, dtype=dtype ) offset = numpy.pad(numpy.cumsum(edges_count_list), (1, 0)) bin_edges_view_list = [] for start, end, bedges in zip(offset[:-1], offset[1:], bedges_list): edges_slice = bin_edges_flat[start:end] bin_edges_view_list.append(edges_slice) edges_slice[:] = bedges return bin_edges_flat, bin_edges_view_list def _histdd_run_native( sample, weights, hist_dtype, bin_edges, edges_count_list, usm_type ): queue = sample.sycl_queue hist_shape = [ec - 1 for ec in edges_count_list] bin_edges_count = dpnp.asarray( edges_count_list, dtype=dpnp.int64, sycl_queue=queue ) n_usm_type = "device" if usm_type == "host" else usm_type n = dpnp.zeros( shape=hist_shape, dtype=hist_dtype, sycl_queue=queue, usm_type=n_usm_type, ) sample_usm = dpnp.get_usm_ndarray(sample) weights_usm = dpnp.get_usm_ndarray(weights) if weights is not None else None edges_usm = dpnp.get_usm_ndarray(bin_edges) edges_count_usm = dpnp.get_usm_ndarray(bin_edges_count) n_usm = dpnp.get_usm_ndarray(n) _manager = dpu.SequentialOrderManager[queue] mem_ev, hdd_ev = statistics_ext.histogramdd( sample_usm, edges_usm, edges_count_usm, weights_usm, n_usm, depends=_manager.submitted_events, ) _manager.add_event_pair(mem_ev, hdd_ev) return n def _histdd_hist_dtype(queue, weights): hist_dtype = dpnp.default_float_type(sycl_queue=queue) device = queue.sycl_device if weights is not None: # hist_dtype is either float or complex, so it is ok # to calculate it as result type between default_float and # weights.dtype hist_dtype = result_type_for_device([hist_dtype, weights.dtype], device) return hist_dtype def _histdd_sample_dtype(queue, sample, bin_edges_list): device = queue.sycl_device dtypes_ = [bin_edges.dtype for bin_edges in bin_edges_list] dtypes_.append(sample.dtype) return result_type_for_device(dtypes_, device) def _histdd_supported_dtypes(sample, bin_edges_list, weights): queue = sample.sycl_queue device = queue.sycl_device hist_dtype = _histdd_hist_dtype(queue, weights) sample_dtype = _histdd_sample_dtype(queue, sample, bin_edges_list) supported_types = statistics_ext.histogramdd_dtypes() # passing sample_dtype twice as we already # aligned sample_dtype and bins dtypes sample_dtype, hist_dtype = _align_dtypes( sample_dtype, sample_dtype, hist_dtype, supported_types, device ) return sample_dtype, hist_dtype def _histdd_extract_arrays(sample, weights, bins): all_arrays = [sample] if weights is not None: all_arrays.append(weights) if isinstance(bins, Iterable): all_arrays.extend([b for b in bins if dpnp.is_supported_array_type(b)]) return all_arrays
[docs] def histogramdd(sample, bins=10, range=None, density=None, weights=None): """ Compute the multidimensional histogram of some data. For full documentation refer to :obj:`numpy.histogramdd`. Parameters ---------- sample : {dpnp.ndarray, usm_ndarray} Input (N, D)-shaped array to be histogrammed. bins : {sequence, int}, optional The bin specification: * A sequence of arrays describing the monotonically increasing bin edges along each dimension. * The number of bins for each dimension (nx, ny, ... =bins) * The number of bins for all dimensions (nx=ny=...=bins). Default: ``10``. range : {None, sequence}, optional A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the edges are not given explicitly in `bins`. An entry of ``None`` in the sequence results in the minimum and maximum values being used for the corresponding dimension. ``None`` is equivalent to passing a tuple of D ``None`` values. Default: ``None``. density : {None, bool}, optional If ``False`` or ``None``, the default, returns the number of samples in each bin. If ``True``, returns the probability *density* function at the bin, ``bin_count / sample_count / bin_volume``. Default: ``None``. weights : {None, dpnp.ndarray, usm_ndarray}, optional An (N,)-shaped array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. Weights are normalized to ``1`` if density is ``True``. If density is ``False``, the values of the returned histogram are equal to the sum of the weights belonging to the samples falling into each bin. If ``None`` all samples are assigned a weight of ``1``. Default: ``None``. Returns ------- H : dpnp.ndarray The multidimensional histogram of sample x. See density and weights for the different possible semantics. edges : list of {dpnp.ndarray or usm_ndarray} A list of D arrays describing the bin edges for each dimension. See Also -------- :obj:`dpnp.histogram`: 1-D histogram :obj:`dpnp.histogram2d`: 2-D histogram Examples -------- >>> import dpnp as np >>> r = np.random.normal(size=(100, 3)) >>> H, edges = np.histogramdd(r, bins = (5, 8, 4)) >>> H.shape, edges[0].size, edges[1].size, edges[2].size ((5, 8, 4), 6, 9, 5) """ dpnp.check_supported_arrays_type(sample) if weights is not None: dpnp.check_supported_arrays_type(weights) if sample.ndim < 2: sample = dpnp.reshape(sample, (sample.size, 1)) elif sample.ndim > 2: raise ValueError("sample must have no more than 2 dimensions") ndim = sample.shape[1] _arrays = _histdd_extract_arrays(sample, weights, bins) usm_type, queue = get_usm_allocations(_arrays) bins = _histdd_normalize_bins(bins, ndim) range = _histdd_normalize_range(range, ndim) bin_edges_list = _histdd_make_edges(sample, bins, range, usm_type) sample_dtype, hist_dtype = _histdd_supported_dtypes( sample, bin_edges_list, weights ) edges_count_list = [bin_edges.size for bin_edges in bin_edges_list] bin_edges_flat, bin_edges_view_list = _histdd_flatten_binedges( bin_edges_list, edges_count_list, sample_dtype ) sample_ = dpnp.asarray(sample, dtype=sample_dtype, order="C") weights_ = ( dpnp.asarray(weights, dtype=hist_dtype, order="C") if weights is not None else None ) n = _histdd_run_native( sample_, weights_, hist_dtype, bin_edges_flat, edges_count_list, usm_type, ) expexted_hist_dtype = _histdd_hist_dtype(queue, weights) n = dpnp.asarray(n, dtype=expexted_hist_dtype, usm_type=usm_type) if density: # calculate the probability density function s = n.sum() for i in _range(ndim): diff = dpnp.diff(bin_edges_view_list[i]) shape = [1] * ndim shape[i] = diff.size n = n / dpnp.reshape(diff, shape=shape) n /= s for i, b in enumerate(bins): if dpnp.is_supported_array_type(b): bin_edges_view_list[i] = b return n, bin_edges_view_list