Source code for dpnp.dpnp_iface_histograms

# -*- coding: utf-8 -*-
# *****************************************************************************
# Copyright (c) 2024, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# - Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

"""
Interface of histogram-related DPNP functions

Notes
-----
This module is a face or public interface file for the library
it contains:
 - Interface functions
 - documentation for the functions
 - The functions parameters check

"""

import operator
import warnings

import dpctl.utils as dpu
import numpy

import dpnp

__all__ = [
    "digitize",
    "histogram",
    "histogram_bin_edges",
]

# range is a keyword argument to many functions, so save the builtin so they can
# use it.
_range = range


def _ravel_check_a_and_weights(a, weights):
    """
    Check input `a` and `weights` arrays, and ravel both.
    The returned array have :class:`dpnp.ndarray` type always.

    """

    # ensure that `a` array has supported type
    dpnp.check_supported_arrays_type(a)
    usm_type = a.usm_type

    # ensure that the array is a "subtractable" dtype
    if a.dtype == dpnp.bool:
        warnings.warn(
            f"Converting input from {a.dtype} to {numpy.uint8} "
            "for compatibility.",
            RuntimeWarning,
            stacklevel=3,
        )
        a = dpnp.astype(a, numpy.uint8)

    if weights is not None:
        # check that `weights` array has supported type
        dpnp.check_supported_arrays_type(weights)
        usm_type = dpu.get_coerced_usm_type([usm_type, weights.usm_type])

        # check that arrays have the same allocation queue
        if dpu.get_execution_queue([a.sycl_queue, weights.sycl_queue]) is None:
            raise ValueError(
                "a and weights must be allocated on the same SYCL queue"
            )

        if weights.shape != a.shape:
            raise ValueError("weights should have the same shape as a.")
        weights = dpnp.ravel(weights)

    a = dpnp.ravel(a)
    return a, weights, usm_type


def _get_outer_edges(a, range):
    """
    Determine the outer bin edges to use, from either the data or the range
    argument.

    """

    if range is not None:
        first_edge, last_edge = range
        if first_edge > last_edge:
            raise ValueError("max must be larger than min in range parameter.")

        if not (numpy.isfinite(first_edge) and numpy.isfinite(last_edge)):
            raise ValueError(
                f"supplied range of [{first_edge}, {last_edge}] is not finite"
            )

    elif a.size == 0:
        # handle empty arrays. Can't determine range, so use 0-1.
        first_edge, last_edge = 0, 1

    else:
        first_edge, last_edge = a.min(), a.max()
        if not (dpnp.isfinite(first_edge) and dpnp.isfinite(last_edge)):
            raise ValueError(
                f"autodetected range of [{first_edge}, {last_edge}] "
                "is not finite"
            )

    # expand empty range to avoid divide by zero
    if first_edge == last_edge:
        first_edge = first_edge - 0.5
        last_edge = last_edge + 0.5

    return first_edge, last_edge


def _get_bin_edges(a, bins, range, usm_type):
    """Computes the bins used internally by `histogram`."""

    # parse the overloaded bins argument
    n_equal_bins = None
    bin_edges = None
    sycl_queue = a.sycl_queue

    if isinstance(bins, str):
        # TODO: implement support of string bins
        raise NotImplementedError("only integer and array bins are implemented")

    if numpy.ndim(bins) == 0:
        try:
            n_equal_bins = operator.index(bins)
        except TypeError as e:
            raise TypeError("`bins` must be an integer or an array") from e
        if n_equal_bins < 1:
            raise ValueError("`bins` must be positive, when an integer")

        first_edge, last_edge = _get_outer_edges(a, range)

    elif numpy.ndim(bins) == 1:
        if dpnp.is_supported_array_type(bins):
            if dpu.get_execution_queue([a.sycl_queue, bins.sycl_queue]) is None:
                raise ValueError(
                    "a and bins must be allocated on the same SYCL queue"
                )

            bin_edges = bins
        else:
            bin_edges = dpnp.asarray(
                bins, sycl_queue=sycl_queue, usm_type=usm_type
            )

        if dpnp.any(bin_edges[:-1] > bin_edges[1:]):
            raise ValueError(
                "`bins` must increase monotonically, when an array"
            )

    else:
        raise ValueError("`bins` must be 1d, when an array")

    if n_equal_bins is not None:
        # numpy's gh-10322 means that type resolution rules are dependent on
        # array shapes. To avoid this causing problems, we pick a type now and
        # stick with it throughout.
        # pylint: disable=possibly-used-before-assignment
        bin_type = dpnp.result_type(first_edge, last_edge, a)
        if dpnp.issubdtype(bin_type, dpnp.integer):
            bin_type = dpnp.result_type(
                bin_type, dpnp.default_float_type(sycl_queue=sycl_queue), a
            )

        # bin edges must be computed
        bin_edges = dpnp.linspace(
            first_edge,
            last_edge,
            n_equal_bins + 1,
            endpoint=True,
            dtype=bin_type,
            sycl_queue=sycl_queue,
            usm_type=usm_type,
        )
        return bin_edges, (first_edge, last_edge, n_equal_bins)
    return bin_edges, None


def _search_sorted_inclusive(a, v):
    """
    Like :obj:`dpnp.searchsorted`, but where the last item in `v` is placed
    on the right.
    In the context of a histogram, this makes the last bin edge inclusive

    """

    return dpnp.concatenate(
        (a.searchsorted(v[:-1], "left"), a.searchsorted(v[-1:], "right"))
    )



[docs]
def digitize(x, bins, right=False):
    """
    Return the indices of the bins to which each value in input array belongs.

    For full documentation refer to :obj:`numpy.digitize`.

    Parameters
    ----------
    a : {dpnp.ndarray, usm_ndarray}
        Input array to be binned.
    bins : {dpnp.ndarray, usm_ndarray}
        Array of bins. It has to be 1-dimensional and monotonic
        increasing or decreasing.
    right : bool, optional
        Indicates whether the intervals include the right or the left bin edge.
        Default: ``False``.

    Returns
    -------
    indices : dpnp.ndarray
        Array of indices with the same shape as `x`.

    Notes
    -----
    This will not raise an exception when the input array is
    not monotonic.

    See Also
    --------
    :obj:`dpnp.bincount` : Count number of occurrences of each value in array
                           of non-negative integers.
    :obj:`dpnp.histogram` : Compute the histogram of a data set.
    :obj:`dpnp.unique` : Find the unique elements of an array.
    :obj:`dpnp.searchsorted` : Find indices where elements should be inserted
                               to maintain order.

    Examples
    --------
    >>> import dpnp as np
    >>> x = np.array([0.2, 6.4, 3.0, 1.6])
    >>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0])
    >>> inds = np.digitize(x, bins)
    >>> inds
    array([1, 4, 3, 2])
    >>> for n in range(x.size):
    ...     print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]])
    ...
    0. <= 0.2 < 1.
    4. <= 6.4 < 10.
    2.5 <= 3. < 4.
    1. <= 1.6 < 2.5

    >>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.])
    >>> bins = np.array([0, 5, 10, 15, 20])
    >>> np.digitize(x, bins, right=True)
    array([1, 2, 3, 4, 4])
    >>> np.digitize(x, bins, right=False)
    array([1, 3, 3, 4, 5])

    """

    dpnp.check_supported_arrays_type(x, bins)

    if dpnp.issubdtype(x.dtype, dpnp.complexfloating):
        raise TypeError("x may not be complex")

    if bins.ndim > 1:
        raise ValueError("object too deep for desired array")
    if bins.ndim < 1:
        raise ValueError("object of too small depth for desired array")

    # This is backwards because the arguments below are swapped
    side = "left" if right else "right"

    # Check if bins are monotonically increasing.
    # If bins is empty, the array is considered to be increasing.
    # If all bins are NaN, the array is considered to be decreasing.
    if bins.size == 0:
        bins_increasing = True
    else:
        bins_increasing = bins[0] <= bins[-1] or (
            not dpnp.isnan(bins[0]) and dpnp.isnan(bins[-1])
        )

    if bins_increasing:
        # Use dpnp.searchsorted directly if bins are increasing
        return dpnp.searchsorted(bins, x, side=side)

    # Reverse bins and adjust indices if bins are decreasing
    return bins.size - dpnp.searchsorted(bins[::-1], x, side=side)




[docs]
def histogram(a, bins=10, range=None, density=None, weights=None):
    """
    Compute the histogram of a data set.

    For full documentation refer to :obj:`numpy.histogram`.

    Parameters
    ----------
    a : {dpnp.ndarray, usm_ndarray}
        Input data. The histogram is computed over the flattened array.
    bins : {int, dpnp.ndarray, usm_ndarray, sequence of scalars}, optional
        If `bins` is an int, it defines the number of equal-width bins in the
        given range.
        If `bins` is a sequence, it defines a monotonically increasing array
        of bin edges, including the rightmost edge, allowing for non-uniform
        bin widths.
        Default: ``10``.
    range : {None, 2-tuple of float}, optional
        The lower and upper range of the bins. If not provided, range is simply
        ``(a.min(), a.max())``. Values outside the range are ignored. The first
        element of the range must be less than or equal to the second. `range`
        affects the automatic bin computation as well. While bin width is
        computed to be optimal based on the actual data within `range`, the bin
        count will fill the entire range including portions containing no data.
        Default: ``None``.
    density : {None, bool}, optional
        If ``False`` or ``None``, the result will contain the number of samples
        in each bin. If ``True``, the result is the value of the probability
        *density* function at the bin, normalized such that the *integral* over
        the range is ``1``. Note that the sum of the histogram values will not
        be equal to ``1`` unless bins of unity width are chosen; it is not
        a probability *mass* function.
        Default: ``None``.
    weights : {None, dpnp.ndarray, usm_ndarray}, optional
        An array of weights, of the same shape as `a`. Each value in `a` only
        contributes its associated weight towards the bin count (instead of 1).
        If `density` is ``True``, the weights are normalized, so that the
        integral of the density over the range remains ``1``.
        Please note that the ``dtype`` of `weights` will also become the
        ``dtype`` of the returned accumulator (`hist`), so it must be large
        enough to hold accumulated values as well.
        Default: ``None``.

    Returns
    -------
    hist : {dpnp.ndarray}
        The values of the histogram. See `density` and `weights` for a
        description of the possible semantics. If `weights` are given,
        ``hist.dtype`` will be taken from `weights`.
    bin_edges : {dpnp.ndarray of floating data type}
        Return the bin edges ``(length(hist) + 1)``.

    See Also
    --------
    :obj:`dpnp.histogramdd` : Compute the multidimensional histogram.
    :obj:`dpnp.bincount` : Count number of occurrences of each value in array
                           of non-negative integers.
    :obj:`dpnp.searchsorted` : Find indices where elements should be inserted
                               to maintain order.
    :obj:`dpnp.digitize` : Return the indices of the bins to which each value
                           in input array belongs.
    :obj:`dpnp.histogram_bin_edges` : Return only the edges of the bins used
                                      by the obj:`dpnp.histogram` function.

    Examples
    --------
    >>> import dpnp as np
    >>> np.histogram(np.array([1, 2, 1]), bins=[0, 1, 2, 3])
    (array([0, 2, 1]), array([0, 1, 2, 3]))
    >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)
    (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4]))
    >>> np.histogram(np.array([[1, 2, 1], [1, 0, 1]]), bins=[0, 1, 2, 3])
    (array([1, 4, 1]), array([0, 1, 2, 3]))

    >>> a = np.arange(5)
    >>> hist, bin_edges = np.histogram(a, density=True)
    >>> hist
    array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5])
    >>> hist.sum()
    array(2.5)
    >>> np.sum(hist * np.diff(bin_edges))
    array(1.)

    """

    a, weights, usm_type = _ravel_check_a_and_weights(a, weights)

    bin_edges, uniform_bins = _get_bin_edges(a, bins, range, usm_type)

    # Histogram is an integer or a float array depending on the weights.
    if weights is None:
        ntype = dpnp.dtype(dpnp.intp)
    else:
        ntype = weights.dtype

    # The fast path uses bincount, but that only works for certain types
    # of weight
    # simple_weights = (
    #     weights is None or
    #     dpnp.can_cast(weights.dtype, dpnp.double) or
    #     dpnp.can_cast(weights.dtype, complex)
    # )
    # TODO: implement a fast path
    simple_weights = False

    if uniform_bins is not None and simple_weights:
        # TODO: implement fast algorithm for equal bins
        pass
    else:
        # Compute via cumulative histogram
        if weights is None:
            sa = dpnp.sort(a)
            cum_n = _search_sorted_inclusive(sa, bin_edges)
        else:
            zero = dpnp.zeros(
                1, dtype=ntype, sycl_queue=a.sycl_queue, usm_type=usm_type
            )
            sorting_index = dpnp.argsort(a)
            sa = a[sorting_index]
            sw = weights[sorting_index]
            cw = dpnp.concatenate((zero, sw.cumsum(dtype=ntype)))
            bin_index = _search_sorted_inclusive(sa, bin_edges)
            cum_n = cw[bin_index]

        n = dpnp.diff(cum_n)

    if density:
        # pylint: disable=possibly-used-before-assignment
        db = dpnp.diff(bin_edges).astype(dpnp.default_float_type())
        return n / db / n.sum(), bin_edges

    return n, bin_edges




[docs]
def histogram_bin_edges(a, bins=10, range=None, weights=None):
    """
    Function to calculate only the edges of the bins used by the
    :obj:`dpnp.histogram` function.

    For full documentation refer to :obj:`numpy.histogram_bin_edges`.

    Parameters
    ----------
    a : {dpnp.ndarray, usm_ndarray}
        Input data. The histogram is computed over the flattened array.
    bins : {int, dpnp.ndarray, usm_ndarray, sequence of scalars}, optional
        If `bins` is an int, it defines the number of equal-width bins in the
        given range.
        If `bins` is a sequence, it defines the bin edges, including the
        rightmost edge, allowing for non-uniform bin widths.
        Default: ``10``.
    range : {None, 2-tuple of float}, optional
        The lower and upper range of the bins. If not provided, range is simply
        ``(a.min(), a.max())``. Values outside the range are ignored. The first
        element of the range must be less than or equal to the second. `range`
        affects the automatic bin computation as well. While bin width is
        computed to be optimal based on the actual data within `range`, the bin
        count will fill the entire range including portions containing no data.
        Default: ``None``.
    weights : {None, dpnp.ndarray, usm_ndarray}, optional
        An array of weights, of the same shape as `a`. Each value in `a` only
        contributes its associated weight towards the bin count (instead of 1).
        This is currently not used by any of the bin estimators, but may be in
        the future.
        Default: ``None``.

    Returns
    -------
    bin_edges : {dpnp.ndarray of floating data type}
        The edges to pass into :obj:`dpnp.histogram`.

    See Also
    --------
    :obj:`dpnp.histogram` : Compute the histogram of a data set.

    Examples
    --------
    >>> import dpnp as np
    >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])
    >>> np.histogram_bin_edges(arr, bins=2)
    array([0. , 2.5, 5. ])

    For consistency with histogram, an array of pre-computed bins is
    passed through unmodified:

    >>> np.histogram_bin_edges(arr, [1, 2])
    array([1, 2])

    This function allows one set of bins to be computed, and reused across
    multiple histograms:

    >>> shared_bins = np.histogram_bin_edges(arr, bins=5)
    >>> shared_bins
    array([0., 1., 2., 3., 4., 5.])

    >>> gid = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1])
    >>> hist_0, _ = np.histogram(arr[gid == 0], bins=shared_bins)
    >>> hist_1, _ = np.histogram(arr[gid == 1], bins=shared_bins)

    >>> hist_0, hist_1
    (array([1, 1, 0, 1, 0]), array([2, 0, 1, 1, 2]))

    Which gives more easily comparable results than using separate bins for
    each histogram:

    >>> hist_0, bins_0 = np.histogram(arr[gid == 0], bins=3)
    >>> hist_1, bins_1 = np.histogram(arr[gid == 1], bins=4)
    >>> hist_0, hist_1
    (array([1, 1, 1]), array([2, 1, 1, 2]))
    >>> bins_0, bins_1
    (array([0., 1., 2., 3.]), array([0.  , 1.25, 2.5 , 3.75, 5.  ]))

    """

    a, weights, usm_type = _ravel_check_a_and_weights(a, weights)
    bin_edges, _ = _get_bin_edges(a, bins, range, usm_type)
    return bin_edges