Source code for plothist.comparison

from __future__ import annotations

import boost_histogram as bh
import numpy as np
from scipy import stats

from plothist.histogramming import _check_counting_histogram


def _check_uncertainty_type(uncertainty_type: str) -> None:
    """
    Check that the uncertainty type is valid.

    Parameters
    ----------
    uncertainty_type : str
        The uncertainty type to check.

    Raise
    -----
    ValueError
        If the uncertainty type is not valid.

    """
    _valid_uncertainty_types = [
        "symmetrical",
        "asymmetrical",
        "asymmetrical_double_sided_zeros",
        "asymmetrical_one_sided_zeros",
    ]

    if uncertainty_type not in _valid_uncertainty_types:
        raise ValueError(
            f"Uncertainty type {uncertainty_type} not valid. Must be in {_valid_uncertainty_types}."
        )


def _is_unweighted(hist: bh.Histogram) -> bool:
    """
    Check whether a histogram is unweighted.

    Parameters
    ----------
    hist : bh.Histogram
        The histogram to check.

    Returns
    -------
    bool
        True if the histogram is unweighted, False otherwise.
    """
    return bool(np.allclose(hist.variances(), hist.values(), equal_nan=True))



[docs]
def get_asymmetrical_uncertainties(
    hist: bh.Histogram,
    uncertainty_type: str = "asymmetrical",
) -> tuple[np.ndarray, np.ndarray]:
    """
    Get Poisson asymmetrical uncertainties for a histogram via a frequentist approach based on a confidence-interval computation.
    Asymmetrical uncertainties can only be computed for an unweighted histogram, because the bin contents of a weighted histogram do not follow a Poisson distribution.
    More information in :ref:`documentation-statistics-label`.

    Parameters
    ----------
    hist : bh.Histogram
        The histogram.
    uncertainty_type : str, optional
        The type of uncertainty to compute for bins with 0 entry. Default is "asymmetrical" (= "asymmetrical_one_sided_zeros"). Use "asymmetrical_double_sided_zeros" to have the double-sided definition. More information in :ref:`documentation-statistics-label`.

    Returns
    -------
    uncertainties_low : np.ndarray
        The lower uncertainties.
    uncertainties_high : np.ndarray
        The upper uncertainties.

    Raise
    -----
    ValueError
        If the histogram is weighted.

    """
    _check_counting_histogram(hist)
    _check_uncertainty_type(uncertainty_type)

    if not _is_unweighted(hist):
        raise ValueError(
            "Asymmetrical uncertainties can only be computed for an unweighted histogram."
        )

    alpha = 1.0 - 0.682689492
    tail_probability = alpha / 2

    n = hist.values()

    lower_bound = np.zeros_like(n, dtype=float)
    upper_bound = np.zeros_like(n, dtype=float)

    # Two-sided Garwood intervals for n > 0
    lower_bound[n > 0] = stats.gamma.ppf(q=tail_probability, a=n[n > 0], scale=1)
    upper_bound[n > 0] = stats.gamma.ppf(
        q=1 - tail_probability, a=n[n > 0] + 1, scale=1
    )

    if uncertainty_type == "asymmetrical_double_sided_zeros":
        # Two-sided Garwood intervals for n == 0
        upper_bound[n == 0] = stats.gamma.ppf(q=1 - tail_probability, a=1, scale=1)
    elif uncertainty_type in ["asymmetrical_one_sided_zeros", "asymmetrical"]:
        # One-sided upper limit for n == 0
        upper_bound[n == 0] = stats.gamma.ppf(q=1 - 2 * tail_probability, a=1, scale=1)
    else:
        raise ValueError(
            f"Invalid uncertainty type '{uncertainty_type}' for asymmetrical uncertainties."
        )

    # Compute asymmetric uncertainties
    uncertainties_low = n - lower_bound
    uncertainties_high = upper_bound - n

    uncertainties_low = np.nan_to_num(uncertainties_low, nan=0.0)
    uncertainties_high = np.nan_to_num(uncertainties_high, nan=0.0)

    return uncertainties_low, uncertainties_high



def _check_binning_consistency(hist_list: list[bh.Histogram]) -> None:
    """
    Check that all the histograms in the provided list share the same definition of their bins.

    Parameters
    ----------
    hist_list : list[bh.Histogram]
        The list of histograms to check.

    Raise
    -----
    ValueError
        If the histograms do not share the same dimensionality or if their bins are not equal.

    """
    for h in hist_list:
        if not len(h.axes) == len(hist_list[0].axes):
            raise ValueError("Histograms must have same dimensionality.")
        for i in range(len(h.axes)):
            if not h.axes[i] == hist_list[0].axes[i]:
                raise ValueError("The bins of the histograms must be equal.")



[docs]
def get_ratio_variances(h1: bh.Histogram, h2: bh.Histogram) -> np.ndarray:
    """
    Calculate the variances of the ratio of two uncorrelated histograms (h1/h2).

    Parameters
    ----------
    h1 : bh.Histogram
        The first histogram.
    h2 : bh.Histogram
        The second histogram.

    Returns
    -------
    ratio_variances : np.ndarray
        The variances of the ratio of the two histograms.

    Raises
    ------
    ValueError
        If the bins of the histograms are not equal.
    """
    _check_binning_consistency([h1, h2])
    _check_counting_histogram(h1)
    _check_counting_histogram(h2)

    with np.errstate(divide="ignore", invalid="ignore"):
        return np.where(
            h2.values() != 0,
            h1.variances() / h2.values() ** 2
            + h2.variances() * h1.values() ** 2 / h2.values() ** 4,
            np.nan,
        )




[docs]
def get_pull(
    h1: bh.Histogram,
    h2: bh.Histogram,
    h1_uncertainty_type: str = "symmetrical",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute the pull between two histograms.

    Parameters
    ----------
    h1 : bh.Histogram
        The first histogram.
    h2 : bh.Histogram
        The second histogram.
    h1_uncertainty_type : str, optional
        What kind of bin uncertainty to use for h1: "symmetrical" for the Poisson standard deviation derived from the variance stored in the histogram object, "asymmetrical" for asymmetrical uncertainties based on a Poisson confidence interval. Default is "symmetrical".

    Returns
    -------
    pull_values : np.ndarray
        The pull values.
    pull_uncertainties_low : np.ndarray
        The lower uncertainties on the pull. Always ones.
    pull_uncertainties_high : np.ndarray
        The upper uncertainties on the pull. Always ones.
    """
    _check_uncertainty_type(h1_uncertainty_type)
    _check_binning_consistency([h1, h2])
    _check_counting_histogram(h1)
    _check_counting_histogram(h2)

    if "asymmetrical" in h1_uncertainty_type:
        uncertainties_low, uncertainties_high = get_asymmetrical_uncertainties(
            h1, h1_uncertainty_type
        )
        h1_variances = np.where(
            h1.values() >= h2.values(),
            uncertainties_low**2,
            uncertainties_high**2,
        )
        h1 = h1.copy()
        h1[:] = np.c_[h1.values(), h1_variances]

    pull_values = np.where(
        h1.variances() + h2.variances() != 0,
        (h1.values() - h2.values()) / np.sqrt(h1.variances() + h2.variances()),
        np.nan,
    )
    pull_uncertainties_low = np.ones_like(pull_values)
    pull_uncertainties_high = pull_uncertainties_low

    return (
        pull_values,
        pull_uncertainties_low,
        pull_uncertainties_high,
    )




[docs]
def get_difference(
    h1: bh.Histogram,
    h2: bh.Histogram,
    h1_uncertainty_type: str = "symmetrical",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute the difference between two histograms.

    Parameters
    ----------
    h1 : bh.Histogram
        The first histogram.
    h2 : bh.Histogram
        The second histogram.
    h1_uncertainty_type : str, optional
        What kind of bin uncertainty to use for h1: "symmetrical" for the Poisson standard deviation derived from the variance stored in the histogram object, "asymmetrical" for asymmetrical uncertainties based on a Poisson confidence interval. Default is "symmetrical".

    Returns
    -------
    difference_values : np.ndarray
        The difference values.
    difference_uncertainties_low : np.ndarray
        The lower uncertainties on the difference.
    difference_uncertainties_high : np.ndarray
        The upper uncertainties on the difference.
    """
    _check_uncertainty_type(h1_uncertainty_type)
    _check_binning_consistency([h1, h2])
    _check_counting_histogram(h1)
    _check_counting_histogram(h2)

    difference_values = h1.values() - h2.values()

    if "asymmetrical" in h1_uncertainty_type:
        uncertainties_low, uncertainties_high = get_asymmetrical_uncertainties(
            h1, h1_uncertainty_type
        )

        difference_uncertainties_low = np.sqrt(uncertainties_low**2 + h2.variances())
        difference_uncertainties_high = np.sqrt(uncertainties_high**2 + h2.variances())
    else:
        difference_uncertainties_low = np.sqrt(h1.variances() + h2.variances())
        difference_uncertainties_high = difference_uncertainties_low

    return (
        difference_values,
        difference_uncertainties_low,
        difference_uncertainties_high,
    )




[docs]
def get_efficiency(h1: bh.Histogram, h2: bh.Histogram) -> tuple[np.ndarray, np.ndarray]:
    """
    Calculate the ratio of two correlated histograms (h1/h2), in which the entries of h1 are a subsample of the entries of h2.
    The variances are calculated according to the formula given in :ref:`documentation-statistics-label`.

    The following conditions must be fulfilled for the calculation of the efficiency:
    * The bins of the histograms must be equal.
    * The histograms must be unweighted.
    * The bin contents of both histograms must be positive or zero.
    * The bin contents of h1 must be a subsample of the bin contents of h2.

    Parameters
    ----------
    h1 : bh.Histogram
        The first histogram.
    h2 : bh.Histogram
        The second histogram.

    Returns
    -------
    efficiency_values : np.ndarray
        The efficiency values.
    efficiency_uncertainties : np.ndarray
        The uncertainties on the efficiency values.

    Raises
    ------
    ValueError
        If the histograms are weighted.
    ValueError
        If the bin contents of the histograms are not positive or zero.
    ValueError
        If the bin contents of h1 are not a subsample of the bin contents of h2.
    """
    _check_binning_consistency([h1, h2])
    _check_counting_histogram(h1)
    _check_counting_histogram(h2)

    if not (_is_unweighted(h1) and _is_unweighted(h2)):
        raise ValueError(
            "The ratio of two correlated histograms (efficiency) can only be computed for unweighted histograms."
        )
    if not (np.all(h1.values() >= 0) and np.all(h2.values() >= 0)):
        raise ValueError(
            "The ratio of two correlated histograms (efficiency) can only be computed if the bin contents of both histograms are positive or zero."
        )
    if not np.all(h1.values() <= h2.values()):
        raise ValueError(
            "The ratio of two correlated histograms (efficiency) can only be computed if the bin contents of h1 are a subsample of the bin contents of h2."
        )

    efficiency_values = np.where(h2.values() != 0, h1.values() / h2.values(), np.nan)

    k = h1.values()
    n = h2.values()

    efficiency_variances = (k + 1) * (k + 2) / (n + 2) / (n + 3) - (k + 1) ** 2 / (
        n + 2
    ) ** 2

    return efficiency_values, np.sqrt(efficiency_variances)




[docs]
def get_asymmetry(h1: bh.Histogram, h2: bh.Histogram) -> tuple[np.ndarray, np.ndarray]:
    """
    Get the asymmetry between two histograms h1 and h2, defined as (h1 - h2) / (h1 + h2).
    Only symmetrical uncertainties are supported.

    Parameters
    ----------
    h1 : bh.Histogram
        The first histogram.
    h2 : bh.Histogram
        The second histogram.

    Returns
    -------
    asymmetry_values : np.ndarray
        The asymmetry values.
    asymmetry_uncertainties : np.ndarray
        The uncertainties on the asymmetry.
    """
    _check_binning_consistency([h1, h2])
    _check_counting_histogram(h1)
    _check_counting_histogram(h2)

    hist_sum = h1 + h2
    hist_diff = h1 + (-1 * h2)
    asymmetry_values = np.where(
        hist_sum.values() != 0, hist_diff.values() / hist_sum.values(), np.nan
    )
    asymmetry_variances = get_ratio_variances(hist_diff, hist_sum)
    return (
        asymmetry_values,
        np.sqrt(asymmetry_variances),
    )




[docs]
def get_ratio(
    h1: bh.Histogram,
    h2: bh.Histogram,
    h1_uncertainty_type: str = "symmetrical",
    ratio_uncertainty_type: str = "uncorrelated",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute the ratio h1/h2 between two uncorrelated histograms h1 and h2.

    Parameters
    ----------
    h1 : bh.Histogram
        The numerator histogram.
    h2 : bh.Histogram
        The denominator histogram.
    h1_uncertainty_type : str, optional
        What kind of bin uncertainty to use for h1: "symmetrical" for the Poisson standard deviation derived from the variance stored in the histogram object, "asymmetrical" for asymmetrical uncertainties based on a Poisson confidence interval. Default is "symmetrical".
    ratio_uncertainty_type : str, optional
        How to treat the uncertainties of the histograms:
        * "uncorrelated" for the comparison of two uncorrelated histograms,
        * "split" for scaling down the uncertainties of h1 by bin contents of h2, i.e. assuming zero uncertainty coming from h2 in the ratio uncertainty.
        Default is "uncorrelated".

    Returns
    -------
    ratio_values : np.ndarray
        The ratio values.
    ratio_uncertainties_low : np.ndarray
        The lower uncertainties on the ratio.
    ratio_uncertainties_high : np.ndarray
        The upper uncertainties on the ratio.

    Raises
    ------
    ValueError
        If the ratio_uncertainty_type is not valid.
    """
    _check_uncertainty_type(h1_uncertainty_type)
    _check_binning_consistency([h1, h2])
    _check_counting_histogram(h1)
    _check_counting_histogram(h2)

    ratio_values = np.where(h2.values() != 0, h1.values() / h2.values(), np.nan)

    if "asymmetrical" in h1_uncertainty_type:
        uncertainties_low, uncertainties_high = get_asymmetrical_uncertainties(
            h1, h1_uncertainty_type
        )

    if ratio_uncertainty_type == "uncorrelated":
        if "asymmetrical" in h1_uncertainty_type:
            h1_high = h1.copy()
            h1_high[:] = np.c_[h1_high.values(), uncertainties_high**2]
            h1_low = h1.copy()
            h1_low[:] = np.c_[h1_low.values(), uncertainties_low**2]
            ratio_uncertainties_low = np.sqrt(get_ratio_variances(h1_low, h2))
            ratio_uncertainties_high = np.sqrt(get_ratio_variances(h1_high, h2))
        else:
            ratio_uncertainties_low = np.sqrt(get_ratio_variances(h1, h2))
            ratio_uncertainties_high = ratio_uncertainties_low
    elif ratio_uncertainty_type == "split":
        if "asymmetrical" in h1_uncertainty_type:
            ratio_uncertainties_low = uncertainties_low / h2.values()
            ratio_uncertainties_high = uncertainties_high / h2.values()
        else:
            h1_scaled_uncertainties = np.where(
                h2.values() != 0,
                np.sqrt(h1.variances()) / h2.values(),
                np.nan,
            )
            ratio_uncertainties_low = h1_scaled_uncertainties
            ratio_uncertainties_high = ratio_uncertainties_low
    else:
        raise ValueError("ratio_uncertainty_type not in ['uncorrelated', 'split'].")

    return (
        ratio_values,
        ratio_uncertainties_low,
        ratio_uncertainties_high,
    )




[docs]
def get_comparison(
    h1: bh.Histogram,
    h2: bh.Histogram,
    comparison: str,
    h1_uncertainty_type: str = "symmetrical",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute the comparison between two histograms.

    Parameters
    ----------
    h1 : bh.Histogram
        The first histogram for comparison.
    h2 : bh.Histogram
        The second histogram for comparison.
    comparison : str
        The type of comparison ("ratio", "split_ratio", "pull", "difference", "relative_difference", "efficiency", or "asymmetry").
        When the `split_ratio` option is used, the uncertainties of h1 are scaled down by the bin contents of h2, i.e. assuming zero uncertainty coming from h2 in the ratio uncertainty.
    h1_uncertainty_type : str, optional
        What kind of bin uncertainty to use for h1: "symmetrical" for the Poisson standard deviation derived from the variance stored in the histogram object, "asymmetrical" for asymmetrical uncertainties based on a Poisson confidence interval.
        Asymmetrical uncertainties are not supported for the asymmetry and efficiency comparisons.
        Default is "symmetrical".

    Returns
    -------
    values : np.ndarray
        The comparison values.
    lower_uncertainties : np.ndarray
        The lower uncertainties on the comparison values.
    upper_uncertainties : np.ndarray
        The upper uncertainties on the comparison values.

    Raises
    ------
    ValueError
        If the comparison is not valid.
    ValueError
        If the h1_uncertainty_type is "asymmetrical" and the comparison is "asymmetry" or "efficiency".
    """
    _check_uncertainty_type(h1_uncertainty_type)
    _check_binning_consistency([h1, h2])
    _check_counting_histogram(h1)
    _check_counting_histogram(h2)

    with np.errstate(divide="ignore", invalid="ignore"):
        if comparison == "ratio":
            values, lower_uncertainties, upper_uncertainties = get_ratio(
                h1, h2, h1_uncertainty_type, "uncorrelated"
            )
        elif comparison == "split_ratio":
            values, lower_uncertainties, upper_uncertainties = get_ratio(
                h1, h2, h1_uncertainty_type, "split"
            )
        elif comparison == "relative_difference":
            values, lower_uncertainties, upper_uncertainties = get_ratio(
                h1, h2, h1_uncertainty_type, "uncorrelated"
            )
            values -= 1  # relative difference is ratio-1
        elif comparison == "pull":
            values, lower_uncertainties, upper_uncertainties = get_pull(
                h1, h2, h1_uncertainty_type
            )
        elif comparison == "difference":
            values, lower_uncertainties, upper_uncertainties = get_difference(
                h1, h2, h1_uncertainty_type
            )
        elif comparison == "asymmetry":
            if "asymmetrical" in h1_uncertainty_type:
                raise ValueError(
                    "Asymmetrical uncertainties are not supported for the asymmetry comparison."
                )
            values, uncertainties = get_asymmetry(h1, h2)
            lower_uncertainties = uncertainties
            upper_uncertainties = uncertainties
        elif comparison == "efficiency":
            if "asymmetrical" in h1_uncertainty_type:
                raise ValueError(
                    "Asymmetrical uncertainties are not supported in an efficiency computation."
                )
            values, uncertainties = get_efficiency(h1, h2)
            lower_uncertainties = uncertainties
            upper_uncertainties = uncertainties
        else:
            raise ValueError(
                f"{comparison} not available as a comparison ('ratio', 'split_ratio', 'pull', 'difference', 'relative_difference', 'asymmetry' or 'efficiency')."
            )

    return values, lower_uncertainties, upper_uncertainties