from __future__ import annotations
import boost_histogram as bh
import numpy as np
from scipy import stats
from plothist.histogramming import _check_counting_histogram
def _check_uncertainty_type(uncertainty_type: str) -> None:
"""
Check that the uncertainty type is valid.
Parameters
----------
uncertainty_type : str
The uncertainty type to check.
Raise
-----
ValueError
If the uncertainty type is not valid.
"""
_valid_uncertainty_types = [
"symmetrical",
"asymmetrical",
"asymmetrical_double_sided_zeros",
"asymmetrical_one_sided_zeros",
]
if uncertainty_type not in _valid_uncertainty_types:
raise ValueError(
f"Uncertainty type {uncertainty_type} not valid. Must be in {_valid_uncertainty_types}."
)
def _is_unweighted(hist: bh.Histogram) -> bool:
"""
Check whether a histogram is unweighted.
Parameters
----------
hist : bh.Histogram
The histogram to check.
Returns
-------
bool
True if the histogram is unweighted, False otherwise.
"""
return bool(np.allclose(hist.variances(), hist.values(), equal_nan=True))
[docs]
def get_asymmetrical_uncertainties(
hist: bh.Histogram,
uncertainty_type: str = "asymmetrical",
) -> tuple[np.ndarray, np.ndarray]:
"""
Get Poisson asymmetrical uncertainties for a histogram via a frequentist approach based on a confidence-interval computation.
Asymmetrical uncertainties can only be computed for an unweighted histogram, because the bin contents of a weighted histogram do not follow a Poisson distribution.
More information in :ref:`documentation-statistics-label`.
Parameters
----------
hist : bh.Histogram
The histogram.
uncertainty_type : str, optional
The type of uncertainty to compute for bins with 0 entry. Default is "asymmetrical" (= "asymmetrical_one_sided_zeros"). Use "asymmetrical_double_sided_zeros" to have the double-sided definition. More information in :ref:`documentation-statistics-label`.
Returns
-------
uncertainties_low : np.ndarray
The lower uncertainties.
uncertainties_high : np.ndarray
The upper uncertainties.
Raise
-----
ValueError
If the histogram is weighted.
"""
_check_counting_histogram(hist)
_check_uncertainty_type(uncertainty_type)
if not _is_unweighted(hist):
raise ValueError(
"Asymmetrical uncertainties can only be computed for an unweighted histogram."
)
alpha = 1.0 - 0.682689492
tail_probability = alpha / 2
n = hist.values()
lower_bound = np.zeros_like(n, dtype=float)
upper_bound = np.zeros_like(n, dtype=float)
# Two-sided Garwood intervals for n > 0
lower_bound[n > 0] = stats.gamma.ppf(q=tail_probability, a=n[n > 0], scale=1)
upper_bound[n > 0] = stats.gamma.ppf(
q=1 - tail_probability, a=n[n > 0] + 1, scale=1
)
if uncertainty_type == "asymmetrical_double_sided_zeros":
# Two-sided Garwood intervals for n == 0
upper_bound[n == 0] = stats.gamma.ppf(q=1 - tail_probability, a=1, scale=1)
elif uncertainty_type in ["asymmetrical_one_sided_zeros", "asymmetrical"]:
# One-sided upper limit for n == 0
upper_bound[n == 0] = stats.gamma.ppf(q=1 - 2 * tail_probability, a=1, scale=1)
else:
raise ValueError(
f"Invalid uncertainty type '{uncertainty_type}' for asymmetrical uncertainties."
)
# Compute asymmetric uncertainties
uncertainties_low = n - lower_bound
uncertainties_high = upper_bound - n
uncertainties_low = np.nan_to_num(uncertainties_low, nan=0.0)
uncertainties_high = np.nan_to_num(uncertainties_high, nan=0.0)
return uncertainties_low, uncertainties_high
def _check_binning_consistency(hist_list: list[bh.Histogram]) -> None:
"""
Check that all the histograms in the provided list share the same definition of their bins.
Parameters
----------
hist_list : list[bh.Histogram]
The list of histograms to check.
Raise
-----
ValueError
If the histograms do not share the same dimensionality or if their bins are not equal.
"""
for h in hist_list:
if not len(h.axes) == len(hist_list[0].axes):
raise ValueError("Histograms must have same dimensionality.")
for i in range(len(h.axes)):
if not h.axes[i] == hist_list[0].axes[i]:
raise ValueError("The bins of the histograms must be equal.")
[docs]
def get_ratio_variances(h1: bh.Histogram, h2: bh.Histogram) -> np.ndarray:
"""
Calculate the variances of the ratio of two uncorrelated histograms (h1/h2).
Parameters
----------
h1 : bh.Histogram
The first histogram.
h2 : bh.Histogram
The second histogram.
Returns
-------
ratio_variances : np.ndarray
The variances of the ratio of the two histograms.
Raises
------
ValueError
If the bins of the histograms are not equal.
"""
_check_binning_consistency([h1, h2])
_check_counting_histogram(h1)
_check_counting_histogram(h2)
with np.errstate(divide="ignore", invalid="ignore"):
return np.where(
h2.values() != 0,
h1.variances() / h2.values() ** 2
+ h2.variances() * h1.values() ** 2 / h2.values() ** 4,
np.nan,
)
[docs]
def get_pull(
h1: bh.Histogram,
h2: bh.Histogram,
h1_uncertainty_type: str = "symmetrical",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Compute the pull between two histograms.
Parameters
----------
h1 : bh.Histogram
The first histogram.
h2 : bh.Histogram
The second histogram.
h1_uncertainty_type : str, optional
What kind of bin uncertainty to use for h1: "symmetrical" for the Poisson standard deviation derived from the variance stored in the histogram object, "asymmetrical" for asymmetrical uncertainties based on a Poisson confidence interval. Default is "symmetrical".
Returns
-------
pull_values : np.ndarray
The pull values.
pull_uncertainties_low : np.ndarray
The lower uncertainties on the pull. Always ones.
pull_uncertainties_high : np.ndarray
The upper uncertainties on the pull. Always ones.
"""
_check_uncertainty_type(h1_uncertainty_type)
_check_binning_consistency([h1, h2])
_check_counting_histogram(h1)
_check_counting_histogram(h2)
if "asymmetrical" in h1_uncertainty_type:
uncertainties_low, uncertainties_high = get_asymmetrical_uncertainties(
h1, h1_uncertainty_type
)
h1_variances = np.where(
h1.values() >= h2.values(),
uncertainties_low**2,
uncertainties_high**2,
)
h1 = h1.copy()
h1[:] = np.c_[h1.values(), h1_variances]
pull_values = np.where(
h1.variances() + h2.variances() != 0,
(h1.values() - h2.values()) / np.sqrt(h1.variances() + h2.variances()),
np.nan,
)
pull_uncertainties_low = np.ones_like(pull_values)
pull_uncertainties_high = pull_uncertainties_low
return (
pull_values,
pull_uncertainties_low,
pull_uncertainties_high,
)
[docs]
def get_difference(
h1: bh.Histogram,
h2: bh.Histogram,
h1_uncertainty_type: str = "symmetrical",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Compute the difference between two histograms.
Parameters
----------
h1 : bh.Histogram
The first histogram.
h2 : bh.Histogram
The second histogram.
h1_uncertainty_type : str, optional
What kind of bin uncertainty to use for h1: "symmetrical" for the Poisson standard deviation derived from the variance stored in the histogram object, "asymmetrical" for asymmetrical uncertainties based on a Poisson confidence interval. Default is "symmetrical".
Returns
-------
difference_values : np.ndarray
The difference values.
difference_uncertainties_low : np.ndarray
The lower uncertainties on the difference.
difference_uncertainties_high : np.ndarray
The upper uncertainties on the difference.
"""
_check_uncertainty_type(h1_uncertainty_type)
_check_binning_consistency([h1, h2])
_check_counting_histogram(h1)
_check_counting_histogram(h2)
difference_values = h1.values() - h2.values()
if "asymmetrical" in h1_uncertainty_type:
uncertainties_low, uncertainties_high = get_asymmetrical_uncertainties(
h1, h1_uncertainty_type
)
difference_uncertainties_low = np.sqrt(uncertainties_low**2 + h2.variances())
difference_uncertainties_high = np.sqrt(uncertainties_high**2 + h2.variances())
else:
difference_uncertainties_low = np.sqrt(h1.variances() + h2.variances())
difference_uncertainties_high = difference_uncertainties_low
return (
difference_values,
difference_uncertainties_low,
difference_uncertainties_high,
)
[docs]
def get_efficiency(h1: bh.Histogram, h2: bh.Histogram) -> tuple[np.ndarray, np.ndarray]:
"""
Calculate the ratio of two correlated histograms (h1/h2), in which the entries of h1 are a subsample of the entries of h2.
The variances are calculated according to the formula given in :ref:`documentation-statistics-label`.
The following conditions must be fulfilled for the calculation of the efficiency:
* The bins of the histograms must be equal.
* The histograms must be unweighted.
* The bin contents of both histograms must be positive or zero.
* The bin contents of h1 must be a subsample of the bin contents of h2.
Parameters
----------
h1 : bh.Histogram
The first histogram.
h2 : bh.Histogram
The second histogram.
Returns
-------
efficiency_values : np.ndarray
The efficiency values.
efficiency_uncertainties : np.ndarray
The uncertainties on the efficiency values.
Raises
------
ValueError
If the histograms are weighted.
ValueError
If the bin contents of the histograms are not positive or zero.
ValueError
If the bin contents of h1 are not a subsample of the bin contents of h2.
"""
_check_binning_consistency([h1, h2])
_check_counting_histogram(h1)
_check_counting_histogram(h2)
if not (_is_unweighted(h1) and _is_unweighted(h2)):
raise ValueError(
"The ratio of two correlated histograms (efficiency) can only be computed for unweighted histograms."
)
if not (np.all(h1.values() >= 0) and np.all(h2.values() >= 0)):
raise ValueError(
"The ratio of two correlated histograms (efficiency) can only be computed if the bin contents of both histograms are positive or zero."
)
if not np.all(h1.values() <= h2.values()):
raise ValueError(
"The ratio of two correlated histograms (efficiency) can only be computed if the bin contents of h1 are a subsample of the bin contents of h2."
)
efficiency_values = np.where(h2.values() != 0, h1.values() / h2.values(), np.nan)
k = h1.values()
n = h2.values()
efficiency_variances = (k + 1) * (k + 2) / (n + 2) / (n + 3) - (k + 1) ** 2 / (
n + 2
) ** 2
return efficiency_values, np.sqrt(efficiency_variances)
[docs]
def get_asymmetry(h1: bh.Histogram, h2: bh.Histogram) -> tuple[np.ndarray, np.ndarray]:
"""
Get the asymmetry between two histograms h1 and h2, defined as (h1 - h2) / (h1 + h2).
Only symmetrical uncertainties are supported.
Parameters
----------
h1 : bh.Histogram
The first histogram.
h2 : bh.Histogram
The second histogram.
Returns
-------
asymmetry_values : np.ndarray
The asymmetry values.
asymmetry_uncertainties : np.ndarray
The uncertainties on the asymmetry.
"""
_check_binning_consistency([h1, h2])
_check_counting_histogram(h1)
_check_counting_histogram(h2)
hist_sum = h1 + h2
hist_diff = h1 + (-1 * h2)
asymmetry_values = np.where(
hist_sum.values() != 0, hist_diff.values() / hist_sum.values(), np.nan
)
asymmetry_variances = get_ratio_variances(hist_diff, hist_sum)
return (
asymmetry_values,
np.sqrt(asymmetry_variances),
)
[docs]
def get_ratio(
h1: bh.Histogram,
h2: bh.Histogram,
h1_uncertainty_type: str = "symmetrical",
ratio_uncertainty_type: str = "uncorrelated",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Compute the ratio h1/h2 between two uncorrelated histograms h1 and h2.
Parameters
----------
h1 : bh.Histogram
The numerator histogram.
h2 : bh.Histogram
The denominator histogram.
h1_uncertainty_type : str, optional
What kind of bin uncertainty to use for h1: "symmetrical" for the Poisson standard deviation derived from the variance stored in the histogram object, "asymmetrical" for asymmetrical uncertainties based on a Poisson confidence interval. Default is "symmetrical".
ratio_uncertainty_type : str, optional
How to treat the uncertainties of the histograms:
* "uncorrelated" for the comparison of two uncorrelated histograms,
* "split" for scaling down the uncertainties of h1 by bin contents of h2, i.e. assuming zero uncertainty coming from h2 in the ratio uncertainty.
Default is "uncorrelated".
Returns
-------
ratio_values : np.ndarray
The ratio values.
ratio_uncertainties_low : np.ndarray
The lower uncertainties on the ratio.
ratio_uncertainties_high : np.ndarray
The upper uncertainties on the ratio.
Raises
------
ValueError
If the ratio_uncertainty_type is not valid.
"""
_check_uncertainty_type(h1_uncertainty_type)
_check_binning_consistency([h1, h2])
_check_counting_histogram(h1)
_check_counting_histogram(h2)
ratio_values = np.where(h2.values() != 0, h1.values() / h2.values(), np.nan)
if "asymmetrical" in h1_uncertainty_type:
uncertainties_low, uncertainties_high = get_asymmetrical_uncertainties(
h1, h1_uncertainty_type
)
if ratio_uncertainty_type == "uncorrelated":
if "asymmetrical" in h1_uncertainty_type:
h1_high = h1.copy()
h1_high[:] = np.c_[h1_high.values(), uncertainties_high**2]
h1_low = h1.copy()
h1_low[:] = np.c_[h1_low.values(), uncertainties_low**2]
ratio_uncertainties_low = np.sqrt(get_ratio_variances(h1_low, h2))
ratio_uncertainties_high = np.sqrt(get_ratio_variances(h1_high, h2))
else:
ratio_uncertainties_low = np.sqrt(get_ratio_variances(h1, h2))
ratio_uncertainties_high = ratio_uncertainties_low
elif ratio_uncertainty_type == "split":
if "asymmetrical" in h1_uncertainty_type:
ratio_uncertainties_low = uncertainties_low / h2.values()
ratio_uncertainties_high = uncertainties_high / h2.values()
else:
h1_scaled_uncertainties = np.where(
h2.values() != 0,
np.sqrt(h1.variances()) / h2.values(),
np.nan,
)
ratio_uncertainties_low = h1_scaled_uncertainties
ratio_uncertainties_high = ratio_uncertainties_low
else:
raise ValueError("ratio_uncertainty_type not in ['uncorrelated', 'split'].")
return (
ratio_values,
ratio_uncertainties_low,
ratio_uncertainties_high,
)
[docs]
def get_comparison(
h1: bh.Histogram,
h2: bh.Histogram,
comparison: str,
h1_uncertainty_type: str = "symmetrical",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Compute the comparison between two histograms.
Parameters
----------
h1 : bh.Histogram
The first histogram for comparison.
h2 : bh.Histogram
The second histogram for comparison.
comparison : str
The type of comparison ("ratio", "split_ratio", "pull", "difference", "relative_difference", "efficiency", or "asymmetry").
When the `split_ratio` option is used, the uncertainties of h1 are scaled down by the bin contents of h2, i.e. assuming zero uncertainty coming from h2 in the ratio uncertainty.
h1_uncertainty_type : str, optional
What kind of bin uncertainty to use for h1: "symmetrical" for the Poisson standard deviation derived from the variance stored in the histogram object, "asymmetrical" for asymmetrical uncertainties based on a Poisson confidence interval.
Asymmetrical uncertainties are not supported for the asymmetry and efficiency comparisons.
Default is "symmetrical".
Returns
-------
values : np.ndarray
The comparison values.
lower_uncertainties : np.ndarray
The lower uncertainties on the comparison values.
upper_uncertainties : np.ndarray
The upper uncertainties on the comparison values.
Raises
------
ValueError
If the comparison is not valid.
ValueError
If the h1_uncertainty_type is "asymmetrical" and the comparison is "asymmetry" or "efficiency".
"""
_check_uncertainty_type(h1_uncertainty_type)
_check_binning_consistency([h1, h2])
_check_counting_histogram(h1)
_check_counting_histogram(h2)
with np.errstate(divide="ignore", invalid="ignore"):
if comparison == "ratio":
values, lower_uncertainties, upper_uncertainties = get_ratio(
h1, h2, h1_uncertainty_type, "uncorrelated"
)
elif comparison == "split_ratio":
values, lower_uncertainties, upper_uncertainties = get_ratio(
h1, h2, h1_uncertainty_type, "split"
)
elif comparison == "relative_difference":
values, lower_uncertainties, upper_uncertainties = get_ratio(
h1, h2, h1_uncertainty_type, "uncorrelated"
)
values -= 1 # relative difference is ratio-1
elif comparison == "pull":
values, lower_uncertainties, upper_uncertainties = get_pull(
h1, h2, h1_uncertainty_type
)
elif comparison == "difference":
values, lower_uncertainties, upper_uncertainties = get_difference(
h1, h2, h1_uncertainty_type
)
elif comparison == "asymmetry":
if "asymmetrical" in h1_uncertainty_type:
raise ValueError(
"Asymmetrical uncertainties are not supported for the asymmetry comparison."
)
values, uncertainties = get_asymmetry(h1, h2)
lower_uncertainties = uncertainties
upper_uncertainties = uncertainties
elif comparison == "efficiency":
if "asymmetrical" in h1_uncertainty_type:
raise ValueError(
"Asymmetrical uncertainties are not supported in an efficiency computation."
)
values, uncertainties = get_efficiency(h1, h2)
lower_uncertainties = uncertainties
upper_uncertainties = uncertainties
else:
raise ValueError(
f"{comparison} not available as a comparison ('ratio', 'split_ratio', 'pull', 'difference', 'relative_difference', 'asymmetry' or 'efficiency')."
)
return values, lower_uncertainties, upper_uncertainties