Source code for pyhealth.metrics.calibration

"""Metrics that meature model calibration.

Reference Papers:

    [1] Lin, Zhen, Shubhendu Trivedi, and Jimeng Sun.
    "Taking a Step Back with KCal: Multi-Class Kernel-Based Calibration
    for Deep Neural Networks."
    ICLR 2023.

    [2] Nixon, Jeremy, Michael W. Dusenberry, Linchuan Zhang, Ghassen Jerfel, and Dustin Tran.
    "Measuring Calibration in Deep Learning."
    In CVPR workshops, vol. 2, no. 7. 2019.

    [3] Patel, Kanil, William Beluch, Bin Yang, Michael Pfeiffer, and Dan Zhang.
    "Multi-class uncertainty calibration via mutual information maximization-based binning."
    ICLR 2021.

    [4] Guo, Chuan, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger.
    "On calibration of modern neural networks."
    ICML 2017.

    [5] Kull, Meelis, Miquel Perello Nieto, Markus Kängsepp, Telmo Silva Filho, Hao Song, and Peter Flach.
    "Beyond temperature scaling: Obtaining well-calibrated multi-class probabilities with dirichlet calibration."
    Advances in neural information processing systems 32 (2019).

    [6] Brier, Glenn W.
    "Verification of forecasts expressed in terms of probability."
    Monthly weather review 78, no. 1 (1950): 1-3.

"""
import bisect

import numpy as np
import pandas as pd


def _get_bins(bins):
    if isinstance(bins, int):
        bins = list(np.arange(bins+1) / bins)
    return bins

def assign_bin(sorted_ser: pd.Series, bins:int, adaptive:bool=False):
    ret = pd.DataFrame(sorted_ser)
    if adaptive:
        assert isinstance(bins, int)
        step = len(sorted_ser) // bins
        nvals = [step for _ in range(bins)]
        for _ in range(len(sorted_ser) % bins):
            nvals[-_-1] += 1
        ret['bin'] = [ith for ith, val in enumerate(nvals) for _ in range(val)]
        nvals = list(np.asarray(nvals).cumsum())
        bins = [ret.iloc[0]['conf']]
        for iloc in nvals:
            bins.append(ret.iloc[iloc-1]['conf'])
            if iloc != nvals[-1]:
                bins[-1] = 0.5 * bins[-1] + 0.5 *ret.iloc[iloc]['conf']
    else:
        bins = _get_bins(bins)
        bin_assign = pd.Series(0, index=sorted_ser.index)
        locs = [bisect.bisect(sorted_ser.values, b) for b in bins]
        locs[0], locs[-1] = 0, len(ret)
        for i, loc in enumerate(locs[:-1]):
            bin_assign.iloc[loc:locs[i+1]] = i
        ret['bin'] = bin_assign
    return ret['bin'], bins

def _ECE_loss(summ):
    w = summ['cnt'] / summ['cnt'].sum()
    loss = np.average((summ['conf'] - summ['acc']).abs(), weights=w)
    return loss

def _ECE_confidence(df, bins=20, adaptive=False):
    # df should have columns: conf, acc
    df = df.sort_values(['conf']).reset_index().drop('index', axis=1)
    df['bin'], _ = assign_bin(df['conf'], bins, adaptive=adaptive)
    summ = pd.DataFrame(df.groupby('bin')[['acc', 'conf']].mean())#.fillna(0.)
    summ['cnt'] = df.groupby('bin').size()
    summ = summ.reset_index()
    return summ, _ECE_loss(summ)

def _ECE_classwise(prob:np.ndarray, label_onehot:np.ndarray, bins=20, threshold=0., adaptive=False):
    summs = []
    class_losses = {}
    for k in range(prob.shape[1]):
        msk = prob[:, k] >= threshold
        if msk.sum() == 0:
            continue
        df = pd.DataFrame({"conf": prob[msk, k], 'acc': label_onehot[msk, k]})
        df = df.sort_values(['conf']).reset_index()
        df['bin'], _ = assign_bin(df['conf'], bins, adaptive=adaptive)
        summ = pd.DataFrame(df.groupby('bin')[['acc', 'conf']].mean())
        summ['cnt'] = df.groupby('bin').size()
        summ['k'] = k
        summs.append(summ.reset_index())
        class_losses[k] = _ECE_loss(summs[-1])
    class_losses = pd.Series(class_losses)
    class_losses['avg'], class_losses['sum'] = class_losses.mean(), class_losses.sum()
    summs = pd.concat(summs, ignore_index=True)
    return summs, class_losses

[docs]def ece_confidence_multiclass(prob:np.ndarray, label:np.ndarray, bins=20, adaptive=False):
    """Expected Calibration Error (ECE).

    We group samples into 'bins' basing on the top-class prediction.
    Then, we compute the absolute difference between the average top-class prediction and
    the frequency of top-class being correct (i.e. accuracy) for each bin.
    ECE is the average (weighed by number of points in each bin) of these absolute differences.
    It could be expressed by the following formula, with :math:`B_m` denoting the m-th bin:

    .. math::
        ECE = \\sum_{m=1}^M \\frac{|B_m|}{N} |acc(B_m) - conf(B_m)|

    Example:
        >>> pred = np.asarray([[0.2, 0.2, 0.6], [0.2, 0.31, 0.49], [0.1, 0.1, 0.8]])
        >>> label = np.asarray([2,1,2])
        >>> ECE_confidence_multiclass(pred, label, bins=2)
        0.36333333333333334

    Explanation of the example: The bins are [0, 0.5] and (0.5, 1].
    In the first bin, we have one sample with top-class prediction of 0.49, and its
    accuracy is 0. In the second bin, we have average confidence of 0.7 and average
    accuracy of 1. Thus, the ECE is :math:`\\frac{1}{3} \cdot 0.49 + \\frac{2}{3}\cdot 0.3=0.3633`.

    Args:
        prob (np.ndarray): (N, C)
        label (np.ndarray): (N,)
        bins (int, optional): Number of bins. Defaults to 20.
        adaptive (bool, optional): If False, bins are equal width ([0, 0.05, 0.1, ..., 1])
            If True, bin widths are adaptive such that each bin contains the same number
            of points. Defaults to False.
    """
    df = pd.DataFrame({'acc': label == np.argmax(prob, 1), 'conf': prob.max(1)})
    return _ECE_confidence(df, bins, adaptive)[1]

[docs]def ece_confidence_binary(prob:np.ndarray, label:np.ndarray, bins=20, adaptive=False):
    """Expected Calibration Error (ECE) for binary classification.

    Similar to :func:`ece_confidence_multiclass`, but on class 1 instead of the top-prediction.


    Args:
        prob (np.ndarray): (N, C)
        label (np.ndarray): (N,)
        bins (int, optional): Number of bins. Defaults to 20.
        adaptive (bool, optional): If False, bins are equal width ([0, 0.05, 0.1, ..., 1])
            If True, bin widths are adaptive such that each bin contains the same number
            of points. Defaults to False.
    """

    df = pd.DataFrame({'acc': label[:,0], 'conf': prob[:,0]})
    return _ECE_confidence(df, bins, adaptive)[1]

[docs]def ece_classwise(prob, label, bins=20, threshold=0., adaptive=False):
    """Classwise Expected Calibration Error (ECE).

    This is equivalent to applying :func:`ece_confidence_binary` to each class and take the average.

    Args:
        prob (np.ndarray): (N, C)
        label (np.ndarray): (N,)
        bins (int, optional): Number of bins. Defaults to 20.
        threshold (float): threshold to filter out samples.
            If the number of classes C is very large, many classes receive close to 0
            prediction. Any prediction below threshold is considered noise and ignored.
            In recent papers, this is typically set to a small number (such as 1/C).
        adaptive (bool, optional): If False, bins are equal width ([0, 0.05, 0.1, ..., 1])
            If True, bin widths are adaptive such that each bin contains the same number
            of points. Defaults to False.
    """

    K = prob.shape[1]
    if len(label.shape) == 1:
        # make it one-hot
        label, _ = np.zeros((len(label),K)), label
        label[np.arange(len(label)), _] = 1
    return _ECE_classwise(prob, label, bins, threshold, adaptive)[1]['avg']

[docs]def brier_top1(prob:np.ndarray, label:np.ndarray):
    """Brier score (i.e. mean squared error between prediction and 0-1 label) of the top prediction.
    """
    conf = prob.max(1)
    acc = (label == np.argmax(prob, 1)).astype(int)
    return np.mean(np.square(conf - acc))