Source code for skll.metrics

# License: BSD 3 clause
"""
Metrics that can be used to evaluate the performance of learners.

:author: Nitin Madnani (nmadnani@ets.org)
:author: Michael Heilman (mheilman@ets.org)
:author: Dan Blanchard (dblanchard@ets.org)
:organization: ETS
"""

import copy
import sys
from importlib import import_module
from inspect import signature
from pathlib import Path
from typing import Optional, Union

import numpy as np
from scipy.stats import kendalltau, pearsonr, spearmanr
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    fbeta_score,
    get_scorer,
    get_scorer_names,
    make_scorer,
)

from skll.types import PathOrStr



[docs]
def kappa(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    weights: Optional[Union[str, np.ndarray]] = None,
    allow_off_by_one: bool = False,
) -> float:
    """
    Calculate the kappa inter-rater agreement.

    The agreement is calculated between the gold standard and the predicted
    ratings. Potential values range from -1 (representing complete disagreement)
    to 1 (representing complete agreement).  A kappa value of 0 is expected if
    all agreement is due to chance.

    In the course of calculating kappa, all items in ``y_true`` and ``y_pred`` will
    first be converted to floats and then rounded to integers.

    It is assumed that y_true and y_pred contain the complete range of possible
    ratings.

    This function contains a combination of code from yorchopolis's kappa-stats
    and Ben Hamner's Metrics projects on Github.

    Parameters
    ----------
    y_true : numpy.ndarray
        The true/actual/gold labels for the data.
    y_pred : numpy.ndarray
        The predicted/observed labels for the data.
    weights : Optional[Union[str, numpy.ndarray]], default=None
        Specifies the weight matrix for the calculation.
        Possible values are: ``None`` (unweighted-kappa), ``"quadratic"``
        (quadratically weighted kappa), ``"linear"`` (linearly weighted kappa),
        and a two-dimensional numpy array (a custom matrix of weights). Each
        weight in this array corresponds to the :math:`w_{ij}` values in the
        Wikipedia description of how to calculate weighted Cohen's kappa.
    allow_off_by_one : bool, default=False
        If true, ratings that are off by one are counted as
        equal, and all other differences are reduced by
        one. For example, 1 and 2 will be considered to be
        equal, whereas 1 and 3 will have a difference of 1
        for when building the weights matrix.

    Returns
    -------
    float
        The weighted or unweighted kappa score.

    Raises
    ------
    AssertionError
        If ``y_true`` != ``y_pred``.
    ValueError
        If labels cannot be converted to int.
    ValueError
        If invalid weight scheme.

    """
    # Ensure that the lists are both the same length
    assert len(y_true) == len(y_pred)

    # This rather crazy looking typecast is intended to work as follows:
    # If an input is an int, the operations will have no effect.
    # If it is a float, it will be rounded and then converted to an int
    # because the ml_metrics package requires ints.
    # If it is a str like "1", then it will be converted to a (rounded) int.
    # If it is a str that can't be typecast, then the user is
    # given a hopefully useful error message.
    try:
        y_true = np.array([int(np.round(float(y))) for y in y_true])
        y_pred = np.array([int(np.round(float(y))) for y in y_pred])
    except ValueError:
        raise ValueError(
            "For kappa, the labels should be integers or strings"
            " that can be converted to ints (E.g., '4.0' or "
            "'3')."
        )

    # Figure out normalized expected values
    min_rating = min(min(y_true), min(y_pred))
    max_rating = max(max(y_true), max(y_pred))

    # shift the values so that the lowest value is 0
    # (to support scales that include negative values)
    y_true = y_true - min_rating
    y_pred = y_pred - min_rating

    # Build the observed/confusion matrix
    num_ratings = max_rating - min_rating + 1
    observed = confusion_matrix(y_true, y_pred, labels=list(range(num_ratings)))
    num_scored_items = float(len(y_true))

    # Build weight array if weren't passed one
    if isinstance(weights, str):
        wt_scheme = weights
        weights = None
    else:
        wt_scheme = ""

    if weights is None:
        kappa_weights = np.empty((num_ratings, num_ratings))
        for i in range(num_ratings):
            for j in range(num_ratings):
                diff = abs(i - j)
                if allow_off_by_one and diff:
                    diff -= 1
                if wt_scheme == "linear":
                    kappa_weights[i, j] = diff
                elif wt_scheme == "quadratic":
                    kappa_weights[i, j] = diff**2
                elif not wt_scheme:  # unweighted
                    kappa_weights[i, j] = bool(diff)
                else:
                    raise ValueError("Invalid weight scheme specified for " f"kappa: {wt_scheme}")
    else:
        kappa_weights = weights

    hist_true: np.ndarray = np.bincount(y_true, minlength=num_ratings)
    hist_true = hist_true[:num_ratings] / num_scored_items
    hist_pred: np.ndarray = np.bincount(y_pred, minlength=num_ratings)
    hist_pred = hist_pred[:num_ratings] / num_scored_items
    expected = np.outer(hist_true, hist_pred)

    # Normalize observed array
    observed = observed / num_scored_items

    # If all weights are zero, that means no disagreements matter.
    k = 1.0
    if np.count_nonzero(kappa_weights):
        observed_sum = np.sum(kappa_weights * observed)
        expected_sum = np.sum(kappa_weights * expected)
        k -= np.sum(observed_sum) / np.sum(expected_sum)

    return k




[docs]
def correlation(y_true: np.ndarray, y_pred: np.ndarray, corr_type: str = "pearson") -> float:
    """
    Calculate given correlation type between ``y_true`` and ``y_pred``.

    ``y_pred`` can be multi-dimensional. If ``y_pred`` is 1-dimensional, it
    may either contain probabilities, most-likely classification labels, or
    regressor predictions. In that case, we simply return the correlation
    between ``y_true`` and ``y_pred``. If ``y_pred`` is multi-dimensional,
    it contains probabilties for multiple classes in which case, we infer the
    most likely labels and then compute the correlation between those and
    ``y_true``.

    Parameters
    ----------
    y_true : numpy.ndarray
        The true/actual/gold labels for the data.
    y_pred : numpy.ndarray
        The predicted/observed labels for the data.
    corr_type : str, default="pearson"
        Which type of correlation to compute. Possible
        choices are "pearson", "spearman", and "kendall_tau".

    Returns
    -------
    float
        correlation value if well-defined, else 0.0

    """
    # get the correlation function to use based on the given type
    corr_func = pearsonr
    if corr_type == "spearman":
        corr_func = spearmanr
    elif corr_type == "kendall_tau":
        corr_func = kendalltau

    # convert to numpy array in case we are passed a list
    y_pred = np.array(y_pred)

    # multi-dimensional -> probability array -> get label
    if y_pred.ndim > 1:
        labels = np.argmax(y_pred, axis=1)
        ret_score = corr_func(y_true, labels)[0]
    # 1-dimensional -> probabilities/labels -> use as is
    else:
        ret_score = corr_func(y_true, y_pred)[0]
    return ret_score




[docs]
def f1_score_least_frequent(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Calculate F1 score of the least frequent label/class.

    Parameters
    ----------
    y_true : numpy.ndarray
        The true/actual/gold labels for the data.
    y_pred : numpy.ndarray
        The predicted/observed labels for the data.

    Returns
    -------
    float
        F1 score of the least frequent label.

    """
    least_frequent = np.bincount(y_true).argmin()
    return f1_score(y_true, y_pred, average=None)[least_frequent]




[docs]
def register_custom_metric(custom_metric_path: PathOrStr, custom_metric_name: str):
    """
    Import, load, and register the custom metric function from the given path.

    Parameters
    ----------
    custom_metric_path : :class:`skll.types.PathOrStr`
        The path to a custom metric.
    custom_metric_name : str
        The name of the custom metric function to load. This function must take
        only two array-like arguments: the true labels and the predictions,
        in that order.

    Raises
    ------
    ValueError
        If the custom metric path does not end in '.py'.
    NameError
        If the name of the custom metric file conflicts
        with an already existing attribute in ``skll.metrics``
        or if the custom metric name conflicts with a scikit-learn
        or SKLL metric.

    """
    if not custom_metric_path:
        raise ValueError(
            f"custom metric path was not set and " f"metric {custom_metric_name} was not found."
        )

    custom_metric_path = Path(custom_metric_path)
    if not custom_metric_path.exists():
        raise ValueError(f"custom metric path '{custom_metric_path}' " f"does not exist.")

    if custom_metric_path.suffix != ".py":
        raise ValueError(
            f"custom metric path must end in .py, you specified " f"{custom_metric_path}"
        )

    # get the name of the module containing the custom metric
    custom_metric_module_name = custom_metric_path.stem

    # once we know that the module name is okay, we need to make sure
    # that the metric function name is also okay
    if custom_metric_name in get_scorer_names() or custom_metric_name in _CUSTOM_METRICS:
        raise NameError(
            f"a metric called '{custom_metric_name}' already "
            f"exists; rename the metric function in "
            f"{custom_metric_module_name}.py and try again."
        )

    # dynamically import the module unless we have already done it
    if custom_metric_module_name not in sys.modules:
        sys.path.append(str(custom_metric_path.resolve().parent))
        metric_module = import_module(custom_metric_module_name)

        # this statement is only necessary so that if we end
        # up using multiprocessing parallelization backend,
        # things are serialized properly
        globals()[custom_metric_module_name] = metric_module

    # get the metric function from this imported module
    metric_func = getattr(sys.modules[custom_metric_module_name], custom_metric_name)
    # again, we need this for multiprocessing serialization
    metric_func.__module__ = f"skll.metrics.{custom_metric_module_name}"

    # extract any "special" keyword arguments from the metric function
    metric_func_parameters = signature(metric_func).parameters
    make_scorer_kwargs = {}
    for make_scorer_kwarg in ["greater_is_better", "response_method"]:
        if make_scorer_kwarg in metric_func_parameters:
            parameter = metric_func_parameters.get(make_scorer_kwarg)
            if parameter is not None:
                parameter_value = parameter.default
                make_scorer_kwargs.update({make_scorer_kwarg: parameter_value})

    # make the scorer function with the extracted keyword arguments
    _CUSTOM_METRICS[f"{custom_metric_name}"] = make_scorer(metric_func, **make_scorer_kwargs)

    return metric_func




[docs]
def use_score_func(func_name: str, y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Call the given scoring function.

    This takes care of handling keyword arguments that were pre-specified
    when creating the scorer. This applies any sign-flipping that was
    specified by ``make_scorer()`` when the scorer was created.

    Parameters
    ----------
    func_name : str
        The name of the objective function to use.
    y_true : numpy.ndarray
        The true/actual/gold labels for the data.
    y_pred : numpy.ndarray
        The predicted/observed labels for the data.

    Returns
    -------
    float
        The scored result from the given scorer.

    """
    try:
        scorer = get_scorer(func_name)
    except ValueError:
        scorer = _CUSTOM_METRICS[func_name]

    return scorer._sign * scorer._score_func(y_true, y_pred, **scorer._kwargs)



# a dictionary that maps pre-defined custom metric names to their scorer functions
# this is a private variable only meant for internal use
_PREDEFINED_CUSTOM_METRICS = {
    "f1_score_micro": make_scorer(f1_score, average="micro"),
    "f1_score_macro": make_scorer(f1_score, average="macro"),
    "f1_score_weighted": make_scorer(f1_score, average="weighted"),
    "f1_score_least_frequent": make_scorer(f1_score_least_frequent),
    "f05": make_scorer(fbeta_score, beta=0.5, average="binary"),
    "f05_score_micro": make_scorer(fbeta_score, beta=0.5, average="micro"),
    "f05_score_macro": make_scorer(fbeta_score, beta=0.5, average="macro"),
    "f05_score_weighted": make_scorer(fbeta_score, beta=0.5, average="weighted"),
    "pearson": make_scorer(correlation, corr_type="pearson"),
    "spearman": make_scorer(correlation, corr_type="spearman"),
    "kendall_tau": make_scorer(correlation, corr_type="kendall_tau"),
    "unweighted_kappa": make_scorer(kappa),
    "quadratic_weighted_kappa": make_scorer(kappa, weights="quadratic"),
    "linear_weighted_kappa": make_scorer(kappa, weights="linear"),
    "qwk_off_by_one": make_scorer(kappa, weights="quadratic", allow_off_by_one=True),
    "lwk_off_by_one": make_scorer(kappa, weights="linear", allow_off_by_one=True),
    "uwk_off_by_one": make_scorer(kappa, allow_off_by_one=True),
}

# now create a new dictionary that contains all of the above metrics but
# will also contain any user-defined custom metrics
_CUSTOM_METRICS = copy.deepcopy(_PREDEFINED_CUSTOM_METRICS)