Source code for skll.metrics

# License: BSD 3 clause
"""
This module contains a bunch of evaluation metrics that can be used to
evaluate the performance of learners.

:author: Michael Heilman (mheilman@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:author: Dan Blanchard (dblanchard@ets.org)
:organization: ETS
"""

from __future__ import print_function, unicode_literals

import numpy as np
from scipy.stats import kendalltau, spearmanr, pearsonr
from six import string_types
from six.moves import xrange as range
from sklearn.metrics import confusion_matrix, f1_score, SCORERS


# Constants
_CORRELATION_METRICS = frozenset(['kendall_tau', 'spearman', 'pearson'])


def kappa(y_true, y_pred, weights=None, allow_off_by_one=False):
    """
    Calculates the kappa inter-rater agreement between the gold standard and
    the predicted ratings. Potential values range from -1 (representing
    complete disagreement) to 1 (representing complete agreement). A kappa
    value of 0 is expected if all agreement is due to chance.

    In the course of calculating kappa, all items in ``y_true`` and
    ``y_pred`` will first be converted to floats and then rounded to
    integers.

    It is assumed that ``y_true`` and ``y_pred`` contain the complete range
    of possible ratings.

    This function contains a combination of code from yorchopolis's
    kappa-stats and Ben Hamner's Metrics projects on Github.

    Parameters
    ----------
    y_true : array-like of float
        The true/actual/gold labels for the data.
    y_pred : array-like of float
        The predicted/observed labels for the data.
    weights : str or np.array, optional
        Specifies the weight matrix for the calculation.
        Options are ::

            - None = unweighted-kappa
            - 'quadratic' = quadratic-weighted kappa
            - 'linear' = linear-weighted kappa
            - two-dimensional numpy array = a custom matrix of weights. Each
              weight corresponds to the :math:`w_{ij}` values in the
              wikipedia description of how to calculate weighted Cohen's
              kappa.

        Defaults to None.
    allow_off_by_one : bool, optional
        If true, ratings that are off by one are counted as equal, and all
        other differences are reduced by one. For example, 1 and 2 will be
        considered to be equal, whereas 1 and 3 will have a difference of 1
        when building the weights matrix.
        Defaults to False.

    Returns
    -------
    k : float
        The kappa score, or weighted kappa score.

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` do not have the same length.
    ValueError
        If the labels cannot be converted to ints.
    ValueError
        If an invalid weight scheme is specified.
    """
    # Ensure that the lists are both the same length
    assert(len(y_true) == len(y_pred))

    # This rather crazy looking typecast is intended to work as follows:
    # If an input is an int, the operations will have no effect.
    # If it is a float, it will be rounded and then converted to an int
    # because the ml_metrics package requires ints.
    # If it is a str like "1", then it will be converted to a (rounded) int.
    # If it is a str that can't be typecast, then the user is
    # given a hopefully useful error message.
    # Note: numpy and python 3.3 use bankers' rounding.
    try:
        y_true = [int(np.round(float(y))) for y in y_true]
        y_pred = [int(np.round(float(y))) for y in y_pred]
    except ValueError:
        raise ValueError("For kappa, the labels should be integers or strings "
                         "that can be converted to ints (E.g., '4.0' or '3').")

    # Figure out normalized expected values
    min_rating = min(min(y_true), min(y_pred))
    max_rating = max(max(y_true), max(y_pred))

    # shift the values so that the lowest value is 0
    # (to support scales that include negative values)
    y_true = [y - min_rating for y in y_true]
    y_pred = [y - min_rating for y in y_pred]

    # Build the observed/confusion matrix
    num_ratings = max_rating - min_rating + 1
    observed = confusion_matrix(y_true, y_pred,
                                labels=list(range(num_ratings)))
    num_scored_items = float(len(y_true))

    # Build the weight array if we weren't passed one
    if isinstance(weights, string_types):
        wt_scheme = weights
        weights = None
    else:
        wt_scheme = ''
    if weights is None:
        weights = np.empty((num_ratings, num_ratings))
        for i in range(num_ratings):
            for j in range(num_ratings):
                diff = abs(i - j)
                if allow_off_by_one and diff:
                    diff -= 1
                if wt_scheme == 'linear':
                    weights[i, j] = diff
                elif wt_scheme == 'quadratic':
                    weights[i, j] = diff ** 2
                elif not wt_scheme:  # unweighted
                    weights[i, j] = bool(diff)
                else:
                    raise ValueError('Invalid weight scheme specified for '
                                     'kappa: {}'.format(wt_scheme))

    hist_true = np.bincount(y_true, minlength=num_ratings)
    hist_true = hist_true[: num_ratings] / num_scored_items
    hist_pred = np.bincount(y_pred, minlength=num_ratings)
    hist_pred = hist_pred[: num_ratings] / num_scored_items
    expected = np.outer(hist_true, hist_pred)

    # Normalize observed array
    observed = observed / num_scored_items

    # If all weights are zero, that means no disagreements matter.
    k = 1.0
    if np.count_nonzero(weights):
        k -= (sum(sum(weights * observed)) / sum(sum(weights * expected)))

    return k
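# The following is a minimal, hypothetical usage sketch and is not part of
# skll.metrics; the ratings are made up purely to show how the ``weights``
# and ``allow_off_by_one`` options change the kappa score.
def _example_kappa_usage():
    gold = [1, 2, 3, 4, 4]
    pred = [1, 2, 4, 4, 3]
    unweighted = kappa(gold, pred)
    quadratic = kappa(gold, pred, weights='quadratic')
    # Off-by-one disagreements (e.g., 3 vs. 4) get a weight of zero when
    # allow_off_by_one=True, so they are treated as agreements.
    off_by_one = kappa(gold, pred, allow_off_by_one=True)
    return unweighted, quadratic, off_by_one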
def kendall_tau(y_true, y_pred):
    """
    Calculate Kendall's tau between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true : array-like of float
        The true/actual/gold labels for the data.
    y_pred : array-like of float
        The predicted/observed labels for the data.

    Returns
    -------
    ret_score : float
        Kendall's tau if well-defined, else 0.0
    """
    ret_score = kendalltau(y_true, y_pred)[0]
    return ret_score if not np.isnan(ret_score) else 0.0
def spearman(y_true, y_pred):
    """
    Calculate Spearman's rank correlation coefficient between ``y_true`` and
    ``y_pred``.

    Parameters
    ----------
    y_true : array-like of float
        The true/actual/gold labels for the data.
    y_pred : array-like of float
        The predicted/observed labels for the data.

    Returns
    -------
    ret_score : float
        Spearman's rank correlation coefficient if well-defined, else 0.0
    """
    ret_score = spearmanr(y_true, y_pred)[0]
    return ret_score if not np.isnan(ret_score) else 0.0
def pearson(y_true, y_pred):
    """
    Calculate Pearson product-moment correlation coefficient between
    ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true : array-like of float
        The true/actual/gold labels for the data.
    y_pred : array-like of float
        The predicted/observed labels for the data.

    Returns
    -------
    ret_score : float
        Pearson product-moment correlation coefficient if well-defined,
        else 0.0
    """
    ret_score = pearsonr(y_true, y_pred)[0]
    return ret_score if not np.isnan(ret_score) else 0.0
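# A minimal, hypothetical usage sketch, not part of skll.metrics: all three
# correlation metrics share the same interface and fall back to 0.0 when the
# underlying scipy coefficient is undefined (NaN).
def _example_correlation_usage():
    gold = [1.0, 2.0, 3.0, 4.0, 5.0]
    pred = [1.2, 1.9, 3.3, 3.8, 5.1]
    return kendall_tau(gold, pred), spearman(gold, pred), pearson(gold, pred)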
def f1_score_least_frequent(y_true, y_pred):
    """
    Calculate the F1 score of the least frequent label/class in ``y_true``
    for ``y_pred``.

    Parameters
    ----------
    y_true : array-like of float
        The true/actual/gold labels for the data.
    y_pred : array-like of float
        The predicted/observed labels for the data.

    Returns
    -------
    ret_score : float
        F1 score of the least frequent label.
    """
    least_frequent = np.bincount(y_true).argmin()
    return f1_score(y_true, y_pred, average=None)[least_frequent]
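# A minimal, hypothetical usage sketch, not part of skll.metrics: with the
# made-up labels below, class 2 occurs least often in ``y_true``, so the
# return value is the per-class F1 score for class 2 only.
def _example_f1_least_frequent_usage():
    gold = [0, 0, 1, 1, 1, 2]
    pred = [0, 1, 1, 1, 2, 2]
    return f1_score_least_frequent(gold, pred)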
def use_score_func(func_name, y_true, y_pred):
    """
    Call the scoring function in ``sklearn.metrics.SCORERS`` with the given
    name. This takes care of handling keyword arguments that were
    pre-specified when the scorer was created, and applies any sign-flipping
    that was specified via ``make_scorer()``.

    Parameters
    ----------
    func_name : str
        The name of the objective function to use from SCORERS.
    y_true : array-like of float
        The true/actual/gold labels for the data.
    y_pred : array-like of float
        The predicted/observed labels for the data.

    Returns
    -------
    ret_score : float
        The scored result from the given scorer.
    """
    scorer = SCORERS[func_name]
    return scorer._sign * scorer._score_func(y_true, y_pred, **scorer._kwargs)
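# A minimal, hypothetical usage sketch, not part of skll.metrics: the scorer
# name must be a key in ``sklearn.metrics.SCORERS`` (e.g., 'accuracy');
# ``_sign``, ``_score_func``, and ``_kwargs`` above are private attributes of
# scikit-learn's scorer objects, so this helper depends on scikit-learn
# internals.
def _example_use_score_func_usage():
    gold = [0, 1, 1, 0]
    pred = [0, 1, 0, 0]
    return use_score_func('accuracy', gold, pred)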