Source code for skll.learner.voting

# License: BSD 3 clause
"""
A meta-learner class that wraps scikit-learn's `VotingClassifier` and `VotingRegressor`.

:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""
import copy
import logging
from importlib import import_module
from itertools import zip_longest
from multiprocessing import cpu_count
from typing import Any, Dict, List, Optional, Tuple, Union

import joblib
import numpy as np
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.utils import shuffle as sk_shuffle
from sklearn.utils.multiclass import type_of_target

from skll.data import FeatureSet
from skll.data.dict_vectorizer import DictVectorizer
from skll.learner import Learner
from skll.types import (
    EvaluateTaskResults,
    FoldMapping,
    LabelType,
    LearningCurveSizes,
    PathOrStr,
    VotingCrossValidateTaskResults,
)
from skll.utils.constants import MAX_CONCURRENT_PROCESSES

from .utils import (
    _load_learner_from_disk,
    _save_learner_to_disk,
    add_unseen_labels,
    compute_evaluation_metrics,
    get_acceptable_classification_metrics,
    get_acceptable_regression_metrics,
    get_predictions,
    setup_cv_fold_iterator,
    setup_cv_split_iterator,
    train_and_score,
    write_predictions,
)


class VotingLearner(object):
    """
    Wrap ``VotingClassifier`` and ``VotingRegressor`` from scikit-learn.

    Note that this class does not inherit from the ``Learner`` class but
    rather uses different ``Learner`` instances under the hood.

    Parameters
    ----------
    learner_names : List[str]
        List of the learner names that will participate in the voting process.
    voting : Optional[str], default="hard"
        One of "hard" or "soft". If "hard", the predicted class labels
        are used for majority rule voting. If "soft", the predicted class
        label is based on the argmax of the sums of the predicted
        probabilities from each of the underlying learners. This parameter
        is only relevant for classification.
    custom_learner_path : Optional[:class:`skll.types.PathOrStr`], default=None
        Path to a Python file containing the definitions of any custom
        learners. Any and all custom learners in ``estimator_names`` must
        be defined in this file. If the custom learner does not inherit
        from an already existing scikit-learn estimator, it must explicitly
        define an `_estimator_type` attribute indicating whether it's a
        "classifier" or a "regressor".
    feature_scaling : str, default="none"
        How to scale the features, if at all, for each estimator. Options are
        -  "with_std": scale features using the standard deviation
        -  "with_mean": center features using the mean
        -  "both": do both scaling as well as centering
        -  "none": do neither scaling nor centering
    pos_label : Optional[:class:`skll.types.LabelType`], default=None
        A string denoting the label of the class to be treated as the
        positive class in a binary classification setting, for each
        estimator. If ``None``, the class represented by the label that
        appears second when sorted is chosen as the positive class. For
        example, if the two labels in data are "A" and "B" and ``pos_label``
        is not specified, "B" will be chosen as the positive class.
    min_feature_count : int, default=1
        The minimum number of examples a feature must have a nonzero value
        in to be included, for each estimator.
    model_kwargs_list : Optional[List[Dict[str, Any]]], default=None
        A list of dictionaries of keyword arguments to pass to the
        initializer for each of the estimators. There's a one-to-one
        correspondence between the order of this list and the order of the
        ``learner_names`` list.
    sampler_list : Optional[List[str]], default=None
        The samplers to use for kernel approximation, if desired, for each
        estimator. Valid values are:
        -  "AdditiveChi2Sampler"
        -  "Nystroem"
        -  "RBFSampler"
        -  "SkewedChi2Sampler"
        There's a one-to-one correspondence between the order of this list
        and the order of the ``learner_names`` list.
    sampler_kwargs_list : Optional[List[Dict[str, Any]]], default=None
        A list of dictionaries of keyword arguments to pass to the
        initializer for the specified sampler, one per estimator. There's
        a one-to-one correspondence between the order of this list and the
        order of the ``learner_names`` list.
    logger : Optional[logging.Logger], default=None
        A logging object. If ``None`` is passed, get logger from ``__name__``.
    """

    def __init__(
        self,
        learner_names: List[str],
        voting: Optional[str] = "hard",
        custom_learner_path: Optional[PathOrStr] = None,
        feature_scaling: str = "none",
        pos_label: Optional[LabelType] = None,
        min_feature_count: int = 1,
        model_kwargs_list: Optional[List[Dict[str, Any]]] = None,
        sampler_list: Optional[List[str]] = None,
        sampler_kwargs_list: Optional[List[Dict[str, Any]]] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Initialize a ``VotingLearner`` object with the specified settings."""
        # initialize various attributes
        self._model = None
        self.voting = voting
        self.label_dict: Dict[LabelType, int] = {}
        self.logger = logger if logger else logging.getLogger(__name__)
        self.model_kwargs_list = [] if model_kwargs_list is None else model_kwargs_list
        self.sampler_list = [] if sampler_list is None else sampler_list
        self.sampler_kwargs_list = [] if sampler_kwargs_list is None else sampler_kwargs_list

        # check that the arguments that are supposed to be lists are lists;
        # if they are `None`, set them to be empty lists
        for argument_name in ["model_kwargs_list", "sampler_list", "sampler_kwargs_list"]:
            argument_value = locals()[argument_name]
            if argument_value is None:
                setattr(self, argument_name, [])
            else:
                if not isinstance(argument_value, list):
                    raise ValueError(
                        f"{argument_name} should be a list, you "
                        f"specified {argument_value}"
                    )
                else:
                    setattr(self, argument_name, argument_value)

        # check that the list arguments, if not empty, have the right length
        for attribute_name in ["model_kwargs_list", "sampler_list", "sampler_kwargs_list"]:
            attribute_value = getattr(self, attribute_name)
            try:
                assert len(attribute_value) == 0 or len(attribute_value) == len(learner_names)
            except AssertionError:
                raise ValueError(
                    f"'{attribute_name}' must have {len(learner_names)} "
                    "entries, same as the number of learners"
                ) from None

        # instantiate each of the given estimators
        self._learners = []
        learner_types = set()
        self._learner_names = learner_names
        for learner_name, model_kwargs, sampler, sampler_kwargs in zip_longest(
            self._learner_names, self.model_kwargs_list, self.sampler_list, self.sampler_kwargs_list
        ):
            learner = Learner(
                learner_name,
                custom_learner_path=custom_learner_path,
                feature_scaling=feature_scaling,
                min_feature_count=min_feature_count,
                model_kwargs=model_kwargs,
                pipeline=True,
                pos_label=pos_label,
                probability=self.voting == "soft",
                sampler=sampler,
                sampler_kwargs=sampler_kwargs,
                logger=logger,
            )
            learner_types.add(learner.model_type._estimator_type)
            self._learners.append(learner)

        # infer what type of meta-learner we have - a classifier or
        # a regressor; it can only be one or the other
        try:
            assert len(learner_types) == 1 and (
                learner_types == {"classifier"} or learner_types == {"regressor"}
            )
        except AssertionError:
            raise ValueError("cannot mix classifiers and regressors for voting")
        else:
            self.learner_type = list(learner_types)[0]

        # unset the voting attribute for regressors for downstream simplicity
        if self.learner_type == "regressor":
            self.voting = None

    @property
    def learners(self) -> List[Learner]:
        """Return the underlying list of learners."""
        return self._learners

    @property
    def model(self):
        """Return the underlying scikit-learn meta-estimator model."""
        return self._model

    @property
    def model_type(self):
        """Return the meta-estimator model type (i.e., the class)."""
        return self._model_type

    def _setup_underlying_learners(self, examples: FeatureSet) -> None:
        """Complete pre-training setup for the underlying learners."""
        for learner in self.learners:
            learner._create_label_dict(examples)
            learner._train_setup(examples)

    def __getstate__(self) -> Dict[str, Any]:
        """
        Return attributes that should be pickled.

        We need this because we do not want to dump loggers.
        """
        attribute_dict = dict(self.__dict__)
        if "logger" in attribute_dict:
            del attribute_dict["logger"]
        return attribute_dict
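
    # A hedged usage sketch for instantiating this class. The learner names,
    # keyword arguments, and the "soft" voting choice below are illustrative
    # assumptions, not requirements:
    #
    #     from skll.learner.voting import VotingLearner
    #
    #     vl = VotingLearner(
    #         ["LogisticRegression", "SVC"],
    #         voting="soft",
    #         model_kwargs_list=[{"C": 1.0}, {"C": 0.1}],
    #     )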

    def save(self, learner_path: PathOrStr) -> None:
        """
        Save the ``VotingLearner`` instance to a file.

        Parameters
        ----------
        learner_path : :class:`skll.types.PathOrStr`
            The path to save the ``VotingLearner`` instance to.
        """
        _save_learner_to_disk(self, learner_path)

    @classmethod
    def from_file(
        cls, learner_path: PathOrStr, logger: Optional[logging.Logger] = None
    ) -> "VotingLearner":
        """
        Load a saved ``VotingLearner`` instance from a file.

        Parameters
        ----------
        learner_path : :class:`skll.types.PathOrStr`
            The path to a saved ``VotingLearner`` instance file.
        logger : Optional[logging.Logger], default=None
            A logging object. If ``None`` is passed, get logger from ``__name__``.

        Returns
        -------
        learner : skll.learner.voting.VotingLearner
            The ``VotingLearner`` instance loaded from the file.
        """
        # use the logger that's passed in or if nothing was passed in,
        # then create a new logger
        logger = logger if logger else logging.getLogger(__name__)

        # call the learner loading utility function
        obj = _load_learner_from_disk(cls, learner_path, logger)
        assert isinstance(obj, cls)
        return obj
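
    # A hedged sketch of the save/load round trip, assuming the ``vl``
    # instance from the sketch above; the "voting.model" file name is an
    # illustrative assumption:
    #
    #     vl.save("voting.model")
    #     vl_restored = VotingLearner.from_file("voting.model")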

    def train(
        self,
        examples: FeatureSet,
        param_grid_list: Optional[List[Dict[str, Any]]] = None,
        grid_search_folds: Union[int, FoldMapping] = 5,
        grid_search: bool = True,
        grid_objective: Optional[str] = None,
        grid_jobs: Optional[int] = None,
        shuffle: bool = False,
    ) -> None:
        """
        Train the voting meta-estimator.

        First, we train each of the underlying estimators (represented by
        a skll ``Learner``), possibly with grid search. Then, we instantiate
        a ``VotingClassifier`` or ``VotingRegressor`` as appropriate with the
        scikit-learn ``Pipeline`` stored in the ``pipeline`` attribute of each
        trained ``Learner`` instance as the estimator. Finally, we call
        ``fit()`` on the ``VotingClassifier`` or ``VotingRegressor`` instance.

        We follow this process because it allows us to use grid search to
        find good hyperparameter values for our underlying learners before
        passing them to the meta-estimator AND because it allows us to use
        SKLL featuresets and do all of the same pre-processing when doing
        inference.

        The trained meta-estimator is saved in the ``_model`` attribute.
        Nothing is returned.

        Parameters
        ----------
        examples : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance to use for training.
        param_grid_list : Optional[List[Dict[str, Any]]], default=None
            The list of parameter grids to search through for grid search,
            one for each underlying learner. The order of the dictionaries
            should correspond to the order in which the underlying estimators
            were specified when the ``VotingLearner`` was instantiated. If
            ``None``, the default parameter grids will be used for the
            underlying estimators.
        grid_search_folds : Union[int, :class:`skll.types.FoldMapping`], default=5
            The number of folds to use when doing the grid search for each of
            the underlying learners, or a mapping from example IDs to folds.
        grid_search : bool, default=True
            Should we use grid search when training each underlying learner?
        grid_objective : Optional[str], default=None
            The name of the objective function to use when doing the grid
            search for each underlying learner. Must be specified if
            ``grid_search`` is ``True``.
        grid_jobs : Optional[int], default=None
            The number of jobs to run in parallel when doing the grid search
            for each underlying learner. If ``None`` or 0, the number of grid
            search folds will be used.
        shuffle : bool, default=False
            Shuffle examples (e.g., for grid search CV).
        """
        if param_grid_list is None:
            self._param_grids = []
        else:
            if not isinstance(param_grid_list, list):
                raise ValueError(
                    f"`param_grid_list` should be a list of dictionaries, "
                    f"you specified: {param_grid_list}"
                )
            else:
                self._param_grids = param_grid_list

        # train each of the underlying estimators with grid search, if required;
        # basically, we are just running grid search to find good hyperparameter
        # values that we can then pass to scikit-learn below
        for learner, param_grid in zip_longest(self.learners, self._param_grids):
            _ = learner.train(
                examples,
                grid_search=grid_search,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_search_folds=grid_search_folds,
                grid_jobs=grid_jobs,
                shuffle=shuffle,
            )

        # once we have our instantiated learners, we use their `pipeline`
        # attribute as the input estimators to the specific voting learner type
        estimators = list(zip(self._learner_names, [learner.pipeline for learner in self.learners]))
        if self.learner_type == "classifier":
            self._model_type = VotingClassifier
            model_kwargs = {"voting": self.voting}
        else:
            self._model_type = VotingRegressor
            model_kwargs = {}
        meta_learner = self.model_type(estimators, **model_kwargs)

        # get the training features in the right dictionary format
        if isinstance(examples.vectorizer, DictVectorizer):
            X_train = examples.vectorizer.inverse_transform(examples.features)

        # since label dictionaries are identical for all underlying
        # learners, save it into an easier-to-access attribute
        self.label_dict = self.learners[0].label_dict

        # get the training labels in the right format too
        # NOTE: technically, we could also use a `LabelEncoder` here but
        # that may not account for passing `pos_label` above when
        # instantiating the learners so we stick with the label dict
        y_train: np.ndarray
        if examples.labels is not None:
            if self.learner_type == "classifier":
                y_train = np.array([self.label_dict[label] for label in examples.labels])
            else:
                # for regressors, the labels are just the labels
                y_train = examples.labels

        # now we need to fit the actual meta-learner which will also fit
        # clones of the underlying pipelines;
        # NOTE: this will *not* yield the same results as the case where we take
        # the predictions from the trained SKLL learners above and do the
        # voting ourselves. This is because SKLL learners do a lot of things
        # differently (e.g., shuffling before grid search) that can cause
        # differences in results and/or floating point precision.
        self._model = meta_learner.fit(X_train, y_train)

    def predict(
        self,
        examples: FeatureSet,
        prediction_prefix: Optional[str] = None,
        append: bool = False,
        class_labels: bool = True,
        individual_predictions: bool = False,
    ) -> Tuple[np.ndarray, Optional[Dict[str, np.ndarray]]]:
        """
        Generate predictions with the meta-estimator.

        Compute the predictions from the meta-estimator and, optionally, the
        underlying estimators on the given ``FeatureSet``. The predictions are
        also written to disk if ``prediction_prefix`` is not ``None``.

        For regressors, the returned and written-out predictions are identical.
        However, for classifiers:

        - if ``class_labels`` is ``True``, class labels are returned as well
          as written out.

        - if ``class_labels`` is ``False`` and the classifier is probabilistic
          (i.e., ``self.voting`` is ``"soft"``), class probabilities are
          returned as well as written out.

        - if ``class_labels`` is ``False`` and the classifier is
          non-probabilistic (i.e., ``self.voting`` is ``"hard"``), class
          indices are returned and class labels are written out. This option
          is generally only meant for SKLL-internal use.

        Parameters
        ----------
        examples : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance to predict labels for.
        prediction_prefix : Optional[str], default=None
            If saving the predictions, this is the prefix that will be used
            for the filename. It will be followed by ``"_predictions.tsv"``.
        append : bool, default=False
            Should we append the current predictions to the file if it exists?
        class_labels : bool, default=True
            For classifiers, should we convert class indices to their (str)
            labels for the returned array? Note that class labels are always
            written out to disk.
        individual_predictions : bool, default=False
            Return (and, optionally, write out) the predictions from each
            underlying learner.

        Returns
        -------
        Tuple[numpy.ndarray, Optional[Dict[str, numpy.ndarray]]]
            The first element is the array of predictions returned by the
            meta-estimator and the second is an optional dictionary with the
            name of each underlying learner as the key and the array of its
            predictions as the value. The second element is ``None`` if
            ``individual_predictions`` is set to ``False``.
        """
        example_ids = examples.ids

        # get the test set features in the right format
        if isinstance(examples.vectorizer, DictVectorizer):
            xtest = examples.vectorizer.inverse_transform(examples.features)

        # get all possible kinds of predictions from the meta-learner
        prediction_dict = get_predictions(self, xtest)

        # decide what predictions to write and what predictions to return
        # by default, these are just what is output by the model
        to_write = prediction_dict["raw"]
        to_return = prediction_dict["raw"]

        # for classifiers ...
        if self.learner_type == "classifier":
            # return and write class labels if they were explicitly asked for
            if class_labels:
                to_return = to_write = prediction_dict["labels"]
            else:
                # return and write probabilities
                if self.voting == "soft":
                    to_return = to_write = prediction_dict["probabilities"]
                # return class indices and write labels
                else:
                    to_return = prediction_dict["raw"]
                    to_write = prediction_dict["labels"]
        # for regressors, it's really simple
        else:
            to_write = to_return = prediction_dict["raw"]

        # write out the meta-estimator predictions if we are asked to
        if prediction_prefix is not None:
            write_predictions(
                example_ids,
                to_write,
                prediction_prefix,
                self.learner_type,
                self.learners[0].label_list,
                append=append,
            )

        # get and write each underlying learner's predictions if asked for
        if individual_predictions:
            # create a dictionary to hold the individual predictions
            individual_predictions_dict = {}

            # iterate over each underlying learner along with names
            for name, learner in zip(self.model.named_estimators_, self.learners):
                # the learner's `predict()` method should handle everything
                learner_prediction_prefix = (
                    f"{prediction_prefix}_{name}" if prediction_prefix is not None else None
                )
                learner_predictions = learner.predict(
                    examples,
                    prediction_prefix=learner_prediction_prefix,
                    append=append,
                    class_labels=class_labels,
                )

                # save this estimator's predictions in the dictionary
                individual_predictions_dict[name] = learner_predictions
        else:
            individual_predictions_dict = None

        # return the tuple of the meta-estimator predictions array
        # and the dictionary containing the individual predictions
        return (to_return, individual_predictions_dict)
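
    # A hedged sketch of generating predictions, assuming the trained ``vl``
    # instance from the sketches above; the test file name and prediction
    # prefix are illustrative assumptions:
    #
    #     fs_test = Reader.for_path("test.jsonlines").read()
    #     labels, per_learner = vl.predict(
    #         fs_test,
    #         prediction_prefix="voting",
    #         individual_predictions=True,
    #     )
    #     # `labels` holds the meta-estimator's predicted labels;
    #     # `per_learner` maps each underlying learner's name to its predictions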

    def evaluate(
        self,
        examples: FeatureSet,
        prediction_prefix: Optional[str] = None,
        append: bool = False,
        grid_objective: Optional[str] = None,
        individual_predictions: bool = False,
        output_metrics: List[str] = [],
    ) -> EvaluateTaskResults:
        """
        Evaluate the meta-estimator on a given ``FeatureSet``.

        Parameters
        ----------
        examples : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance to evaluate the performance of the
            model on.
        prediction_prefix : Optional[str], default=None
            If saving the predictions, this is the prefix that will be used
            for the filename. It will be followed by ``"_predictions.tsv"``.
        append : bool, default=False
            Should we append the current predictions to the file if it exists?
        grid_objective : Optional[str], default=None
            The objective function used when doing the grid search.
        individual_predictions : bool, default=False
            Optionally, write out the predictions from each underlying learner.
        output_metrics : List[str], default=[]
            List of additional metric names to compute in addition to the
            grid objective.

        Returns
        -------
        :class:`skll.types.EvaluateTaskResults`
            The confusion matrix, the overall accuracy, the per-label PRFs,
            the model parameters, the grid search objective function score,
            and the additional evaluation metrics, if any.
        """
        # make the predictions on the test data; note that these
        # are either class indices or class probabilities
        yhat, _ = self.predict(
            examples,
            class_labels=False,
            prediction_prefix=prediction_prefix,
            append=append,
            individual_predictions=individual_predictions,
        )

        # for classifiers, convert class labels to indices for consistency
        # but account for any unseen labels in the test set that may not
        # have occurred in the training data for the underlying learners
        # at all; then get acceptable metrics based on the type of labels we have
        if examples.labels is not None:
            if self.learner_type == "classifier":
                sorted_unique_labels = np.unique(examples.labels)
                test_label_list = sorted_unique_labels.tolist()
                train_and_test_label_dict = add_unseen_labels(self.label_dict, test_label_list)
                ytest = np.array([train_and_test_label_dict[label] for label in examples.labels])
                acceptable_metrics = get_acceptable_classification_metrics(sorted_unique_labels)
            # for regressors we do not need to do anything special to the labels
            else:
                train_and_test_label_dict = None
                ytest = examples.labels
                acceptable_metrics = get_acceptable_regression_metrics()

            # check that all of the output metrics are acceptable
            unacceptable_metrics = set(output_metrics).difference(acceptable_metrics)
            if unacceptable_metrics and examples.labels is not None:
                label_type = examples.labels.dtype.type
                raise ValueError(
                    f"The following metrics are not valid "
                    f"for this learner ({self.model_type.__name__}) "
                    f"with these labels of type {label_type.__name__}: "
                    f"{list(unacceptable_metrics)}"
                )

        # get the values of the evaluation metrics
        (
            conf_matrix,
            accuracy,
            result_dict,
            objective_score,
            metric_scores,
        ) = compute_evaluation_metrics(
            output_metrics,
            ytest,
            yhat,
            self.learner_type,
            label_dict=train_and_test_label_dict,
            grid_objective=grid_objective,
            probability=self.voting == "soft",
            logger=self.logger,
        )

        # add in the model parameters, excluding the ones
        # for the underlying estimators, and return
        model_params = self.model.get_params(deep=False)
        res = (conf_matrix, accuracy, result_dict, model_params, objective_score, metric_scores)
        return res
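
    # A hedged sketch of evaluating on a held-out ``FeatureSet``, assuming the
    # trained ``vl`` instance and ``fs_test`` from the sketches above; the
    # metric names are illustrative assumptions:
    #
    #     (conf_matrix, accuracy, prf_dict, model_params,
    #      objective_score, metric_scores) = vl.evaluate(
    #         fs_test,
    #         grid_objective="accuracy",
    #         output_metrics=["f1_score_macro"],
    #     )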

    def cross_validate(
        self,
        examples: FeatureSet,
        stratified: bool = True,
        cv_folds: Union[int, FoldMapping] = 10,
        cv_seed: int = 123456789,
        grid_search: bool = True,
        grid_search_folds: Union[int, FoldMapping] = 5,
        grid_jobs: Optional[int] = None,
        grid_objective: Optional[str] = None,
        output_metrics: List[str] = [],
        prediction_prefix: Optional[str] = None,
        param_grid_list: Optional[List[Dict[str, Any]]] = None,
        shuffle: bool = False,
        save_cv_folds: bool = True,
        save_cv_models: bool = False,
        individual_predictions: bool = False,
        use_custom_folds_for_grid_search: bool = True,
    ) -> VotingCrossValidateTaskResults:
        """
        Cross-validate the meta-estimator on the given examples.

        We follow essentially the same methodology as in
        ``Learner.cross_validate()`` - split the examples into training and
        testing folds, and then call ``self.train()`` on the training folds
        and then ``self.evaluate()`` on the test fold. Note that this means
        that underlying estimators with different hyperparameters may be used
        for each fold, as is the case with ``Learner.cross_validate()``.

        Parameters
        ----------
        examples : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance to cross-validate learner performance on.
        stratified : bool, default=True
            Should we stratify the folds to ensure an even distribution of
            labels for each fold?
        cv_folds : Union[int, :class:`skll.types.FoldMapping`], default=10
            The number of folds to use for cross-validation, or a mapping from
            example IDs to folds.
        cv_seed : int, default=123456789
            The value for seeding the random number generator used to create
            the random folds. Note that this seed is *only* used if either
            ``grid_search`` or ``shuffle`` are set to ``True``.
        grid_search : bool, default=True
            Should we do grid search when training each fold? Note: This will
            make this take *much* longer.
        grid_search_folds : Union[int, :class:`skll.types.FoldMapping`], default=5
            The number of folds to use when doing the grid search, or a mapping
            from example IDs to folds.
        grid_jobs : Optional[int], default=None
            The number of jobs to run in parallel when doing the grid search.
            If ``None`` or 0, the number of grid search folds will be used.
        grid_objective : Optional[str], default=None
            The name of the objective function to use when doing the grid
            search. Must be specified if ``grid_search`` is ``True``.
        output_metrics : List[str], default=[]
            List of additional metric names to compute in addition to the
            metric used for grid search.
        prediction_prefix : Optional[str], default=None
            If saving the predictions, this is the prefix that will be used
            for the filename. It will be followed by ``"_predictions.tsv"``.
        param_grid_list : Optional[List[Dict[str, Any]]], default=None
            The list of parameter grids to search through for grid search, one
            for each underlying learner. The order of the dictionaries should
            correspond to the order in which the underlying estimators were
            specified when the ``VotingLearner`` was instantiated. If ``None``,
            the default parameter grids will be used for the underlying
            estimators.
        shuffle : bool, default=False
            Shuffle examples before splitting into folds for CV.
        save_cv_folds : bool, default=True
            Whether to save the cv fold ids or not.
        save_cv_models : bool, default=False
            Whether to save the cv models or not.
        individual_predictions : bool, default=False
            Write out the cross-validated predictions from each underlying
            learner as well.
        use_custom_folds_for_grid_search : bool, default=True
            If ``cv_folds`` is a custom dictionary, but ``grid_search_folds``
            is not, perhaps due to user oversight, should the same custom
            dictionary automatically be used for the inner grid-search
            cross-validation?

        Returns
        -------
        :class:`skll.types.VotingCrossValidateTaskResults`
            A 3-tuple containing the following:

            List[:class:`skll.types.EvaluateTaskResults`]: the confusion
            matrix, overall accuracy, per-label PRFs, model parameters,
            objective function score, and evaluation metrics (if any) for
            each fold.

            Optional[:class:`skll.types.FoldMapping`]: dictionary containing
            the test-fold number for each id if ``save_cv_folds`` is ``True``,
            otherwise ``None``.

            Optional[List[:class:`skll.learner.voting.VotingLearner`]]: list
            of voting learners, one for each fold if ``save_cv_models`` is
            ``True``, otherwise ``None``.

        Raises
        ------
        ValueError
            If classification labels are not properly encoded as strings.
        ValueError
            If ``grid_search`` is ``True`` but ``grid_objective`` is ``None``.
        """
        # Seed the random number generator so that randomized algorithms are
        # replicable.
        random_state = np.random.RandomState(cv_seed)

        # We need to check whether the labels in the featureset are labels
        # or continuous values. If it's the latter, we need to raise an
        # exception since the stratified splitting in sklearn does not
        # work with continuous labels. Note that although using random folds
        # _will_ work, we want to raise an error in general since it's better
        # to encode the labels as strings anyway for classification problems.
        if self.learner_type == "classifier" and type_of_target(examples.labels) not in [
            "binary",
            "multiclass",
        ]:
            raise ValueError(
                "Floating point labels must be encoded as strings for cross-validation."
            )

        # check that we have an objective since grid search is on by default
        # Note that `train()` would raise this error anyway later but it's
        # better to raise this early on rather than after a whole bunch of
        # stuff has happened
        if grid_search and not grid_objective:
            raise ValueError(
                "Grid search is on by default. You must "
                "either specify a grid objective or turn off "
                "grid search."
            )

        # Shuffle so that the folds are random for the inner grid search CV.
        # If grid search is True but shuffle isn't, shuffle anyway.
        # You can't shuffle a scipy sparse matrix in place, so unfortunately
        # we make a copy of everything (and then get rid of the old version)
        if grid_search or shuffle:
            if grid_search and not shuffle:
                self.logger.warning(
                    "Training data will be shuffled to randomize "
                    "grid search folds. Shuffling may yield "
                    "different results compared to scikit-learn."
                )
            ids, labels, features = sk_shuffle(
                examples.ids, examples.labels, examples.features, random_state=random_state
            )
            examples = FeatureSet(
                examples.name, ids, labels=labels, features=features, vectorizer=examples.vectorizer
            )

        # Call some setup code which will properly initialize the underlying
        # learners before they are eventually trained
        self._setup_underlying_learners(examples)

        # Set up the cross-validation iterator.
        kfold, cv_groups = setup_cv_fold_iterator(
            cv_folds, examples, self.learner_type, stratified=stratified, logger=self.logger
        )

        # When using custom CV folds (a dictionary), if we are planning to do
        # grid search, set the grid search folds to be the same as the custom
        # cv folds unless a flag is set that explicitly tells us not to.
        # Note that this should only happen when we are using the API; otherwise
        # the configparser should take care of this even before this method is called
        if isinstance(cv_folds, dict):
            if grid_search and use_custom_folds_for_grid_search and grid_search_folds != cv_folds:
                self.logger.warning(
                    "The specified custom folds will be used for "
                    "the inner grid search."
                )
                grid_search_folds = cv_folds

        # handle each fold separately & accumulate the predictions and results
        results = []
        append_predictions = False
        saved_models: List["VotingLearner"] = []
        saved_skll_fold_ids: FoldMapping = {}
        if examples.features is not None and examples.labels is not None:
            for fold_num, (train_indices, test_indices) in enumerate(
                kfold.split(examples.features, examples.labels, cv_groups)
            ):
                # Train model
                self._model = None  # prevent feature vectorizer from being reset.
                train_set = FeatureSet(
                    examples.name,
                    examples.ids[train_indices],
                    labels=examples.labels[train_indices],
                    features=examples.features[train_indices],
                    vectorizer=examples.vectorizer,
                )

                self.train(
                    train_set,
                    param_grid_list=param_grid_list,
                    grid_search_folds=grid_search_folds,
                    grid_search=grid_search,
                    grid_objective=grid_objective,
                    grid_jobs=grid_jobs,
                    shuffle=grid_search,
                )

                if save_cv_models:
                    saved_models.append(copy.deepcopy(self))

                # evaluate the voting meta-estimator on the test fold
                test_tuple = FeatureSet(
                    examples.name,
                    examples.ids[test_indices],
                    labels=examples.labels[test_indices],
                    features=examples.features[test_indices],
                    vectorizer=examples.vectorizer,
                )

                # save the results
                results.append(
                    self.evaluate(
                        test_tuple,
                        prediction_prefix=prediction_prefix,
                        append=append_predictions,
                        grid_objective=grid_objective,
                        output_metrics=output_metrics,
                        individual_predictions=individual_predictions,
                    )
                )
                append_predictions = True

                # save the fold number for each test ID if we were asked to
                if save_cv_folds:
                    for index in test_indices:
                        saved_skll_fold_ids[examples.ids[index]] = str(fold_num)

        # return list of results/outputs for all folds
        models = saved_models if save_cv_models else None
        skll_fold_ids = saved_skll_fold_ids if save_cv_folds else None
        return (results, skll_fold_ids, models)
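
    # A hedged sketch of 10-fold cross-validation, assuming the ``vl``
    # instance and ``fs_train`` from the sketches above; the objective name
    # is an illustrative assumption:
    #
    #     fold_results, fold_ids, fold_models = vl.cross_validate(
    #         fs_train,
    #         grid_objective="accuracy",
    #         save_cv_models=True,
    #     )
    #     # `fold_results` holds one `evaluate()`-style results tuple per fold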

    def learning_curve(
        self,
        examples: FeatureSet,
        metric: str,
        cv_folds: Union[int, FoldMapping] = 10,
        train_sizes: LearningCurveSizes = np.linspace(0.1, 1.0, 5),
        override_minimum: bool = False,
    ) -> Tuple[List[float], List[float], List[float], List[int]]:
        """
        Generate learning curves for the meta-estimator.

        Generate learning curves for the voting meta-estimator on the training
        examples via cross-validation. Adapted from the scikit-learn code for
        learning curve generation (cf. ``sklearn.model_selection.learning_curve``).

        Parameters
        ----------
        examples : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance to generate the learning curve on.
        metric : str
            The name of the metric function to use when computing the train
            and test scores for the learning curve.
        cv_folds : Union[int, :class:`skll.types.FoldMapping`], default=10
            The number of folds to use for cross-validation, or a mapping from
            example IDs to folds.
        train_sizes : :class:`skll.types.LearningCurveSizes`, default=:func:`numpy.linspace` with start=0.1, stop=1.0, num=5
            Relative or absolute numbers of training examples that will be used
            to generate the learning curve. If the type is float, it is
            regarded as a fraction of the maximum size of the training set
            (that is determined by the selected validation method), i.e. it has
            to be within (0, 1]. Otherwise it is interpreted as absolute sizes
            of the training sets. Note that for classification the number of
            samples usually has to be big enough to contain at least one sample
            from each class.
        override_minimum : bool, default=False
            Learning curves can be unreliable for very small sizes, especially
            for > 2 labels. If this option is set to ``True``, the learning
            curve would be generated even if the number of examples is less
            than 500, along with a warning. If ``False``, the curve is not
            generated and an exception is raised instead.

        Returns
        -------
        train_scores : List[float]
            The scores for the training set.
        test_scores : List[float]
            The scores on the test set.
        fit_times : List[float]
            The average times taken to fit each model.
        num_examples : List[int]
            The numbers of training examples used to generate the curve.

        Raises
        ------
        ValueError
            If the number of examples is less than 500.
        """
        # check that the number of training examples is more than the minimum
        # needed for generating a reliable learning curve
        if len(examples) < 500:
            if not override_minimum:
                raise ValueError(
                    f"Number of training examples provided ({len(examples)}) "
                    "is less than the minimum needed (500) for the "
                    "learning curve to be reliable."
                )
            else:
                self.logger.warning(
                    "Learning curves can be unreliable for examples fewer than "
                    f"500. You provided {len(examples)}."
                )

        # raise a warning if we are using a probabilistic classifier
        # since that means we cannot use the predictions directly
        if self.voting == "soft":
            self.logger.warning(
                "For soft-voting classifiers, the most likely "
                "class will be computed via an argmax before "
                "computing the curve."
            )

        # Call some setup code which will properly initialize the underlying
        # learners before they are eventually trained
        self._setup_underlying_learners(examples)

        # set up the CV split iterator over the train/test featuresets
        # which also returns the maximum number of training examples
        (featureset_iter, n_max_training_samples) = setup_cv_split_iterator(cv_folds, examples)

        # Get the `_translate_train_sizes()` function from scikit-learn
        # since we need it to get the right list of sizes after cross-validation
        _module = import_module("sklearn.model_selection._validation")
        _translate_train_sizes = getattr(_module, "_translate_train_sizes")
        train_sizes_abs = _translate_train_sizes(train_sizes, n_max_training_samples)
        n_unique_ticks = train_sizes_abs.shape[0]

        # limit the number of parallel jobs for this to be no higher than
        # MAX_CONCURRENT_PROCESSES or the number of cores, whichever is lower
        n_jobs = min(cpu_count(), MAX_CONCURRENT_PROCESSES)

        # Run jobs in parallel that train the model on each subset
        # of the training data and compute train and test scores
        parallel = joblib.Parallel(n_jobs=n_jobs, pre_dispatch=n_jobs)
        out = parallel(
            joblib.delayed(train_and_score)(self, train_fs[:n_train_samples], test_fs, metric)
            for train_fs, test_fs in featureset_iter
            for n_train_samples in train_sizes_abs
        )

        # Reshape the outputs
        out = np.array(out)
        n_cv_folds = out.shape[0] // n_unique_ticks
        out = out.reshape(n_cv_folds, n_unique_ticks, 3)
        out = np.asarray(out).transpose((2, 1, 0))

        return list(out[0]), list(out[1]), list(out[2]), list(train_sizes_abs)
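

# A hedged end-to-end sketch of the API in this module. This block only runs
# when the module is executed directly, never on import. The file names,
# learner names, and metric/objective choices below are illustrative
# assumptions, not part of the SKLL API.
if __name__ == "__main__":
    from skll.data import Reader

    # read training and test featuresets from (assumed) JSONLINES files;
    # the learning curve below additionally assumes at least 500 training examples
    fs_train = Reader.for_path("train.jsonlines").read()
    fs_test = Reader.for_path("test.jsonlines").read()

    # train a hard-voting classifier over two underlying learners
    voting_learner = VotingLearner(["LogisticRegression", "SVC"], voting="hard")
    voting_learner.train(fs_train, grid_objective="accuracy")

    # generate learning-curve scores with 10-fold cross-validation
    train_scores, test_scores, fit_times, num_examples = voting_learner.learning_curve(
        fs_train, "accuracy"
    )

    # evaluate on the held-out test set and print the overall accuracy
    results = voting_learner.evaluate(fs_test, grid_objective="accuracy")
    print(results[1])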