# License: BSD 3 clause
"""
Handles writing data to various types of data files.

:author: Dan Blanchard (dblanchard@ets.org)
:author: Michael Heilman (mheilman@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:author: Jeremy Biggs (jbiggs@ets.org)
:organization: ETS
"""

import json
import logging
import sys
from csv import DictWriter
from decimal import Decimal
from pathlib import Path
from typing import IO, Any, Dict, List, Optional, Set, Tuple, Union

import numpy as np
import pandas as pd
from scipy.sparse import issparse
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

from skll.data import FeatureSet
from skll.types import FeatGenerator, FeatureDict, IdType, LabelType, PathOrStr


class Writer(object):
    """
    Write out FeatureSets to files on disk.

    This is the base class used to create featureset writers for different
    file types.

    Parameters
    ----------
    path : :class:`skll.types.PathOrStr`
        A path to the feature file we would like to create. The suffix to
        this filename must be ``.arff``, ``.csv``, ``.jsonlines``,
        ``.libsvm``, ``.ndj``, or ``.tsv``. If ``subsets`` is not ``None``,
        when calling the ``write()`` method, path is assumed to be a string
        containing the path to the directory to write the feature files
        with an additional file extension specifying the file type. For
        example ``/foo/.csv``.
    feature_set : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to dump to the file.
    quiet : bool, default=True
        Do not print "Writing..." status message to stderr.
    subsets : Optional[Dict[str, List[str]]], default=None
        A mapping from subset names to lists of feature names that are
        included in those sets. If given, a feature file will be written
        for every subset (with the name containing the subset name as
        suffix to ``path``). Note, since string-valued features are
        automatically converted into boolean features with names of the
        form ``FEATURE_NAME=STRING_VALUE``, when doing the filtering, the
        portion before the ``=`` is all that's used for matching.
        Therefore, you do not need to enumerate all of these boolean
        feature names in your mapping.
    logger : Optional[logging.Logger], default=None
        A logger instance to use to log messages instead of creating a new
        one by default.
    """

    def __init__(
        self,
        path: PathOrStr,
        feature_set: FeatureSet,
        quiet: bool = True,
        subsets: Optional[Dict[str, List[str]]] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Initialize base Writer class."""
        super(Writer, self).__init__()
        self.quiet = quiet
        self.path = Path(path)
        self.feat_set = feature_set
        self.subsets = subsets
        self.logger = logger if logger else logging.getLogger(__name__)

        # Get prefix & extension for checking file types & writing subset
        # files; since we also need to handle paths like "foo/.csv" for
        # subset-writing, we have to do a bit of introspection before
        # figuring out the various parts of the path
        parent, stem, suffix = self.path.parent, self.path.stem, self.path.suffix
        if stem.startswith(".") and not suffix:
            self.root = parent
            self.ext = stem.lower()
        else:
            self.root = parent / stem
            self.ext = suffix.lower()
        self._progress_msg = ""
        self._use_pandas = False
    @classmethod
    def for_path(cls, path: PathOrStr, feature_set: FeatureSet, **kwargs) -> "Writer":
        """
        Retrieve object of ``Writer`` sub-class appropriate for given path.

        Parameters
        ----------
        path : :class:`skll.types.PathOrStr`
            A path to the feature file we would like to create. The suffix
            to this filename must be ``.arff``, ``.csv``, ``.jsonlines``,
            ``.libsvm``, ``.ndj``, or ``.tsv``. If ``subsets`` is not
            ``None``, when calling the ``write()`` method, path is assumed
            to be a string containing the path to the directory to write
            the feature files with an additional file extension specifying
            the file type. For example ``/foo/.csv``.
        feature_set : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance to dump to the output file.
        kwargs : Optional[Dict[str, Any]]
            The keyword arguments for ``for_path`` are the same as the
            initializer for the desired ``Writer`` subclass.

        Returns
        -------
        writer : :class:`skll.data.Writer`
            New instance of the Writer sub-class that is appropriate for
            the given path.
        """
        # Get lowercase extension for file extension checking
        # NOTE: the reason we are doing this complicated gymnastics
        # instead of just using `path.suffix` is because sometimes
        # `path` may look like `foo/.jsonlines` when we are writing
        # subsets so we need to handle that edge case
        path = Path(path)
        stem, suffix = path.stem, path.suffix
        if stem.startswith(".") and not suffix:
            ext = stem.lower()
        else:
            ext = suffix.lower()
        return EXT_TO_WRITER[ext](path, feature_set, **kwargs)
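    # A minimal usage sketch for ``for_path`` (not part of the library;
    # the FeatureSet arguments below follow skll.data.FeatureSet, and
    # "examples.csv" is a hypothetical output path):
    #
    #     from skll.data import FeatureSet
    #     from skll.data.writers import Writer
    #
    #     fs = FeatureSet(
    #         "example",
    #         ids=["ex1", "ex2"],
    #         labels=["yes", "no"],
    #         features=[{"f1": 1.0, "f2": 0.0}, {"f1": 0.5, "f2": 2.0}],
    #     )
    #     # picks CSVWriter based on the ".csv" extension
    #     Writer.for_path("examples.csv", fs).write()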
    def write(self) -> None:
        """Write out this Writer's ``FeatureSet`` to a file in its format."""
        if isinstance(self.feat_set.vectorizer, FeatureHasher):
            raise ValueError(
                "Writer cannot write sets that use a FeatureHasher for vectorization."
            )

        # Write one feature file if we weren't given a dict of subsets
        if self.subsets is None:
            self._write_subset(self.path)

        # Otherwise write one feature file per subset
        else:
            for subset_name, filter_features in self.subsets.items():
                self.logger.debug(
                    f"Subset ({subset_name}) features: {filter_features}"
                )
                sub_path = self.root / f"{subset_name}{self.ext}"
                self._write_subset(sub_path, set(filter_features))
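    # A sketch of subset writing (hypothetical names; assumes a FeatureSet
    # ``fs`` whose features include "age", "height", and "color=red"-style
    # binarized string features):
    #
    #     subsets = {"numeric": ["age", "height"], "color": ["color"]}
    #     # writes /foo/numeric.csv and /foo/color.csv; note that "color"
    #     # matches every binarized feature named "color=<VALUE>"
    #     Writer.for_path("/foo/.csv", fs, subsets=subsets).write()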
    def _write_subset(
        self, sub_path: PathOrStr, filter_features: Optional[Set[str]] = None
    ) -> None:
        """
        Write out given ``FeatureSet`` instance to a file in this class's format.

        Parameters
        ----------
        sub_path : :class:`skll.types.PathOrStr`
            The path to the file we want to create for this subset of our
            data.
        filter_features : Optional[Set[str]], default=None
            Set of features to include in current feature file.
        """
        self.logger.debug(f"sub_path: {sub_path}")
        self.logger.debug(f"feature_set: {self.feat_set.name}")
        self.logger.debug(f"filter_features: {filter_features}")

        if not self.quiet:
            self._progress_msg = f"Writing {sub_path}..."
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        if not self._use_pandas:
            # Apply filtering
            filtered_set: Union[FeatGenerator, FeatureSet] = (
                self.feat_set.filtered_iter(features=filter_features)
                if filter_features is not None
                else self.feat_set
            )

            # Open file for writing and write each line
            with open(sub_path, "w", encoding="utf-8") as output_file:
                # Write out the header if this format requires it
                self._write_header(filtered_set, output_file, filter_features)

                # Write individual lines
                for ex_num, (id_, label_, feat_dict) in enumerate(filtered_set):
                    self._write_line(id_, label_, feat_dict, output_file)
                    if not self.quiet and ex_num % 100 == 0:
                        print(
                            f"{self._progress_msg}{ex_num:>15}",
                            end="\r",
                            file=sys.stderr,
                        )
                        sys.stderr.flush()
        else:
            self._write_data(self.feat_set, sub_path, filter_features)

        if not self.quiet:
            print(f"{self._progress_msg}{'done':<15}", file=sys.stderr)
            sys.stderr.flush()

    def _write_header(self, feature_set, output_file, filter_features):
        """
        Write header to file.

        Called before lines are written to file, so that headers can be
        written for files that need them.

        Parameters
        ----------
        feature_set : Ignored
            Not used.
        output_file : Ignored
            Not used.
        filter_features : Ignored
            Not used.
        """
        pass

    def _write_line(self, id_, label_, feat_dict, output_file):
        """
        Write the current line in the file in this Writer's format.

        Parameters
        ----------
        id_ : Ignored
            Not used.
        label_ : Ignored
            Not used.
        feat_dict : Ignored
            Not used.
        output_file : Ignored
            Not used.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    def _write_data(self, feature_set, output_file, filter_features):
        """
        Write full data set in Writer's format using `pandas`, rather than
        row-by-row.

        Parameters
        ----------
        feature_set : Ignored
            Not used.
        output_file : Ignored
            Not used.
        filter_features : Ignored
            Not used.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    def _get_column_names_and_indexes(
        self, feature_set: FeatureSet, filter_features: Optional[Set[str]] = None
    ) -> Tuple[List[str], List[int]]:
        """
        Get names of columns and associated indices for (possibly filtered) features.

        Parameters
        ----------
        feature_set : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance being written to a file.
        filter_features : Optional[Set[str]], default=None
            If only writing a subset of the features in the FeatureSet to
            ``output_file``, these are the features to include in this file.

        Returns
        -------
        column_names : List[str]
            A list of the (possibly filtered) column names.
        column_indexes : List[int]
            A list of the (possibly filtered) column indexes.
        """
        # if we're not doing filtering,
        # then just take all the feature names
        self.logger.debug(feature_set)
        if isinstance(feature_set.vectorizer, DictVectorizer):
            if filter_features is None:
                filter_features = feature_set.vectorizer.feature_names_

            # create a list of tuples with (column names, column indexes)
            # so that we can correctly extract the appropriate columns
            columns = sorted(
                [
                    (col_name, col_idx)
                    for col_name, col_idx in feature_set.vectorizer.vocabulary_.items()
                    if (
                        col_name in filter_features
                        or col_name.split("=", 1)[0] in filter_features
                    )
                ],
                key=lambda x: x[1],
            )

            # then, split the names and indexes into separate lists
            column_names, column_indexes = zip(*columns)
            return list(column_names), list(column_indexes)
        else:
            return [], []
class CSVWriter(Writer):
    """
    Writer for writing out ``FeatureSet`` instances as CSV files.

    Parameters
    ----------
    path : :class:`skll.types.PathOrStr`
        A path to the feature file we would like to create. If ``subsets``
        is not ``None``, this is assumed to be a string containing the path
        to the directory to write the feature files with an additional file
        extension specifying the file type. For example ``/foo/.csv``.
    feature_set : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to dump to the output file.
    quiet : bool, default=True
        Do not print "Writing..." status message to stderr.
    subsets : Optional[Dict[str, List[str]]], default=None
        A mapping from subset names to lists of feature names that are
        included in those sets. If given, a feature file will be written
        for every subset (with the name containing the subset name as
        suffix to ``path``). Note, since string-valued features are
        automatically converted into boolean features with names of the
        form ``FEATURE_NAME=STRING_VALUE``, when doing the filtering, the
        portion before the ``=`` is all that's used for matching.
        Therefore, you do not need to enumerate all of these boolean
        feature names in your mapping.
    logger : Optional[logging.Logger], default=None
        A logger instance to use to log messages instead of creating a new
        one by default.
    label_col : str, default="y"
        The column name containing the label.
    id_col : str, default="id"
        The column name containing the ID.
    pandas_kwargs : Optional[Dict[str, Any]], default=None
        Arguments that will be passed directly to the `pandas` I/O writer.
    """

    def __init__(
        self,
        path: PathOrStr,
        feature_set: FeatureSet,
        quiet: bool = True,
        subsets: Optional[Dict[str, List[str]]] = None,
        logger: Optional[logging.Logger] = None,
        label_col: str = "y",
        id_col: str = "id",
        pandas_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Initialize the CSVWriter class."""
        self.label_col = label_col
        self.id_col = id_col
        self._pandas_kwargs = {} if pandas_kwargs is None else pandas_kwargs
        self._sep = self._pandas_kwargs.pop("sep", ",")
        self._index = self._pandas_kwargs.pop("index", False)
        super(CSVWriter, self).__init__(
            path, feature_set, quiet=quiet, subsets=subsets, logger=logger
        )
        self._use_pandas = True

    def _build_dataframe_with_features(
        self, feature_set: FeatureSet, filter_features: Optional[Set[str]] = None
    ) -> pd.DataFrame:
        """
        Create and filter data frame from features in given feature set.

        Parameters
        ----------
        feature_set : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance being written to a file.
        filter_features : Optional[Set[str]], default=None
            If only writing a subset of the features in the FeatureSet to
            ``output_file``, these are the features to include in this file.

        Returns
        -------
        df_features : pandas.DataFrame
            The data frame constructed from the feature set. The frame may
            be empty if there are no features in the feature set.
        """
        # if there is no filtering, then just keep all the names
        (column_names, column_idxs) = self._get_column_names_and_indexes(
            feature_set, filter_features
        )

        # create the data frame from the feature set;
        # then, select only the columns that we want,
        # and give the columns their correct names
        if feature_set.features is not None:
            if issparse(feature_set.features):
                df_features = pd.DataFrame(feature_set.features.toarray())
            else:
                df_features = pd.DataFrame(feature_set.features)
            df_features = df_features.iloc[:, column_idxs].copy()
            df_features.columns = column_names
            return df_features
        else:
            return pd.DataFrame()

    def _build_dataframe(
        self,
        feature_set: FeatureSet,
        filter_features: Optional[Set[str]] = None,
        df_features: Optional[pd.DataFrame] = None,
    ) -> pd.DataFrame:
        """
        Create and filter data frame with features in given feature set.

        Add the IDs and labels, if applicable. If the data frame with
        features already exists, pass ``df_features``. Then the IDs and
        labels will simply be added to the existing data frame containing
        the features.

        Parameters
        ----------
        feature_set : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance being written to a file.
        filter_features : Optional[Set[str]], default=None
            If only writing a subset of the features in the FeatureSet to
            ``output_file``, these are the features to include in this file.
        df_features : Optional[pandas.DataFrame], default=None
            If the data frame with features already exists, then we use it
            and add IDs and labels; otherwise, the feature data frame will
            be created from the feature set.

        Returns
        -------
        df_features : pandas.DataFrame
            The data frame constructed from the feature set.

        Raises
        ------
        ValueError
            If the ID column is already used as a feature.
        ValueError
            If the label column is already used as a feature.
        """
        # create the data frame with just the features
        # from the feature set, at this point
        if df_features is None:
            df_features = self._build_dataframe_with_features(feature_set, filter_features)

        # if the id column is already in the data frame,
        # then raise an error; otherwise, just add the ids
        if self.id_col in df_features:
            raise ValueError(
                f'ID column name "{self.id_col}" already used as feature name.'
            )
        df_features[self.id_col] = feature_set.ids

        # if the labels should exist but the column is already in the
        # data frame, then raise an error; otherwise, just add the labels
        if feature_set.has_labels:
            if self.label_col in df_features:
                raise ValueError(
                    f'Class column name "{self.label_col}" already used as feature name.'
                )
            df_features[self.label_col] = feature_set.labels

        return df_features

    def _write_data(
        self,
        feature_set: FeatureSet,
        output_file: PathOrStr,
        filter_features: Optional[Set[str]] = None,
    ) -> None:
        """
        Write the data in CSV format.

        Parameters
        ----------
        feature_set : :class:`skll.data.featureset.FeatureSet`
            The ``FeatureSet`` instance being written to a file.
        output_file : :class:`skll.types.PathOrStr`
            The path of the file being written to.
        filter_features : Optional[Set[str]], default=None
            If only writing a subset of the features in the FeatureSet to
            ``output_file``, these are the features to include in this file.
        """
        df = self._build_dataframe(feature_set, filter_features=filter_features)
        df.to_csv(output_file, sep=self._sep, index=self._index, **self._pandas_kwargs)
class TSVWriter(CSVWriter):
    """
    Writer for writing out FeatureSets as TSV files.

    Parameters
    ----------
    path : :class:`skll.types.PathOrStr`
        A path to the feature file we would like to create. If ``subsets``
        is not ``None``, this is assumed to be a string containing the path
        to the directory to write the feature files with an additional file
        extension specifying the file type. For example ``/foo/.tsv``.
    feature_set : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to dump to the output file.
    quiet : bool, default=True
        Do not print "Writing..." status message to stderr.
    subsets : Optional[Dict[str, List[str]]], default=None
        A mapping from subset names to lists of feature names that are
        included in those sets. If given, a feature file will be written
        for every subset (with the name containing the subset name as
        suffix to ``path``). Note, since string-valued features are
        automatically converted into boolean features with names of the
        form ``FEATURE_NAME=STRING_VALUE``, when doing the filtering, the
        portion before the ``=`` is all that's used for matching.
        Therefore, you do not need to enumerate all of these boolean
        feature names in your mapping.
    logger : Optional[logging.Logger], default=None
        A logger instance to use to log messages instead of creating a new
        one by default.
    label_col : str, default="y"
        The column name containing the label.
    id_col : str, default="id"
        The column name containing the ID.
    pandas_kwargs : Optional[Dict[str, Any]], default=None
        Arguments that will be passed directly to the `pandas` I/O writer.
    """

    def __init__(
        self,
        path: PathOrStr,
        feature_set: FeatureSet,
        quiet: bool = True,
        subsets: Optional[Dict[str, List[str]]] = None,
        logger: Optional[logging.Logger] = None,
        label_col: str = "y",
        id_col: str = "id",
        pandas_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Initialize the TSVWriter class."""
        super(TSVWriter, self).__init__(
            path,
            feature_set,
            quiet=quiet,
            subsets=subsets,
            logger=logger,
            label_col=label_col,
            id_col=id_col,
            pandas_kwargs=pandas_kwargs,
        )
        self._sep = "\t"
class ARFFWriter(Writer):
    """
    Writer for writing out FeatureSets as ARFF files.

    Parameters
    ----------
    path : :class:`skll.types.PathOrStr`
        A path to the feature file we would like to create. If ``subsets``
        is not ``None``, this is assumed to be a string containing the path
        to the directory to write the feature files with an additional file
        extension specifying the file type. For example ``/foo/.arff``.
    feature_set : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to dump to the output file.
    quiet : bool, default=True
        Do not print "Writing..." status message to stderr.
    subsets : Optional[Dict[str, List[str]]], default=None
        A mapping from subset names to lists of feature names that are
        included in those sets. If given, a feature file will be written
        for every subset (with the name containing the subset name as
        suffix to ``path``). Note, since string-valued features are
        automatically converted into boolean features with names of the
        form ``FEATURE_NAME=STRING_VALUE``, when doing the filtering, the
        portion before the ``=`` is all that's used for matching.
        Therefore, you do not need to enumerate all of these boolean
        feature names in your mapping.
    logger : Optional[logging.Logger], default=None
        A logger instance to use to log messages instead of creating a new
        one by default.
    relation : str, default="skll_relation"
        The name of the relation in the ARFF file.
    regression : bool, default=False
        Is this an ARFF file to be used for regression?
    dialect : str, default="excel-tab"
        The CSV dialect to use when writing the data section.
    label_col : str, default="y"
        The column name containing the label.
    id_col : str, default="id"
        The column name containing the ID.
    """

    def __init__(
        self,
        path: PathOrStr,
        feature_set: FeatureSet,
        quiet: bool = True,
        subsets: Optional[Dict[str, List[str]]] = None,
        logger: Optional[logging.Logger] = None,
        relation: str = "skll_relation",
        regression: bool = False,
        dialect: str = "excel-tab",
        label_col: str = "y",
        id_col: str = "id",
    ):
        """Initialize the ARFFWriter class."""
        self.relation = relation
        self.regression = regression
        self.dialect = dialect
        self.label_col = label_col
        self.id_col = id_col
        super(ARFFWriter, self).__init__(
            path, feature_set, quiet=quiet, subsets=subsets, logger=logger
        )
        self._dict_writer: Optional[DictWriter[str]] = None

    def _write_header(
        self,
        feature_set: FeatureSet,
        output_file: IO[str],
        filter_features: Optional[Set[str]],
    ) -> None:
        """
        Write headers to ARFF file.

        Called before lines are written to file, so that headers can be
        written for files that need them.

        Parameters
        ----------
        feature_set : Ignored
            Not used.
        output_file : IO[str]
            The file being written to.
        filter_features : Optional[Set[str]]
            If only writing a subset of the features in the FeatureSet to
            ``output_file``, these are the features to include in this file.
        """
        fieldnames, _ = self._get_column_names_and_indexes(self.feat_set, filter_features)
        fieldnames.append(self.id_col)

        # Add relation to header
        print(f"@relation '{self.relation}'\n", file=output_file)

        # Loop through fields writing the header info for the ARFF file
        for field in fieldnames:
            field = field.replace("\\", "\\\\").replace("'", "\\'")
            print(f"@attribute '{field}' numeric", file=output_file)

        # Print class label header if necessary
        if self.regression:
            print(f"@attribute {self.label_col} numeric", file=output_file)
        else:
            if self.feat_set.has_labels:
                sorted_labels = sorted(set(self.feat_set.labels))  # type: ignore
                labels_str = ",".join([str(label) for label in sorted_labels])
                labels_str = "{" + labels_str + "}"
                print(f"@attribute {self.label_col} {labels_str}", file=output_file)
        if self.label_col:
            fieldnames.append(self.label_col)

        # Create CSV writer to handle missing values for lines in data section
        # and to ignore the instance values for non-numeric attributes
        self._dict_writer = DictWriter(
            output_file, fieldnames, restval=0, extrasaction="ignore", dialect="arff"
        )

        # Finish header and start data section
        print("\n@data", file=output_file)

    def _write_line(
        self, id_: IdType, label_: LabelType, feat_dict: FeatureDict, output_file: IO[str]
    ) -> None:
        """
        Write the current line in the file in this Writer's format.

        Parameters
        ----------
        id_ : :class:`skll.types.IdType`
            The ID for the current instance.
        label_ : :class:`skll.types.LabelType`
            The label for the current instance.
        feat_dict : :class:`skll.types.FeatureDict`
            The feature dictionary for the current instance.
        output_file : Ignored
            Not used.

        Raises
        ------
        ValueError
            If the class column name is already used as a feature.
        ValueError
            If the ID column name is already used as a feature.
        """
        # Add class column to feat_dict (unless this is unlabeled data)
        if self.label_col not in feat_dict:
            if self.feat_set.has_labels:
                feat_dict[self.label_col] = label_
        else:
            raise ValueError(
                f'Class column name "{self.label_col}" already used as feature name.'
            )

        # Add id column to feat_dict if id is provided
        if self.id_col not in feat_dict:
            feat_dict[self.id_col] = id_
        else:
            raise ValueError(
                f'ID column name "{self.id_col}" already used as feature name.'
            )

        # Write out line
        if self._dict_writer:
            self._dict_writer.writerow(feat_dict)
class NDJWriter(Writer):
    """
    Writer for writing out FeatureSets as .jsonlines/.ndj files.

    Parameters
    ----------
    path : :class:`skll.types.PathOrStr`
        A path to the feature file we would like to create. If ``subsets``
        is not ``None``, this is assumed to be a string containing the path
        to the directory to write the feature files with an additional file
        extension specifying the file type. For example ``/foo/.ndj``.
    feature_set : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to dump to the output file.
    quiet : bool, default=True
        Do not print "Writing..." status message to stderr.
    subsets : Optional[Dict[str, List[str]]], default=None
        A mapping from subset names to lists of feature names that are
        included in those sets. If given, a feature file will be written
        for every subset (with the name containing the subset name as
        suffix to ``path``). Note, since string-valued features are
        automatically converted into boolean features with names of the
        form ``FEATURE_NAME=STRING_VALUE``, when doing the filtering, the
        portion before the ``=`` is all that's used for matching.
        Therefore, you do not need to enumerate all of these boolean
        feature names in your mapping.
    logger : Optional[logging.Logger], default=None
        A logger instance to use to log messages instead of creating a new
        one by default.
    """

    def __init__(
        self,
        path: PathOrStr,
        feature_set: FeatureSet,
        quiet: bool = True,
        subsets: Optional[Dict[str, List[str]]] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Initialize the NDJWriter class."""
        super(NDJWriter, self).__init__(
            path, feature_set, quiet=quiet, subsets=subsets, logger=logger
        )

    def _write_line(
        self,
        id_: IdType,
        label_: Union[LabelType, np.int64, np.float64],
        feat_dict: FeatureDict,
        output_file: IO[str],
    ) -> None:
        """
        Write the current line in the file in NDJ format.

        Parameters
        ----------
        id_ : :class:`skll.types.IdType`
            The ID for the current instance.
        label_ : :class:`skll.types.LabelType`
            The label for the current instance.
        feat_dict : :class:`skll.types.FeatureDict`
            The feature dictionary for the current instance.
        output_file : IO[str]
            The file being written to.
        """
        example_dict: FeatureDict = {}
        # Don't try to add class column if this is label-less data.
        # Try to convert the label to a scalar assuming it's a numpy
        # non-scalar type (e.g., int64) but if that doesn't work
        # then use it as is
        if self.feat_set.has_labels:
            if hasattr(label_, "item"):
                example_dict["y"] = label_.item()
            else:
                example_dict["y"] = label_
        # Try to convert the ID to a scalar assuming it's a numpy
        # non-scalar type (e.g., int64) but if that doesn't work
        # then use it as is
        if hasattr(id_, "item"):
            example_dict["id"] = id_.item()
        else:
            example_dict["id"] = id_
        example_dict["x"] = feat_dict
        print(json.dumps(example_dict, sort_keys=True), file=output_file)
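# Each instance becomes one JSON object per line; for an instance with ID
# "ex1", label "yes", and features {"f1": 1.0}, the line written above
# would look like this (a sketch of the expected output):
#
#     {"id": "ex1", "x": {"f1": 1.0}, "y": "yes"}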
class LibSVMWriter(Writer):
    """
    Writer for writing out FeatureSets as LibSVM/SVMLight files.

    Parameters
    ----------
    path : :class:`skll.types.PathOrStr`
        A path to the feature file we would like to create. If ``subsets``
        is not ``None``, this is assumed to be a string containing the path
        to the directory to write the feature files with an additional file
        extension specifying the file type. For example ``/foo/.libsvm``.
    feature_set : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to dump to the output file.
    quiet : bool, default=True
        Do not print "Writing..." status message to stderr.
    subsets : Optional[Dict[str, List[str]]], default=None
        A mapping from subset names to lists of feature names that are
        included in those sets. If given, a feature file will be written
        for every subset (with the name containing the subset name as
        suffix to ``path``). Note, since string-valued features are
        automatically converted into boolean features with names of the
        form ``FEATURE_NAME=STRING_VALUE``, when doing the filtering, the
        portion before the ``=`` is all that's used for matching.
        Therefore, you do not need to enumerate all of these boolean
        feature names in your mapping.
    logger : Optional[logging.Logger], default=None
        A logger instance to use to log messages instead of creating a new
        one by default.
    label_map : Optional[Dict[str, int]], default=None
        A mapping from label strings to integers.
    """

    LIBSVM_REPLACE_DICT = {
        ":": "\u2236",
        "#": "\uff03",
        " ": "\u2002",
        "=": "\ua78a",
        "|": "\u2223",
    }

    def __init__(
        self,
        path: PathOrStr,
        feature_set: FeatureSet,
        quiet: bool = True,
        subsets: Optional[Dict[str, List[str]]] = None,
        logger: Optional[logging.Logger] = None,
        label_map: Optional[Dict[Any, Any]] = None,
    ):
        """Initialize the LibSVMWriter class."""
        self.label_map = label_map
        super(LibSVMWriter, self).__init__(
            path, feature_set, quiet=quiet, subsets=subsets, logger=logger
        )
        if self.label_map is None:
            fs_labels = feature_set.labels if feature_set.has_labels else np.array([])
            self.label_map = {
                label: num
                for num, label in enumerate(
                    sorted(
                        {
                            label
                            for label in fs_labels  # type: ignore
                            if not isinstance(label, (int, float))
                        }
                    )
                )
            }
            # Add fake entry to the label map for None
            self.label_map[None] = "00000"

    @staticmethod
    def _sanitize(name: Union[IdType, LabelType]) -> Union[IdType, LabelType]:
        """
        Sanitize feature names for older feature formats.

        Replace special characters in names with close unicode equivalents
        to make things loadable by LibSVM, LibLinear, or SVMLight.

        Parameters
        ----------
        name : Union[:class:`skll.types.IdType`, :class:`skll.types.LabelType`]
            Input name in which special characters are replaced with
            unicode equivalents.

        Returns
        -------
        Union[:class:`skll.types.IdType`, :class:`skll.types.LabelType`]
            The sanitized name with special characters replaced.
        """
        sanitized_name = name
        if isinstance(sanitized_name, str):
            for orig, replacement in LibSVMWriter.LIBSVM_REPLACE_DICT.items():
                sanitized_name = sanitized_name.replace(orig, replacement)
        return sanitized_name

    def _write_line(
        self, id_: IdType, label_: LabelType, feat_dict: FeatureDict, output_file: IO[str]
    ) -> None:
        """
        Write the current line in the file in this Writer's format.

        Parameters
        ----------
        id_ : :class:`skll.types.IdType`
            The ID for the current instance.
        label_ : :class:`skll.types.LabelType`
            The label for the current instance.
        feat_dict : :class:`skll.types.FeatureDict`
            The feature dictionary for the current instance.
        output_file : IO[str]
            The file being written to.
        """
        field_values = (
            sorted(
                [
                    (self.feat_set.vectorizer.vocabulary_[field] + 1, value)
                    for field, value in feat_dict.items()
                    if Decimal(value) != 0
                ]
            )
            if self.feat_set.vectorizer
            else []
        )

        # Print label
        if self.label_map:
            if label_ in self.label_map:
                print(self.label_map[label_], end=" ", file=output_file)
            else:
                print(label_, end=" ", file=output_file)

        # Print features
        print(
            " ".join(f"{field}:{value}" for field, value in field_values),
            end=" ",
            file=output_file,
        )

        # Print comment with id and mappings
        print("#", end=" ", file=output_file)
        print(self._sanitize(id_), end="", file=output_file)
        print(" |", end=" ", file=output_file)
        if self.label_map:
            if label_ in self.label_map:
                print(
                    f"{self._sanitize(self.label_map[label_])}={self._sanitize(label_)}",
                    end=" | ",
                    file=output_file,
                )
            else:
                print(" |", end=" ", file=output_file)
        line = (
            " ".join(
                f"{self.feat_set.vectorizer.vocabulary_[field] + 1}={self._sanitize(field)}"
                for field, value in feat_dict.items()
                if Decimal(value) != 0
            )
            if self.feat_set.vectorizer
            else ""
        )
        print(line, file=output_file)
# Constants
EXT_TO_WRITER = {
    ".arff": ARFFWriter,
    ".csv": CSVWriter,
    ".jsonlines": NDJWriter,
    ".libsvm": LibSVMWriter,
    ".ndj": NDJWriter,
    ".tsv": TSVWriter,
}