first commit

This commit is contained in:
Carla Floricel
2022-08-02 09:52:52 -04:00
parent 417ea8660b
commit 05e52aa52b
10444 changed files with 2300232 additions and 0 deletions

View File

@@ -0,0 +1,70 @@
"""
The :mod:`sklearn.preprocessing` module includes scaling, centering,
normalization, binarization methods.
"""
from ._function_transformer import FunctionTransformer
from ._data import Binarizer
from ._data import KernelCenterer
from ._data import MinMaxScaler
from ._data import MaxAbsScaler
from ._data import Normalizer
from ._data import RobustScaler
from ._data import StandardScaler
from ._data import QuantileTransformer
from ._data import add_dummy_feature
from ._data import binarize
from ._data import normalize
from ._data import scale
from ._data import robust_scale
from ._data import maxabs_scale
from ._data import minmax_scale
from ._data import quantile_transform
from ._data import power_transform
from ._data import PowerTransformer
from ._encoders import OneHotEncoder
from ._encoders import OrdinalEncoder
from ._label import label_binarize
from ._label import LabelBinarizer
from ._label import LabelEncoder
from ._label import MultiLabelBinarizer
from ._discretization import KBinsDiscretizer
from ._polynomial import PolynomialFeatures
from ._polynomial import SplineTransformer
__all__ = [
"Binarizer",
"FunctionTransformer",
"KBinsDiscretizer",
"KernelCenterer",
"LabelBinarizer",
"LabelEncoder",
"MultiLabelBinarizer",
"MinMaxScaler",
"MaxAbsScaler",
"QuantileTransformer",
"Normalizer",
"OneHotEncoder",
"OrdinalEncoder",
"PowerTransformer",
"RobustScaler",
"SplineTransformer",
"StandardScaler",
"add_dummy_feature",
"PolynomialFeatures",
"binarize",
"normalize",
"scale",
"robust_scale",
"maxabs_scale",
"minmax_scale",
"label_binarize",
"quantile_transform",
"power_transform",
]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,454 @@
# Author: Henry Lin <hlin117@gmail.com>
# Tom Dupré la Tour
# License: BSD
import numbers
import numpy as np
import warnings
from . import OneHotEncoder
from ..base import BaseEstimator, TransformerMixin
from ..utils.validation import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import check_random_state
from ..utils.validation import _check_feature_names_in
from ..utils.validation import check_scalar
from ..utils import _safe_indexing
class KBinsDiscretizer(TransformerMixin, BaseEstimator):
"""
Bin continuous data into intervals.
Read more in the :ref:`User Guide <preprocessing_discretization>`.
.. versionadded:: 0.20
Parameters
----------
n_bins : int or array-like of shape (n_features,), default=5
The number of bins to produce. Raises ValueError if ``n_bins < 2``.
encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
Method used to encode the transformed result.
- 'onehot': Encode the transformed result with one-hot encoding
and return a sparse matrix. Ignored features are always
stacked to the right.
- 'onehot-dense': Encode the transformed result with one-hot encoding
and return a dense array. Ignored features are always
stacked to the right.
- 'ordinal': Return the bin identifier encoded as an integer value.
strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
Strategy used to define the widths of the bins.
- 'uniform': All bins in each feature have identical widths.
- 'quantile': All bins in each feature have the same number of points.
- 'kmeans': Values in each bin have the same nearest center of a 1D
k-means cluster.
dtype : {np.float32, np.float64}, default=None
The desired data-type for the output. If None, output dtype is
consistent with input dtype. Only np.float32 and np.float64 are
supported.
.. versionadded:: 0.24
subsample : int or None (default='warn')
Maximum number of samples, used to fit the model, for computational
efficiency. Used when `strategy="quantile"`.
`subsample=None` means that all the training samples are used when
computing the quantiles that determine the binning thresholds.
Since quantile computation relies on sorting each column of `X` and
that sorting has an `n log(n)` time complexity,
it is recommended to use subsampling on datasets with a
very large number of samples.
.. deprecated:: 1.1
In version 1.3 and onwards, `subsample=2e5` will be the default.
random_state : int, RandomState instance or None, default=None
Determines random number generation for subsampling.
Pass an int for reproducible results across multiple function calls.
See the `subsample` parameter for more details.
See :term:`Glossary <random_state>`.
.. versionadded:: 1.1
Attributes
----------
bin_edges_ : ndarray of ndarray of shape (n_features,)
The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
Ignored features will have empty arrays.
n_bins_ : ndarray of shape (n_features,), dtype=np.int_
Number of bins per feature. Bins whose width are too small
(i.e., <= 1e-8) are removed with a warning.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
Binarizer : Class used to bin values as ``0`` or
``1`` based on a parameter ``threshold``.
Notes
-----
In bin edges for feature ``i``, the first and last values are used only for
``inverse_transform``. During transform, bin edges are extended to::
np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])
You can combine ``KBinsDiscretizer`` with
:class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
part of the features.
``KBinsDiscretizer`` might produce constant features (e.g., when
``encode = 'onehot'`` and certain bins do not contain any data).
These features can be removed with feature selection algorithms
(e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).
Examples
--------
>>> from sklearn.preprocessing import KBinsDiscretizer
>>> X = [[-2, 1, -4, -1],
... [-1, 2, -3, -0.5],
... [ 0, 3, -2, 0.5],
... [ 1, 4, -1, 2]]
>>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
>>> est.fit(X)
KBinsDiscretizer(...)
>>> Xt = est.transform(X)
>>> Xt # doctest: +SKIP
array([[ 0., 0., 0., 0.],
[ 1., 1., 1., 0.],
[ 2., 2., 2., 1.],
[ 2., 2., 2., 2.]])
Sometimes it may be useful to convert the data back into the original
feature space. The ``inverse_transform`` function converts the binned
data into the original feature space. Each value will be equal to the mean
of the two bin edges.
>>> est.bin_edges_[0]
array([-2., -1., 0., 1.])
>>> est.inverse_transform(Xt)
array([[-1.5, 1.5, -3.5, -0.5],
[-0.5, 2.5, -2.5, -0.5],
[ 0.5, 3.5, -1.5, 0.5],
[ 0.5, 3.5, -1.5, 1.5]])
"""
def __init__(
self,
n_bins=5,
*,
encode="onehot",
strategy="quantile",
dtype=None,
subsample="warn",
random_state=None,
):
self.n_bins = n_bins
self.encode = encode
self.strategy = strategy
self.dtype = dtype
self.subsample = subsample
self.random_state = random_state
def fit(self, X, y=None):
"""
Fit the estimator.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data to be discretized.
y : None
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.
Returns
-------
self : object
Returns the instance itself.
"""
X = self._validate_data(X, dtype="numeric")
supported_dtype = (np.float64, np.float32)
if self.dtype in supported_dtype:
output_dtype = self.dtype
elif self.dtype is None:
output_dtype = X.dtype
else:
raise ValueError(
"Valid options for 'dtype' are "
f"{supported_dtype + (None,)}. Got dtype={self.dtype} "
" instead."
)
n_samples, n_features = X.shape
if self.strategy == "quantile" and self.subsample is not None:
if self.subsample == "warn":
if n_samples > 2e5:
warnings.warn(
"In version 1.3 onwards, subsample=2e5 "
"will be used by default. Set subsample explicitly to "
"silence this warning in the mean time. Set "
"subsample=None to disable subsampling explicitly.",
FutureWarning,
)
else:
self.subsample = check_scalar(
self.subsample, "subsample", numbers.Integral, min_val=1
)
rng = check_random_state(self.random_state)
if n_samples > self.subsample:
subsample_idx = rng.choice(
n_samples, size=self.subsample, replace=False
)
X = _safe_indexing(X, subsample_idx)
elif self.strategy != "quantile" and isinstance(
self.subsample, numbers.Integral
):
raise ValueError(
f"Invalid parameter for `strategy`: {self.strategy}. "
'`subsample` must be used with `strategy="quantile"`.'
)
valid_encode = ("onehot", "onehot-dense", "ordinal")
if self.encode not in valid_encode:
raise ValueError(
"Valid options for 'encode' are {}. Got encode={!r} instead.".format(
valid_encode, self.encode
)
)
valid_strategy = ("uniform", "quantile", "kmeans")
if self.strategy not in valid_strategy:
raise ValueError(
"Valid options for 'strategy' are {}. "
"Got strategy={!r} instead.".format(valid_strategy, self.strategy)
)
n_features = X.shape[1]
n_bins = self._validate_n_bins(n_features)
bin_edges = np.zeros(n_features, dtype=object)
for jj in range(n_features):
column = X[:, jj]
col_min, col_max = column.min(), column.max()
if col_min == col_max:
warnings.warn(
"Feature %d is constant and will be replaced with 0." % jj
)
n_bins[jj] = 1
bin_edges[jj] = np.array([-np.inf, np.inf])
continue
if self.strategy == "uniform":
bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)
elif self.strategy == "quantile":
quantiles = np.linspace(0, 100, n_bins[jj] + 1)
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
elif self.strategy == "kmeans":
from ..cluster import KMeans # fixes import loops
# Deterministic initialization with uniform spacing
uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5
# 1D k-means procedure
km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
centers = km.fit(column[:, None]).cluster_centers_[:, 0]
# Must sort, centers may be unsorted even with sorted init
centers.sort()
bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]
# Remove bins whose width are too small (i.e., <= 1e-8)
if self.strategy in ("quantile", "kmeans"):
mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
bin_edges[jj] = bin_edges[jj][mask]
if len(bin_edges[jj]) - 1 != n_bins[jj]:
warnings.warn(
"Bins whose width are too small (i.e., <= "
"1e-8) in feature %d are removed. Consider "
"decreasing the number of bins." % jj
)
n_bins[jj] = len(bin_edges[jj]) - 1
self.bin_edges_ = bin_edges
self.n_bins_ = n_bins
if "onehot" in self.encode:
self._encoder = OneHotEncoder(
categories=[np.arange(i) for i in self.n_bins_],
sparse=self.encode == "onehot",
dtype=output_dtype,
)
# Fit the OneHotEncoder with toy datasets
# so that it's ready for use after the KBinsDiscretizer is fitted
self._encoder.fit(np.zeros((1, len(self.n_bins_))))
return self
def _validate_n_bins(self, n_features):
"""Returns n_bins_, the number of bins per feature."""
orig_bins = self.n_bins
if isinstance(orig_bins, numbers.Number):
if not isinstance(orig_bins, numbers.Integral):
raise ValueError(
"{} received an invalid n_bins type. "
"Received {}, expected int.".format(
KBinsDiscretizer.__name__, type(orig_bins).__name__
)
)
if orig_bins < 2:
raise ValueError(
"{} received an invalid number "
"of bins. Received {}, expected at least 2.".format(
KBinsDiscretizer.__name__, orig_bins
)
)
return np.full(n_features, orig_bins, dtype=int)
n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)
if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
raise ValueError("n_bins must be a scalar or array of shape (n_features,).")
bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)
violating_indices = np.where(bad_nbins_value)[0]
if violating_indices.shape[0] > 0:
indices = ", ".join(str(i) for i in violating_indices)
raise ValueError(
"{} received an invalid number "
"of bins at indices {}. Number of bins "
"must be at least 2, and must be an int.".format(
KBinsDiscretizer.__name__, indices
)
)
return n_bins
def transform(self, X):
"""
Discretize the data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data to be discretized.
Returns
-------
Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
Data in the binned space. Will be a sparse matrix if
`self.encode='onehot'` and ndarray otherwise.
"""
check_is_fitted(self)
# check input and attribute dtypes
dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)
bin_edges = self.bin_edges_
for jj in range(Xt.shape[1]):
Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")
if self.encode == "ordinal":
return Xt
dtype_init = None
if "onehot" in self.encode:
dtype_init = self._encoder.dtype
self._encoder.dtype = Xt.dtype
try:
Xt_enc = self._encoder.transform(Xt)
finally:
# revert the initial dtype to avoid modifying self.
self._encoder.dtype = dtype_init
return Xt_enc
def inverse_transform(self, Xt):
"""
Transform discretized data back to original feature space.
Note that this function does not regenerate the original data
due to discretization rounding.
Parameters
----------
Xt : array-like of shape (n_samples, n_features)
Transformed data in the binned space.
Returns
-------
Xinv : ndarray, dtype={np.float32, np.float64}
Data in the original feature space.
"""
check_is_fitted(self)
if "onehot" in self.encode:
Xt = self._encoder.inverse_transform(Xt)
Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
n_features = self.n_bins_.shape[0]
if Xinv.shape[1] != n_features:
raise ValueError(
"Incorrect number of features. Expecting {}, received {}.".format(
n_features, Xinv.shape[1]
)
)
for jj in range(n_features):
bin_edges = self.bin_edges_[jj]
bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]
return Xinv
def get_feature_names_out(self, input_features=None):
"""Get output feature names.
Parameters
----------
input_features : array-like of str or None, default=None
Input features.
- If `input_features` is `None`, then `feature_names_in_` is
used as feature names in. If `feature_names_in_` is not defined,
then the following input feature names are generated:
`["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
- If `input_features` is an array-like, then `input_features` must
match `feature_names_in_` if `feature_names_in_` is defined.
Returns
-------
feature_names_out : ndarray of str objects
Transformed feature names.
"""
input_features = _check_feature_names_in(self, input_features)
if hasattr(self, "_encoder"):
return self._encoder.get_feature_names_out(input_features)
# ordinal encoding
return input_features

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,299 @@
import warnings
import numpy as np
from ..base import BaseEstimator, TransformerMixin
from ..utils.metaestimators import available_if
from ..utils.validation import (
_allclose_dense_sparse,
_check_feature_names_in,
check_array,
)
def _identity(X):
"""The identity function."""
return X
class FunctionTransformer(TransformerMixin, BaseEstimator):
"""Constructs a transformer from an arbitrary callable.
A FunctionTransformer forwards its X (and optionally y) arguments to a
user-defined function or function object and returns the result of this
function. This is useful for stateless transformations such as taking the
log of frequencies, doing custom scaling, etc.
Note: If a lambda is used as the function, then the resulting
transformer will not be pickleable.
.. versionadded:: 0.17
Read more in the :ref:`User Guide <function_transformer>`.
Parameters
----------
func : callable, default=None
The callable to use for the transformation. This will be passed
the same arguments as transform, with args and kwargs forwarded.
If func is None, then func will be the identity function.
inverse_func : callable, default=None
The callable to use for the inverse transformation. This will be
passed the same arguments as inverse transform, with args and
kwargs forwarded. If inverse_func is None, then inverse_func
will be the identity function.
validate : bool, default=False
Indicate that the input X array should be checked before calling
``func``. The possibilities are:
- If False, there is no input validation.
- If True, then X will be converted to a 2-dimensional NumPy array or
sparse matrix. If the conversion is not possible an exception is
raised.
.. versionchanged:: 0.22
The default of ``validate`` changed from True to False.
accept_sparse : bool, default=False
Indicate that func accepts a sparse matrix as input. If validate is
False, this has no effect. Otherwise, if accept_sparse is false,
sparse matrix inputs will cause an exception to be raised.
check_inverse : bool, default=True
Whether to check that or ``func`` followed by ``inverse_func`` leads to
the original inputs. It can be used for a sanity check, raising a
warning when the condition is not fulfilled.
.. versionadded:: 0.20
feature_names_out : callable, 'one-to-one' or None, default=None
Determines the list of feature names that will be returned by the
`get_feature_names_out` method. If it is 'one-to-one', then the output
feature names will be equal to the input feature names. If it is a
callable, then it must take two positional arguments: this
`FunctionTransformer` (`self`) and an array-like of input feature names
(`input_features`). It must return an array-like of output feature
names. The `get_feature_names_out` method is only defined if
`feature_names_out` is not None.
See ``get_feature_names_out`` for more details.
.. versionadded:: 1.1
kw_args : dict, default=None
Dictionary of additional keyword arguments to pass to func.
.. versionadded:: 0.18
inv_kw_args : dict, default=None
Dictionary of additional keyword arguments to pass to inverse_func.
.. versionadded:: 0.18
Attributes
----------
n_features_in_ : int
Number of features seen during :term:`fit`. Defined only when
`validate=True`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `validate=True`
and `X` has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
MaxAbsScaler : Scale each feature by its maximum absolute value.
StandardScaler : Standardize features by removing the mean and
scaling to unit variance.
LabelBinarizer : Binarize labels in a one-vs-all fashion.
MultiLabelBinarizer : Transform between iterable of iterables
and a multilabel format.
Examples
--------
>>> import numpy as np
>>> from sklearn.preprocessing import FunctionTransformer
>>> transformer = FunctionTransformer(np.log1p)
>>> X = np.array([[0, 1], [2, 3]])
>>> transformer.transform(X)
array([[0. , 0.6931...],
[1.0986..., 1.3862...]])
"""
def __init__(
self,
func=None,
inverse_func=None,
*,
validate=False,
accept_sparse=False,
check_inverse=True,
feature_names_out=None,
kw_args=None,
inv_kw_args=None,
):
self.func = func
self.inverse_func = inverse_func
self.validate = validate
self.accept_sparse = accept_sparse
self.check_inverse = check_inverse
self.feature_names_out = feature_names_out
self.kw_args = kw_args
self.inv_kw_args = inv_kw_args
def _check_input(self, X, *, reset):
if self.validate:
return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)
return X
def _check_inverse_transform(self, X):
"""Check that func and inverse_func are the inverse."""
idx_selected = slice(None, None, max(1, X.shape[0] // 100))
X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))
if not np.issubdtype(X.dtype, np.number):
raise ValueError(
"'check_inverse' is only supported when all the elements in `X` is"
" numerical."
)
if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
warnings.warn(
"The provided functions are not strictly"
" inverse of each other. If you are sure you"
" want to proceed regardless, set"
" 'check_inverse=False'.",
UserWarning,
)
def fit(self, X, y=None):
"""Fit transformer by checking X.
If ``validate`` is ``True``, ``X`` will be checked.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Input array.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : object
FunctionTransformer class instance.
"""
X = self._check_input(X, reset=True)
if self.check_inverse and not (self.func is None or self.inverse_func is None):
self._check_inverse_transform(X)
return self
def transform(self, X):
"""Transform X using the forward function.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Input array.
Returns
-------
X_out : array-like, shape (n_samples, n_features)
Transformed input.
"""
X = self._check_input(X, reset=False)
return self._transform(X, func=self.func, kw_args=self.kw_args)
def inverse_transform(self, X):
"""Transform X using the inverse function.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Input array.
Returns
-------
X_out : array-like, shape (n_samples, n_features)
Transformed input.
"""
if self.validate:
X = check_array(X, accept_sparse=self.accept_sparse)
return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)
@available_if(lambda self: self.feature_names_out is not None)
def get_feature_names_out(self, input_features=None):
"""Get output feature names for transformation.
This method is only defined if `feature_names_out` is not None.
Parameters
----------
input_features : array-like of str or None, default=None
Input feature names.
- If `input_features` is None, then `feature_names_in_` is
used as the input feature names. If `feature_names_in_` is not
defined, then names are generated:
`[x0, x1, ..., x(n_features_in_ - 1)]`.
- If `input_features` is array-like, then `input_features` must
match `feature_names_in_` if `feature_names_in_` is defined.
Returns
-------
feature_names_out : ndarray of str objects
Transformed feature names.
- If `feature_names_out` is 'one-to-one', the input feature names
are returned (see `input_features` above). This requires
`feature_names_in_` and/or `n_features_in_` to be defined, which
is done automatically if `validate=True`. Alternatively, you can
set them in `func`.
- If `feature_names_out` is a callable, then it is called with two
arguments, `self` and `input_features`, and its return value is
returned by this method.
"""
if hasattr(self, "n_features_in_") or input_features is not None:
input_features = _check_feature_names_in(self, input_features)
if self.feature_names_out == "one-to-one":
if input_features is None:
raise ValueError(
"When 'feature_names_out' is 'one-to-one', either "
"'input_features' must be passed, or 'feature_names_in_' "
"and/or 'n_features_in_' must be defined. If you set "
"'validate' to 'True', then they will be defined "
"automatically when 'fit' is called. Alternatively, you "
"can set them in 'func'."
)
names_out = input_features
elif callable(self.feature_names_out):
names_out = self.feature_names_out(self, input_features)
else:
raise ValueError(
f"feature_names_out={self.feature_names_out!r} is invalid. "
'It must either be "one-to-one" or a callable with two '
"arguments: the function transformer and an array-like of "
"input feature names. The callable must return an array-like "
"of output feature names."
)
return np.asarray(names_out, dtype=object)
def _transform(self, X, func=None, kw_args=None):
if func is None:
func = _identity
return func(X, **(kw_args if kw_args else {}))
def __sklearn_is_fitted__(self):
"""Return True since FunctionTransfomer is stateless."""
return True
def _more_tags(self):
return {"no_validation": not self.validate, "stateless": True}

View File

@@ -0,0 +1,929 @@
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Mathieu Blondel <mathieu@mblondel.org>
# Olivier Grisel <olivier.grisel@ensta.org>
# Andreas Mueller <amueller@ais.uni-bonn.de>
# Joel Nothman <joel.nothman@gmail.com>
# Hamzeh Alsalhi <ha258@cornell.edu>
# License: BSD 3 clause
from collections import defaultdict
import itertools
import array
import warnings
import numpy as np
import scipy.sparse as sp
from ..base import BaseEstimator, TransformerMixin
from ..utils.sparsefuncs import min_max_axis
from ..utils import column_or_1d
from ..utils.validation import _num_samples, check_array, check_is_fitted
from ..utils.multiclass import unique_labels
from ..utils.multiclass import type_of_target
from ..utils._encode import _encode, _unique
__all__ = [
"label_binarize",
"LabelBinarizer",
"LabelEncoder",
"MultiLabelBinarizer",
]
class LabelEncoder(TransformerMixin, BaseEstimator):
"""Encode target labels with value between 0 and n_classes-1.
This transformer should be used to encode target values, *i.e.* `y`, and
not the input `X`.
Read more in the :ref:`User Guide <preprocessing_targets>`.
.. versionadded:: 0.12
Attributes
----------
classes_ : ndarray of shape (n_classes,)
Holds the label for each class.
See Also
--------
OrdinalEncoder : Encode categorical features using an ordinal encoding
scheme.
OneHotEncoder : Encode categorical features as a one-hot numeric array.
Examples
--------
`LabelEncoder` can be used to normalize labels.
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2]...)
>>> le.inverse_transform([0, 0, 1, 2])
array([1, 1, 2, 6])
It can also be used to transform non-numerical labels (as long as they are
hashable and comparable) to numerical labels.
>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1]...)
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']
"""
def fit(self, y):
"""Fit label encoder.
Parameters
----------
y : array-like of shape (n_samples,)
Target values.
Returns
-------
self : returns an instance of self.
Fitted label encoder.
"""
y = column_or_1d(y, warn=True)
self.classes_ = _unique(y)
return self
def fit_transform(self, y):
"""Fit label encoder and return encoded labels.
Parameters
----------
y : array-like of shape (n_samples,)
Target values.
Returns
-------
y : array-like of shape (n_samples,)
Encoded labels.
"""
y = column_or_1d(y, warn=True)
self.classes_, y = _unique(y, return_inverse=True)
return y
def transform(self, y):
"""Transform labels to normalized encoding.
Parameters
----------
y : array-like of shape (n_samples,)
Target values.
Returns
-------
y : array-like of shape (n_samples,)
Labels as normalized encodings.
"""
check_is_fitted(self)
y = column_or_1d(y, warn=True)
# transform of empty array is empty array
if _num_samples(y) == 0:
return np.array([])
return _encode(y, uniques=self.classes_)
def inverse_transform(self, y):
"""Transform labels back to original encoding.
Parameters
----------
y : ndarray of shape (n_samples,)
Target values.
Returns
-------
y : ndarray of shape (n_samples,)
Original encoding.
"""
check_is_fitted(self)
y = column_or_1d(y, warn=True)
# inverse transform of empty array is empty array
if _num_samples(y) == 0:
return np.array([])
diff = np.setdiff1d(y, np.arange(len(self.classes_)))
if len(diff):
raise ValueError("y contains previously unseen labels: %s" % str(diff))
y = np.asarray(y)
return self.classes_[y]
def _more_tags(self):
return {"X_types": ["1dlabels"]}
class LabelBinarizer(TransformerMixin, BaseEstimator):
"""Binarize labels in a one-vs-all fashion.
Several regression and binary classification algorithms are
available in scikit-learn. A simple way to extend these algorithms
to the multi-class classification case is to use the so-called
one-vs-all scheme.
At learning time, this simply consists in learning one regressor
or binary classifier per class. In doing so, one needs to convert
multi-class labels to binary labels (belong or does not belong
to the class). LabelBinarizer makes this process easy with the
transform method.
At prediction time, one assigns the class for which the corresponding
model gave the greatest confidence. LabelBinarizer makes this easy
with the inverse_transform method.
Read more in the :ref:`User Guide <preprocessing_targets>`.
Parameters
----------
neg_label : int, default=0
Value with which negative labels must be encoded.
pos_label : int, default=1
Value with which positive labels must be encoded.
sparse_output : bool, default=False
True if the returned array from transform is desired to be in sparse
CSR format.
Attributes
----------
classes_ : ndarray of shape (n_classes,)
Holds the label for each class.
y_type_ : str
Represents the type of the target data as evaluated by
utils.multiclass.type_of_target. Possible type are 'continuous',
'continuous-multioutput', 'binary', 'multiclass',
'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.
sparse_input_ : bool
True if the input data to transform is given as a sparse matrix, False
otherwise.
See Also
--------
label_binarize : Function to perform the transform operation of
LabelBinarizer with fixed classes.
OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
scheme.
Examples
--------
>>> from sklearn import preprocessing
>>> lb = preprocessing.LabelBinarizer()
>>> lb.fit([1, 2, 6, 4, 2])
LabelBinarizer()
>>> lb.classes_
array([1, 2, 4, 6])
>>> lb.transform([1, 6])
array([[1, 0, 0, 0],
[0, 0, 0, 1]])
Binary targets transform to a column vector
>>> lb = preprocessing.LabelBinarizer()
>>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
array([[1],
[0],
[0],
[1]])
Passing a 2D matrix for multilabel classification
>>> import numpy as np
>>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
LabelBinarizer()
>>> lb.classes_
array([0, 1, 2])
>>> lb.transform([0, 1, 2, 1])
array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
[0, 1, 0]])
"""
def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
self.neg_label = neg_label
self.pos_label = pos_label
self.sparse_output = sparse_output
def fit(self, y):
"""Fit label binarizer.
Parameters
----------
y : ndarray of shape (n_samples,) or (n_samples, n_classes)
Target values. The 2-d matrix should only contain 0 and 1,
represents multilabel classification.
Returns
-------
self : object
Returns the instance itself.
"""
if self.neg_label >= self.pos_label:
raise ValueError(
f"neg_label={self.neg_label} must be strictly less than "
f"pos_label={self.pos_label}."
)
if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0):
raise ValueError(
"Sparse binarization is only supported with non "
"zero pos_label and zero neg_label, got "
f"pos_label={self.pos_label} and neg_label={self.neg_label}"
)
self.y_type_ = type_of_target(y, input_name="y")
if "multioutput" in self.y_type_:
raise ValueError(
"Multioutput target data is not supported with label binarization"
)
if _num_samples(y) == 0:
raise ValueError("y has 0 samples: %r" % y)
self.sparse_input_ = sp.issparse(y)
self.classes_ = unique_labels(y)
return self
def fit_transform(self, y):
"""Fit label binarizer/transform multi-class labels to binary labels.
The output of transform is sometimes referred to as
the 1-of-K coding scheme.
Parameters
----------
y : {ndarray, sparse matrix} of shape (n_samples,) or \
(n_samples, n_classes)
Target values. The 2-d matrix should only contain 0 and 1,
represents multilabel classification. Sparse matrix can be
CSR, CSC, COO, DOK, or LIL.
Returns
-------
Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
Shape will be (n_samples, 1) for binary problems. Sparse matrix
will be of CSR format.
"""
return self.fit(y).transform(y)
def transform(self, y):
"""Transform multi-class labels to binary labels.
The output of transform is sometimes referred to by some authors as
the 1-of-K coding scheme.
Parameters
----------
y : {array, sparse matrix} of shape (n_samples,) or \
(n_samples, n_classes)
Target values. The 2-d matrix should only contain 0 and 1,
represents multilabel classification. Sparse matrix can be
CSR, CSC, COO, DOK, or LIL.
Returns
-------
Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
Shape will be (n_samples, 1) for binary problems. Sparse matrix
will be of CSR format.
"""
check_is_fitted(self)
y_is_multilabel = type_of_target(y).startswith("multilabel")
if y_is_multilabel and not self.y_type_.startswith("multilabel"):
raise ValueError("The object was not fitted with multilabel input.")
return label_binarize(
y,
classes=self.classes_,
pos_label=self.pos_label,
neg_label=self.neg_label,
sparse_output=self.sparse_output,
)
def inverse_transform(self, Y, threshold=None):
"""Transform binary labels back to multi-class labels.
Parameters
----------
Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
Target values. All sparse matrices are converted to CSR before
inverse transformation.
threshold : float, default=None
Threshold used in the binary and multi-label cases.
Use 0 when ``Y`` contains the output of decision_function
(classifier).
Use 0.5 when ``Y`` contains the output of predict_proba.
If None, the threshold is assumed to be half way between
neg_label and pos_label.
Returns
-------
y : {ndarray, sparse matrix} of shape (n_samples,)
Target values. Sparse matrix will be of CSR format.
Notes
-----
In the case when the binary labels are fractional
(probabilistic), inverse_transform chooses the class with the
greatest value. Typically, this allows to use the output of a
linear model's decision_function method directly as the input
of inverse_transform.
"""
check_is_fitted(self)
if threshold is None:
threshold = (self.pos_label + self.neg_label) / 2.0
if self.y_type_ == "multiclass":
y_inv = _inverse_binarize_multiclass(Y, self.classes_)
else:
y_inv = _inverse_binarize_thresholding(
Y, self.y_type_, self.classes_, threshold
)
if self.sparse_input_:
y_inv = sp.csr_matrix(y_inv)
elif sp.issparse(y_inv):
y_inv = y_inv.toarray()
return y_inv
def _more_tags(self):
return {"X_types": ["1dlabels"]}
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
"""Binarize labels in a one-vs-all fashion.
Several regression and binary classification algorithms are
available in scikit-learn. A simple way to extend these algorithms
to the multi-class classification case is to use the so-called
one-vs-all scheme.
This function makes it possible to compute this transformation for a
fixed set of class labels known ahead of time.
Parameters
----------
y : array-like
Sequence of integer labels or multilabel data to encode.
classes : array-like of shape (n_classes,)
Uniquely holds the label for each class.
neg_label : int, default=0
Value with which negative labels must be encoded.
pos_label : int, default=1
Value with which positive labels must be encoded.
sparse_output : bool, default=False,
Set to true if output binary array is desired in CSR sparse format.
Returns
-------
Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
Shape will be (n_samples, 1) for binary problems. Sparse matrix will
be of CSR format.
Examples
--------
>>> from sklearn.preprocessing import label_binarize
>>> label_binarize([1, 6], classes=[1, 2, 4, 6])
array([[1, 0, 0, 0],
[0, 0, 0, 1]])
The class ordering is preserved:
>>> label_binarize([1, 6], classes=[1, 6, 4, 2])
array([[1, 0, 0, 0],
[0, 1, 0, 0]])
Binary targets transform to a column vector
>>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
array([[1],
[0],
[0],
[1]])
See Also
--------
LabelBinarizer : Class used to wrap the functionality of label_binarize and
allow for fitting to classes independently of the transform operation.
"""
if not isinstance(y, list):
# XXX Workaround that will be removed when list of list format is
# dropped
y = check_array(
y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
)
else:
if _num_samples(y) == 0:
raise ValueError("y has 0 samples: %r" % y)
if neg_label >= pos_label:
raise ValueError(
"neg_label={0} must be strictly less than pos_label={1}.".format(
neg_label, pos_label
)
)
if sparse_output and (pos_label == 0 or neg_label != 0):
raise ValueError(
"Sparse binarization is only supported with non "
"zero pos_label and zero neg_label, got "
"pos_label={0} and neg_label={1}"
"".format(pos_label, neg_label)
)
# To account for pos_label == 0 in the dense case
pos_switch = pos_label == 0
if pos_switch:
pos_label = -neg_label
y_type = type_of_target(y)
if "multioutput" in y_type:
raise ValueError(
"Multioutput target data is not supported with label binarization"
)
if y_type == "unknown":
raise ValueError("The type of target data is not known")
n_samples = y.shape[0] if sp.issparse(y) else len(y)
n_classes = len(classes)
classes = np.asarray(classes)
if y_type == "binary":
if n_classes == 1:
if sparse_output:
return sp.csr_matrix((n_samples, 1), dtype=int)
else:
Y = np.zeros((len(y), 1), dtype=int)
Y += neg_label
return Y
elif len(classes) >= 3:
y_type = "multiclass"
sorted_class = np.sort(classes)
if y_type == "multilabel-indicator":
y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0])
if classes.size != y_n_classes:
raise ValueError(
"classes {0} mismatch with the labels {1} found in the data".format(
classes, unique_labels(y)
)
)
if y_type in ("binary", "multiclass"):
y = column_or_1d(y)
# pick out the known labels from y
y_in_classes = np.in1d(y, classes)
y_seen = y[y_in_classes]
indices = np.searchsorted(sorted_class, y_seen)
indptr = np.hstack((0, np.cumsum(y_in_classes)))
data = np.empty_like(indices)
data.fill(pos_label)
Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes))
elif y_type == "multilabel-indicator":
Y = sp.csr_matrix(y)
if pos_label != 1:
data = np.empty_like(Y.data)
data.fill(pos_label)
Y.data = data
else:
raise ValueError(
"%s target data is not supported with label binarization" % y_type
)
if not sparse_output:
Y = Y.toarray()
Y = Y.astype(int, copy=False)
if neg_label != 0:
Y[Y == 0] = neg_label
if pos_switch:
Y[Y == pos_label] = 0
else:
Y.data = Y.data.astype(int, copy=False)
# preserve label ordering
if np.any(classes != sorted_class):
indices = np.searchsorted(sorted_class, classes)
Y = Y[:, indices]
if y_type == "binary":
if sparse_output:
Y = Y.getcol(-1)
else:
Y = Y[:, -1].reshape((-1, 1))
return Y
def _inverse_binarize_multiclass(y, classes):
"""Inverse label binarization transformation for multiclass.
Multiclass uses the maximal score instead of a threshold.
"""
classes = np.asarray(classes)
if sp.issparse(y):
# Find the argmax for each row in y where y is a CSR matrix
y = y.tocsr()
n_samples, n_outputs = y.shape
outputs = np.arange(n_outputs)
row_max = min_max_axis(y, 1)[1]
row_nnz = np.diff(y.indptr)
y_data_repeated_max = np.repeat(row_max, row_nnz)
# picks out all indices obtaining the maximum per row
y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)
# For corner case where last row has a max of 0
if row_max[-1] == 0:
y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])
# Gets the index of the first argmax in each row from y_i_all_argmax
index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
# first argmax of each row
y_ind_ext = np.append(y.indices, [0])
y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
# Handle rows of all 0
y_i_argmax[np.where(row_nnz == 0)[0]] = 0
# Handles rows with max of 0 that contain negative numbers
samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
for i in samples:
ind = y.indices[y.indptr[i] : y.indptr[i + 1]]
y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]
return classes[y_i_argmax]
else:
return classes.take(y.argmax(axis=1), mode="clip")
def _inverse_binarize_thresholding(y, output_type, classes, threshold):
"""Inverse label binarization transformation using thresholding."""
if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape))
if output_type != "binary" and y.shape[1] != len(classes):
raise ValueError(
"The number of class is not equal to the number of dimension of y."
)
classes = np.asarray(classes)
# Perform thresholding
if sp.issparse(y):
if threshold > 0:
if y.format not in ("csr", "csc"):
y = y.tocsr()
y.data = np.array(y.data > threshold, dtype=int)
y.eliminate_zeros()
else:
y = np.array(y.toarray() > threshold, dtype=int)
else:
y = np.array(y > threshold, dtype=int)
# Inverse transform data
if output_type == "binary":
if sp.issparse(y):
y = y.toarray()
if y.ndim == 2 and y.shape[1] == 2:
return classes[y[:, 1]]
else:
if len(classes) == 1:
return np.repeat(classes[0], len(y))
else:
return classes[y.ravel()]
elif output_type == "multilabel-indicator":
return y
else:
raise ValueError("{0} format is not supported".format(output_type))
class MultiLabelBinarizer(TransformerMixin, BaseEstimator):
"""Transform between iterable of iterables and a multilabel format.
Although a list of sets or tuples is a very intuitive format for multilabel
data, it is unwieldy to process. This transformer converts between this
intuitive format and the supported multilabel format: a (samples x classes)
binary matrix indicating the presence of a class label.
Parameters
----------
classes : array-like of shape (n_classes,), default=None
Indicates an ordering for the class labels.
All entries should be unique (cannot contain duplicate classes).
sparse_output : bool, default=False
Set to True if output binary array is desired in CSR sparse format.
Attributes
----------
classes_ : ndarray of shape (n_classes,)
A copy of the `classes` parameter when provided.
Otherwise it corresponds to the sorted set of classes found
when fitting.
See Also
--------
OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
scheme.
Examples
--------
>>> from sklearn.preprocessing import MultiLabelBinarizer
>>> mlb = MultiLabelBinarizer()
>>> mlb.fit_transform([(1, 2), (3,)])
array([[1, 1, 0],
[0, 0, 1]])
>>> mlb.classes_
array([1, 2, 3])
>>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
array([[0, 1, 1],
[1, 0, 0]])
>>> list(mlb.classes_)
['comedy', 'sci-fi', 'thriller']
A common mistake is to pass in a list, which leads to the following issue:
>>> mlb = MultiLabelBinarizer()
>>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
MultiLabelBinarizer()
>>> mlb.classes_
array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
'y'], dtype=object)
To correct this, the list of labels should be passed in as:
>>> mlb = MultiLabelBinarizer()
>>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
MultiLabelBinarizer()
>>> mlb.classes_
array(['comedy', 'sci-fi', 'thriller'], dtype=object)
"""
def __init__(self, *, classes=None, sparse_output=False):
self.classes = classes
self.sparse_output = sparse_output
def fit(self, y):
"""Fit the label sets binarizer, storing :term:`classes_`.
Parameters
----------
y : iterable of iterables
A set of labels (any orderable and hashable object) for each
sample. If the `classes` parameter is set, `y` will not be
iterated.
Returns
-------
self : object
Fitted estimator.
"""
self._cached_dict = None
if self.classes is None:
classes = sorted(set(itertools.chain.from_iterable(y)))
elif len(set(self.classes)) < len(self.classes):
raise ValueError(
"The classes argument contains duplicate "
"classes. Remove these duplicates before passing "
"them to MultiLabelBinarizer."
)
else:
classes = self.classes
dtype = int if all(isinstance(c, int) for c in classes) else object
self.classes_ = np.empty(len(classes), dtype=dtype)
self.classes_[:] = classes
return self
def fit_transform(self, y):
"""Fit the label sets binarizer and transform the given label sets.
Parameters
----------
y : iterable of iterables
A set of labels (any orderable and hashable object) for each
sample. If the `classes` parameter is set, `y` will not be
iterated.
Returns
-------
y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)
A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`
is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR
format.
"""
self._cached_dict = None
if self.classes is not None:
return self.fit(y).transform(y)
# Automatically increment on new class
class_mapping = defaultdict(int)
class_mapping.default_factory = class_mapping.__len__
yt = self._transform(y, class_mapping)
# sort classes and reorder columns
tmp = sorted(class_mapping, key=class_mapping.get)
# (make safe for tuples)
dtype = int if all(isinstance(c, int) for c in tmp) else object
class_mapping = np.empty(len(tmp), dtype=dtype)
class_mapping[:] = tmp
self.classes_, inverse = np.unique(class_mapping, return_inverse=True)
# ensure yt.indices keeps its current dtype
yt.indices = np.array(inverse[yt.indices], dtype=yt.indices.dtype, copy=False)
if not self.sparse_output:
yt = yt.toarray()
return yt
def transform(self, y):
"""Transform the given label sets.
Parameters
----------
y : iterable of iterables
A set of labels (any orderable and hashable object) for each
sample. If the `classes` parameter is set, `y` will not be
iterated.
Returns
-------
y_indicator : array or CSR matrix, shape (n_samples, n_classes)
A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in
`y[i]`, and 0 otherwise.
"""
check_is_fitted(self)
class_to_index = self._build_cache()
yt = self._transform(y, class_to_index)
if not self.sparse_output:
yt = yt.toarray()
return yt
def _build_cache(self):
if self._cached_dict is None:
self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))
return self._cached_dict
def _transform(self, y, class_mapping):
"""Transforms the label sets with a given mapping.
Parameters
----------
y : iterable of iterables
A set of labels (any orderable and hashable object) for each
sample. If the `classes` parameter is set, `y` will not be
iterated.
class_mapping : Mapping
Maps from label to column index in label indicator matrix.
Returns
-------
y_indicator : sparse matrix of shape (n_samples, n_classes)
Label indicator matrix. Will be of CSR format.
"""
indices = array.array("i")
indptr = array.array("i", [0])
unknown = set()
for labels in y:
index = set()
for label in labels:
try:
index.add(class_mapping[label])
except KeyError:
unknown.add(label)
indices.extend(index)
indptr.append(len(indices))
if unknown:
warnings.warn(
"unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))
)
data = np.ones(len(indices), dtype=int)
return sp.csr_matrix(
(data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))
)
def inverse_transform(self, yt):
"""Transform the given indicator matrix into label sets.
Parameters
----------
yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)
A matrix containing only 1s ands 0s.
Returns
-------
y : list of tuples
The set of labels for each sample such that `y[i]` consists of
`classes_[j]` for each `yt[i, j] == 1`.
"""
check_is_fitted(self)
if yt.shape[1] != len(self.classes_):
raise ValueError(
"Expected indicator for {0} classes, but got {1}".format(
len(self.classes_), yt.shape[1]
)
)
if sp.issparse(yt):
yt = yt.tocsr()
if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:
raise ValueError("Expected only 0s and 1s in label indicator.")
return [
tuple(self.classes_.take(yt.indices[start:end]))
for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
]
else:
unexpected = np.setdiff1d(yt, [0, 1])
if len(unexpected) > 0:
raise ValueError(
"Expected only 0s and 1s in label indicator. Also got {0}".format(
unexpected
)
)
return [tuple(self.classes_.compress(indicators)) for indicators in yt]
def _more_tags(self):
return {"X_types": ["2dlabels"]}

View File

@@ -0,0 +1,22 @@
import os
def configuration(parent_package="", top_path=None):
import numpy
from numpy.distutils.misc_util import Configuration
config = Configuration("preprocessing", parent_package, top_path)
libraries = []
if os.name == "posix":
libraries.append("m")
config.add_extension(
"_csr_polynomial_expansion",
sources=["_csr_polynomial_expansion.pyx"],
include_dirs=[numpy.get_include()],
libraries=libraries,
)
config.add_subpackage("tests")
return config

View File

@@ -0,0 +1,183 @@
import warnings
import pytest
import numpy as np
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.base import clone
from sklearn.preprocessing import maxabs_scale
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import scale
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import robust_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
iris = load_iris()
def _get_valid_samples_by_column(X, col):
"""Get non NaN samples in column of X"""
return X[:, [col]][~np.isnan(X[:, col])]
@pytest.mark.parametrize(
"est, func, support_sparse, strictly_positive, omit_kwargs",
[
(MaxAbsScaler(), maxabs_scale, True, False, []),
(MinMaxScaler(), minmax_scale, False, False, ["clip"]),
(StandardScaler(), scale, False, False, []),
(StandardScaler(with_mean=False), scale, True, False, []),
(PowerTransformer("yeo-johnson"), power_transform, False, False, []),
(PowerTransformer("box-cox"), power_transform, False, True, []),
(QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
(RobustScaler(), robust_scale, False, False, []),
(RobustScaler(with_centering=False), robust_scale, True, False, []),
],
)
def test_missing_value_handling(
est, func, support_sparse, strictly_positive, omit_kwargs
):
# check that the preprocessing method let pass nan
rng = np.random.RandomState(42)
X = iris.data.copy()
n_missing = 50
X[
rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
] = np.nan
if strictly_positive:
X += np.nanmin(X) + 0.1
X_train, X_test = train_test_split(X, random_state=1)
# sanity check
assert not np.all(np.isnan(X_train), axis=0).any()
assert np.any(np.isnan(X_train), axis=0).all()
assert np.any(np.isnan(X_test), axis=0).all()
X_test[:, 0] = np.nan # make sure this boundary case is tested
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
Xt = est.fit(X_train).transform(X_test)
# ensure no warnings are raised
# missing values should still be missing, and only them
assert_array_equal(np.isnan(Xt), np.isnan(X_test))
# check that the function leads to the same results as the class
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
Xt_class = est.transform(X_train)
kwargs = est.get_params()
# remove the parameters which should be omitted because they
# are not defined in the counterpart function of the preprocessing class
for kwarg in omit_kwargs:
_ = kwargs.pop(kwarg)
Xt_func = func(X_train, **kwargs)
assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])
# check that the inverse transform keep NaN
Xt_inv = est.inverse_transform(Xt)
assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
# FIXME: we can introduce equal_nan=True in recent version of numpy.
# For the moment which just check that non-NaN values are almost equal.
assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])
for i in range(X.shape[1]):
# train only on non-NaN
est.fit(_get_valid_samples_by_column(X_train, i))
# check transforming with NaN works even when training without NaN
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
Xt_col = est.transform(X_test[:, [i]])
assert_allclose(Xt_col, Xt[:, [i]])
# check non-NaN is handled as before - the 1st column is all nan
if not np.isnan(X_test[:, i]).all():
Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i))
assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])
if support_sparse:
est_dense = clone(est)
est_sparse = clone(est)
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
Xt_dense = est_dense.fit(X_train).transform(X_test)
Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
for sparse_constructor in (
sparse.csr_matrix,
sparse.csc_matrix,
sparse.bsr_matrix,
sparse.coo_matrix,
sparse.dia_matrix,
sparse.dok_matrix,
sparse.lil_matrix,
):
# check that the dense and sparse inputs lead to the same results
# precompute the matrix to avoid catching side warnings
X_train_sp = sparse_constructor(X_train)
X_test_sp = sparse_constructor(X_test)
with warnings.catch_warnings():
warnings.simplefilter("ignore", PendingDeprecationWarning)
warnings.simplefilter("error", RuntimeWarning)
Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
assert_allclose(Xt_sp.A, Xt_dense)
with warnings.catch_warnings():
warnings.simplefilter("ignore", PendingDeprecationWarning)
warnings.simplefilter("error", RuntimeWarning)
Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
assert_allclose(Xt_inv_sp.A, Xt_inv_dense)
@pytest.mark.parametrize(
"est, func",
[
(MaxAbsScaler(), maxabs_scale),
(MinMaxScaler(), minmax_scale),
(StandardScaler(), scale),
(StandardScaler(with_mean=False), scale),
(PowerTransformer("yeo-johnson"), power_transform),
(
PowerTransformer("box-cox"),
power_transform,
),
(QuantileTransformer(n_quantiles=3), quantile_transform),
(RobustScaler(), robust_scale),
(RobustScaler(with_centering=False), robust_scale),
],
)
def test_missing_value_pandas_na_support(est, func):
# Test pandas IntegerArray with pd.NA
pd = pytest.importorskip("pandas")
X = np.array(
[
[1, 2, 3, np.nan, np.nan, 4, 5, 1],
[np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
[1, 2, 3, 4, 5, 6, 7, 8],
]
).T
# Creates dataframe with IntegerArrays with pd.NA
X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"])
X_df["c"] = X_df["c"].astype("int")
X_trans = est.fit_transform(X)
X_df_trans = est.fit_transform(X_df)
assert_allclose(X_trans, X_df_trans)

View File

@@ -0,0 +1,472 @@
import pytest
import numpy as np
import scipy.sparse as sp
import warnings
from sklearn import clone
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils._testing import (
assert_array_almost_equal,
assert_array_equal,
assert_allclose_dense_sparse,
)
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
@pytest.mark.parametrize(
"strategy, expected",
[
("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
],
)
def test_fit_transform(strategy, expected):
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
est.fit(X)
assert_array_equal(expected, est.transform(X))
def test_valid_n_bins():
KBinsDiscretizer(n_bins=2).fit_transform(X)
KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int)
def test_invalid_n_bins():
est = KBinsDiscretizer(n_bins=1)
err_msg = (
"KBinsDiscretizer received an invalid number of bins. Received 1, expected at"
" least 2."
)
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
est = KBinsDiscretizer(n_bins=1.1)
err_msg = (
"KBinsDiscretizer received an invalid n_bins type. Received float, expected"
" int."
)
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
def test_invalid_n_bins_array():
# Bad shape
n_bins = np.full((2, 4), 2.0)
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Incorrect number of features
n_bins = [1, 2, 2]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Bad bin values
n_bins = [1, 2, 2, 1]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = (
"KBinsDiscretizer received an invalid number of bins "
"at indices 0, 3. Number of bins must be at least 2, "
"and must be an int."
)
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Float bin values
n_bins = [2.1, 2, 2.1, 2]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = (
"KBinsDiscretizer received an invalid number of bins "
"at indices 0, 2. Number of bins must be at least 2, "
"and must be an int."
)
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
@pytest.mark.parametrize(
"strategy, expected",
[
("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
],
)
def test_fit_transform_n_bins_array(strategy, expected):
est = KBinsDiscretizer(
n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
).fit(X)
assert_array_equal(expected, est.transform(X))
# test the shape of bin_edges_
n_features = np.array(X).shape[1]
assert est.bin_edges_.shape == (n_features,)
for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
assert bin_edges.shape == (n_bins + 1,)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_same_min_max(strategy):
warnings.simplefilter("always")
X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]])
est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal")
warning_message = "Feature 0 is constant and will be replaced with 0."
with pytest.warns(UserWarning, match=warning_message):
est.fit(X)
assert est.n_bins_[0] == 1
# replace the feature with zeros
Xt = est.transform(X)
assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
def test_transform_1d_behavior():
X = np.arange(4)
est = KBinsDiscretizer(n_bins=2)
with pytest.raises(ValueError):
est.fit(X)
est = KBinsDiscretizer(n_bins=2)
est.fit(X.reshape(-1, 1))
with pytest.raises(ValueError):
est.transform(X)
@pytest.mark.parametrize("i", range(1, 9))
def test_numeric_stability(i):
X_init = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1)
Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)
# Test up to discretizing nano units
X = X_init / 10**i
Xt = KBinsDiscretizer(n_bins=2, encode="ordinal").fit_transform(X)
assert_array_equal(Xt_expected, Xt)
def test_invalid_encode_option():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="invalid-encode")
err_msg = (
r"Valid options for 'encode' are "
r"\('onehot', 'onehot-dense', 'ordinal'\). "
r"Got encode='invalid-encode' instead."
)
with pytest.raises(ValueError, match=err_msg):
est.fit(X)
def test_encode_options():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="ordinal").fit(X)
Xt_1 = est.transform(X)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="onehot-dense").fit(X)
Xt_2 = est.transform(X)
assert not sp.issparse(Xt_2)
assert_array_equal(
OneHotEncoder(
categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse=False
).fit_transform(Xt_1),
Xt_2,
)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="onehot").fit(X)
Xt_3 = est.transform(X)
assert sp.issparse(Xt_3)
assert_array_equal(
OneHotEncoder(categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse=True)
.fit_transform(Xt_1)
.toarray(),
Xt_3.toarray(),
)
def test_invalid_strategy_option():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy="invalid-strategy")
err_msg = (
r"Valid options for 'strategy' are "
r"\('uniform', 'quantile', 'kmeans'\). "
r"Got strategy='invalid-strategy' instead."
)
with pytest.raises(ValueError, match=err_msg):
est.fit(X)
@pytest.mark.parametrize(
"strategy, expected_2bins, expected_3bins, expected_5bins",
[
("uniform", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
("kmeans", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
("quantile", [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4]),
],
)
def test_nonuniform_strategies(
strategy, expected_2bins, expected_3bins, expected_5bins
):
X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
# with 2 bins
est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode="ordinal")
Xt = est.fit_transform(X)
assert_array_equal(expected_2bins, Xt.ravel())
# with 3 bins
est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode="ordinal")
Xt = est.fit_transform(X)
assert_array_equal(expected_3bins, Xt.ravel())
# with 5 bins
est = KBinsDiscretizer(n_bins=5, strategy=strategy, encode="ordinal")
Xt = est.fit_transform(X)
assert_array_equal(expected_5bins, Xt.ravel())
@pytest.mark.parametrize(
"strategy, expected_inv",
[
(
"uniform",
[
[-1.5, 2.0, -3.5, -0.5],
[-0.5, 3.0, -2.5, -0.5],
[0.5, 4.0, -1.5, 0.5],
[0.5, 4.0, -1.5, 1.5],
],
),
(
"kmeans",
[
[-1.375, 2.125, -3.375, -0.5625],
[-1.375, 2.125, -3.375, -0.5625],
[-0.125, 3.375, -2.125, 0.5625],
[0.75, 4.25, -1.25, 1.625],
],
),
(
"quantile",
[
[-1.5, 2.0, -3.5, -0.75],
[-0.5, 3.0, -2.5, 0.0],
[0.5, 4.0, -1.5, 1.25],
[0.5, 4.0, -1.5, 1.25],
],
),
],
)
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_inverse_transform(strategy, encode, expected_inv):
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
Xt = kbd.fit_transform(X)
Xinv = kbd.inverse_transform(Xt)
assert_array_almost_equal(expected_inv, Xinv)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_transform_outside_fit_range(strategy):
X = np.array([0, 1, 2, 3])[:, None]
kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode="ordinal")
kbd.fit(X)
X2 = np.array([-2, 5])[:, None]
X2t = kbd.transform(X2)
assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
assert_array_equal(X2t.min(axis=0), [0])
def test_overwrite():
X = np.array([0, 1, 2, 3])[:, None]
X_before = X.copy()
est = KBinsDiscretizer(n_bins=3, encode="ordinal")
Xt = est.fit_transform(X)
assert_array_equal(X, X_before)
Xt_before = Xt.copy()
Xinv = est.inverse_transform(Xt)
assert_array_equal(Xt, Xt_before)
assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
@pytest.mark.parametrize(
"strategy, expected_bin_edges", [("quantile", [0, 1, 3]), ("kmeans", [0, 1.5, 3])]
)
def test_redundant_bins(strategy, expected_bin_edges):
X = [[0], [0], [0], [0], [3], [3]]
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy)
warning_message = "Consider decreasing the number of bins."
with pytest.warns(UserWarning, match=warning_message):
kbd.fit(X)
assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)
def test_percentile_numeric_stability():
X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
Xt = np.array([0, 0, 4]).reshape(-1, 1)
kbd = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
warning_message = "Consider decreasing the number of bins."
with pytest.warns(UserWarning, match=warning_message):
kbd.fit(X)
assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
assert_array_almost_equal(kbd.transform(X), Xt)
@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("out_dtype", [None, np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_consistent_dtype(in_dtype, out_dtype, encode):
X_input = np.array(X, dtype=in_dtype)
kbd = KBinsDiscretizer(n_bins=3, encode=encode, dtype=out_dtype)
# a error is raised if a wrong dtype is define for the model
if out_dtype not in [None, np.float32, np.float64]:
with pytest.raises(ValueError, match="Valid options for 'dtype' are"):
kbd.fit(X_input)
else:
kbd.fit(X_input)
# test output dtype
if out_dtype is not None:
expected_dtype = out_dtype
elif out_dtype is None and X_input.dtype == np.float16:
# wrong numeric input dtype are cast in np.float64
expected_dtype = np.float64
else:
expected_dtype = X_input.dtype
Xt = kbd.transform(X_input)
assert Xt.dtype == expected_dtype
@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_32_equal_64(input_dtype, encode):
# TODO this check is redundant with common checks and can be removed
# once #16290 is merged
X_input = np.array(X, dtype=input_dtype)
# 32 bit output
kbd_32 = KBinsDiscretizer(n_bins=3, encode=encode, dtype=np.float32)
kbd_32.fit(X_input)
Xt_32 = kbd_32.transform(X_input)
# 64 bit output
kbd_64 = KBinsDiscretizer(n_bins=3, encode=encode, dtype=np.float64)
kbd_64.fit(X_input)
Xt_64 = kbd_64.transform(X_input)
assert_allclose_dense_sparse(Xt_32, Xt_64)
# FIXME: remove the `filterwarnings` in 1.3
@pytest.mark.filterwarnings("ignore:In version 1.3 onwards, subsample=2e5")
@pytest.mark.parametrize("subsample", [None, "warn"])
def test_kbinsdiscretizer_subsample_default(subsample):
# Since the size of X is small (< 2e5), subsampling will not take place.
X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
kbd_default.fit(X)
kbd_with_subsampling = clone(kbd_default)
kbd_with_subsampling.set_params(subsample=subsample)
kbd_with_subsampling.fit(X)
for bin_kbd_default, bin_kbd_with_subsampling in zip(
kbd_default.bin_edges_[0], kbd_with_subsampling.bin_edges_[0]
):
np.testing.assert_allclose(bin_kbd_default, bin_kbd_with_subsampling)
assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape
def test_kbinsdiscretizer_subsample_invalid_strategy():
X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
kbd = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="uniform", subsample=3)
err_msg = '`subsample` must be used with `strategy="quantile"`.'
with pytest.raises(ValueError, match=err_msg):
kbd.fit(X)
def test_kbinsdiscretizer_subsample_invalid_type():
X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
kbd = KBinsDiscretizer(
n_bins=10, encode="ordinal", strategy="quantile", subsample="full"
)
msg = "subsample must be an instance of int, not str."
with pytest.raises(TypeError, match=msg):
kbd.fit(X)
# TODO: Remove in 1.3
def test_kbinsdiscretizer_subsample_warn():
X = np.random.rand(200001, 1).reshape(-1, 1)
kbd = KBinsDiscretizer(n_bins=100, encode="ordinal", strategy="quantile")
msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
with pytest.warns(FutureWarning, match=msg):
kbd.fit(X)
@pytest.mark.parametrize("subsample", [0, int(2e5)])
def test_kbinsdiscretizer_subsample_values(subsample):
X = np.random.rand(220000, 1).reshape(-1, 1)
kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
kbd_with_subsampling = clone(kbd_default)
kbd_with_subsampling.set_params(subsample=subsample)
if subsample == 0:
with pytest.raises(ValueError, match="subsample == 0, must be >= 1."):
kbd_with_subsampling.fit(X)
else:
# TODO: Remove in 1.3
msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
with pytest.warns(FutureWarning, match=msg):
kbd_default.fit(X)
kbd_with_subsampling.fit(X)
assert not np.all(
kbd_default.bin_edges_[0] == kbd_with_subsampling.bin_edges_[0]
)
assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape
@pytest.mark.parametrize(
"encode, expected_names",
[
(
"onehot",
[
f"feat{col_id}_{float(bin_id)}"
for col_id in range(3)
for bin_id in range(4)
],
),
(
"onehot-dense",
[
f"feat{col_id}_{float(bin_id)}"
for col_id in range(3)
for bin_id in range(4)
],
),
("ordinal", [f"feat{col_id}" for col_id in range(3)]),
],
)
def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names):
"""Check get_feature_names_out for different settings.
Non-regression test for #22731
"""
X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
kbd = KBinsDiscretizer(n_bins=4, encode=encode).fit(X)
Xt = kbd.transform(X)
input_features = [f"feat{i}" for i in range(3)]
output_names = kbd.get_feature_names_out(input_features)
assert Xt.shape[1] == output_names.shape[0]
assert_array_equal(output_names, expected_names)

View File

@@ -0,0 +1,392 @@
import warnings
import pytest
import numpy as np
from scipy import sparse
from sklearn.utils import _safe_indexing
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils._testing import (
assert_array_equal,
assert_allclose_dense_sparse,
_convert_container,
)
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
def _func(X, *args, **kwargs):
args_store.append(X)
args_store.extend(args)
kwargs_store.update(kwargs)
return func(X)
return _func
def test_delegate_to_func():
# (args|kwargs)_store will hold the positional and keyword arguments
# passed to the function inside the FunctionTransformer.
args_store = []
kwargs_store = {}
X = np.arange(10).reshape((5, 2))
assert_array_equal(
FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
X,
"transform should have returned X unchanged",
)
# The function should only have received X.
assert args_store == [
X
], "Incorrect positional arguments passed to func: {args}".format(args=args_store)
assert (
not kwargs_store
), "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)
# reset the argument stores.
args_store[:] = []
kwargs_store.clear()
transformed = FunctionTransformer(
_make_func(args_store, kwargs_store),
).transform(X)
assert_array_equal(
transformed, X, err_msg="transform should have returned X unchanged"
)
# The function should have received X
assert args_store == [
X
], "Incorrect positional arguments passed to func: {args}".format(args=args_store)
assert (
not kwargs_store
), "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)
def test_np_log():
X = np.arange(10).reshape((5, 2))
# Test that the numpy.log example still works.
assert_array_equal(
FunctionTransformer(np.log1p).transform(X),
np.log1p(X),
)
def test_kw_arg():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
# Test that rounding is correct
assert_array_equal(F.transform(X), np.around(X, decimals=3))
def test_kw_arg_update():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
F.kw_args["decimals"] = 1
# Test that rounding is correct
assert_array_equal(F.transform(X), np.around(X, decimals=1))
def test_kw_arg_reset():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
F.kw_args = dict(decimals=1)
# Test that rounding is correct
assert_array_equal(F.transform(X), np.around(X, decimals=1))
def test_inverse_transform():
X = np.array([1, 4, 9, 16]).reshape((2, 2))
# Test that inverse_transform works correctly
F = FunctionTransformer(
func=np.sqrt,
inverse_func=np.around,
inv_kw_args=dict(decimals=3),
)
assert_array_equal(
F.inverse_transform(F.transform(X)),
np.around(np.sqrt(X), decimals=3),
)
def test_check_inverse():
X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
X_list = [X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)]
for X in X_list:
if sparse.issparse(X):
accept_sparse = True
else:
accept_sparse = False
trans = FunctionTransformer(
func=np.sqrt,
inverse_func=np.around,
accept_sparse=accept_sparse,
check_inverse=True,
validate=True,
)
warning_message = (
"The provided functions are not strictly"
" inverse of each other. If you are sure you"
" want to proceed regardless, set"
" 'check_inverse=False'."
)
with pytest.warns(UserWarning, match=warning_message):
trans.fit(X)
trans = FunctionTransformer(
func=np.expm1,
inverse_func=np.log1p,
accept_sparse=accept_sparse,
check_inverse=True,
validate=True,
)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
Xt = trans.fit_transform(X)
assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))
# check that we don't check inverse when one of the func or inverse is not
# provided.
trans = FunctionTransformer(
func=np.expm1, inverse_func=None, check_inverse=True, validate=True
)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
trans.fit(X_dense)
trans = FunctionTransformer(
func=None, inverse_func=np.expm1, check_inverse=True, validate=True
)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
trans.fit(X_dense)
def test_function_transformer_frame():
pd = pytest.importorskip("pandas")
X_df = pd.DataFrame(np.random.randn(100, 10))
transformer = FunctionTransformer()
X_df_trans = transformer.fit_transform(X_df)
assert hasattr(X_df_trans, "loc")
@pytest.mark.parametrize("X_type", ["array", "series"])
def test_function_transformer_raise_error_with_mixed_dtype(X_type):
"""Check that `FunctionTransformer.check_inverse` raises error on mixed dtype."""
mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"}
inverse_mapping = {value: key for key, value in mapping.items()}
dtype = "object"
data = ["one", "two", "three", "one", "one", 5, 6]
data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype)
def func(X):
return np.array(
[mapping[_safe_indexing(X, i)] for i in range(X.size)], dtype=object
)
def inverse_func(X):
return _convert_container(
[inverse_mapping[x] for x in X],
X_type,
columns_name=["value"],
dtype=dtype,
)
transformer = FunctionTransformer(
func=func, inverse_func=inverse_func, validate=False, check_inverse=True
)
msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
with pytest.raises(ValueError, match=msg):
transformer.fit(data)
@pytest.mark.parametrize(
"X, feature_names_out, input_features, expected",
[
(
# NumPy inputs, default behavior: generate names
np.random.rand(100, 3),
"one-to-one",
None,
("x0", "x1", "x2"),
),
(
# Pandas input, default behavior: use input feature names
{"a": np.random.rand(100), "b": np.random.rand(100)},
"one-to-one",
None,
("a", "b"),
),
(
# NumPy input, feature_names_out=callable
np.random.rand(100, 3),
lambda transformer, input_features: ("a", "b"),
None,
("a", "b"),
),
(
# Pandas input, feature_names_out=callable
{"a": np.random.rand(100), "b": np.random.rand(100)},
lambda transformer, input_features: ("c", "d", "e"),
None,
("c", "d", "e"),
),
(
# NumPy input, feature_names_out=callable default input_features
np.random.rand(100, 3),
lambda transformer, input_features: tuple(input_features) + ("a",),
None,
("x0", "x1", "x2", "a"),
),
(
# Pandas input, feature_names_out=callable default input_features
{"a": np.random.rand(100), "b": np.random.rand(100)},
lambda transformer, input_features: tuple(input_features) + ("c",),
None,
("a", "b", "c"),
),
(
# NumPy input, input_features=list of names
np.random.rand(100, 3),
"one-to-one",
("a", "b", "c"),
("a", "b", "c"),
),
(
# Pandas input, input_features=list of names
{"a": np.random.rand(100), "b": np.random.rand(100)},
"one-to-one",
("a", "b"), # must match feature_names_in_
("a", "b"),
),
(
# NumPy input, feature_names_out=callable, input_features=list
np.random.rand(100, 3),
lambda transformer, input_features: tuple(input_features) + ("d",),
("a", "b", "c"),
("a", "b", "c", "d"),
),
(
# Pandas input, feature_names_out=callable, input_features=list
{"a": np.random.rand(100), "b": np.random.rand(100)},
lambda transformer, input_features: tuple(input_features) + ("c",),
("a", "b"), # must match feature_names_in_
("a", "b", "c"),
),
],
)
def test_function_transformer_get_feature_names_out(
X, feature_names_out, input_features, expected
):
if isinstance(X, dict):
pd = pytest.importorskip("pandas")
X = pd.DataFrame(X)
transformer = FunctionTransformer(
feature_names_out=feature_names_out, validate=True
)
transformer.fit_transform(X)
names = transformer.get_feature_names_out(input_features)
assert isinstance(names, np.ndarray)
assert names.dtype == object
assert_array_equal(names, expected)
def test_function_transformer_get_feature_names_out_without_validation():
transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False)
X = np.random.rand(100, 2)
transformer.fit_transform(X)
msg = "When 'feature_names_out' is 'one-to-one', either"
with pytest.raises(ValueError, match=msg):
transformer.get_feature_names_out()
names = transformer.get_feature_names_out(("a", "b"))
assert isinstance(names, np.ndarray)
assert names.dtype == object
assert_array_equal(names, ("a", "b"))
@pytest.mark.parametrize("feature_names_out", ["x0", ["x0"], ("x0",)])
def test_function_transformer_feature_names_out_string(feature_names_out):
transformer = FunctionTransformer(feature_names_out=feature_names_out)
X = np.random.rand(100, 2)
transformer.fit_transform(X)
msg = """must either be "one-to-one" or a callable"""
with pytest.raises(ValueError, match=msg):
transformer.get_feature_names_out()
def test_function_transformer_feature_names_out_is_None():
transformer = FunctionTransformer()
X = np.random.rand(100, 2)
transformer.fit_transform(X)
msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
with pytest.raises(AttributeError, match=msg):
transformer.get_feature_names_out()
def test_function_transformer_feature_names_out_uses_estimator():
def add_n_random_features(X, n):
return np.concatenate([X, np.random.rand(len(X), n)], axis=1)
def feature_names_out(transformer, input_features):
n = transformer.kw_args["n"]
return list(input_features) + [f"rnd{i}" for i in range(n)]
transformer = FunctionTransformer(
func=add_n_random_features,
feature_names_out=feature_names_out,
kw_args=dict(n=3),
validate=True,
)
pd = pytest.importorskip("pandas")
df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
transformer.fit_transform(df)
names = transformer.get_feature_names_out()
assert isinstance(names, np.ndarray)
assert names.dtype == object
assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2"))
def test_function_transformer_validate_inverse():
"""Test that function transformer does not reset estimator in
`inverse_transform`."""
def add_constant_feature(X):
X_one = np.ones((X.shape[0], 1))
return np.concatenate((X, X_one), axis=1)
def inverse_add_constant(X):
return X[:, :-1]
X = np.array([[1, 2], [3, 4], [3, 4]])
trans = FunctionTransformer(
func=add_constant_feature,
inverse_func=inverse_add_constant,
validate=True,
)
X_trans = trans.fit_transform(X)
assert trans.n_features_in_ == X.shape[1]
trans.inverse_transform(X_trans)
assert trans.n_features_in_ == X.shape[1]

View File

@@ -0,0 +1,645 @@
import numpy as np
import pytest
from scipy.sparse import issparse
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
from sklearn.utils.multiclass import type_of_target
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.utils import _to_object_array
from sklearn.preprocessing._label import LabelBinarizer
from sklearn.preprocessing._label import MultiLabelBinarizer
from sklearn.preprocessing._label import LabelEncoder
from sklearn.preprocessing._label import label_binarize
from sklearn.preprocessing._label import _inverse_binarize_thresholding
from sklearn.preprocessing._label import _inverse_binarize_multiclass
from sklearn import datasets
iris = datasets.load_iris()
def toarray(a):
if hasattr(a, "toarray"):
a = a.toarray()
return a
def test_label_binarizer():
# one-class case defaults to negative label
# For dense case:
inp = ["pos", "pos", "pos", "pos"]
lb = LabelBinarizer(sparse_output=False)
expected = np.array([[0, 0, 0, 0]]).T
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ["pos"])
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
# For sparse case:
lb = LabelBinarizer(sparse_output=True)
got = lb.fit_transform(inp)
assert issparse(got)
assert_array_equal(lb.classes_, ["pos"])
assert_array_equal(expected, got.toarray())
assert_array_equal(lb.inverse_transform(got.toarray()), inp)
lb = LabelBinarizer(sparse_output=False)
# two-class case
inp = ["neg", "pos", "pos", "neg"]
expected = np.array([[0, 1, 1, 0]]).T
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ["neg", "pos"])
assert_array_equal(expected, got)
to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
assert_array_equal(lb.inverse_transform(to_invert), inp)
# multi-class case
inp = ["spam", "ham", "eggs", "ham", "0"]
expected = np.array(
[[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
)
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"])
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_unseen_labels():
lb = LabelBinarizer()
expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
got = lb.fit_transform(["b", "d", "e"])
assert_array_equal(expected, got)
expected = np.array(
[[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]]
)
got = lb.transform(["a", "b", "c", "d", "e", "f"])
assert_array_equal(expected, got)
def test_label_binarizer_set_label_encoding():
lb = LabelBinarizer(neg_label=-2, pos_label=0)
# two-class case with pos_label=0
inp = np.array([0, 1, 1, 0])
expected = np.array([[-2, 0, 0, -2]]).T
got = lb.fit_transform(inp)
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
lb = LabelBinarizer(neg_label=-2, pos_label=2)
# multi-class case
inp = np.array([3, 2, 1, 2, 0])
expected = np.array(
[
[-2, -2, -2, +2],
[-2, -2, +2, -2],
[-2, +2, -2, -2],
[-2, -2, +2, -2],
[+2, -2, -2, -2],
]
)
got = lb.fit_transform(inp)
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
@ignore_warnings
def test_label_binarizer_errors():
# Check that invalid arguments yield ValueError
one_class = np.array([0, 0, 0, 0])
lb = LabelBinarizer().fit(one_class)
multi_label = [(2, 3), (0,), (0, 2)]
err_msg = "You appear to be using a legacy multi-label data representation."
with pytest.raises(ValueError, match=err_msg):
lb.transform(multi_label)
lb = LabelBinarizer()
err_msg = "This LabelBinarizer instance is not fitted yet"
with pytest.raises(ValueError, match=err_msg):
lb.transform([])
with pytest.raises(ValueError, match=err_msg):
lb.inverse_transform([])
input_labels = [0, 1, 0, 1]
err_msg = "neg_label=2 must be strictly less than pos_label=1."
lb = LabelBinarizer(neg_label=2, pos_label=1)
with pytest.raises(ValueError, match=err_msg):
lb.fit(input_labels)
err_msg = "neg_label=2 must be strictly less than pos_label=2."
lb = LabelBinarizer(neg_label=2, pos_label=2)
with pytest.raises(ValueError, match=err_msg):
lb.fit(input_labels)
err_msg = (
"Sparse binarization is only supported with non zero pos_label and zero "
"neg_label, got pos_label=2 and neg_label=1"
)
lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
with pytest.raises(ValueError, match=err_msg):
lb.fit(input_labels)
# Fail on y_type
err_msg = "foo format is not supported"
with pytest.raises(ValueError, match=err_msg):
_inverse_binarize_thresholding(
y=csr_matrix([[1, 2], [2, 1]]),
output_type="foo",
classes=[1, 2],
threshold=0,
)
# Sequence of seq type should raise ValueError
y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
err_msg = "You appear to be using a legacy multi-label data representation"
with pytest.raises(ValueError, match=err_msg):
LabelBinarizer().fit_transform(y_seq_of_seqs)
# Fail on the number of classes
err_msg = "The number of class is not equal to the number of dimension of y."
with pytest.raises(ValueError, match=err_msg):
_inverse_binarize_thresholding(
y=csr_matrix([[1, 2], [2, 1]]),
output_type="foo",
classes=[1, 2, 3],
threshold=0,
)
# Fail on the dimension of 'binary'
err_msg = "output_type='binary', but y.shape"
with pytest.raises(ValueError, match=err_msg):
_inverse_binarize_thresholding(
y=np.array([[1, 2, 3], [2, 1, 3]]),
output_type="binary",
classes=[1, 2, 3],
threshold=0,
)
# Fail on multioutput data
err_msg = "Multioutput target data is not supported with label binarization"
with pytest.raises(ValueError, match=err_msg):
LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
with pytest.raises(ValueError, match=err_msg):
label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
@pytest.mark.parametrize(
"values, classes, unknown",
[
(
np.array([2, 1, 3, 1, 3], dtype="int64"),
np.array([1, 2, 3], dtype="int64"),
np.array([4], dtype="int64"),
),
(
np.array(["b", "a", "c", "a", "c"], dtype=object),
np.array(["a", "b", "c"], dtype=object),
np.array(["d"], dtype=object),
),
(
np.array(["b", "a", "c", "a", "c"]),
np.array(["a", "b", "c"]),
np.array(["d"]),
),
],
ids=["int64", "object", "str"],
)
def test_label_encoder(values, classes, unknown):
# Test LabelEncoder's transform, fit_transform and
# inverse_transform methods
le = LabelEncoder()
le.fit(values)
assert_array_equal(le.classes_, classes)
assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
le = LabelEncoder()
ret = le.fit_transform(values)
assert_array_equal(ret, [1, 0, 2, 0, 2])
with pytest.raises(ValueError, match="unseen labels"):
le.transform(unknown)
def test_label_encoder_negative_ints():
le = LabelEncoder()
le.fit([1, 1, 4, 5, -1, 0])
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0])
assert_array_equal(
le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1]
)
with pytest.raises(ValueError):
le.transform([0, 6])
@pytest.mark.parametrize("dtype", ["str", "object"])
def test_label_encoder_str_bad_shape(dtype):
le = LabelEncoder()
le.fit(np.array(["apple", "orange"], dtype=dtype))
msg = "should be a 1d array"
with pytest.raises(ValueError, match=msg):
le.transform("apple")
def test_label_encoder_errors():
# Check that invalid arguments yield ValueError
le = LabelEncoder()
with pytest.raises(ValueError):
le.transform([])
with pytest.raises(ValueError):
le.inverse_transform([])
# Fail on unseen labels
le = LabelEncoder()
le.fit([1, 2, 3, -1, 1])
msg = "contains previously unseen labels"
with pytest.raises(ValueError, match=msg):
le.inverse_transform([-2])
with pytest.raises(ValueError, match=msg):
le.inverse_transform([-2, -3, -4])
# Fail on inverse_transform("")
msg = r"should be a 1d array.+shape \(\)"
with pytest.raises(ValueError, match=msg):
le.inverse_transform("")
@pytest.mark.parametrize(
"values",
[
np.array([2, 1, 3, 1, 3], dtype="int64"),
np.array(["b", "a", "c", "a", "c"], dtype=object),
np.array(["b", "a", "c", "a", "c"]),
],
ids=["int64", "object", "str"],
)
def test_label_encoder_empty_array(values):
le = LabelEncoder()
le.fit(values)
# test empty transform
transformed = le.transform([])
assert_array_equal(np.array([]), transformed)
# test empty inverse transform
inverse_transformed = le.inverse_transform([])
assert_array_equal(np.array([]), inverse_transformed)
def test_sparse_output_multilabel_binarizer():
# test input as iterable of iterables
inputs = [
lambda: [(2, 3), (1,), (1, 2)],
lambda: ({2, 3}, {1}, {1, 2}),
lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
]
indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
inverse = inputs[0]()
for sparse_output in [True, False]:
for inp in inputs:
# With fit_transform
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
got = mlb.fit_transform(inp())
assert issparse(got) == sparse_output
if sparse_output:
# verify CSR assumption that indices and indptr have same dtype
assert got.indices.dtype == got.indptr.dtype
got = got.toarray()
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
# With fit
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
got = mlb.fit(inp()).transform(inp())
assert issparse(got) == sparse_output
if sparse_output:
# verify CSR assumption that indices and indptr have same dtype
assert got.indices.dtype == got.indptr.dtype
got = got.toarray()
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
with pytest.raises(ValueError):
mlb.inverse_transform(csr_matrix(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]])))
def test_multilabel_binarizer():
# test input as iterable of iterables
inputs = [
lambda: [(2, 3), (1,), (1, 2)],
lambda: ({2, 3}, {1}, {1, 2}),
lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
]
indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
inverse = inputs[0]()
for inp in inputs:
# With fit_transform
mlb = MultiLabelBinarizer()
got = mlb.fit_transform(inp())
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
# With fit
mlb = MultiLabelBinarizer()
got = mlb.fit(inp()).transform(inp())
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
def test_multilabel_binarizer_empty_sample():
mlb = MultiLabelBinarizer()
y = [[1, 2], [1], []]
Y = np.array([[1, 1], [1, 0], [0, 0]])
assert_array_equal(mlb.fit_transform(y), Y)
def test_multilabel_binarizer_unknown_class():
mlb = MultiLabelBinarizer()
y = [[1, 2]]
Y = np.array([[1, 0], [0, 1]])
warning_message = "unknown class.* will be ignored"
with pytest.warns(UserWarning, match=warning_message):
matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
Y = np.array([[1, 0, 0], [0, 1, 0]])
mlb = MultiLabelBinarizer(classes=[1, 2, 3])
with pytest.warns(UserWarning, match=warning_message):
matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
assert_array_equal(matrix, Y)
def test_multilabel_binarizer_given_classes():
inp = [(2, 3), (1,), (1, 2)]
indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
# fit_transform()
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, [1, 3, 2])
# fit().transform()
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, [1, 3, 2])
# ensure works with extra class
mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
assert_array_equal(
mlb.fit_transform(inp), np.hstack(([[0], [0], [0]], indicator_mat))
)
assert_array_equal(mlb.classes_, [4, 1, 3, 2])
# ensure fit is no-op as iterable is not consumed
inp = iter(inp)
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
# ensure a ValueError is thrown if given duplicate classes
err_msg = (
"The classes argument contains duplicate classes. Remove "
"these duplicates before passing them to MultiLabelBinarizer."
)
mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
with pytest.raises(ValueError, match=err_msg):
mlb.fit(inp)
def test_multilabel_binarizer_multiple_calls():
inp = [(2, 3), (1,), (1, 2)]
indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
indicator_mat2 = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
# first call
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
# second call change class
mlb.classes = [1, 2, 3]
assert_array_equal(mlb.fit_transform(inp), indicator_mat2)
def test_multilabel_binarizer_same_length_sequence():
# Ensure sequences of the same length are not interpreted as a 2-d array
inp = [[1], [0], [2]]
indicator_mat = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
# fit_transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
# fit().transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
def test_multilabel_binarizer_non_integer_labels():
tuple_classes = _to_object_array([(1,), (2,), (3,)])
inputs = [
([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
]
indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
for inp, classes in inputs:
# fit_transform()
mlb = MultiLabelBinarizer()
inp = np.array(inp, dtype=object)
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, classes)
indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
assert_array_equal(indicator_mat_inv, inp)
# fit().transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, classes)
indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
assert_array_equal(indicator_mat_inv, inp)
mlb = MultiLabelBinarizer()
with pytest.raises(TypeError):
mlb.fit_transform([({}), ({}, {"a": "b"})])
def test_multilabel_binarizer_non_unique():
inp = [(1, 1, 1, 0)]
indicator_mat = np.array([[1, 1]])
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
def test_multilabel_binarizer_inverse_validation():
inp = [(1, 1, 1, 0)]
mlb = MultiLabelBinarizer()
mlb.fit_transform(inp)
# Not binary
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1, 3]]))
# The following binary cases are fine, however
mlb.inverse_transform(np.array([[0, 0]]))
mlb.inverse_transform(np.array([[1, 1]]))
mlb.inverse_transform(np.array([[1, 0]]))
# Wrong shape
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1]]))
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1, 1, 1]]))
def test_label_binarize_with_class_order():
out = label_binarize([1, 6], classes=[1, 2, 4, 6])
expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
assert_array_equal(out, expected)
# Modified class order
out = label_binarize([1, 6], classes=[1, 6, 4, 2])
expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
assert_array_equal(out, expected)
out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]])
assert_array_equal(out, expected)
def check_binarized_results(y, classes, pos_label, neg_label, expected):
for sparse_output in [True, False]:
if (pos_label == 0 or neg_label != 0) and sparse_output:
with pytest.raises(ValueError):
label_binarize(
y,
classes=classes,
neg_label=neg_label,
pos_label=pos_label,
sparse_output=sparse_output,
)
continue
# check label_binarize
binarized = label_binarize(
y,
classes=classes,
neg_label=neg_label,
pos_label=pos_label,
sparse_output=sparse_output,
)
assert_array_equal(toarray(binarized), expected)
assert issparse(binarized) == sparse_output
# check inverse
y_type = type_of_target(y)
if y_type == "multiclass":
inversed = _inverse_binarize_multiclass(binarized, classes=classes)
else:
inversed = _inverse_binarize_thresholding(
binarized,
output_type=y_type,
classes=classes,
threshold=((neg_label + pos_label) / 2.0),
)
assert_array_equal(toarray(inversed), toarray(y))
# Check label binarizer
lb = LabelBinarizer(
neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
)
binarized = lb.fit_transform(y)
assert_array_equal(toarray(binarized), expected)
assert issparse(binarized) == sparse_output
inverse_output = lb.inverse_transform(binarized)
assert_array_equal(toarray(inverse_output), toarray(y))
assert issparse(inverse_output) == issparse(y)
def test_label_binarize_binary():
y = [0, 1, 0]
classes = [0, 1]
pos_label = 2
neg_label = -1
expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))
check_binarized_results(y, classes, pos_label, neg_label, expected)
# Binary case where sparse_output = True will not result in a ValueError
y = [0, 1, 0]
classes = [0, 1]
pos_label = 3
neg_label = 0
expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))
check_binarized_results(y, classes, pos_label, neg_label, expected)
def test_label_binarize_multiclass():
y = [0, 1, 2]
classes = [0, 1, 2]
pos_label = 2
neg_label = 0
expected = 2 * np.eye(3)
check_binarized_results(y, classes, pos_label, neg_label, expected)
with pytest.raises(ValueError):
label_binarize(
y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
)
def test_label_binarize_multilabel():
y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
classes = [0, 1, 2]
pos_label = 2
neg_label = 0
expected = pos_label * y_ind
y_sparse = [
sparse_matrix(y_ind)
for sparse_matrix in [
coo_matrix,
csc_matrix,
csr_matrix,
dok_matrix,
lil_matrix,
]
]
for y in [y_ind] + y_sparse:
check_binarized_results(y, classes, pos_label, neg_label, expected)
with pytest.raises(ValueError):
label_binarize(
y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
)
def test_invalid_input_label_binarize():
with pytest.raises(ValueError):
label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
with pytest.raises(ValueError, match="continuous target data is not "):
label_binarize([1.2, 2.7], classes=[0, 1])
with pytest.raises(ValueError, match="mismatch with the labels"):
label_binarize([[1, 3]], classes=[1, 2, 3])
def test_inverse_binarize_multiclass():
got = _inverse_binarize_multiclass(
csr_matrix([[0, 1, 0], [-1, 0, -1], [0, 0, 0]]), np.arange(3)
)
assert_array_equal(got, np.array([1, 1, 0]))

View File

@@ -0,0 +1,930 @@
import numpy as np
import pytest
from scipy import sparse
from scipy.sparse import random as sparse_random
from sklearn.utils._testing import assert_array_almost_equal
from numpy.testing import assert_allclose, assert_array_equal
from scipy.interpolate import BSpline
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
KBinsDiscretizer,
PolynomialFeatures,
SplineTransformer,
)
@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer))
def test_polynomial_and_spline_array_order(est):
"""Test that output array has the given order."""
X = np.arange(10).reshape(5, 2)
def is_c_contiguous(a):
return np.isfortran(a.T)
assert is_c_contiguous(est().fit_transform(X))
assert is_c_contiguous(est(order="C").fit_transform(X))
assert np.isfortran(est(order="F").fit_transform(X))
@pytest.mark.parametrize(
"params, err_msg",
[
({"degree": -1}, "degree must be a non-negative integer"),
({"degree": 2.5}, "degree must be a non-negative integer"),
({"degree": "string"}, "degree must be a non-negative integer"),
({"n_knots": 1}, "n_knots must be a positive integer >= 2."),
({"n_knots": 1}, "n_knots must be a positive integer >= 2."),
({"n_knots": 2.5}, "n_knots must be a positive integer >= 2."),
({"n_knots": "string"}, "n_knots must be a positive integer >= 2."),
({"knots": 1}, "Expected 2D array, got scalar array instead:"),
({"knots": [1, 2]}, "Expected 2D array, got 1D array instead:"),
(
{"knots": [[1]]},
r"Number of knots, knots.shape\[0\], must be >= 2.",
),
(
{"knots": [[1, 5], [2, 6]]},
r"knots.shape\[1\] == n_features is violated.",
),
(
{"knots": [[1], [1], [2]]},
"knots must be sorted without duplicates.",
),
({"knots": [[2], [1]]}, "knots must be sorted without duplicates."),
(
{"extrapolation": None},
"extrapolation must be one of 'error', 'constant', 'linear', "
"'continue' or 'periodic'.",
),
(
{"extrapolation": 1},
"extrapolation must be one of 'error', 'constant', 'linear', "
"'continue' or 'periodic'.",
),
(
{"extrapolation": "string"},
"extrapolation must be one of 'error', 'constant', 'linear', "
"'continue' or 'periodic'.",
),
({"include_bias": None}, "include_bias must be bool."),
({"include_bias": 1}, "include_bias must be bool."),
({"include_bias": "string"}, "include_bias must be bool."),
(
{"extrapolation": "periodic", "n_knots": 3, "degree": 3},
"Periodic splines require degree < n_knots. Got n_knots=3 and degree=3.",
),
(
{"extrapolation": "periodic", "knots": [[0], [1]], "degree": 2},
"Periodic splines require degree < n_knots. Got n_knots=2 and degree=2.",
),
],
)
def test_spline_transformer_input_validation(params, err_msg):
"""Test that we raise errors for invalid input in SplineTransformer."""
X = [[1], [2]]
with pytest.raises(ValueError, match=err_msg):
SplineTransformer(**params).fit(X)
def test_spline_transformer_manual_knot_input():
"""
Test that array-like knot positions in SplineTransformer are accepted.
"""
X = np.arange(20).reshape(10, 2)
knots = [[0.5, 1], [1.5, 2], [5, 10]]
st1 = SplineTransformer(degree=3, knots=knots, n_knots=None).fit(X)
knots = np.asarray(knots)
st2 = SplineTransformer(degree=3, knots=knots, n_knots=None).fit(X)
for i in range(X.shape[1]):
assert_allclose(st1.bsplines_[i].t, st2.bsplines_[i].t)
@pytest.mark.parametrize("extrapolation", ["continue", "periodic"])
def test_spline_transformer_integer_knots(extrapolation):
"""Test that SplineTransformer accepts integer value knot positions."""
X = np.arange(20).reshape(10, 2)
knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]]
_ = SplineTransformer(
degree=3, knots=knots, extrapolation=extrapolation
).fit_transform(X)
# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_spline_transformer_feature_names(get_names):
"""Test that SplineTransformer generates correct features name."""
X = np.arange(20).reshape(10, 2)
splt = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X)
feature_names = getattr(splt, get_names)()
assert_array_equal(
feature_names,
[
"x0_sp_0",
"x0_sp_1",
"x0_sp_2",
"x0_sp_3",
"x0_sp_4",
"x1_sp_0",
"x1_sp_1",
"x1_sp_2",
"x1_sp_3",
"x1_sp_4",
],
)
splt = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X)
feature_names = getattr(splt, get_names)(["a", "b"])
assert_array_equal(
feature_names,
[
"a_sp_0",
"a_sp_1",
"a_sp_2",
"a_sp_3",
"b_sp_0",
"b_sp_1",
"b_sp_2",
"b_sp_3",
],
)
@pytest.mark.parametrize("degree", range(1, 5))
@pytest.mark.parametrize("n_knots", range(3, 5))
@pytest.mark.parametrize("knots", ["uniform", "quantile"])
@pytest.mark.parametrize("extrapolation", ["constant", "periodic"])
def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation):
"""Test that B-splines are indeed a decomposition of unity.
Splines basis functions must sum up to 1 per row, if we stay in between
boundaries.
"""
X = np.linspace(0, 1, 100)[:, None]
# make the boundaries 0 and 1 part of X_train, for sure.
X_train = np.r_[[[0]], X[::2, :], [[1]]]
X_test = X[1::2, :]
if extrapolation == "periodic":
n_knots = n_knots + degree # periodic splines require degree < n_knots
splt = SplineTransformer(
n_knots=n_knots,
degree=degree,
knots=knots,
include_bias=True,
extrapolation=extrapolation,
)
splt.fit(X_train)
for X in [X_train, X_test]:
assert_allclose(np.sum(splt.transform(X), axis=1), 1)
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
def test_spline_transformer_linear_regression(bias, intercept):
"""Test that B-splines fit a sinusodial curve pretty well."""
X = np.linspace(0, 10, 100)[:, None]
y = np.sin(X[:, 0]) + 2 # +2 to avoid the value 0 in assert_allclose
pipe = Pipeline(
steps=[
(
"spline",
SplineTransformer(
n_knots=15,
degree=3,
include_bias=bias,
extrapolation="constant",
),
),
("ols", LinearRegression(fit_intercept=intercept)),
]
)
pipe.fit(X, y)
assert_allclose(pipe.predict(X), y, rtol=1e-3)
@pytest.mark.parametrize(
["knots", "n_knots", "sample_weight", "expected_knots"],
[
("uniform", 3, None, np.array([[0, 2], [3, 8], [6, 14]])),
(
"uniform",
3,
np.array([0, 0, 1, 1, 0, 3, 1]),
np.array([[2, 2], [4, 8], [6, 14]]),
),
("uniform", 4, None, np.array([[0, 2], [2, 6], [4, 10], [6, 14]])),
("quantile", 3, None, np.array([[0, 2], [3, 3], [6, 14]])),
(
"quantile",
3,
np.array([0, 0, 1, 1, 0, 3, 1]),
np.array([[2, 2], [5, 8], [6, 14]]),
),
],
)
def test_spline_transformer_get_base_knot_positions(
knots, n_knots, sample_weight, expected_knots
):
# Check the behaviour to find the positions of the knots with and without
# `sample_weight`
X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]])
base_knots = SplineTransformer._get_base_knot_positions(
X=X, knots=knots, n_knots=n_knots, sample_weight=sample_weight
)
assert_allclose(base_knots, expected_knots)
@pytest.mark.parametrize(
"knots, n_knots, degree",
[
("uniform", 5, 3),
("uniform", 12, 8),
(
[[-1.0, 0.0], [0, 1.0], [0.1, 2.0], [0.2, 3.0], [0.3, 4.0], [1, 5.0]],
None,
3,
),
],
)
def test_spline_transformer_periodicity_of_extrapolation(knots, n_knots, degree):
"""Test that the SplineTransformer is periodic for multiple features."""
X_1 = np.linspace((-1, 0), (1, 5), 10)
X_2 = np.linspace((1, 5), (3, 10), 10)
splt = SplineTransformer(
knots=knots, n_knots=n_knots, degree=degree, extrapolation="periodic"
)
splt.fit(X_1)
assert_allclose(splt.transform(X_1), splt.transform(X_2))
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
def test_spline_transformer_periodic_linear_regression(bias, intercept):
"""Test that B-splines fit a periodic curve pretty well."""
# "+ 3" to avoid the value 0 in assert_allclose
def f(x):
return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3
X = np.linspace(0, 1, 101)[:, None]
pipe = Pipeline(
steps=[
(
"spline",
SplineTransformer(
n_knots=20,
degree=3,
include_bias=bias,
extrapolation="periodic",
),
),
("ols", LinearRegression(fit_intercept=intercept)),
]
)
pipe.fit(X, f(X[:, 0]))
# Generate larger array to check periodic extrapolation
X_ = np.linspace(-1, 2, 301)[:, None]
predictions = pipe.predict(X_)
assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01)
assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3)
def test_spline_transformer_periodic_spline_backport():
"""Test that the backport of extrapolate="periodic" works correctly"""
X = np.linspace(-2, 3.5, 10)[:, None]
degree = 2
# Use periodic extrapolation backport in SplineTransformer
transformer = SplineTransformer(
degree=degree, extrapolation="periodic", knots=[[-1.0], [0.0], [1.0]]
)
Xt = transformer.fit_transform(X)
# Use periodic extrapolation in BSpline
coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
spl = BSpline(np.arange(-3, 4), coef, degree, "periodic")
Xspl = spl(X[:, 0])
assert_allclose(Xt, Xspl)
def test_spline_transformer_periodic_splines_periodicity():
"""
Test if shifted knots result in the same transformation up to permutation.
"""
X = np.linspace(0, 10, 101)[:, None]
transformer_1 = SplineTransformer(
degree=3,
extrapolation="periodic",
knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
)
transformer_2 = SplineTransformer(
degree=3,
extrapolation="periodic",
knots=[[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]],
)
Xt_1 = transformer_1.fit_transform(X)
Xt_2 = transformer_2.fit_transform(X)
assert_allclose(Xt_1, Xt_2[:, [4, 0, 1, 2, 3]])
@pytest.mark.parametrize("degree", [3, 5])
def test_spline_transformer_periodic_splines_smoothness(degree):
"""Test that spline transformation is smooth at first / last knot."""
X = np.linspace(-2, 10, 10_000)[:, None]
transformer = SplineTransformer(
degree=degree,
extrapolation="periodic",
knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
)
Xt = transformer.fit_transform(X)
delta = (X.max() - X.min()) / len(X)
tol = 10 * delta
dXt = Xt
# We expect splines of degree `degree` to be (`degree`-1) times
# continuously differentiable. I.e. for d = 0, ..., `degree` - 1 the d-th
# derivative should be continuous. This is the case if the (d+1)-th
# numerical derivative is reasonably small (smaller than `tol` in absolute
# value). We thus compute d-th numeric derivatives for d = 1, ..., `degree`
# and compare them to `tol`.
#
# Note that the 0-th derivative is the function itself, such that we are
# also checking its continuity.
for d in range(1, degree + 1):
# Check continuity of the (d-1)-th derivative
diff = np.diff(dXt, axis=0)
assert np.abs(diff).max() < tol
# Compute d-th numeric derivative
dXt = diff / delta
# As degree `degree` splines are not `degree` times continuously
# differentiable at the knots, the `degree + 1`-th numeric derivative
# should have spikes at the knots.
diff = np.diff(dXt, axis=0)
assert np.abs(diff).max() > 1
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
@pytest.mark.parametrize("degree", [1, 2, 3, 4, 5])
def test_spline_transformer_extrapolation(bias, intercept, degree):
"""Test that B-spline extrapolation works correctly."""
# we use a straight line for that
X = np.linspace(-1, 1, 100)[:, None]
y = X.squeeze()
# 'constant'
pipe = Pipeline(
[
[
"spline",
SplineTransformer(
n_knots=4,
degree=degree,
include_bias=bias,
extrapolation="constant",
),
],
["ols", LinearRegression(fit_intercept=intercept)],
]
)
pipe.fit(X, y)
assert_allclose(pipe.predict([[-10], [5]]), [-1, 1])
# 'linear'
pipe = Pipeline(
[
[
"spline",
SplineTransformer(
n_knots=4,
degree=degree,
include_bias=bias,
extrapolation="linear",
),
],
["ols", LinearRegression(fit_intercept=intercept)],
]
)
pipe.fit(X, y)
assert_allclose(pipe.predict([[-10], [5]]), [-10, 5])
# 'error'
splt = SplineTransformer(
n_knots=4, degree=degree, include_bias=bias, extrapolation="error"
)
splt.fit(X)
with pytest.raises(ValueError):
splt.transform([[-10]])
with pytest.raises(ValueError):
splt.transform([[5]])
def test_spline_transformer_kbindiscretizer():
"""Test that a B-spline of degree=0 is equivalent to KBinsDiscretizer."""
rng = np.random.RandomState(97531)
X = rng.randn(200).reshape(200, 1)
n_bins = 5
n_knots = n_bins + 1
splt = SplineTransformer(
n_knots=n_knots, degree=0, knots="quantile", include_bias=True
)
splines = splt.fit_transform(X)
kbd = KBinsDiscretizer(n_bins=n_bins, encode="onehot-dense", strategy="quantile")
kbins = kbd.fit_transform(X)
# Though they should be exactly equal, we test approximately with high
# accuracy.
assert_allclose(splines, kbins, rtol=1e-13)
@pytest.mark.parametrize("n_knots", [5, 10])
@pytest.mark.parametrize("include_bias", [True, False])
@pytest.mark.parametrize("degree", [3, 5])
def test_spline_transformer_n_features_out(n_knots, include_bias, degree):
"""Test that transform results in n_features_out_ features."""
splt = SplineTransformer(n_knots=n_knots, degree=degree, include_bias=include_bias)
X = np.linspace(0, 1, 10)[:, None]
splt.fit(X)
assert splt.transform(X).shape[1] == splt.n_features_out_
@pytest.mark.parametrize(
"params, err_msg",
[
({"degree": -1}, "degree must be a non-negative integer"),
({"degree": 2.5}, "degree must be a non-negative int or tuple"),
({"degree": "12"}, r"degree=\(min_degree, max_degree\) must"),
({"degree": "string"}, "degree must be a non-negative int or tuple"),
({"degree": (-1, 2)}, r"degree=\(min_degree, max_degree\) must"),
({"degree": (0, 1.5)}, r"degree=\(min_degree, max_degree\) must"),
({"degree": (3, 2)}, r"degree=\(min_degree, max_degree\) must"),
],
)
def test_polynomial_features_input_validation(params, err_msg):
"""Test that we raise errors for invalid input in PolynomialFeatures."""
X = [[1], [2]]
with pytest.raises(ValueError, match=err_msg):
PolynomialFeatures(**params).fit(X)
@pytest.fixture()
def single_feature_degree3():
X = np.arange(6)[:, np.newaxis]
P = np.hstack([np.ones_like(X), X, X**2, X**3])
return X, P
@pytest.mark.parametrize(
"degree, include_bias, interaction_only, indices",
[
(3, True, False, slice(None, None)),
(3, False, False, slice(1, None)),
(3, True, True, [0, 1]),
(3, False, True, [1]),
((2, 3), True, False, [0, 2, 3]),
((2, 3), False, False, [2, 3]),
((2, 3), True, True, [0]),
((2, 3), False, True, []),
],
)
@pytest.mark.parametrize(
"sparse_X",
[False, sparse.csr_matrix, sparse.csc_matrix],
)
def test_polynomial_features_one_feature(
single_feature_degree3,
degree,
include_bias,
interaction_only,
indices,
sparse_X,
):
"""Test PolynomialFeatures on single feature up to degree 3."""
X, P = single_feature_degree3
if sparse_X:
X = sparse_X(X)
tf = PolynomialFeatures(
degree=degree, include_bias=include_bias, interaction_only=interaction_only
).fit(X)
out = tf.transform(X)
if sparse_X:
out = out.toarray()
assert_allclose(out, P[:, indices])
if tf.n_output_features_ > 0:
assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_)
@pytest.fixture()
def two_features_degree3():
X = np.arange(6).reshape((3, 2))
x1 = X[:, :1]
x2 = X[:, 1:]
P = np.hstack(
[
x1**0 * x2**0, # 0
x1**1 * x2**0, # 1
x1**0 * x2**1, # 2
x1**2 * x2**0, # 3
x1**1 * x2**1, # 4
x1**0 * x2**2, # 5
x1**3 * x2**0, # 6
x1**2 * x2**1, # 7
x1**1 * x2**2, # 8
x1**0 * x2**3, # 9
]
)
return X, P
@pytest.mark.parametrize(
"degree, include_bias, interaction_only, indices",
[
(2, True, False, slice(0, 6)),
(2, False, False, slice(1, 6)),
(2, True, True, [0, 1, 2, 4]),
(2, False, True, [1, 2, 4]),
((2, 2), True, False, [0, 3, 4, 5]),
((2, 2), False, False, [3, 4, 5]),
((2, 2), True, True, [0, 4]),
((2, 2), False, True, [4]),
(3, True, False, slice(None, None)),
(3, False, False, slice(1, None)),
(3, True, True, [0, 1, 2, 4]),
(3, False, True, [1, 2, 4]),
((2, 3), True, False, [0, 3, 4, 5, 6, 7, 8, 9]),
((2, 3), False, False, slice(3, None)),
((2, 3), True, True, [0, 4]),
((2, 3), False, True, [4]),
((3, 3), True, False, [0, 6, 7, 8, 9]),
((3, 3), False, False, [6, 7, 8, 9]),
((3, 3), True, True, [0]),
((3, 3), False, True, []), # would need 3 input features
],
)
@pytest.mark.parametrize(
"sparse_X",
[False, sparse.csr_matrix, sparse.csc_matrix],
)
def test_polynomial_features_two_features(
two_features_degree3,
degree,
include_bias,
interaction_only,
indices,
sparse_X,
):
"""Test PolynomialFeatures on 2 features up to degree 3."""
X, P = two_features_degree3
if sparse_X:
X = sparse_X(X)
tf = PolynomialFeatures(
degree=degree, include_bias=include_bias, interaction_only=interaction_only
).fit(X)
out = tf.transform(X)
if sparse_X:
out = out.toarray()
assert_allclose(out, P[:, indices])
if tf.n_output_features_ > 0:
assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_)
# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_polynomial_feature_names(get_names):
X = np.arange(30).reshape(10, 3)
poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
feature_names = poly.get_feature_names()
assert_array_equal(
["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"],
feature_names,
)
assert len(feature_names) == poly.transform(X).shape[1]
poly = PolynomialFeatures(degree=3, include_bias=False).fit(X)
feature_names = getattr(poly, get_names)(["a", "b", "c"])
assert_array_equal(
[
"a",
"b",
"c",
"a^2",
"a b",
"a c",
"b^2",
"b c",
"c^2",
"a^3",
"a^2 b",
"a^2 c",
"a b^2",
"a b c",
"a c^2",
"b^3",
"b^2 c",
"b c^2",
"c^3",
],
feature_names,
)
assert len(feature_names) == poly.transform(X).shape[1]
poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X)
feature_names = getattr(poly, get_names)(["a", "b", "c"])
assert_array_equal(
[
"a^2",
"a b",
"a c",
"b^2",
"b c",
"c^2",
"a^3",
"a^2 b",
"a^2 c",
"a b^2",
"a b c",
"a c^2",
"b^3",
"b^2 c",
"b c^2",
"c^3",
],
feature_names,
)
assert len(feature_names) == poly.transform(X).shape[1]
poly = PolynomialFeatures(
degree=(3, 3), include_bias=True, interaction_only=True
).fit(X)
feature_names = getattr(poly, get_names)(["a", "b", "c"])
assert_array_equal(["1", "a b c"], feature_names)
assert len(feature_names) == poly.transform(X).shape[1]
# test some unicode
poly = PolynomialFeatures(degree=1, include_bias=True).fit(X)
feature_names = poly.get_feature_names(["\u0001F40D", "\u262E", "\u05D0"])
assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], feature_names)
@pytest.mark.parametrize(
["deg", "include_bias", "interaction_only", "dtype"],
[
(1, True, False, int),
(2, True, False, int),
(2, True, False, np.float32),
(2, True, False, np.float64),
(3, False, False, np.float64),
(3, False, True, np.float64),
(4, False, False, np.float64),
(4, False, True, np.float64),
],
)
def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype):
rng = np.random.RandomState(0)
X = rng.randint(0, 2, (100, 2))
X_csc = sparse.csc_matrix(X)
est = PolynomialFeatures(
deg, include_bias=include_bias, interaction_only=interaction_only
)
Xt_csc = est.fit_transform(X_csc.astype(dtype))
Xt_dense = est.fit_transform(X.astype(dtype))
assert isinstance(Xt_csc, sparse.csc_matrix)
assert Xt_csc.dtype == Xt_dense.dtype
assert_array_almost_equal(Xt_csc.A, Xt_dense)
@pytest.mark.parametrize(
["deg", "include_bias", "interaction_only", "dtype"],
[
(1, True, False, int),
(2, True, False, int),
(2, True, False, np.float32),
(2, True, False, np.float64),
(3, False, False, np.float64),
(3, False, True, np.float64),
],
)
def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype):
rng = np.random.RandomState(0)
X = rng.randint(0, 2, (100, 2))
X_csr = sparse.csr_matrix(X)
est = PolynomialFeatures(
deg, include_bias=include_bias, interaction_only=interaction_only
)
Xt_csr = est.fit_transform(X_csr.astype(dtype))
Xt_dense = est.fit_transform(X.astype(dtype, copy=False))
assert isinstance(Xt_csr, sparse.csr_matrix)
assert Xt_csr.dtype == Xt_dense.dtype
assert_array_almost_equal(Xt_csr.A, Xt_dense)
@pytest.mark.parametrize("n_features", [1, 4, 5])
@pytest.mark.parametrize(
"min_degree, max_degree", [(0, 1), (0, 2), (1, 3), (0, 4), (3, 4)]
)
@pytest.mark.parametrize("interaction_only", [True, False])
@pytest.mark.parametrize("include_bias", [True, False])
def test_num_combinations(
n_features,
min_degree,
max_degree,
interaction_only,
include_bias,
):
"""
Test that n_output_features_ is calculated correctly.
"""
x = sparse.csr_matrix(([1], ([0], [n_features - 1])))
est = PolynomialFeatures(
degree=max_degree,
interaction_only=interaction_only,
include_bias=include_bias,
)
est.fit(x)
num_combos = est.n_output_features_
combos = PolynomialFeatures._combinations(
n_features=n_features,
min_degree=0,
max_degree=max_degree,
interaction_only=interaction_only,
include_bias=include_bias,
)
assert num_combos == sum([1 for _ in combos])
@pytest.mark.parametrize(
["deg", "include_bias", "interaction_only", "dtype"],
[
(2, True, False, np.float32),
(2, True, False, np.float64),
(3, False, False, np.float64),
(3, False, True, np.float64),
],
)
def test_polynomial_features_csr_X_floats(deg, include_bias, interaction_only, dtype):
X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
X = X_csr.toarray()
est = PolynomialFeatures(
deg, include_bias=include_bias, interaction_only=interaction_only
)
Xt_csr = est.fit_transform(X_csr.astype(dtype))
Xt_dense = est.fit_transform(X.astype(dtype))
assert isinstance(Xt_csr, sparse.csr_matrix)
assert Xt_csr.dtype == Xt_dense.dtype
assert_array_almost_equal(Xt_csr.A, Xt_dense)
@pytest.mark.parametrize(
["zero_row_index", "deg", "interaction_only"],
[
(0, 2, True),
(1, 2, True),
(2, 2, True),
(0, 3, True),
(1, 3, True),
(2, 3, True),
(0, 2, False),
(1, 2, False),
(2, 2, False),
(0, 3, False),
(1, 3, False),
(2, 3, False),
],
)
def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, interaction_only):
X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr()
X_csr[zero_row_index, :] = 0.0
X = X_csr.toarray()
est = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only)
Xt_csr = est.fit_transform(X_csr)
Xt_dense = est.fit_transform(X)
assert isinstance(Xt_csr, sparse.csr_matrix)
assert Xt_csr.dtype == Xt_dense.dtype
assert_array_almost_equal(Xt_csr.A, Xt_dense)
# This degree should always be one more than the highest degree supported by
# _csr_expansion.
@pytest.mark.parametrize(
["include_bias", "interaction_only"],
[(True, True), (True, False), (False, True), (False, False)],
)
def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only):
X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
X = X_csr.toarray()
est = PolynomialFeatures(
4, include_bias=include_bias, interaction_only=interaction_only
)
Xt_csr = est.fit_transform(X_csr)
Xt_dense = est.fit_transform(X)
assert isinstance(Xt_csr, sparse.csr_matrix)
assert Xt_csr.dtype == Xt_dense.dtype
assert_array_almost_equal(Xt_csr.A, Xt_dense)
@pytest.mark.parametrize(
["deg", "dim", "interaction_only"],
[
(2, 1, True),
(2, 2, True),
(3, 1, True),
(3, 2, True),
(3, 3, True),
(2, 1, False),
(2, 2, False),
(3, 1, False),
(3, 2, False),
(3, 3, False),
],
)
def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only):
X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr()
X = X_csr.toarray()
est = PolynomialFeatures(deg, interaction_only=interaction_only)
Xt_csr = est.fit_transform(X_csr)
Xt_dense = est.fit_transform(X)
assert isinstance(Xt_csr, sparse.csr_matrix)
assert Xt_csr.dtype == Xt_dense.dtype
assert_array_almost_equal(Xt_csr.A, Xt_dense)
def test_polynomial_features_deprecated_n_input_features():
# check that we raise a deprecation warning when accessing
# `n_input_features_`. FIXME: remove in 1.2
depr_msg = (
"The attribute `n_input_features_` was deprecated in version "
"1.0 and will be removed in 1.2."
)
X = np.arange(10).reshape(5, 2)
with pytest.warns(FutureWarning, match=depr_msg):
PolynomialFeatures().fit(X).n_input_features_
# TODO: Remove in 1.2 when get_feature_names is removed
@pytest.mark.parametrize("Transformer", [SplineTransformer, PolynomialFeatures])
def test_get_feature_names_deprecated(Transformer):
X = np.arange(30).reshape(10, 3)
poly = Transformer().fit(X)
msg = "get_feature_names is deprecated in 1.0"
with pytest.warns(FutureWarning, match=msg):
poly.get_feature_names()
def test_polynomial_features_behaviour_on_zero_degree():
"""Check that PolynomialFeatures raises error when degree=0 and include_bias=False,
and output a single constant column when include_bias=True
"""
X = np.ones((10, 2))
poly = PolynomialFeatures(degree=0, include_bias=False)
err_msg = (
"Setting degree to zero and include_bias to False would result in"
" an empty output array."
)
with pytest.raises(ValueError, match=err_msg):
poly.fit_transform(X)
poly = PolynomialFeatures(degree=(0, 0), include_bias=False)
err_msg = (
"Setting both min_deree and max_degree to zero and include_bias to"
" False would result in an empty output array."
)
with pytest.raises(ValueError, match=err_msg):
poly.fit_transform(X)
for _X in [X, sparse.csr_matrix(X), sparse.csc_matrix(X)]:
poly = PolynomialFeatures(degree=0, include_bias=True)
output = poly.fit_transform(_X)
# convert to dense array if needed
if sparse.issparse(output):
output = output.toarray()
assert_array_equal(output, np.ones((X.shape[0], 1)))