first commit
This commit is contained in:
@@ -0,0 +1,70 @@
|
||||
"""
|
||||
The :mod:`sklearn.preprocessing` module includes scaling, centering,
|
||||
normalization, binarization methods.
|
||||
"""
|
||||
|
||||
from ._function_transformer import FunctionTransformer
|
||||
|
||||
from ._data import Binarizer
|
||||
from ._data import KernelCenterer
|
||||
from ._data import MinMaxScaler
|
||||
from ._data import MaxAbsScaler
|
||||
from ._data import Normalizer
|
||||
from ._data import RobustScaler
|
||||
from ._data import StandardScaler
|
||||
from ._data import QuantileTransformer
|
||||
from ._data import add_dummy_feature
|
||||
from ._data import binarize
|
||||
from ._data import normalize
|
||||
from ._data import scale
|
||||
from ._data import robust_scale
|
||||
from ._data import maxabs_scale
|
||||
from ._data import minmax_scale
|
||||
from ._data import quantile_transform
|
||||
from ._data import power_transform
|
||||
from ._data import PowerTransformer
|
||||
|
||||
from ._encoders import OneHotEncoder
|
||||
from ._encoders import OrdinalEncoder
|
||||
|
||||
from ._label import label_binarize
|
||||
from ._label import LabelBinarizer
|
||||
from ._label import LabelEncoder
|
||||
from ._label import MultiLabelBinarizer
|
||||
|
||||
from ._discretization import KBinsDiscretizer
|
||||
|
||||
from ._polynomial import PolynomialFeatures
|
||||
from ._polynomial import SplineTransformer
|
||||
|
||||
|
||||
# Public names of the preprocessing package. Every entry is imported above
# from one of the private submodules (_data, _encoders, _label, ...).
__all__ = [
    "Binarizer",
    "FunctionTransformer",
    "KBinsDiscretizer",
    "KernelCenterer",
    "LabelBinarizer",
    "LabelEncoder",
    "MultiLabelBinarizer",
    "MinMaxScaler",
    "MaxAbsScaler",
    "QuantileTransformer",
    "Normalizer",
    "OneHotEncoder",
    "OrdinalEncoder",
    "PowerTransformer",
    "RobustScaler",
    "SplineTransformer",
    "StandardScaler",
    "add_dummy_feature",
    "PolynomialFeatures",
    "binarize",
    "normalize",
    "scale",
    "robust_scale",
    "maxabs_scale",
    "minmax_scale",
    "label_binarize",
    "quantile_transform",
    "power_transform",
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,454 @@
|
||||
# Author: Henry Lin <hlin117@gmail.com>
|
||||
# Tom Dupré la Tour
|
||||
|
||||
# License: BSD
|
||||
|
||||
|
||||
import numbers
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
from . import OneHotEncoder
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin
|
||||
from ..utils.validation import check_array
|
||||
from ..utils.validation import check_is_fitted
|
||||
from ..utils.validation import check_random_state
|
||||
from ..utils.validation import _check_feature_names_in
|
||||
from ..utils.validation import check_scalar
|
||||
from ..utils import _safe_indexing
|
||||
|
||||
|
||||
class KBinsDiscretizer(TransformerMixin, BaseEstimator):
|
||||
"""
|
||||
Bin continuous data into intervals.
|
||||
|
||||
Read more in the :ref:`User Guide <preprocessing_discretization>`.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_bins : int or array-like of shape (n_features,), default=5
|
||||
The number of bins to produce. Raises ValueError if ``n_bins < 2``.
|
||||
|
||||
encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
|
||||
Method used to encode the transformed result.
|
||||
|
||||
- 'onehot': Encode the transformed result with one-hot encoding
|
||||
and return a sparse matrix. Ignored features are always
|
||||
stacked to the right.
|
||||
- 'onehot-dense': Encode the transformed result with one-hot encoding
|
||||
and return a dense array. Ignored features are always
|
||||
stacked to the right.
|
||||
- 'ordinal': Return the bin identifier encoded as an integer value.
|
||||
|
||||
strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
|
||||
Strategy used to define the widths of the bins.
|
||||
|
||||
- 'uniform': All bins in each feature have identical widths.
|
||||
- 'quantile': All bins in each feature have the same number of points.
|
||||
- 'kmeans': Values in each bin have the same nearest center of a 1D
|
||||
k-means cluster.
|
||||
|
||||
dtype : {np.float32, np.float64}, default=None
|
||||
The desired data-type for the output. If None, output dtype is
|
||||
consistent with input dtype. Only np.float32 and np.float64 are
|
||||
supported.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
subsample : int or None, default='warn'
|
||||
Maximum number of samples, used to fit the model, for computational
|
||||
efficiency. Used when `strategy="quantile"`.
|
||||
`subsample=None` means that all the training samples are used when
|
||||
computing the quantiles that determine the binning thresholds.
|
||||
Since quantile computation relies on sorting each column of `X` and
|
||||
that sorting has an `n log(n)` time complexity,
|
||||
it is recommended to use subsampling on datasets with a
|
||||
very large number of samples.
|
||||
|
||||
.. deprecated:: 1.1
|
||||
In version 1.3 and onwards, `subsample=2e5` will be the default.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Determines random number generation for subsampling.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See the `subsample` parameter for more details.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
.. versionadded:: 1.1
|
||||
|
||||
Attributes
|
||||
----------
|
||||
bin_edges_ : ndarray of ndarray of shape (n_features,)
|
||||
The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
|
||||
Ignored features will have empty arrays.
|
||||
|
||||
n_bins_ : ndarray of shape (n_features,), dtype=np.int_
|
||||
Number of bins per feature. Bins whose width are too small
|
||||
(i.e., <= 1e-8) are removed with a warning.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
Binarizer : Class used to bin values as ``0`` or
|
||||
``1`` based on a parameter ``threshold``.
|
||||
|
||||
Notes
|
||||
-----
|
||||
In bin edges for feature ``i``, the first and last values are used only for
|
||||
``inverse_transform``. During transform, bin edges are extended to::
|
||||
|
||||
np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])
|
||||
|
||||
You can combine ``KBinsDiscretizer`` with
|
||||
:class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
|
||||
part of the features.
|
||||
|
||||
``KBinsDiscretizer`` might produce constant features (e.g., when
|
||||
``encode = 'onehot'`` and certain bins do not contain any data).
|
||||
These features can be removed with feature selection algorithms
|
||||
(e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.preprocessing import KBinsDiscretizer
|
||||
>>> X = [[-2, 1, -4, -1],
|
||||
... [-1, 2, -3, -0.5],
|
||||
... [ 0, 3, -2, 0.5],
|
||||
... [ 1, 4, -1, 2]]
|
||||
>>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
|
||||
>>> est.fit(X)
|
||||
KBinsDiscretizer(...)
|
||||
>>> Xt = est.transform(X)
|
||||
>>> Xt # doctest: +SKIP
|
||||
array([[ 0., 0., 0., 0.],
|
||||
[ 1., 1., 1., 0.],
|
||||
[ 2., 2., 2., 1.],
|
||||
[ 2., 2., 2., 2.]])
|
||||
|
||||
Sometimes it may be useful to convert the data back into the original
|
||||
feature space. The ``inverse_transform`` function converts the binned
|
||||
data into the original feature space. Each value will be equal to the mean
|
||||
of the two bin edges.
|
||||
|
||||
>>> est.bin_edges_[0]
|
||||
array([-2., -1., 0., 1.])
|
||||
>>> est.inverse_transform(Xt)
|
||||
array([[-1.5, 1.5, -3.5, -0.5],
|
||||
[-0.5, 2.5, -2.5, -0.5],
|
||||
[ 0.5, 3.5, -1.5, 0.5],
|
||||
[ 0.5, 3.5, -1.5, 1.5]])
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    n_bins=5,
    *,
    encode="onehot",
    strategy="quantile",
    dtype=None,
    subsample="warn",
    random_state=None,
):
    # The constructor only records the hyper-parameters unchanged; nothing
    # is validated or converted here.
    self.n_bins = n_bins
    self.encode = encode
    self.strategy = strategy
    self.dtype = dtype
    self.subsample = subsample
    self.random_state = random_state
|
||||
|
||||
def fit(self, X, y=None):
    """
    Fit the estimator.

    Computes the bin edges of every feature according to ``strategy`` and,
    for the one-hot encodings, prepares the internal ``OneHotEncoder``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to be discretized.

    y : None
        Ignored. This parameter exists only for compatibility with
        :class:`~sklearn.pipeline.Pipeline`.

    Returns
    -------
    self : object
        Returns the instance itself.

    Raises
    ------
    ValueError
        If ``dtype``, ``encode``, ``strategy`` or ``n_bins`` is invalid, or
        if an integer ``subsample`` is combined with a strategy other than
        ``"quantile"``.
    """
    X = self._validate_data(X, dtype="numeric")

    supported_dtype = (np.float64, np.float32)
    if self.dtype in supported_dtype:
        output_dtype = self.dtype
    elif self.dtype is None:
        output_dtype = X.dtype
    else:
        raise ValueError(
            "Valid options for 'dtype' are "
            f"{supported_dtype + (None,)}. Got dtype={self.dtype} "
            " instead."
        )

    # Validate the string options before anything depends on them, so that
    # an unknown strategy is reported as such instead of being blamed on
    # `subsample`, and so that no subsampling work is done for a fit that
    # is going to fail anyway.
    valid_encode = ("onehot", "onehot-dense", "ordinal")
    if self.encode not in valid_encode:
        raise ValueError(
            "Valid options for 'encode' are {}. Got encode={!r} instead.".format(
                valid_encode, self.encode
            )
        )
    valid_strategy = ("uniform", "quantile", "kmeans")
    if self.strategy not in valid_strategy:
        raise ValueError(
            "Valid options for 'strategy' are {}. "
            "Got strategy={!r} instead.".format(valid_strategy, self.strategy)
        )

    n_samples, n_features = X.shape

    if self.strategy == "quantile" and self.subsample is not None:
        if self.subsample == "warn":
            # Default value: no subsampling yet, only announce the change.
            if n_samples > 2e5:
                warnings.warn(
                    "In version 1.3 onwards, subsample=2e5 "
                    "will be used by default. Set subsample explicitly to "
                    "silence this warning in the mean time. Set "
                    "subsample=None to disable subsampling explicitly.",
                    FutureWarning,
                )
        else:
            # Keep the validated value in a local: `fit` must not write
            # back to the constructor parameter `self.subsample`.
            subsample = check_scalar(
                self.subsample, "subsample", numbers.Integral, min_val=1
            )
            rng = check_random_state(self.random_state)
            if n_samples > subsample:
                subsample_idx = rng.choice(n_samples, size=subsample, replace=False)
                X = _safe_indexing(X, subsample_idx)
    elif self.strategy != "quantile" and isinstance(
        self.subsample, numbers.Integral
    ):
        raise ValueError(
            f"Invalid parameter for `strategy`: {self.strategy}. "
            '`subsample` must be used with `strategy="quantile"`.'
        )

    # Subsampling only drops rows, so `n_features` is still accurate.
    n_bins = self._validate_n_bins(n_features)

    bin_edges = np.zeros(n_features, dtype=object)
    for jj in range(n_features):
        column = X[:, jj]
        col_min, col_max = column.min(), column.max()

        if col_min == col_max:
            warnings.warn(
                "Feature %d is constant and will be replaced with 0." % jj
            )
            n_bins[jj] = 1
            bin_edges[jj] = np.array([-np.inf, np.inf])
            continue

        if self.strategy == "uniform":
            bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

        elif self.strategy == "quantile":
            quantiles = np.linspace(0, 100, n_bins[jj] + 1)
            bin_edges[jj] = np.asarray(np.percentile(column, quantiles))

        elif self.strategy == "kmeans":
            from ..cluster import KMeans  # fixes import loops

            # Deterministic initialization with uniform spacing
            uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
            init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

            # 1D k-means procedure
            km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
            centers = km.fit(column[:, None]).cluster_centers_[:, 0]
            # Must sort, centers may be unsorted even with sorted init
            centers.sort()
            bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
            bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

        # Remove bins whose width are too small (i.e., <= 1e-8)
        if self.strategy in ("quantile", "kmeans"):
            mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
            bin_edges[jj] = bin_edges[jj][mask]
            if len(bin_edges[jj]) - 1 != n_bins[jj]:
                warnings.warn(
                    "Bins whose width are too small (i.e., <= "
                    "1e-8) in feature %d are removed. Consider "
                    "decreasing the number of bins." % jj
                )
                n_bins[jj] = len(bin_edges[jj]) - 1

    self.bin_edges_ = bin_edges
    self.n_bins_ = n_bins

    if "onehot" in self.encode:
        self._encoder = OneHotEncoder(
            categories=[np.arange(i) for i in self.n_bins_],
            sparse=self.encode == "onehot",
            dtype=output_dtype,
        )
        # Fit the OneHotEncoder with toy datasets
        # so that it's ready for use after the KBinsDiscretizer is fitted
        self._encoder.fit(np.zeros((1, len(self.n_bins_))))

    return self
|
||||
|
||||
def _validate_n_bins(self, n_features):
|
||||
"""Returns n_bins_, the number of bins per feature."""
|
||||
orig_bins = self.n_bins
|
||||
if isinstance(orig_bins, numbers.Number):
|
||||
if not isinstance(orig_bins, numbers.Integral):
|
||||
raise ValueError(
|
||||
"{} received an invalid n_bins type. "
|
||||
"Received {}, expected int.".format(
|
||||
KBinsDiscretizer.__name__, type(orig_bins).__name__
|
||||
)
|
||||
)
|
||||
if orig_bins < 2:
|
||||
raise ValueError(
|
||||
"{} received an invalid number "
|
||||
"of bins. Received {}, expected at least 2.".format(
|
||||
KBinsDiscretizer.__name__, orig_bins
|
||||
)
|
||||
)
|
||||
return np.full(n_features, orig_bins, dtype=int)
|
||||
|
||||
n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)
|
||||
|
||||
if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
|
||||
raise ValueError("n_bins must be a scalar or array of shape (n_features,).")
|
||||
|
||||
bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)
|
||||
|
||||
violating_indices = np.where(bad_nbins_value)[0]
|
||||
if violating_indices.shape[0] > 0:
|
||||
indices = ", ".join(str(i) for i in violating_indices)
|
||||
raise ValueError(
|
||||
"{} received an invalid number "
|
||||
"of bins at indices {}. Number of bins "
|
||||
"must be at least 2, and must be an int.".format(
|
||||
KBinsDiscretizer.__name__, indices
|
||||
)
|
||||
)
|
||||
return n_bins
|
||||
|
||||
def transform(self, X):
    """
    Discretize the data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to be discretized.

    Returns
    -------
    Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
        Data in the binned space. Will be a sparse matrix if
        `self.encode='onehot'` and ndarray otherwise.
    """
    check_is_fitted(self)

    # Accept both float widths unless the user pinned an output dtype.
    accepted = self.dtype if self.dtype is not None else (np.float64, np.float32)
    binned = self._validate_data(X, copy=True, dtype=accepted, reset=False)

    edges_per_feature = self.bin_edges_
    for col in range(binned.shape[1]):
        # Only the interior edges are used, so values outside the fitted
        # range fall into the first or last bin.
        interior = edges_per_feature[col][1:-1]
        binned[:, col] = np.searchsorted(interior, binned[:, col], side="right")

    if self.encode == "ordinal":
        return binned

    saved_dtype = None
    if "onehot" in self.encode:
        # Temporarily make the encoder emit the dtype of the binned data.
        saved_dtype = self._encoder.dtype
        self._encoder.dtype = binned.dtype
    try:
        encoded = self._encoder.transform(binned)
    finally:
        # revert the initial dtype to avoid modifying self.
        self._encoder.dtype = saved_dtype
    return encoded
|
||||
|
||||
def inverse_transform(self, Xt):
    """
    Transform discretized data back to original feature space.

    Every bin index is replaced by the midpoint of the two edges of that
    bin, so the original values are not recovered exactly.

    Parameters
    ----------
    Xt : array-like of shape (n_samples, n_features)
        Transformed data in the binned space.

    Returns
    -------
    Xinv : ndarray, dtype={np.float32, np.float64}
        Data in the original feature space.
    """
    check_is_fitted(self)

    # One-hot output is first mapped back to ordinal bin indices.
    if "onehot" in self.encode:
        Xt = self._encoder.inverse_transform(Xt)

    Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
    expected = self.n_bins_.shape[0]
    received = Xinv.shape[1]
    if received != expected:
        raise ValueError(
            f"Incorrect number of features. Expecting {expected}, "
            f"received {received}."
        )

    for col in range(expected):
        edges = self.bin_edges_[col]
        midpoints = (edges[1:] + edges[:-1]) * 0.5
        Xinv[:, col] = midpoints[np.int_(Xinv[:, col])]

    return Xinv
|
||||
|
||||
def get_feature_names_out(self, input_features=None):
    """Get output feature names.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input features.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then the following input feature names are generated:
          `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    names_in = _check_feature_names_in(self, input_features)

    # No internal encoder exists: the validated input names are returned
    # unchanged (ordinal encoding).
    if not hasattr(self, "_encoder"):
        return names_in

    return self._encoder.get_feature_names_out(names_in)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,299 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin
|
||||
from ..utils.metaestimators import available_if
|
||||
from ..utils.validation import (
|
||||
_allclose_dense_sparse,
|
||||
_check_feature_names_in,
|
||||
check_array,
|
||||
)
|
||||
|
||||
|
||||
def _identity(X):
    """The identity function: return `X` itself, unchanged and uncopied."""
    return X
|
||||
|
||||
|
||||
class FunctionTransformer(TransformerMixin, BaseEstimator):
|
||||
"""Constructs a transformer from an arbitrary callable.
|
||||
|
||||
A FunctionTransformer forwards its X (and optionally y) arguments to a
|
||||
user-defined function or function object and returns the result of this
|
||||
function. This is useful for stateless transformations such as taking the
|
||||
log of frequencies, doing custom scaling, etc.
|
||||
|
||||
Note: If a lambda is used as the function, then the resulting
|
||||
transformer will not be pickleable.
|
||||
|
||||
.. versionadded:: 0.17
|
||||
|
||||
Read more in the :ref:`User Guide <function_transformer>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : callable, default=None
|
||||
The callable to use for the transformation. This will be passed
|
||||
the same arguments as transform, with args and kwargs forwarded.
|
||||
If func is None, then func will be the identity function.
|
||||
|
||||
inverse_func : callable, default=None
|
||||
The callable to use for the inverse transformation. This will be
|
||||
passed the same arguments as inverse transform, with args and
|
||||
kwargs forwarded. If inverse_func is None, then inverse_func
|
||||
will be the identity function.
|
||||
|
||||
validate : bool, default=False
|
||||
Indicate that the input X array should be checked before calling
|
||||
``func``. The possibilities are:
|
||||
|
||||
- If False, there is no input validation.
|
||||
- If True, then X will be converted to a 2-dimensional NumPy array or
|
||||
sparse matrix. If the conversion is not possible an exception is
|
||||
raised.
|
||||
|
||||
.. versionchanged:: 0.22
|
||||
The default of ``validate`` changed from True to False.
|
||||
|
||||
accept_sparse : bool, default=False
|
||||
Indicate that func accepts a sparse matrix as input. If validate is
|
||||
False, this has no effect. Otherwise, if accept_sparse is false,
|
||||
sparse matrix inputs will cause an exception to be raised.
|
||||
|
||||
check_inverse : bool, default=True
|
||||
Whether to check that ``func`` followed by ``inverse_func`` leads to
|
||||
the original inputs. It can be used for a sanity check, raising a
|
||||
warning when the condition is not fulfilled.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
feature_names_out : callable, 'one-to-one' or None, default=None
|
||||
Determines the list of feature names that will be returned by the
|
||||
`get_feature_names_out` method. If it is 'one-to-one', then the output
|
||||
feature names will be equal to the input feature names. If it is a
|
||||
callable, then it must take two positional arguments: this
|
||||
`FunctionTransformer` (`self`) and an array-like of input feature names
|
||||
(`input_features`). It must return an array-like of output feature
|
||||
names. The `get_feature_names_out` method is only defined if
|
||||
`feature_names_out` is not None.
|
||||
|
||||
See ``get_feature_names_out`` for more details.
|
||||
|
||||
.. versionadded:: 1.1
|
||||
|
||||
kw_args : dict, default=None
|
||||
Dictionary of additional keyword arguments to pass to func.
|
||||
|
||||
.. versionadded:: 0.18
|
||||
|
||||
inv_kw_args : dict, default=None
|
||||
Dictionary of additional keyword arguments to pass to inverse_func.
|
||||
|
||||
.. versionadded:: 0.18
|
||||
|
||||
Attributes
|
||||
----------
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`. Defined only when
|
||||
`validate=True`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `validate=True`
|
||||
and `X` has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
MaxAbsScaler : Scale each feature by its maximum absolute value.
|
||||
StandardScaler : Standardize features by removing the mean and
|
||||
scaling to unit variance.
|
||||
LabelBinarizer : Binarize labels in a one-vs-all fashion.
|
||||
MultiLabelBinarizer : Transform between iterable of iterables
|
||||
and a multilabel format.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.preprocessing import FunctionTransformer
|
||||
>>> transformer = FunctionTransformer(np.log1p)
|
||||
>>> X = np.array([[0, 1], [2, 3]])
|
||||
>>> transformer.transform(X)
|
||||
array([[0. , 0.6931...],
|
||||
[1.0986..., 1.3862...]])
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    func=None,
    inverse_func=None,
    *,
    validate=False,
    accept_sparse=False,
    check_inverse=True,
    feature_names_out=None,
    kw_args=None,
    inv_kw_args=None,
):
    # The constructor only records the parameters unchanged; nothing is
    # validated or called here.
    self.func = func
    self.inverse_func = inverse_func
    self.validate = validate
    self.accept_sparse = accept_sparse
    self.check_inverse = check_inverse
    self.feature_names_out = feature_names_out
    self.kw_args = kw_args
    self.inv_kw_args = inv_kw_args
|
||||
|
||||
def _check_input(self, X, *, reset):
    """Return `X` validated when ``validate`` is set, else `X` untouched.

    `reset` is forwarded to ``_validate_data``.
    """
    if not self.validate:
        return X
    return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)
|
||||
|
||||
def _check_inverse_transform(self, X):
    """Check that func and inverse_func are the inverse.

    Every ``max(1, n_samples // 100)``-th row of `X` is sent through
    ``transform`` and then ``inverse_transform``; a ``UserWarning`` is
    emitted when the result is not close to the selected rows.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data used for the round trip. It must expose ``shape``.

    Raises
    ------
    ValueError
        If `X` holds non-numerical data, for which closeness cannot be
        evaluated.
    """

    def _is_numeric(dtype):
        try:
            return np.issubdtype(dtype, np.number)
        except TypeError:
            # NumPy cannot interpret extension dtypes. Fall back on the
            # dtype's kind code -- assumes such dtypes expose `kind` like
            # NumPy dtypes do; anything else is treated as non-numeric.
            return getattr(dtype, "kind", "O") in "iufc"

    idx_selected = slice(None, None, max(1, X.shape[0] // 100))

    # Containers with per-column dtypes (e.g. DataFrames) expose `dtypes`
    # rather than a single `dtype`; support both instead of failing with an
    # AttributeError.
    if hasattr(X, "dtype"):
        dtypes = [X.dtype]
    elif hasattr(X, "dtypes"):
        dtypes = list(X.dtypes)
    else:
        dtypes = [np.asarray(X).dtype]

    # Reject non-numerical data before calling the user functions, so the
    # caller gets this explicit error rather than whatever the functions
    # raise on such input.
    if not all(_is_numeric(dtype) for dtype in dtypes):
        raise ValueError(
            "'check_inverse' is only supported when all the elements in `X` is"
            " numerical."
        )

    X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))

    if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
        warnings.warn(
            "The provided functions are not strictly"
            " inverse of each other. If you are sure you"
            " want to proceed regardless, set"
            " 'check_inverse=False'.",
            UserWarning,
        )
|
||||
|
||||
def fit(self, X, y=None):
    """Fit transformer by checking X.

    If ``validate`` is ``True``, ``X`` will be checked. When
    ``check_inverse`` is set and both ``func`` and ``inverse_func`` were
    given, the two functions are also checked to be inverses of each other.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input array.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    self : object
        FunctionTransformer class instance.
    """
    checked = self._check_input(X, reset=True)
    if (
        self.check_inverse
        and self.func is not None
        and self.inverse_func is not None
    ):
        self._check_inverse_transform(checked)
    return self
|
||||
|
||||
def transform(self, X):
    """Transform X using the forward function.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input array.

    Returns
    -------
    X_out : array-like, shape (n_samples, n_features)
        Transformed input.
    """
    checked = self._check_input(X, reset=False)
    forward, forward_kwargs = self.func, self.kw_args
    return self._transform(checked, func=forward, kw_args=forward_kwargs)
|
||||
|
||||
def inverse_transform(self, X):
    """Transform X using the inverse function.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input array.

    Returns
    -------
    X_out : array-like, shape (n_samples, n_features)
        Transformed input.
    """
    # Validation is only performed when requested at construction time.
    data = check_array(X, accept_sparse=self.accept_sparse) if self.validate else X
    return self._transform(data, func=self.inverse_func, kw_args=self.inv_kw_args)
|
||||
|
||||
@available_if(lambda self: self.feature_names_out is not None)
def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    This method is only defined if `feature_names_out` is not None.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input feature names.

        - If `input_features` is None, then `feature_names_in_` is
          used as the input feature names. If `feature_names_in_` is not
          defined, then names are generated:
          `[x0, x1, ..., x(n_features_in_ - 1)]`.
        - If `input_features` is array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.

        - If `feature_names_out` is 'one-to-one', the input feature names
          are returned (see `input_features` above). This requires
          `feature_names_in_` and/or `n_features_in_` to be defined, which
          is done automatically if `validate=True`. Alternatively, you can
          set them in `func`.
        - If `feature_names_out` is a callable, then it is called with two
          arguments, `self` and `input_features`, and its return value is
          returned by this method.
    """
    spec = self.feature_names_out

    # Input names can only be checked when there is something to check
    # them against: explicit names or a recorded number of features.
    if input_features is not None or hasattr(self, "n_features_in_"):
        input_features = _check_feature_names_in(self, input_features)

    if spec == "one-to-one":
        if input_features is None:
            raise ValueError(
                "When 'feature_names_out' is 'one-to-one', either "
                "'input_features' must be passed, or 'feature_names_in_' "
                "and/or 'n_features_in_' must be defined. If you set "
                "'validate' to 'True', then they will be defined "
                "automatically when 'fit' is called. Alternatively, you "
                "can set them in 'func'."
            )
        return np.asarray(input_features, dtype=object)

    if callable(spec):
        return np.asarray(spec(self, input_features), dtype=object)

    raise ValueError(
        f"feature_names_out={spec!r} is invalid. "
        'It must either be "one-to-one" or a callable with two '
        "arguments: the function transformer and an array-like of "
        "input feature names. The callable must return an array-like "
        "of output feature names."
    )
|
||||
|
||||
def _transform(self, X, func=None, kw_args=None):
    """Apply `func` (the identity when None) to `X` with `kw_args`."""
    callback = _identity if func is None else func
    extra = kw_args if kw_args else {}
    return callback(X, **extra)
|
||||
|
||||
def __sklearn_is_fitted__(self):
    """Return True since FunctionTransformer is stateless."""
    return True
|
||||
|
||||
def _more_tags(self):
    """Return the estimator tags specific to this transformer."""
    # Input validation is skipped exactly when `validate` is off.
    skips_validation = not self.validate
    return {"no_validation": skips_validation, "stateless": True}
|
||||
@@ -0,0 +1,929 @@
|
||||
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Mathieu Blondel <mathieu@mblondel.org>
|
||||
# Olivier Grisel <olivier.grisel@ensta.org>
|
||||
# Andreas Mueller <amueller@ais.uni-bonn.de>
|
||||
# Joel Nothman <joel.nothman@gmail.com>
|
||||
# Hamzeh Alsalhi <ha258@cornell.edu>
|
||||
# License: BSD 3 clause
|
||||
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
import array
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin
|
||||
|
||||
from ..utils.sparsefuncs import min_max_axis
|
||||
from ..utils import column_or_1d
|
||||
from ..utils.validation import _num_samples, check_array, check_is_fitted
|
||||
from ..utils.multiclass import unique_labels
|
||||
from ..utils.multiclass import type_of_target
|
||||
from ..utils._encode import _encode, _unique
|
||||
|
||||
|
||||
# Names exported by ``from <this module> import *``.
__all__ = [
    "label_binarize",
    "LabelBinarizer",
    "LabelEncoder",
    "MultiLabelBinarizer",
]
|
||||
|
||||
|
||||
class LabelEncoder(TransformerMixin, BaseEstimator):
    """Encode target labels with value between 0 and n_classes-1.

    This transformer is meant for the target values, *i.e.* `y`, and not for
    the input `X`.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    .. versionadded:: 0.12

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    See Also
    --------
    OrdinalEncoder : Encode categorical features using an ordinal encoding
        scheme.
    OneHotEncoder : Encode categorical features as a one-hot numeric array.

    Examples
    --------
    `LabelEncoder` can be used to normalize labels.

    >>> from sklearn import preprocessing
    >>> le = preprocessing.LabelEncoder()
    >>> le.fit([1, 2, 2, 6])
    LabelEncoder()
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])

    It can also be used to transform non-numerical labels (as long as they are
    hashable and comparable) to numerical labels.

    >>> le = preprocessing.LabelEncoder()
    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
    LabelEncoder()
    >>> list(le.classes_)
    ['amsterdam', 'paris', 'tokyo']
    >>> le.transform(["tokyo", "tokyo", "paris"])
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]))
    ['tokyo', 'tokyo', 'paris']
    """

    def fit(self, y):
        """Fit label encoder.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted label encoder.
        """
        labels = column_or_1d(y, warn=True)
        self.classes_ = _unique(labels)
        return self

    def fit_transform(self, y):
        """Fit label encoder and return encoded labels.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Encoded labels.
        """
        labels = column_or_1d(y, warn=True)
        # A single pass yields both the classes and the code of every sample.
        uniques, codes = _unique(labels, return_inverse=True)
        self.classes_ = uniques
        return codes

    def transform(self, y):
        """Transform labels to normalized encoding.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Labels as normalized encodings.
        """
        check_is_fitted(self)
        labels = column_or_1d(y, warn=True)

        if _num_samples(labels) != 0:
            return _encode(labels, uniques=self.classes_)

        # Nothing to encode: an empty input maps to an empty array.
        return np.array([])

    def inverse_transform(self, y):
        """Transform labels back to original encoding.

        Parameters
        ----------
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            Original encoding.

        Raises
        ------
        ValueError
            If `y` contains a value outside ``range(len(classes_))``.
        """
        check_is_fitted(self)
        codes = column_or_1d(y, warn=True)

        # Nothing to decode: an empty input maps to an empty array.
        if _num_samples(codes) == 0:
            return np.array([])

        valid_codes = np.arange(len(self.classes_))
        unseen = np.setdiff1d(codes, valid_codes)
        if len(unseen) > 0:
            raise ValueError("y contains previously unseen labels: %s" % str(unseen))

        codes = np.asarray(codes)
        return self.classes_[codes]

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}
|
||||
|
||||
|
||||
class LabelBinarizer(TransformerMixin, BaseEstimator):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    At learning time, this simply consists in learning one regressor
    or binary classifier per class. In doing so, one needs to convert
    multi-class labels to binary labels (belong or does not belong
    to the class). LabelBinarizer makes this process easy with the
    transform method.

    At prediction time, one assigns the class for which the corresponding
    model gave the greatest confidence. LabelBinarizer makes this easy
    with the inverse_transform method.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    Parameters
    ----------
    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        True if the returned array from transform is desired to be in sparse
        CSR format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    y_type_ : str
        Represents the type of the target data as evaluated by
        utils.multiclass.type_of_target. Possible type are 'continuous',
        'continuous-multioutput', 'binary', 'multiclass',
        'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.

    sparse_input_ : bool
        True if the input data to transform is given as a sparse matrix, False
        otherwise.

    See Also
    --------
    label_binarize : Function to perform the transform operation of
        LabelBinarizer with fixed classes.
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn import preprocessing
    >>> lb = preprocessing.LabelBinarizer()
    >>> lb.fit([1, 2, 6, 4, 2])
    LabelBinarizer()
    >>> lb.classes_
    array([1, 2, 4, 6])
    >>> lb.transform([1, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    Binary targets transform to a column vector

    >>> lb = preprocessing.LabelBinarizer()
    >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])

    Passing a 2D matrix for multilabel classification

    >>> import numpy as np
    >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
    LabelBinarizer()
    >>> lb.classes_
    array([0, 1, 2])
    >>> lb.transform([0, 1, 2, 1])
    array([[1, 0, 0],
           [0, 1, 0],
           [0, 0, 1],
           [0, 1, 0]])
    """

    def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):

        self.neg_label = neg_label
        self.pos_label = pos_label
        self.sparse_output = sparse_output

    def fit(self, y):
        """Fit label binarizer.

        Parameters
        ----------
        y : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If `neg_label >= pos_label`, if `sparse_output` is requested with
            a zero `pos_label` or a non-zero `neg_label`, if `y` is a
            multioutput target, or if `y` has no samples.
        """

        if self.neg_label >= self.pos_label:
            raise ValueError(
                f"neg_label={self.neg_label} must be strictly less than "
                f"pos_label={self.pos_label}."
            )

        if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0):
            raise ValueError(
                "Sparse binarization is only supported with non "
                "zero pos_label and zero neg_label, got "
                f"pos_label={self.pos_label} and neg_label={self.neg_label}"
            )

        self.y_type_ = type_of_target(y, input_name="y")

        if "multioutput" in self.y_type_:
            raise ValueError(
                "Multioutput target data is not supported with label binarization"
            )
        if _num_samples(y) == 0:
            # Use an f-string rather than ``"%r" % y``: %-formatting unpacks a
            # tuple operand, so an empty tuple would raise TypeError instead
            # of the intended ValueError.
            raise ValueError(f"y has 0 samples: {y!r}")

        self.sparse_input_ = sp.issparse(y)
        self.classes_ = unique_labels(y)
        return self

    def fit_transform(self, y):
        """Fit label binarizer/transform multi-class labels to binary labels.

        The output of transform is sometimes referred to as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {ndarray, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.
        """
        return self.fit(y).transform(y)

    def transform(self, y):
        """Transform multi-class labels to binary labels.

        The output of transform is sometimes referred to by some authors as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {array, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.

        Raises
        ------
        ValueError
            If `y` is multilabel but the binarizer was not fitted on
            multilabel data.
        """
        check_is_fitted(self)

        y_is_multilabel = type_of_target(y).startswith("multilabel")
        if y_is_multilabel and not self.y_type_.startswith("multilabel"):
            raise ValueError("The object was not fitted with multilabel input.")

        return label_binarize(
            y,
            classes=self.classes_,
            pos_label=self.pos_label,
            neg_label=self.neg_label,
            sparse_output=self.sparse_output,
        )

    def inverse_transform(self, Y, threshold=None):
        """Transform binary labels back to multi-class labels.

        Parameters
        ----------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Target values. All sparse matrices are converted to CSR before
            inverse transformation.

        threshold : float, default=None
            Threshold used in the binary and multi-label cases.

            Use 0 when ``Y`` contains the output of decision_function
            (classifier).
            Use 0.5 when ``Y`` contains the output of predict_proba.

            If None, the threshold is assumed to be half way between
            neg_label and pos_label.

        Returns
        -------
        y : {ndarray, sparse matrix} of shape (n_samples,)
            Target values. Sparse matrix will be of CSR format.

        Notes
        -----
        In the case when the binary labels are fractional
        (probabilistic), inverse_transform chooses the class with the
        greatest value. Typically, this allows to use the output of a
        linear model's decision_function method directly as the input
        of inverse_transform.
        """
        check_is_fitted(self)

        if threshold is None:
            threshold = (self.pos_label + self.neg_label) / 2.0

        # Multiclass targets are decoded by arg-max; every other target type
        # is decoded by thresholding.
        if self.y_type_ == "multiclass":
            y_inv = _inverse_binarize_multiclass(Y, self.classes_)
        else:
            y_inv = _inverse_binarize_thresholding(
                Y, self.y_type_, self.classes_, threshold
            )

        # Return the same container kind (sparse/dense) that was seen in fit.
        if self.sparse_input_:
            y_inv = sp.csr_matrix(y_inv)
        elif sp.issparse(y_inv):
            y_inv = y_inv.toarray()

        return y_inv

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}
|
||||
|
||||
|
||||
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    This function makes it possible to compute this transformation for a
    fixed set of class labels known ahead of time.

    Parameters
    ----------
    y : array-like
        Sequence of integer labels or multilabel data to encode.

    classes : array-like of shape (n_classes,)
        Uniquely holds the label for each class.

    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        Set to true if output binary array is desired in CSR sparse format.

    Returns
    -------
    Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
        Shape will be (n_samples, 1) for binary problems. Sparse matrix will
        be of CSR format.

    Raises
    ------
    ValueError
        If `y` is an empty list, if `neg_label >= pos_label`, if
        `sparse_output` is requested with a zero `pos_label` or a non-zero
        `neg_label`, if the target type is multioutput, unknown or otherwise
        unsupported, or if a multilabel `y` does not have one column per
        entry of `classes`.

    See Also
    --------
    LabelBinarizer : Class used to wrap the functionality of label_binarize and
        allow for fitting to classes independently of the transform operation.

    Examples
    --------
    >>> from sklearn.preprocessing import label_binarize
    >>> label_binarize([1, 6], classes=[1, 2, 4, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    The class ordering is preserved:

    >>> label_binarize([1, 6], classes=[1, 6, 4, 2])
    array([[1, 0, 0, 0],
           [0, 1, 0, 0]])

    Binary targets transform to a column vector

    >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])
    """
    if not isinstance(y, list):
        # XXX Workaround that will be removed when list of list format is
        # dropped
        y = check_array(
            y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
        )
    elif _num_samples(y) == 0:
        raise ValueError(f"y has 0 samples: {y!r}")
    if neg_label >= pos_label:
        raise ValueError(
            f"neg_label={neg_label} must be strictly less than "
            f"pos_label={pos_label}."
        )

    if sparse_output and (pos_label == 0 or neg_label != 0):
        raise ValueError(
            "Sparse binarization is only supported with non "
            "zero pos_label and zero neg_label, got "
            f"pos_label={pos_label} and neg_label={neg_label}"
        )

    # To account for pos_label == 0 in the dense case: positives are
    # temporarily written as -neg_label so they stay distinguishable from the
    # implicit zeros, and are switched back to 0 at the end.
    pos_switch = pos_label == 0
    if pos_switch:
        pos_label = -neg_label

    y_type = type_of_target(y)
    if "multioutput" in y_type:
        raise ValueError(
            "Multioutput target data is not supported with label binarization"
        )
    if y_type == "unknown":
        raise ValueError("The type of target data is not known")

    n_samples = y.shape[0] if sp.issparse(y) else len(y)
    n_classes = len(classes)
    classes = np.asarray(classes)

    if y_type == "binary":
        if n_classes == 1:
            # Only one known class: every sample is encoded as negative.
            if sparse_output:
                return sp.csr_matrix((n_samples, 1), dtype=int)
            else:
                Y = np.zeros((len(y), 1), dtype=int)
                Y += neg_label
                return Y
        elif n_classes >= 3:
            # Binary-looking data but three or more classes were requested:
            # emit one column per class.
            y_type = "multiclass"

    sorted_class = np.sort(classes)
    if y_type == "multilabel-indicator":
        y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0])
        if classes.size != y_n_classes:
            raise ValueError(
                f"classes {classes} mismatch with the labels "
                f"{unique_labels(y)} found in the data"
            )

    if y_type in ("binary", "multiclass"):
        y = column_or_1d(y)

        # pick out the known labels from y
        # (np.isin replaces np.in1d, which is deprecated since NumPy 2.0;
        # y is 1-d here so both return the same mask)
        y_in_classes = np.isin(y, classes)
        y_seen = y[y_in_classes]
        indices = np.searchsorted(sorted_class, y_seen)
        # One stored entry per sample with a known label, none otherwise.
        indptr = np.hstack((0, np.cumsum(y_in_classes)))

        data = np.empty_like(indices)
        data.fill(pos_label)
        Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes))
    elif y_type == "multilabel-indicator":
        Y = sp.csr_matrix(y)
        if pos_label != 1:
            data = np.empty_like(Y.data)
            data.fill(pos_label)
            Y.data = data
    else:
        raise ValueError(f"{y_type} target data is not supported with label binarization")

    if not sparse_output:
        Y = Y.toarray()
        Y = Y.astype(int, copy=False)

        if neg_label != 0:
            Y[Y == 0] = neg_label

        if pos_switch:
            Y[Y == pos_label] = 0
    else:
        Y.data = Y.data.astype(int, copy=False)

    # preserve label ordering
    if np.any(classes != sorted_class):
        indices = np.searchsorted(sorted_class, classes)
        Y = Y[:, indices]

    if y_type == "binary":
        # Binary problems keep only the last column.
        if sparse_output:
            Y = Y.getcol(-1)
        else:
            Y = Y[:, -1].reshape((-1, 1))

    return Y
|
||||
|
||||
|
||||
def _inverse_binarize_multiclass(y, classes):
    """Inverse label binarization transformation for multiclass.

    Multiclass uses the maximal score instead of a threshold: every row of
    `y` is mapped to the entry of `classes` located at the column holding the
    row maximum.

    Parameters
    ----------
    y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
        Score / indicator matrix to decode.

    classes : array-like of shape (n_classes,)
        Label associated with each column of `y`.

    Returns
    -------
    ndarray of shape (n_samples,)
        The decoded label of each row.
    """
    classes = np.asarray(classes)

    if sp.issparse(y):
        # Find the argmax for each row in y where y is a CSR matrix

        y = y.tocsr()
        n_samples, n_outputs = y.shape
        outputs = np.arange(n_outputs)
        row_max = min_max_axis(y, 1)[1]
        row_nnz = np.diff(y.indptr)

        y_data_repeated_max = np.repeat(row_max, row_nnz)
        # picks out all indices obtaining the maximum per row
        y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)

        # For corner case where last row has a max of 0
        if row_max[-1] == 0:
            y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])

        # Gets the index of the first argmax in each row from y_i_all_argmax
        index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
        # first argmax of each row
        y_ind_ext = np.append(y.indices, [0])
        y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
        # Handle rows of all 0
        y_i_argmax[np.where(row_nnz == 0)[0]] = 0

        # Handles rows with max of 0 that contain negative numbers
        samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
        for i in samples:
            ind = y.indices[y.indptr[i] : y.indptr[i + 1]]
            # Columns that are not stored for this row, i.e. implicit zeros.
            implicit_zero_cols = np.setdiff1d(outputs, ind)
            # y_i_argmax holds column *indices* (it is used to index
            # ``classes`` below), so store the column index, not the label.
            # If every column is stored, the maximum is an explicit zero that
            # was already located above, so there is nothing to override.
            if implicit_zero_cols.size:
                y_i_argmax[i] = implicit_zero_cols[0]

        return classes[y_i_argmax]
    else:
        # mode="clip" clamps out-of-range indices instead of raising.
        return classes.take(y.argmax(axis=1), mode="clip")
|
||||
|
||||
|
||||
def _inverse_binarize_thresholding(y, output_type, classes, threshold):
    """Inverse label binarization transformation using thresholding.

    Parameters
    ----------
    y : {ndarray, sparse matrix}
        Scores to decode. Entries strictly greater than `threshold` become 1,
        all others 0. The input is never modified.

    output_type : str
        Target type to decode to: ``"binary"`` or ``"multilabel-indicator"``.

    classes : array-like
        Labels used to decode binary targets.

    threshold : float
        Decision threshold applied to `y`.

    Returns
    -------
    {ndarray, sparse matrix}
        For ``"binary"``, an array holding one entry of `classes` per sample.
        For ``"multilabel-indicator"``, the thresholded 0/1 matrix (sparse
        when `y` is sparse and `threshold > 0`, dense otherwise).

    Raises
    ------
    ValueError
        If `output_type` is ``"binary"`` and `y` has more than two columns, if
        a non-binary `y` does not have ``len(classes)`` columns, or if
        `output_type` is not supported.
    """

    if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
        raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape))

    if output_type != "binary" and y.shape[1] != len(classes):
        raise ValueError(
            "The number of class is not equal to the number of dimension of y."
        )

    classes = np.asarray(classes)

    # Perform thresholding
    if sp.issparse(y):
        if threshold > 0:
            # With a positive threshold the implicit zeros stay zero, so the
            # result can remain sparse. Work on a private matrix so that the
            # caller's data is not overwritten: CSR/CSC inputs are copied,
            # other formats are converted (which builds a new matrix).
            if y.format in ("csr", "csc"):
                y = y.copy()
            else:
                y = y.tocsr()
            y.data = np.array(y.data > threshold, dtype=int)
            y.eliminate_zeros()
        else:
            # Implicit zeros may exceed a non-positive threshold: densify.
            y = np.array(y.toarray() > threshold, dtype=int)
    else:
        y = np.array(y > threshold, dtype=int)

    # Inverse transform data
    if output_type == "binary":
        if sp.issparse(y):
            y = y.toarray()
        if y.ndim == 2 and y.shape[1] == 2:
            # Two columns: the second one is the indicator that is decoded.
            return classes[y[:, 1]]
        else:
            if len(classes) == 1:
                return np.repeat(classes[0], len(y))
            else:
                return classes[y.ravel()]

    elif output_type == "multilabel-indicator":
        return y

    else:
        raise ValueError("{0} format is not supported".format(output_type))
|
||||
|
||||
|
||||
class MultiLabelBinarizer(TransformerMixin, BaseEstimator):
    """Transform between iterable of iterables and a multilabel format.

    Although a list of sets or tuples is a very intuitive format for multilabel
    data, it is unwieldy to process. This transformer converts between this
    intuitive format and the supported multilabel format: a (samples x classes)
    binary matrix indicating the presence of a class label.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Indicates an ordering for the class labels.
        All entries should be unique (cannot contain duplicate classes).

    sparse_output : bool, default=False
        Set to True if output binary array is desired in CSR sparse format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        A copy of the `classes` parameter when provided.
        Otherwise it corresponds to the sorted set of classes found
        when fitting.

    See Also
    --------
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import MultiLabelBinarizer
    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit_transform([(1, 2), (3,)])
    array([[1, 1, 0],
           [0, 0, 1]])
    >>> mlb.classes_
    array([1, 2, 3])

    >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
    array([[0, 1, 1],
           [1, 0, 0]])
    >>> list(mlb.classes_)
    ['comedy', 'sci-fi', 'thriller']

    A common mistake is to pass in a list, which leads to the following issue:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
        'y'], dtype=object)

    To correct this, the list of labels should be passed in as:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['comedy', 'sci-fi', 'thriller'], dtype=object)
    """

    def __init__(self, *, classes=None, sparse_output=False):
        self.classes = classes
        self.sparse_output = sparse_output

    def fit(self, y):
        """Fit the label sets binarizer, storing :term:`classes_`.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        self : object
            Fitted estimator.

        Raises
        ------
        ValueError
            If the `classes` parameter contains duplicates.
        """
        # Drop the label -> column cache built for a previous fit.
        self._cached_dict = None
        if self.classes is None:
            classes = sorted(set(itertools.chain.from_iterable(y)))
        elif len(set(self.classes)) < len(self.classes):
            raise ValueError(
                "The classes argument contains duplicate "
                "classes. Remove these duplicates before passing "
                "them to MultiLabelBinarizer."
            )
        else:
            classes = self.classes
        # Fill a pre-allocated array element-wise so that sequence-like
        # labels (e.g. tuples) are stored as single objects.
        dtype = int if all(isinstance(c, int) for c in classes) else object
        self.classes_ = np.empty(len(classes), dtype=dtype)
        self.classes_[:] = classes
        return self

    def fit_transform(self, y):
        """Fit the label sets binarizer and transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample.

        Returns
        -------
        y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`
            is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR
            format.
        """
        self._cached_dict = None

        if self.classes is not None:
            return self.fit(y).transform(y)

        # Automatically increment on new class
        class_mapping = defaultdict(int)
        class_mapping.default_factory = class_mapping.__len__
        yt = self._transform(y, class_mapping)

        # sort classes and reorder columns
        tmp = sorted(class_mapping, key=class_mapping.get)

        # (make safe for tuples)
        dtype = int if all(isinstance(c, int) for c in tmp) else object
        class_mapping = np.empty(len(tmp), dtype=dtype)
        class_mapping[:] = tmp
        self.classes_, inverse = np.unique(class_mapping, return_inverse=True)
        # ensure yt.indices keeps its current dtype
        # (np.asarray copies only when a dtype conversion requires it;
        # ``np.array(..., copy=False)`` raises under NumPy >= 2 whenever a
        # copy cannot be avoided)
        yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def transform(self, y):
        """Transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. Labels that are not in `classes_` are ignored with a
            warning.

        Returns
        -------
        y_indicator : array or CSR matrix, shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in
            `y[i]`, and 0 otherwise.
        """
        check_is_fitted(self)

        class_to_index = self._build_cache()
        yt = self._transform(y, class_to_index)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def _build_cache(self):
        """Return the label -> column index dict, building it on first use."""
        if self._cached_dict is None:
            self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))

        return self._cached_dict

    def _transform(self, y, class_mapping):
        """Transforms the label sets with a given mapping.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample.

        class_mapping : Mapping
            Maps from label to column index in label indicator matrix.

        Returns
        -------
        y_indicator : sparse matrix of shape (n_samples, n_classes)
            Label indicator matrix. Will be of CSR format.
        """
        indices = array.array("i")
        indptr = array.array("i", [0])
        unknown = set()
        for labels in y:
            # A set, so a label repeated within one sample is stored once.
            index = set()
            for label in labels:
                try:
                    index.add(class_mapping[label])
                except KeyError:
                    unknown.add(label)
            indices.extend(index)
            indptr.append(len(indices))
        if unknown:
            warnings.warn(
                "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))
            )
        data = np.ones(len(indices), dtype=int)

        return sp.csr_matrix(
            (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))
        )

    def inverse_transform(self, yt):
        """Transform the given indicator matrix into label sets.

        Parameters
        ----------
        yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix containing only 1s and 0s.

        Returns
        -------
        y : list of tuples
            The set of labels for each sample such that `y[i]` consists of
            `classes_[j]` for each `yt[i, j] == 1`.

        Raises
        ------
        ValueError
            If `yt` does not have one column per class, or contains values
            other than 0 and 1.
        """
        check_is_fitted(self)

        if yt.shape[1] != len(self.classes_):
            raise ValueError(
                "Expected indicator for {0} classes, but got {1}".format(
                    len(self.classes_), yt.shape[1]
                )
            )

        if sp.issparse(yt):
            yt = yt.tocsr()
            if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:
                raise ValueError("Expected only 0s and 1s in label indicator.")
            # Each CSR row lists the columns (classes) stored for that sample.
            return [
                tuple(self.classes_.take(yt.indices[start:end]))
                for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
            ]
        else:
            unexpected = np.setdiff1d(yt, [0, 1])
            if len(unexpected) > 0:
                raise ValueError(
                    "Expected only 0s and 1s in label indicator. Also got {0}".format(
                        unexpected
                    )
                )
            return [tuple(self.classes_.compress(indicators)) for indicators in yt]

    def _more_tags(self):
        return {"X_types": ["2dlabels"]}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,22 @@
|
||||
import os
|
||||
|
||||
|
||||
def configuration(parent_package="", top_path=None):
    """Return the numpy.distutils ``Configuration`` of the preprocessing package.

    Registers the ``_csr_polynomial_expansion`` extension and the ``tests``
    subpackage.
    """
    import numpy
    from numpy.distutils.misc_util import Configuration

    # The extra "m" library is only requested on POSIX systems.
    extra_libs = ["m"] if os.name == "posix" else []

    cfg = Configuration("preprocessing", parent_package, top_path)
    cfg.add_extension(
        "_csr_polynomial_expansion",
        sources=["_csr_polynomial_expansion.pyx"],
        include_dirs=[numpy.get_include()],
        libraries=extra_libs,
    )
    cfg.add_subpackage("tests")
    return cfg
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,183 @@
|
||||
import warnings
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from scipy import sparse
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from sklearn.base import clone
|
||||
|
||||
from sklearn.preprocessing import maxabs_scale
|
||||
from sklearn.preprocessing import minmax_scale
|
||||
from sklearn.preprocessing import scale
|
||||
from sklearn.preprocessing import power_transform
|
||||
from sklearn.preprocessing import quantile_transform
|
||||
from sklearn.preprocessing import robust_scale
|
||||
|
||||
from sklearn.preprocessing import MaxAbsScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import PowerTransformer
|
||||
from sklearn.preprocessing import QuantileTransformer
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
# Load the iris dataset once at import time; tests work on copies of iris.data.
iris = load_iris()
|
||||
|
||||
|
||||
def _get_valid_samples_by_column(X, col):
    """Return column `col` of `X` as a 2-d array, without its NaN rows."""
    # Rows where the requested column holds an actual (non-NaN) value.
    keep = np.logical_not(np.isnan(X[:, col]))
    # Indexing with a list keeps the column two-dimensional.
    column = X[:, [col]]
    return column[keep]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    """Check that NaN entries pass through the preprocessing method untouched.

    Parameters
    ----------
    est : estimator instance
        Preprocessing estimator under test.
    func : callable
        Function counterpart of `est`; it is called with the parameters of
        `est` (minus `omit_kwargs`) as keyword arguments.
    support_sparse : bool
        When true, the dense/sparse consistency section at the end is run.
    strictly_positive : bool
        When true, the data is shifted by its smallest non-NaN value plus 0.1
        before being split.
    omit_kwargs : list of str
        Parameters of `est` removed before calling `func`.
    """
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[
        rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
    ] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check: no column is entirely NaN in the training split, and every
    # column holds at least one NaN in both splits
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    # RuntimeWarning is escalated to an error to ensure no warnings are raised
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt = est.fit(X_train).transform(X_test)
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt_class = est.transform(X_train)
    kwargs = est.get_params()
    # remove the parameters which should be omitted because they
    # are not defined in the counterpart function of the preprocessing class
    for kwarg in omit_kwargs:
        _ = kwargs.pop(kwarg)
    Xt_func = func(X_train, **kwargs)
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_col = est.transform(X_test[:, [i]])
        assert_allclose(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_dense = est_dense.fit(X_train).transform(X_test)
            Xt_inv_dense = est_dense.inverse_transform(Xt_dense)

        for sparse_constructor in (
            sparse.csr_matrix,
            sparse.csc_matrix,
            sparse.bsr_matrix,
            sparse.coo_matrix,
            sparse.dia_matrix,
            sparse.dok_matrix,
            sparse.lil_matrix,
        ):
            # check that the dense and sparse inputs lead to the same results
            # precompute the matrix to avoid catching side warnings
            X_train_sp = sparse_constructor(X_train)
            X_test_sp = sparse_constructor(X_test)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)

            # `.toarray()` is used instead of the `.A` shorthand, which SciPy
            # has deprecated.
            assert_allclose(Xt_sp.toarray(), Xt_dense)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)

            assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est, func",
    [
        (MaxAbsScaler(), maxabs_scale),
        (MinMaxScaler(), minmax_scale),
        (StandardScaler(), scale),
        (StandardScaler(with_mean=False), scale),
        (PowerTransformer("yeo-johnson"), power_transform),
        (PowerTransformer("box-cox"), power_transform),
        (QuantileTransformer(n_quantiles=3), quantile_transform),
        (RobustScaler(), robust_scale),
        (RobustScaler(with_centering=False), robust_scale),
    ],
)
def test_missing_value_pandas_na_support(est, func):
    """Fitting on a dataframe holding pd.NA matches fitting on the NaN array."""
    pd = pytest.importorskip("pandas")

    nan = np.nan
    columns = [
        [1, 2, 3, nan, nan, 4, 5, 1],
        [nan, nan, 8, 4, 6, nan, nan, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
    ]
    X = np.array(columns).T

    # Build the frame with the "Int16" dtype, then cast column "c" (which has
    # no missing entry) to "int".
    X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"])
    X_df["c"] = X_df["c"].astype("int")

    from_array = est.fit_transform(X)
    from_frame = est.fit_transform(X_df)

    assert_allclose(from_array, from_frame)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,472 @@
|
||||
import pytest
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
import warnings
|
||||
|
||||
from sklearn import clone
|
||||
from sklearn.preprocessing import KBinsDiscretizer
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.utils._testing import (
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
assert_allclose_dense_sparse,
|
||||
)
|
||||
|
||||
# Module-level input (4 rows of 4 values) read by the tests in this module
# that do not define a local X.
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected",
    [
        ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
        ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
    ],
)
def test_fit_transform(strategy, expected):
    """Ordinal codes with 3 bins match the expected table for each strategy."""
    discretizer = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
    discretizer.fit(X)
    codes = discretizer.transform(X)
    assert_array_equal(expected, codes)
|
||||
|
||||
|
||||
def test_valid_n_bins():
    """n_bins may be a Python int or a NumPy integer scalar; n_bins_ is int."""
    for n_bins in (2, np.array([2])[0]):
        KBinsDiscretizer(n_bins=n_bins).fit_transform(X)

    fitted = KBinsDiscretizer(n_bins=2).fit(X)
    assert fitted.n_bins_.dtype == np.dtype(int)
|
||||
|
||||
|
||||
def test_invalid_n_bins():
    """A scalar n_bins of 1, or a float n_bins, raises ValueError on fit."""
    cases = [
        (
            1,
            "KBinsDiscretizer received an invalid number of bins. "
            "Received 1, expected at least 2.",
        ),
        (
            1.1,
            "KBinsDiscretizer received an invalid n_bins type. "
            "Received float, expected int.",
        ),
    ]
    for n_bins, err_msg in cases:
        est = KBinsDiscretizer(n_bins=n_bins)
        with pytest.raises(ValueError, match=err_msg):
            est.fit_transform(X)
|
||||
|
||||
|
||||
def test_invalid_n_bins_array():
    """Array-like n_bins with a bad shape, length or values raises ValueError."""
    shape_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    cases = [
        # Bad shape
        (np.full((2, 4), 2.0), shape_msg),
        # Incorrect number of features
        ([1, 2, 2], shape_msg),
        # Bad bin values
        (
            [1, 2, 2, 1],
            "KBinsDiscretizer received an invalid number of bins "
            "at indices 0, 3. Number of bins must be at least 2, "
            "and must be an int.",
        ),
        # Float bin values
        (
            [2.1, 2, 2.1, 2],
            "KBinsDiscretizer received an invalid number of bins "
            "at indices 0, 2. Number of bins must be at least 2, "
            "and must be an int.",
        ),
    ]
    for n_bins, err_msg in cases:
        est = KBinsDiscretizer(n_bins=n_bins)
        with pytest.raises(ValueError, match=err_msg):
            est.fit_transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected",
    [
        ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
        ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
    ],
)
def test_fit_transform_n_bins_array(strategy, expected):
    """Per-feature n_bins gives the expected codes and bin-edge shapes."""
    discretizer = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
    )
    est = discretizer.fit(X)
    assert_array_equal(expected, est.transform(X))

    # test the shape of bin_edges_: one entry per feature, each holding
    # n_bins + 1 edges
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features,)
    for feature_edges, feature_n_bins in zip(est.bin_edges_, est.n_bins_):
        assert feature_edges.shape == (feature_n_bins + 1,)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_same_min_max(strategy):
    """A constant feature warns at fit time and is encoded with a single bin.

    The first column of the input is constant; fitting must emit the
    "is constant" UserWarning, report one bin for that feature and transform
    it to zeros.
    """
    # No module-wide `warnings.simplefilter("always")` here: it would stay in
    # effect for every test run afterwards, and `pytest.warns` already records
    # all warnings raised inside its block.
    X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]])
    est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal")
    warning_message = "Feature 0 is constant and will be replaced with 0."
    with pytest.warns(UserWarning, match=warning_message):
        est.fit(X)
    assert est.n_bins_[0] == 1
    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
|
||||
|
||||
|
||||
def test_transform_1d_behavior():
    """1-D input raises ValueError in fit and in transform."""
    X_1d = np.arange(4)

    unfitted = KBinsDiscretizer(n_bins=2)
    with pytest.raises(ValueError):
        unfitted.fit(X_1d)

    # Fit on a proper 2-D column, then feed the 1-D array to transform.
    fitted = KBinsDiscretizer(n_bins=2)
    fitted.fit(X_1d.reshape(-1, 1))
    with pytest.raises(ValueError):
        fitted.transform(X_1d)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("i", range(1, 9))
def test_numeric_stability(i):
    """Two-bin codes are unchanged when the data is divided by 10**i."""
    base = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1)
    expected_codes = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Test up to discretizing nano units
    scaled = base / 10**i
    discretizer = KBinsDiscretizer(n_bins=2, encode="ordinal")
    codes = discretizer.fit_transform(scaled)
    assert_array_equal(expected_codes, codes)
|
||||
|
||||
|
||||
def test_invalid_encode_option():
    """An unknown `encode` value raises ValueError listing the valid options."""
    expected_error = (
        r"Valid options for 'encode' are "
        r"\('onehot', 'onehot-dense', 'ordinal'\). "
        r"Got encode='invalid-encode' instead."
    )
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="invalid-encode")
    with pytest.raises(ValueError, match=expected_error):
        discretizer.fit(X)
|
||||
|
||||
|
||||
def test_encode_options():
    """One-hot outputs equal OneHotEncoder applied to the ordinal output."""

    def _discretize(encode):
        # Fit and transform X with the given encoding.
        est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode=encode).fit(X)
        return est.transform(X)

    def _categories():
        # One category range per feature, matching the per-feature n_bins.
        return [np.arange(i) for i in [2, 3, 3, 3]]

    Xt_ordinal = _discretize("ordinal")

    # "onehot-dense" must return a dense array
    Xt_dense = _discretize("onehot-dense")
    assert not sp.issparse(Xt_dense)
    dense_encoder = OneHotEncoder(categories=_categories(), sparse=False)
    assert_array_equal(dense_encoder.fit_transform(Xt_ordinal), Xt_dense)

    # "onehot" must return a sparse matrix
    Xt_sparse = _discretize("onehot")
    assert sp.issparse(Xt_sparse)
    sparse_encoder = OneHotEncoder(categories=_categories(), sparse=True)
    assert_array_equal(
        sparse_encoder.fit_transform(Xt_ordinal).toarray(),
        Xt_sparse.toarray(),
    )
|
||||
|
||||
|
||||
def test_invalid_strategy_option():
    """An unknown `strategy` value raises ValueError listing the valid options."""
    expected_error = (
        r"Valid options for 'strategy' are "
        r"\('uniform', 'quantile', 'kmeans'\). "
        r"Got strategy='invalid-strategy' instead."
    )
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy="invalid-strategy")
    with pytest.raises(ValueError, match=expected_error):
        discretizer.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected_2bins, expected_3bins, expected_5bins",
    [
        ("uniform", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
        ("kmeans", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
        ("quantile", [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4]),
    ],
)
def test_nonuniform_strategies(
    strategy, expected_2bins, expected_3bins, expected_5bins
):
    """Codes for unevenly spread data match the expectation for 2, 3 and 5 bins."""
    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)

    # Checked in the order 2, 3, then 5 bins.
    expected_by_n_bins = {2: expected_2bins, 3: expected_3bins, 5: expected_5bins}
    for n_bins, expected in expected_by_n_bins.items():
        est = KBinsDiscretizer(n_bins=n_bins, strategy=strategy, encode="ordinal")
        Xt = est.fit_transform(X)
        assert_array_equal(expected, Xt.ravel())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected_inv",
    [
        (
            "uniform",
            [
                [-1.5, 2.0, -3.5, -0.5],
                [-0.5, 3.0, -2.5, -0.5],
                [0.5, 4.0, -1.5, 0.5],
                [0.5, 4.0, -1.5, 1.5],
            ],
        ),
        (
            "kmeans",
            [
                [-1.375, 2.125, -3.375, -0.5625],
                [-1.375, 2.125, -3.375, -0.5625],
                [-0.125, 3.375, -2.125, 0.5625],
                [0.75, 4.25, -1.25, 1.625],
            ],
        ),
        (
            "quantile",
            [
                [-1.5, 2.0, -3.5, -0.75],
                [-0.5, 3.0, -2.5, 0.0],
                [0.5, 4.0, -1.5, 1.25],
                [0.5, 4.0, -1.5, 1.25],
            ],
        ),
    ],
)
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_inverse_transform(strategy, encode, expected_inv):
    """inverse_transform of the discretized X yields the expected values."""
    discretizer = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    encoded = discretizer.fit_transform(X)
    recovered = discretizer.inverse_transform(encoded)
    assert_array_almost_equal(expected_inv, recovered)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_transform_outside_fit_range(strategy):
    """Values outside the fitted range land in the first and last bins."""
    X_fit = np.array([0, 1, 2, 3]).reshape(-1, 1)
    kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode="ordinal")
    kbd.fit(X_fit)

    # One value below and one above the range seen during fit.
    X_outside = np.array([-2, 5]).reshape(-1, 1)
    codes = kbd.transform(X_outside)
    assert_array_equal(codes.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(codes.min(axis=0), [0])
|
||||
|
||||
|
||||
def test_overwrite():
    """fit_transform and inverse_transform leave their input arrays unchanged."""
    X = np.array([0, 1, 2, 3]).reshape(-1, 1)
    X_snapshot = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_snapshot)

    Xt_snapshot = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_snapshot)
    expected_inverse = np.array([[0.5], [1.5], [2.5], [2.5]])
    assert_array_equal(Xinv, expected_inverse)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected_bin_edges", [("quantile", [0, 1, 3]), ("kmeans", [0, 1.5, 3])]
)
def test_redundant_bins(strategy, expected_bin_edges):
    """Fitting data with only two distinct values warns and yields fewer edges."""
    samples = [[0], [0], [0], [0], [3], [3]]
    discretizer = KBinsDiscretizer(n_bins=3, strategy=strategy)
    with pytest.warns(UserWarning, match="Consider decreasing the number of bins."):
        discretizer.fit(samples)
    first_feature_edges = discretizer.bin_edges_[0]
    assert_array_almost_equal(first_feature_edges, expected_bin_edges)
|
||||
|
||||
|
||||
def test_percentile_numeric_stability():
    """Quantile binning of three samples with 10 bins gives the expected edges."""
    samples = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    expected_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    expected_codes = np.array([0, 0, 4]).reshape(-1, 1)

    discretizer = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
    with pytest.warns(UserWarning, match="Consider decreasing the number of bins."):
        discretizer.fit(samples)

    assert_array_almost_equal(discretizer.bin_edges_[0], expected_edges)
    assert_array_almost_equal(discretizer.transform(samples), expected_codes)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("out_dtype", [None, np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_consistent_dtype(in_dtype, out_dtype, encode):
    """Check the dtype of the transformed output for each in/out dtype pair."""
    X_input = np.array(X, dtype=in_dtype)
    kbd = KBinsDiscretizer(n_bins=3, encode=encode, dtype=out_dtype)

    # an error is raised if an unsupported dtype is requested for the model
    if out_dtype not in [None, np.float32, np.float64]:
        with pytest.raises(ValueError, match="Valid options for 'dtype' are"):
            kbd.fit(X_input)
        return

    kbd.fit(X_input)

    # test output dtype
    if out_dtype is not None:
        expected_dtype = out_dtype
    elif X_input.dtype == np.float16:
        # no dtype requested and float16 input: np.float64 is expected
        expected_dtype = np.float64
    else:
        expected_dtype = X_input.dtype
    assert kbd.transform(X_input).dtype == expected_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_32_equal_64(input_dtype, encode):
    """float32 and float64 output dtypes produce matching transformed values."""
    # TODO this check is redundant with common checks and can be removed
    # once #16290 is merged
    X_input = np.array(X, dtype=input_dtype)

    def _fit_transform(dtype):
        # Fit a fresh discretizer with the requested output dtype.
        kbd = KBinsDiscretizer(n_bins=3, encode=encode, dtype=dtype)
        kbd.fit(X_input)
        return kbd.transform(X_input)

    Xt_32 = _fit_transform(np.float32)  # 32 bit output
    Xt_64 = _fit_transform(np.float64)  # 64 bit output

    assert_allclose_dense_sparse(Xt_32, Xt_64)
|
||||
|
||||
|
||||
# FIXME: remove the `filterwarnings` in 1.3
@pytest.mark.filterwarnings("ignore:In version 1.3 onwards, subsample=2e5")
@pytest.mark.parametrize("subsample", [None, "warn"])
def test_kbinsdiscretizer_subsample_default(subsample):
    """Bin edges are the same with subsample=None and subsample="warn"."""
    # Since the size of X is small (< 2e5), subsampling will not take place.
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    reference = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
    reference.fit(X)

    candidate = clone(reference)
    candidate.set_params(subsample=subsample)
    candidate.fit(X)

    edge_pairs = zip(reference.bin_edges_[0], candidate.bin_edges_[0])
    for reference_edge, candidate_edge in edge_pairs:
        np.testing.assert_allclose(reference_edge, candidate_edge)
    assert reference.bin_edges_.shape == candidate.bin_edges_.shape
|
||||
|
||||
|
||||
def test_kbinsdiscretizer_subsample_invalid_strategy():
    """Setting `subsample` with strategy="uniform" raises ValueError on fit."""
    samples = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    discretizer = KBinsDiscretizer(
        n_bins=10, encode="ordinal", strategy="uniform", subsample=3
    )

    with pytest.raises(
        ValueError, match='`subsample` must be used with `strategy="quantile"`.'
    ):
        discretizer.fit(samples)
|
||||
|
||||
|
||||
def test_kbinsdiscretizer_subsample_invalid_type():
    """A string `subsample` value raises TypeError on fit."""
    samples = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    params = dict(n_bins=10, encode="ordinal", strategy="quantile", subsample="full")
    discretizer = KBinsDiscretizer(**params)

    with pytest.raises(
        TypeError, match="subsample must be an instance of int, not str."
    ):
        discretizer.fit(samples)
|
||||
|
||||
|
||||
# TODO: Remove in 1.3
def test_kbinsdiscretizer_subsample_warn():
    """Fitting 200001 samples with the default `subsample` emits a FutureWarning."""
    # Draw the input from a seeded local generator so the test data is
    # reproducible and the global NumPy random state is left alone.
    rng = np.random.RandomState(0)
    X = rng.rand(200001, 1)
    kbd = KBinsDiscretizer(n_bins=100, encode="ordinal", strategy="quantile")

    msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
    with pytest.warns(FutureWarning, match=msg):
        kbd.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("subsample", [0, int(2e5)])
def test_kbinsdiscretizer_subsample_values(subsample):
    """Check `subsample` validation and its effect on the fitted bin edges.

    With `subsample=0` fitting must raise ValueError. With `subsample=int(2e5)`
    the bin edges must differ from those of the default estimator in at least
    one position, while the `bin_edges_` shapes stay equal.
    """
    # Seeded local generator: the comparison below depends on the data, so
    # the input must not come from the unseeded global NumPy random state.
    rng = np.random.RandomState(0)
    X = rng.rand(220000, 1)
    kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")

    kbd_with_subsampling = clone(kbd_default)
    kbd_with_subsampling.set_params(subsample=subsample)

    if subsample == 0:
        with pytest.raises(ValueError, match="subsample == 0, must be >= 1."):
            kbd_with_subsampling.fit(X)
    else:
        # TODO: Remove in 1.3
        msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
        with pytest.warns(FutureWarning, match=msg):
            kbd_default.fit(X)

        kbd_with_subsampling.fit(X)
        assert not np.all(
            kbd_default.bin_edges_[0] == kbd_with_subsampling.bin_edges_[0]
        )
        assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "encode, expected_names",
    [
        (
            "onehot",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        (
            "onehot-dense",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        ("ordinal", [f"feat{col_id}" for col_id in range(3)]),
    ],
)
def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names):
    """Verify the names returned by get_feature_names_out for each encoding.

    Non-regression test for #22731
    """
    samples = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]

    discretizer = KBinsDiscretizer(n_bins=4, encode=encode).fit(samples)
    transformed = discretizer.transform(samples)

    input_features = [f"feat{i}" for i in range(3)]
    output_names = discretizer.get_feature_names_out(input_features)

    # one name per output column, then the names themselves
    assert transformed.shape[1] == output_names.shape[0]
    assert_array_equal(output_names, expected_names)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,392 @@
|
||||
import warnings
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from sklearn.utils import _safe_indexing
|
||||
|
||||
from sklearn.preprocessing import FunctionTransformer
|
||||
from sklearn.utils._testing import (
|
||||
assert_array_equal,
|
||||
assert_allclose_dense_sparse,
|
||||
_convert_container,
|
||||
)
|
||||
|
||||
|
||||
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
    """Return a wrapper around `func` that records the arguments it is given.

    Each call appends its positional arguments (X first) to `args_store`,
    merges its keyword arguments into `kwargs_store`, and returns `func(X)`.
    """

    def _func(X, *args, **kwargs):
        args_store.extend((X, *args))
        kwargs_store.update(kwargs)
        # Only X is forwarded; extra arguments are recorded but not passed on.
        return func(X)

    return _func
|
||||
|
||||
|
||||
def test_delegate_to_func():
    """transform hands X, and nothing else, to the wrapped function."""
    # (args|kwargs)_store will hold the positional and keyword arguments
    # passed to the function inside the FunctionTransformer.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))

    first_transformer = FunctionTransformer(_make_func(args_store, kwargs_store))
    assert_array_equal(
        first_transformer.transform(X),
        X,
        "transform should have returned X unchanged",
    )

    # The function should only have received X.
    assert args_store == [
        X
    ], f"Incorrect positional arguments passed to func: {args_store}"
    assert (
        not kwargs_store
    ), f"Unexpected keyword arguments passed to func: {kwargs_store}"

    # reset the argument stores.
    del args_store[:]
    kwargs_store.clear()

    second_transformer = FunctionTransformer(_make_func(args_store, kwargs_store))
    transformed = second_transformer.transform(X)
    assert_array_equal(
        transformed, X, err_msg="transform should have returned X unchanged"
    )

    # The function should have received X
    assert args_store == [
        X
    ], f"Incorrect positional arguments passed to func: {args_store}"
    assert (
        not kwargs_store
    ), f"Unexpected keyword arguments passed to func: {kwargs_store}"
|
||||
|
||||
|
||||
def test_np_log():
    """FunctionTransformer(np.log1p) matches applying np.log1p directly."""
    data = np.arange(10).reshape((5, 2))

    # Test that the numpy.log example still works.
    transformer = FunctionTransformer(np.log1p)
    assert_array_equal(transformer.transform(data), np.log1p(data))
|
||||
|
||||
|
||||
def test_kw_arg():
    """kw_args given at construction are passed to the wrapped function."""
    data = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Test that rounding is correct
    rounded = rounder.transform(data)
    assert_array_equal(rounded, np.around(data, decimals=3))
|
||||
|
||||
|
||||
def test_kw_arg_update():
    """Mutating an entry of kw_args in place is honoured by transform."""
    data = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Change the stored keyword argument after construction.
    rounder.kw_args["decimals"] = 1

    # Test that rounding is correct
    rounded = rounder.transform(data)
    assert_array_equal(rounded, np.around(data, decimals=1))
|
||||
|
||||
|
||||
def test_kw_arg_reset():
    """Rebinding kw_args to a new dict is honoured by transform."""
    data = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Replace the whole keyword-argument dict after construction.
    rounder.kw_args = dict(decimals=1)

    # Test that rounding is correct
    rounded = rounder.transform(data)
    assert_array_equal(rounded, np.around(data, decimals=1))
|
||||
|
||||
|
||||
def test_inverse_transform():
    """inverse_transform applies inverse_func together with inv_kw_args."""
    data = np.array([1, 4, 9, 16]).reshape((2, 2))

    # Test that inverse_transform works correctly
    transformer = FunctionTransformer(
        func=np.sqrt, inverse_func=np.around, inv_kw_args=dict(decimals=3)
    )
    round_trip = transformer.inverse_transform(transformer.transform(data))
    assert_array_equal(round_trip, np.around(np.sqrt(data), decimals=3))
|
||||
|
||||
|
||||
def test_check_inverse():
    """check_inverse warns for mismatched functions and stays silent otherwise."""
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    inputs = [X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)]

    warning_message = (
        "The provided functions are not strictly inverse of each other. "
        "If you are sure you want to proceed regardless, "
        "set 'check_inverse=False'."
    )

    for X in inputs:
        accept_sparse = sparse.issparse(X)

        # np.sqrt / np.around are not inverses of each other: a warning is expected
        mismatched = FunctionTransformer(
            func=np.sqrt,
            inverse_func=np.around,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with pytest.warns(UserWarning, match=warning_message):
            mismatched.fit(X)

        # np.expm1 / np.log1p are inverses: any UserWarning fails the test
        matched = FunctionTransformer(
            func=np.expm1,
            inverse_func=np.log1p,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            Xt = matched.fit_transform(X)

        assert_allclose_dense_sparse(X, matched.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    one_sided = [
        dict(func=np.expm1, inverse_func=None),
        dict(func=None, inverse_func=np.expm1),
    ]
    for functions in one_sided:
        trans = FunctionTransformer(check_inverse=True, validate=True, **functions)
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            trans.fit(X_dense)
|
||||
|
||||
|
||||
def test_function_transformer_frame():
    """A default FunctionTransformer returns an object that still has `.loc`."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(np.random.randn(100, 10))
    result = FunctionTransformer().fit_transform(frame)
    assert hasattr(result, "loc")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X_type", ["array", "series"])
def test_function_transformer_raise_error_with_mixed_dtype(X_type):
    """Check that fitting with `check_inverse=True` raises on mixed-dtype data."""
    mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"}
    inverse_mapping = dict(zip(mapping.values(), mapping.keys()))
    dtype = "object"

    raw_values = ["one", "two", "three", "one", "one", 5, 6]
    data = _convert_container(raw_values, X_type, columns_name=["value"], dtype=dtype)

    def func(X):
        # Map every element of X through `mapping`.
        mapped = [mapping[_safe_indexing(X, i)] for i in range(X.size)]
        return np.array(mapped, dtype=object)

    def inverse_func(X):
        # Map every element back and rebuild a container of the input type.
        restored = [inverse_mapping[x] for x in X]
        return _convert_container(
            restored, X_type, columns_name=["value"], dtype=dtype
        )

    transformer = FunctionTransformer(
        func=func, inverse_func=inverse_func, validate=False, check_inverse=True
    )

    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(data)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, feature_names_out, input_features, expected",
    [
        (
            # NumPy inputs, default behavior: generate names
            np.random.rand(100, 3),
            "one-to-one",
            None,
            ("x0", "x1", "x2"),
        ),
        (
            # Pandas input, default behavior: use input feature names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            None,
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable
            np.random.rand(100, 3),
            lambda transformer, input_features: ("a", "b"),
            None,
            ("a", "b"),
        ),
        (
            # Pandas input, feature_names_out=callable
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: ("c", "d", "e"),
            None,
            ("c", "d", "e"),
        ),
        (
            # NumPy input, feature_names_out=callable – default input_features
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("a",),
            None,
            ("x0", "x1", "x2", "a"),
        ),
        (
            # Pandas input, feature_names_out=callable – default input_features
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            None,
            ("a", "b", "c"),
        ),
        (
            # NumPy input, input_features=list of names
            np.random.rand(100, 3),
            "one-to-one",
            ("a", "b", "c"),
            ("a", "b", "c"),
        ),
        (
            # Pandas input, input_features=list of names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            ("a", "b"),  # must match feature_names_in_
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable, input_features=list
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("d",),
            ("a", "b", "c"),
            ("a", "b", "c", "d"),
        ),
        (
            # Pandas input, feature_names_out=callable, input_features=list
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            ("a", "b"),  # must match feature_names_in_
            ("a", "b", "c"),
        ),
    ],
)
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected
):
    """get_feature_names_out returns the expected object-dtype name array."""
    # Dict inputs stand for pandas data and are turned into a DataFrame here,
    # after pandas is known to be importable.
    if isinstance(X, dict):
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)

    params = dict(feature_names_out=feature_names_out, validate=True)
    transformer = FunctionTransformer(**params)
    transformer.fit_transform(X)

    output_names = transformer.get_feature_names_out(input_features)
    assert isinstance(output_names, np.ndarray)
    assert output_names.dtype == object
    assert_array_equal(output_names, expected)
|
||||
|
||||
|
||||
def test_function_transformer_get_feature_names_out_without_validation():
    """Check "one-to-one" feature names with ``validate=False``.

    Calling ``get_feature_names_out`` without ``input_features`` must raise a
    ``ValueError``; calling it with explicit names must return those names as
    an object-dtype ndarray.
    """
    ft = FunctionTransformer(feature_names_out="one-to-one", validate=False)
    ft.fit_transform(np.random.rand(100, 2))

    # No input_features supplied: the test expects an error.
    with pytest.raises(
        ValueError, match="When 'feature_names_out' is 'one-to-one', either"
    ):
        ft.get_feature_names_out()

    explicit_names = ("a", "b")
    out = ft.get_feature_names_out(explicit_names)
    assert isinstance(out, np.ndarray)
    assert out.dtype == object
    assert_array_equal(out, explicit_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("feature_names_out", ["x0", ["x0"], ("x0",)])
def test_function_transformer_feature_names_out_string(feature_names_out):
    """Check that unsupported ``feature_names_out`` values are rejected.

    None of the parametrized values is "one-to-one" or a callable, so
    ``get_feature_names_out`` is expected to raise a ``ValueError``.
    """
    ft = FunctionTransformer(feature_names_out=feature_names_out)
    ft.fit_transform(np.random.rand(100, 2))

    expected_error = 'must either be "one-to-one" or a callable'
    with pytest.raises(ValueError, match=expected_error):
        ft.get_feature_names_out()
|
||||
|
||||
|
||||
def test_function_transformer_feature_names_out_is_None():
    """Check the error raised when ``feature_names_out`` is left unset.

    A ``FunctionTransformer`` built with default arguments is expected to
    raise an ``AttributeError`` when ``get_feature_names_out`` is accessed.
    """
    ft = FunctionTransformer()
    ft.fit_transform(np.random.rand(100, 2))

    with pytest.raises(
        AttributeError,
        match="This 'FunctionTransformer' has no attribute 'get_feature_names_out'",
    ):
        ft.get_feature_names_out()
|
||||
|
||||
|
||||
def test_function_transformer_feature_names_out_uses_estimator():
    """Check that the ``feature_names_out`` callable can read the estimator.

    The callable looks up ``n`` in ``transformer.kw_args`` to name the random
    columns that ``func`` appends to the input.
    """

    def append_random_columns(X, n):
        extra = np.random.rand(len(X), n)
        return np.concatenate([X, extra], axis=1)

    def names_from_kw_args(transformer, input_features):
        n_extra = transformer.kw_args["n"]
        return [*input_features, *(f"rnd{i}" for i in range(n_extra))]

    ft = FunctionTransformer(
        func=append_random_columns,
        feature_names_out=names_from_kw_args,
        kw_args={"n": 3},
        validate=True,
    )
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    ft.fit_transform(frame)
    out = ft.get_feature_names_out()

    assert isinstance(out, np.ndarray)
    assert out.dtype == object
    assert_array_equal(out, ("a", "b", "rnd0", "rnd1", "rnd2"))
|
||||
|
||||
|
||||
def test_function_transformer_validate_inverse():
    """Check that `inverse_transform` leaves `n_features_in_` untouched.

    `func` appends a column of ones and `inverse_func` drops the last column,
    so the data passed to `inverse_transform` has one more column than the
    data seen during `fit_transform`.
    """

    def append_ones_column(X):
        ones = np.ones((X.shape[0], 1))
        return np.concatenate((X, ones), axis=1)

    def drop_last_column(X):
        return X[:, :-1]

    X = np.array([[1, 2], [3, 4], [3, 4]])
    n_input_features = X.shape[1]
    ft = FunctionTransformer(
        func=append_ones_column,
        inverse_func=drop_last_column,
        validate=True,
    )

    X_augmented = ft.fit_transform(X)
    assert ft.n_features_in_ == n_input_features

    # The attribute must still describe the fitted input afterwards.
    ft.inverse_transform(X_augmented)
    assert ft.n_features_in_ == n_input_features
|
||||
@@ -0,0 +1,645 @@
|
||||
import numpy as np
|
||||
|
||||
import pytest
|
||||
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse import csr_matrix
|
||||
from scipy.sparse import dok_matrix
|
||||
from scipy.sparse import lil_matrix
|
||||
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils import _to_object_array
|
||||
|
||||
from sklearn.preprocessing._label import LabelBinarizer
|
||||
from sklearn.preprocessing._label import MultiLabelBinarizer
|
||||
from sklearn.preprocessing._label import LabelEncoder
|
||||
from sklearn.preprocessing._label import label_binarize
|
||||
|
||||
from sklearn.preprocessing._label import _inverse_binarize_thresholding
|
||||
from sklearn.preprocessing._label import _inverse_binarize_multiclass
|
||||
|
||||
from sklearn import datasets
|
||||
|
||||
iris = datasets.load_iris()
|
||||
|
||||
|
||||
def toarray(a):
    """Return ``a.toarray()`` when ``a`` has that method, else ``a`` itself."""
    return a.toarray() if hasattr(a, "toarray") else a
|
||||
|
||||
|
||||
def test_label_binarizer():
    """Exercise LabelBinarizer on one-class, two-class and multi-class input."""
    # One-class input: every sample is encoded with the negative label.
    single_class = ["pos", "pos", "pos", "pos"]
    all_negative = np.array([[0], [0], [0], [0]])

    # Dense output.
    dense_lb = LabelBinarizer(sparse_output=False)
    dense_out = dense_lb.fit_transform(single_class)
    assert_array_equal(dense_lb.classes_, ["pos"])
    assert_array_equal(all_negative, dense_out)
    assert_array_equal(dense_lb.inverse_transform(dense_out), single_class)

    # Sparse output.
    sparse_lb = LabelBinarizer(sparse_output=True)
    sparse_out = sparse_lb.fit_transform(single_class)
    assert issparse(sparse_out)
    assert_array_equal(sparse_lb.classes_, ["pos"])
    assert_array_equal(all_negative, sparse_out.toarray())
    assert_array_equal(
        sparse_lb.inverse_transform(sparse_out.toarray()), single_class
    )

    lb = LabelBinarizer(sparse_output=False)

    # Two-class input is encoded as a single column.
    two_class = ["neg", "pos", "pos", "neg"]
    out = lb.fit_transform(two_class)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(np.array([[0], [1], [1], [0]]), out)

    # A two-column indicator matrix is inverted to the same labels.
    two_columns = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(lb.inverse_transform(two_columns), two_class)

    # Multi-class input, refitting the same binarizer.
    multi_class = ["spam", "ham", "eggs", "ham", "0"]
    one_hot = np.array(
        [
            [0, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [1, 0, 0, 0],
        ]
    )
    out = lb.fit_transform(multi_class)
    assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"])
    assert_array_equal(one_hot, out)
    assert_array_equal(lb.inverse_transform(out), multi_class)
|
||||
|
||||
|
||||
def test_label_binarizer_unseen_labels():
    """Check that labels absent from ``fit`` become all-zero rows."""
    lb = LabelBinarizer()

    fitted_out = lb.fit_transform(["b", "d", "e"])
    assert_array_equal(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), fitted_out)

    # "a", "c" and "f" were not among the fitted labels.
    mixed_out = lb.transform(["a", "b", "c", "d", "e", "f"])
    expected_rows = [
        [0, 0, 0],
        [1, 0, 0],
        [0, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 0],
    ]
    assert_array_equal(np.array(expected_rows), mixed_out)
|
||||
|
||||
|
||||
def test_label_binarizer_set_label_encoding():
    """Check LabelBinarizer with custom ``neg_label`` / ``pos_label`` values."""
    # Two-class case with pos_label=0.
    binary_lb = LabelBinarizer(neg_label=-2, pos_label=0)
    binary_y = np.array([0, 1, 1, 0])
    binary_out = binary_lb.fit_transform(binary_y)
    assert_array_equal(np.array([[-2], [0], [0], [-2]]), binary_out)
    assert_array_equal(binary_lb.inverse_transform(binary_out), binary_y)

    # Multi-class case: -2 everywhere except +2 in the column of each label.
    multi_lb = LabelBinarizer(neg_label=-2, pos_label=2)
    multi_y = np.array([3, 2, 1, 2, 0])
    expected = np.full((5, 4), -2)
    expected[np.arange(5), multi_y] = 2
    multi_out = multi_lb.fit_transform(multi_y)
    assert_array_equal(expected, multi_out)
    assert_array_equal(multi_lb.inverse_transform(multi_out), multi_y)
|
||||
|
||||
|
||||
@ignore_warnings
def test_label_binarizer_errors():
    """Check that invalid arguments and inputs raise ``ValueError``."""
    # Legacy multi-label input passed to a binarizer fitted on one class.
    fitted = LabelBinarizer().fit(np.array([0, 0, 0, 0]))
    with pytest.raises(
        ValueError,
        match="You appear to be using a legacy multi-label data representation.",
    ):
        fitted.transform([(2, 3), (0,), (0, 2)])

    # Unfitted binarizer.
    unfitted = LabelBinarizer()
    not_fitted_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=not_fitted_msg):
        unfitted.transform([])
    with pytest.raises(ValueError, match=not_fitted_msg):
        unfitted.inverse_transform([])

    # Invalid neg_label / pos_label / sparse_output combinations.
    input_labels = [0, 1, 0, 1]
    invalid_configs = [
        (
            {"neg_label": 2, "pos_label": 1},
            "neg_label=2 must be strictly less than pos_label=1.",
        ),
        (
            {"neg_label": 2, "pos_label": 2},
            "neg_label=2 must be strictly less than pos_label=2.",
        ),
        (
            {"neg_label": 1, "pos_label": 2, "sparse_output": True},
            "Sparse binarization is only supported with non zero pos_label and zero "
            "neg_label, got pos_label=2 and neg_label=1",
        ),
    ]
    for params, err_msg in invalid_configs:
        binarizer = LabelBinarizer(**params)
        with pytest.raises(ValueError, match=err_msg):
            binarizer.fit(input_labels)

    # Fail on y_type
    with pytest.raises(ValueError, match="foo format is not supported"):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(
        ValueError,
        match="You appear to be using a legacy multi-label data representation",
    ):
        LabelBinarizer().fit_transform(seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(
        ValueError,
        match="The number of class is not equal to the number of dimension of y.",
    ):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError, match="output_type='binary', but y.shape"):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    multioutput_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=multioutput_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=multioutput_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, classes, unknown",
    [
        (
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            np.array([4], dtype="int64"),
        ),
        (
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
        ),
        (
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
        ),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder(values, classes, unknown):
    """Check LabelEncoder's transform, fit_transform and inverse_transform."""
    codes = [1, 0, 2, 0, 2]

    fitted = LabelEncoder()
    fitted.fit(values)
    assert_array_equal(fitted.classes_, classes)
    assert_array_equal(fitted.transform(values), codes)
    assert_array_equal(fitted.inverse_transform(codes), values)

    le = LabelEncoder()
    assert_array_equal(le.fit_transform(values), codes)

    # Labels outside the fitted set are rejected.
    with pytest.raises(ValueError, match="unseen labels"):
        le.transform(unknown)
|
||||
|
||||
|
||||
def test_label_encoder_negative_ints():
    """Check LabelEncoder on integer labels that include a negative value."""
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])

    labels = [0, 1, 4, 4, 5, -1, -1]
    codes = [1, 2, 3, 3, 4, 0, 0]
    assert_array_equal(le.transform(labels), codes)
    assert_array_equal(le.inverse_transform(codes), labels)

    # 6 was not part of the fitted labels.
    with pytest.raises(ValueError):
        le.transform([0, 6])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["str", "object"])
def test_label_encoder_str_bad_shape(dtype):
    """Check that transforming a bare string raises a 1d-array error."""
    fitted_labels = np.array(["apple", "orange"], dtype=dtype)
    le = LabelEncoder()
    le.fit(fitted_labels)

    with pytest.raises(ValueError, match="should be a 1d array"):
        le.transform("apple")
|
||||
|
||||
|
||||
def test_label_encoder_errors():
    """Check that invalid LabelEncoder usage raises ``ValueError``."""
    # Encoder that was never fitted.
    unfitted = LabelEncoder()
    with pytest.raises(ValueError):
        unfitted.transform([])
    with pytest.raises(ValueError):
        unfitted.inverse_transform([])

    # Fail on unseen labels
    le = LabelEncoder()
    le.fit([1, 2, 3, -1, 1])
    unseen_msg = "contains previously unseen labels"
    for unseen_codes in ([-2], [-2, -3, -4]):
        with pytest.raises(ValueError, match=unseen_msg):
            le.inverse_transform(unseen_codes)

    # Fail on inverse_transform("")
    with pytest.raises(ValueError, match=r"should be a 1d array.+shape \(\)"):
        le.inverse_transform("")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [
        np.array([2, 1, 3, 1, 3], dtype="int64"),
        np.array(["b", "a", "c", "a", "c"], dtype=object),
        np.array(["b", "a", "c", "a", "c"]),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder_empty_array(values):
    """Check that a fitted LabelEncoder maps empty input to empty output."""
    le = LabelEncoder()
    le.fit(values)
    empty = np.array([])

    # Empty transform.
    assert_array_equal(empty, le.transform([]))
    # Empty inverse transform.
    assert_array_equal(empty, le.inverse_transform([]))
|
||||
|
||||
|
||||
def test_sparse_output_multilabel_binarizer():
    """Check MultiLabelBinarizer with both values of ``sparse_output``.

    Inputs are given as iterables of iterables (lists, sets, iterators) and
    are checked through ``fit_transform`` and through ``fit`` + ``transform``.
    """
    # Factories, so that one-shot iterators are rebuilt for every call.
    make_inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    expected_inverse = make_inputs[0]()

    def via_fit_transform(binarizer, make):
        return binarizer.fit_transform(make())

    def via_fit_then_transform(binarizer, make):
        return binarizer.fit(make()).transform(make())

    for sparse_output in (True, False):
        for make in make_inputs:
            for run in (via_fit_transform, via_fit_then_transform):
                mlb = MultiLabelBinarizer(sparse_output=sparse_output)
                got = run(mlb, make)
                assert issparse(got) == sparse_output
                if sparse_output:
                    # verify CSR assumption that indices and indptr have
                    # same dtype
                    assert got.indices.dtype == got.indptr.dtype
                    got = got.toarray()
                assert_array_equal(indicator_mat, got)
                assert_array_equal([1, 2, 3], mlb.classes_)
                assert mlb.inverse_transform(got) == expected_inverse

    # Uses the last binarizer built above; the matrix contains a 2.
    non_binary = csr_matrix(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(non_binary)
|
||||
|
||||
|
||||
def test_multilabel_binarizer():
    """Check MultiLabelBinarizer on several iterable-of-iterables inputs."""
    # Factories, so that one-shot iterators are rebuilt for every call.
    make_inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    expected_inverse = make_inputs[0]()

    def via_fit_transform(binarizer, make):
        return binarizer.fit_transform(make())

    def via_fit_then_transform(binarizer, make):
        return binarizer.fit(make()).transform(make())

    for make in make_inputs:
        for run in (via_fit_transform, via_fit_then_transform):
            mlb = MultiLabelBinarizer()
            got = run(mlb, make)
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == expected_inverse
|
||||
|
||||
|
||||
def test_multilabel_binarizer_empty_sample():
    """Check that a sample with no labels is encoded as an all-zero row."""
    samples = [[1, 2], [1], []]
    expected = np.array([[1, 1], [1, 0], [0, 0]])
    binarized = MultiLabelBinarizer().fit_transform(samples)
    assert_array_equal(binarized, expected)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_unknown_class():
    """Check that classes unknown at transform time warn and are ignored.

    Two configurations are covered: classes inferred from ``y`` during
    ``fit`` and classes passed explicitly through the ``classes`` argument.
    In both cases ``transform`` must emit a ``UserWarning`` and the returned
    indicator matrix is compared against the expected one.
    """
    y = [[1, 2]]
    y_with_unknown = [[4, 1], [2, 0]]
    warning_message = "unknown class.* will be ignored"

    # Classes inferred from y. This expected matrix used to be built but was
    # never compared with the result, so this case was effectively unchecked.
    mlb = MultiLabelBinarizer()
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform(y_with_unknown)
    assert_array_equal(matrix, np.array([[1, 0], [0, 1]]))

    # Explicit classes: class 3 is declared but never present.
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform(y_with_unknown)
    assert_array_equal(matrix, np.array([[1, 0, 0], [0, 1, 0]]))
|
||||
|
||||
|
||||
def test_multilabel_binarizer_given_classes():
    """Check MultiLabelBinarizer when ``classes`` is passed explicitly."""
    samples = [(2, 3), (1,), (1, 2)]
    class_order = [1, 3, 2]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])

    # fit_transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(samples), indicator_mat)
    assert_array_equal(mlb.classes_, class_order)

    # fit().transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(samples).transform(samples), indicator_mat)
    assert_array_equal(mlb.classes_, class_order)

    # ensure works with extra class
    mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    with_extra_class = mlb.fit_transform(samples)
    zero_column = [[0], [0], [0]]
    assert_array_equal(with_extra_class, np.hstack((zero_column, indicator_mat)))
    assert_array_equal(mlb.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    sample_iter = iter(samples)
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(sample_iter).transform(sample_iter), indicator_mat)

    # ensure a ValueError is thrown if given duplicate classes
    duplicate_msg = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=duplicate_msg):
        mlb.fit(sample_iter)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_multiple_calls():
    """Check that changing ``classes`` between fits changes the output."""
    samples = [(2, 3), (1,), (1, 2)]
    expected_for_1_3_2 = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    expected_for_1_2_3 = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    # First call with the initial class order.
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(samples), expected_for_1_3_2)

    # Second call after the class order is changed on the same estimator.
    mlb.classes = [1, 2, 3]
    assert_array_equal(mlb.fit_transform(samples), expected_for_1_2_3)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_same_length_sequence():
    """Ensure sequences of the same length are not interpreted as a 2-d array."""
    samples = [[1], [0], [2]]
    indicator_mat = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])

    def via_fit_transform(binarizer):
        return binarizer.fit_transform(samples)

    def via_fit_then_transform(binarizer):
        return binarizer.fit(samples).transform(samples)

    for run in (via_fit_transform, via_fit_then_transform):
        mlb = MultiLabelBinarizer()
        assert_array_equal(run(mlb), indicator_mat)
        assert_array_equal(mlb.inverse_transform(indicator_mat), samples)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_integer_labels():
    """Check MultiLabelBinarizer with string and tuple labels.

    Also checks that a ``TypeError`` is raised for dict labels.
    """
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    cases = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    def via_fit_transform(binarizer, data):
        return binarizer.fit_transform(data)

    def via_fit_then_transform(binarizer, data):
        return binarizer.fit(data).transform(data)

    for raw_samples, classes in cases:
        samples = np.array(raw_samples, dtype=object)
        for run in (via_fit_transform, via_fit_then_transform):
            mlb = MultiLabelBinarizer()
            assert_array_equal(run(mlb, samples), indicator_mat)
            assert_array_equal(mlb.classes_, classes)
            round_trip = np.array(
                mlb.inverse_transform(indicator_mat), dtype=object
            )
            assert_array_equal(round_trip, samples)

    mlb = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        mlb.fit_transform([({}), ({}, {"a": "b"})])
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_unique():
    """Check that repeated labels within one sample are encoded once."""
    sample_with_repeats = [(1, 1, 1, 0)]
    binarized = MultiLabelBinarizer().fit_transform(sample_with_repeats)
    assert_array_equal(binarized, np.array([[1, 1]]))
|
||||
|
||||
|
||||
def test_multilabel_binarizer_inverse_validation():
    """Check the input validation done by ``inverse_transform``."""
    mlb = MultiLabelBinarizer()
    mlb.fit_transform([(1, 1, 1, 0)])

    # Not binary
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 3]]))

    # The following binary cases are fine, however
    for binary_row in ([0, 0], [1, 1], [1, 0]):
        mlb.inverse_transform(np.array([binary_row]))

    # Wrong shape
    for wrong_width_row in ([1], [1, 1, 1]):
        with pytest.raises(ValueError):
            mlb.inverse_transform(np.array([wrong_width_row]))
|
||||
|
||||
|
||||
def test_label_binarize_with_class_order():
    """Check that ``label_binarize`` orders columns like ``classes``."""
    cases = [
        # (y, classes, expected indicator matrix)
        ([1, 6], [1, 2, 4, 6], [[1, 0, 0, 0], [0, 0, 0, 1]]),
        # Modified class order
        ([1, 6], [1, 6, 4, 2], [[1, 0, 0, 0], [0, 1, 0, 0]]),
        (
            [0, 1, 2, 3],
            [3, 2, 0, 1],
            [[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]],
        ),
    ]
    for y, classes, expected_rows in cases:
        out = label_binarize(y, classes=classes)
        assert_array_equal(out, np.array(expected_rows))
|
||||
|
||||
|
||||
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check ``label_binarize`` and ``LabelBinarizer`` against ``expected``.

    Both sparse and dense outputs are tried. When sparse output is requested
    with ``pos_label == 0`` or ``neg_label != 0``, a ``ValueError`` is
    expected instead. Otherwise the binarized output and its inverse are
    compared with ``expected`` and ``y``.
    """
    for sparse_output in (True, False):
        label_kwargs = {
            "neg_label": neg_label,
            "pos_label": pos_label,
            "sparse_output": sparse_output,
        }

        if sparse_output and (pos_label == 0 or neg_label != 0):
            with pytest.raises(ValueError):
                label_binarize(y, classes=classes, **label_kwargs)
            continue

        # check label_binarize
        binarized = label_binarize(y, classes=classes, **label_kwargs)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type != "multiclass":
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )
        else:
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(**label_kwargs)
        lb_binarized = lb.fit_transform(y)
        assert_array_equal(toarray(lb_binarized), expected)
        assert issparse(lb_binarized) == sparse_output
        lb_inversed = lb.inverse_transform(lb_binarized)
        assert_array_equal(toarray(lb_inversed), toarray(y))
        assert issparse(lb_inversed) == issparse(y)
|
||||
|
||||
|
||||
def test_label_binarize_binary():
    """Check binary label binarization for two label encodings."""
    y = [0, 1, 0]
    classes = [0, 1]

    # neg_label != 0.
    check_binarized_results(
        y, classes, pos_label=2, neg_label=-1, expected=np.array([[-1], [2], [-1]])
    )

    # Binary case where sparse_output = True will not result in a ValueError
    check_binarized_results(
        y, classes, pos_label=3, neg_label=0, expected=np.array([[0], [3], [0]])
    )
|
||||
|
||||
|
||||
def test_label_binarize_multiclass():
    """Check multiclass label binarization and the sparse neg_label error."""
    labels = [0, 1, 2]
    class_list = [0, 1, 2]
    pos_label, neg_label = 2, 0

    check_binarized_results(
        labels, class_list, pos_label, neg_label, expected=2 * np.eye(3)
    )

    # Sparse output with a non-zero neg_label is rejected.
    with pytest.raises(ValueError):
        label_binarize(
            labels,
            classes=class_list,
            neg_label=-1,
            pos_label=pos_label,
            sparse_output=True,
        )
|
||||
|
||||
|
||||
def test_label_binarize_multilabel():
    """Check multilabel binarization for dense and sparse indicator input."""
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label, neg_label = 2, 0
    expected = pos_label * y_ind

    sparse_formats = (coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix)
    candidates = [y_ind]
    candidates.extend(to_sparse(y_ind) for to_sparse in sparse_formats)

    for candidate in candidates:
        check_binarized_results(candidate, classes, pos_label, neg_label, expected)

    # Run on the last candidate of the loop above: sparse output with a
    # non-zero neg_label is rejected.
    with pytest.raises(ValueError):
        label_binarize(
            candidate,
            classes=classes,
            neg_label=-1,
            pos_label=pos_label,
            sparse_output=True,
        )
|
||||
|
||||
|
||||
def test_invalid_input_label_binarize():
    """Check that ``label_binarize`` raises ``ValueError`` on invalid input."""
    # neg_label greater than pos_label.
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)

    # Float targets.
    continuous_msg = "continuous target data is not "
    with pytest.raises(ValueError, match=continuous_msg):
        label_binarize([1.2, 2.7], classes=[0, 1])

    # 2-d target whose width differs from the number of classes.
    mismatch_msg = "mismatch with the labels"
    with pytest.raises(ValueError, match=mismatch_msg):
        label_binarize([[1, 3]], classes=[1, 2, 3])
|
||||
|
||||
|
||||
def test_inverse_binarize_multiclass():
    """Check ``_inverse_binarize_multiclass`` on a sparse score matrix."""
    scores = csr_matrix([[0, 1, 0], [-1, 0, -1], [0, 0, 0]])
    classes = np.arange(3)
    recovered = _inverse_binarize_multiclass(scores, classes)
    assert_array_equal(recovered, np.array([1, 1, 0]))
|
||||
@@ -0,0 +1,930 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import sparse
|
||||
from scipy.sparse import random as sparse_random
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
from numpy.testing import assert_allclose, assert_array_equal
|
||||
from scipy.interpolate import BSpline
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import (
|
||||
KBinsDiscretizer,
|
||||
PolynomialFeatures,
|
||||
SplineTransformer,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer))
def test_polynomial_and_spline_array_order(est):
    """Test that the transformed array has the requested memory order."""
    X = np.arange(10).reshape(5, 2)

    default_out = est().fit_transform(X)
    c_out = est(order="C").fit_transform(X)
    f_out = est(order="F").fit_transform(X)

    # C order is checked as "the transpose is Fortran-ordered".
    assert np.isfortran(default_out.T)
    assert np.isfortran(c_out.T)
    assert np.isfortran(f_out)
|
||||
|
||||
|
||||
# Each entry pairs invalid constructor parameters with the error message
# pattern expected from ``fit``. The ``{"n_knots": 1}`` case used to be listed
# twice; the redundant copy was dropped.
@pytest.mark.parametrize(
    "params, err_msg",
    [
        ({"degree": -1}, "degree must be a non-negative integer"),
        ({"degree": 2.5}, "degree must be a non-negative integer"),
        ({"degree": "string"}, "degree must be a non-negative integer"),
        ({"n_knots": 1}, "n_knots must be a positive integer >= 2."),
        ({"n_knots": 2.5}, "n_knots must be a positive integer >= 2."),
        ({"n_knots": "string"}, "n_knots must be a positive integer >= 2."),
        ({"knots": 1}, "Expected 2D array, got scalar array instead:"),
        ({"knots": [1, 2]}, "Expected 2D array, got 1D array instead:"),
        (
            {"knots": [[1]]},
            r"Number of knots, knots.shape\[0\], must be >= 2.",
        ),
        (
            {"knots": [[1, 5], [2, 6]]},
            r"knots.shape\[1\] == n_features is violated.",
        ),
        (
            {"knots": [[1], [1], [2]]},
            "knots must be sorted without duplicates.",
        ),
        ({"knots": [[2], [1]]}, "knots must be sorted without duplicates."),
        (
            {"extrapolation": None},
            "extrapolation must be one of 'error', 'constant', 'linear', "
            "'continue' or 'periodic'.",
        ),
        (
            {"extrapolation": 1},
            "extrapolation must be one of 'error', 'constant', 'linear', "
            "'continue' or 'periodic'.",
        ),
        (
            {"extrapolation": "string"},
            "extrapolation must be one of 'error', 'constant', 'linear', "
            "'continue' or 'periodic'.",
        ),
        ({"include_bias": None}, "include_bias must be bool."),
        ({"include_bias": 1}, "include_bias must be bool."),
        ({"include_bias": "string"}, "include_bias must be bool."),
        (
            {"extrapolation": "periodic", "n_knots": 3, "degree": 3},
            "Periodic splines require degree < n_knots. Got n_knots=3 and degree=3.",
        ),
        (
            {"extrapolation": "periodic", "knots": [[0], [1]], "degree": 2},
            "Periodic splines require degree < n_knots. Got n_knots=2 and degree=2.",
        ),
    ],
)
def test_spline_transformer_input_validation(params, err_msg):
    """Test that we raise errors for invalid input in SplineTransformer.

    Parameters
    ----------
    params : dict
        Keyword arguments passed to the ``SplineTransformer`` constructor.
    err_msg : str
        Pattern that the raised ``ValueError`` message must match.
    """
    X = [[1], [2]]

    with pytest.raises(ValueError, match=err_msg):
        SplineTransformer(**params).fit(X)
|
||||
|
||||
|
||||
def test_spline_transformer_manual_knot_input():
    """
    Test that list and ndarray knot positions give the same spline knots.
    """
    X = np.arange(20).reshape(10, 2)
    knots_as_list = [[0.5, 1], [1.5, 2], [5, 10]]
    knots_as_array = np.asarray(knots_as_list)

    from_list = SplineTransformer(degree=3, knots=knots_as_list, n_knots=None)
    from_list.fit(X)
    from_array = SplineTransformer(degree=3, knots=knots_as_array, n_knots=None)
    from_array.fit(X)

    # Compare the knot vector of the fitted B-spline of every input feature.
    for feature_idx in range(X.shape[1]):
        assert_allclose(
            from_list.bsplines_[feature_idx].t,
            from_array.bsplines_[feature_idx].t,
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("extrapolation", ["continue", "periodic"])
def test_spline_transformer_integer_knots(extrapolation):
    """Test that SplineTransformer accepts integer value knot positions."""
    X = np.arange(20).reshape(10, 2)
    integer_knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]]
    splt = SplineTransformer(
        degree=3, knots=integer_knots, extrapolation=extrapolation
    )
    # Only checks that fitting and transforming does not raise.
    splt.fit_transform(X)
|
||||
|
||||
|
||||
# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_spline_transformer_feature_names(get_names):
    """Test that SplineTransformer generates correct features name."""
    X = np.arange(20).reshape(10, 2)

    # Default input names, with the bias spline: 5 names per feature.
    with_bias = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X)
    default_names = getattr(with_bias, get_names)()
    expected_default = [
        f"x{feature}_sp_{spline}" for feature in range(2) for spline in range(5)
    ]
    assert_array_equal(default_names, expected_default)

    # Explicit input names, without the bias spline: 4 names per feature.
    without_bias = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X)
    custom_names = getattr(without_bias, get_names)(["a", "b"])
    expected_custom = [
        f"{feature}_sp_{spline}" for feature in ("a", "b") for spline in range(4)
    ]
    assert_array_equal(custom_names, expected_custom)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("degree", range(1, 5))
@pytest.mark.parametrize("n_knots", range(3, 5))
@pytest.mark.parametrize("knots", ["uniform", "quantile"])
@pytest.mark.parametrize("extrapolation", ["constant", "periodic"])
def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation):
    """Check that the B-spline basis is a decomposition of unity.

    For inputs that stay within the training boundaries, the spline basis
    functions must add up to 1 on every row.
    """
    grid = np.linspace(0, 1, 100)[:, None]
    X_test = grid[1::2, :]
    # Explicitly add 0 and 1 so that both boundaries are part of the training set.
    X_train = np.r_[[[0]], grid[::2, :], [[1]]]

    if extrapolation == "periodic":
        # periodic splines require degree < n_knots
        n_knots += degree

    splt = SplineTransformer(
        n_knots=n_knots,
        degree=degree,
        knots=knots,
        include_bias=True,
        extrapolation=extrapolation,
    )
    splt.fit(X_train)

    for data in (X_train, X_test):
        row_sums = np.sum(splt.transform(data), axis=1)
        assert_allclose(row_sums, 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
def test_spline_transformer_linear_regression(bias, intercept):
    """Check that a spline basis plus OLS reproduces a sine curve closely."""
    X = np.linspace(0, 10, 100)[:, None]
    # Shift by 2 so the target never hits 0, which a relative tolerance
    # in assert_allclose cannot handle.
    y = np.sin(X[:, 0]) + 2

    spline_step = SplineTransformer(
        n_knots=15,
        degree=3,
        include_bias=bias,
        extrapolation="constant",
    )
    ols_step = LinearRegression(fit_intercept=intercept)
    pipe = Pipeline(steps=[("spline", spline_step), ("ols", ols_step)])

    pipe.fit(X, y)
    y_pred = pipe.predict(X)
    assert_allclose(y_pred, y, rtol=1e-3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["knots", "n_knots", "sample_weight", "expected_knots"],
    [
        ("uniform", 3, None, np.array([[0, 2], [3, 8], [6, 14]])),
        (
            "uniform",
            3,
            np.array([0, 0, 1, 1, 0, 3, 1]),
            np.array([[2, 2], [4, 8], [6, 14]]),
        ),
        ("uniform", 4, None, np.array([[0, 2], [2, 6], [4, 10], [6, 14]])),
        ("quantile", 3, None, np.array([[0, 2], [3, 3], [6, 14]])),
        (
            "quantile",
            3,
            np.array([0, 0, 1, 1, 0, 3, 1]),
            np.array([[2, 2], [5, 8], [6, 14]]),
        ),
    ],
)
def test_spline_transformer_get_base_knot_positions(
    knots, n_knots, sample_weight, expected_knots
):
    """Check the base knot positions computed with and without `sample_weight`."""
    X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]])

    # The helper is called on the class itself, without an instance.
    computed_knots = SplineTransformer._get_base_knot_positions(
        X=X,
        knots=knots,
        n_knots=n_knots,
        sample_weight=sample_weight,
    )

    assert_allclose(computed_knots, expected_knots)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "knots, n_knots, degree",
    [
        ("uniform", 5, 3),
        ("uniform", 12, 8),
        (
            [[-1.0, 0.0], [0, 1.0], [0.1, 2.0], [0.2, 3.0], [0.3, 4.0], [1, 5.0]],
            None,
            3,
        ),
    ],
)
def test_spline_transformer_periodicity_of_extrapolation(knots, n_knots, degree):
    """Check periodic extrapolation with several features at once."""
    # The second grid is the first one shifted by (2, 5), i.e. by the length
    # of the training range of each column.
    X_fit = np.linspace((-1, 0), (1, 5), 10)
    X_shifted = np.linspace((1, 5), (3, 10), 10)

    splt = SplineTransformer(
        knots=knots,
        n_knots=n_knots,
        degree=degree,
        extrapolation="periodic",
    )
    splt.fit(X_fit)

    inside = splt.transform(X_fit)
    one_period_later = splt.transform(X_shifted)
    assert_allclose(inside, one_period_later)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
def test_spline_transformer_periodic_linear_regression(bias, intercept):
    """Check that periodic splines plus OLS fit a periodic target closely."""

    def target(x):
        # The "+ 3" offset keeps the target away from 0 for assert_allclose.
        return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3

    X = np.linspace(0, 1, 101)[:, None]
    spline_step = SplineTransformer(
        n_knots=20,
        degree=3,
        include_bias=bias,
        extrapolation="periodic",
    )
    ols_step = LinearRegression(fit_intercept=intercept)
    pipe = Pipeline(steps=[("spline", spline_step), ("ols", ols_step)])
    pipe.fit(X, target(X[:, 0]))

    # Predict on a wider grid to exercise the periodic extrapolation.
    X_wide = np.linspace(-1, 2, 301)[:, None]
    predictions = pipe.predict(X_wide)

    assert_allclose(predictions, target(X_wide[:, 0]), atol=0.01, rtol=0.01)
    # Consecutive windows of 100 grid points must carry matching predictions.
    first_window = predictions[0:100]
    second_window = predictions[100:200]
    assert_allclose(first_window, second_window, rtol=1e-3)
|
||||
|
||||
|
||||
def test_spline_transformer_periodic_spline_backport():
    """Compare SplineTransformer's periodic mode with BSpline's own."""
    X = np.linspace(-2, 3.5, 10)[:, None]
    degree = 2

    # Result from the periodic extrapolation implemented in SplineTransformer.
    Xt = SplineTransformer(
        degree=degree,
        extrapolation="periodic",
        knots=[[-1.0], [0.0], [1.0]],
    ).fit_transform(X)

    # Reference result from BSpline evaluated with extrapolate="periodic".
    coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
    reference_spline = BSpline(
        t=np.arange(-3, 4), c=coef, k=degree, extrapolate="periodic"
    )
    Xt_reference = reference_spline(X[:, 0])

    assert_allclose(Xt, Xt_reference)
|
||||
|
||||
|
||||
def test_spline_transformer_periodic_splines_periodicity():
    """Check that shifted knots only permute the periodic spline columns."""
    X = np.linspace(0, 10, 101)[:, None]

    base_knots = [[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]]
    shifted_knots = [[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]]

    Xt_base = SplineTransformer(
        degree=3, extrapolation="periodic", knots=base_knots
    ).fit_transform(X)
    Xt_shifted = SplineTransformer(
        degree=3, extrapolation="periodic", knots=shifted_knots
    ).fit_transform(X)

    # The shifted transformer yields the same columns, rotated by one position.
    column_order = [4, 0, 1, 2, 3]
    assert_allclose(Xt_base, Xt_shifted[:, column_order])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("degree", [3, 5])
def test_spline_transformer_periodic_splines_smoothness(degree):
    """Check that periodic splines stay smooth across the first / last knot."""
    X = np.linspace(-2, 10, 10_000)[:, None]
    Xt = SplineTransformer(
        degree=degree,
        extrapolation="periodic",
        knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
    ).fit_transform(X)

    step = (X.max() - X.min()) / len(X)
    threshold = 10 * step

    # A spline of degree `degree` is expected to be (`degree` - 1) times
    # continuously differentiable, i.e. its derivatives of order
    # 0, ..., `degree` - 1 are continuous. Continuity of one derivative shows
    # up as small increments between neighbouring grid points (below
    # `threshold` in absolute value). Order 0 is the function itself, so its
    # continuity is checked as well.
    derivative = Xt
    for _order in range(1, degree + 1):
        # Increments of the (_order - 1)-th derivative must be small ...
        increments = np.diff(derivative, axis=0)
        assert np.abs(increments).max() < threshold
        # ... and dividing by the grid step gives the _order-th derivative.
        derivative = increments / step

    # The `degree`-th derivative is not continuous at the knots, so its
    # increments are expected to show spikes there.
    increments = np.diff(derivative, axis=0)
    assert np.abs(increments).max() > 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
@pytest.mark.parametrize("degree", [1, 2, 3, 4, 5])
def test_spline_transformer_extrapolation(bias, intercept, degree):
    """Check the 'constant', 'linear' and 'error' extrapolation modes."""
    # A straight line makes the expected extrapolated values easy to state.
    X = np.linspace(-1, 1, 100)[:, None]
    y = X.squeeze()
    outside_points = [[-10], [5]]

    # 'constant' repeats the boundary values, 'linear' continues the line.
    expected_by_mode = (("constant", [-1, 1]), ("linear", [-10, 5]))
    for mode, expected in expected_by_mode:
        pipe = Pipeline(
            [
                [
                    "spline",
                    SplineTransformer(
                        n_knots=4,
                        degree=degree,
                        include_bias=bias,
                        extrapolation=mode,
                    ),
                ],
                ["ols", LinearRegression(fit_intercept=intercept)],
            ]
        )
        pipe.fit(X, y)
        assert_allclose(pipe.predict(outside_points), expected)

    # 'error' refuses any input outside the range seen during fit.
    splt = SplineTransformer(
        n_knots=4, degree=degree, include_bias=bias, extrapolation="error"
    )
    splt.fit(X)
    for single_point in ([[-10]], [[5]]):
        with pytest.raises(ValueError):
            splt.transform(single_point)
|
||||
|
||||
|
||||
def test_spline_transformer_kbindiscretizer():
    """Check that degree-0 B-splines match KBinsDiscretizer's one-hot output."""
    rng = np.random.RandomState(97531)
    X = rng.randn(200).reshape(200, 1)
    n_bins = 5

    # n_bins intervals are delimited by n_bins + 1 knots.
    spline_encoder = SplineTransformer(
        n_knots=n_bins + 1, degree=0, knots="quantile", include_bias=True
    )
    bin_encoder = KBinsDiscretizer(
        n_bins=n_bins, encode="onehot-dense", strategy="quantile"
    )

    splines = spline_encoder.fit_transform(X)
    kbins = bin_encoder.fit_transform(X)

    # Exact equality is expected, but compare with a very tight tolerance.
    assert_allclose(splines, kbins, rtol=1e-13)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_knots", [5, 10])
@pytest.mark.parametrize("include_bias", [True, False])
@pytest.mark.parametrize("degree", [3, 5])
def test_spline_transformer_n_features_out(n_knots, include_bias, degree):
    """Check that transform outputs exactly `n_features_out_` columns."""
    X = np.linspace(0, 1, 10)[:, None]
    splt = SplineTransformer(n_knots=n_knots, degree=degree, include_bias=include_bias)
    splt.fit(X)

    n_columns = splt.transform(X).shape[1]
    assert n_columns == splt.n_features_out_
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, err_msg",
    [
        ({"degree": -1}, "degree must be a non-negative integer"),
        ({"degree": 2.5}, "degree must be a non-negative int or tuple"),
        ({"degree": "12"}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": "string"}, "degree must be a non-negative int or tuple"),
        ({"degree": (-1, 2)}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": (0, 1.5)}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": (3, 2)}, r"degree=\(min_degree, max_degree\) must"),
    ],
)
def test_polynomial_features_input_validation(params, err_msg):
    """Check that PolynomialFeatures.fit rejects invalid `degree` values."""
    X = [[1], [2]]
    estimator = PolynomialFeatures(**params)

    # The error is expected from `fit`, so only that call sits in the context.
    with pytest.raises(ValueError, match=err_msg):
        estimator.fit(X)
|
||||
|
||||
|
||||
@pytest.fixture()
def single_feature_degree3():
    """Return one integer feature and its powers 0 to 3 as columns."""
    X = np.arange(6)[:, np.newaxis]
    # Column order: constant term first, then X, X**2 and X**3.
    columns = [np.ones_like(X)]
    columns.extend(X**power for power in (1, 2, 3))
    P = np.hstack(columns)
    return X, P
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "degree, include_bias, interaction_only, indices",
    [
        (3, True, False, slice(None, None)),
        (3, False, False, slice(1, None)),
        (3, True, True, [0, 1]),
        (3, False, True, [1]),
        ((2, 3), True, False, [0, 2, 3]),
        ((2, 3), False, False, [2, 3]),
        ((2, 3), True, True, [0]),
        ((2, 3), False, True, []),
    ],
)
@pytest.mark.parametrize(
    "sparse_X",
    [False, sparse.csr_matrix, sparse.csc_matrix],
)
def test_polynomial_features_one_feature(
    single_feature_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    sparse_X,
):
    """Check PolynomialFeatures on a single feature, up to degree 3.

    `indices` selects the columns of the full expansion `P` expected for the
    given settings. `sparse_X` is either False (dense input) or the sparse
    container used to wrap the input.
    """
    X, P = single_feature_degree3
    X_in = sparse_X(X) if sparse_X else X

    tf = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    )
    tf.fit(X_in)
    Xt = tf.transform(X_in)
    if sparse_X:
        # Densify so the result can be compared with the dense reference.
        Xt = Xt.toarray()

    assert_allclose(Xt, P[:, indices])
    # `powers_` is only checked when there is at least one output feature.
    if tf.n_output_features_ > 0:
        expected_shape = (tf.n_output_features_, tf.n_features_in_)
        assert tf.powers_.shape == expected_shape
|
||||
|
||||
|
||||
@pytest.fixture()
def two_features_degree3():
    """Return two integer features and all their monomials up to degree 3."""
    X = np.arange(6).reshape((3, 2))
    x1, x2 = X[:, :1], X[:, 1:]

    # (exponent of x1, exponent of x2); the list position is the column of P.
    exponents = [
        (0, 0),  # 0
        (1, 0),  # 1
        (0, 1),  # 2
        (2, 0),  # 3
        (1, 1),  # 4
        (0, 2),  # 5
        (3, 0),  # 6
        (2, 1),  # 7
        (1, 2),  # 8
        (0, 3),  # 9
    ]
    P = np.hstack([x1**e1 * x2**e2 for e1, e2 in exponents])
    return X, P
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "degree, include_bias, interaction_only, indices",
    [
        (2, True, False, slice(0, 6)),
        (2, False, False, slice(1, 6)),
        (2, True, True, [0, 1, 2, 4]),
        (2, False, True, [1, 2, 4]),
        ((2, 2), True, False, [0, 3, 4, 5]),
        ((2, 2), False, False, [3, 4, 5]),
        ((2, 2), True, True, [0, 4]),
        ((2, 2), False, True, [4]),
        (3, True, False, slice(None, None)),
        (3, False, False, slice(1, None)),
        (3, True, True, [0, 1, 2, 4]),
        (3, False, True, [1, 2, 4]),
        ((2, 3), True, False, [0, 3, 4, 5, 6, 7, 8, 9]),
        ((2, 3), False, False, slice(3, None)),
        ((2, 3), True, True, [0, 4]),
        ((2, 3), False, True, [4]),
        ((3, 3), True, False, [0, 6, 7, 8, 9]),
        ((3, 3), False, False, [6, 7, 8, 9]),
        ((3, 3), True, True, [0]),
        ((3, 3), False, True, []),  # would need 3 input features
    ],
)
@pytest.mark.parametrize(
    "sparse_X",
    [False, sparse.csr_matrix, sparse.csc_matrix],
)
def test_polynomial_features_two_features(
    two_features_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    sparse_X,
):
    """Check PolynomialFeatures on two features, up to degree 3.

    `indices` selects the columns of the full expansion `P` expected for the
    given settings. `sparse_X` is either False (dense input) or the sparse
    container used to wrap the input.
    """
    X, P = two_features_degree3
    X_in = sparse_X(X) if sparse_X else X

    tf = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    )
    tf.fit(X_in)
    Xt = tf.transform(X_in)
    if sparse_X:
        # Densify so the result can be compared with the dense reference.
        Xt = Xt.toarray()

    assert_allclose(Xt, P[:, indices])
    # `powers_` is only checked when there is at least one output feature.
    if tf.n_output_features_ > 0:
        expected_shape = (tf.n_output_features_, tf.n_features_in_)
        assert tf.powers_.shape == expected_shape
|
||||
|
||||
|
||||
# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_polynomial_feature_names(get_names):
    """Check the output feature names reported by PolynomialFeatures.

    Every name lookup goes through ``getattr(poly, get_names)`` so that both
    parametrized accessors are exercised by every assertion.
    """
    X = np.arange(30).reshape(10, 3)

    # Default input names, degree 2, bias column kept.
    poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
    feature_names = getattr(poly, get_names)()
    assert_array_equal(
        ["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    # Custom input names, all terms of degree 1 to 3.
    poly = PolynomialFeatures(degree=3, include_bias=False).fit(X)
    feature_names = getattr(poly, get_names)(["a", "b", "c"])
    assert_array_equal(
        [
            "a",
            "b",
            "c",
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    # degree=(2, 3): the degree-1 terms are left out.
    poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X)
    feature_names = getattr(poly, get_names)(["a", "b", "c"])
    assert_array_equal(
        [
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    # Interaction-only terms of exactly degree 3, plus the bias column.
    poly = PolynomialFeatures(
        degree=(3, 3), include_bias=True, interaction_only=True
    ).fit(X)
    feature_names = getattr(poly, get_names)(["a", "b", "c"])
    assert_array_equal(["1", "a b c"], feature_names)
    assert len(feature_names) == poly.transform(X).shape[1]

    # test some unicode
    # "\U0001F40D" needs the 8-digit escape: "\u" consumes only four hex
    # digits, which would yield U+0001 followed by the literal text "F40D".
    unicode_names = ["\U0001F40D", "\u262E", "\u05D0"]
    poly = PolynomialFeatures(degree=1, include_bias=True).fit(X)
    feature_names = getattr(poly, get_names)(unicode_names)
    assert_array_equal(["1"] + unicode_names, feature_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (1, True, False, int),
        (2, True, False, int),
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
        (4, False, False, np.float64),
        (4, False, True, np.float64),
    ],
)
def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype):
    """Check that CSC input gives the same expansion as dense input.

    The sparse result must stay in CSC format and share the dtype of the
    dense result.
    """
    rng = np.random.RandomState(0)
    X = rng.randint(0, 2, (100, 2))
    X_csc = sparse.csc_matrix(X)

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csc = est.fit_transform(X_csc.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype))

    assert isinstance(Xt_csc, sparse.csc_matrix)
    assert Xt_csc.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy drops.
    assert_array_almost_equal(Xt_csc.toarray(), Xt_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (1, True, False, int),
        (2, True, False, int),
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
    ],
)
def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype):
    """Check that CSR input gives the same expansion as dense input.

    The sparse result must stay in CSR format and share the dtype of the
    dense result.
    """
    rng = np.random.RandomState(0)
    X = rng.randint(0, 2, (100, 2))
    X_csr = sparse.csr_matrix(X)

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype, copy=False))

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy drops.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_features", [1, 4, 5])
@pytest.mark.parametrize(
    "min_degree, max_degree", [(0, 1), (0, 2), (1, 3), (0, 4), (3, 4)]
)
@pytest.mark.parametrize("interaction_only", [True, False])
@pytest.mark.parametrize("include_bias", [True, False])
def test_num_combinations(
    n_features,
    min_degree,
    max_degree,
    interaction_only,
    include_bias,
):
    """
    Test that n_output_features_ is calculated correctly.

    The count stored by `fit` is compared with the number of index
    combinations actually generated by `PolynomialFeatures._combinations`,
    for the same `(min_degree, max_degree)` range.
    """
    # A single stored entry in the last column fixes the matrix width to
    # `n_features` columns.
    x = sparse.csr_matrix(([1], ([0], [n_features - 1])))
    est = PolynomialFeatures(
        # Pass both bounds so the parametrized `min_degree` is really tested.
        degree=(min_degree, max_degree),
        interaction_only=interaction_only,
        include_bias=include_bias,
    )
    est.fit(x)
    num_combos = est.n_output_features_

    combos = PolynomialFeatures._combinations(
        n_features=n_features,
        min_degree=min_degree,
        max_degree=max_degree,
        interaction_only=interaction_only,
        include_bias=include_bias,
    )
    # `combos` is consumed once here just to count its items.
    assert num_combos == sum(1 for _ in combos)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
    ],
)
def test_polynomial_features_csr_X_floats(deg, include_bias, interaction_only, dtype):
    """Check CSR against dense expansion on random floating point data.

    The sparse result must stay in CSR format and share the dtype of the
    dense result.
    """
    X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
    X = X_csr.toarray()

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype))

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy drops.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["zero_row_index", "deg", "interaction_only"],
    [
        (0, 2, True),
        (1, 2, True),
        (2, 2, True),
        (0, 3, True),
        (1, 3, True),
        (2, 3, True),
        (0, 2, False),
        (1, 2, False),
        (2, 2, False),
        (0, 3, False),
        (1, 3, False),
        (2, 3, False),
    ],
)
def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, interaction_only):
    """Check CSR against dense expansion when one input row is all zeros."""
    X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr()
    # Zero out one row (first, middle or last, depending on the parameter).
    X_csr[zero_row_index, :] = 0.0
    X = X_csr.toarray()

    est = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only)
    Xt_csr = est.fit_transform(X_csr)
    Xt_dense = est.fit_transform(X)

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy drops.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
# This degree should always be one more than the highest degree supported by
# _csr_expansion.
@pytest.mark.parametrize(
    ["include_bias", "interaction_only"],
    [(True, True), (True, False), (False, True), (False, False)],
)
def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only):
    """Check CSR against dense expansion for degree 4.

    The sparse result must stay in CSR format and share the dtype of the
    dense result.
    """
    X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
    X = X_csr.toarray()

    est = PolynomialFeatures(
        4, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr)
    Xt_dense = est.fit_transform(X)

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy drops.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["deg", "dim", "interaction_only"],
    [
        (2, 1, True),
        (2, 2, True),
        (3, 1, True),
        (3, 2, True),
        (3, 3, True),
        (2, 1, False),
        (2, 2, False),
        (3, 1, False),
        (3, 2, False),
        (3, 3, False),
    ],
)
def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only):
    """Check CSR against dense expansion when the input has very few columns.

    Every case uses a number of columns `dim` no larger than the degree.
    """
    X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr()
    X = X_csr.toarray()

    est = PolynomialFeatures(deg, interaction_only=interaction_only)
    Xt_csr = est.fit_transform(X_csr)
    Xt_dense = est.fit_transform(X)

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy drops.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
def test_polynomial_features_deprecated_n_input_features():
    """Check that reading `n_input_features_` emits a FutureWarning."""
    # FIXME: remove in 1.2
    X = np.arange(10).reshape(5, 2)
    depr_msg = (
        "The attribute `n_input_features_` was deprecated in version "
        "1.0 and will be removed in 1.2."
    )

    # Attribute access alone must trigger the warning; the value is unused.
    with pytest.warns(FutureWarning, match=depr_msg):
        PolynomialFeatures().fit(X).n_input_features_
|
||||
|
||||
|
||||
# TODO: Remove in 1.2 when get_feature_names is removed
@pytest.mark.parametrize("Transformer", [SplineTransformer, PolynomialFeatures])
def test_get_feature_names_deprecated(Transformer):
    """Check that calling `get_feature_names` emits a FutureWarning."""
    X = np.arange(30).reshape(10, 3)
    fitted = Transformer().fit(X)

    expected_warning = "get_feature_names is deprecated in 1.0"
    with pytest.warns(FutureWarning, match=expected_warning):
        fitted.get_feature_names()
|
||||
|
||||
|
||||
def test_polynomial_features_behaviour_on_zero_degree():
    """Check how PolynomialFeatures handles a maximum degree of zero.

    Without a bias column the output would be empty, so an error is expected;
    with a bias column the output is a single column of ones.
    """
    X = np.ones((10, 2))

    # NOTE(review): the "min_deree" spelling below presumably mirrors the
    # estimator's own error message - confirm before correcting it here.
    failing_cases = [
        (
            0,
            "Setting degree to zero and include_bias to False would result in"
            " an empty output array.",
        ),
        (
            (0, 0),
            "Setting both min_deree and max_degree to zero and include_bias to"
            " False would result in an empty output array.",
        ),
    ]
    for degree, err_msg in failing_cases:
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        with pytest.raises(ValueError, match=err_msg):
            poly.fit_transform(X)

    expected_output = np.ones((X.shape[0], 1))
    for container in (X, sparse.csr_matrix(X), sparse.csc_matrix(X)):
        poly = PolynomialFeatures(degree=0, include_bias=True)
        output = poly.fit_transform(container)
        # convert to dense array if needed
        if sparse.issparse(output):
            output = output.toarray()
        assert_array_equal(output, expected_output)
|
||||
Reference in New Issue
Block a user