first commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,183 @@
|
||||
import warnings
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from scipy import sparse
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from sklearn.base import clone
|
||||
|
||||
from sklearn.preprocessing import maxabs_scale
|
||||
from sklearn.preprocessing import minmax_scale
|
||||
from sklearn.preprocessing import scale
|
||||
from sklearn.preprocessing import power_transform
|
||||
from sklearn.preprocessing import quantile_transform
|
||||
from sklearn.preprocessing import robust_scale
|
||||
|
||||
from sklearn.preprocessing import MaxAbsScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import PowerTransformer
|
||||
from sklearn.preprocessing import QuantileTransformer
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
# Iris dataset, loaded once at import time; tests below work on a copy of
# ``iris.data`` before injecting NaN into it.
iris = load_iris()
|
||||
|
||||
|
||||
def _get_valid_samples_by_column(X, col):
    """Return the non-NaN entries of column ``col`` of ``X`` as a 2D array.

    Parameters
    ----------
    X : ndarray
        Two-dimensional floating-point array that may contain NaN.
    col : int
        Index of the column to extract.

    Returns
    -------
    ndarray of shape (n_valid, 1)
        Rows of the selected column whose value is not NaN.
    """
    not_missing = ~np.isnan(X[:, col])
    # Index with a one-element list so the column keeps its 2D shape.
    return X[:, [col]][not_missing]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    """Check that NaN values pass through the estimator and its function twin.

    The test injects NaN into a copy of the iris data and verifies that:

    * ``transform`` and ``inverse_transform`` keep NaN exactly where it was,
      without emitting ``RuntimeWarning``;
    * ``func`` called with the estimator parameters (minus ``omit_kwargs``)
      matches the estimator output;
    * an estimator fitted on NaN-free columns transforms NaN data the same way;
    * when ``support_sparse`` is true, sparse inputs give the dense results.
    """
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[
        rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
    ] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt = est.fit(X_train).transform(X_test)
    # ensure no warnings are raised
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt_class = est.transform(X_train)
    kwargs = est.get_params()
    # remove the parameters which should be omitted because they
    # are not defined in the counterpart function of the preprocessing class
    for kwarg in omit_kwargs:
        kwargs.pop(kwarg)
    Xt_func = func(X_train, **kwargs)
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_col = est.transform(X_test[:, [i]])
        assert_allclose(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_dense = est_dense.fit(X_train).transform(X_test)
            Xt_inv_dense = est_dense.inverse_transform(Xt_dense)

        for sparse_constructor in (
            sparse.csr_matrix,
            sparse.csc_matrix,
            sparse.bsr_matrix,
            sparse.coo_matrix,
            sparse.dia_matrix,
            sparse.dok_matrix,
            sparse.lil_matrix,
        ):
            # check that the dense and sparse inputs lead to the same results
            # precompute the matrix to avoid catching side warnings
            X_train_sp = sparse_constructor(X_train)
            X_test_sp = sparse_constructor(X_test)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)

            # ``toarray()`` is the explicit densification method; the ``.A``
            # shorthand is not available on every SciPy sparse container.
            assert_allclose(Xt_sp.toarray(), Xt_dense)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                warnings.simplefilter("error", RuntimeWarning)
                Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)

            assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est, func",
    [
        (MaxAbsScaler(), maxabs_scale),
        (MinMaxScaler(), minmax_scale),
        (StandardScaler(), scale),
        (StandardScaler(with_mean=False), scale),
        (PowerTransformer("yeo-johnson"), power_transform),
        (PowerTransformer("box-cox"), power_transform),
        (QuantileTransformer(n_quantiles=3), quantile_transform),
        (RobustScaler(), robust_scale),
        (RobustScaler(with_centering=False), robust_scale),
    ],
)
def test_missing_value_pandas_na_support(est, func):
    """Check that a pd.NA dataframe is transformed like the NaN ndarray.

    Skipped when pandas is not installed.
    """
    # Test pandas IntegerArray with pd.NA
    pd = pytest.importorskip("pandas")

    feature_rows = [
        [1, 2, 3, np.nan, np.nan, 4, 5, 1],
        [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
    ]
    # Transpose so each list above becomes one column.
    X = np.array(feature_rows).T

    # Creates dataframe with IntegerArrays with pd.NA; column "c" has no
    # missing entry and is cast to a plain integer dtype.
    X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"])
    X_df["c"] = X_df["c"].astype("int")

    from_array = est.fit_transform(X)
    from_frame = est.fit_transform(X_df)

    assert_allclose(from_array, from_frame)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,472 @@
|
||||
import pytest
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
import warnings
|
||||
|
||||
from sklearn import clone
|
||||
from sklearn.preprocessing import KBinsDiscretizer
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.utils._testing import (
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
assert_allclose_dense_sparse,
|
||||
)
|
||||
|
||||
# Shared 4-sample x 4-feature fixture; tests that need other data define a
# local ``X`` that shadows this one.
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected",
    [
        ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
        ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
    ],
)
def test_fit_transform(strategy, expected):
    """Check the ordinal codes obtained with three bins for each strategy."""
    discretizer = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
    discretizer.fit(X)
    codes = discretizer.transform(X)
    assert_array_equal(expected, codes)
|
||||
|
||||
|
||||
def test_valid_n_bins():
    """Check that Python and NumPy integer ``n_bins`` values are accepted."""
    for n_bins in (2, np.array([2])[0]):
        KBinsDiscretizer(n_bins=n_bins).fit_transform(X)

    fitted = KBinsDiscretizer(n_bins=2).fit(X)
    assert fitted.n_bins_.dtype == np.dtype(int)
|
||||
|
||||
|
||||
def test_invalid_n_bins():
    """Check that a scalar ``n_bins`` of 1 or of float type raises ValueError."""
    invalid_cases = [
        (
            1,
            "KBinsDiscretizer received an invalid number of bins. Received 1, expected at"
            " least 2.",
        ),
        (
            1.1,
            "KBinsDiscretizer received an invalid n_bins type. Received float, expected"
            " int.",
        ),
    ]
    for n_bins, err_msg in invalid_cases:
        est = KBinsDiscretizer(n_bins=n_bins)
        with pytest.raises(ValueError, match=err_msg):
            est.fit_transform(X)
|
||||
|
||||
|
||||
def test_invalid_n_bins_array():
    """Check the errors raised for malformed array-like ``n_bins`` values."""
    shape_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    invalid_cases = [
        # Bad shape
        (np.full((2, 4), 2.0), shape_msg),
        # Incorrect number of features
        ([1, 2, 2], shape_msg),
        # Bad bin values
        (
            [1, 2, 2, 1],
            "KBinsDiscretizer received an invalid number of bins "
            "at indices 0, 3. Number of bins must be at least 2, "
            "and must be an int.",
        ),
        # Float bin values
        (
            [2.1, 2, 2.1, 2],
            "KBinsDiscretizer received an invalid number of bins "
            "at indices 0, 2. Number of bins must be at least 2, "
            "and must be an int.",
        ),
    ]
    for n_bins, err_msg in invalid_cases:
        est = KBinsDiscretizer(n_bins=n_bins)
        with pytest.raises(ValueError, match=err_msg):
            est.fit_transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected",
    [
        ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
        ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
    ],
)
def test_fit_transform_n_bins_array(strategy, expected):
    """Check codes and ``bin_edges_`` shapes with a per-feature ``n_bins``."""
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy)
    est.fit(X)
    assert_array_equal(expected, est.transform(X))

    # test the shape of bin_edges_: one entry per feature, each holding
    # n_bins + 1 edges
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features,)
    for feature_edges, feature_n_bins in zip(est.bin_edges_, est.n_bins_):
        assert feature_edges.shape == (feature_n_bins + 1,)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_same_min_max(strategy):
    """Check that a constant feature warns, gets one bin and is coded as 0."""
    # NOTE: this changes the process-wide warning filters for the rest of the
    # test; kept as in the original.
    warnings.simplefilter("always")
    X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]])
    est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal")

    with pytest.warns(
        UserWarning, match="Feature 0 is constant and will be replaced with 0."
    ):
        est.fit(X)
    assert est.n_bins_[0] == 1

    # replace the feature with zeros
    codes = est.transform(X)
    assert_array_equal(codes[:, 0], np.zeros(X.shape[0]))
|
||||
|
||||
|
||||
def test_transform_1d_behavior():
    """Check that 1D input is rejected by both ``fit`` and ``transform``."""
    X_1d = np.arange(4)

    # fit on 1D data fails
    unfitted = KBinsDiscretizer(n_bins=2)
    with pytest.raises(ValueError):
        unfitted.fit(X_1d)

    # transform on 1D data fails even after a valid 2D fit
    fitted = KBinsDiscretizer(n_bins=2)
    fitted.fit(X_1d.reshape(-1, 1))
    with pytest.raises(ValueError):
        fitted.transform(X_1d)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("i", range(1, 9))
def test_numeric_stability(i):
    """Check that binning is unchanged when the data is scaled by ``10**-i``."""
    base = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1)
    expected_codes = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Shrink the data by a power of ten; the two-bin assignment must not move.
    scaled = base / 10**i
    codes = KBinsDiscretizer(n_bins=2, encode="ordinal").fit_transform(scaled)
    assert_array_equal(expected_codes, codes)
|
||||
|
||||
|
||||
def test_invalid_encode_option():
    """Check that an unknown ``encode`` value raises ValueError at fit time."""
    expected_error = (
        r"Valid options for 'encode' are "
        r"\('onehot', 'onehot-dense', 'ordinal'\). "
        r"Got encode='invalid-encode' instead."
    )
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="invalid-encode")
    with pytest.raises(ValueError, match=expected_error):
        discretizer.fit(X)
|
||||
|
||||
|
||||
def test_encode_options():
    """Check one-hot outputs against OneHotEncoder applied to ordinal codes."""
    n_bins = [2, 3, 3, 3]

    ordinal = KBinsDiscretizer(n_bins=n_bins, encode="ordinal").fit(X)
    Xt_ordinal = ordinal.transform(X)

    # Dense one-hot output must equal a dense OneHotEncoder on the codes.
    onehot_dense = KBinsDiscretizer(n_bins=n_bins, encode="onehot-dense").fit(X)
    Xt_dense = onehot_dense.transform(X)
    assert not sp.issparse(Xt_dense)
    reference_dense = OneHotEncoder(
        categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse=False
    ).fit_transform(Xt_ordinal)
    assert_array_equal(reference_dense, Xt_dense)

    # Sparse one-hot output must equal a sparse OneHotEncoder on the codes.
    onehot_sparse = KBinsDiscretizer(n_bins=n_bins, encode="onehot").fit(X)
    Xt_sparse = onehot_sparse.transform(X)
    assert sp.issparse(Xt_sparse)
    reference_sparse = OneHotEncoder(
        categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse=True
    ).fit_transform(Xt_ordinal)
    assert_array_equal(reference_sparse.toarray(), Xt_sparse.toarray())
|
||||
|
||||
|
||||
def test_invalid_strategy_option():
    """Check that an unknown ``strategy`` value raises ValueError at fit time."""
    expected_error = (
        r"Valid options for 'strategy' are "
        r"\('uniform', 'quantile', 'kmeans'\). "
        r"Got strategy='invalid-strategy' instead."
    )
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy="invalid-strategy")
    with pytest.raises(ValueError, match=expected_error):
        discretizer.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected_2bins, expected_3bins, expected_5bins",
    [
        ("uniform", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
        ("kmeans", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
        ("quantile", [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4]),
    ],
)
def test_nonuniform_strategies(
    strategy, expected_2bins, expected_3bins, expected_5bins
):
    """Check the codes for unevenly spaced data with 2, 3 and 5 bins."""
    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)

    expectations = ((2, expected_2bins), (3, expected_3bins), (5, expected_5bins))
    for n_bins, expected in expectations:
        est = KBinsDiscretizer(n_bins=n_bins, strategy=strategy, encode="ordinal")
        codes = est.fit_transform(X)
        assert_array_equal(expected, codes.ravel())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected_inv",
    [
        (
            "uniform",
            [
                [-1.5, 2.0, -3.5, -0.5],
                [-0.5, 3.0, -2.5, -0.5],
                [0.5, 4.0, -1.5, 0.5],
                [0.5, 4.0, -1.5, 1.5],
            ],
        ),
        (
            "kmeans",
            [
                [-1.375, 2.125, -3.375, -0.5625],
                [-1.375, 2.125, -3.375, -0.5625],
                [-0.125, 3.375, -2.125, 0.5625],
                [0.75, 4.25, -1.25, 1.625],
            ],
        ),
        (
            "quantile",
            [
                [-1.5, 2.0, -3.5, -0.75],
                [-0.5, 3.0, -2.5, 0.0],
                [0.5, 4.0, -1.5, 1.25],
                [0.5, 4.0, -1.5, 1.25],
            ],
        ),
    ],
)
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_inverse_transform(strategy, encode, expected_inv):
    """Check ``inverse_transform`` output for every strategy and encoding."""
    discretizer = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    encoded = discretizer.fit_transform(X)
    recovered = discretizer.inverse_transform(encoded)
    assert_array_almost_equal(expected_inv, recovered)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_transform_outside_fit_range(strategy):
    """Check that values outside the fitted range land in the extreme bins."""
    X_fit = np.array([0, 1, 2, 3])[:, None]
    kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode="ordinal")
    kbd.fit(X_fit)

    # One value below and one above everything seen during fit.
    X_outside = np.array([-2, 5])[:, None]
    codes = kbd.transform(X_outside)
    assert_array_equal(codes.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(codes.min(axis=0), [0])
|
||||
|
||||
|
||||
def test_overwrite():
    """Check that the transform methods do not modify their input in place."""
    X = np.array([0, 1, 2, 3])[:, None]
    X_snapshot = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_snapshot)

    Xt_snapshot = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_snapshot)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy, expected_bin_edges",
    [
        ("quantile", [0, 1, 3]),
        ("kmeans", [0, 1.5, 3]),
    ],
)
def test_redundant_bins(strategy, expected_bin_edges):
    """Check the warning and resulting edges when too many bins are requested."""
    X = [[0], [0], [0], [0], [3], [3]]
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy)
    with pytest.warns(UserWarning, match="Consider decreasing the number of bins."):
        kbd.fit(X)
    assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)
|
||||
|
||||
|
||||
def test_percentile_numeric_stability():
    """Check quantile edges and codes when 10 bins are requested for 3 samples."""
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    expected_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    expected_codes = np.array([0, 0, 4]).reshape(-1, 1)

    kbd = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
    with pytest.warns(UserWarning, match="Consider decreasing the number of bins."):
        kbd.fit(X)

    assert_array_almost_equal(kbd.bin_edges_[0], expected_edges)
    assert_array_almost_equal(kbd.transform(X), expected_codes)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("out_dtype", [None, np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_consistent_dtype(in_dtype, out_dtype, encode):
    """Check the output dtype for each combination of input and ``dtype``."""
    X_input = np.array(X, dtype=in_dtype)
    kbd = KBinsDiscretizer(n_bins=3, encode=encode, dtype=out_dtype)

    # an error is raised if an unsupported dtype is requested for the model
    if out_dtype not in [None, np.float32, np.float64]:
        with pytest.raises(ValueError, match="Valid options for 'dtype' are"):
            kbd.fit(X_input)
        return

    kbd.fit(X_input)

    # test output dtype
    if out_dtype is not None:
        expected_dtype = out_dtype
    elif X_input.dtype == np.float16:
        # float16 input with no explicit dtype is expected to come out as
        # np.float64
        expected_dtype = np.float64
    else:
        expected_dtype = X_input.dtype

    Xt = kbd.transform(X_input)
    assert Xt.dtype == expected_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_32_equal_64(input_dtype, encode):
    """Check that float32 and float64 outputs hold the same values."""
    # TODO this check is redundant with common checks and can be removed
    # once #16290 is merged
    X_input = np.array(X, dtype=input_dtype)

    # Fit and transform once per output precision, 32 bit first then 64 bit.
    outputs = []
    for out_dtype in (np.float32, np.float64):
        kbd = KBinsDiscretizer(n_bins=3, encode=encode, dtype=out_dtype)
        kbd.fit(X_input)
        outputs.append(kbd.transform(X_input))

    Xt_32, Xt_64 = outputs
    assert_allclose_dense_sparse(Xt_32, Xt_64)
|
||||
|
||||
|
||||
# FIXME: remove the `filterwarnings` in 1.3
@pytest.mark.filterwarnings("ignore:In version 1.3 onwards, subsample=2e5")
@pytest.mark.parametrize("subsample", [None, "warn"])
def test_kbinsdiscretizer_subsample_default(subsample):
    """Check that ``subsample`` of None or "warn" gives the default bin edges."""
    # Since the size of X is small (< 2e5), subsampling will not take place.
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)

    reference = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
    reference.fit(X)

    candidate = clone(reference)
    candidate.set_params(subsample=subsample)
    candidate.fit(X)

    edge_pairs = zip(reference.bin_edges_[0], candidate.bin_edges_[0])
    for reference_edge, candidate_edge in edge_pairs:
        np.testing.assert_allclose(reference_edge, candidate_edge)
    assert reference.bin_edges_.shape == candidate.bin_edges_.shape
|
||||
|
||||
|
||||
def test_kbinsdiscretizer_subsample_invalid_strategy():
    """Check that ``subsample`` with a non-quantile strategy raises ValueError."""
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    params = dict(n_bins=10, encode="ordinal", strategy="uniform", subsample=3)
    kbd = KBinsDiscretizer(**params)

    with pytest.raises(
        ValueError, match='`subsample` must be used with `strategy="quantile"`.'
    ):
        kbd.fit(X)
|
||||
|
||||
|
||||
def test_kbinsdiscretizer_subsample_invalid_type():
    """Check that a string ``subsample`` value raises TypeError at fit time."""
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    params = dict(n_bins=10, encode="ordinal", strategy="quantile", subsample="full")
    kbd = KBinsDiscretizer(**params)

    with pytest.raises(
        TypeError, match="subsample must be an instance of int, not str."
    ):
        kbd.fit(X)
|
||||
|
||||
|
||||
# TODO: Remove in 1.3
def test_kbinsdiscretizer_subsample_warn():
    """Check the FutureWarning raised for more than 2e5 samples by default."""
    # Seeded generator so the input is reproducible; the array is already of
    # shape (n_samples, 1), so no reshape is needed.
    rng = np.random.RandomState(0)
    X = rng.rand(200001, 1)
    kbd = KBinsDiscretizer(n_bins=100, encode="ordinal", strategy="quantile")

    msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
    with pytest.warns(FutureWarning, match=msg):
        kbd.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("subsample", [0, int(2e5)])
def test_kbinsdiscretizer_subsample_values(subsample):
    """Check ``subsample=0`` is rejected and ``subsample=2e5`` changes the edges.

    The input has more than 2e5 rows, so the sub-sampled fit is expected to
    produce bin edges that differ from the default fit while keeping the same
    ``bin_edges_`` shape.
    """
    # Seeded generator so the input data is reproducible; the array is already
    # of shape (n_samples, 1), so no reshape is needed.
    rng = np.random.RandomState(0)
    X = rng.rand(220000, 1)
    kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")

    kbd_with_subsampling = clone(kbd_default)
    kbd_with_subsampling.set_params(subsample=subsample)

    if subsample == 0:
        with pytest.raises(ValueError, match="subsample == 0, must be >= 1."):
            kbd_with_subsampling.fit(X)
    else:
        # TODO: Remove in 1.3
        msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
        with pytest.warns(FutureWarning, match=msg):
            kbd_default.fit(X)

        kbd_with_subsampling.fit(X)
        assert not np.all(
            kbd_default.bin_edges_[0] == kbd_with_subsampling.bin_edges_[0]
        )
        assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "encode, expected_names",
    [
        (
            "onehot",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        (
            "onehot-dense",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        ("ordinal", [f"feat{col_id}" for col_id in range(3)]),
    ],
)
def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names):
    """Check get_feature_names_out for different settings.

    The number of names must match the number of transformed columns and the
    names themselves must equal ``expected_names``.

    Non-regression test for #22731
    """
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    input_features = [f"feat{i}" for i in range(3)]

    kbd = KBinsDiscretizer(n_bins=4, encode=encode).fit(X)
    Xt = kbd.transform(X)

    output_names = kbd.get_feature_names_out(input_features)
    assert Xt.shape[1] == output_names.shape[0]

    assert_array_equal(output_names, expected_names)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,392 @@
|
||||
import warnings
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from sklearn.utils import _safe_indexing
|
||||
|
||||
from sklearn.preprocessing import FunctionTransformer
|
||||
from sklearn.utils._testing import (
|
||||
assert_array_equal,
|
||||
assert_allclose_dense_sparse,
|
||||
_convert_container,
|
||||
)
|
||||
|
||||
|
||||
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
    """Build a wrapper around ``func`` that records the arguments it receives.

    Parameters
    ----------
    args_store : list
        Mutated in place: ``X`` and any extra positional arguments of each
        call are appended to it.
    kwargs_store : dict
        Mutated in place: updated with the keyword arguments of each call.
    func : callable, default=identity on ``X``
        Function applied to ``X`` to produce the wrapper's return value.

    Returns
    -------
    callable
        ``_func(X, *args, **kwargs)`` returning ``func(X)``.
    """

    def _func(X, *args, **kwargs):
        args_store.append(X)
        args_store.extend(args)
        kwargs_store.update(kwargs)
        # Only X is forwarded; extra arguments are recorded but not passed on.
        return func(X)

    return _func
|
||||
|
||||
|
||||
def test_delegate_to_func():
    """Check that FunctionTransformer calls ``func`` with ``X`` and nothing else.

    The same check is run twice, each time with a freshly built transformer
    and emptied argument stores.
    """
    # (args|kwargs)_store will hold the positional and keyword arguments
    # passed to the function inside the FunctionTransformer.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))

    for _ in range(2):
        # reset the argument stores.
        args_store[:] = []
        kwargs_store.clear()

        transformed = FunctionTransformer(
            _make_func(args_store, kwargs_store)
        ).transform(X)
        assert_array_equal(
            transformed, X, err_msg="transform should have returned X unchanged"
        )

        # The function should only have received X.
        assert args_store == [
            X
        ], "Incorrect positional arguments passed to func: {args}".format(
            args=args_store
        )
        assert (
            not kwargs_store
        ), "Unexpected keyword arguments passed to func: {args}".format(
            args=kwargs_store
        )
|
||||
|
||||
|
||||
def test_np_log():
    """Check that wrapping ``np.log1p`` gives the same result as calling it."""
    X = np.arange(10).reshape((5, 2))

    # Test that the documented NumPy log-transform example (here with
    # ``np.log1p``) still works.
    transformed = FunctionTransformer(np.log1p).transform(X)
    expected = np.log1p(X)
    assert_array_equal(transformed, expected)
|
||||
|
||||
|
||||
def test_kw_arg():
    """Check that ``kw_args`` are forwarded to the wrapped function."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args={"decimals": 3})

    # Test that rounding is correct
    expected = np.around(X, decimals=3)
    assert_array_equal(rounder.transform(X), expected)
|
||||
|
||||
|
||||
def test_kw_arg_update():
    """Check that mutating ``kw_args`` in place affects later transforms."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args={"decimals": 3})

    # Change the stored keyword argument after construction.
    rounder.kw_args["decimals"] = 1

    # Test that rounding is correct
    expected = np.around(X, decimals=1)
    assert_array_equal(rounder.transform(X), expected)
|
||||
|
||||
|
||||
def test_kw_arg_reset():
    """Check that replacing ``kw_args`` wholesale affects later transforms."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args={"decimals": 3})

    # Rebind the attribute to a brand-new dict.
    rounder.kw_args = {"decimals": 1}

    # Test that rounding is correct
    expected = np.around(X, decimals=1)
    assert_array_equal(rounder.transform(X), expected)
|
||||
|
||||
|
||||
def test_inverse_transform():
    """Check that ``inverse_func`` is called with ``inv_kw_args``."""
    X = np.array([1, 4, 9, 16]).reshape((2, 2))

    # Test that inverse_transform works correctly
    transformer = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        inv_kw_args={"decimals": 3},
    )
    round_trip = transformer.inverse_transform(transformer.transform(X))
    expected = np.around(np.sqrt(X), decimals=3)
    assert_array_equal(round_trip, expected)
|
||||
|
||||
|
||||
def test_check_inverse():
    """Check the ``check_inverse`` warning for dense and sparse inputs.

    A func/inverse_func pair that does not round-trip must warn at fit time, a
    proper pair must not, and no check is made when either function is
    missing.
    """
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    X_list = [X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)]

    warning_message = (
        "The provided functions are not strictly"
        " inverse of each other. If you are sure you"
        " want to proceed regardless, set"
        " 'check_inverse=False'."
    )

    for X in X_list:
        accept_sparse = True if sparse.issparse(X) else False

        # sqrt / around are not inverses of each other: a warning is expected.
        trans = FunctionTransformer(
            func=np.sqrt,
            inverse_func=np.around,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with pytest.warns(UserWarning, match=warning_message):
            trans.fit(X)

        # expm1 / log1p round-trip: any UserWarning is turned into an error.
        trans = FunctionTransformer(
            func=np.expm1,
            inverse_func=np.log1p,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            Xt = trans.fit_transform(X)

        assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    for func, inverse_func in ((np.expm1, None), (None, np.expm1)):
        trans = FunctionTransformer(
            func=func, inverse_func=inverse_func, check_inverse=True, validate=True
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            trans.fit(X_dense)
|
||||
|
||||
|
||||
def test_function_transformer_frame():
    """Check that a default FunctionTransformer returns a ``.loc``-bearing object.

    Skipped when pandas is not installed.
    """
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(np.random.randn(100, 10))
    result = FunctionTransformer().fit_transform(frame)
    # ``loc`` is used as the marker that the output is still a dataframe.
    assert hasattr(result, "loc")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X_type", ["array", "series"])
def test_function_transformer_raise_error_with_mixed_dtype(X_type):
    """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype."""
    mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"}
    inverse_mapping = {value: key for key, value in mapping.items()}
    dtype = "object"

    # Mix of strings and integers, stored with object dtype.
    raw_values = ["one", "two", "three", "one", "one", 5, 6]
    data = _convert_container(raw_values, X_type, columns_name=["value"], dtype=dtype)

    def func(X):
        # Map every element through ``mapping``, keeping object dtype.
        mapped = [mapping[_safe_indexing(X, i)] for i in range(X.size)]
        return np.array(mapped, dtype=object)

    def inverse_func(X):
        # Map back and rebuild the same container type as the input.
        restored = [inverse_mapping[x] for x in X]
        return _convert_container(
            restored,
            X_type,
            columns_name=["value"],
            dtype=dtype,
        )

    transformer = FunctionTransformer(
        func=func, inverse_func=inverse_func, validate=False, check_inverse=True
    )

    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(data)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, feature_names_out, input_features, expected",
    [
        # ndarray input, "one-to-one" policy, no input_features: generated names
        pytest.param(
            np.random.rand(100, 3),
            "one-to-one",
            None,
            ("x0", "x1", "x2"),
        ),
        # DataFrame input, "one-to-one" policy: column names are reused
        pytest.param(
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            None,
            ("a", "b"),
        ),
        # ndarray input, callable that ignores its arguments
        pytest.param(
            np.random.rand(100, 3),
            lambda transformer, input_features: ("a", "b"),
            None,
            ("a", "b"),
        ),
        # DataFrame input, callable that ignores its arguments
        pytest.param(
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: ("c", "d", "e"),
            None,
            ("c", "d", "e"),
        ),
        # ndarray input, callable extending the default input_features
        pytest.param(
            np.random.rand(100, 3),
            lambda transformer, input_features: (*input_features, "a"),
            None,
            ("x0", "x1", "x2", "a"),
        ),
        # DataFrame input, callable extending the default input_features
        pytest.param(
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: (*input_features, "c"),
            None,
            ("a", "b", "c"),
        ),
        # ndarray input, explicit input_features
        pytest.param(
            np.random.rand(100, 3),
            "one-to-one",
            ("a", "b", "c"),
            ("a", "b", "c"),
        ),
        # DataFrame input, explicit input_features
        pytest.param(
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            ("a", "b"),  # must match feature_names_in_
            ("a", "b"),
        ),
        # ndarray input, callable combined with explicit input_features
        pytest.param(
            np.random.rand(100, 3),
            lambda transformer, input_features: (*input_features, "d"),
            ("a", "b", "c"),
            ("a", "b", "c", "d"),
        ),
        # DataFrame input, callable combined with explicit input_features
        pytest.param(
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: (*input_features, "c"),
            ("a", "b"),  # must match feature_names_in_
            ("a", "b", "c"),
        ),
    ],
)
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected
):
    """`get_feature_names_out` returns an object ndarray equal to `expected`."""
    if isinstance(X, dict):
        # Dict fixtures are turned into DataFrames here so pandas stays optional.
        X = pytest.importorskip("pandas").DataFrame(X)

    fitted = FunctionTransformer(feature_names_out=feature_names_out, validate=True)
    fitted.fit_transform(X)
    produced = fitted.get_feature_names_out(input_features)

    assert isinstance(produced, np.ndarray)
    assert produced.dtype == object
    assert_array_equal(produced, expected)
|
||||
|
||||
|
||||
def test_function_transformer_get_feature_names_out_without_validation():
    """With `validate=False` and "one-to-one", names must be passed explicitly."""
    est = FunctionTransformer(feature_names_out="one-to-one", validate=False)
    est.fit_transform(np.random.rand(100, 2))

    # Calling without input_features is expected to raise.
    expected_error = "When 'feature_names_out' is 'one-to-one', either"
    with pytest.raises(ValueError, match=expected_error):
        est.get_feature_names_out()

    # Explicit names are passed through as an object ndarray.
    given_names = ("a", "b")
    produced = est.get_feature_names_out(given_names)
    assert isinstance(produced, np.ndarray)
    assert produced.dtype == object
    assert_array_equal(produced, ("a", "b"))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("feature_names_out", ["x0", ["x0"], ("x0",)])
def test_function_transformer_feature_names_out_string(feature_names_out):
    """An arbitrary string, list or tuple as `feature_names_out` raises ValueError."""
    est = FunctionTransformer(feature_names_out=feature_names_out)
    est.fit_transform(np.random.rand(100, 2))

    expected_error = """must either be "one-to-one" or a callable"""
    with pytest.raises(ValueError, match=expected_error):
        est.get_feature_names_out()
|
||||
|
||||
|
||||
def test_function_transformer_feature_names_out_is_None():
    """Without `feature_names_out`, `get_feature_names_out` raises AttributeError."""
    est = FunctionTransformer()
    est.fit_transform(np.random.rand(100, 2))

    expected_error = (
        "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
    )
    with pytest.raises(AttributeError, match=expected_error):
        est.get_feature_names_out()
|
||||
|
||||
|
||||
def test_function_transformer_feature_names_out_uses_estimator():
    """A `feature_names_out` callable can read settings from the transformer."""

    def add_n_random_features(X, n):
        # Append `n` random columns to the right of `X`.
        extra_columns = np.random.rand(len(X), n)
        return np.concatenate([X, extra_columns], axis=1)

    def names_with_random_suffix(transformer, input_features):
        # The number of appended columns is read back from `kw_args`.
        n_extra = transformer.kw_args["n"]
        return [*input_features, *(f"rnd{i}" for i in range(n_extra))]

    transformer = FunctionTransformer(
        func=add_n_random_features,
        feature_names_out=names_with_random_suffix,
        kw_args={"n": 3},
        validate=True,
    )
    pandas = pytest.importorskip("pandas")
    frame = pandas.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    transformer.fit_transform(frame)
    produced = transformer.get_feature_names_out()

    assert isinstance(produced, np.ndarray)
    assert produced.dtype == object
    assert_array_equal(produced, ("a", "b", "rnd0", "rnd1", "rnd2"))
|
||||
|
||||
|
||||
def test_function_transformer_validate_inverse():
    """`inverse_transform` must leave `n_features_in_` as set during fit."""

    def add_constant_feature(X):
        # Append a single column of ones.
        ones_column = np.ones((X.shape[0], 1))
        return np.concatenate((X, ones_column), axis=1)

    def inverse_add_constant(X):
        # Drop the last column again.
        return X[:, :-1]

    X = np.array([[1, 2], [3, 4], [3, 4]])
    n_input_features = X.shape[1]
    transformer = FunctionTransformer(
        func=add_constant_feature,
        inverse_func=inverse_add_constant,
        validate=True,
    )

    widened = transformer.fit_transform(X)
    assert transformer.n_features_in_ == n_input_features

    # `widened` has one more column than `X`; the attribute must not follow it.
    transformer.inverse_transform(widened)
    assert transformer.n_features_in_ == n_input_features
|
||||
@@ -0,0 +1,645 @@
|
||||
import numpy as np
|
||||
|
||||
import pytest
|
||||
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse import csr_matrix
|
||||
from scipy.sparse import dok_matrix
|
||||
from scipy.sparse import lil_matrix
|
||||
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils import _to_object_array
|
||||
|
||||
from sklearn.preprocessing._label import LabelBinarizer
|
||||
from sklearn.preprocessing._label import MultiLabelBinarizer
|
||||
from sklearn.preprocessing._label import LabelEncoder
|
||||
from sklearn.preprocessing._label import label_binarize
|
||||
|
||||
from sklearn.preprocessing._label import _inverse_binarize_thresholding
|
||||
from sklearn.preprocessing._label import _inverse_binarize_multiclass
|
||||
|
||||
from sklearn import datasets
|
||||
|
||||
# Load the iris dataset once, at import time, into a module-level name.
iris = datasets.load_iris()
|
||||
|
||||
|
||||
def toarray(a):
    """Return `a.toarray()` when `a` has a `toarray` attribute, otherwise `a`."""
    return a.toarray() if hasattr(a, "toarray") else a
|
||||
|
||||
|
||||
def test_label_binarizer():
    """Check `LabelBinarizer` on one-class, two-class and multi-class input."""
    # One-class input: every sample is encoded with the negative label.
    single_class = ["pos", "pos", "pos", "pos"]
    all_negative = np.array([[0], [0], [0], [0]])

    # Dense output.
    dense_lb = LabelBinarizer(sparse_output=False)
    dense_out = dense_lb.fit_transform(single_class)
    assert_array_equal(dense_lb.classes_, ["pos"])
    assert_array_equal(all_negative, dense_out)
    assert_array_equal(dense_lb.inverse_transform(dense_out), single_class)

    # Sparse output.
    sparse_lb = LabelBinarizer(sparse_output=True)
    sparse_out = sparse_lb.fit_transform(single_class)
    assert issparse(sparse_out)
    assert_array_equal(sparse_lb.classes_, ["pos"])
    assert_array_equal(all_negative, sparse_out.toarray())
    assert_array_equal(
        sparse_lb.inverse_transform(sparse_out.toarray()), single_class
    )

    # The same instance is refitted for the two remaining cases.
    lb = LabelBinarizer(sparse_output=False)

    # Two-class input is encoded as a single column.
    two_class = ["neg", "pos", "pos", "neg"]
    two_class_expected = np.array([[0], [1], [1], [0]])
    two_class_out = lb.fit_transform(two_class)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(two_class_expected, two_class_out)

    # A two-column indicator matrix is accepted by inverse_transform as well.
    two_columns = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(lb.inverse_transform(two_columns), two_class)

    # Multi-class input: one column per class.
    multi_class = ["spam", "ham", "eggs", "ham", "0"]
    multi_class_expected = np.array(
        [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
    )
    multi_class_out = lb.fit_transform(multi_class)
    assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"])
    assert_array_equal(multi_class_expected, multi_class_out)
    assert_array_equal(lb.inverse_transform(multi_class_out), multi_class)
|
||||
|
||||
|
||||
def test_label_binarizer_unseen_labels():
    """Labels not seen during fit are expected to produce all-zero rows."""
    lb = LabelBinarizer()

    fitted_out = lb.fit_transform(["b", "d", "e"])
    assert_array_equal(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), fitted_out)

    # "a", "c" and "f" were not part of the fitted labels.
    with_unseen = lb.transform(["a", "b", "c", "d", "e", "f"])
    with_unseen_expected = np.array(
        [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]]
    )
    assert_array_equal(with_unseen_expected, with_unseen)
|
||||
|
||||
|
||||
def test_label_binarizer_set_label_encoding():
    """Custom `neg_label` / `pos_label` values round-trip through the binarizer."""
    # Two-class case with pos_label=0.
    binary_lb = LabelBinarizer(neg_label=-2, pos_label=0)
    binary_y = np.array([0, 1, 1, 0])
    binary_out = binary_lb.fit_transform(binary_y)
    assert_array_equal(np.array([[-2], [0], [0], [-2]]), binary_out)
    assert_array_equal(binary_lb.inverse_transform(binary_out), binary_y)

    # Multi-class case with symmetric labels.
    multi_lb = LabelBinarizer(neg_label=-2, pos_label=2)
    multi_y = np.array([3, 2, 1, 2, 0])
    multi_expected = np.array(
        [
            [-2, -2, -2, +2],
            [-2, -2, +2, -2],
            [-2, +2, -2, -2],
            [-2, -2, +2, -2],
            [+2, -2, -2, -2],
        ]
    )
    multi_out = multi_lb.fit_transform(multi_y)
    assert_array_equal(multi_expected, multi_out)
    assert_array_equal(multi_lb.inverse_transform(multi_out), multi_y)
|
||||
|
||||
|
||||
@ignore_warnings
def test_label_binarizer_errors():
    """Invalid arguments and unsupported targets must raise `ValueError`."""
    fitted_on_one_class = LabelBinarizer().fit(np.array([0, 0, 0, 0]))

    # Legacy sequence-of-sequences multi-label input passed to transform.
    legacy_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=legacy_msg):
        fitted_on_one_class.transform([(2, 3), (0,), (0, 2)])

    # An unfitted binarizer refuses both directions.
    unfitted = LabelBinarizer()
    not_fitted_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=not_fitted_msg):
        unfitted.transform([])
    with pytest.raises(ValueError, match=not_fitted_msg):
        unfitted.inverse_transform([])

    # neg_label must be strictly smaller than pos_label.
    binary_labels = [0, 1, 0, 1]
    reversed_labels = LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(
        ValueError, match="neg_label=2 must be strictly less than pos_label=1."
    ):
        reversed_labels.fit(binary_labels)

    equal_labels = LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(
        ValueError, match="neg_label=2 must be strictly less than pos_label=2."
    ):
        equal_labels.fit(binary_labels)

    # Sparse output with a non-zero neg_label.
    sparse_msg = (
        "Sparse binarization is only supported with non zero pos_label and zero "
        "neg_label, got pos_label=2 and neg_label=1"
    )
    sparse_nonzero_neg = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
    with pytest.raises(ValueError, match=sparse_msg):
        sparse_nonzero_neg.fit(binary_labels)

    # Unknown output_type handed to the thresholding helper.
    with pytest.raises(ValueError, match="foo format is not supported"):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence-of-sequences input passed to fit_transform.
    seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    seq_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=seq_msg):
        LabelBinarizer().fit_transform(seq_of_seqs)

    # Number of classes differs from the number of columns of y.
    n_classes_msg = "The number of class is not equal to the number of dimension of y."
    with pytest.raises(ValueError, match=n_classes_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # output_type="binary" combined with a three-column y.
    with pytest.raises(ValueError, match="output_type='binary', but y.shape"):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Multioutput targets, through the estimator and through the function.
    multioutput_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=multioutput_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=multioutput_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, classes, unknown",
    [
        pytest.param(
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            np.array([4], dtype="int64"),
            id="int64",
        ),
        pytest.param(
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
            id="object",
        ),
        pytest.param(
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
            id="str",
        ),
    ],
)
def test_label_encoder(values, classes, unknown):
    """Exercise `LabelEncoder` fit, transform, fit_transform and inverse_transform."""
    expected_codes = [1, 0, 2, 0, 2]

    encoder = LabelEncoder()
    encoder.fit(values)
    assert_array_equal(encoder.classes_, classes)
    assert_array_equal(encoder.transform(values), expected_codes)
    assert_array_equal(encoder.inverse_transform([1, 0, 2, 0, 2]), values)

    # A fresh encoder going through fit_transform yields the same codes.
    encoder = LabelEncoder()
    codes = encoder.fit_transform(values)
    assert_array_equal(codes, expected_codes)

    with pytest.raises(ValueError, match="unseen labels"):
        encoder.transform(unknown)
|
||||
|
||||
|
||||
def test_label_encoder_negative_ints():
    """`LabelEncoder` handles negative integer labels and rejects unseen ones."""
    encoder = LabelEncoder()
    encoder.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])

    labels = [0, 1, 4, 4, 5, -1, -1]
    codes = [1, 2, 3, 3, 4, 0, 0]
    assert_array_equal(encoder.transform(labels), codes)
    assert_array_equal(encoder.inverse_transform(codes), labels)

    # 6 was never seen during fit.
    with pytest.raises(ValueError):
        encoder.transform([0, 6])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["str", "object"])
def test_label_encoder_str_bad_shape(dtype):
    """Passing a bare string to `transform` raises the "1d array" error."""
    encoder = LabelEncoder()
    fruit = np.array(["apple", "orange"], dtype=dtype)
    encoder.fit(fruit)

    expected_error = "should be a 1d array"
    with pytest.raises(ValueError, match=expected_error):
        encoder.transform("apple")
|
||||
|
||||
|
||||
def test_label_encoder_errors():
    """Invalid `LabelEncoder` usage must raise `ValueError`."""
    # Neither direction works on an encoder that was never fitted.
    unfitted = LabelEncoder()
    with pytest.raises(ValueError):
        unfitted.transform([])
    with pytest.raises(ValueError):
        unfitted.inverse_transform([])

    # Codes that were not produced by fit are rejected by inverse_transform.
    encoder = LabelEncoder()
    encoder.fit([1, 2, 3, -1, 1])
    unseen_msg = "contains previously unseen labels"
    for bad_codes in ([-2], [-2, -3, -4]):
        with pytest.raises(ValueError, match=unseen_msg):
            encoder.inverse_transform(bad_codes)

    # An empty string is reported as a shape problem.
    shape_msg = r"should be a 1d array.+shape \(\)"
    with pytest.raises(ValueError, match=shape_msg):
        encoder.inverse_transform("")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [
        pytest.param(np.array([2, 1, 3, 1, 3], dtype="int64"), id="int64"),
        pytest.param(np.array(["b", "a", "c", "a", "c"], dtype=object), id="object"),
        pytest.param(np.array(["b", "a", "c", "a", "c"]), id="str"),
    ],
)
def test_label_encoder_empty_array(values):
    """A fitted encoder maps an empty input to an empty output in both directions."""
    encoder = LabelEncoder()
    encoder.fit(values)

    # Empty transform first, then empty inverse transform.
    for method in (encoder.transform, encoder.inverse_transform):
        assert_array_equal(np.array([]), method([]))
|
||||
|
||||
|
||||
def test_sparse_output_multilabel_binarizer():
    """Check `MultiLabelBinarizer` with and without `sparse_output`."""
    # Factories are used so that one-shot iterators are rebuilt for every call.
    make_inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    expected_dense = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    expected_inverse = make_inputs[0]()

    def run_and_check(binarize, sparse_output):
        # Build a fresh binarizer, apply `binarize` to it and verify the result.
        mlb = MultiLabelBinarizer(sparse_output=sparse_output)
        got = binarize(mlb)
        assert issparse(got) == sparse_output
        if sparse_output:
            # verify CSR assumption that indices and indptr have same dtype
            assert got.indices.dtype == got.indptr.dtype
            got = got.toarray()
        assert_array_equal(expected_dense, got)
        assert_array_equal([1, 2, 3], mlb.classes_)
        assert mlb.inverse_transform(got) == expected_inverse
        return mlb

    for sparse_output in (True, False):
        for make in make_inputs:
            # With fit_transform
            run_and_check(lambda est: est.fit_transform(make()), sparse_output)
            # With fit followed by transform
            mlb = run_and_check(
                lambda est: est.fit(make()).transform(make()), sparse_output
            )

    # The matrix below contains a 2, i.e. it is not a 0/1 indicator matrix.
    with pytest.raises(ValueError):
        mlb.inverse_transform(csr_matrix(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]])))
|
||||
|
||||
|
||||
def test_multilabel_binarizer():
    """Lists, sets and iterators of labels all binarize to the same matrix."""
    # Factories are used so that one-shot iterators are rebuilt for every call.
    make_inputs = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    expected = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    expected_inverse = make_inputs[0]()

    for make in make_inputs:
        # With fit_transform
        one_step = MultiLabelBinarizer()
        one_step_out = one_step.fit_transform(make())
        assert_array_equal(expected, one_step_out)
        assert_array_equal([1, 2, 3], one_step.classes_)
        assert one_step.inverse_transform(one_step_out) == expected_inverse

        # With fit followed by transform
        two_step = MultiLabelBinarizer()
        two_step_out = two_step.fit(make()).transform(make())
        assert_array_equal(expected, two_step_out)
        assert_array_equal([1, 2, 3], two_step.classes_)
        assert two_step.inverse_transform(two_step_out) == expected_inverse
|
||||
|
||||
|
||||
def test_multilabel_binarizer_empty_sample():
    """A sample without any label is encoded as an all-zero row."""
    labels = [[1, 2], [1], []]
    expected = np.array([[1, 1], [1, 0], [0, 0]])
    binarized = MultiLabelBinarizer().fit_transform(labels)
    assert_array_equal(binarized, expected)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_unknown_class():
    """Classes unseen during fit trigger a warning and are left out of the output.

    Both the inferred-classes case and the explicit ``classes=`` case are
    checked against their expected indicator matrix.
    """
    y = [[1, 2]]
    with_unknown = [[4, 1], [2, 0]]
    warning_message = "unknown class.* will be ignored"

    # Classes inferred from `y`: the fitted classes are 1 and 2.
    mlb = MultiLabelBinarizer()
    Y = np.array([[1, 0], [0, 1]])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform(with_unknown)
    # This comparison was missing, leaving the first `Y` and `matrix` unused.
    assert_array_equal(matrix, Y)

    # Classes given explicitly: a third, never-populated column appears.
    Y = np.array([[1, 0, 0], [0, 1, 0]])
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform(with_unknown)
    assert_array_equal(matrix, Y)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_given_classes():
    """An explicit `classes` argument fixes the column order of the output."""
    labels = [(2, 3), (1,), (1, 2)]
    expected = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    class_order = [1, 3, 2]

    # fit_transform()
    one_step = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(one_step.fit_transform(labels), expected)
    assert_array_equal(one_step.classes_, class_order)

    # fit().transform()
    two_step = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(two_step.fit(labels).transform(labels), expected)
    assert_array_equal(two_step.classes_, class_order)

    # An extra class that never occurs adds a leading all-zero column.
    with_extra = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    expected_with_extra = np.hstack(([[0], [0], [0]], expected))
    assert_array_equal(with_extra.fit_transform(labels), expected_with_extra)
    assert_array_equal(with_extra.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    one_shot = iter(labels)
    from_iterator = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(from_iterator.fit(one_shot).transform(one_shot), expected)

    # Duplicate entries in `classes` are rejected by fit.
    duplicate_msg = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    with_duplicates = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=duplicate_msg):
        with_duplicates.fit(one_shot)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_multiple_calls():
    """Changing `classes` between two `fit_transform` calls changes the output."""
    labels = [(2, 3), (1,), (1, 2)]
    expected_first = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    expected_second = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    # First call with the class order 1, 3, 2.
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    first = mlb.fit_transform(labels)
    assert_array_equal(first, expected_first)

    # Second call after reassigning the `classes` parameter on the same object.
    mlb.classes = [1, 2, 3]
    second = mlb.fit_transform(labels)
    assert_array_equal(second, expected_second)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_same_length_sequence():
    """Equal-length label sequences must not be treated as a 2-d array."""
    labels = [[1], [0], [2]]
    expected = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])

    # fit_transform()
    one_step = MultiLabelBinarizer()
    assert_array_equal(one_step.fit_transform(labels), expected)
    assert_array_equal(one_step.inverse_transform(expected), labels)

    # fit().transform()
    two_step = MultiLabelBinarizer()
    assert_array_equal(two_step.fit(labels).transform(labels), expected)
    assert_array_equal(two_step.inverse_transform(expected), labels)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_integer_labels():
    """String and tuple labels are binarized; dict labels raise `TypeError`."""
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    cases = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    expected = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    for raw_labels, classes in cases:
        labels = np.array(raw_labels, dtype=object)

        # fit_transform()
        one_step = MultiLabelBinarizer()
        assert_array_equal(one_step.fit_transform(labels), expected)
        assert_array_equal(one_step.classes_, classes)
        round_trip = np.array(one_step.inverse_transform(expected), dtype=object)
        assert_array_equal(round_trip, labels)

        # fit().transform()
        two_step = MultiLabelBinarizer()
        assert_array_equal(two_step.fit(labels).transform(labels), expected)
        assert_array_equal(two_step.classes_, classes)
        round_trip = np.array(two_step.inverse_transform(expected), dtype=object)
        assert_array_equal(round_trip, labels)

    # The first sample is a bare dict, the second a tuple of two dicts.
    dict_labels = [{}, ({}, {"a": "b"})]
    with pytest.raises(TypeError):
        MultiLabelBinarizer().fit_transform(dict_labels)
|
||||
|
||||
|
||||
def test_multilabel_binarizer_non_unique():
    """Repeated labels within one sample still yield a 0/1 indicator row."""
    labels = [(1, 1, 1, 0)]
    binarized = MultiLabelBinarizer().fit_transform(labels)
    assert_array_equal(binarized, np.array([[1, 1]]))
|
||||
|
||||
|
||||
def test_multilabel_binarizer_inverse_validation():
    """`inverse_transform` rejects non-binary values and wrong column counts."""
    mlb = MultiLabelBinarizer()
    mlb.fit_transform([(1, 1, 1, 0)])

    # Not binary
    with pytest.raises(ValueError):
        mlb.inverse_transform(np.array([[1, 3]]))

    # The following binary cases are fine, however
    for row in ([0, 0], [1, 1], [1, 0]):
        mlb.inverse_transform(np.array([row]))

    # Wrong shape: one column too few, then one too many.
    for row in ([1], [1, 1, 1]):
        with pytest.raises(ValueError):
            mlb.inverse_transform(np.array([row]))
|
||||
|
||||
|
||||
def test_label_binarize_with_class_order():
    """Output columns of `label_binarize` follow the order given in `classes`."""
    cases = [
        # (labels, classes, expected indicator matrix)
        ([1, 6], [1, 2, 4, 6], [[1, 0, 0, 0], [0, 0, 0, 1]]),
        # Modified class order
        ([1, 6], [1, 6, 4, 2], [[1, 0, 0, 0], [0, 1, 0, 0]]),
        (
            [0, 1, 2, 3],
            [3, 2, 0, 1],
            [[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]],
        ),
    ]
    for labels, class_order, expected_rows in cases:
        binarized = label_binarize(labels, classes=class_order)
        assert_array_equal(binarized, np.array(expected_rows))
|
||||
|
||||
|
||||
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check `label_binarize` and `LabelBinarizer` against `expected`.

    Both sparse and dense outputs are exercised. When sparse output is
    requested with `pos_label == 0` or a non-zero `neg_label`, only the
    `ValueError` is checked. Otherwise the binarized result, its sparsity and
    the inverse mapping back to `y` are verified.
    """
    for sparse_output in (True, False):
        binarize_kwargs = {
            "classes": classes,
            "neg_label": neg_label,
            "pos_label": pos_label,
            "sparse_output": sparse_output,
        }

        if sparse_output and (pos_label == 0 or neg_label != 0):
            with pytest.raises(ValueError):
                label_binarize(y, **binarize_kwargs)
            continue

        # check label_binarize
        from_function = label_binarize(y, **binarize_kwargs)
        assert_array_equal(toarray(from_function), expected)
        assert issparse(from_function) == sparse_output

        # check inverse
        target_type = type_of_target(y)
        if target_type != "multiclass":
            recovered = _inverse_binarize_thresholding(
                from_function,
                output_type=target_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )
        else:
            recovered = _inverse_binarize_multiclass(from_function, classes=classes)

        assert_array_equal(toarray(recovered), toarray(y))

        # Check label binarizer
        binarizer = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        from_estimator = binarizer.fit_transform(y)
        assert_array_equal(toarray(from_estimator), expected)
        assert issparse(from_estimator) == sparse_output

        round_trip = binarizer.inverse_transform(from_estimator)
        assert_array_equal(toarray(round_trip), toarray(y))
        assert issparse(round_trip) == issparse(y)
|
||||
|
||||
|
||||
def test_label_binarize_binary():
    """Binary targets are binarized into a single column."""
    y = [0, 1, 0]
    classes = [0, 1]

    # Non-zero neg_label: check_binarized_results expects sparse output to fail.
    pos_label, neg_label = 2, -1
    expected = np.array([[-1], [2], [-1]])
    check_binarized_results(y, classes, pos_label, neg_label, expected)

    # Binary case where sparse_output = True will not result in a ValueError
    pos_label, neg_label = 3, 0
    expected = np.array([[0], [3], [0]])
    check_binarized_results(y, classes, pos_label, neg_label, expected)
|
||||
|
||||
|
||||
def test_label_binarize_multiclass():
    """Multiclass targets are binarized into one column per class."""
    y = [0, 1, 2]
    classes = [0, 1, 2]
    pos_label, neg_label = 2, 0
    expected = 2 * np.eye(3)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    # Sparse output together with a non-zero neg_label must be rejected.
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
|
||||
|
||||
|
||||
def test_label_binarize_multilabel():
    """Dense and sparse indicator matrices are all accepted as multilabel input."""
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label, neg_label = 2, 0
    expected = pos_label * y_ind

    # The dense matrix first, then the same data in five sparse formats.
    sparse_formats = (coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix)
    candidates = [y_ind] + [to_sparse(y_ind) for to_sparse in sparse_formats]

    for y in candidates:
        check_binarized_results(y, classes, pos_label, neg_label, expected)

    # `y` is still the last candidate here, i.e. the lil_matrix.
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
|
||||
|
||||
|
||||
def test_invalid_input_label_binarize():
    """`label_binarize` raises `ValueError` on three kinds of invalid input."""
    # pos_label smaller than neg_label
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)

    # Float-valued targets
    continuous_msg = "continuous target data is not "
    with pytest.raises(ValueError, match=continuous_msg):
        label_binarize([1.2, 2.7], classes=[0, 1])

    # A 2-d target combined with three classes
    mismatch_msg = "mismatch with the labels"
    with pytest.raises(ValueError, match=mismatch_msg):
        label_binarize([[1, 3]], classes=[1, 2, 3])
|
||||
|
||||
|
||||
def test_inverse_binarize_multiclass():
    """Check `_inverse_binarize_multiclass` on a sparse matrix with negative entries."""
    scores = csr_matrix([[0, 1, 0], [-1, 0, -1], [0, 0, 0]])
    class_labels = np.arange(3)
    recovered = _inverse_binarize_multiclass(scores, class_labels)
    assert_array_equal(recovered, np.array([1, 1, 0]))
|
||||
@@ -0,0 +1,930 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import sparse
|
||||
from scipy.sparse import random as sparse_random
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
from numpy.testing import assert_allclose, assert_array_equal
|
||||
from scipy.interpolate import BSpline
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import (
|
||||
KBinsDiscretizer,
|
||||
PolynomialFeatures,
|
||||
SplineTransformer,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer))
def test_polynomial_and_spline_array_order(est):
    """The transformed array follows the memory layout requested via `order`."""
    X = np.arange(10).reshape(5, 2)

    default_out = est().fit_transform(X)
    c_out = est(order="C").fit_transform(X)
    f_out = est(order="F").fit_transform(X)

    # C order is detected through the transpose being in Fortran order.
    assert np.isfortran(default_out.T)
    assert np.isfortran(c_out.T)
    assert np.isfortran(f_out)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, err_msg",
    [
        # degree
        ({"degree": -1}, "degree must be a non-negative integer"),
        ({"degree": 2.5}, "degree must be a non-negative integer"),
        ({"degree": "string"}, "degree must be a non-negative integer"),
        # n_knots (the {"n_knots": 1} case used to be listed twice verbatim)
        ({"n_knots": 1}, "n_knots must be a positive integer >= 2."),
        ({"n_knots": 2.5}, "n_knots must be a positive integer >= 2."),
        ({"n_knots": "string"}, "n_knots must be a positive integer >= 2."),
        # knots given explicitly
        ({"knots": 1}, "Expected 2D array, got scalar array instead:"),
        ({"knots": [1, 2]}, "Expected 2D array, got 1D array instead:"),
        (
            {"knots": [[1]]},
            r"Number of knots, knots.shape\[0\], must be >= 2.",
        ),
        (
            {"knots": [[1, 5], [2, 6]]},
            r"knots.shape\[1\] == n_features is violated.",
        ),
        (
            {"knots": [[1], [1], [2]]},
            "knots must be sorted without duplicates.",
        ),
        ({"knots": [[2], [1]]}, "knots must be sorted without duplicates."),
        # extrapolation
        (
            {"extrapolation": None},
            "extrapolation must be one of 'error', 'constant', 'linear', "
            "'continue' or 'periodic'.",
        ),
        (
            {"extrapolation": 1},
            "extrapolation must be one of 'error', 'constant', 'linear', "
            "'continue' or 'periodic'.",
        ),
        (
            {"extrapolation": "string"},
            "extrapolation must be one of 'error', 'constant', 'linear', "
            "'continue' or 'periodic'.",
        ),
        # include_bias
        ({"include_bias": None}, "include_bias must be bool."),
        ({"include_bias": 1}, "include_bias must be bool."),
        ({"include_bias": "string"}, "include_bias must be bool."),
        # periodic splines: degree must stay below the number of knots
        (
            {"extrapolation": "periodic", "n_knots": 3, "degree": 3},
            "Periodic splines require degree < n_knots. Got n_knots=3 and degree=3.",
        ),
        (
            {"extrapolation": "periodic", "knots": [[0], [1]], "degree": 2},
            "Periodic splines require degree < n_knots. Got n_knots=2 and degree=2.",
        ),
    ],
)
def test_spline_transformer_input_validation(params, err_msg):
    """Test that we raise errors for invalid input in SplineTransformer.

    Each case fits a `SplineTransformer` built from `params` on a two-sample,
    one-feature `X` and expects a `ValueError` whose message matches `err_msg`
    (used as a regular expression by `pytest.raises`).
    """
    X = [[1], [2]]

    with pytest.raises(ValueError, match=err_msg):
        SplineTransformer(**params).fit(X)
|
||||
|
||||
|
||||
def test_spline_transformer_manual_knot_input():
    """Check that list and ndarray knot positions give the same knot vectors.

    The same knots are passed once as a nested list and once as a NumPy
    array; the ``t`` attribute of each fitted per-feature spline must match.
    """
    X = np.arange(20).reshape(10, 2)
    knots_as_list = [[0.5, 1], [1.5, 2], [5, 10]]
    knots_as_array = np.asarray(knots_as_list)

    from_list = SplineTransformer(degree=3, knots=knots_as_list, n_knots=None)
    from_list.fit(X)
    from_array = SplineTransformer(degree=3, knots=knots_as_array, n_knots=None)
    from_array.fit(X)

    for feature_idx in range(X.shape[1]):
        assert_allclose(
            from_list.bsplines_[feature_idx].t,
            from_array.bsplines_[feature_idx].t,
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("extrapolation", ["continue", "periodic"])
def test_spline_transformer_integer_knots(extrapolation):
    """Check that integer-valued knot positions are accepted without error."""
    X = np.arange(20).reshape(10, 2)
    integer_knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]]
    transformer = SplineTransformer(
        degree=3, knots=integer_knots, extrapolation=extrapolation
    )
    # Only the absence of an exception matters here; the output is discarded.
    transformer.fit_transform(X)
|
||||
|
||||
|
||||
# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_spline_transformer_feature_names(get_names):
    """Check the feature names generated by a fitted SplineTransformer.

    Expected names have the form ``<input feature>_sp_<index>``. Both the
    default input names (``x0``, ``x1``) and user-supplied names are checked.
    """
    X = np.arange(20).reshape(10, 2)

    # include_bias=True: five names are expected per input feature.
    with_bias = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X)
    expected_default = [f"x{col}_sp_{k}" for col in range(2) for k in range(5)]
    assert_array_equal(getattr(with_bias, get_names)(), expected_default)

    # include_bias=False: four names are expected per input feature.
    without_bias = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X)
    expected_custom = [f"{name}_sp_{k}" for name in ("a", "b") for k in range(4)]
    assert_array_equal(getattr(without_bias, get_names)(["a", "b"]), expected_custom)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("degree", range(1, 5))
@pytest.mark.parametrize("n_knots", range(3, 5))
@pytest.mark.parametrize("knots", ["uniform", "quantile"])
@pytest.mark.parametrize("extrapolation", ["constant", "periodic"])
def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation):
    """Check that the spline basis functions sum to 1 for every sample.

    The check is run on data lying within the range seen during fit, both on
    the training samples and on held-out samples.
    """
    grid = np.linspace(0, 1, 100)[:, None]
    # Explicitly add 0 and 1 so the boundaries are part of the training data.
    X_train = np.vstack([[[0]], grid[::2, :], [[1]]])
    X_test = grid[1::2, :]

    # Periodic splines require degree < n_knots, so enlarge n_knots then.
    if extrapolation == "periodic":
        effective_n_knots = n_knots + degree
    else:
        effective_n_knots = n_knots

    splt = SplineTransformer(
        n_knots=effective_n_knots,
        degree=degree,
        knots=knots,
        include_bias=True,
        extrapolation=extrapolation,
    )
    splt.fit(X_train)

    for data in (X_train, X_test):
        row_sums = np.sum(splt.transform(data), axis=1)
        assert_allclose(row_sums, 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
def test_spline_transformer_linear_regression(bias, intercept):
    """Check that a spline basis followed by OLS fits a sinusoidal curve well."""
    X = np.linspace(0, 10, 100)[:, None]
    # Shift the target by 2 so it never takes the value 0 in assert_allclose.
    y = np.sin(X[:, 0]) + 2

    spline = SplineTransformer(
        n_knots=15,
        degree=3,
        include_bias=bias,
        extrapolation="constant",
    )
    ols = LinearRegression(fit_intercept=intercept)
    pipe = Pipeline(steps=[("spline", spline), ("ols", ols)])

    pipe.fit(X, y)
    assert_allclose(pipe.predict(X), y, rtol=1e-3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "knots, n_knots, sample_weight, expected_knots",
    [
        ("uniform", 3, None, np.array([[0, 2], [3, 8], [6, 14]])),
        (
            "uniform",
            3,
            np.array([0, 0, 1, 1, 0, 3, 1]),
            np.array([[2, 2], [4, 8], [6, 14]]),
        ),
        ("uniform", 4, None, np.array([[0, 2], [2, 6], [4, 10], [6, 14]])),
        ("quantile", 3, None, np.array([[0, 2], [3, 3], [6, 14]])),
        (
            "quantile",
            3,
            np.array([0, 0, 1, 1, 0, 3, 1]),
            np.array([[2, 2], [5, 8], [6, 14]]),
        ),
    ],
)
def test_spline_transformer_get_base_knot_positions(
    knots, n_knots, sample_weight, expected_knots
):
    """Check the base knot positions computed with and without `sample_weight`."""
    X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]])
    call_kwargs = {
        "X": X,
        "knots": knots,
        "n_knots": n_knots,
        "sample_weight": sample_weight,
    }
    computed_knots = SplineTransformer._get_base_knot_positions(**call_kwargs)
    assert_allclose(computed_knots, expected_knots)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["knots", "n_knots", "degree"],
    [
        ("uniform", 5, 3),
        ("uniform", 12, 8),
        (
            [[-1.0, 0.0], [0, 1.0], [0.1, 2.0], [0.2, 3.0], [0.3, 4.0], [1, 5.0]],
            None,
            3,
        ),
    ],
)
def test_spline_transformer_periodicity_of_extrapolation(knots, n_knots, degree):
    """Check periodic extrapolation on data with two features.

    The transformer is fitted on ``X_fit``; ``X_shifted`` starts where
    ``X_fit`` ends, and both must produce the same transformed values.
    """
    X_fit = np.linspace((-1, 0), (1, 5), 10)
    X_shifted = np.linspace((1, 5), (3, 10), 10)

    splt = SplineTransformer(
        knots=knots, n_knots=n_knots, degree=degree, extrapolation="periodic"
    )
    splt.fit(X_fit)

    reference = splt.transform(X_fit)
    shifted = splt.transform(X_shifted)
    assert_allclose(reference, shifted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
def test_spline_transformer_periodic_linear_regression(bias, intercept):
    """Check that periodic splines followed by OLS fit a periodic curve well."""

    def target(x):
        # The "+ 3" offset avoids the value 0 in assert_allclose.
        return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3

    X = np.linspace(0, 1, 101)[:, None]
    spline = SplineTransformer(
        n_knots=20,
        degree=3,
        include_bias=bias,
        extrapolation="periodic",
    )
    ols = LinearRegression(fit_intercept=intercept)
    pipe = Pipeline(steps=[("spline", spline), ("ols", ols)])
    pipe.fit(X, target(X[:, 0]))

    # Predict on a wider interval than the training one to exercise the
    # periodic extrapolation.
    X_wide = np.linspace(-1, 2, 301)[:, None]
    predictions = pipe.predict(X_wide)
    assert_allclose(predictions, target(X_wide[:, 0]), atol=0.01, rtol=0.01)
    # Consecutive blocks of 100 predictions must agree with each other.
    first_block = predictions[0:100]
    second_block = predictions[100:200]
    assert_allclose(first_block, second_block, rtol=1e-3)
|
||||
|
||||
|
||||
def test_spline_transformer_periodic_spline_backport():
    """Compare SplineTransformer's periodic extrapolation with BSpline's own."""
    X = np.linspace(-2, 3.5, 10)[:, None]
    degree = 2

    # Periodic extrapolation as implemented by SplineTransformer.
    transformer = SplineTransformer(
        degree=degree, extrapolation="periodic", knots=[[-1.0], [0.0], [1.0]]
    )
    from_transformer = transformer.fit_transform(X)

    # Periodic extrapolation evaluated by BSpline with explicit coefficients.
    coefficients = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
    knot_vector = np.arange(-3, 4)
    reference_spline = BSpline(knot_vector, coefficients, degree, "periodic")
    from_bspline = reference_spline(X[:, 0])

    assert_allclose(from_transformer, from_bspline)
|
||||
|
||||
|
||||
def test_spline_transformer_periodic_splines_periodicity():
    """Check that shifted knots give the same output up to a column permutation."""
    X = np.linspace(0, 10, 101)[:, None]

    base_knots = [[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]]
    shifted_knots = [[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]]

    outputs = []
    for knot_positions in (base_knots, shifted_knots):
        transformer = SplineTransformer(
            degree=3,
            extrapolation="periodic",
            knots=knot_positions,
        )
        outputs.append(transformer.fit_transform(X))
    Xt_base, Xt_shifted = outputs

    # Column order of the shifted transformer that lines up with the base one.
    column_order = [4, 0, 1, 2, 3]
    assert_allclose(Xt_base, Xt_shifted[:, column_order])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("degree", [3, 5])
def test_spline_transformer_periodic_splines_smoothness(degree):
    """Check smoothness of periodic splines, including at the first/last knot.

    Numerical derivatives are built by repeated finite differences on a fine
    grid that extends beyond the knot range.
    """
    X = np.linspace(-2, 10, 10_000)[:, None]

    transformer = SplineTransformer(
        degree=degree,
        extrapolation="periodic",
        knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
    )
    Xt = transformer.fit_transform(X)

    step = (X.max() - X.min()) / len(X)
    tolerance = 10 * step

    # A spline of degree `degree` is expected to be (`degree` - 1) times
    # continuously differentiable, i.e. its derivatives of order
    # 0, ..., `degree` - 1 are continuous. A derivative is treated as
    # continuous when the differences between neighbouring grid values stay
    # below `tolerance` in absolute value. The derivative of order 0 is the
    # function itself, so its continuity is checked as well.
    derivative = Xt
    for _ in range(degree):
        # Continuity check of the current derivative.
        step_changes = np.diff(derivative, axis=0)
        assert np.abs(step_changes).max() < tolerance
        # Move on to the next numerical derivative.
        derivative = step_changes / step

    # The derivative of order `degree` is not continuous at the knots, so its
    # finite differences are expected to show spikes there.
    step_changes = np.diff(derivative, axis=0)
    assert np.abs(step_changes).max() > 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
@pytest.mark.parametrize("degree", [1, 2, 3, 4, 5])
def test_spline_transformer_extrapolation(bias, intercept, degree):
    """Check the 'constant', 'linear' and 'error' extrapolation modes.

    A straight line ``y = x`` on [-1, 1] is fitted, then evaluated at the
    out-of-range points -10 and 5.
    """
    X = np.linspace(-1, 1, 100)[:, None]
    y = X.squeeze()

    # Expected predictions at [-10, 5] for each regression-based mode:
    # 'constant' keeps the boundary values, 'linear' continues the line.
    expected_by_mode = {"constant": [-1, 1], "linear": [-10, 5]}
    for mode, expected in expected_by_mode.items():
        spline = SplineTransformer(
            n_knots=4,
            degree=degree,
            include_bias=bias,
            extrapolation=mode,
        )
        ols = LinearRegression(fit_intercept=intercept)
        pipe = Pipeline([("spline", spline), ("ols", ols)])
        pipe.fit(X, y)
        assert_allclose(pipe.predict([[-10], [5]]), expected)

    # 'error': transforming any out-of-range sample must raise.
    splt = SplineTransformer(
        n_knots=4, degree=degree, include_bias=bias, extrapolation="error"
    )
    splt.fit(X)
    for out_of_range in ([[-10]], [[5]]):
        with pytest.raises(ValueError):
            splt.transform(out_of_range)
|
||||
|
||||
|
||||
def test_spline_transformer_kbindiscretizer():
    """Check that degree-0 B-splines match KBinsDiscretizer's one-hot output."""
    rng = np.random.RandomState(97531)
    X = rng.randn(200).reshape(200, 1)
    n_bins = 5
    # n_bins intervals are delimited by n_bins + 1 knots.
    n_knots = n_bins + 1

    spline_transformer = SplineTransformer(
        n_knots=n_knots, degree=0, knots="quantile", include_bias=True
    )
    spline_features = spline_transformer.fit_transform(X)

    discretizer = KBinsDiscretizer(
        n_bins=n_bins, encode="onehot-dense", strategy="quantile"
    )
    binned_features = discretizer.fit_transform(X)

    # Exact equality is expected; a very tight relative tolerance is used
    # instead of a strict comparison.
    assert_allclose(spline_features, binned_features, rtol=1e-13)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_knots", [5, 10])
@pytest.mark.parametrize("include_bias", [True, False])
@pytest.mark.parametrize("degree", [3, 5])
def test_spline_transformer_n_features_out(n_knots, include_bias, degree):
    """Check that the transformed width equals the `n_features_out_` attribute."""
    X = np.linspace(0, 1, 10)[:, None]
    splt = SplineTransformer(n_knots=n_knots, degree=degree, include_bias=include_bias)
    splt.fit(X)

    n_columns = splt.transform(X).shape[1]
    assert n_columns == splt.n_features_out_
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["params", "err_msg"],
    [
        ({"degree": -1}, "degree must be a non-negative integer"),
        ({"degree": 2.5}, "degree must be a non-negative int or tuple"),
        ({"degree": "12"}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": "string"}, "degree must be a non-negative int or tuple"),
        ({"degree": (-1, 2)}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": (0, 1.5)}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": (3, 2)}, r"degree=\(min_degree, max_degree\) must"),
    ],
)
def test_polynomial_features_input_validation(params, err_msg):
    """Check that invalid `degree` values raise a ValueError with `err_msg`."""
    X = [[1], [2]]

    with pytest.raises(ValueError, match=err_msg):
        transformer = PolynomialFeatures(**params)
        transformer.fit(X)
|
||||
|
||||
|
||||
@pytest.fixture()
def single_feature_degree3():
    """Return one-feature data and its full polynomial expansion up to degree 3.

    Returns
    -------
    X : ndarray of shape (6, 1)
        The integers 0..5 as a single column.
    P : ndarray of shape (6, 4)
        Columns are X**0, X**1, X**2 and X**3, in that order.
    """
    X = np.arange(6)[:, np.newaxis]
    powers = [X**exponent for exponent in range(4)]
    P = np.hstack(powers)
    return X, P
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["degree", "include_bias", "interaction_only", "indices"],
    [
        (3, True, False, slice(None, None)),
        (3, False, False, slice(1, None)),
        (3, True, True, [0, 1]),
        (3, False, True, [1]),
        ((2, 3), True, False, [0, 2, 3]),
        ((2, 3), False, False, [2, 3]),
        ((2, 3), True, True, [0]),
        ((2, 3), False, True, []),
    ],
)
@pytest.mark.parametrize(
    "sparse_X",
    [False, sparse.csr_matrix, sparse.csc_matrix],
)
def test_polynomial_features_one_feature(
    single_feature_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    sparse_X,
):
    """Check PolynomialFeatures on a single feature, up to degree 3.

    `indices` selects the columns of the full expansion `P` that the given
    configuration is expected to output. `sparse_X` is either False (dense
    input) or a sparse matrix constructor applied to the input.
    """
    X, P = single_feature_degree3
    if sparse_X:
        X = sparse_X(X)

    transformer = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    )
    transformer.fit(X)
    transformed = transformer.transform(X)
    if sparse_X:
        transformed = transformed.toarray()

    assert_allclose(transformed, P[:, indices])
    # `powers_` is only checked when there is at least one output feature.
    if transformer.n_output_features_ > 0:
        expected_shape = (transformer.n_output_features_, transformer.n_features_in_)
        assert transformer.powers_.shape == expected_shape
|
||||
|
||||
|
||||
@pytest.fixture()
def two_features_degree3():
    """Return two-feature data and its full polynomial expansion up to degree 3.

    Returns
    -------
    X : ndarray of shape (3, 2)
        The integers 0..5 arranged row-wise.
    P : ndarray of shape (3, 10)
        Column ``i`` is ``x1**a * x2**b`` for the i-th ``(a, b)`` pair of
        ``exponents`` below.
    """
    X = np.arange(6).reshape((3, 2))
    x1 = X[:, :1]
    x2 = X[:, 1:]
    # (power of x1, power of x2); the position in this list is the column index.
    exponents = [
        (0, 0),  # 0
        (1, 0),  # 1
        (0, 1),  # 2
        (2, 0),  # 3
        (1, 1),  # 4
        (0, 2),  # 5
        (3, 0),  # 6
        (2, 1),  # 7
        (1, 2),  # 8
        (0, 3),  # 9
    ]
    P = np.hstack([x1**a * x2**b for a, b in exponents])
    return X, P
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["degree", "include_bias", "interaction_only", "indices"],
    [
        (2, True, False, slice(0, 6)),
        (2, False, False, slice(1, 6)),
        (2, True, True, [0, 1, 2, 4]),
        (2, False, True, [1, 2, 4]),
        ((2, 2), True, False, [0, 3, 4, 5]),
        ((2, 2), False, False, [3, 4, 5]),
        ((2, 2), True, True, [0, 4]),
        ((2, 2), False, True, [4]),
        (3, True, False, slice(None, None)),
        (3, False, False, slice(1, None)),
        (3, True, True, [0, 1, 2, 4]),
        (3, False, True, [1, 2, 4]),
        ((2, 3), True, False, [0, 3, 4, 5, 6, 7, 8, 9]),
        ((2, 3), False, False, slice(3, None)),
        ((2, 3), True, True, [0, 4]),
        ((2, 3), False, True, [4]),
        ((3, 3), True, False, [0, 6, 7, 8, 9]),
        ((3, 3), False, False, [6, 7, 8, 9]),
        ((3, 3), True, True, [0]),
        ((3, 3), False, True, []),  # would need 3 input features
    ],
)
@pytest.mark.parametrize(
    "sparse_X",
    [False, sparse.csr_matrix, sparse.csc_matrix],
)
def test_polynomial_features_two_features(
    two_features_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    sparse_X,
):
    """Check PolynomialFeatures on two features, up to degree 3.

    `indices` selects the columns of the full expansion `P` that the given
    configuration is expected to output. `sparse_X` is either False (dense
    input) or a sparse matrix constructor applied to the input.
    """
    X, P = two_features_degree3
    if sparse_X:
        X = sparse_X(X)

    transformer = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    )
    transformer.fit(X)
    transformed = transformer.transform(X)
    if sparse_X:
        transformed = transformed.toarray()

    assert_allclose(transformed, P[:, indices])
    # `powers_` is only checked when there is at least one output feature.
    if transformer.n_output_features_ > 0:
        expected_shape = (transformer.n_output_features_, transformer.n_features_in_)
        assert transformer.powers_.shape == expected_shape
|
||||
|
||||
|
||||
# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_polynomial_feature_names(get_names):
    """Check the feature names generated by a fitted PolynomialFeatures.

    Every check goes through the accessor selected by `get_names`, so both
    the deprecated and the new method are exercised on all configurations.
    """
    X = np.arange(30).reshape(10, 3)
    poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
    # Default input feature names.
    feature_names = getattr(poly, get_names)()
    assert_array_equal(
        ["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    poly = PolynomialFeatures(degree=3, include_bias=False).fit(X)
    feature_names = getattr(poly, get_names)(["a", "b", "c"])
    assert_array_equal(
        [
            "a",
            "b",
            "c",
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X)
    feature_names = getattr(poly, get_names)(["a", "b", "c"])
    assert_array_equal(
        [
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    poly = PolynomialFeatures(
        degree=(3, 3), include_bias=True, interaction_only=True
    ).fit(X)
    feature_names = getattr(poly, get_names)(["a", "b", "c"])
    assert_array_equal(["1", "a b c"], feature_names)
    assert len(feature_names) == poly.transform(X).shape[1]

    # test some unicode
    # "\U0001F40D" (8 hex digits) is a single code point outside the BMP;
    # "\u0001F40D" would instead be U+0001 followed by the text "F40D".
    unicode_names = ["\U0001F40D", "\u262E", "\u05D0"]
    poly = PolynomialFeatures(degree=1, include_bias=True).fit(X)
    feature_names = getattr(poly, get_names)(unicode_names)
    assert_array_equal(["1"] + unicode_names, feature_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (1, True, False, int),
        (2, True, False, int),
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
        (4, False, False, np.float64),
        (4, False, True, np.float64),
    ],
)
def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype):
    """Check that CSC input gives the same result as the equivalent dense input.

    The output for CSC input must be a CSC matrix with the same dtype and
    values as the dense output.
    """
    rng = np.random.RandomState(0)
    X = rng.randint(0, 2, (100, 2))
    X_csc = sparse.csc_matrix(X)

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csc = est.fit_transform(X_csc.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype))

    assert isinstance(Xt_csc, sparse.csc_matrix)
    assert Xt_csc.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy releases
    # deprecate; this also matches the other sparse tests in this file.
    assert_array_almost_equal(Xt_csc.toarray(), Xt_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (1, True, False, int),
        (2, True, False, int),
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
    ],
)
def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype):
    """Check that CSR input gives the same result as the equivalent dense input.

    The output for CSR input must be a CSR matrix with the same dtype and
    values as the dense output.
    """
    rng = np.random.RandomState(0)
    X = rng.randint(0, 2, (100, 2))
    X_csr = sparse.csr_matrix(X)

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype, copy=False))

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy releases
    # deprecate; this also matches the other sparse tests in this file.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_features", [1, 4, 5])
@pytest.mark.parametrize(
    "min_degree, max_degree", [(0, 1), (0, 2), (1, 3), (0, 4), (3, 4)]
)
@pytest.mark.parametrize("interaction_only", [True, False])
@pytest.mark.parametrize("include_bias", [True, False])
def test_num_combinations(
    n_features,
    min_degree,
    max_degree,
    interaction_only,
    include_bias,
):
    """
    Test that n_output_features_ is calculated correctly.

    The value stored by `fit` is compared with the number of items yielded
    by `PolynomialFeatures._combinations` for the same
    `(min_degree, max_degree)` range, so the parametrized `min_degree` is
    actually exercised on both sides of the comparison.
    """
    # A single row whose only stored entry sits in the last column, which
    # makes the matrix `n_features` columns wide.
    x = sparse.csr_matrix(([1], ([0], [n_features - 1])))
    est = PolynomialFeatures(
        degree=(min_degree, max_degree),
        interaction_only=interaction_only,
        include_bias=include_bias,
    )
    est.fit(x)
    num_combos = est.n_output_features_

    combos = PolynomialFeatures._combinations(
        n_features=n_features,
        min_degree=min_degree,
        max_degree=max_degree,
        interaction_only=interaction_only,
        include_bias=include_bias,
    )
    assert num_combos == sum(1 for _ in combos)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
    ],
)
def test_polynomial_features_csr_X_floats(deg, include_bias, interaction_only, dtype):
    """Check CSR against dense results on random floating-point data."""
    X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
    X = X_csr.toarray()

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype))

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy releases
    # deprecate; this also matches the other sparse tests in this file.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["zero_row_index", "deg", "interaction_only"],
    [
        (0, 2, True),
        (1, 2, True),
        (2, 2, True),
        (0, 3, True),
        (1, 3, True),
        (2, 3, True),
        (0, 2, False),
        (1, 2, False),
        (2, 2, False),
        (0, 3, False),
        (1, 3, False),
        (2, 3, False),
    ],
)
def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, interaction_only):
    """Check CSR against dense results when one row of the input is all zeros."""
    X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr()
    # Zero out every entry of the selected row.
    X_csr[zero_row_index, :] = 0.0
    X = X_csr.toarray()

    est = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only)
    Xt_csr = est.fit_transform(X_csr)
    Xt_dense = est.fit_transform(X)

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy releases
    # deprecate; this also matches the other sparse tests in this file.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
# This degree should always be one more than the highest degree supported by
# _csr_expansion.
@pytest.mark.parametrize(
    ["include_bias", "interaction_only"],
    [(True, True), (True, False), (False, True), (False, False)],
)
def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only):
    """Check CSR against dense results for a degree-4 expansion."""
    X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
    X = X_csr.toarray()

    est = PolynomialFeatures(
        4, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr)
    Xt_dense = est.fit_transform(X)

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy releases
    # deprecate; this also matches the other sparse tests in this file.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["deg", "dim", "interaction_only"],
    [
        (2, 1, True),
        (2, 2, True),
        (3, 1, True),
        (3, 2, True),
        (3, 3, True),
        (2, 1, False),
        (2, 2, False),
        (3, 1, False),
        (3, 2, False),
        (3, 3, False),
    ],
)
def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only):
    """Check CSR against dense results when the input has only `dim` columns."""
    X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr()
    X = X_csr.toarray()

    est = PolynomialFeatures(deg, interaction_only=interaction_only)
    Xt_csr = est.fit_transform(X_csr)
    Xt_dense = est.fit_transform(X)

    assert isinstance(Xt_csr, sparse.csr_matrix)
    assert Xt_csr.dtype == Xt_dense.dtype
    # `.toarray()` instead of the `.A` shorthand, which newer SciPy releases
    # deprecate; this also matches the other sparse tests in this file.
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)
|
||||
|
||||
|
||||
def test_polynomial_features_deprecated_n_input_features():
    """Check that reading `n_input_features_` emits a FutureWarning."""
    # FIXME: remove in 1.2
    X = np.arange(10).reshape(5, 2)
    fitted = PolynomialFeatures().fit(X)
    depr_msg = (
        "The attribute `n_input_features_` was deprecated in version "
        "1.0 and will be removed in 1.2."
    )

    with pytest.warns(FutureWarning, match=depr_msg):
        fitted.n_input_features_
|
||||
|
||||
|
||||
# TODO: Remove in 1.2 when get_feature_names is removed
@pytest.mark.parametrize("Transformer", [SplineTransformer, PolynomialFeatures])
def test_get_feature_names_deprecated(Transformer):
    """Check that calling `get_feature_names` emits a FutureWarning."""
    X = np.arange(30).reshape(10, 3)
    fitted = Transformer().fit(X)
    expected_warning = "get_feature_names is deprecated in 1.0"
    with pytest.warns(FutureWarning, match=expected_warning):
        fitted.get_feature_names()
|
||||
|
||||
|
||||
def test_polynomial_features_behaviour_on_zero_degree():
    """Check PolynomialFeatures when the maximum degree is zero.

    With include_bias=False a ValueError is expected, for both the integer
    and the tuple form of `degree`. With include_bias=True the output must
    be a single column of ones, for dense and sparse input.
    """
    X = np.ones((10, 2))

    failing_cases = [
        (
            0,
            "Setting degree to zero and include_bias to False would result in"
            " an empty output array.",
        ),
        (
            (0, 0),
            "Setting both min_deree and max_degree to zero and include_bias to"
            " False would result in an empty output array.",
        ),
    ]
    for degree, err_msg in failing_cases:
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        with pytest.raises(ValueError, match=err_msg):
            poly.fit_transform(X)

    expected = np.ones((X.shape[0], 1))
    for data in (X, sparse.csr_matrix(X), sparse.csc_matrix(X)):
        poly = PolynomialFeatures(degree=0, include_bias=True)
        output = poly.fit_transform(data)
        # Sparse results are densified before the comparison.
        if sparse.issparse(output):
            output = output.toarray()
        assert_array_equal(output, expected)
|
||||
Reference in New Issue
Block a user