first commit

This commit is contained in:
Carla Floricel
2022-08-02 09:52:52 -04:00
parent 417ea8660b
commit 05e52aa52b
10444 changed files with 2300232 additions and 0 deletions

View File

@@ -0,0 +1,183 @@
import warnings
import pytest
import numpy as np
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.base import clone
from sklearn.preprocessing import maxabs_scale
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import scale
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import robust_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
# Iris dataset, loaded once at import time and shared by the tests below;
# test_missing_value_handling works on a copy of ``iris.data``.
iris = load_iris()
def _get_valid_samples_by_column(X, col):
"""Get non NaN samples in column of X"""
return X[:, [col]][~np.isnan(X[:, col])]
@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    """Check that the preprocessing estimators and functions let NaN pass.

    NaN entries are injected into the iris data; the estimator ``est`` and its
    function counterpart ``func`` must keep them in place, agree with each
    other, survive a round trip through ``inverse_transform`` and, when
    ``support_sparse`` is true, give the same result on sparse input.
    ``omit_kwargs`` lists estimator parameters that ``func`` does not accept.
    """
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    # Draw row indices first, then column indices, to keep the RNG sequence.
    nan_rows = rng.randint(X.shape[0], size=n_missing)
    nan_cols = rng.randint(X.shape[1], size=n_missing)
    X[nan_rows, nan_cols] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)

    # sanity check: every column has some NaN, no training column is all NaN
    train_nan = np.isnan(X_train)
    assert not np.all(train_nan, axis=0).any()
    assert np.any(train_nan, axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    # ensure no warnings are raised while fitting and transforming
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt = est.fit(X_train).transform(X_test)
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt_class = est.transform(X_train)
    kwargs = est.get_params()
    # remove the parameters which should be omitted because they
    # are not defined in the counterpart function of the preprocessing class
    for name in omit_kwargs:
        del kwargs[name]
    Xt_func = func(X_train, **kwargs)
    func_nan = np.isnan(Xt_func)
    class_nan = np.isnan(Xt_class)
    assert_array_equal(func_nan, class_nan)
    assert_allclose(Xt_func[~func_nan], Xt_class[~class_nan])

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for col in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, col))
        # check transforming with NaN works even when training without NaN
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            Xt_col = est.transform(X_test[:, [col]])
        assert_allclose(Xt_col, Xt[:, [col]])
        # check non-NaN is handled as before - the 1st column is all nan
        if np.isnan(X_test[:, col]).all():
            continue
        Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, col))
        assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])

    if not support_sparse:
        return

    est_dense = clone(est)
    est_sparse = clone(est)

    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        Xt_dense = est_dense.fit(X_train).transform(X_test)
        Xt_inv_dense = est_dense.inverse_transform(Xt_dense)

    sparse_constructors = (
        sparse.csr_matrix,
        sparse.csc_matrix,
        sparse.bsr_matrix,
        sparse.coo_matrix,
        sparse.dia_matrix,
        sparse.dok_matrix,
        sparse.lil_matrix,
    )
    for sparse_constructor in sparse_constructors:
        # check that the dense and sparse inputs lead to the same results
        # precompute the matrix to avoid catching side warnings
        X_train_sp = sparse_constructor(X_train)
        X_test_sp = sparse_constructor(X_test)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PendingDeprecationWarning)
            warnings.simplefilter("error", RuntimeWarning)
            Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
        assert_allclose(Xt_sp.toarray(), Xt_dense)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PendingDeprecationWarning)
            warnings.simplefilter("error", RuntimeWarning)
            Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
        assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense)
@pytest.mark.parametrize(
    "est, func",
    [
        (MaxAbsScaler(), maxabs_scale),
        (MinMaxScaler(), minmax_scale),
        (StandardScaler(), scale),
        (StandardScaler(with_mean=False), scale),
        (PowerTransformer("yeo-johnson"), power_transform),
        (PowerTransformer("box-cox"), power_transform),
        (QuantileTransformer(n_quantiles=3), quantile_transform),
        (RobustScaler(), robust_scale),
        (RobustScaler(with_centering=False), robust_scale),
    ],
)
def test_missing_value_pandas_na_support(est, func):
    """Check that a DataFrame holding ``pd.NA`` gives the same result as NaN.

    The estimator is fitted once on a float array with NaN and once on the
    equivalent pandas frame with nullable integer columns; both outputs must
    match. ``func`` is part of the parametrization but is not used here.
    """
    pd = pytest.importorskip("pandas")

    col_a = [1, 2, 3, np.nan, np.nan, 4, 5, 1]
    col_b = [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8]
    col_c = [1, 2, 3, 4, 5, 6, 7, 8]
    X = np.array([col_a, col_b, col_c]).T

    # Creates dataframe with IntegerArrays with pd.NA
    X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"])
    # Column "c" has no missing value, so it is stored as a plain int column.
    X_df["c"] = X_df["c"].astype("int")

    X_trans = est.fit_transform(X)
    X_df_trans = est.fit_transform(X_df)

    assert_allclose(X_trans, X_df_trans)

View File

@@ -0,0 +1,472 @@
import pytest
import numpy as np
import scipy.sparse as sp
import warnings
from sklearn import clone
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils._testing import (
assert_array_almost_equal,
assert_array_equal,
assert_allclose_dense_sparse,
)
# Shared toy input (4 samples x 4 features) used by the KBinsDiscretizer tests
# below that do not define their own local ``X``.
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
@pytest.mark.parametrize(
    "strategy, expected",
    [
        ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
        ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
    ],
)
def test_fit_transform(strategy, expected):
    """Check the ordinal bin indices produced by each strategy with 3 bins."""
    discretizer = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
    binned = discretizer.fit(X).transform(X)
    assert_array_equal(expected, binned)
def test_valid_n_bins():
    """Check that Python and NumPy integers are both accepted as ``n_bins``."""
    for n_bins in (2, np.array([2])[0]):
        KBinsDiscretizer(n_bins=n_bins).fit_transform(X)

    fitted = KBinsDiscretizer(n_bins=2).fit(X)
    assert fitted.n_bins_.dtype == np.dtype(int)
def test_invalid_n_bins():
    """Check the errors raised for a scalar ``n_bins`` that is too small or a float."""
    # Fewer than two bins.
    too_few = KBinsDiscretizer(n_bins=1)
    err_msg = (
        "KBinsDiscretizer received an invalid number of bins. Received 1, expected at"
        " least 2."
    )
    with pytest.raises(ValueError, match=err_msg):
        too_few.fit_transform(X)

    # Non-integer number of bins.
    not_int = KBinsDiscretizer(n_bins=1.1)
    err_msg = (
        "KBinsDiscretizer received an invalid n_bins type. Received float, expected"
        " int."
    )
    with pytest.raises(ValueError, match=err_msg):
        not_int.fit_transform(X)
def test_invalid_n_bins_array():
    """Check the errors raised for array-like ``n_bins`` with bad shape or values."""

    def _assert_fit_transform_fails(n_bins, err_msg):
        # The estimator is built outside the ``raises`` block on purpose: the
        # error is expected from ``fit_transform``.
        est = KBinsDiscretizer(n_bins=n_bins)
        with pytest.raises(ValueError, match=err_msg):
            est.fit_transform(X)

    shape_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."

    # Bad shape
    _assert_fit_transform_fails(np.full((2, 4), 2.0), shape_msg)

    # Incorrect number of features
    _assert_fit_transform_fails([1, 2, 2], shape_msg)

    # Bad bin values
    _assert_fit_transform_fails(
        [1, 2, 2, 1],
        (
            "KBinsDiscretizer received an invalid number of bins "
            "at indices 0, 3. Number of bins must be at least 2, "
            "and must be an int."
        ),
    )

    # Float bin values
    _assert_fit_transform_fails(
        [2.1, 2, 2.1, 2],
        (
            "KBinsDiscretizer received an invalid number of bins "
            "at indices 0, 2. Number of bins must be at least 2, "
            "and must be an int."
        ),
    )
@pytest.mark.parametrize(
    "strategy, expected",
    [
        ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
        ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
    ],
)
def test_fit_transform_n_bins_array(strategy, expected):
    """Check binning with a per-feature ``n_bins`` and the shape of the edges."""
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy)
    est = est.fit(X)
    assert_array_equal(expected, est.transform(X))

    # test the shape of bin_edges_: one array of n_bins + 1 edges per feature
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features,)
    for feature_edges, feature_n_bins in zip(est.bin_edges_, est.n_bins_):
        assert feature_edges.shape == (feature_n_bins + 1,)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_same_min_max(strategy):
    """Check that a constant feature is reduced to a single bin filled with 0.

    Fitting must emit a ``UserWarning`` naming the constant feature, set
    ``n_bins_`` to 1 for it, and ``transform`` must map that column to zeros.
    """
    # No ``warnings.simplefilter("always")`` here: ``pytest.warns`` already
    # records every warning raised inside its block, and changing the
    # process-wide filter outside a ``catch_warnings`` context would affect
    # code that runs after this statement.
    X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]])
    est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal")
    warning_message = "Feature 0 is constant and will be replaced with 0."
    with pytest.warns(UserWarning, match=warning_message):
        est.fit(X)
    assert est.n_bins_[0] == 1
    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
def test_transform_1d_behavior():
    """Check that 1-D input is rejected by both ``fit`` and ``transform``."""
    X_1d = np.arange(4)

    unfitted = KBinsDiscretizer(n_bins=2)
    with pytest.raises(ValueError):
        unfitted.fit(X_1d)

    # Fit on the 2-D view, then try to transform the 1-D array.
    fitted = KBinsDiscretizer(n_bins=2)
    fitted.fit(X_1d.reshape(-1, 1))
    with pytest.raises(ValueError):
        fitted.transform(X_1d)
@pytest.mark.parametrize("i", range(1, 9))
def test_numeric_stability(i):
    """Check that binning is unchanged when the data is scaled down by 10**i."""
    X_init = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1)
    Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Test up to discretizing nano units
    X_scaled = X_init / 10**i
    discretizer = KBinsDiscretizer(n_bins=2, encode="ordinal")
    assert_array_equal(Xt_expected, discretizer.fit_transform(X_scaled))
def test_invalid_encode_option():
    """Check that an unknown ``encode`` value is rejected at fit time."""
    err_msg = (
        r"Valid options for 'encode' are "
        r"\('onehot', 'onehot-dense', 'ordinal'\). "
        r"Got encode='invalid-encode' instead."
    )
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="invalid-encode")
    with pytest.raises(ValueError, match=err_msg):
        discretizer.fit(X)
def test_encode_options():
    """Check that one-hot outputs match a OneHotEncoder applied to ordinal bins."""
    n_bins = [2, 3, 3, 3]

    ordinal = KBinsDiscretizer(n_bins=n_bins, encode="ordinal").fit(X)
    Xt_ordinal = ordinal.transform(X)

    # Dense one-hot output.
    onehot_dense = KBinsDiscretizer(n_bins=n_bins, encode="onehot-dense").fit(X)
    Xt_dense = onehot_dense.transform(X)
    assert not sp.issparse(Xt_dense)
    dense_encoder = OneHotEncoder(
        categories=[np.arange(i) for i in n_bins], sparse=False
    )
    assert_array_equal(dense_encoder.fit_transform(Xt_ordinal), Xt_dense)

    # Sparse one-hot output.
    onehot_sparse = KBinsDiscretizer(n_bins=n_bins, encode="onehot").fit(X)
    Xt_sparse = onehot_sparse.transform(X)
    assert sp.issparse(Xt_sparse)
    sparse_encoder = OneHotEncoder(
        categories=[np.arange(i) for i in n_bins], sparse=True
    )
    assert_array_equal(
        sparse_encoder.fit_transform(Xt_ordinal).toarray(),
        Xt_sparse.toarray(),
    )
def test_invalid_strategy_option():
    """Check that an unknown ``strategy`` value is rejected at fit time."""
    err_msg = (
        r"Valid options for 'strategy' are "
        r"\('uniform', 'quantile', 'kmeans'\). "
        r"Got strategy='invalid-strategy' instead."
    )
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy="invalid-strategy")
    with pytest.raises(ValueError, match=err_msg):
        discretizer.fit(X)
@pytest.mark.parametrize(
    "strategy, expected_2bins, expected_3bins, expected_5bins",
    [
        ("uniform", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
        ("kmeans", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
        ("quantile", [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4]),
    ],
)
def test_nonuniform_strategies(
    strategy, expected_2bins, expected_3bins, expected_5bins
):
    """Check each strategy on unevenly spread data with 2, 3 and 5 bins."""
    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)

    cases = ((2, expected_2bins), (3, expected_3bins), (5, expected_5bins))
    for n_bins, expected in cases:
        est = KBinsDiscretizer(n_bins=n_bins, strategy=strategy, encode="ordinal")
        Xt = est.fit_transform(X)
        assert_array_equal(expected, Xt.ravel())
@pytest.mark.parametrize(
    "strategy, expected_inv",
    [
        (
            "uniform",
            [
                [-1.5, 2.0, -3.5, -0.5],
                [-0.5, 3.0, -2.5, -0.5],
                [0.5, 4.0, -1.5, 0.5],
                [0.5, 4.0, -1.5, 1.5],
            ],
        ),
        (
            "kmeans",
            [
                [-1.375, 2.125, -3.375, -0.5625],
                [-1.375, 2.125, -3.375, -0.5625],
                [-0.125, 3.375, -2.125, 0.5625],
                [0.75, 4.25, -1.25, 1.625],
            ],
        ),
        (
            "quantile",
            [
                [-1.5, 2.0, -3.5, -0.75],
                [-0.5, 3.0, -2.5, 0.0],
                [0.5, 4.0, -1.5, 1.25],
                [0.5, 4.0, -1.5, 1.25],
            ],
        ),
    ],
)
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_inverse_transform(strategy, encode, expected_inv):
    """Check the values recovered by ``inverse_transform`` for every encoding."""
    discretizer = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    binned = discretizer.fit_transform(X)
    recovered = discretizer.inverse_transform(binned)
    assert_array_almost_equal(expected_inv, recovered)
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
def test_transform_outside_fit_range(strategy):
    """Check that values outside the fitted range land in the first/last bin."""
    X_fit = np.array([0, 1, 2, 3])[:, None]
    discretizer = KBinsDiscretizer(n_bins=4, strategy=strategy, encode="ordinal")
    discretizer.fit(X_fit)

    X_outside = np.array([-2, 5])[:, None]
    binned = discretizer.transform(X_outside)
    assert_array_equal(binned.max(axis=0) + 1, discretizer.n_bins_)
    assert_array_equal(binned.min(axis=0), [0])
def test_overwrite():
    """Check that neither transform nor inverse_transform modifies its input."""
    X = np.array([0, 1, 2, 3])[:, None]
    X_snapshot = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_snapshot)

    Xt_snapshot = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_snapshot)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
@pytest.mark.parametrize(
    "strategy, expected_bin_edges", [("quantile", [0, 1, 3]), ("kmeans", [0, 1.5, 3])]
)
def test_redundant_bins(strategy, expected_bin_edges):
    """Check the warning and the resulting edges when 3 bins are too many."""
    X = [[0], [0], [0], [0], [3], [3]]
    discretizer = KBinsDiscretizer(n_bins=3, strategy=strategy)
    warning_message = "Consider decreasing the number of bins."
    with pytest.warns(UserWarning, match=warning_message):
        discretizer.fit(X)
    assert_array_almost_equal(discretizer.bin_edges_[0], expected_bin_edges)
def test_percentile_numeric_stability():
    """Check quantile edges and bin indices on three samples with 10 bins."""
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    expected_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    expected_Xt = np.array([0, 0, 4]).reshape(-1, 1)

    discretizer = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
    warning_message = "Consider decreasing the number of bins."
    with pytest.warns(UserWarning, match=warning_message):
        discretizer.fit(X)

    assert_array_almost_equal(discretizer.bin_edges_[0], expected_edges)
    assert_array_almost_equal(discretizer.transform(X), expected_Xt)
@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("out_dtype", [None, np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_consistent_dtype(in_dtype, out_dtype, encode):
    """Check the output dtype for every combination of input and requested dtype."""
    X_input = np.array(X, dtype=in_dtype)
    kbd = KBinsDiscretizer(n_bins=3, encode=encode, dtype=out_dtype)

    # an error is raised if a wrong dtype is defined for the model
    if out_dtype not in [None, np.float32, np.float64]:
        with pytest.raises(ValueError, match="Valid options for 'dtype' are"):
            kbd.fit(X_input)
        return

    kbd.fit(X_input)

    # test output dtype
    if out_dtype is not None:
        expected_dtype = out_dtype
    elif X_input.dtype == np.float16:
        # wrong numeric input dtype are cast in np.float64
        expected_dtype = np.float64
    else:
        expected_dtype = X_input.dtype

    Xt = kbd.transform(X_input)
    assert Xt.dtype == expected_dtype
@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64])
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
def test_32_equal_64(input_dtype, encode):
    """Check that float32 and float64 outputs hold the same values."""
    # TODO this check is redundant with common checks and can be removed
    # once #16290 is merged
    X_input = np.array(X, dtype=input_dtype)

    def _fit_transform_as(dtype):
        # Fit and transform with a fresh estimator producing ``dtype`` output.
        kbd = KBinsDiscretizer(n_bins=3, encode=encode, dtype=dtype)
        kbd.fit(X_input)
        return kbd.transform(X_input)

    Xt_32 = _fit_transform_as(np.float32)  # 32 bit output
    Xt_64 = _fit_transform_as(np.float64)  # 64 bit output

    assert_allclose_dense_sparse(Xt_32, Xt_64)
# FIXME: remove the `filterwarnings` in 1.3
@pytest.mark.filterwarnings("ignore:In version 1.3 onwards, subsample=2e5")
@pytest.mark.parametrize("subsample", [None, "warn"])
def test_kbinsdiscretizer_subsample_default(subsample):
    """Check that ``subsample`` has no effect on the edges for a tiny input."""
    # Since the size of X is small (< 2e5), subsampling will not take place.
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)

    kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
    kbd_default.fit(X)

    kbd_with_subsampling = clone(kbd_default)
    kbd_with_subsampling.set_params(subsample=subsample)
    kbd_with_subsampling.fit(X)

    default_edges = kbd_default.bin_edges_
    subsampled_edges = kbd_with_subsampling.bin_edges_
    for edge_default, edge_subsampled in zip(default_edges[0], subsampled_edges[0]):
        np.testing.assert_allclose(edge_default, edge_subsampled)
    assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape
def test_kbinsdiscretizer_subsample_invalid_strategy():
    """Check that ``subsample`` is rejected with a non-quantile strategy."""
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    err_msg = '`subsample` must be used with `strategy="quantile"`.'
    discretizer = KBinsDiscretizer(
        n_bins=10, encode="ordinal", strategy="uniform", subsample=3
    )
    with pytest.raises(ValueError, match=err_msg):
        discretizer.fit(X)
def test_kbinsdiscretizer_subsample_invalid_type():
    """Check that a string ``subsample`` raises a ``TypeError`` at fit time."""
    X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1)
    msg = "subsample must be an instance of int, not str."
    discretizer = KBinsDiscretizer(
        n_bins=10, encode="ordinal", strategy="quantile", subsample="full"
    )
    with pytest.raises(TypeError, match=msg):
        discretizer.fit(X)
# TODO: Remove in 1.3
def test_kbinsdiscretizer_subsample_warn():
    """Check the ``FutureWarning`` about the upcoming ``subsample`` default."""
    # One sample more than 2e5, the value quoted in the warning message.
    X = np.random.rand(200001, 1).reshape(-1, 1)
    msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
    discretizer = KBinsDiscretizer(n_bins=100, encode="ordinal", strategy="quantile")
    with pytest.warns(FutureWarning, match=msg):
        discretizer.fit(X)
@pytest.mark.parametrize("subsample", [0, int(2e5)])
def test_kbinsdiscretizer_subsample_values(subsample):
    """Check an invalid and a valid ``subsample`` value on a large input.

    ``subsample=0`` must be rejected; ``subsample=int(2e5)`` must give bin
    edges of the same shape as, but not all equal to, the default ones.
    """
    X = np.random.rand(220000, 1).reshape(-1, 1)
    kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")

    kbd_with_subsampling = clone(kbd_default)
    kbd_with_subsampling.set_params(subsample=subsample)

    if subsample == 0:
        with pytest.raises(ValueError, match="subsample == 0, must be >= 1."):
            kbd_with_subsampling.fit(X)
        return

    # TODO: Remove in 1.3
    msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
    with pytest.warns(FutureWarning, match=msg):
        kbd_default.fit(X)

    kbd_with_subsampling.fit(X)
    edges_equal = kbd_default.bin_edges_[0] == kbd_with_subsampling.bin_edges_[0]
    assert not np.all(edges_equal)
    assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape
@pytest.mark.parametrize(
    "encode, expected_names",
    [
        (
            "onehot",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        (
            "onehot-dense",
            [
                f"feat{col_id}_{float(bin_id)}"
                for col_id in range(3)
                for bin_id in range(4)
            ],
        ),
        ("ordinal", [f"feat{col_id}" for col_id in range(3)]),
    ],
)
def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names):
    """Check get_feature_names_out for different settings.

    Non-regression test for #22731
    """
    X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]]
    discretizer = KBinsDiscretizer(n_bins=4, encode=encode).fit(X)
    Xt = discretizer.transform(X)

    input_features = [f"feat{i}" for i in range(3)]
    output_names = discretizer.get_feature_names_out(input_features)

    # One name per output column, and the names themselves must match.
    assert Xt.shape[1] == output_names.shape[0]
    assert_array_equal(output_names, expected_names)

View File

@@ -0,0 +1,392 @@
import warnings
import pytest
import numpy as np
from scipy import sparse
from sklearn.utils import _safe_indexing
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils._testing import (
assert_array_equal,
assert_allclose_dense_sparse,
_convert_container,
)
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
def _func(X, *args, **kwargs):
args_store.append(X)
args_store.extend(args)
kwargs_store.update(kwargs)
return func(X)
return _func
def test_delegate_to_func():
    """Check that ``FunctionTransformer.transform`` calls ``func`` with only ``X``."""
    # (args|kwargs)_store will hold the positional and keyword arguments
    # passed to the function inside the FunctionTransformer.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))

    recording_transformer = FunctionTransformer(_make_func(args_store, kwargs_store))
    assert_array_equal(
        recording_transformer.transform(X),
        X,
        "transform should have returned X unchanged",
    )

    # The function should only have received X.
    assert args_store == [
        X
    ], "Incorrect positional arguments passed to func: {args}".format(args=args_store)
    assert (
        not kwargs_store
    ), "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)

    # reset the argument stores (in place, so the same objects are reused).
    del args_store[:]
    kwargs_store.clear()

    second_transformer = FunctionTransformer(
        _make_func(args_store, kwargs_store),
    )
    transformed = second_transformer.transform(X)
    assert_array_equal(
        transformed, X, err_msg="transform should have returned X unchanged"
    )

    # The function should have received X
    assert args_store == [
        X
    ], "Incorrect positional arguments passed to func: {args}".format(args=args_store)
    assert (
        not kwargs_store
    ), "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store)
def test_np_log():
    """Check that a NumPy ufunc (``np.log1p``) can be used as ``func``."""
    X = np.arange(10).reshape((5, 2))

    # Test that the numpy.log example still works.
    transformed = FunctionTransformer(np.log1p).transform(X)
    assert_array_equal(transformed, np.log1p(X))
def test_kw_arg():
    """Check that ``kw_args`` are forwarded to ``func``."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Test that rounding is correct
    expected = np.around(X, decimals=3)
    assert_array_equal(rounder.transform(X), expected)
def test_kw_arg_update():
    """Check that mutating ``kw_args`` in place changes the next transform."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    rounder.kw_args["decimals"] = 1

    # Test that rounding is correct
    expected = np.around(X, decimals=1)
    assert_array_equal(rounder.transform(X), expected)
def test_kw_arg_reset():
    """Check that replacing ``kw_args`` with a new dict changes the next transform."""
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    rounder = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    rounder.kw_args = dict(decimals=1)

    # Test that rounding is correct
    expected = np.around(X, decimals=1)
    assert_array_equal(rounder.transform(X), expected)
def test_inverse_transform():
    """Check that ``inverse_func`` receives ``inv_kw_args``."""
    X = np.array([1, 4, 9, 16]).reshape((2, 2))

    # Test that inverse_transform works correctly
    transformer = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        inv_kw_args=dict(decimals=3),
    )
    round_trip = transformer.inverse_transform(transformer.transform(X))
    assert_array_equal(round_trip, np.around(np.sqrt(X), decimals=3))
def test_check_inverse():
    """Check the ``check_inverse`` warning on dense and sparse input.

    A non-inverse pair must warn at fit time, a true inverse pair must not,
    and no check is made when either ``func`` or ``inverse_func`` is missing.
    """
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    X_list = [X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)]

    warning_message = (
        "The provided functions are not strictly"
        " inverse of each other. If you are sure you"
        " want to proceed regardless, set"
        " 'check_inverse=False'."
    )

    for X in X_list:
        accept_sparse = sparse.issparse(X)

        # np.around is not the inverse of np.sqrt: a warning is expected.
        not_inverse = FunctionTransformer(
            func=np.sqrt,
            inverse_func=np.around,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with pytest.warns(UserWarning, match=warning_message):
            not_inverse.fit(X)

        # np.log1p is the inverse of np.expm1: any UserWarning is an error.
        true_inverse = FunctionTransformer(
            func=np.expm1,
            inverse_func=np.log1p,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            Xt = true_inverse.fit_transform(X)

        assert_allclose_dense_sparse(X, true_inverse.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    only_func = FunctionTransformer(
        func=np.expm1, inverse_func=None, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        only_func.fit(X_dense)

    only_inverse = FunctionTransformer(
        func=None, inverse_func=np.expm1, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        only_inverse.fit(X_dense)
def test_function_transformer_frame():
    """Check that a default FunctionTransformer returns a DataFrame-like object."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(np.random.randn(100, 10))

    result = FunctionTransformer().fit_transform(frame)

    # ``loc`` is used as a marker that the result is still a pandas object.
    assert hasattr(result, "loc")
@pytest.mark.parametrize("X_type", ["array", "series"])
def test_function_transformer_raise_error_with_mixed_dtype(X_type):
    """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype."""
    mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"}
    inverse_mapping = {value: key for key, value in mapping.items()}
    dtype = "object"

    # Mixed string/integer values stored with object dtype.
    raw_values = ["one", "two", "three", "one", "one", 5, 6]
    data = _convert_container(raw_values, X_type, columns_name=["value"], dtype=dtype)

    def func(X):
        mapped = [mapping[_safe_indexing(X, i)] for i in range(X.size)]
        return np.array(mapped, dtype=object)

    def inverse_func(X):
        restored = [inverse_mapping[x] for x in X]
        return _convert_container(
            restored,
            X_type,
            columns_name=["value"],
            dtype=dtype,
        )

    transformer = FunctionTransformer(
        func=func, inverse_func=inverse_func, validate=False, check_inverse=True
    )

    msg = "'check_inverse' is only supported when all the elements in `X` is numerical."
    with pytest.raises(ValueError, match=msg):
        transformer.fit(data)
@pytest.mark.parametrize(
    "X, feature_names_out, input_features, expected",
    [
        (
            # NumPy inputs, default behavior: generate names
            np.random.rand(100, 3),
            "one-to-one",
            None,
            ("x0", "x1", "x2"),
        ),
        (
            # Pandas input, default behavior: use input feature names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            None,
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable
            np.random.rand(100, 3),
            lambda transformer, input_features: ("a", "b"),
            None,
            ("a", "b"),
        ),
        (
            # Pandas input, feature_names_out=callable
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: ("c", "d", "e"),
            None,
            ("c", "d", "e"),
        ),
        (
            # NumPy input, feature_names_out=callable default input_features
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("a",),
            None,
            ("x0", "x1", "x2", "a"),
        ),
        (
            # Pandas input, feature_names_out=callable default input_features
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            None,
            ("a", "b", "c"),
        ),
        (
            # NumPy input, input_features=list of names
            np.random.rand(100, 3),
            "one-to-one",
            ("a", "b", "c"),
            ("a", "b", "c"),
        ),
        (
            # Pandas input, input_features=list of names
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            "one-to-one",
            ("a", "b"),  # must match feature_names_in_
            ("a", "b"),
        ),
        (
            # NumPy input, feature_names_out=callable, input_features=list
            np.random.rand(100, 3),
            lambda transformer, input_features: tuple(input_features) + ("d",),
            ("a", "b", "c"),
            ("a", "b", "c", "d"),
        ),
        (
            # Pandas input, feature_names_out=callable, input_features=list
            {"a": np.random.rand(100), "b": np.random.rand(100)},
            lambda transformer, input_features: tuple(input_features) + ("c",),
            ("a", "b"),  # must match feature_names_in_
            ("a", "b", "c"),
        ),
    ],
)
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected
):
    """Check ``get_feature_names_out`` for NumPy and pandas input.

    Dict-valued ``X`` parameters stand for pandas input and are converted to
    a DataFrame here, so that pandas is only required for those cases.
    """
    if isinstance(X, dict):
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)

    transformer = FunctionTransformer(
        feature_names_out=feature_names_out, validate=True
    )
    transformer.fit_transform(X)
    names = transformer.get_feature_names_out(input_features)

    # The names must come back as an object-dtype ndarray with the given values.
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
def test_function_transformer_get_feature_names_out_without_validation():
    """Check "one-to-one" feature names when ``validate=False``.

    Calling without ``input_features`` must raise; passing them explicitly
    must return them as an object-dtype ndarray.
    """
    transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False)
    X = np.random.rand(100, 2)
    transformer.fit_transform(X)

    msg = "When 'feature_names_out' is 'one-to-one', either"
    with pytest.raises(ValueError, match=msg):
        transformer.get_feature_names_out()

    given_names = ("a", "b")
    names = transformer.get_feature_names_out(given_names)
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b"))
@pytest.mark.parametrize("feature_names_out", ["x0", ["x0"], ("x0",)])
def test_function_transformer_feature_names_out_string(feature_names_out):
    """Check that an unsupported ``feature_names_out`` value raises on use."""
    transformer = FunctionTransformer(feature_names_out=feature_names_out)
    transformer.fit_transform(np.random.rand(100, 2))

    msg = """must either be "one-to-one" or a callable"""
    with pytest.raises(ValueError, match=msg):
        transformer.get_feature_names_out()
def test_function_transformer_feature_names_out_is_None():
    """Check that ``get_feature_names_out`` is unavailable with the default setup."""
    transformer = FunctionTransformer()
    transformer.fit_transform(np.random.rand(100, 2))

    msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
    with pytest.raises(AttributeError, match=msg):
        transformer.get_feature_names_out()
def test_function_transformer_feature_names_out_uses_estimator():
    """Check that a ``feature_names_out`` callable can read the transformer."""

    def add_n_random_features(X, n):
        extra = np.random.rand(len(X), n)
        return np.concatenate([X, extra], axis=1)

    def feature_names_out(transformer, input_features):
        # The number of added columns is read back from the transformer itself.
        n = transformer.kw_args["n"]
        return list(input_features) + [f"rnd{i}" for i in range(n)]

    transformer = FunctionTransformer(
        func=add_n_random_features,
        feature_names_out=feature_names_out,
        kw_args=dict(n=3),
        validate=True,
    )
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    transformer.fit_transform(df)
    names = transformer.get_feature_names_out()

    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2"))
def test_function_transformer_validate_inverse():
    """Test that function transformer does not reset estimator in
    `inverse_transform`."""

    def add_constant_feature(X):
        ones_column = np.ones((X.shape[0], 1))
        return np.concatenate((X, ones_column), axis=1)

    def inverse_add_constant(X):
        return X[:, :-1]

    X = np.array([[1, 2], [3, 4], [3, 4]])
    n_input_features = X.shape[1]
    trans = FunctionTransformer(
        func=add_constant_feature,
        inverse_func=inverse_add_constant,
        validate=True,
    )

    X_trans = trans.fit_transform(X)
    assert trans.n_features_in_ == n_input_features

    # The transformed data has one more column; n_features_in_ must not change.
    trans.inverse_transform(X_trans)
    assert trans.n_features_in_ == n_input_features

View File

@@ -0,0 +1,645 @@
import numpy as np
import pytest
from scipy.sparse import issparse
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
from sklearn.utils.multiclass import type_of_target
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.utils import _to_object_array
from sklearn.preprocessing._label import LabelBinarizer
from sklearn.preprocessing._label import MultiLabelBinarizer
from sklearn.preprocessing._label import LabelEncoder
from sklearn.preprocessing._label import label_binarize
from sklearn.preprocessing._label import _inverse_binarize_thresholding
from sklearn.preprocessing._label import _inverse_binarize_multiclass
from sklearn import datasets
# Iris dataset loaded once at import time; presumably shared by tests further
# down in this module (its uses are not visible in this excerpt).
iris = datasets.load_iris()
def toarray(a):
    """Return ``a.toarray()`` when ``a`` has that method, otherwise ``a`` itself."""
    return a.toarray() if hasattr(a, "toarray") else a
def test_label_binarizer():
    """Check LabelBinarizer on one-class, two-class and multi-class input."""
    # one-class case defaults to negative label
    one_class = ["pos", "pos", "pos", "pos"]
    one_class_expected = np.array([[0, 0, 0, 0]]).T

    # For dense case:
    lb = LabelBinarizer(sparse_output=False)
    got = lb.fit_transform(one_class)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(one_class_expected, got)
    assert_array_equal(lb.inverse_transform(got), one_class)

    # For sparse case:
    lb = LabelBinarizer(sparse_output=True)
    got = lb.fit_transform(one_class)
    assert issparse(got)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(one_class_expected, got.toarray())
    assert_array_equal(lb.inverse_transform(got.toarray()), one_class)

    # The remaining cases reuse one dense binarizer, refitted each time.
    lb = LabelBinarizer(sparse_output=False)

    # two-class case
    two_class = ["neg", "pos", "pos", "neg"]
    two_class_expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(two_class)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(two_class_expected, got)

    to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), two_class)

    # multi-class case
    multi_class = ["spam", "ham", "eggs", "ham", "0"]
    multi_class_expected = np.array(
        [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
    )
    got = lb.fit_transform(multi_class)
    assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"])
    assert_array_equal(multi_class_expected, got)
    assert_array_equal(lb.inverse_transform(got), multi_class)
def test_label_binarizer_unseen_labels():
    """Labels absent from the fitted classes are expected to give all-zero rows."""
    binarizer = LabelBinarizer()

    fitted_output = binarizer.fit_transform(["b", "d", "e"])
    assert_array_equal(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), fitted_output)

    # "a", "c" and "f" were not part of the fitted labels.
    want = np.array(
        [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]]
    )
    transformed = binarizer.transform(["a", "b", "c", "d", "e", "f"])
    assert_array_equal(want, transformed)
def test_label_binarizer_set_label_encoding():
    """Check custom ``neg_label`` / ``pos_label`` values, including ``pos_label=0``."""
    # Two classes, with 0 used as the positive label.
    binarizer = LabelBinarizer(neg_label=-2, pos_label=0)
    labels = np.array([0, 1, 1, 0])
    want = np.array([[-2, 0, 0, -2]]).T
    result = binarizer.fit_transform(labels)
    assert_array_equal(want, result)
    assert_array_equal(binarizer.inverse_transform(result), labels)

    # Multi-class, with symmetric -2 / +2 labels.
    binarizer = LabelBinarizer(neg_label=-2, pos_label=2)
    labels = np.array([3, 2, 1, 2, 0])
    want = np.array(
        [
            [-2, -2, -2, +2],
            [-2, -2, +2, -2],
            [-2, +2, -2, -2],
            [-2, -2, +2, -2],
            [+2, -2, -2, -2],
        ]
    )
    result = binarizer.fit_transform(labels)
    assert_array_equal(want, result)
    assert_array_equal(binarizer.inverse_transform(result), labels)
@ignore_warnings
def test_label_binarizer_errors():
    """Check that invalid arguments to the label binarization API raise ValueError."""
    # Legacy sequence-of-sequences input passed to a fitted binarizer.
    fitted = LabelBinarizer().fit(np.array([0, 0, 0, 0]))
    legacy_multilabel = [(2, 3), (0,), (0, 2)]
    expected_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=expected_msg):
        fitted.transform(legacy_multilabel)

    # Using an unfitted binarizer.
    unfitted = LabelBinarizer()
    expected_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=expected_msg):
        unfitted.transform([])
    with pytest.raises(ValueError, match=expected_msg):
        unfitted.inverse_transform([])

    # Invalid neg_label / pos_label combinations are reported by fit.
    binary_labels = [0, 1, 0, 1]
    invalid_configs = [
        (
            {"neg_label": 2, "pos_label": 1},
            "neg_label=2 must be strictly less than pos_label=1.",
        ),
        (
            {"neg_label": 2, "pos_label": 2},
            "neg_label=2 must be strictly less than pos_label=2.",
        ),
        (
            {"neg_label": 1, "pos_label": 2, "sparse_output": True},
            "Sparse binarization is only supported with non zero pos_label and zero "
            "neg_label, got pos_label=2 and neg_label=1",
        ),
    ]
    for init_kwargs, expected_msg in invalid_configs:
        binarizer = LabelBinarizer(**init_kwargs)
        with pytest.raises(ValueError, match=expected_msg):
            binarizer.fit(binary_labels)

    # Fail on y_type
    expected_msg = "foo format is not supported"
    with pytest.raises(ValueError, match=expected_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    expected_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=expected_msg):
        LabelBinarizer().fit_transform(seq_of_seqs)

    # Fail on the number of classes
    expected_msg = "The number of class is not equal to the number of dimension of y."
    with pytest.raises(ValueError, match=expected_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    expected_msg = "output_type='binary', but y.shape"
    with pytest.raises(ValueError, match=expected_msg):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    expected_msg = "Multioutput target data is not supported with label binarization"
    multioutput = np.array([[1, 3], [2, 1]])
    with pytest.raises(ValueError, match=expected_msg):
        LabelBinarizer().fit(multioutput)
    with pytest.raises(ValueError, match=expected_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
@pytest.mark.parametrize(
    "values, classes, unknown",
    [
        (
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            np.array([4], dtype="int64"),
        ),
        (
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
        ),
        (
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
        ),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder(values, classes, unknown):
    """Check LabelEncoder's fit, transform, fit_transform and inverse_transform."""
    expected_codes = [1, 0, 2, 0, 2]

    # fit followed by transform / inverse_transform
    encoder = LabelEncoder()
    encoder.fit(values)
    assert_array_equal(encoder.classes_, classes)
    assert_array_equal(encoder.transform(values), expected_codes)
    assert_array_equal(encoder.inverse_transform([1, 0, 2, 0, 2]), values)

    # fit_transform on a fresh encoder
    encoder = LabelEncoder()
    codes = encoder.fit_transform(values)
    assert_array_equal(codes, expected_codes)

    # labels that were never fitted are rejected
    with pytest.raises(ValueError, match="unseen labels"):
        encoder.transform(unknown)
def test_label_encoder_negative_ints():
    """Check LabelEncoder on integer labels that include a negative value."""
    encoder = LabelEncoder()
    encoder.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])

    raw_labels = [0, 1, 4, 4, 5, -1, -1]
    encoded = [1, 2, 3, 3, 4, 0, 0]
    assert_array_equal(encoder.transform(raw_labels), encoded)
    assert_array_equal(encoder.inverse_transform(encoded), raw_labels)

    # 6 was not among the fitted labels.
    with pytest.raises(ValueError):
        encoder.transform([0, 6])
@pytest.mark.parametrize("dtype", ["str", "object"])
def test_label_encoder_str_bad_shape(dtype):
    """Passing a bare string to transform must raise the 1d-array ValueError."""
    fitted_labels = np.array(["apple", "orange"], dtype=dtype)
    encoder = LabelEncoder()
    encoder.fit(fitted_labels)
    with pytest.raises(ValueError, match="should be a 1d array"):
        encoder.transform("apple")
def test_label_encoder_errors():
    """Check that invalid uses of LabelEncoder raise ValueError."""
    # An encoder that was never fitted.
    unfitted = LabelEncoder()
    with pytest.raises(ValueError):
        unfitted.transform([])
    with pytest.raises(ValueError):
        unfitted.inverse_transform([])

    # Fail on unseen labels
    encoder = LabelEncoder()
    encoder.fit([1, 2, 3, -1, 1])
    unseen_msg = "contains previously unseen labels"
    for bad_codes in ([-2], [-2, -3, -4]):
        with pytest.raises(ValueError, match=unseen_msg):
            encoder.inverse_transform(bad_codes)

    # Fail on inverse_transform("")
    shape_msg = r"should be a 1d array.+shape \(\)"
    with pytest.raises(ValueError, match=shape_msg):
        encoder.inverse_transform("")
@pytest.mark.parametrize(
    "values",
    [
        np.array([2, 1, 3, 1, 3], dtype="int64"),
        np.array(["b", "a", "c", "a", "c"], dtype=object),
        np.array(["b", "a", "c", "a", "c"]),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder_empty_array(values):
    """A fitted LabelEncoder must map an empty input to an empty output both ways."""
    encoder = LabelEncoder()
    encoder.fit(values)

    # empty transform
    encoded = encoder.transform([])
    assert_array_equal(np.array([]), encoded)

    # empty inverse transform
    decoded = encoder.inverse_transform([])
    assert_array_equal(np.array([]), decoded)
def test_sparse_output_multilabel_binarizer():
    """Check MultiLabelBinarizer with both values of ``sparse_output`` on iterable inputs."""
    # Each factory builds a fresh input so one-shot iterators can be reused.
    input_factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    inverse = input_factories[0]()

    for sparse_output in [True, False]:
        for make_input in input_factories:
            # First via fit_transform, then via fit followed by transform.
            strategies = (
                lambda est: est.fit_transform(make_input()),
                lambda est: est.fit(make_input()).transform(make_input()),
            )
            for run in strategies:
                mlb = MultiLabelBinarizer(sparse_output=sparse_output)
                got = run(mlb)
                assert issparse(got) == sparse_output
                if sparse_output:
                    # verify CSR assumption that indices and indptr have same dtype
                    assert got.indices.dtype == got.indptr.dtype
                    got = got.toarray()
                assert_array_equal(indicator_mat, got)
                assert_array_equal([1, 2, 3], mlb.classes_)
                assert mlb.inverse_transform(got) == inverse

    # A sparse matrix containing a value other than 0 or 1 must be rejected.
    non_binary = csr_matrix(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(non_binary)
def test_multilabel_binarizer():
    """Check MultiLabelBinarizer on lists, sets and iterators of label collections."""
    # Each factory builds a fresh input so one-shot iterators can be reused.
    input_factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    inverse = input_factories[0]()

    for make_input in input_factories:
        # First via fit_transform, then via fit followed by transform.
        strategies = (
            lambda est: est.fit_transform(make_input()),
            lambda est: est.fit(make_input()).transform(make_input()),
        )
        for run in strategies:
            mlb = MultiLabelBinarizer()
            got = run(mlb)
            assert_array_equal(indicator_mat, got)
            assert_array_equal([1, 2, 3], mlb.classes_)
            assert mlb.inverse_transform(got) == inverse
def test_multilabel_binarizer_empty_sample():
    """A sample with no labels is expected to produce an all-zero row."""
    samples = [[1, 2], [1], []]
    want = np.array([[1, 1], [1, 0], [0, 0]])
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(samples), want)
def test_multilabel_binarizer_unknown_class():
    """Unknown classes seen at transform time must warn and be ignored.

    Two scenarios are covered: classes inferred from the fitted data, and
    classes given explicitly to the constructor. In both, the transformed
    matrix is compared with the expected indicator matrix; the original test
    computed the first result but never asserted on it.
    """
    y = [[1, 2]]
    warning_message = "unknown class.* will be ignored"

    # Classes inferred from `y` are [1, 2]; 4 and 0 are unknown at transform.
    mlb = MultiLabelBinarizer()
    Y = np.array([[1, 0], [0, 1]])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    assert_array_equal(matrix, Y)

    # Explicit classes [1, 2, 3]: 4 and 0 are still unknown, and the column
    # for class 3 stays empty.
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    Y = np.array([[1, 0, 0], [0, 1, 0]])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
    assert_array_equal(matrix, Y)
def test_multilabel_binarizer_given_classes():
    """Check MultiLabelBinarizer when ``classes`` is passed explicitly."""
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    class_order = [1, 3, 2]

    # fit_transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit_transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, class_order)

    # fit().transform()
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
    assert_array_equal(mlb.classes_, class_order)

    # ensure works with extra class
    mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    padded = np.hstack(([[0], [0], [0]], indicator_mat))
    assert_array_equal(mlb.fit_transform(inp), padded)
    assert_array_equal(mlb.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    inp = iter(inp)
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)

    # ensure a ValueError is thrown if given duplicate classes
    duplicate_msg = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=duplicate_msg):
        mlb.fit(inp)
def test_multilabel_binarizer_multiple_calls():
    """Changing ``classes`` between two fit_transform calls must change the column order."""
    samples = [(2, 3), (1,), (1, 2)]
    expected_first = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    expected_second = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    # first call, with the classes given at construction
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(binarizer.fit_transform(samples), expected_first)

    # second call, after reassigning the classes parameter
    binarizer.classes = [1, 2, 3]
    assert_array_equal(binarizer.fit_transform(samples), expected_second)
def test_multilabel_binarizer_same_length_sequence():
    """Sequences that all have the same length must not be read as a 2-d array."""
    samples = [[1], [0], [2]]
    want = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])

    # fit_transform()
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(samples), want)
    assert_array_equal(binarizer.inverse_transform(want), samples)

    # fit().transform()
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit(samples).transform(samples), want)
    assert_array_equal(binarizer.inverse_transform(want), samples)
def test_multilabel_binarizer_non_integer_labels():
    """Check MultiLabelBinarizer with string and tuple labels, and with unhashable ones."""
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    cases = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    for raw_samples, classes in cases:
        samples = np.array(raw_samples, dtype=object)

        # fit_transform()
        mlb = MultiLabelBinarizer()
        assert_array_equal(mlb.fit_transform(samples), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        round_trip = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(round_trip, samples)

        # fit().transform()
        mlb = MultiLabelBinarizer()
        assert_array_equal(mlb.fit(samples).transform(samples), indicator_mat)
        assert_array_equal(mlb.classes_, classes)
        round_trip = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
        assert_array_equal(round_trip, samples)

    # dict labels are expected to raise TypeError
    mlb = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        mlb.fit_transform([({}), ({}, {"a": "b"})])
def test_multilabel_binarizer_non_unique():
    """Repeated labels inside one sample must be encoded only once."""
    samples = [(1, 1, 1, 0)]
    want = np.array([[1, 1]])
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(samples), want)
def test_multilabel_binarizer_inverse_validation():
    """Check the input validation done by MultiLabelBinarizer.inverse_transform."""
    binarizer = MultiLabelBinarizer()
    binarizer.fit_transform([(1, 1, 1, 0)])

    # Not binary
    with pytest.raises(ValueError):
        binarizer.inverse_transform(np.array([[1, 3]]))

    # The following binary cases are fine, however
    for binary_row in ([0, 0], [1, 1], [1, 0]):
        binarizer.inverse_transform(np.array([binary_row]))

    # Wrong shape
    for wrong_shape_row in ([1], [1, 1, 1]):
        with pytest.raises(ValueError):
            binarizer.inverse_transform(np.array([wrong_shape_row]))
def test_label_binarize_with_class_order():
    """Output columns of label_binarize must follow the order given in ``classes``."""
    binarized = label_binarize([1, 6], classes=[1, 2, 4, 6])
    assert_array_equal(binarized, np.array([[1, 0, 0, 0], [0, 0, 0, 1]]))

    # Modified class order
    binarized = label_binarize([1, 6], classes=[1, 6, 4, 2])
    assert_array_equal(binarized, np.array([[1, 0, 0, 0], [0, 1, 0, 0]]))

    binarized = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
    want = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]])
    assert_array_equal(binarized, want)
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check label_binarize and LabelBinarizer on ``y`` for dense and sparse output.

    For each value of ``sparse_output`` the helper either expects a ValueError
    (sparse output with ``pos_label == 0`` or ``neg_label != 0``) or compares
    the binarized result with ``expected`` and checks that inverting it gives
    back ``y``.
    """
    for sparse_output in [True, False]:
        binarize_kwargs = dict(
            classes=classes,
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )

        sparse_unsupported = pos_label == 0 or neg_label != 0
        if sparse_unsupported and sparse_output:
            with pytest.raises(ValueError):
                label_binarize(y, **binarize_kwargs)
            continue

        # check label_binarize
        binarized = label_binarize(y, **binarize_kwargs)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type != "multiclass":
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )
        else:
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        binarizer = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        binarized = binarizer.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        round_trip = binarizer.inverse_transform(binarized)
        assert_array_equal(toarray(round_trip), toarray(y))
        assert issparse(round_trip) == issparse(y)
def test_label_binarize_binary():
    """Check binary targets with a non-zero and with a zero ``neg_label``."""
    labels = [0, 1, 0]
    class_list = [0, 1]

    # neg_label != 0: check_binarized_results expects a ValueError for sparse output.
    want = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))
    check_binarized_results(labels, class_list, 2, -1, want)

    # Binary case where sparse_output = True will not result in a ValueError
    labels = [0, 1, 0]
    class_list = [0, 1]
    want = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))
    check_binarized_results(labels, class_list, 3, 0, want)
def test_label_binarize_multiclass():
    """Check multiclass targets, and that sparse output rejects a non-zero ``neg_label``."""
    labels = [0, 1, 2]
    class_list = [0, 1, 2]
    positive = 2
    negative = 0
    want = 2 * np.eye(3)
    check_binarized_results(labels, class_list, positive, negative, want)

    with pytest.raises(ValueError):
        label_binarize(
            labels,
            classes=class_list,
            neg_label=-1,
            pos_label=positive,
            sparse_output=True,
        )
def test_label_binarize_multilabel():
    """Check multilabel indicator targets given as dense and as sparse matrices."""
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = pos_label * y_ind

    sparse_formats = [coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix]
    candidates = [y_ind]
    for to_sparse in sparse_formats:
        candidates.append(to_sparse(y_ind))

    for y in candidates:
        check_binarized_results(y, classes, pos_label, neg_label, expected)

    # `y` is the last candidate of the loop above (the lil_matrix version).
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
def test_invalid_input_label_binarize():
    """Check the ValueErrors raised by label_binarize on invalid input."""
    # pos_label smaller than neg_label
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)

    # float-valued targets
    continuous_msg = "continuous target data is not "
    with pytest.raises(ValueError, match=continuous_msg):
        label_binarize([1.2, 2.7], classes=[0, 1])

    # 2-d targets that do not match the given classes
    mismatch_msg = "mismatch with the labels"
    with pytest.raises(ValueError, match=mismatch_msg):
        label_binarize([[1, 3]], classes=[1, 2, 3])
def test_inverse_binarize_multiclass():
    """Check _inverse_binarize_multiclass on a sparse matrix with zero and negative rows."""
    scores = csr_matrix([[0, 1, 0], [-1, 0, -1], [0, 0, 0]])
    decoded = _inverse_binarize_multiclass(scores, np.arange(3))
    assert_array_equal(decoded, np.array([1, 1, 0]))

View File

@@ -0,0 +1,930 @@
import numpy as np
import pytest
from scipy import sparse
from scipy.sparse import random as sparse_random
from sklearn.utils._testing import assert_array_almost_equal
from numpy.testing import assert_allclose, assert_array_equal
from scipy.interpolate import BSpline
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
KBinsDiscretizer,
PolynomialFeatures,
SplineTransformer,
)
@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer))
def test_polynomial_and_spline_array_order(est):
    """Check that the transformed array has the requested memory order."""
    X = np.arange(10).reshape(5, 2)

    def is_c_contiguous(arr):
        # Checked through the transpose: np.isfortran is applied to arr.T.
        return np.isfortran(arr.T)

    default_out = est().fit_transform(X)
    assert is_c_contiguous(default_out)

    c_out = est(order="C").fit_transform(X)
    assert is_c_contiguous(c_out)

    f_out = est(order="F").fit_transform(X)
    assert np.isfortran(f_out)
@pytest.mark.parametrize(
    "params, err_msg",
    [
        ({"degree": -1}, "degree must be a non-negative integer"),
        ({"degree": 2.5}, "degree must be a non-negative integer"),
        ({"degree": "string"}, "degree must be a non-negative integer"),
        # NOTE: this case used to be listed twice verbatim; the duplicate only
        # re-ran the same check and has been removed.
        ({"n_knots": 1}, "n_knots must be a positive integer >= 2."),
        ({"n_knots": 2.5}, "n_knots must be a positive integer >= 2."),
        ({"n_knots": "string"}, "n_knots must be a positive integer >= 2."),
        ({"knots": 1}, "Expected 2D array, got scalar array instead:"),
        ({"knots": [1, 2]}, "Expected 2D array, got 1D array instead:"),
        (
            {"knots": [[1]]},
            r"Number of knots, knots.shape\[0\], must be >= 2.",
        ),
        (
            {"knots": [[1, 5], [2, 6]]},
            r"knots.shape\[1\] == n_features is violated.",
        ),
        (
            {"knots": [[1], [1], [2]]},
            "knots must be sorted without duplicates.",
        ),
        ({"knots": [[2], [1]]}, "knots must be sorted without duplicates."),
        (
            {"extrapolation": None},
            "extrapolation must be one of 'error', 'constant', 'linear', "
            "'continue' or 'periodic'.",
        ),
        (
            {"extrapolation": 1},
            "extrapolation must be one of 'error', 'constant', 'linear', "
            "'continue' or 'periodic'.",
        ),
        (
            {"extrapolation": "string"},
            "extrapolation must be one of 'error', 'constant', 'linear', "
            "'continue' or 'periodic'.",
        ),
        ({"include_bias": None}, "include_bias must be bool."),
        ({"include_bias": 1}, "include_bias must be bool."),
        ({"include_bias": "string"}, "include_bias must be bool."),
        (
            {"extrapolation": "periodic", "n_knots": 3, "degree": 3},
            "Periodic splines require degree < n_knots. Got n_knots=3 and degree=3.",
        ),
        (
            {"extrapolation": "periodic", "knots": [[0], [1]], "degree": 2},
            "Periodic splines require degree < n_knots. Got n_knots=2 and degree=2.",
        ),
    ],
)
def test_spline_transformer_input_validation(params, err_msg):
    """Test that we raise errors for invalid input in SplineTransformer.

    Parameters
    ----------
    params : dict
        Keyword arguments passed to the ``SplineTransformer`` constructor.
    err_msg : str
        Pattern that the ValueError raised by ``fit`` must match.
    """
    X = [[1], [2]]
    with pytest.raises(ValueError, match=err_msg):
        SplineTransformer(**params).fit(X)
def test_spline_transformer_manual_knot_input():
    """Check that knots given as a nested list and as an ndarray give the same splines."""
    X = np.arange(20).reshape(10, 2)
    knot_list = [[0.5, 1], [1.5, 2], [5, 10]]
    knot_array = np.asarray(knot_list)

    from_list = SplineTransformer(degree=3, knots=knot_list, n_knots=None).fit(X)
    from_array = SplineTransformer(degree=3, knots=knot_array, n_knots=None).fit(X)

    # Compare the knot vectors of the fitted splines, feature by feature.
    for feature_idx in range(X.shape[1]):
        assert_allclose(
            from_list.bsplines_[feature_idx].t, from_array.bsplines_[feature_idx].t
        )
@pytest.mark.parametrize("extrapolation", ["continue", "periodic"])
def test_spline_transformer_integer_knots(extrapolation):
    """Check that fit_transform runs when the knot positions are integers."""
    X = np.arange(20).reshape(10, 2)
    integer_knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]]
    transformer = SplineTransformer(
        degree=3, knots=integer_knots, extrapolation=extrapolation
    )
    # Only the absence of an exception is checked; the output is discarded.
    _ = transformer.fit_transform(X)
# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_spline_transformer_feature_names(get_names):
    """Check the feature names produced by SplineTransformer."""
    X = np.arange(20).reshape(10, 2)

    # Default input names, with the bias column kept.
    with_bias = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X)
    default_names = getattr(with_bias, get_names)()
    expected_default = [
        "x0_sp_0",
        "x0_sp_1",
        "x0_sp_2",
        "x0_sp_3",
        "x0_sp_4",
        "x1_sp_0",
        "x1_sp_1",
        "x1_sp_2",
        "x1_sp_3",
        "x1_sp_4",
    ]
    assert_array_equal(default_names, expected_default)

    # Custom input names, without the bias column.
    without_bias = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X)
    custom_names = getattr(without_bias, get_names)(["a", "b"])
    expected_custom = [
        "a_sp_0",
        "a_sp_1",
        "a_sp_2",
        "a_sp_3",
        "b_sp_0",
        "b_sp_1",
        "b_sp_2",
        "b_sp_3",
    ]
    assert_array_equal(custom_names, expected_custom)
@pytest.mark.parametrize("degree", range(1, 5))
@pytest.mark.parametrize("n_knots", range(3, 5))
@pytest.mark.parametrize("knots", ["uniform", "quantile"])
@pytest.mark.parametrize("extrapolation", ["constant", "periodic"])
def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation):
    """Check that the spline basis functions sum to 1 on every row.

    The check is made on the training points and on interleaved test points
    that lie between the boundaries 0 and 1.
    """
    grid = np.linspace(0, 1, 100)[:, None]
    # make the boundaries 0 and 1 part of X_train, for sure.
    X_train = np.r_[[[0]], grid[::2, :], [[1]]]
    X_test = grid[1::2, :]

    if extrapolation == "periodic":
        n_knots = n_knots + degree  # periodic splines require degree < n_knots

    transformer = SplineTransformer(
        n_knots=n_knots,
        degree=degree,
        knots=knots,
        include_bias=True,
        extrapolation=extrapolation,
    )
    transformer.fit(X_train)

    for data in (X_train, X_test):
        row_sums = np.sum(transformer.transform(data), axis=1)
        assert_allclose(row_sums, 1)
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
def test_spline_transformer_linear_regression(bias, intercept):
    """Check that a spline + linear regression pipeline fits a sinusoidal curve."""
    X = np.linspace(0, 10, 100)[:, None]
    y = np.sin(X[:, 0]) + 2  # +2 to avoid the value 0 in assert_allclose

    spline_step = SplineTransformer(
        n_knots=15,
        degree=3,
        include_bias=bias,
        extrapolation="constant",
    )
    regression_step = LinearRegression(fit_intercept=intercept)
    pipe = Pipeline(steps=[("spline", spline_step), ("ols", regression_step)])

    pipe.fit(X, y)
    assert_allclose(pipe.predict(X), y, rtol=1e-3)
@pytest.mark.parametrize(
    ["knots", "n_knots", "sample_weight", "expected_knots"],
    [
        ("uniform", 3, None, np.array([[0, 2], [3, 8], [6, 14]])),
        (
            "uniform",
            3,
            np.array([0, 0, 1, 1, 0, 3, 1]),
            np.array([[2, 2], [4, 8], [6, 14]]),
        ),
        ("uniform", 4, None, np.array([[0, 2], [2, 6], [4, 10], [6, 14]])),
        ("quantile", 3, None, np.array([[0, 2], [3, 3], [6, 14]])),
        (
            "quantile",
            3,
            np.array([0, 0, 1, 1, 0, 3, 1]),
            np.array([[2, 2], [5, 8], [6, 14]]),
        ),
    ],
)
def test_spline_transformer_get_base_knot_positions(
    knots, n_knots, sample_weight, expected_knots
):
    """Check the base knot positions computed with and without ``sample_weight``."""
    X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]])
    computed = SplineTransformer._get_base_knot_positions(
        X=X, knots=knots, n_knots=n_knots, sample_weight=sample_weight
    )
    assert_allclose(computed, expected_knots)
@pytest.mark.parametrize(
    "knots, n_knots, degree",
    [
        ("uniform", 5, 3),
        ("uniform", 12, 8),
        (
            [[-1.0, 0.0], [0, 1.0], [0.1, 2.0], [0.2, 3.0], [0.3, 4.0], [1, 5.0]],
            None,
            3,
        ),
    ],
)
def test_spline_transformer_periodicity_of_extrapolation(knots, n_knots, degree):
    """Check periodic extrapolation on two features at once.

    The second data set is the first one shifted by (2, 5); both must give
    the same transformed values.
    """
    X_fit = np.linspace((-1, 0), (1, 5), 10)
    X_shifted = np.linspace((1, 5), (3, 10), 10)

    transformer = SplineTransformer(
        knots=knots, n_knots=n_knots, degree=degree, extrapolation="periodic"
    )
    transformer.fit(X_fit)

    assert_allclose(transformer.transform(X_fit), transformer.transform(X_shifted))
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
def test_spline_transformer_periodic_linear_regression(bias, intercept):
    """Check that periodic splines + linear regression fit a periodic curve."""

    # "+ 3" to avoid the value 0 in assert_allclose
    def target(x):
        return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3

    X = np.linspace(0, 1, 101)[:, None]
    spline_step = SplineTransformer(
        n_knots=20,
        degree=3,
        include_bias=bias,
        extrapolation="periodic",
    )
    regression_step = LinearRegression(fit_intercept=intercept)
    pipe = Pipeline(steps=[("spline", spline_step), ("ols", regression_step)])
    pipe.fit(X, target(X[:, 0]))

    # Generate larger array to check periodic extrapolation
    X_wide = np.linspace(-1, 2, 301)[:, None]
    predictions = pipe.predict(X_wide)
    assert_allclose(predictions, target(X_wide[:, 0]), atol=0.01, rtol=0.01)
    # Consecutive blocks of 100 predictions must agree with each other.
    assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3)
def test_spline_transformer_periodic_spline_backport():
    """Compare periodic SplineTransformer output with scipy's periodic BSpline."""
    X = np.linspace(-2, 3.5, 10)[:, None]
    degree = 2

    # Periodic extrapolation as implemented in SplineTransformer
    transformer = SplineTransformer(
        degree=degree, extrapolation="periodic", knots=[[-1.0], [0.0], [1.0]]
    )
    from_transformer = transformer.fit_transform(X)

    # Periodic extrapolation as implemented in BSpline
    coefficients = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
    reference_spline = BSpline(np.arange(-3, 4), coefficients, degree, "periodic")
    from_bspline = reference_spline(X[:, 0])

    assert_allclose(from_transformer, from_bspline)
def test_spline_transformer_periodic_splines_periodicity():
    """Check that shifted knots give the same transformation up to a column permutation."""
    X = np.linspace(0, 10, 101)[:, None]

    base_knots = [[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]]
    shifted_knots = [[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]]

    base_transformer = SplineTransformer(
        degree=3,
        extrapolation="periodic",
        knots=base_knots,
    )
    shifted_transformer = SplineTransformer(
        degree=3,
        extrapolation="periodic",
        knots=shifted_knots,
    )

    Xt_base = base_transformer.fit_transform(X)
    Xt_shifted = shifted_transformer.fit_transform(X)

    column_permutation = [4, 0, 1, 2, 3]
    assert_allclose(Xt_base, Xt_shifted[:, column_permutation])
@pytest.mark.parametrize("degree", [3, 5])
def test_spline_transformer_periodic_splines_smoothness(degree):
    """Check that periodic splines are smooth, including at the first / last knot."""
    X = np.linspace(-2, 10, 10_000)[:, None]
    transformer = SplineTransformer(
        degree=degree,
        extrapolation="periodic",
        knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
    )
    Xt = transformer.fit_transform(X)

    step = (X.max() - X.min()) / len(X)
    tol = 10 * step

    # Splines of degree `degree` are expected to be (`degree` - 1) times
    # continuously differentiable, i.e. the d-th derivative is continuous for
    # d = 0, ..., `degree` - 1. A derivative is treated as continuous when its
    # finite differences stay below `tol` in absolute value. The 0-th
    # derivative is the function itself, so its continuity is checked too.
    derivative = Xt
    for _ in range(1, degree + 1):
        # Continuity of the current derivative.
        increments = np.diff(derivative, axis=0)
        assert np.abs(increments).max() < tol
        # Next numeric derivative.
        derivative = increments / step

    # Splines of degree `degree` are not `degree` times continuously
    # differentiable at the knots, so the next finite difference is expected
    # to show spikes there.
    increments = np.diff(derivative, axis=0)
    assert np.abs(increments).max() > 1
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
@pytest.mark.parametrize("degree", [1, 2, 3, 4, 5])
def test_spline_transformer_extrapolation(bias, intercept, degree):
    """Check the 'constant', 'linear' and 'error' extrapolation modes on a straight line."""
    X = np.linspace(-1, 1, 100)[:, None]
    y = X.squeeze()
    outside_points = [[-10], [5]]

    def fitted_pipeline(extrapolation):
        # Spline features followed by ordinary least squares, fitted on (X, y).
        pipe = Pipeline(
            [
                [
                    "spline",
                    SplineTransformer(
                        n_knots=4,
                        degree=degree,
                        include_bias=bias,
                        extrapolation=extrapolation,
                    ),
                ],
                ["ols", LinearRegression(fit_intercept=intercept)],
            ]
        )
        pipe.fit(X, y)
        return pipe

    # 'constant': predictions outside the range equal the boundary values
    assert_allclose(fitted_pipeline("constant").predict(outside_points), [-1, 1])

    # 'linear': the straight line is continued outside the range
    assert_allclose(fitted_pipeline("linear").predict(outside_points), [-10, 5])

    # 'error': transforming points outside the range raises
    transformer = SplineTransformer(
        n_knots=4, degree=degree, include_bias=bias, extrapolation="error"
    )
    transformer.fit(X)
    for outside_point in ([[-10]], [[5]]):
        with pytest.raises(ValueError):
            transformer.transform(outside_point)
def test_spline_transformer_kbindiscretizer():
    """Check that degree-0 splines match the one-hot output of KBinsDiscretizer."""
    rng = np.random.RandomState(97531)
    X = rng.randn(200).reshape(200, 1)
    n_bins = 5

    spline_transformer = SplineTransformer(
        n_knots=n_bins + 1, degree=0, knots="quantile", include_bias=True
    )
    spline_output = spline_transformer.fit_transform(X)

    discretizer = KBinsDiscretizer(
        n_bins=n_bins, encode="onehot-dense", strategy="quantile"
    )
    binned_output = discretizer.fit_transform(X)

    # Though they should be exactly equal, we test approximately with high
    # accuracy.
    assert_allclose(spline_output, binned_output, rtol=1e-13)
@pytest.mark.parametrize("n_knots", [5, 10])
@pytest.mark.parametrize("include_bias", [True, False])
@pytest.mark.parametrize("degree", [3, 5])
def test_spline_transformer_n_features_out(n_knots, include_bias, degree):
    """Check that the number of output columns equals ``n_features_out_``."""
    X = np.linspace(0, 1, 10)[:, None]
    transformer = SplineTransformer(
        n_knots=n_knots, degree=degree, include_bias=include_bias
    )
    transformer.fit(X)
    n_columns = transformer.transform(X).shape[1]
    assert n_columns == transformer.n_features_out_
@pytest.mark.parametrize(
    "params, err_msg",
    [
        ({"degree": -1}, "degree must be a non-negative integer"),
        ({"degree": 2.5}, "degree must be a non-negative int or tuple"),
        ({"degree": "12"}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": "string"}, "degree must be a non-negative int or tuple"),
        ({"degree": (-1, 2)}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": (0, 1.5)}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": (3, 2)}, r"degree=\(min_degree, max_degree\) must"),
    ],
)
def test_polynomial_features_input_validation(params, err_msg):
    """Check that PolynomialFeatures.fit raises ValueError on invalid parameters."""
    X = [[1], [2]]
    transformer = PolynomialFeatures(**params)
    with pytest.raises(ValueError, match=err_msg):
        transformer.fit(X)
@pytest.fixture()
def single_feature_degree3():
    """Return a one-column ``X`` (0..5) and its powers 0 to 3 stacked column-wise."""
    X = np.arange(6)[:, np.newaxis]
    columns = [np.ones_like(X), X, X**2, X**3]
    P = np.hstack(columns)
    return X, P
@pytest.mark.parametrize(
    "degree, include_bias, interaction_only, indices",
    [
        (3, True, False, slice(None, None)),
        (3, False, False, slice(1, None)),
        (3, True, True, [0, 1]),
        (3, False, True, [1]),
        ((2, 3), True, False, [0, 2, 3]),
        ((2, 3), False, False, [2, 3]),
        ((2, 3), True, True, [0]),
        ((2, 3), False, True, []),
    ],
)
@pytest.mark.parametrize(
    "sparse_X",
    [False, sparse.csr_matrix, sparse.csc_matrix],
)
def test_polynomial_features_one_feature(
    single_feature_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    sparse_X,
):
    """Check PolynomialFeatures on a single feature, up to degree 3.

    ``indices`` selects the columns of the reference matrix that are expected
    in the output for the given parameters.
    """
    X, reference = single_feature_degree3
    if sparse_X:
        X = sparse_X(X)

    transformer = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    ).fit(X)
    transformed = transformer.transform(X)
    if sparse_X:
        transformed = transformed.toarray()

    assert_allclose(transformed, reference[:, indices])

    if transformer.n_output_features_ > 0:
        expected_shape = (transformer.n_output_features_, transformer.n_features_in_)
        assert transformer.powers_.shape == expected_shape
@pytest.fixture()
def two_features_degree3():
    """Return a two-column ``X`` and all its monomials of total degree 0 to 3."""
    X = np.arange(6).reshape((3, 2))
    x1 = X[:, :1]
    x2 = X[:, 1:]
    # (power of x1, power of x2) for each output column, in column order 0..9.
    exponents = [
        (0, 0),
        (1, 0),
        (0, 1),
        (2, 0),
        (1, 1),
        (0, 2),
        (3, 0),
        (2, 1),
        (1, 2),
        (0, 3),
    ]
    P = np.hstack([x1**p1 * x2**p2 for p1, p2 in exponents])
    return X, P
@pytest.mark.parametrize(
    "degree, include_bias, interaction_only, indices",
    [
        (2, True, False, slice(0, 6)),
        (2, False, False, slice(1, 6)),
        (2, True, True, [0, 1, 2, 4]),
        (2, False, True, [1, 2, 4]),
        ((2, 2), True, False, [0, 3, 4, 5]),
        ((2, 2), False, False, [3, 4, 5]),
        ((2, 2), True, True, [0, 4]),
        ((2, 2), False, True, [4]),
        (3, True, False, slice(None, None)),
        (3, False, False, slice(1, None)),
        (3, True, True, [0, 1, 2, 4]),
        (3, False, True, [1, 2, 4]),
        ((2, 3), True, False, [0, 3, 4, 5, 6, 7, 8, 9]),
        ((2, 3), False, False, slice(3, None)),
        ((2, 3), True, True, [0, 4]),
        ((2, 3), False, True, [4]),
        ((3, 3), True, False, [0, 6, 7, 8, 9]),
        ((3, 3), False, False, [6, 7, 8, 9]),
        ((3, 3), True, True, [0]),
        ((3, 3), False, True, []),  # would need 3 input features
    ],
)
@pytest.mark.parametrize(
    "sparse_X",
    [False, sparse.csr_matrix, sparse.csc_matrix],
)
def test_polynomial_features_two_features(
    two_features_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    sparse_X,
):
    """Check PolynomialFeatures output for two input features up to degree 3.

    ``indices`` selects the columns of the full degree-3 expansion provided by
    the fixture that the configured transformer is expected to produce.
    ``sparse_X`` is either ``False`` (dense input) or a sparse matrix
    constructor applied to the input before fitting.
    """
    X, full_expansion = two_features_degree3
    data = sparse_X(X) if sparse_X else X

    transformer = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    )
    transformer.fit(data)
    transformed = transformer.transform(data)
    if sparse_X:
        # Densify so the result can be compared with the dense reference.
        transformed = transformed.toarray()

    assert_allclose(transformed, full_expansion[:, indices])

    # powers_ is only checked when at least one output feature exists.
    if transformer.n_output_features_ > 0:
        expected_shape = (transformer.n_output_features_, transformer.n_features_in_)
        assert transformer.powers_.shape == expected_shape
# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_polynomial_feature_names(get_names):
    """Check the feature names produced by PolynomialFeatures.

    Parameters
    ----------
    get_names : str
        Name of the method used to retrieve the feature names. Every
        assertion goes through ``getattr(poly, get_names)`` so that both the
        deprecated and the new API are exercised by all cases.
    """
    X = np.arange(30).reshape(10, 3)

    # Default input names, degree 2 with bias.
    poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
    feature_names = getattr(poly, get_names)()
    assert_array_equal(
        ["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    # Custom input names, degree 3 without bias.
    poly = PolynomialFeatures(degree=3, include_bias=False).fit(X)
    feature_names = getattr(poly, get_names)(["a", "b", "c"])
    assert_array_equal(
        [
            "a",
            "b",
            "c",
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    # degree given as (min_degree, max_degree): terms below degree 2 are dropped.
    poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X)
    feature_names = getattr(poly, get_names)(["a", "b", "c"])
    assert_array_equal(
        [
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    # Interaction-only terms of exactly degree 3, plus the bias column.
    poly = PolynomialFeatures(
        degree=(3, 3), include_bias=True, interaction_only=True
    ).fit(X)
    feature_names = getattr(poly, get_names)(["a", "b", "c"])
    assert_array_equal(["1", "a b c"], feature_names)
    assert len(feature_names) == poly.transform(X).shape[1]

    # test some unicode
    # NOTE: code points above U+FFFF need the 8-digit "\U" escape; "\u0001F40D"
    # would be U+0001 followed by the literal text "F40D".
    poly = PolynomialFeatures(degree=1, include_bias=True).fit(X)
    feature_names = getattr(poly, get_names)(["\U0001F40D", "\u262E", "\u05D0"])
    assert_array_equal(["1", "\U0001F40D", "\u262E", "\u05D0"], feature_names)
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (1, True, False, int),
        (2, True, False, int),
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
        (4, False, False, np.float64),
        (4, False, True, np.float64),
    ],
)
def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype):
    """Check that CSC input gives a CSC result agreeing with the dense result.

    The same estimator is fit on a random 0/1 matrix in CSC and in dense
    form; the outputs must share dtype and values.
    """
    random_state = np.random.RandomState(0)
    dense_input = random_state.randint(0, 2, (100, 2))
    csc_input = sparse.csc_matrix(dense_input)

    poly = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    out_csc = poly.fit_transform(csc_input.astype(dtype))
    out_dense = poly.fit_transform(dense_input.astype(dtype))

    assert isinstance(out_csc, sparse.csc_matrix)
    assert out_csc.dtype == out_dense.dtype
    assert_array_almost_equal(out_csc.toarray(), out_dense)
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (1, True, False, int),
        (2, True, False, int),
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
    ],
)
def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype):
    """Check that CSR input gives a CSR result agreeing with the dense result.

    The same estimator is fit on a random 0/1 matrix in CSR and in dense
    form; the outputs must share dtype and values.
    """
    random_state = np.random.RandomState(0)
    dense_input = random_state.randint(0, 2, (100, 2))
    csr_input = sparse.csr_matrix(dense_input)

    poly = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    out_csr = poly.fit_transform(csr_input.astype(dtype))
    out_dense = poly.fit_transform(dense_input.astype(dtype, copy=False))

    assert isinstance(out_csr, sparse.csr_matrix)
    assert out_csr.dtype == out_dense.dtype
    assert_array_almost_equal(out_csr.toarray(), out_dense)
@pytest.mark.parametrize("n_features", [1, 4, 5])
@pytest.mark.parametrize(
    "min_degree, max_degree", [(0, 1), (0, 2), (1, 3), (0, 4), (3, 4)]
)
@pytest.mark.parametrize("interaction_only", [True, False])
@pytest.mark.parametrize("include_bias", [True, False])
def test_num_combinations(
    n_features,
    min_degree,
    max_degree,
    interaction_only,
    include_bias,
):
    """Test that n_output_features_ is calculated correctly.

    The value computed by ``fit`` is compared with the number of items
    yielded by ``PolynomialFeatures._combinations`` for the same settings.

    Parameters
    ----------
    n_features : int
        Number of input columns.
    min_degree, max_degree : int
        Degree bounds, passed to the estimator as
        ``degree=(min_degree, max_degree)`` and to ``_combinations`` so that
        a non-zero lower bound is actually exercised.
    interaction_only, include_bias : bool
        Forwarded unchanged to both the estimator and ``_combinations``.
    """
    # 1 x n_features CSR matrix with a single stored entry in the last column.
    x = sparse.csr_matrix(([1], ([0], [n_features - 1])))
    est = PolynomialFeatures(
        degree=(min_degree, max_degree),
        interaction_only=interaction_only,
        include_bias=include_bias,
    )
    est.fit(x)
    num_combos = est.n_output_features_

    combos = PolynomialFeatures._combinations(
        n_features=n_features,
        min_degree=min_degree,
        max_degree=max_degree,
        interaction_only=interaction_only,
        include_bias=include_bias,
    )
    # _combinations returns an iterator, so count its items lazily.
    assert num_combos == sum(1 for _ in combos)
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
    ],
)
def test_polynomial_features_csr_X_floats(deg, include_bias, interaction_only, dtype):
    """Check CSR versus dense agreement on a random sparse float matrix.

    The input is a 1000 x 10 random sparse matrix of density 0.5, cast to
    ``dtype`` before fitting.
    """
    csr_input = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
    dense_input = csr_input.toarray()

    poly = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    out_csr = poly.fit_transform(csr_input.astype(dtype))
    out_dense = poly.fit_transform(dense_input.astype(dtype))

    assert isinstance(out_csr, sparse.csr_matrix)
    assert out_csr.dtype == out_dense.dtype
    assert_array_almost_equal(out_csr.toarray(), out_dense)
@pytest.mark.parametrize(
    ["zero_row_index", "deg", "interaction_only"],
    # Every row index 0..2 for degrees 2 and 3, first with interaction_only
    # enabled, then disabled.
    [
        (row, degree, only_interactions)
        for only_interactions in (True, False)
        for degree in (2, 3)
        for row in (0, 1, 2)
    ],
)
def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, interaction_only):
    """Check CSR versus dense agreement when one input row is all zeros.

    A fully dense 3 x 10 random sparse matrix is built and the row at
    ``zero_row_index`` is overwritten with zeros before fitting.
    """
    csr_input = sparse_random(3, 10, 1.0, random_state=0).tocsr()
    csr_input[zero_row_index, :] = 0.0
    dense_input = csr_input.toarray()

    poly = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only)
    out_csr = poly.fit_transform(csr_input)
    out_dense = poly.fit_transform(dense_input)

    assert isinstance(out_csr, sparse.csr_matrix)
    assert out_csr.dtype == out_dense.dtype
    assert_array_almost_equal(out_csr.toarray(), out_dense)
# This degree should always be one more than the highest degree supported by
# _csr_expansion.
@pytest.mark.parametrize(
    ["include_bias", "interaction_only"],
    [(bias, only_interactions) for bias in (True, False) for only_interactions in (True, False)],
)
def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only):
    """Check CSR versus dense agreement for a degree-4 expansion.

    The input is a 1000 x 10 random sparse matrix of density 0.5.
    """
    csr_input = sparse_random(1000, 10, 0.5, random_state=0).tocsr()
    dense_input = csr_input.toarray()

    poly = PolynomialFeatures(
        4, include_bias=include_bias, interaction_only=interaction_only
    )
    out_csr = poly.fit_transform(csr_input)
    out_dense = poly.fit_transform(dense_input)

    assert isinstance(out_csr, sparse.csr_matrix)
    assert out_csr.dtype == out_dense.dtype
    assert_array_almost_equal(out_csr.toarray(), out_dense)
@pytest.mark.parametrize(
    ["deg", "dim", "interaction_only"],
    # For each degree in (2, 3), every input width from 1 up to that degree,
    # first with interaction_only enabled, then disabled.
    [
        (degree, width, only_interactions)
        for only_interactions in (True, False)
        for degree in (2, 3)
        for width in range(1, degree + 1)
    ],
)
def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only):
    """Check CSR versus dense agreement when the input has few columns.

    The input is a 1000 x ``dim`` random sparse matrix of density 0.5, with
    ``dim`` never larger than ``deg``.
    """
    csr_input = sparse_random(1000, dim, 0.5, random_state=0).tocsr()
    dense_input = csr_input.toarray()

    poly = PolynomialFeatures(deg, interaction_only=interaction_only)
    out_csr = poly.fit_transform(csr_input)
    out_dense = poly.fit_transform(dense_input)

    assert isinstance(out_csr, sparse.csr_matrix)
    assert out_csr.dtype == out_dense.dtype
    assert_array_almost_equal(out_csr.toarray(), out_dense)
def test_polynomial_features_deprecated_n_input_features():
    """Check that reading `n_input_features_` emits a FutureWarning."""
    # FIXME: remove in 1.2
    X = np.arange(10).reshape(5, 2)
    expected_warning = (
        "The attribute `n_input_features_` was deprecated in version "
        "1.0 and will be removed in 1.2."
    )

    with pytest.warns(FutureWarning, match=expected_warning):
        # The attribute access alone is what should trigger the warning.
        PolynomialFeatures().fit(X).n_input_features_
# TODO: Remove in 1.2 when get_feature_names is removed
@pytest.mark.parametrize("Transformer", [SplineTransformer, PolynomialFeatures])
def test_get_feature_names_deprecated(Transformer):
    """Check that calling `get_feature_names` emits a FutureWarning."""
    data = np.arange(30).reshape(10, 3)
    fitted = Transformer().fit(data)

    with pytest.warns(FutureWarning, match="get_feature_names is deprecated in 1.0"):
        fitted.get_feature_names()
def test_polynomial_features_behaviour_on_zero_degree():
    """Check PolynomialFeatures behaviour when the degree is zero.

    Without a bias column a zero degree must raise a ValueError, both for
    ``degree=0`` and ``degree=(0, 0)``. With a bias column, dense, CSR and
    CSC inputs must all produce a single column of ones.
    """
    X = np.ones((10, 2))

    # The expected messages are matched against what the estimator raises, so
    # their spelling (including "min_deree") is deliberately left untouched.
    failing_cases = [
        (
            0,
            "Setting degree to zero and include_bias to False would result in"
            " an empty output array.",
        ),
        (
            (0, 0),
            "Setting both min_deree and max_degree to zero and include_bias to"
            " False would result in an empty output array.",
        ),
    ]
    for degree, err_msg in failing_cases:
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        with pytest.raises(ValueError, match=err_msg):
            poly.fit_transform(X)

    expected = np.ones((X.shape[0], 1))
    for data in (X, sparse.csr_matrix(X), sparse.csc_matrix(X)):
        poly = PolynomialFeatures(degree=0, include_bias=True)
        result = poly.fit_transform(data)
        # Sparse results are densified before the comparison.
        if sparse.issparse(result):
            result = result.toarray()
        assert_array_equal(result, expected)