first commit

This commit is contained in:
Carla Floricel
2022-08-02 09:52:52 -04:00
parent 417ea8660b
commit 05e52aa52b
10444 changed files with 2300232 additions and 0 deletions

View File

@@ -0,0 +1,81 @@
"""global_random_seed fixture
The goal of this fixture is to prevent tests that use it to be sensitive
to a specific seed value while still being deterministic by default.
See the documentation for the SKLEARN_TESTS_GLOBAL_RANDOM_SEED
variable for insrtuctions on how to use this fixture.
https://scikit-learn.org/dev/computing/parallelism.html#sklearn-tests-global-random-seed
"""
import pytest
from os import environ
from random import Random
# Passes the main worker's random seeds to workers
class XDistHooks:
def pytest_configure_node(self, node) -> None:
random_seeds = node.config.getoption("random_seeds")
node.workerinput["random_seeds"] = random_seeds
def pytest_configure(config):
if config.pluginmanager.hasplugin("xdist"):
config.pluginmanager.register(XDistHooks())
RANDOM_SEED_RANGE = list(range(100)) # All seeds in [0, 99] should be valid.
random_seed_var = environ.get("SKLEARN_TESTS_GLOBAL_RANDOM_SEED")
if hasattr(config, "workerinput") and "random_seeds" in config.workerinput:
# Set worker random seed from seed generated from main process
random_seeds = config.workerinput["random_seeds"]
elif random_seed_var is None:
# This is the way.
random_seeds = [42]
elif random_seed_var == "any":
# Pick-up one seed at random in the range of admissible random seeds.
random_seeds = [Random().choice(RANDOM_SEED_RANGE)]
elif random_seed_var == "all":
random_seeds = RANDOM_SEED_RANGE
else:
if "-" in random_seed_var:
start, stop = random_seed_var.split("-")
random_seeds = list(range(int(start), int(stop) + 1))
else:
random_seeds = [int(random_seed_var)]
if min(random_seeds) < 0 or max(random_seeds) > 99:
raise ValueError(
"The value(s) of the environment variable "
"SKLEARN_TESTS_GLOBAL_RANDOM_SEED must be in the range [0, 99] "
f"(or 'any' or 'all'), got: {random_seed_var}"
)
config.option.random_seeds = random_seeds
class GlobalRandomSeedPlugin:
@pytest.fixture(params=random_seeds)
def global_random_seed(self, request):
"""Fixture to ask for a random yet controllable random seed.
All tests that use this fixture accept the contract that they should
deterministically pass for any seed value from 0 to 99 included.
See the documentation for the SKLEARN_TESTS_GLOBAL_RANDOM_SEED
variable for insrtuctions on how to use this fixture.
https://scikit-learn.org/dev/computing/parallelism.html#sklearn-tests-global-random-seed
"""
yield request.param
config.pluginmanager.register(GlobalRandomSeedPlugin())
def pytest_report_header(config):
random_seed_var = environ.get("SKLEARN_TESTS_GLOBAL_RANDOM_SEED")
if random_seed_var == "any":
return [
"To reproduce this test run, set the following environment variable:",
f' SKLEARN_TESTS_GLOBAL_RANDOM_SEED="{config.option.random_seeds[0]}"',
"See: https://scikit-learn.org/dev/computing/parallelism.html"
"#sklearn-tests-global-random-seed",
]

View File

@@ -0,0 +1,661 @@
# Author: Gael Varoquaux
# License: BSD 3 clause
import re
import numpy as np
import scipy.sparse as sp
import pytest
import warnings
import sklearn
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_no_warnings
from sklearn.utils._testing import ignore_warnings
from sklearn.base import BaseEstimator, clone, is_classifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets
from sklearn.base import TransformerMixin
from sklearn.utils._mocking import MockDataFrame
from sklearn import config_context
import pickle
#############################################################################
# A few test classes
class MyEstimator(BaseEstimator):
def __init__(self, l1=0, empty=None):
self.l1 = l1
self.empty = empty
class K(BaseEstimator):
def __init__(self, c=None, d=None):
self.c = c
self.d = d
class T(BaseEstimator):
def __init__(self, a=None, b=None):
self.a = a
self.b = b
class NaNTag(BaseEstimator):
def _more_tags(self):
return {"allow_nan": True}
class NoNaNTag(BaseEstimator):
def _more_tags(self):
return {"allow_nan": False}
class OverrideTag(NaNTag):
def _more_tags(self):
return {"allow_nan": False}
class DiamondOverwriteTag(NaNTag, NoNaNTag):
def _more_tags(self):
return dict()
class InheritDiamondOverwriteTag(DiamondOverwriteTag):
pass
class ModifyInitParams(BaseEstimator):
"""Deprecated behavior.
Equal parameters but with a type cast.
Doesn't fulfill a is a
"""
def __init__(self, a=np.array([0])):
self.a = a.copy()
class Buggy(BaseEstimator):
"A buggy estimator that does not set its parameters right."
def __init__(self, a=None):
self.a = 1
class NoEstimator:
def __init__(self):
pass
def fit(self, X=None, y=None):
return self
def predict(self, X=None):
return None
class VargEstimator(BaseEstimator):
"""scikit-learn estimators shouldn't have vargs."""
def __init__(self, *vargs):
pass
#############################################################################
# The tests
def test_clone():
# Tests that clone creates a correct deep copy.
# We create an estimator, make a copy of its original state
# (which, in this case, is the current state of the estimator),
# and check that the obtained copy is a correct deep copy.
from sklearn.feature_selection import SelectFpr, f_classif
selector = SelectFpr(f_classif, alpha=0.1)
new_selector = clone(selector)
assert selector is not new_selector
assert selector.get_params() == new_selector.get_params()
selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
new_selector = clone(selector)
assert selector is not new_selector
def test_clone_2():
# Tests that clone doesn't copy everything.
# We first create an estimator, give it an own attribute, and
# make a copy of its original state. Then we check that the copy doesn't
# have the specific attribute we manually added to the initial estimator.
from sklearn.feature_selection import SelectFpr, f_classif
selector = SelectFpr(f_classif, alpha=0.1)
selector.own_attribute = "test"
new_selector = clone(selector)
assert not hasattr(new_selector, "own_attribute")
def test_clone_buggy():
# Check that clone raises an error on buggy estimators.
buggy = Buggy()
buggy.a = 2
with pytest.raises(RuntimeError):
clone(buggy)
no_estimator = NoEstimator()
with pytest.raises(TypeError):
clone(no_estimator)
varg_est = VargEstimator()
with pytest.raises(RuntimeError):
clone(varg_est)
est = ModifyInitParams()
with pytest.raises(RuntimeError):
clone(est)
def test_clone_empty_array():
# Regression test for cloning estimators with empty arrays
clf = MyEstimator(empty=np.array([]))
clf2 = clone(clf)
assert_array_equal(clf.empty, clf2.empty)
clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]])))
clf2 = clone(clf)
assert_array_equal(clf.empty.data, clf2.empty.data)
def test_clone_nan():
# Regression test for cloning estimators with default parameter as np.nan
clf = MyEstimator(empty=np.nan)
clf2 = clone(clf)
assert clf.empty is clf2.empty
def test_clone_sparse_matrices():
sparse_matrix_classes = [
getattr(sp, name) for name in dir(sp) if name.endswith("_matrix")
]
for cls in sparse_matrix_classes:
sparse_matrix = cls(np.eye(5))
clf = MyEstimator(empty=sparse_matrix)
clf_cloned = clone(clf)
assert clf.empty.__class__ is clf_cloned.empty.__class__
assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray())
def test_clone_estimator_types():
# Check that clone works for parameters that are types rather than
# instances
clf = MyEstimator(empty=MyEstimator)
clf2 = clone(clf)
assert clf.empty is clf2.empty
def test_clone_class_rather_than_instance():
# Check that clone raises expected error message when
# cloning class rather than instance
msg = "You should provide an instance of scikit-learn estimator"
with pytest.raises(TypeError, match=msg):
clone(MyEstimator)
def test_repr():
# Smoke test the repr of the base estimator.
my_estimator = MyEstimator()
repr(my_estimator)
test = T(K(), K())
assert repr(test) == "T(a=K(), b=K())"
some_est = T(a=["long_params"] * 1000)
assert len(repr(some_est)) == 485
def test_str():
# Smoke test the str of the base estimator
my_estimator = MyEstimator()
str(my_estimator)
def test_get_params():
test = T(K(), K())
assert "a__d" in test.get_params(deep=True)
assert "a__d" not in test.get_params(deep=False)
test.set_params(a__d=2)
assert test.a.d == 2
with pytest.raises(ValueError):
test.set_params(a__a=2)
def test_is_classifier():
svc = SVC()
assert is_classifier(svc)
assert is_classifier(GridSearchCV(svc, {"C": [0.1, 1]}))
assert is_classifier(Pipeline([("svc", svc)]))
assert is_classifier(Pipeline([("svc_cv", GridSearchCV(svc, {"C": [0.1, 1]}))]))
def test_set_params():
# test nested estimator parameter setting
clf = Pipeline([("svc", SVC())])
# non-existing parameter in svc
with pytest.raises(ValueError):
clf.set_params(svc__stupid_param=True)
# non-existing parameter of pipeline
with pytest.raises(ValueError):
clf.set_params(svm__stupid_param=True)
# we don't currently catch if the things in pipeline are estimators
# bad_pipeline = Pipeline([("bad", NoEstimator())])
# assert_raises(AttributeError, bad_pipeline.set_params,
# bad__stupid_param=True)
def test_set_params_passes_all_parameters():
# Make sure all parameters are passed together to set_params
# of nested estimator. Regression test for #9944
class TestDecisionTree(DecisionTreeClassifier):
def set_params(self, **kwargs):
super().set_params(**kwargs)
# expected_kwargs is in test scope
assert kwargs == expected_kwargs
return self
expected_kwargs = {"max_depth": 5, "min_samples_leaf": 2}
for est in [
Pipeline([("estimator", TestDecisionTree())]),
GridSearchCV(TestDecisionTree(), {}),
]:
est.set_params(estimator__max_depth=5, estimator__min_samples_leaf=2)
def test_set_params_updates_valid_params():
# Check that set_params tries to set SVC().C, not
# DecisionTreeClassifier().C
gscv = GridSearchCV(DecisionTreeClassifier(), {})
gscv.set_params(estimator=SVC(), estimator__C=42.0)
assert gscv.estimator.C == 42.0
@pytest.mark.parametrize(
"tree,dataset",
[
(
DecisionTreeClassifier(max_depth=2, random_state=0),
datasets.make_classification(random_state=0),
),
(
DecisionTreeRegressor(max_depth=2, random_state=0),
datasets.make_regression(random_state=0),
),
],
)
def test_score_sample_weight(tree, dataset):
rng = np.random.RandomState(0)
# check that the score with and without sample weights are different
X, y = dataset
tree.fit(X, y)
# generate random sample weights
sample_weight = rng.randint(1, 10, size=len(y))
score_unweighted = tree.score(X, y)
score_weighted = tree.score(X, y, sample_weight=sample_weight)
msg = "Unweighted and weighted scores are unexpectedly equal"
assert score_unweighted != score_weighted, msg
def test_clone_pandas_dataframe():
class DummyEstimator(TransformerMixin, BaseEstimator):
"""This is a dummy class for generating numerical features
This feature extractor extracts numerical features from pandas data
frame.
Parameters
----------
df: pandas data frame
The pandas data frame parameter.
Notes
-----
"""
def __init__(self, df=None, scalar_param=1):
self.df = df
self.scalar_param = scalar_param
def fit(self, X, y=None):
pass
def transform(self, X):
pass
# build and clone estimator
d = np.arange(10)
df = MockDataFrame(d)
e = DummyEstimator(df, scalar_param=1)
cloned_e = clone(e)
# the test
assert (e.df == cloned_e.df).values.all()
assert e.scalar_param == cloned_e.scalar_param
def test_pickle_version_warning_is_not_raised_with_matching_version():
iris = datasets.load_iris()
tree = DecisionTreeClassifier().fit(iris.data, iris.target)
tree_pickle = pickle.dumps(tree)
assert b"version" in tree_pickle
tree_restored = assert_no_warnings(pickle.loads, tree_pickle)
# test that we can predict with the restored decision tree classifier
score_of_original = tree.score(iris.data, iris.target)
score_of_restored = tree_restored.score(iris.data, iris.target)
assert score_of_original == score_of_restored
class TreeBadVersion(DecisionTreeClassifier):
def __getstate__(self):
return dict(self.__dict__.items(), _sklearn_version="something")
pickle_error_message = (
"Trying to unpickle estimator {estimator} from "
"version {old_version} when using version "
"{current_version}. This might "
"lead to breaking code or invalid results. "
"Use at your own risk."
)
def test_pickle_version_warning_is_issued_upon_different_version():
iris = datasets.load_iris()
tree = TreeBadVersion().fit(iris.data, iris.target)
tree_pickle_other = pickle.dumps(tree)
message = pickle_error_message.format(
estimator="TreeBadVersion",
old_version="something",
current_version=sklearn.__version__,
)
with pytest.warns(UserWarning, match=message):
pickle.loads(tree_pickle_other)
class TreeNoVersion(DecisionTreeClassifier):
def __getstate__(self):
return self.__dict__
def test_pickle_version_warning_is_issued_when_no_version_info_in_pickle():
iris = datasets.load_iris()
# TreeNoVersion has no getstate, like pre-0.18
tree = TreeNoVersion().fit(iris.data, iris.target)
tree_pickle_noversion = pickle.dumps(tree)
assert b"version" not in tree_pickle_noversion
message = pickle_error_message.format(
estimator="TreeNoVersion",
old_version="pre-0.18",
current_version=sklearn.__version__,
)
# check we got the warning about using pre-0.18 pickle
with pytest.warns(UserWarning, match=message):
pickle.loads(tree_pickle_noversion)
def test_pickle_version_no_warning_is_issued_with_non_sklearn_estimator():
iris = datasets.load_iris()
tree = TreeNoVersion().fit(iris.data, iris.target)
tree_pickle_noversion = pickle.dumps(tree)
try:
module_backup = TreeNoVersion.__module__
TreeNoVersion.__module__ = "notsklearn"
assert_no_warnings(pickle.loads, tree_pickle_noversion)
finally:
TreeNoVersion.__module__ = module_backup
class DontPickleAttributeMixin:
def __getstate__(self):
data = self.__dict__.copy()
data["_attribute_not_pickled"] = None
return data
def __setstate__(self, state):
state["_restored"] = True
self.__dict__.update(state)
class MultiInheritanceEstimator(DontPickleAttributeMixin, BaseEstimator):
def __init__(self, attribute_pickled=5):
self.attribute_pickled = attribute_pickled
self._attribute_not_pickled = None
def test_pickling_when_getstate_is_overwritten_by_mixin():
estimator = MultiInheritanceEstimator()
estimator._attribute_not_pickled = "this attribute should not be pickled"
serialized = pickle.dumps(estimator)
estimator_restored = pickle.loads(serialized)
assert estimator_restored.attribute_pickled == 5
assert estimator_restored._attribute_not_pickled is None
assert estimator_restored._restored
def test_pickling_when_getstate_is_overwritten_by_mixin_outside_of_sklearn():
try:
estimator = MultiInheritanceEstimator()
text = "this attribute should not be pickled"
estimator._attribute_not_pickled = text
old_mod = type(estimator).__module__
type(estimator).__module__ = "notsklearn"
serialized = estimator.__getstate__()
assert serialized == {"_attribute_not_pickled": None, "attribute_pickled": 5}
serialized["attribute_pickled"] = 4
estimator.__setstate__(serialized)
assert estimator.attribute_pickled == 4
assert estimator._restored
finally:
type(estimator).__module__ = old_mod
class SingleInheritanceEstimator(BaseEstimator):
def __init__(self, attribute_pickled=5):
self.attribute_pickled = attribute_pickled
self._attribute_not_pickled = None
def __getstate__(self):
data = self.__dict__.copy()
data["_attribute_not_pickled"] = None
return data
@ignore_warnings(category=(UserWarning))
def test_pickling_works_when_getstate_is_overwritten_in_the_child_class():
estimator = SingleInheritanceEstimator()
estimator._attribute_not_pickled = "this attribute should not be pickled"
serialized = pickle.dumps(estimator)
estimator_restored = pickle.loads(serialized)
assert estimator_restored.attribute_pickled == 5
assert estimator_restored._attribute_not_pickled is None
def test_tag_inheritance():
# test that changing tags by inheritance is not allowed
nan_tag_est = NaNTag()
no_nan_tag_est = NoNaNTag()
assert nan_tag_est._get_tags()["allow_nan"]
assert not no_nan_tag_est._get_tags()["allow_nan"]
redefine_tags_est = OverrideTag()
assert not redefine_tags_est._get_tags()["allow_nan"]
diamond_tag_est = DiamondOverwriteTag()
assert diamond_tag_est._get_tags()["allow_nan"]
inherit_diamond_tag_est = InheritDiamondOverwriteTag()
assert inherit_diamond_tag_est._get_tags()["allow_nan"]
def test_raises_on_get_params_non_attribute():
class MyEstimator(BaseEstimator):
def __init__(self, param=5):
pass
def fit(self, X, y=None):
return self
est = MyEstimator()
msg = "'MyEstimator' object has no attribute 'param'"
with pytest.raises(AttributeError, match=msg):
est.get_params()
def test_repr_mimebundle_():
# Checks the display configuration flag controls the json output
tree = DecisionTreeClassifier()
output = tree._repr_mimebundle_()
assert "text/plain" in output
assert "text/html" in output
with config_context(display="text"):
output = tree._repr_mimebundle_()
assert "text/plain" in output
assert "text/html" not in output
def test_repr_html_wraps():
# Checks the display configuration flag controls the html output
tree = DecisionTreeClassifier()
output = tree._repr_html_()
assert "<style>" in output
with config_context(display="text"):
msg = "_repr_html_ is only defined when"
with pytest.raises(AttributeError, match=msg):
output = tree._repr_html_()
def test_n_features_in_validation():
"""Check that `_check_n_features` validates data when reset=False"""
est = MyEstimator()
X_train = [[1, 2, 3], [4, 5, 6]]
est._check_n_features(X_train, reset=True)
assert est.n_features_in_ == 3
msg = "X does not contain any features, but MyEstimator is expecting 3 features"
with pytest.raises(ValueError, match=msg):
est._check_n_features("invalid X", reset=False)
def test_n_features_in_no_validation():
"""Check that `_check_n_features` does not validate data when
n_features_in_ is not defined."""
est = MyEstimator()
est._check_n_features("invalid X", reset=True)
assert not hasattr(est, "n_features_in_")
# does not raise
est._check_n_features("invalid X", reset=False)
def test_feature_names_in():
"""Check that feature_name_in are recorded by `_validate_data`"""
pd = pytest.importorskip("pandas")
iris = datasets.load_iris()
X_np = iris.data
df = pd.DataFrame(X_np, columns=iris.feature_names)
class NoOpTransformer(TransformerMixin, BaseEstimator):
def fit(self, X, y=None):
self._validate_data(X)
return self
def transform(self, X):
self._validate_data(X, reset=False)
return X
# fit on dataframe saves the feature names
trans = NoOpTransformer().fit(df)
assert_array_equal(trans.feature_names_in_, df.columns)
# fit again but on ndarray does not keep the previous feature names (see #21383)
trans.fit(X_np)
assert not hasattr(trans, "feature_names_in_")
trans.fit(df)
msg = "The feature names should match those that were passed"
df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1])
with pytest.warns(FutureWarning, match=msg):
trans.transform(df_bad)
# warns when fitted on dataframe and transforming a ndarray
msg = (
"X does not have valid feature names, but NoOpTransformer was "
"fitted with feature names"
)
with pytest.warns(UserWarning, match=msg):
trans.transform(X_np)
# warns when fitted on a ndarray and transforming dataframe
msg = "X has feature names, but NoOpTransformer was fitted without feature names"
trans = NoOpTransformer().fit(X_np)
with pytest.warns(UserWarning, match=msg):
trans.transform(df)
# fit on dataframe with all integer feature names works without warning
df_int_names = pd.DataFrame(X_np)
trans = NoOpTransformer()
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
trans.fit(df_int_names)
# fit on dataframe with no feature names or all integer feature names
# -> do not warn on transform
Xs = [X_np, df_int_names]
for X in Xs:
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
trans.transform(X)
# TODO: Convert to a error in 1.2
# fit on dataframe with feature names that are mixed warns:
df_mixed = pd.DataFrame(X_np, columns=["a", "b", 1, 2])
trans = NoOpTransformer()
msg = re.escape(
"Feature names only support names that are all strings. "
"Got feature names with dtypes: ['int', 'str']"
)
with pytest.warns(FutureWarning, match=msg) as record:
trans.fit(df_mixed)
# transform on feature names that are mixed also warns:
with pytest.warns(FutureWarning, match=msg) as record:
trans.transform(df_mixed)

View File

@@ -0,0 +1,33 @@
import os
import pytest
import textwrap
from sklearn import __version__
from sklearn.utils._openmp_helpers import _openmp_parallelism_enabled
def test_openmp_parallelism_enabled():
# Check that sklearn is built with OpenMP-based parallelism enabled.
# This test can be skipped by setting the environment variable
# ``SKLEARN_SKIP_OPENMP_TEST``.
if os.getenv("SKLEARN_SKIP_OPENMP_TEST"):
pytest.skip("test explicitly skipped (SKLEARN_SKIP_OPENMP_TEST)")
base_url = "dev" if __version__.endswith(".dev0") else "stable"
err_msg = textwrap.dedent(
"""
This test fails because scikit-learn has been built without OpenMP.
This is not recommended since some estimators will run in sequential
mode instead of leveraging thread-based parallelism.
You can find instructions to build scikit-learn with OpenMP at this
address:
https://scikit-learn.org/{}/developers/advanced_installation.html
You can skip this test by setting the environment variable
SKLEARN_SKIP_OPENMP_TEST to any value.
"""
).format(base_url)
assert _openmp_parallelism_enabled(), err_msg

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
"""
Smoke Test the check_build module
"""
# Author: G Varoquaux
# License: BSD 3 clause
import pytest
from sklearn.__check_build import raise_build_error
def test_raise_build_error():
with pytest.raises(ImportError):
raise_build_error(ImportError())

View File

@@ -0,0 +1,444 @@
"""
General tests for all estimators in sklearn.
"""
# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
# Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD 3 clause
import os
import warnings
import sys
import re
import pkgutil
from inspect import isgenerator, signature, Parameter
from itertools import product, chain
from functools import partial
import pytest
import numpy as np
from sklearn.utils import all_estimators
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import FitFailedWarning
from sklearn.utils.estimator_checks import check_estimator
import sklearn
from sklearn.decomposition import PCA
from sklearn.linear_model._base import LinearClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.utils import IS_PYPY
from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags
from sklearn.utils._testing import (
SkipTest,
set_random_state,
)
from sklearn.utils.estimator_checks import (
_construct_instance,
_set_checking_parameters,
_get_check_estimator_ids,
check_class_weight_balanced_linear_classifier,
parametrize_with_checks,
check_dataframe_column_names_consistency,
check_n_features_in_after_fitting,
check_transformer_get_feature_names_out,
check_transformer_get_feature_names_out_pandas,
)
def test_all_estimator_no_base_class():
# test that all_estimators doesn't find abstract classes.
for name, Estimator in all_estimators():
msg = (
"Base estimators such as {0} should not be included in all_estimators"
).format(name)
assert not name.lower().startswith("base"), msg
def _sample_func(x, y=1):
pass
@pytest.mark.parametrize(
"val, expected",
[
(partial(_sample_func, y=1), "_sample_func(y=1)"),
(_sample_func, "_sample_func"),
(partial(_sample_func, "world"), "_sample_func"),
(LogisticRegression(C=2.0), "LogisticRegression(C=2.0)"),
(
LogisticRegression(
random_state=1,
solver="newton-cg",
class_weight="balanced",
warm_start=True,
),
"LogisticRegression(class_weight='balanced',random_state=1,"
"solver='newton-cg',warm_start=True)",
),
],
)
def test_get_check_estimator_ids(val, expected):
assert _get_check_estimator_ids(val) == expected
def _tested_estimators(type_filter=None):
for name, Estimator in all_estimators(type_filter=type_filter):
try:
estimator = _construct_instance(Estimator)
except SkipTest:
continue
yield estimator
@parametrize_with_checks(list(_tested_estimators()))
def test_estimators(estimator, check, request):
# Common tests for estimator instances
with ignore_warnings(category=(FutureWarning, ConvergenceWarning, UserWarning)):
_set_checking_parameters(estimator)
check(estimator)
def test_check_estimator_generate_only():
all_instance_gen_checks = check_estimator(LogisticRegression(), generate_only=True)
assert isgenerator(all_instance_gen_checks)
def test_configure():
# Smoke test the 'configure' step of setup, this tests all the
# 'configure' functions in the setup.pys in scikit-learn
# This test requires Cython which is not necessarily there when running
# the tests of an installed version of scikit-learn or when scikit-learn
# is installed in editable mode by pip build isolation enabled.
pytest.importorskip("Cython")
cwd = os.getcwd()
setup_path = os.path.abspath(os.path.join(sklearn.__path__[0], ".."))
setup_filename = os.path.join(setup_path, "setup.py")
if not os.path.exists(setup_filename):
pytest.skip("setup.py not available")
# XXX unreached code as of v0.22
try:
os.chdir(setup_path)
old_argv = sys.argv
sys.argv = ["setup.py", "config"]
with warnings.catch_warnings():
# The configuration spits out warnings when not finding
# Blas/Atlas development headers
warnings.simplefilter("ignore", UserWarning)
with open("setup.py") as f:
exec(f.read(), dict(__name__="__main__"))
finally:
sys.argv = old_argv
os.chdir(cwd)
def _tested_linear_classifiers():
classifiers = all_estimators(type_filter="classifier")
with warnings.catch_warnings(record=True):
for name, clazz in classifiers:
required_parameters = getattr(clazz, "_required_parameters", [])
if len(required_parameters):
# FIXME
continue
if "class_weight" in clazz().get_params().keys() and issubclass(
clazz, LinearClassifierMixin
):
yield name, clazz
@pytest.mark.parametrize("name, Classifier", _tested_linear_classifiers())
def test_class_weight_balanced_linear_classifiers(name, Classifier):
check_class_weight_balanced_linear_classifier(name, Classifier)
@ignore_warnings
def test_import_all_consistency():
# Smoke test to check that any name in a __all__ list is actually defined
# in the namespace of the module or package.
pkgs = pkgutil.walk_packages(
path=sklearn.__path__, prefix="sklearn.", onerror=lambda _: None
)
submods = [modname for _, modname, _ in pkgs]
for modname in submods + ["sklearn"]:
if ".tests." in modname:
continue
if IS_PYPY and (
"_svmlight_format_io" in modname
or "feature_extraction._hashing_fast" in modname
):
continue
package = __import__(modname, fromlist="dummy")
for name in getattr(package, "__all__", ()):
assert hasattr(package, name), "Module '{0}' has no attribute '{1}'".format(
modname, name
)
def test_root_import_all_completeness():
EXCEPTIONS = ("utils", "tests", "base", "setup", "conftest")
for _, modname, _ in pkgutil.walk_packages(
path=sklearn.__path__, onerror=lambda _: None
):
if "." in modname or modname.startswith("_") or modname in EXCEPTIONS:
continue
assert modname in sklearn.__all__
def test_all_tests_are_importable():
# Ensure that for each contentful subpackage, there is a test directory
# within it that is also a subpackage (i.e. a directory with __init__.py)
HAS_TESTS_EXCEPTIONS = re.compile(
r"""(?x)
\.externals(\.|$)|
\.tests(\.|$)|
\._
"""
)
resource_modules = {
"sklearn.datasets.data",
"sklearn.datasets.descr",
"sklearn.datasets.images",
}
lookup = {
name: ispkg
for _, name, ispkg in pkgutil.walk_packages(sklearn.__path__, prefix="sklearn.")
}
missing_tests = [
name
for name, ispkg in lookup.items()
if ispkg
and name not in resource_modules
and not HAS_TESTS_EXCEPTIONS.search(name)
and name + ".tests" not in lookup
]
assert missing_tests == [], (
"{0} do not have `tests` subpackages. "
"Perhaps they require "
"__init__.py or an add_subpackage directive "
"in the parent "
"setup.py".format(missing_tests)
)
def test_class_support_removed():
# Make sure passing classes to check_estimator or parametrize_with_checks
# raises an error
msg = "Passing a class was deprecated.* isn't supported anymore"
with pytest.raises(TypeError, match=msg):
check_estimator(LogisticRegression)
with pytest.raises(TypeError, match=msg):
parametrize_with_checks([LogisticRegression])
def _generate_search_cv_instances():
for SearchCV, (Estimator, param_grid) in product(
[
GridSearchCV,
HalvingGridSearchCV,
RandomizedSearchCV,
HalvingGridSearchCV,
],
[
(Ridge, {"alpha": [0.1, 1.0]}),
(LogisticRegression, {"C": [0.1, 1.0]}),
],
):
init_params = signature(SearchCV).parameters
extra_params = (
{"min_resources": "smallest"} if "min_resources" in init_params else {}
)
search_cv = SearchCV(Estimator(), param_grid, cv=2, **extra_params)
set_random_state(search_cv)
yield search_cv
for SearchCV, (Estimator, param_grid) in product(
[
GridSearchCV,
HalvingGridSearchCV,
RandomizedSearchCV,
HalvingRandomSearchCV,
],
[
(Ridge, {"ridge__alpha": [0.1, 1.0]}),
(LogisticRegression, {"logisticregression__C": [0.1, 1.0]}),
],
):
init_params = signature(SearchCV).parameters
extra_params = (
{"min_resources": "smallest"} if "min_resources" in init_params else {}
)
search_cv = SearchCV(
make_pipeline(PCA(), Estimator()), param_grid, cv=2, **extra_params
).set_params(error_score="raise")
set_random_state(search_cv)
yield search_cv
@parametrize_with_checks(list(_generate_search_cv_instances()))
def test_search_cv(estimator, check, request):
# Common tests for SearchCV instances
# We have a separate test because those meta-estimators can accept a
# wide range of base estimators (classifiers, regressors, pipelines)
with ignore_warnings(
category=(
FutureWarning,
ConvergenceWarning,
UserWarning,
FitFailedWarning,
)
):
check(estimator)
@pytest.mark.parametrize(
"estimator", _tested_estimators(), ids=_get_check_estimator_ids
)
def test_valid_tag_types(estimator):
"""Check that estimator tags are valid."""
tags = _safe_tags(estimator)
for name, tag in tags.items():
correct_tags = type(_DEFAULT_TAGS[name])
if name == "_xfail_checks":
# _xfail_checks can be a dictionary
correct_tags = (correct_tags, dict)
assert isinstance(tag, correct_tags)
@pytest.mark.parametrize(
"estimator", _tested_estimators(), ids=_get_check_estimator_ids
)
def test_check_n_features_in_after_fitting(estimator):
_set_checking_parameters(estimator)
check_n_features_in_after_fitting(estimator.__class__.__name__, estimator)
def _estimators_that_predict_in_fit():
for estimator in _tested_estimators():
est_params = set(estimator.get_params())
if "oob_score" in est_params:
yield estimator.set_params(oob_score=True, bootstrap=True)
elif "early_stopping" in est_params:
est = estimator.set_params(early_stopping=True, n_iter_no_change=1)
if est.__class__.__name__ in {"MLPClassifier", "MLPRegressor"}:
# TODO: FIX MLP to not check validation set during MLP
yield pytest.param(
est, marks=pytest.mark.xfail(msg="MLP still validates in fit")
)
else:
yield est
elif "n_iter_no_change" in est_params:
yield estimator.set_params(n_iter_no_change=1)
# NOTE: When running `check_dataframe_column_names_consistency` on a meta-estimator that
# delegates validation to a base estimator, the check is testing that the base estimator
# is checking for column name consistency.
column_name_estimators = list(
chain(
_tested_estimators(),
[make_pipeline(LogisticRegression(C=1))],
list(_generate_search_cv_instances()),
_estimators_that_predict_in_fit(),
)
)
@pytest.mark.parametrize(
"estimator", column_name_estimators, ids=_get_check_estimator_ids
)
def test_pandas_column_name_consistency(estimator):
_set_checking_parameters(estimator)
with ignore_warnings(category=(FutureWarning)):
with warnings.catch_warnings(record=True) as record:
check_dataframe_column_names_consistency(
estimator.__class__.__name__, estimator
)
for warning in record:
assert "was fitted without feature names" not in str(warning.message)
# TODO: As more modules support get_feature_names_out they should be removed
# from this list to be tested
GET_FEATURES_OUT_MODULES_TO_IGNORE = [
"ensemble",
"kernel_approximation",
]
def _include_in_get_feature_names_out_check(transformer):
if hasattr(transformer, "get_feature_names_out"):
return True
module = transformer.__module__.split(".")[1]
return module not in GET_FEATURES_OUT_MODULES_TO_IGNORE
GET_FEATURES_OUT_ESTIMATORS = [
est
for est in _tested_estimators("transformer")
if _include_in_get_feature_names_out_check(est)
]
@pytest.mark.parametrize(
"transformer", GET_FEATURES_OUT_ESTIMATORS, ids=_get_check_estimator_ids
)
def test_transformers_get_feature_names_out(transformer):
_set_checking_parameters(transformer)
with ignore_warnings(category=(FutureWarning)):
check_transformer_get_feature_names_out(
transformer.__class__.__name__, transformer
)
check_transformer_get_feature_names_out_pandas(
transformer.__class__.__name__, transformer
)
VALIDATE_ESTIMATOR_INIT = [
"SGDOneClassSVM",
]
VALIDATE_ESTIMATOR_INIT = set(VALIDATE_ESTIMATOR_INIT)
@pytest.mark.parametrize(
"Estimator",
[est for name, est in all_estimators() if name not in VALIDATE_ESTIMATOR_INIT],
)
def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator):
"""Check that init or set_param does not raise errors."""
# Remove parameters with **kwargs by filtering out Parameter.VAR_KEYWORD
# TODO: Remove in 1.2 when **kwargs is removed in RadiusNeighborsClassifier
params = [
name
for name, param in signature(Estimator).parameters.items()
if param.kind != Parameter.VAR_KEYWORD
]
smoke_test_values = [-1, 3.0, "helloworld", np.array([1.0, 4.0]), [1], {}, []]
for value in smoke_test_values:
new_params = {key: value for key in params}
# Does not raise
est = Estimator(**new_params)
# Also do does not raise
est.set_params(**new_params)

View File

@@ -0,0 +1,142 @@
import time
from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel
import pytest
from sklearn import get_config, set_config, config_context
from sklearn.utils.fixes import delayed
def test_config_context():
assert get_config() == {
"assume_finite": False,
"working_memory": 1024,
"print_changed_only": True,
"display": "diagram",
"pairwise_dist_chunk_size": 256,
"enable_cython_pairwise_dist": True,
}
# Not using as a context manager affects nothing
config_context(assume_finite=True)
assert get_config()["assume_finite"] is False
with config_context(assume_finite=True):
assert get_config() == {
"assume_finite": True,
"working_memory": 1024,
"print_changed_only": True,
"display": "diagram",
"pairwise_dist_chunk_size": 256,
"enable_cython_pairwise_dist": True,
}
assert get_config()["assume_finite"] is False
with config_context(assume_finite=True):
with config_context(assume_finite=None):
assert get_config()["assume_finite"] is True
assert get_config()["assume_finite"] is True
with config_context(assume_finite=False):
assert get_config()["assume_finite"] is False
with config_context(assume_finite=None):
assert get_config()["assume_finite"] is False
# global setting will not be retained outside of context that
# did not modify this setting
set_config(assume_finite=True)
assert get_config()["assume_finite"] is True
assert get_config()["assume_finite"] is False
assert get_config()["assume_finite"] is True
assert get_config() == {
"assume_finite": False,
"working_memory": 1024,
"print_changed_only": True,
"display": "diagram",
"pairwise_dist_chunk_size": 256,
"enable_cython_pairwise_dist": True,
}
# No positional arguments
with pytest.raises(TypeError):
config_context(True)
# No unknown arguments
with pytest.raises(TypeError):
config_context(do_something_else=True).__enter__()
def test_config_context_exception():
assert get_config()["assume_finite"] is False
try:
with config_context(assume_finite=True):
assert get_config()["assume_finite"] is True
raise ValueError()
except ValueError:
pass
assert get_config()["assume_finite"] is False
def test_set_config():
assert get_config()["assume_finite"] is False
set_config(assume_finite=None)
assert get_config()["assume_finite"] is False
set_config(assume_finite=True)
assert get_config()["assume_finite"] is True
set_config(assume_finite=None)
assert get_config()["assume_finite"] is True
set_config(assume_finite=False)
assert get_config()["assume_finite"] is False
# No unknown arguments
with pytest.raises(TypeError):
set_config(do_something_else=True)
def set_assume_finite(assume_finite, sleep_duration):
"""Return the value of assume_finite after waiting `sleep_duration`."""
with config_context(assume_finite=assume_finite):
time.sleep(sleep_duration)
return get_config()["assume_finite"]
@pytest.mark.parametrize("backend", ["loky", "multiprocessing", "threading"])
def test_config_threadsafe_joblib(backend):
"""Test that the global config is threadsafe with all joblib backends.
Two jobs are spawned and sets assume_finite to two different values.
When the job with a duration 0.1s completes, the assume_finite value
should be the same as the value passed to the function. In other words,
it is not influenced by the other job setting assume_finite to True.
"""
assume_finites = [False, True]
sleep_durations = [0.1, 0.2]
items = Parallel(backend=backend, n_jobs=2)(
delayed(set_assume_finite)(assume_finite, sleep_dur)
for assume_finite, sleep_dur in zip(assume_finites, sleep_durations)
)
assert items == [False, True]
def test_config_threadsafe():
"""Uses threads directly to test that the global config does not change
between threads. Same test as `test_config_threadsafe_joblib` but with
`ThreadPoolExecutor`."""
assume_finites = [False, True]
sleep_durations = [0.1, 0.2]
with ThreadPoolExecutor(max_workers=2) as e:
items = [
output
for output in e.map(set_assume_finite, assume_finites, sleep_durations)
]
assert items == [False, True]

View File

@@ -0,0 +1,668 @@
import numpy as np
import pytest
from scipy import linalg
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_almost_equal
from sklearn.datasets import make_blobs
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import _cov
from sklearn.covariance import ledoit_wolf
from sklearn.cluster import KMeans
from sklearn.covariance import ShrunkCovariance
from sklearn.covariance import LedoitWolf
from sklearn.preprocessing import StandardScaler
# Data is just 6 separable points in the plane
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype="f")
y = np.array([1, 1, 1, 2, 2, 2])
y3 = np.array([1, 1, 2, 2, 3, 3])
# Degenerate data with only one feature (still should be separable)
X1 = np.array(
[[-2], [-1], [-1], [1], [1], [2]],
dtype="f",
)
# Data is just 9 separable points in the plane
X6 = np.array(
[[0, 0], [-2, -2], [-2, -1], [-1, -1], [-1, -2], [1, 3], [1, 2], [2, 1], [2, 2]]
)
y6 = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2])
y7 = np.array([1, 2, 3, 2, 3, 1, 2, 3, 1])
# Degenerate data with 1 feature (still should be separable)
X7 = np.array([[-3], [-2], [-1], [-1], [0], [1], [1], [2], [3]])
# Data that has zero variance in one dimension and needs regularization
X2 = np.array(
[[-3, 0], [-2, 0], [-1, 0], [-1, 0], [0, 0], [1, 0], [1, 0], [2, 0], [3, 0]]
)
# One element class
y4 = np.array([1, 1, 1, 1, 1, 1, 1, 1, 2])
# Data with less samples in a class than n_features
X5 = np.c_[np.arange(8), np.zeros((8, 3))]
y5 = np.array([0, 0, 0, 0, 0, 1, 1, 1])
solver_shrinkage = [
("svd", None),
("lsqr", None),
("eigen", None),
("lsqr", "auto"),
("lsqr", 0),
("lsqr", 0.43),
("eigen", "auto"),
("eigen", 0),
("eigen", 0.43),
]
def test_lda_predict():
# Test LDA classification.
# This checks that LDA implements fit and predict and returns correct
# values for simple toy data.
for test_case in solver_shrinkage:
solver, shrinkage = test_case
clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
y_pred = clf.fit(X, y).predict(X)
assert_array_equal(y_pred, y, "solver %s" % solver)
# Assert that it works with 1D data
y_pred1 = clf.fit(X1, y).predict(X1)
assert_array_equal(y_pred1, y, "solver %s" % solver)
# Test probability estimates
y_proba_pred1 = clf.predict_proba(X1)
assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, "solver %s" % solver)
y_log_proba_pred1 = clf.predict_log_proba(X1)
assert_allclose(
np.exp(y_log_proba_pred1),
y_proba_pred1,
rtol=1e-6,
atol=1e-6,
err_msg="solver %s" % solver,
)
# Primarily test for commit 2f34950 -- "reuse" of priors
y_pred3 = clf.fit(X, y3).predict(X)
# LDA shouldn't be able to separate those
assert np.any(y_pred3 != y3), "solver %s" % solver
# Test invalid shrinkages
clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
with pytest.raises(ValueError):
clf.fit(X, y)
clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
with pytest.raises(ValueError):
clf.fit(X, y)
clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
with pytest.raises(NotImplementedError):
clf.fit(X, y)
clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=np.array([1, 2]))
with pytest.raises(TypeError, match="shrinkage must be a float or a string"):
clf.fit(X, y)
clf = LinearDiscriminantAnalysis(
solver="lsqr", shrinkage=0.1, covariance_estimator=ShrunkCovariance()
)
with pytest.raises(
ValueError,
match=(
"covariance_estimator and shrinkage "
"parameters are not None. "
"Only one of the two can be set."
),
):
clf.fit(X, y)
# Test unknown solver
clf = LinearDiscriminantAnalysis(solver="dummy")
with pytest.raises(ValueError):
clf.fit(X, y)
# test bad solver with covariance_estimator
clf = LinearDiscriminantAnalysis(solver="svd", covariance_estimator=LedoitWolf())
with pytest.raises(
ValueError, match="covariance estimator is not supported with svd"
):
clf.fit(X, y)
# test bad covariance estimator
clf = LinearDiscriminantAnalysis(
solver="lsqr", covariance_estimator=KMeans(n_clusters=2)
)
with pytest.raises(
ValueError, match="KMeans does not have a covariance_ attribute"
):
clf.fit(X, y)
@pytest.mark.parametrize("n_classes", [2, 3])
@pytest.mark.parametrize("solver", ["svd", "lsqr", "eigen"])
def test_lda_predict_proba(solver, n_classes):
def generate_dataset(n_samples, centers, covariances, random_state=None):
"""Generate a multivariate normal data given some centers and
covariances"""
rng = check_random_state(random_state)
X = np.vstack(
[
rng.multivariate_normal(mean, cov, size=n_samples // len(centers))
for mean, cov in zip(centers, covariances)
]
)
y = np.hstack(
[[clazz] * (n_samples // len(centers)) for clazz in range(len(centers))]
)
return X, y
blob_centers = np.array([[0, 0], [-10, 40], [-30, 30]])[:n_classes]
blob_stds = np.array([[[10, 10], [10, 100]]] * len(blob_centers))
X, y = generate_dataset(
n_samples=90000, centers=blob_centers, covariances=blob_stds, random_state=42
)
lda = LinearDiscriminantAnalysis(
solver=solver, store_covariance=True, shrinkage=None
).fit(X, y)
# check that the empirical means and covariances are close enough to the
# one used to generate the data
assert_allclose(lda.means_, blob_centers, atol=1e-1)
assert_allclose(lda.covariance_, blob_stds[0], atol=1)
# implement the method to compute the probability given in The Elements
# of Statistical Learning (cf. p.127, Sect. 4.4.5 "Logistic Regression
# or LDA?")
precision = linalg.inv(blob_stds[0])
alpha_k = []
alpha_k_0 = []
for clazz in range(len(blob_centers) - 1):
alpha_k.append(
np.dot(precision, (blob_centers[clazz] - blob_centers[-1])[:, np.newaxis])
)
alpha_k_0.append(
np.dot(
-0.5 * (blob_centers[clazz] + blob_centers[-1])[np.newaxis, :],
alpha_k[-1],
)
)
sample = np.array([[-22, 22]])
def discriminant_func(sample, coef, intercept, clazz):
return np.exp(intercept[clazz] + np.dot(sample, coef[clazz]))
prob = np.array(
[
float(
discriminant_func(sample, alpha_k, alpha_k_0, clazz)
/ (
1
+ sum(
[
discriminant_func(sample, alpha_k, alpha_k_0, clazz)
for clazz in range(n_classes - 1)
]
)
)
)
for clazz in range(n_classes - 1)
]
)
prob_ref = 1 - np.sum(prob)
# check the consistency of the computed probability
# all probabilities should sum to one
prob_ref_2 = float(
1
/ (
1
+ sum(
[
discriminant_func(sample, alpha_k, alpha_k_0, clazz)
for clazz in range(n_classes - 1)
]
)
)
)
assert prob_ref == pytest.approx(prob_ref_2)
# check that the probability of LDA are close to the theoretical
# probabilties
assert_allclose(
lda.predict_proba(sample), np.hstack([prob, prob_ref])[np.newaxis], atol=1e-2
)
def test_lda_priors():
# Test priors (negative priors)
priors = np.array([0.5, -0.5])
clf = LinearDiscriminantAnalysis(priors=priors)
msg = "priors must be non-negative"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)
# Test that priors passed as a list are correctly handled (run to see if
# failure)
clf = LinearDiscriminantAnalysis(priors=[0.5, 0.5])
clf.fit(X, y)
# Test that priors always sum to 1
priors = np.array([0.5, 0.6])
prior_norm = np.array([0.45, 0.55])
clf = LinearDiscriminantAnalysis(priors=priors)
with pytest.warns(UserWarning):
clf.fit(X, y)
assert_array_almost_equal(clf.priors_, prior_norm, 2)
def test_lda_coefs():
# Test if the coefficients of the solvers are approximately the same.
n_features = 2
n_classes = 2
n_samples = 1000
X, y = make_blobs(
n_samples=n_samples, n_features=n_features, centers=n_classes, random_state=11
)
clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
clf_lda_lsqr = LinearDiscriminantAnalysis(solver="lsqr")
clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
clf_lda_svd.fit(X, y)
clf_lda_lsqr.fit(X, y)
clf_lda_eigen.fit(X, y)
assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_lsqr.coef_, 1)
assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_eigen.coef_, 1)
assert_array_almost_equal(clf_lda_eigen.coef_, clf_lda_lsqr.coef_, 1)
def test_lda_transform():
# Test LDA transform.
clf = LinearDiscriminantAnalysis(solver="svd", n_components=1)
X_transformed = clf.fit(X, y).transform(X)
assert X_transformed.shape[1] == 1
clf = LinearDiscriminantAnalysis(solver="eigen", n_components=1)
X_transformed = clf.fit(X, y).transform(X)
assert X_transformed.shape[1] == 1
clf = LinearDiscriminantAnalysis(solver="lsqr", n_components=1)
clf.fit(X, y)
msg = "transform not implemented for 'lsqr'"
with pytest.raises(NotImplementedError, match=msg):
clf.transform(X)
def test_lda_explained_variance_ratio():
# Test if the sum of the normalized eigen vectors values equals 1,
# Also tests whether the explained_variance_ratio_ formed by the
# eigen solver is the same as the explained_variance_ratio_ formed
# by the svd solver
state = np.random.RandomState(0)
X = state.normal(loc=0, scale=100, size=(40, 20))
y = state.randint(0, 3, size=(40,))
clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
clf_lda_eigen.fit(X, y)
assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)
assert clf_lda_eigen.explained_variance_ratio_.shape == (
2,
), "Unexpected length for explained_variance_ratio_"
clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
clf_lda_svd.fit(X, y)
assert_almost_equal(clf_lda_svd.explained_variance_ratio_.sum(), 1.0, 3)
assert clf_lda_svd.explained_variance_ratio_.shape == (
2,
), "Unexpected length for explained_variance_ratio_"
assert_array_almost_equal(
clf_lda_svd.explained_variance_ratio_, clf_lda_eigen.explained_variance_ratio_
)
def test_lda_orthogonality():
# arrange four classes with their means in a kite-shaped pattern
# the longer distance should be transformed to the first component, and
# the shorter distance to the second component.
means = np.array([[0, 0, -1], [0, 2, 0], [0, -2, 0], [0, 0, 5]])
# We construct perfectly symmetric distributions, so the LDA can estimate
# precise means.
scatter = np.array(
[
[0.1, 0, 0],
[-0.1, 0, 0],
[0, 0.1, 0],
[0, -0.1, 0],
[0, 0, 0.1],
[0, 0, -0.1],
]
)
X = (means[:, np.newaxis, :] + scatter[np.newaxis, :, :]).reshape((-1, 3))
y = np.repeat(np.arange(means.shape[0]), scatter.shape[0])
# Fit LDA and transform the means
clf = LinearDiscriminantAnalysis(solver="svd").fit(X, y)
means_transformed = clf.transform(means)
d1 = means_transformed[3] - means_transformed[0]
d2 = means_transformed[2] - means_transformed[1]
d1 /= np.sqrt(np.sum(d1**2))
d2 /= np.sqrt(np.sum(d2**2))
# the transformed within-class covariance should be the identity matrix
assert_almost_equal(np.cov(clf.transform(scatter).T), np.eye(2))
# the means of classes 0 and 3 should lie on the first component
assert_almost_equal(np.abs(np.dot(d1[:2], [1, 0])), 1.0)
# the means of classes 1 and 2 should lie on the second component
assert_almost_equal(np.abs(np.dot(d2[:2], [0, 1])), 1.0)
def test_lda_scaling():
# Test if classification works correctly with differently scaled features.
n = 100
rng = np.random.RandomState(1234)
# use uniform distribution of features to make sure there is absolutely no
# overlap between classes.
x1 = rng.uniform(-1, 1, (n, 3)) + [-10, 0, 0]
x2 = rng.uniform(-1, 1, (n, 3)) + [10, 0, 0]
x = np.vstack((x1, x2)) * [1, 100, 10000]
y = [-1] * n + [1] * n
for solver in ("svd", "lsqr", "eigen"):
clf = LinearDiscriminantAnalysis(solver=solver)
# should be able to separate the data perfectly
assert clf.fit(x, y).score(x, y) == 1.0, "using covariance: %s" % solver
def test_lda_store_covariance():
# Test for solver 'lsqr' and 'eigen'
# 'store_covariance' has no effect on 'lsqr' and 'eigen' solvers
for solver in ("lsqr", "eigen"):
clf = LinearDiscriminantAnalysis(solver=solver).fit(X6, y6)
assert hasattr(clf, "covariance_")
# Test the actual attribute:
clf = LinearDiscriminantAnalysis(solver=solver, store_covariance=True).fit(
X6, y6
)
assert hasattr(clf, "covariance_")
assert_array_almost_equal(
clf.covariance_, np.array([[0.422222, 0.088889], [0.088889, 0.533333]])
)
# Test for SVD solver, the default is to not set the covariances_ attribute
clf = LinearDiscriminantAnalysis(solver="svd").fit(X6, y6)
assert not hasattr(clf, "covariance_")
# Test the actual attribute:
clf = LinearDiscriminantAnalysis(solver=solver, store_covariance=True).fit(X6, y6)
assert hasattr(clf, "covariance_")
assert_array_almost_equal(
clf.covariance_, np.array([[0.422222, 0.088889], [0.088889, 0.533333]])
)
@pytest.mark.parametrize("seed", range(10))
def test_lda_shrinkage(seed):
# Test that shrunk covariance estimator and shrinkage parameter behave the
# same
rng = np.random.RandomState(seed)
X = rng.rand(100, 10)
y = rng.randint(3, size=(100))
c1 = LinearDiscriminantAnalysis(store_covariance=True, shrinkage=0.5, solver="lsqr")
c2 = LinearDiscriminantAnalysis(
store_covariance=True,
covariance_estimator=ShrunkCovariance(shrinkage=0.5),
solver="lsqr",
)
c1.fit(X, y)
c2.fit(X, y)
assert_allclose(c1.means_, c2.means_)
assert_allclose(c1.covariance_, c2.covariance_)
def test_lda_ledoitwolf():
# When shrinkage="auto" current implementation uses ledoitwolf estimation
# of covariance after standardizing the data. This checks that it is indeed
# the case
class StandardizedLedoitWolf:
def fit(self, X):
sc = StandardScaler() # standardize features
X_sc = sc.fit_transform(X)
s = ledoit_wolf(X_sc)[0]
# rescale
s = sc.scale_[:, np.newaxis] * s * sc.scale_[np.newaxis, :]
self.covariance_ = s
rng = np.random.RandomState(0)
X = rng.rand(100, 10)
y = rng.randint(3, size=(100,))
c1 = LinearDiscriminantAnalysis(
store_covariance=True, shrinkage="auto", solver="lsqr"
)
c2 = LinearDiscriminantAnalysis(
store_covariance=True,
covariance_estimator=StandardizedLedoitWolf(),
solver="lsqr",
)
c1.fit(X, y)
c2.fit(X, y)
assert_allclose(c1.means_, c2.means_)
assert_allclose(c1.covariance_, c2.covariance_)
@pytest.mark.parametrize("n_features", [3, 5])
@pytest.mark.parametrize("n_classes", [5, 3])
def test_lda_dimension_warning(n_classes, n_features):
rng = check_random_state(0)
n_samples = 10
X = rng.randn(n_samples, n_features)
# we create n_classes labels by repeating and truncating a
# range(n_classes) until n_samples
y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
max_components = min(n_features, n_classes - 1)
for n_components in [max_components - 1, None, max_components]:
# if n_components <= min(n_classes - 1, n_features), no warning
lda = LinearDiscriminantAnalysis(n_components=n_components)
lda.fit(X, y)
for n_components in [max_components + 1, max(n_features, n_classes - 1) + 1]:
# if n_components > min(n_classes - 1, n_features), raise error.
# We test one unit higher than max_components, and then something
# larger than both n_features and n_classes - 1 to ensure the test
# works for any value of n_component
lda = LinearDiscriminantAnalysis(n_components=n_components)
msg = "n_components cannot be larger than "
with pytest.raises(ValueError, match=msg):
lda.fit(X, y)
@pytest.mark.parametrize(
"data_type, expected_type",
[
(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64),
],
)
def test_lda_dtype_match(data_type, expected_type):
for solver, shrinkage in solver_shrinkage:
clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
clf.fit(X.astype(data_type), y.astype(data_type))
assert clf.coef_.dtype == expected_type
def test_lda_numeric_consistency_float32_float64():
for solver, shrinkage in solver_shrinkage:
clf_32 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
clf_32.fit(X.astype(np.float32), y.astype(np.float32))
clf_64 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
clf_64.fit(X.astype(np.float64), y.astype(np.float64))
# Check value consistency between types
rtol = 1e-6
assert_allclose(clf_32.coef_, clf_64.coef_, rtol=rtol)
def test_qda():
# QDA classification.
# This checks that QDA implements fit and predict and returns
# correct values for a simple toy dataset.
clf = QuadraticDiscriminantAnalysis()
y_pred = clf.fit(X6, y6).predict(X6)
assert_array_equal(y_pred, y6)
# Assure that it works with 1D data
y_pred1 = clf.fit(X7, y6).predict(X7)
assert_array_equal(y_pred1, y6)
# Test probas estimates
y_proba_pred1 = clf.predict_proba(X7)
assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y6)
y_log_proba_pred1 = clf.predict_log_proba(X7)
assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1, 8)
y_pred3 = clf.fit(X6, y7).predict(X6)
# QDA shouldn't be able to separate those
assert np.any(y_pred3 != y7)
# Classes should have at least 2 elements
with pytest.raises(ValueError):
clf.fit(X6, y4)
def test_qda_priors():
clf = QuadraticDiscriminantAnalysis()
y_pred = clf.fit(X6, y6).predict(X6)
n_pos = np.sum(y_pred == 2)
neg = 1e-10
clf = QuadraticDiscriminantAnalysis(priors=np.array([neg, 1 - neg]))
y_pred = clf.fit(X6, y6).predict(X6)
n_pos2 = np.sum(y_pred == 2)
assert n_pos2 > n_pos
def test_qda_store_covariance():
# The default is to not set the covariances_ attribute
clf = QuadraticDiscriminantAnalysis().fit(X6, y6)
assert not hasattr(clf, "covariance_")
# Test the actual attribute:
clf = QuadraticDiscriminantAnalysis(store_covariance=True).fit(X6, y6)
assert hasattr(clf, "covariance_")
assert_array_almost_equal(clf.covariance_[0], np.array([[0.7, 0.45], [0.45, 0.7]]))
assert_array_almost_equal(
clf.covariance_[1],
np.array([[0.33333333, -0.33333333], [-0.33333333, 0.66666667]]),
)
def test_qda_regularization():
# The default is reg_param=0. and will cause issues when there is a
# constant variable.
# Fitting on data with constant variable triggers an UserWarning.
collinear_msg = "Variables are collinear"
clf = QuadraticDiscriminantAnalysis()
with pytest.warns(UserWarning, match=collinear_msg):
y_pred = clf.fit(X2, y6)
# XXX: RuntimeWarning is also raised at predict time because of divisions
# by zero when the model is fit with a constant feature and without
# regularization: should this be considered a bug? Either by the fit-time
# message more informative, raising and exception instead of a warning in
# this case or somehow changing predict to avoid division by zero.
with pytest.warns(RuntimeWarning, match="divide by zero"):
y_pred = clf.predict(X2)
assert np.any(y_pred != y6)
# Adding a little regularization fixes the division by zero at predict
# time. But UserWarning will persist at fit time.
clf = QuadraticDiscriminantAnalysis(reg_param=0.01)
with pytest.warns(UserWarning, match=collinear_msg):
clf.fit(X2, y6)
y_pred = clf.predict(X2)
assert_array_equal(y_pred, y6)
# UserWarning should also be there for the n_samples_in_a_class <
# n_features case.
clf = QuadraticDiscriminantAnalysis(reg_param=0.1)
with pytest.warns(UserWarning, match=collinear_msg):
clf.fit(X5, y5)
y_pred5 = clf.predict(X5)
assert_array_equal(y_pred5, y5)
def test_covariance():
x, y = make_blobs(n_samples=100, n_features=5, centers=1, random_state=42)
# make features correlated
x = np.dot(x, np.arange(x.shape[1] ** 2).reshape(x.shape[1], x.shape[1]))
c_e = _cov(x, "empirical")
assert_almost_equal(c_e, c_e.T)
c_s = _cov(x, "auto")
assert_almost_equal(c_s, c_s.T)
@pytest.mark.parametrize("solver", ["svd, lsqr", "eigen"])
def test_raises_value_error_on_same_number_of_classes_and_samples(solver):
"""
Tests that if the number of samples equals the number
of classes, a ValueError is raised.
"""
X = np.array([[0.5, 0.6], [0.6, 0.5]])
y = np.array(["a", "b"])
clf = LinearDiscriminantAnalysis(solver=solver)
with pytest.raises(ValueError, match="The number of samples must be more"):
clf.fit(X, y)
def test_get_feature_names_out():
"""Check get_feature_names_out uses class name as prefix."""
est = LinearDiscriminantAnalysis().fit(X, y)
names_out = est.get_feature_names_out()
class_name_lower = "LinearDiscriminantAnalysis".lower()
expected_names_out = np.array(
[
f"{class_name_lower}{i}"
for i in range(est.explained_variance_ratio_.shape[0])
],
dtype=object,
)
assert_array_equal(names_out, expected_names_out)

View File

@@ -0,0 +1,348 @@
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Raghav RV <rvraghav93@gmail.com>
# License: BSD 3 clause
import inspect
import warnings
import importlib
from pkgutil import walk_packages
from inspect import signature
import numpy as np
import sklearn
from sklearn.utils import IS_PYPY
from sklearn.utils._testing import check_docstring_parameters
from sklearn.utils._testing import _get_func_name
from sklearn.utils._testing import ignore_warnings
from sklearn.utils import all_estimators
from sklearn.utils.estimator_checks import _enforce_estimator_tags_y
from sklearn.utils.estimator_checks import _enforce_estimator_tags_x
from sklearn.utils.estimator_checks import _construct_instance
from sklearn.utils.deprecation import _is_deprecated
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
import pytest
# walk_packages() ignores DeprecationWarnings, now we need to ignore
# FutureWarnings
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# mypy error: Module has no attribute "__path__"
sklearn_path = sklearn.__path__ # type: ignore # mypy issue #1422
PUBLIC_MODULES = set(
[
pckg[1]
for pckg in walk_packages(prefix="sklearn.", path=sklearn_path)
if not ("._" in pckg[1] or ".tests." in pckg[1])
]
)
# functions to ignore args / docstring of
_DOCSTRING_IGNORES = [
"sklearn.utils.deprecation.load_mlcomp",
"sklearn.pipeline.make_pipeline",
"sklearn.pipeline.make_union",
"sklearn.utils.extmath.safe_sparse_dot",
"sklearn.utils._joblib",
]
# Methods where y param should be ignored if y=None by default
_METHODS_IGNORE_NONE_Y = [
"fit",
"score",
"fit_predict",
"fit_transform",
"partial_fit",
"predict",
]
# numpydoc 0.8.0's docscrape tool raises because of collections.abc under
# Python 3.7
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@pytest.mark.skipif(IS_PYPY, reason="test segfaults on PyPy")
def test_docstring_parameters():
# Test module docstring formatting
# Skip test if numpydoc is not found
pytest.importorskip(
"numpydoc", reason="numpydoc is required to test the docstrings"
)
# XXX unreached code as of v0.22
from numpydoc import docscrape
incorrect = []
for name in PUBLIC_MODULES:
if name.endswith(".conftest"):
# pytest tooling, not part of the scikit-learn API
continue
if name == "sklearn.utils.fixes":
# We cannot always control these docstrings
continue
with warnings.catch_warnings(record=True):
module = importlib.import_module(name)
classes = inspect.getmembers(module, inspect.isclass)
# Exclude non-scikit-learn classes
classes = [cls for cls in classes if cls[1].__module__.startswith("sklearn")]
for cname, cls in classes:
this_incorrect = []
if cname in _DOCSTRING_IGNORES or cname.startswith("_"):
continue
if inspect.isabstract(cls):
continue
with warnings.catch_warnings(record=True) as w:
cdoc = docscrape.ClassDoc(cls)
if len(w):
raise RuntimeError(
"Error for __init__ of %s in %s:\n%s" % (cls, name, w[0])
)
cls_init = getattr(cls, "__init__", None)
if _is_deprecated(cls_init):
continue
elif cls_init is not None:
this_incorrect += check_docstring_parameters(cls.__init__, cdoc)
for method_name in cdoc.methods:
method = getattr(cls, method_name)
if _is_deprecated(method):
continue
param_ignore = None
# Now skip docstring test for y when y is None
# by default for API reason
if method_name in _METHODS_IGNORE_NONE_Y:
sig = signature(method)
if "y" in sig.parameters and sig.parameters["y"].default is None:
param_ignore = ["y"] # ignore y for fit and score
result = check_docstring_parameters(method, ignore=param_ignore)
this_incorrect += result
incorrect += this_incorrect
functions = inspect.getmembers(module, inspect.isfunction)
# Exclude imported functions
functions = [fn for fn in functions if fn[1].__module__ == name]
for fname, func in functions:
# Don't test private methods / functions
if fname.startswith("_"):
continue
if fname == "configuration" and name.endswith("setup"):
continue
name_ = _get_func_name(func)
if not any(d in name_ for d in _DOCSTRING_IGNORES) and not _is_deprecated(
func
):
incorrect += check_docstring_parameters(func)
msg = "\n".join(incorrect)
if len(incorrect) > 0:
raise AssertionError("Docstring Error:\n" + msg)
@ignore_warnings(category=FutureWarning)
def test_tabs():
# Test that there are no tabs in our source files
for importer, modname, ispkg in walk_packages(sklearn.__path__, prefix="sklearn."):
if IS_PYPY and (
"_svmlight_format_io" in modname
or "feature_extraction._hashing_fast" in modname
):
continue
# because we don't import
mod = importlib.import_module(modname)
try:
source = inspect.getsource(mod)
except IOError: # user probably should have run "make clean"
continue
assert "\t" not in source, (
'"%s" has tabs, please remove them ',
"or add it to the ignore list" % modname,
)
def _construct_searchcv_instance(SearchCV):
return SearchCV(LogisticRegression(), {"C": [0.1, 1]})
def _construct_compose_pipeline_instance(Estimator):
# Minimal / degenerate instances: only useful to test the docstrings.
if Estimator.__name__ == "ColumnTransformer":
return Estimator(transformers=[("transformer", "passthrough", [0, 1])])
elif Estimator.__name__ == "Pipeline":
return Estimator(steps=[("clf", LogisticRegression())])
elif Estimator.__name__ == "FeatureUnion":
return Estimator(transformer_list=[("transformer", FunctionTransformer())])
def _construct_sparse_coder(Estimator):
# XXX: hard-coded assumption that n_features=3
dictionary = np.array(
[[0, 1, 0], [-1, -1, 2], [1, 1, 1], [0, 1, 1], [0, 2, 1]],
dtype=np.float64,
)
return Estimator(dictionary=dictionary)
@pytest.mark.parametrize("name, Estimator", all_estimators())
def test_fit_docstring_attributes(name, Estimator):
pytest.importorskip("numpydoc")
from numpydoc import docscrape
doc = docscrape.ClassDoc(Estimator)
attributes = doc["Attributes"]
if Estimator.__name__ in (
"HalvingRandomSearchCV",
"RandomizedSearchCV",
"HalvingGridSearchCV",
"GridSearchCV",
):
est = _construct_searchcv_instance(Estimator)
elif Estimator.__name__ in (
"ColumnTransformer",
"Pipeline",
"FeatureUnion",
):
est = _construct_compose_pipeline_instance(Estimator)
elif Estimator.__name__ == "SparseCoder":
est = _construct_sparse_coder(Estimator)
else:
est = _construct_instance(Estimator)
if Estimator.__name__ == "SelectKBest":
est.set_params(k=2)
elif Estimator.__name__ == "DummyClassifier":
est.set_params(strategy="stratified")
elif Estimator.__name__ == "CCA" or Estimator.__name__.startswith("PLS"):
# default = 2 is invalid for single target
est.set_params(n_components=1)
elif Estimator.__name__ in (
"GaussianRandomProjection",
"SparseRandomProjection",
):
# default="auto" raises an error with the shape of `X`
est.set_params(n_components=2)
# FIXME: TO BE REMOVED in 1.4 (avoid FutureWarning)
if Estimator.__name__ in (
"OrthogonalMatchingPursuit",
"OrthogonalMatchingPursuitCV",
"Lars",
"LarsCV",
"LassoLars",
"LassoLarsCV",
"LassoLarsIC",
):
est.set_params(normalize=False)
# FIXME: TO BE REMOVED for 1.2 (avoid FutureWarning)
if Estimator.__name__ == "TSNE":
est.set_params(learning_rate=200.0, init="random")
# FIXME: TO BE REMOVED for 1.3 (avoid FutureWarning)
if Estimator.__name__ == "SequentialFeatureSelector":
est.set_params(n_features_to_select="auto")
# FIXME: TO BE REMOVED for 1.3 (avoid FutureWarning)
if Estimator.__name__ == "FastICA":
est.set_params(whiten="unit-variance")
# FIXME: TO BE REMOVED for 1.3 (avoid FutureWarning)
if Estimator.__name__ == "MiniBatchDictionaryLearning":
est.set_params(batch_size=5)
# In case we want to deprecate some attributes in the future
skipped_attributes = {}
if Estimator.__name__.endswith("Vectorizer"):
# Vectorizer require some specific input data
if Estimator.__name__ in (
"CountVectorizer",
"HashingVectorizer",
"TfidfVectorizer",
):
X = [
"This is the first document.",
"This document is the second document.",
"And this is the third one.",
"Is this the first document?",
]
elif Estimator.__name__ == "DictVectorizer":
X = [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}]
y = None
else:
X, y = make_classification(
n_samples=20,
n_features=3,
n_redundant=0,
n_classes=2,
random_state=2,
)
y = _enforce_estimator_tags_y(est, y)
X = _enforce_estimator_tags_x(est, X)
if "1dlabels" in est._get_tags()["X_types"]:
est.fit(y)
elif "2dlabels" in est._get_tags()["X_types"]:
est.fit(np.c_[y, y])
else:
est.fit(X, y)
for attr in attributes:
if attr.name in skipped_attributes:
continue
desc = " ".join(attr.desc).lower()
# As certain attributes are present "only" if a certain parameter is
# provided, this checks if the word "only" is present in the attribute
# description, and if not the attribute is required to be present.
if "only " in desc:
continue
# ignore deprecation warnings
with ignore_warnings(category=FutureWarning):
assert hasattr(est, attr.name)
fit_attr = _get_all_fitted_attributes(est)
fit_attr_names = [attr.name for attr in attributes]
undocumented_attrs = set(fit_attr).difference(fit_attr_names)
undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes)
if undocumented_attrs:
raise AssertionError(
f"Undocumented attributes for {Estimator.__name__}: {undocumented_attrs}"
)
def _get_all_fitted_attributes(estimator):
"Get all the fitted attributes of an estimator including properties"
# attributes
fit_attr = list(estimator.__dict__.keys())
# properties
with warnings.catch_warnings():
warnings.filterwarnings("error", category=FutureWarning)
for name in dir(estimator.__class__):
obj = getattr(estimator.__class__, name)
if not isinstance(obj, property):
continue
# ignore properties that raises an AttributeError and deprecated
# properties
try:
getattr(estimator, name)
except (AttributeError, FutureWarning):
continue
fit_attr.append(name)
return [k for k in fit_attr if k.endswith("_") and not k.startswith("_")]

View File

@@ -0,0 +1,358 @@
import re
from inspect import signature
import pkgutil
import inspect
import importlib
from typing import Optional
import pytest
from sklearn.utils import all_estimators
import sklearn
numpydoc_validation = pytest.importorskip("numpydoc.validate")
FUNCTION_DOCSTRING_IGNORE_LIST = [
"sklearn.datasets._kddcup99.fetch_kddcup99",
"sklearn.datasets._lfw.fetch_lfw_pairs",
"sklearn.datasets._lfw.fetch_lfw_people",
"sklearn.datasets._samples_generator.make_gaussian_quantiles",
"sklearn.datasets._samples_generator.make_spd_matrix",
"sklearn.datasets._species_distributions.fetch_species_distributions",
"sklearn.datasets._svmlight_format_io.load_svmlight_file",
"sklearn.datasets._svmlight_format_io.load_svmlight_files",
"sklearn.decomposition._dict_learning.dict_learning",
"sklearn.decomposition._dict_learning.dict_learning_online",
"sklearn.decomposition._nmf.non_negative_factorization",
"sklearn.externals._packaging.version.parse",
"sklearn.feature_extraction.image.extract_patches_2d",
"sklearn.feature_extraction.text.strip_accents_unicode",
"sklearn.feature_selection._univariate_selection.chi2",
"sklearn.feature_selection._univariate_selection.f_oneway",
"sklearn.inspection._partial_dependence.partial_dependence",
"sklearn.inspection._plot.partial_dependence.plot_partial_dependence",
"sklearn.linear_model._least_angle.lars_path_gram",
"sklearn.linear_model._omp.orthogonal_mp_gram",
"sklearn.manifold._locally_linear.locally_linear_embedding",
"sklearn.manifold._t_sne.trustworthiness",
"sklearn.metrics._classification.brier_score_loss",
"sklearn.metrics._classification.cohen_kappa_score",
"sklearn.metrics._classification.fbeta_score",
"sklearn.metrics._classification.jaccard_score",
"sklearn.metrics._classification.log_loss",
"sklearn.metrics._plot.det_curve.plot_det_curve",
"sklearn.metrics._plot.precision_recall_curve.plot_precision_recall_curve",
"sklearn.metrics._ranking.auc",
"sklearn.metrics._ranking.coverage_error",
"sklearn.metrics._ranking.dcg_score",
"sklearn.metrics._ranking.label_ranking_average_precision_score",
"sklearn.metrics._ranking.roc_auc_score",
"sklearn.metrics._ranking.roc_curve",
"sklearn.metrics._ranking.top_k_accuracy_score",
"sklearn.metrics._regression.mean_pinball_loss",
"sklearn.metrics.cluster._bicluster.consensus_score",
"sklearn.metrics.cluster._supervised.adjusted_mutual_info_score",
"sklearn.metrics.cluster._supervised.adjusted_rand_score",
"sklearn.metrics.cluster._supervised.entropy",
"sklearn.metrics.cluster._supervised.fowlkes_mallows_score",
"sklearn.metrics.cluster._supervised.homogeneity_completeness_v_measure",
"sklearn.metrics.cluster._supervised.mutual_info_score",
"sklearn.metrics.cluster._supervised.normalized_mutual_info_score",
"sklearn.metrics.cluster._supervised.pair_confusion_matrix",
"sklearn.metrics.cluster._supervised.rand_score",
"sklearn.metrics.cluster._supervised.v_measure_score",
"sklearn.metrics.pairwise.additive_chi2_kernel",
"sklearn.metrics.pairwise.check_paired_arrays",
"sklearn.metrics.pairwise.check_pairwise_arrays",
"sklearn.metrics.pairwise.chi2_kernel",
"sklearn.metrics.pairwise.cosine_distances",
"sklearn.metrics.pairwise.cosine_similarity",
"sklearn.metrics.pairwise.distance_metrics",
"sklearn.metrics.pairwise.kernel_metrics",
"sklearn.metrics.pairwise.paired_manhattan_distances",
"sklearn.metrics.pairwise.pairwise_distances_argmin",
"sklearn.metrics.pairwise.pairwise_distances_argmin_min",
"sklearn.metrics.pairwise.pairwise_distances_chunked",
"sklearn.metrics.pairwise.pairwise_kernels",
"sklearn.metrics.pairwise.polynomial_kernel",
"sklearn.metrics.pairwise.rbf_kernel",
"sklearn.metrics.pairwise.sigmoid_kernel",
"sklearn.model_selection._validation.learning_curve",
"sklearn.model_selection._validation.permutation_test_score",
"sklearn.model_selection._validation.validation_curve",
"sklearn.pipeline.make_union",
"sklearn.preprocessing._data.maxabs_scale",
"sklearn.preprocessing._data.robust_scale",
"sklearn.preprocessing._data.scale",
"sklearn.preprocessing._label.label_binarize",
"sklearn.random_projection.johnson_lindenstrauss_min_dim",
"sklearn.svm._bounds.l1_min_c",
"sklearn.tree._export.plot_tree",
"sklearn.utils.axis0_safe_slice",
"sklearn.utils.extmath.density",
"sklearn.utils.extmath.fast_logdet",
"sklearn.utils.extmath.randomized_svd",
"sklearn.utils.extmath.safe_sparse_dot",
"sklearn.utils.extmath.squared_norm",
"sklearn.utils.extmath.stable_cumsum",
"sklearn.utils.extmath.svd_flip",
"sklearn.utils.extmath.weighted_mode",
"sklearn.utils.fixes.delayed",
"sklearn.utils.fixes.linspace",
# To be fixed in upstream issue:
# https://github.com/joblib/threadpoolctl/issues/108
"sklearn.utils.fixes.threadpool_info",
"sklearn.utils.fixes.threadpool_limits",
"sklearn.utils.gen_batches",
"sklearn.utils.gen_even_slices",
"sklearn.utils.graph.graph_shortest_path",
"sklearn.utils.graph.single_source_shortest_path_length",
"sklearn.utils.is_scalar_nan",
"sklearn.utils.metaestimators.available_if",
"sklearn.utils.metaestimators.if_delegate_has_method",
"sklearn.utils.multiclass.class_distribution",
"sklearn.utils.multiclass.type_of_target",
"sklearn.utils.multiclass.unique_labels",
"sklearn.utils.resample",
"sklearn.utils.safe_mask",
"sklearn.utils.safe_sqr",
"sklearn.utils.shuffle",
"sklearn.utils.sparsefuncs.count_nonzero",
"sklearn.utils.sparsefuncs.csc_median_axis_0",
"sklearn.utils.sparsefuncs.incr_mean_variance_axis",
"sklearn.utils.sparsefuncs.inplace_swap_column",
"sklearn.utils.sparsefuncs.inplace_swap_row",
"sklearn.utils.sparsefuncs.inplace_swap_row_csc",
"sklearn.utils.sparsefuncs.inplace_swap_row_csr",
"sklearn.utils.sparsefuncs.mean_variance_axis",
"sklearn.utils.validation.check_is_fitted",
]
FUNCTION_DOCSTRING_IGNORE_LIST = set(FUNCTION_DOCSTRING_IGNORE_LIST)
def get_all_methods():
estimators = all_estimators()
for name, Estimator in estimators:
if name.startswith("_"):
# skip private classes
continue
methods = []
for name in dir(Estimator):
if name.startswith("_"):
continue
method_obj = getattr(Estimator, name)
if hasattr(method_obj, "__call__") or isinstance(method_obj, property):
methods.append(name)
methods.append(None)
for method in sorted(methods, key=lambda x: str(x)):
yield Estimator, method
def _is_checked_function(item):
if not inspect.isfunction(item):
return False
if item.__name__.startswith("_"):
return False
mod = item.__module__
if not mod.startswith("sklearn.") or mod.endswith("estimator_checks"):
return False
return True
def get_all_functions_names():
"""Get all public functions define in the sklearn module"""
modules_to_ignore = {
"tests",
"externals",
"setup",
"conftest",
"experimental",
"estimator_checks",
}
all_functions_names = set()
for module_finder, module_name, ispkg in pkgutil.walk_packages(
path=sklearn.__path__, prefix="sklearn."
):
module_parts = module_name.split(".")
if (
any(part in modules_to_ignore for part in module_parts)
or "._" in module_name
):
continue
module = importlib.import_module(module_name)
functions = inspect.getmembers(module, _is_checked_function)
for name, func in functions:
full_name = f"{func.__module__}.{func.__name__}"
all_functions_names.add(full_name)
return sorted(all_functions_names)
def filter_errors(errors, method, Estimator=None):
"""
Ignore some errors based on the method type.
These rules are specific for scikit-learn."""
for code, message in errors:
# We ignore following error code,
# - RT02: The first line of the Returns section
# should contain only the type, ..
# (as we may need refer to the name of the returned
# object)
# - GL01: Docstring text (summary) should start in the line
# immediately after the opening quotes (not in the same line,
# or leaving a blank line in between)
# - GL02: If there's a blank line, it should be before the
# first line of the Returns section, not after (it allows to have
# short docstrings for properties).
if code in ["RT02", "GL01", "GL02"]:
continue
# Ignore PR02: Unknown parameters for properties. We sometimes use
# properties for ducktyping, i.e. SGDClassifier.predict_proba
if code == "PR02" and Estimator is not None and method is not None:
method_obj = getattr(Estimator, method)
if isinstance(method_obj, property):
continue
# Following codes are only taken into account for the
# top level class docstrings:
# - ES01: No extended summary found
# - SA01: See Also section not found
# - EX01: No examples section found
if method is not None and code in ["EX01", "SA01", "ES01"]:
continue
yield code, message
def repr_errors(res, estimator=None, method: Optional[str] = None) -> str:
"""Pretty print original docstring and the obtained errors
Parameters
----------
res : dict
result of numpydoc.validate.validate
estimator : {estimator, None}
estimator object or None
method : str
if estimator is not None, either the method name or None.
Returns
-------
str
String representation of the error.
"""
if method is None:
if hasattr(estimator, "__init__"):
method = "__init__"
elif estimator is None:
raise ValueError("At least one of estimator, method should be provided")
else:
raise NotImplementedError
if estimator is not None:
obj = getattr(estimator, method)
try:
obj_signature = str(signature(obj))
except TypeError:
# In particular we can't parse the signature of properties
obj_signature = (
"\nParsing of the method signature failed, "
"possibly because this is a property."
)
obj_name = estimator.__name__ + "." + method
else:
obj_signature = ""
obj_name = method
msg = "\n\n" + "\n\n".join(
[
str(res["file"]),
obj_name + obj_signature,
res["docstring"],
"# Errors",
"\n".join(
" - {}: {}".format(code, message) for code, message in res["errors"]
),
]
)
return msg
@pytest.mark.parametrize("function_name", get_all_functions_names())
def test_function_docstring(function_name, request):
"""Check function docstrings using numpydoc."""
if function_name in FUNCTION_DOCSTRING_IGNORE_LIST:
request.applymarker(
pytest.mark.xfail(run=False, reason="TODO pass numpydoc validation")
)
res = numpydoc_validation.validate(function_name)
res["errors"] = list(filter_errors(res["errors"], method="function"))
if res["errors"]:
msg = repr_errors(res, method=f"Tested function: {function_name}")
raise ValueError(msg)
@pytest.mark.parametrize("Estimator, method", get_all_methods())
def test_docstring(Estimator, method, request):
base_import_path = Estimator.__module__
import_path = [base_import_path, Estimator.__name__]
if method is not None:
import_path.append(method)
import_path = ".".join(import_path)
res = numpydoc_validation.validate(import_path)
res["errors"] = list(filter_errors(res["errors"], method, Estimator=Estimator))
if res["errors"]:
msg = repr_errors(res, Estimator, method)
raise ValueError(msg)
if __name__ == "__main__":
import sys
import argparse
parser = argparse.ArgumentParser(description="Validate docstring with numpydoc.")
parser.add_argument("import_path", help="Import path to validate")
args = parser.parse_args()
res = numpydoc_validation.validate(args.import_path)
import_path_sections = args.import_path.split(".")
# When applied to classes, detect class method. For functions
# method = None.
# TODO: this detection can be improved. Currently we assume that we have
# class # methods if the second path element before last is in camel case.
if len(import_path_sections) >= 2 and re.match(
r"(?:[A-Z][a-z]*)+", import_path_sections[-2]
):
method = import_path_sections[-1]
else:
method = None
res["errors"] = list(filter_errors(res["errors"], method))
if res["errors"]:
msg = repr_errors(res, method=args.import_path)
print(msg)
sys.exit(1)
else:
print("All docstring checks passed for {}!".format(args.import_path))

View File

@@ -0,0 +1,744 @@
import pytest
import numpy as np
import scipy.sparse as sp
from sklearn.base import clone
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.utils.stats import _weighted_percentile
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.exceptions import NotFittedError
@ignore_warnings
def _check_predict_proba(clf, X, y):
proba = clf.predict_proba(X)
# We know that we can have division by zero
log_proba = clf.predict_log_proba(X)
y = np.atleast_1d(y)
if y.ndim == 1:
y = np.reshape(y, (-1, 1))
n_outputs = y.shape[1]
n_samples = len(X)
if n_outputs == 1:
proba = [proba]
log_proba = [log_proba]
for k in range(n_outputs):
assert proba[k].shape[0] == n_samples
assert proba[k].shape[1] == len(np.unique(y[:, k]))
assert_array_almost_equal(proba[k].sum(axis=1), np.ones(len(X)))
# We know that we can have division by zero
assert_array_almost_equal(np.log(proba[k]), log_proba[k])
def _check_behavior_2d(clf):
# 1d case
X = np.array([[0], [0], [0], [0]]) # ignored
y = np.array([1, 2, 1, 1])
est = clone(clf)
est.fit(X, y)
y_pred = est.predict(X)
assert y.shape == y_pred.shape
# 2d case
y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])
est = clone(clf)
est.fit(X, y)
y_pred = est.predict(X)
assert y.shape == y_pred.shape
def _check_behavior_2d_for_constant(clf):
# 2d case only
X = np.array([[0], [0], [0], [0]]) # ignored
y = np.array([[1, 0, 5, 4, 3], [2, 0, 1, 2, 5], [1, 0, 4, 5, 2], [1, 3, 3, 2, 0]])
est = clone(clf)
est.fit(X, y)
y_pred = est.predict(X)
assert y.shape == y_pred.shape
def _check_equality_regressor(statistic, y_learn, y_pred_learn, y_test, y_pred_test):
assert_array_almost_equal(np.tile(statistic, (y_learn.shape[0], 1)), y_pred_learn)
assert_array_almost_equal(np.tile(statistic, (y_test.shape[0], 1)), y_pred_test)
def test_most_frequent_and_prior_strategy():
X = [[0], [0], [0], [0]] # ignored
y = [1, 2, 1, 1]
for strategy in ("most_frequent", "prior"):
clf = DummyClassifier(strategy=strategy, random_state=0)
clf.fit(X, y)
assert_array_equal(clf.predict(X), np.ones(len(X)))
_check_predict_proba(clf, X, y)
if strategy == "prior":
assert_array_almost_equal(
clf.predict_proba([X[0]]), clf.class_prior_.reshape((1, -1))
)
else:
assert_array_almost_equal(
clf.predict_proba([X[0]]), clf.class_prior_.reshape((1, -1)) > 0.5
)
def test_most_frequent_and_prior_strategy_with_2d_column_y():
# non-regression test added in
# https://github.com/scikit-learn/scikit-learn/pull/13545
X = [[0], [0], [0], [0]]
y_1d = [1, 2, 1, 1]
y_2d = [[1], [2], [1], [1]]
for strategy in ("most_frequent", "prior"):
clf_1d = DummyClassifier(strategy=strategy, random_state=0)
clf_2d = DummyClassifier(strategy=strategy, random_state=0)
clf_1d.fit(X, y_1d)
clf_2d.fit(X, y_2d)
assert_array_equal(clf_1d.predict(X), clf_2d.predict(X))
def test_most_frequent_and_prior_strategy_multioutput():
X = [[0], [0], [0], [0]] # ignored
y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])
n_samples = len(X)
for strategy in ("prior", "most_frequent"):
clf = DummyClassifier(strategy=strategy, random_state=0)
clf.fit(X, y)
assert_array_equal(
clf.predict(X),
np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]),
)
_check_predict_proba(clf, X, y)
_check_behavior_2d(clf)
def test_stratified_strategy():
X = [[0]] * 5 # ignored
y = [1, 2, 1, 1, 2]
clf = DummyClassifier(strategy="stratified", random_state=0)
clf.fit(X, y)
X = [[0]] * 500
y_pred = clf.predict(X)
p = np.bincount(y_pred) / float(len(X))
assert_almost_equal(p[1], 3.0 / 5, decimal=1)
assert_almost_equal(p[2], 2.0 / 5, decimal=1)
_check_predict_proba(clf, X, y)
def test_stratified_strategy_multioutput():
X = [[0]] * 5 # ignored
y = np.array([[2, 1], [2, 2], [1, 1], [1, 2], [1, 1]])
clf = DummyClassifier(strategy="stratified", random_state=0)
clf.fit(X, y)
X = [[0]] * 500
y_pred = clf.predict(X)
for k in range(y.shape[1]):
p = np.bincount(y_pred[:, k]) / float(len(X))
assert_almost_equal(p[1], 3.0 / 5, decimal=1)
assert_almost_equal(p[2], 2.0 / 5, decimal=1)
_check_predict_proba(clf, X, y)
_check_behavior_2d(clf)
def test_uniform_strategy():
X = [[0]] * 4 # ignored
y = [1, 2, 1, 1]
clf = DummyClassifier(strategy="uniform", random_state=0)
clf.fit(X, y)
X = [[0]] * 500
y_pred = clf.predict(X)
p = np.bincount(y_pred) / float(len(X))
assert_almost_equal(p[1], 0.5, decimal=1)
assert_almost_equal(p[2], 0.5, decimal=1)
_check_predict_proba(clf, X, y)
def test_uniform_strategy_multioutput():
X = [[0]] * 4 # ignored
y = np.array([[2, 1], [2, 2], [1, 2], [1, 1]])
clf = DummyClassifier(strategy="uniform", random_state=0)
clf.fit(X, y)
X = [[0]] * 500
y_pred = clf.predict(X)
for k in range(y.shape[1]):
p = np.bincount(y_pred[:, k]) / float(len(X))
assert_almost_equal(p[1], 0.5, decimal=1)
assert_almost_equal(p[2], 0.5, decimal=1)
_check_predict_proba(clf, X, y)
_check_behavior_2d(clf)
def test_string_labels():
X = [[0]] * 5
y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]
clf = DummyClassifier(strategy="most_frequent")
clf.fit(X, y)
assert_array_equal(clf.predict(X), ["paris"] * 5)
@pytest.mark.parametrize(
"y,y_test",
[
([2, 1, 1, 1], [2, 2, 1, 1]),
(
np.array([[2, 2], [1, 1], [1, 1], [1, 1]]),
np.array([[2, 2], [2, 2], [1, 1], [1, 1]]),
),
],
)
def test_classifier_score_with_None(y, y_test):
clf = DummyClassifier(strategy="most_frequent")
clf.fit(None, y)
assert clf.score(None, y_test) == 0.5
@pytest.mark.parametrize(
"strategy", ["stratified", "most_frequent", "prior", "uniform", "constant"]
)
def test_classifier_prediction_independent_of_X(strategy):
y = [0, 2, 1, 1]
X1 = [[0]] * 4
clf1 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
clf1.fit(X1, y)
predictions1 = clf1.predict(X1)
X2 = [[1]] * 4
clf2 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
clf2.fit(X2, y)
predictions2 = clf2.predict(X2)
assert_array_equal(predictions1, predictions2)
def test_classifier_exceptions():
clf = DummyClassifier(strategy="unknown")
with pytest.raises(ValueError):
clf.fit([], [])
with pytest.raises(NotFittedError):
clf.predict([])
with pytest.raises(NotFittedError):
clf.predict_proba([])
def test_mean_strategy_regressor():
random_state = np.random.RandomState(seed=1)
X = [[0]] * 4 # ignored
y = random_state.randn(4)
reg = DummyRegressor()
reg.fit(X, y)
assert_array_equal(reg.predict(X), [np.mean(y)] * len(X))
def test_mean_strategy_multioutput_regressor():
random_state = np.random.RandomState(seed=1)
X_learn = random_state.randn(10, 10)
y_learn = random_state.randn(10, 5)
mean = np.mean(y_learn, axis=0).reshape((1, -1))
X_test = random_state.randn(20, 10)
y_test = random_state.randn(20, 5)
# Correctness oracle
est = DummyRegressor()
est.fit(X_learn, y_learn)
y_pred_learn = est.predict(X_learn)
y_pred_test = est.predict(X_test)
_check_equality_regressor(mean, y_learn, y_pred_learn, y_test, y_pred_test)
_check_behavior_2d(est)
def test_regressor_exceptions():
reg = DummyRegressor()
with pytest.raises(NotFittedError):
reg.predict([])
def test_median_strategy_regressor():
random_state = np.random.RandomState(seed=1)
X = [[0]] * 5 # ignored
y = random_state.randn(5)
reg = DummyRegressor(strategy="median")
reg.fit(X, y)
assert_array_equal(reg.predict(X), [np.median(y)] * len(X))
def test_median_strategy_multioutput_regressor():
random_state = np.random.RandomState(seed=1)
X_learn = random_state.randn(10, 10)
y_learn = random_state.randn(10, 5)
median = np.median(y_learn, axis=0).reshape((1, -1))
X_test = random_state.randn(20, 10)
y_test = random_state.randn(20, 5)
# Correctness oracle
est = DummyRegressor(strategy="median")
est.fit(X_learn, y_learn)
y_pred_learn = est.predict(X_learn)
y_pred_test = est.predict(X_test)
_check_equality_regressor(median, y_learn, y_pred_learn, y_test, y_pred_test)
_check_behavior_2d(est)
def test_quantile_strategy_regressor():
random_state = np.random.RandomState(seed=1)
X = [[0]] * 5 # ignored
y = random_state.randn(5)
reg = DummyRegressor(strategy="quantile", quantile=0.5)
reg.fit(X, y)
assert_array_equal(reg.predict(X), [np.median(y)] * len(X))
reg = DummyRegressor(strategy="quantile", quantile=0)
reg.fit(X, y)
assert_array_equal(reg.predict(X), [np.min(y)] * len(X))
reg = DummyRegressor(strategy="quantile", quantile=1)
reg.fit(X, y)
assert_array_equal(reg.predict(X), [np.max(y)] * len(X))
reg = DummyRegressor(strategy="quantile", quantile=0.3)
reg.fit(X, y)
assert_array_equal(reg.predict(X), [np.percentile(y, q=30)] * len(X))
def test_quantile_strategy_multioutput_regressor():
random_state = np.random.RandomState(seed=1)
X_learn = random_state.randn(10, 10)
y_learn = random_state.randn(10, 5)
median = np.median(y_learn, axis=0).reshape((1, -1))
quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))
X_test = random_state.randn(20, 10)
y_test = random_state.randn(20, 5)
# Correctness oracle
est = DummyRegressor(strategy="quantile", quantile=0.5)
est.fit(X_learn, y_learn)
y_pred_learn = est.predict(X_learn)
y_pred_test = est.predict(X_test)
_check_equality_regressor(median, y_learn, y_pred_learn, y_test, y_pred_test)
_check_behavior_2d(est)
# Correctness oracle
est = DummyRegressor(strategy="quantile", quantile=0.8)
est.fit(X_learn, y_learn)
y_pred_learn = est.predict(X_learn)
y_pred_test = est.predict(X_test)
_check_equality_regressor(
quantile_values, y_learn, y_pred_learn, y_test, y_pred_test
)
_check_behavior_2d(est)
def test_quantile_invalid():
X = [[0]] * 5 # ignored
y = [0] * 5 # ignored
est = DummyRegressor(strategy="quantile")
with pytest.raises(ValueError):
est.fit(X, y)
est = DummyRegressor(strategy="quantile", quantile=None)
with pytest.raises(ValueError):
est.fit(X, y)
est = DummyRegressor(strategy="quantile", quantile=[0])
with pytest.raises(ValueError):
est.fit(X, y)
est = DummyRegressor(strategy="quantile", quantile=-0.1)
with pytest.raises(ValueError):
est.fit(X, y)
est = DummyRegressor(strategy="quantile", quantile=1.1)
with pytest.raises(ValueError):
est.fit(X, y)
est = DummyRegressor(strategy="quantile", quantile="abc")
with pytest.raises(TypeError):
est.fit(X, y)
def test_quantile_strategy_empty_train():
est = DummyRegressor(strategy="quantile", quantile=0.4)
with pytest.raises(ValueError):
est.fit([], [])
def test_constant_strategy_regressor():
random_state = np.random.RandomState(seed=1)
X = [[0]] * 5 # ignored
y = random_state.randn(5)
reg = DummyRegressor(strategy="constant", constant=[43])
reg.fit(X, y)
assert_array_equal(reg.predict(X), [43] * len(X))
reg = DummyRegressor(strategy="constant", constant=43)
reg.fit(X, y)
assert_array_equal(reg.predict(X), [43] * len(X))
# non-regression test for #22478
assert not isinstance(reg.constant, np.ndarray)
def test_constant_strategy_multioutput_regressor():
random_state = np.random.RandomState(seed=1)
X_learn = random_state.randn(10, 10)
y_learn = random_state.randn(10, 5)
# test with 2d array
constants = random_state.randn(5)
X_test = random_state.randn(20, 10)
y_test = random_state.randn(20, 5)
# Correctness oracle
est = DummyRegressor(strategy="constant", constant=constants)
est.fit(X_learn, y_learn)
y_pred_learn = est.predict(X_learn)
y_pred_test = est.predict(X_test)
_check_equality_regressor(constants, y_learn, y_pred_learn, y_test, y_pred_test)
_check_behavior_2d_for_constant(est)
def test_y_mean_attribute_regressor():
X = [[0]] * 5
y = [1, 2, 4, 6, 8]
# when strategy = 'mean'
est = DummyRegressor(strategy="mean")
est.fit(X, y)
assert est.constant_ == np.mean(y)
def test_unknown_strategey_regressor():
X = [[0]] * 5
y = [1, 2, 4, 6, 8]
est = DummyRegressor(strategy="gona")
with pytest.raises(ValueError):
est.fit(X, y)
def test_constants_not_specified_regressor():
X = [[0]] * 5
y = [1, 2, 4, 6, 8]
est = DummyRegressor(strategy="constant")
with pytest.raises(TypeError):
est.fit(X, y)
def test_constant_size_multioutput_regressor():
random_state = np.random.RandomState(seed=1)
X = random_state.randn(10, 10)
y = random_state.randn(10, 5)
est = DummyRegressor(strategy="constant", constant=[1, 2, 3, 4])
with pytest.raises(ValueError):
est.fit(X, y)
def test_constant_strategy():
X = [[0], [0], [0], [0]] # ignored
y = [2, 1, 2, 2]
clf = DummyClassifier(strategy="constant", random_state=0, constant=1)
clf.fit(X, y)
assert_array_equal(clf.predict(X), np.ones(len(X)))
_check_predict_proba(clf, X, y)
X = [[0], [0], [0], [0]] # ignored
y = ["two", "one", "two", "two"]
clf = DummyClassifier(strategy="constant", random_state=0, constant="one")
clf.fit(X, y)
assert_array_equal(clf.predict(X), np.array(["one"] * 4))
_check_predict_proba(clf, X, y)
def test_constant_strategy_multioutput():
X = [[0], [0], [0], [0]] # ignored
y = np.array([[2, 3], [1, 3], [2, 3], [2, 0]])
n_samples = len(X)
clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
clf.fit(X, y)
assert_array_equal(
clf.predict(X), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
)
_check_predict_proba(clf, X, y)
@pytest.mark.parametrize(
"y, params, err_msg",
[
([2, 1, 2, 2], {"random_state": 0}, "Constant.*has to be specified"),
([2, 1, 2, 2], {"constant": [2, 0]}, "Constant.*should have shape"),
(
np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
{"constant": 2},
"Constant.*should have shape",
),
(
[2, 1, 2, 2],
{"constant": "my-constant"},
"constant=my-constant.*Possible values.*\\[1, 2]",
),
(
np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
{"constant": [2, "unknown"]},
"constant=\\[2, 'unknown'].*Possible values.*\\[1, 2]",
),
],
ids=[
"no-constant",
"too-many-constant",
"not-enough-output",
"single-output",
"multi-output",
],
)
def test_constant_strategy_exceptions(y, params, err_msg):
X = [[0], [0], [0], [0]]
clf = DummyClassifier(strategy="constant", **params)
with pytest.raises(ValueError, match=err_msg):
clf.fit(X, y)
def test_classification_sample_weight():
X = [[0], [0], [1]]
y = [0, 1, 0]
sample_weight = [0.1, 1.0, 0.1]
clf = DummyClassifier(strategy="stratified").fit(X, y, sample_weight)
assert_array_almost_equal(clf.class_prior_, [0.2 / 1.2, 1.0 / 1.2])
def test_constant_strategy_sparse_target():
X = [[0]] * 5 # ignored
y = sp.csc_matrix(np.array([[0, 1], [4, 0], [1, 1], [1, 4], [1, 1]]))
n_samples = len(X)
clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
clf.fit(X, y)
y_pred = clf.predict(X)
assert sp.issparse(y_pred)
assert_array_equal(
y_pred.toarray(), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
)
def test_uniform_strategy_sparse_target_warning():
X = [[0]] * 5 # ignored
y = sp.csc_matrix(np.array([[2, 1], [2, 2], [1, 4], [4, 2], [1, 1]]))
clf = DummyClassifier(strategy="uniform", random_state=0)
with pytest.warns(UserWarning, match="the uniform strategy would not save memory"):
clf.fit(X, y)
X = [[0]] * 500
y_pred = clf.predict(X)
for k in range(y.shape[1]):
p = np.bincount(y_pred[:, k]) / float(len(X))
assert_almost_equal(p[1], 1 / 3, decimal=1)
assert_almost_equal(p[2], 1 / 3, decimal=1)
assert_almost_equal(p[4], 1 / 3, decimal=1)
def test_stratified_strategy_sparse_target():
X = [[0]] * 5 # ignored
y = sp.csc_matrix(np.array([[4, 1], [0, 0], [1, 1], [1, 4], [1, 1]]))
clf = DummyClassifier(strategy="stratified", random_state=0)
clf.fit(X, y)
X = [[0]] * 500
y_pred = clf.predict(X)
assert sp.issparse(y_pred)
y_pred = y_pred.toarray()
for k in range(y.shape[1]):
p = np.bincount(y_pred[:, k]) / float(len(X))
assert_almost_equal(p[1], 3.0 / 5, decimal=1)
assert_almost_equal(p[0], 1.0 / 5, decimal=1)
assert_almost_equal(p[4], 1.0 / 5, decimal=1)
def test_most_frequent_and_prior_strategy_sparse_target():
X = [[0]] * 5 # ignored
y = sp.csc_matrix(np.array([[1, 0], [1, 3], [4, 0], [0, 1], [1, 0]]))
n_samples = len(X)
y_expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
for strategy in ("most_frequent", "prior"):
clf = DummyClassifier(strategy=strategy, random_state=0)
clf.fit(X, y)
y_pred = clf.predict(X)
assert sp.issparse(y_pred)
assert_array_equal(y_pred.toarray(), y_expected)
def test_dummy_regressor_sample_weight(n_samples=10):
random_state = np.random.RandomState(seed=1)
X = [[0]] * n_samples
y = random_state.rand(n_samples)
sample_weight = random_state.rand(n_samples)
est = DummyRegressor(strategy="mean").fit(X, y, sample_weight)
assert est.constant_ == np.average(y, weights=sample_weight)
est = DummyRegressor(strategy="median").fit(X, y, sample_weight)
assert est.constant_ == _weighted_percentile(y, sample_weight, 50.0)
est = DummyRegressor(strategy="quantile", quantile=0.95).fit(X, y, sample_weight)
assert est.constant_ == _weighted_percentile(y, sample_weight, 95.0)
def test_dummy_regressor_on_3D_array():
X = np.array([[["foo"]], [["bar"]], [["baz"]]])
y = np.array([2, 2, 2])
y_expected = np.array([2, 2, 2])
cls = DummyRegressor()
cls.fit(X, y)
y_pred = cls.predict(X)
assert_array_equal(y_pred, y_expected)
def test_dummy_classifier_on_3D_array():
X = np.array([[["foo"]], [["bar"]], [["baz"]]])
y = [2, 2, 2]
y_expected = [2, 2, 2]
y_proba_expected = [[1], [1], [1]]
cls = DummyClassifier(strategy="stratified")
cls.fit(X, y)
y_pred = cls.predict(X)
y_pred_proba = cls.predict_proba(X)
assert_array_equal(y_pred, y_expected)
assert_array_equal(y_pred_proba, y_proba_expected)
def test_dummy_regressor_return_std():
X = [[0]] * 3 # ignored
y = np.array([2, 2, 2])
y_std_expected = np.array([0, 0, 0])
cls = DummyRegressor()
cls.fit(X, y)
y_pred_list = cls.predict(X, return_std=True)
# there should be two elements when return_std is True
assert len(y_pred_list) == 2
# the second element should be all zeros
assert_array_equal(y_pred_list[1], y_std_expected)
@pytest.mark.parametrize(
"y,y_test",
[
([1, 1, 1, 2], [1.25] * 4),
(np.array([[2, 2], [1, 1], [1, 1], [1, 1]]), [[1.25, 1.25]] * 4),
],
)
def test_regressor_score_with_None(y, y_test):
reg = DummyRegressor()
reg.fit(None, y)
assert reg.score(None, y_test) == 1.0
@pytest.mark.parametrize("strategy", ["mean", "median", "quantile", "constant"])
def test_regressor_prediction_independent_of_X(strategy):
y = [0, 2, 1, 1]
X1 = [[0]] * 4
reg1 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
reg1.fit(X1, y)
predictions1 = reg1.predict(X1)
X2 = [[1]] * 4
reg2 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
reg2.fit(X2, y)
predictions2 = reg2.predict(X2)
assert_array_equal(predictions1, predictions2)
@pytest.mark.parametrize(
"strategy", ["stratified", "most_frequent", "prior", "uniform", "constant"]
)
def test_dtype_of_classifier_probas(strategy):
y = [0, 2, 1, 1]
X = np.zeros(4)
model = DummyClassifier(strategy=strategy, random_state=0, constant=0)
probas = model.fit(X, y).predict_proba(X)
assert probas.dtype == np.float64
# TODO: remove in 1.2
@pytest.mark.filterwarnings("ignore:`n_features_in_` is deprecated")
@pytest.mark.parametrize("Dummy", (DummyRegressor, DummyClassifier))
def test_n_features_in_(Dummy):
X = [[1, 2]]
y = [0]
d = Dummy()
assert not hasattr(d, "n_features_in_")
d.fit(X, y)
with pytest.warns(FutureWarning, match="`n_features_in_` is deprecated"):
n_features_in = d.n_features_in_
assert n_features_in is None

View File

@@ -0,0 +1,20 @@
# Basic unittests to test functioning of module's top-level
__author__ = "Yaroslav Halchenko"
__license__ = "BSD"
try:
from sklearn import * # noqa
_top_import_error = None
except Exception as e:
_top_import_error = e
def test_import_skl():
# Test either above import has failed for some reason
# "import *" is discouraged outside of the module level, hence we
# rely on setting up the variable above
assert _top_import_error is None

View File

@@ -0,0 +1,712 @@
import warnings
import numpy as np
import pickle
import copy
import pytest
from sklearn.datasets import make_regression
from sklearn.isotonic import (
check_increasing,
isotonic_regression,
IsotonicRegression,
_make_unique,
)
from sklearn.utils.validation import check_array
from sklearn.utils._testing import (
assert_allclose,
assert_array_equal,
assert_array_almost_equal,
)
from sklearn.utils import shuffle
from scipy.special import expit
def test_permutation_invariance():
# check that fit is permutation invariant.
# regression test of missing sorting of sample-weights
ir = IsotonicRegression()
x = [1, 2, 3, 4, 5, 6, 7]
y = [1, 41, 51, 1, 2, 5, 24]
sample_weight = [1, 2, 3, 4, 5, 6, 7]
x_s, y_s, sample_weight_s = shuffle(x, y, sample_weight, random_state=0)
y_transformed = ir.fit_transform(x, y, sample_weight=sample_weight)
y_transformed_s = ir.fit(x_s, y_s, sample_weight=sample_weight_s).transform(x)
assert_array_equal(y_transformed, y_transformed_s)
def test_check_increasing_small_number_of_samples():
x = [0, 1, 2]
y = [1, 1.1, 1.05]
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
is_increasing = check_increasing(x, y)
assert is_increasing
def test_check_increasing_up():
x = [0, 1, 2, 3, 4, 5]
y = [0, 1.5, 2.77, 8.99, 8.99, 50]
# Check that we got increasing=True and no warnings
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
is_increasing = check_increasing(x, y)
assert is_increasing
def test_check_increasing_up_extreme():
x = [0, 1, 2, 3, 4, 5]
y = [0, 1, 2, 3, 4, 5]
# Check that we got increasing=True and no warnings
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
is_increasing = check_increasing(x, y)
assert is_increasing
def test_check_increasing_down():
x = [0, 1, 2, 3, 4, 5]
y = [0, -1.5, -2.77, -8.99, -8.99, -50]
# Check that we got increasing=False and no warnings
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
is_increasing = check_increasing(x, y)
assert not is_increasing
def test_check_increasing_down_extreme():
x = [0, 1, 2, 3, 4, 5]
y = [0, -1, -2, -3, -4, -5]
# Check that we got increasing=False and no warnings
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
is_increasing = check_increasing(x, y)
assert not is_increasing
def test_check_ci_warn():
x = [0, 1, 2, 3, 4, 5]
y = [0, -1, 2, -3, 4, -5]
# Check that we got increasing=False and CI interval warning
msg = "interval"
with pytest.warns(UserWarning, match=msg):
is_increasing = check_increasing(x, y)
assert not is_increasing
def test_isotonic_regression():
y = np.array([3, 7, 5, 9, 8, 7, 10])
y_ = np.array([3, 6, 6, 8, 8, 8, 10])
assert_array_equal(y_, isotonic_regression(y))
y = np.array([10, 0, 2])
y_ = np.array([4, 4, 4])
assert_array_equal(y_, isotonic_regression(y))
x = np.arange(len(y))
ir = IsotonicRegression(y_min=0.0, y_max=1.0)
ir.fit(x, y)
assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
assert_array_equal(ir.transform(x), ir.predict(x))
# check that it is immune to permutation
perm = np.random.permutation(len(y))
ir = IsotonicRegression(y_min=0.0, y_max=1.0)
assert_array_equal(ir.fit_transform(x[perm], y[perm]), ir.fit_transform(x, y)[perm])
assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])
# check we don't crash when all x are equal:
ir = IsotonicRegression()
assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
def test_isotonic_regression_ties_min():
# Setup examples with ties on minimum
x = [1, 1, 2, 3, 4, 5]
y = [1, 2, 3, 4, 5, 6]
y_true = [1.5, 1.5, 3, 4, 5, 6]
# Check that we get identical results for fit/transform and fit_transform
ir = IsotonicRegression()
ir.fit(x, y)
assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
assert_array_equal(y_true, ir.fit_transform(x, y))
def test_isotonic_regression_ties_max():
# Setup examples with ties on maximum
x = [1, 2, 3, 4, 5, 5]
y = [1, 2, 3, 4, 5, 6]
y_true = [1, 2, 3, 4, 5.5, 5.5]
# Check that we get identical results for fit/transform and fit_transform
ir = IsotonicRegression()
ir.fit(x, y)
assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
assert_array_equal(y_true, ir.fit_transform(x, y))
def test_isotonic_regression_ties_secondary_():
"""
Test isotonic regression fit, transform and fit_transform
against the "secondary" ties method and "pituitary" data from R
"isotone" package, as detailed in: J. d. Leeuw, K. Hornik, P. Mair,
Isotone Optimization in R: Pool-Adjacent-Violators Algorithm
(PAVA) and Active Set Methods
Set values based on pituitary example and
the following R command detailed in the paper above:
> library("isotone")
> data("pituitary")
> res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
> res1$x
`isotone` version: 1.0-2, 2014-09-07
R version: R version 3.1.1 (2014-07-10)
"""
x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
y_true = [
22.22222,
22.22222,
22.22222,
22.22222,
22.22222,
22.22222,
22.22222,
22.22222,
22.22222,
24.25,
24.25,
]
# Check fit, transform and fit_transform
ir = IsotonicRegression()
ir.fit(x, y)
assert_array_almost_equal(ir.transform(x), y_true, 4)
assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)
def test_isotonic_regression_with_ties_in_differently_sized_groups():
"""
Non-regression test to handle issue 9432:
https://github.com/scikit-learn/scikit-learn/issues/9432
Compare against output in R:
> library("isotone")
> x <- c(0, 1, 1, 2, 3, 4)
> y <- c(0, 0, 1, 0, 0, 1)
> res1 <- gpava(x, y, ties="secondary")
> res1$x
`isotone` version: 1.1-0, 2015-07-24
R version: R version 3.3.2 (2016-10-31)
"""
x = np.array([0, 1, 1, 2, 3, 4])
y = np.array([0, 0, 1, 0, 0, 1])
y_true = np.array([0.0, 0.25, 0.25, 0.25, 0.25, 1.0])
ir = IsotonicRegression()
ir.fit(x, y)
assert_array_almost_equal(ir.transform(x), y_true)
assert_array_almost_equal(ir.fit_transform(x, y), y_true)
def test_isotonic_regression_reversed():
y = np.array([10, 9, 10, 7, 6, 6.1, 5])
y_ = IsotonicRegression(increasing=False).fit_transform(np.arange(len(y)), y)
assert_array_equal(np.ones(y_[:-1].shape), ((y_[:-1] - y_[1:]) >= 0))
def test_isotonic_regression_auto_decreasing():
# Set y and x for decreasing
y = np.array([10, 9, 10, 7, 6, 6.1, 5])
x = np.arange(len(y))
# Create model and fit_transform
ir = IsotonicRegression(increasing="auto")
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
y_ = ir.fit_transform(x, y)
# work-around for pearson divide warnings in scipy <= 0.17.0
assert all(["invalid value encountered in " in str(warn.message) for warn in w])
# Check that relationship decreases
is_increasing = y_[0] < y_[-1]
assert not is_increasing
def test_isotonic_regression_auto_increasing():
# Set y and x for decreasing
y = np.array([5, 6.1, 6, 7, 10, 9, 10])
x = np.arange(len(y))
# Create model and fit_transform
ir = IsotonicRegression(increasing="auto")
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
y_ = ir.fit_transform(x, y)
# work-around for pearson divide warnings in scipy <= 0.17.0
assert all(["invalid value encountered in " in str(warn.message) for warn in w])
# Check that relationship increases
is_increasing = y_[0] < y_[-1]
assert is_increasing
def test_assert_raises_exceptions():
ir = IsotonicRegression()
rng = np.random.RandomState(42)
msg = "Found input variables with inconsistent numbers of samples"
with pytest.raises(ValueError, match=msg):
ir.fit([0, 1, 2], [5, 7, 3], [0.1, 0.6])
with pytest.raises(ValueError, match=msg):
ir.fit([0, 1, 2], [5, 7])
msg = "X should be a 1d array"
with pytest.raises(ValueError, match=msg):
ir.fit(rng.randn(3, 10), [0, 1, 2])
msg = "Isotonic regression input X should be a 1d array"
with pytest.raises(ValueError, match=msg):
ir.transform(rng.randn(3, 10))
def test_isotonic_sample_weight_parameter_default_value():
# check if default value of sample_weight parameter is one
ir = IsotonicRegression()
# random test data
rng = np.random.RandomState(42)
n = 100
x = np.arange(n)
y = rng.randint(-50, 50, size=(n,)) + 50.0 * np.log(1 + np.arange(n))
# check if value is correctly used
weights = np.ones(n)
y_set_value = ir.fit_transform(x, y, sample_weight=weights)
y_default_value = ir.fit_transform(x, y)
assert_array_equal(y_set_value, y_default_value)
def test_isotonic_min_max_boundaries():
# check if min value is used correctly
ir = IsotonicRegression(y_min=2, y_max=4)
n = 6
x = np.arange(n)
y = np.arange(n)
y_test = [2, 2, 2, 3, 4, 4]
y_result = np.round(ir.fit_transform(x, y))
assert_array_equal(y_result, y_test)
def test_isotonic_sample_weight():
ir = IsotonicRegression()
x = [1, 2, 3, 4, 5, 6, 7]
y = [1, 41, 51, 1, 2, 5, 24]
sample_weight = [1, 2, 3, 4, 5, 6, 7]
expected_y = [1, 13.95, 13.95, 13.95, 13.95, 13.95, 24]
received_y = ir.fit_transform(x, y, sample_weight=sample_weight)
assert_array_equal(expected_y, received_y)
def test_isotonic_regression_oob_raise():
# Set y and x
y = np.array([3, 7, 5, 9, 8, 7, 10])
x = np.arange(len(y))
# Create model and fit
ir = IsotonicRegression(increasing="auto", out_of_bounds="raise")
ir.fit(x, y)
# Check that an exception is thrown
msg = "A value in x_new is below the interpolation range"
with pytest.raises(ValueError, match=msg):
ir.predict([min(x) - 10, max(x) + 10])
def test_isotonic_regression_oob_clip():
# Set y and x
y = np.array([3, 7, 5, 9, 8, 7, 10])
x = np.arange(len(y))
# Create model and fit
ir = IsotonicRegression(increasing="auto", out_of_bounds="clip")
ir.fit(x, y)
# Predict from training and test x and check that min/max match.
y1 = ir.predict([min(x) - 10, max(x) + 10])
y2 = ir.predict(x)
assert max(y1) == max(y2)
assert min(y1) == min(y2)
def test_isotonic_regression_oob_nan():
# Set y and x
y = np.array([3, 7, 5, 9, 8, 7, 10])
x = np.arange(len(y))
# Create model and fit
ir = IsotonicRegression(increasing="auto", out_of_bounds="nan")
ir.fit(x, y)
# Predict from training and test x and check that we have two NaNs.
y1 = ir.predict([min(x) - 10, max(x) + 10])
assert sum(np.isnan(y1)) == 2
def test_isotonic_regression_oob_bad():
# Set y and x
y = np.array([3, 7, 5, 9, 8, 7, 10])
x = np.arange(len(y))
# Create model and fit
ir = IsotonicRegression(increasing="auto", out_of_bounds="xyz")
# Make sure that we throw an error for bad out_of_bounds value
msg = "The argument ``out_of_bounds`` must be in 'nan', 'clip', 'raise'; got xyz"
with pytest.raises(ValueError, match=msg):
ir.fit(x, y)
def test_isotonic_regression_oob_bad_after():
# Set y and x
y = np.array([3, 7, 5, 9, 8, 7, 10])
x = np.arange(len(y))
# Create model and fit
ir = IsotonicRegression(increasing="auto", out_of_bounds="raise")
# Make sure that we throw an error for bad out_of_bounds value in transform
ir.fit(x, y)
ir.out_of_bounds = "xyz"
msg = "The argument ``out_of_bounds`` must be in 'nan', 'clip', 'raise'; got xyz"
with pytest.raises(ValueError, match=msg):
ir.transform(x)
def test_isotonic_regression_pickle():
y = np.array([3, 7, 5, 9, 8, 7, 10])
x = np.arange(len(y))
# Create model and fit
ir = IsotonicRegression(increasing="auto", out_of_bounds="clip")
ir.fit(x, y)
ir_ser = pickle.dumps(ir, pickle.HIGHEST_PROTOCOL)
ir2 = pickle.loads(ir_ser)
np.testing.assert_array_equal(ir.predict(x), ir2.predict(x))
def test_isotonic_duplicate_min_entry():
x = [0, 0, 1]
y = [0, 0, 1]
ir = IsotonicRegression(increasing=True, out_of_bounds="clip")
ir.fit(x, y)
all_predictions_finite = np.all(np.isfinite(ir.predict(x)))
assert all_predictions_finite
def test_isotonic_ymin_ymax():
# Test from @NelleV's issue:
# https://github.com/scikit-learn/scikit-learn/issues/6921
x = np.array(
[
1.263,
1.318,
-0.572,
0.307,
-0.707,
-0.176,
-1.599,
1.059,
1.396,
1.906,
0.210,
0.028,
-0.081,
0.444,
0.018,
-0.377,
-0.896,
-0.377,
-1.327,
0.180,
]
)
y = isotonic_regression(x, y_min=0.0, y_max=0.1)
assert np.all(y >= 0)
assert np.all(y <= 0.1)
# Also test decreasing case since the logic there is different
y = isotonic_regression(x, y_min=0.0, y_max=0.1, increasing=False)
assert np.all(y >= 0)
assert np.all(y <= 0.1)
# Finally, test with only one bound
y = isotonic_regression(x, y_min=0.0, increasing=False)
assert np.all(y >= 0)
def test_isotonic_zero_weight_loop():
# Test from @ogrisel's issue:
# https://github.com/scikit-learn/scikit-learn/issues/4297
# Get deterministic RNG with seed
rng = np.random.RandomState(42)
# Create regression and samples
regression = IsotonicRegression()
n_samples = 50
x = np.linspace(-3, 3, n_samples)
y = x + rng.uniform(size=n_samples)
# Get some random weights and zero out
w = rng.uniform(size=n_samples)
w[5:8] = 0
regression.fit(x, y, sample_weight=w)
# This will hang in failure case.
regression.fit(x, y, sample_weight=w)
def test_fast_predict():
# test that the faster prediction change doesn't
# affect out-of-sample predictions:
# https://github.com/scikit-learn/scikit-learn/pull/6206
rng = np.random.RandomState(123)
n_samples = 10**3
# X values over the -10,10 range
X_train = 20.0 * rng.rand(n_samples) - 10
y_train = (
np.less(rng.rand(n_samples), expit(X_train)).astype("int64").astype("float64")
)
weights = rng.rand(n_samples)
# we also want to test that everything still works when some weights are 0
weights[rng.rand(n_samples) < 0.1] = 0
slow_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
fast_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
# Build interpolation function with ALL input data, not just the
# non-redundant subset. The following 2 lines are taken from the
# .fit() method, without removing unnecessary points
X_train_fit, y_train_fit = slow_model._build_y(
X_train, y_train, sample_weight=weights, trim_duplicates=False
)
slow_model._build_f(X_train_fit, y_train_fit)
# fit with just the necessary data
fast_model.fit(X_train, y_train, sample_weight=weights)
X_test = 20.0 * rng.rand(n_samples) - 10
y_pred_slow = slow_model.predict(X_test)
y_pred_fast = fast_model.predict(X_test)
assert_array_equal(y_pred_slow, y_pred_fast)
def test_isotonic_copy_before_fit():
# https://github.com/scikit-learn/scikit-learn/issues/6628
ir = IsotonicRegression()
copy.copy(ir)
def test_isotonic_dtype():
y = [2, 1, 4, 3, 5]
weights = np.array([0.9, 0.9, 0.9, 0.9, 0.9], dtype=np.float64)
reg = IsotonicRegression()
for dtype in (np.int32, np.int64, np.float32, np.float64):
for sample_weight in (None, weights.astype(np.float32), weights):
y_np = np.array(y, dtype=dtype)
expected_dtype = check_array(
y_np, dtype=[np.float64, np.float32], ensure_2d=False
).dtype
res = isotonic_regression(y_np, sample_weight=sample_weight)
assert res.dtype == expected_dtype
X = np.arange(len(y)).astype(dtype)
reg.fit(X, y_np, sample_weight=sample_weight)
res = reg.predict(X)
assert res.dtype == expected_dtype
@pytest.mark.parametrize("y_dtype", [np.int32, np.int64, np.float32, np.float64])
def test_isotonic_mismatched_dtype(y_dtype):
# regression test for #15004
# check that data are converted when X and y dtype differ
reg = IsotonicRegression()
y = np.array([2, 1, 4, 3, 5], dtype=y_dtype)
X = np.arange(len(y), dtype=np.float32)
reg.fit(X, y)
assert reg.predict(X).dtype == X.dtype
def test_make_unique_dtype():
x_list = [2, 2, 2, 3, 5]
for dtype in (np.float32, np.float64):
x = np.array(x_list, dtype=dtype)
y = x.copy()
w = np.ones_like(x)
x, y, w = _make_unique(x, y, w)
assert_array_equal(x, [2, 3, 5])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_make_unique_tolerance(dtype):
# Check that equality takes account of np.finfo tolerance
x = np.array([0, 1e-16, 1, 1 + 1e-14], dtype=dtype)
y = x.copy()
w = np.ones_like(x)
x, y, w = _make_unique(x, y, w)
if dtype == np.float64:
x_out = np.array([0, 1, 1 + 1e-14])
else:
x_out = np.array([0, 1])
assert_array_equal(x, x_out)
def test_isotonic_make_unique_tolerance():
# Check that averaging of targets for duplicate X is done correctly,
# taking into account tolerance
X = np.array([0, 1, 1 + 1e-16, 2], dtype=np.float64)
y = np.array([0, 1, 2, 3], dtype=np.float64)
ireg = IsotonicRegression().fit(X, y)
y_pred = ireg.predict([0, 0.5, 1, 1.5, 2])
assert_array_equal(y_pred, np.array([0, 0.75, 1.5, 2.25, 3]))
assert_array_equal(ireg.X_thresholds_, np.array([0.0, 1.0, 2.0]))
assert_array_equal(ireg.y_thresholds_, np.array([0.0, 1.5, 3.0]))
def test_isotonic_non_regression_inf_slope():
# Non-regression test to ensure that inf values are not returned
# see: https://github.com/scikit-learn/scikit-learn/issues/10903
X = np.array([0.0, 4.1e-320, 4.4e-314, 1.0])
y = np.array([0.42, 0.42, 0.44, 0.44])
ireg = IsotonicRegression().fit(X, y)
y_pred = ireg.predict(np.array([0, 2.1e-319, 5.4e-316, 1e-10]))
assert np.all(np.isfinite(y_pred))
@pytest.mark.parametrize("increasing", [True, False])
def test_isotonic_thresholds(increasing):
rng = np.random.RandomState(42)
n_samples = 30
X = rng.normal(size=n_samples)
y = rng.normal(size=n_samples)
ireg = IsotonicRegression(increasing=increasing).fit(X, y)
X_thresholds, y_thresholds = ireg.X_thresholds_, ireg.y_thresholds_
assert X_thresholds.shape == y_thresholds.shape
# Input thresholds are a strict subset of the training set (unless
# the data is already strictly monotonic which is not the case with
# this random data)
assert X_thresholds.shape[0] < X.shape[0]
assert np.in1d(X_thresholds, X).all()
# Output thresholds lie in the range of the training set:
assert y_thresholds.max() <= y.max()
assert y_thresholds.min() >= y.min()
assert all(np.diff(X_thresholds) > 0)
if increasing:
assert all(np.diff(y_thresholds) >= 0)
else:
assert all(np.diff(y_thresholds) <= 0)
def test_input_shape_validation():
# Test from #15012
# Check that IsotonicRegression can handle 2darray with only 1 feature
X = np.arange(10)
X_2d = X.reshape(-1, 1)
y = np.arange(10)
iso_reg = IsotonicRegression().fit(X, y)
iso_reg_2d = IsotonicRegression().fit(X_2d, y)
assert iso_reg.X_max_ == iso_reg_2d.X_max_
assert iso_reg.X_min_ == iso_reg_2d.X_min_
assert iso_reg.y_max == iso_reg_2d.y_max
assert iso_reg.y_min == iso_reg_2d.y_min
assert_array_equal(iso_reg.X_thresholds_, iso_reg_2d.X_thresholds_)
assert_array_equal(iso_reg.y_thresholds_, iso_reg_2d.y_thresholds_)
y_pred1 = iso_reg.predict(X)
y_pred2 = iso_reg_2d.predict(X_2d)
assert_allclose(y_pred1, y_pred2)
def test_isotonic_2darray_more_than_1_feature():
# Ensure IsotonicRegression raises error if input has more than 1 feature
X = np.arange(10)
X_2d = np.c_[X, X]
y = np.arange(10)
msg = "should be a 1d array or 2d array with 1 feature"
with pytest.raises(ValueError, match=msg):
IsotonicRegression().fit(X_2d, y)
iso_reg = IsotonicRegression().fit(X, y)
with pytest.raises(ValueError, match=msg):
iso_reg.predict(X_2d)
with pytest.raises(ValueError, match=msg):
iso_reg.transform(X_2d)
def test_isotonic_regression_sample_weight_not_overwritten():
"""Check that calling fitting function of isotonic regression will not
overwrite `sample_weight`.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/20508
"""
X, y = make_regression(n_samples=10, n_features=1, random_state=41)
sample_weight_original = np.ones_like(y)
sample_weight_original[0] = 10
sample_weight_fit = sample_weight_original.copy()
isotonic_regression(y, sample_weight=sample_weight_fit)
assert_allclose(sample_weight_fit, sample_weight_original)
IsotonicRegression().fit(X, y, sample_weight=sample_weight_fit)
assert_allclose(sample_weight_fit, sample_weight_original)
@pytest.mark.parametrize("shape", ["1d", "2d"])
def test_get_feature_names_out(shape):
"""Check `get_feature_names_out` for `IsotonicRegression`."""
X = np.arange(10)
if shape == "2d":
X = X.reshape(-1, 1)
y = np.arange(10)
iso = IsotonicRegression().fit(X, y)
names = iso.get_feature_names_out()
assert isinstance(names, np.ndarray)
assert names.dtype == object
assert_array_equal(["isotonicregression0"], names)

View File

@@ -0,0 +1,392 @@
import re
import numpy as np
from scipy.sparse import csr_matrix
import pytest
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.metrics.pairwise import kernel_metrics
from sklearn.kernel_approximation import RBFSampler
from sklearn.kernel_approximation import AdditiveChi2Sampler
from sklearn.kernel_approximation import SkewedChi2Sampler
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.datasets import make_classification
from sklearn.metrics.pairwise import polynomial_kernel, rbf_kernel, chi2_kernel
# generate data
rng = np.random.RandomState(0)
X = rng.random_sample(size=(300, 50))
Y = rng.random_sample(size=(300, 50))
X /= X.sum(axis=1)[:, np.newaxis]
Y /= Y.sum(axis=1)[:, np.newaxis]
@pytest.mark.parametrize("degree", [-1, 0])
def test_polynomial_count_sketch_raises_if_degree_lower_than_one(degree):
with pytest.raises(ValueError, match=f"degree={degree} should be >=1."):
ps_transform = PolynomialCountSketch(degree=degree)
ps_transform.fit(X, Y)
@pytest.mark.parametrize("X", [X, csr_matrix(X)])
@pytest.mark.parametrize("Y", [Y, csr_matrix(Y)])
@pytest.mark.parametrize("gamma", [0.1, 1, 2.5])
@pytest.mark.parametrize("degree", [1, 2, 3])
@pytest.mark.parametrize("coef0", [0, 1, 2.5])
def test_polynomial_count_sketch(X, Y, gamma, degree, coef0):
# test that PolynomialCountSketch approximates polynomial
# kernel on random data
# compute exact kernel
kernel = polynomial_kernel(X, Y, gamma=gamma, degree=degree, coef0=coef0)
# approximate kernel mapping
ps_transform = PolynomialCountSketch(
n_components=5000, gamma=gamma, coef0=coef0, degree=degree, random_state=42
)
X_trans = ps_transform.fit_transform(X)
Y_trans = ps_transform.transform(Y)
kernel_approx = np.dot(X_trans, Y_trans.T)
error = kernel - kernel_approx
assert np.abs(np.mean(error)) <= 0.05 # close to unbiased
np.abs(error, out=error)
assert np.max(error) <= 0.1 # nothing too far off
assert np.mean(error) <= 0.05 # mean is fairly close
def _linear_kernel(X, Y):
return np.dot(X, Y.T)
def test_additive_chi2_sampler():
# test that AdditiveChi2Sampler approximates kernel on random data
# compute exact kernel
# abbreviations for easier formula
X_ = X[:, np.newaxis, :]
Y_ = Y[np.newaxis, :, :]
large_kernel = 2 * X_ * Y_ / (X_ + Y_)
# reduce to n_samples_x x n_samples_y by summing over features
kernel = large_kernel.sum(axis=2)
# approximate kernel mapping
transform = AdditiveChi2Sampler(sample_steps=3)
X_trans = transform.fit_transform(X)
Y_trans = transform.transform(Y)
kernel_approx = np.dot(X_trans, Y_trans.T)
assert_array_almost_equal(kernel, kernel_approx, 1)
X_sp_trans = transform.fit_transform(csr_matrix(X))
Y_sp_trans = transform.transform(csr_matrix(Y))
assert_array_equal(X_trans, X_sp_trans.A)
assert_array_equal(Y_trans, Y_sp_trans.A)
# test error is raised on negative input
Y_neg = Y.copy()
Y_neg[0, 0] = -1
msg = "Negative values in data passed to"
with pytest.raises(ValueError, match=msg):
transform.transform(Y_neg)
# test error on invalid sample_steps
transform = AdditiveChi2Sampler(sample_steps=4)
msg = re.escape(
"If sample_steps is not in [1, 2, 3], you need to provide sample_interval"
)
with pytest.raises(ValueError, match=msg):
transform.fit(X)
# test that the sample interval is set correctly
sample_steps_available = [1, 2, 3]
for sample_steps in sample_steps_available:
# test that the sample_interval is initialized correctly
transform = AdditiveChi2Sampler(sample_steps=sample_steps)
assert transform.sample_interval is None
# test that the sample_interval is changed in the fit method
transform.fit(X)
assert transform.sample_interval_ is not None
# test that the sample_interval is set correctly
sample_interval = 0.3
transform = AdditiveChi2Sampler(sample_steps=4, sample_interval=sample_interval)
assert transform.sample_interval == sample_interval
transform.fit(X)
assert transform.sample_interval_ == sample_interval
def test_skewed_chi2_sampler():
# test that RBFSampler approximates kernel on random data
# compute exact kernel
c = 0.03
# set on negative component but greater than c to ensure that the kernel
# approximation is valid on the group (-c; +\infty) endowed with the skewed
# multiplication.
Y[0, 0] = -c / 2.0
# abbreviations for easier formula
X_c = (X + c)[:, np.newaxis, :]
Y_c = (Y + c)[np.newaxis, :, :]
# we do it in log-space in the hope that it's more stable
# this array is n_samples_x x n_samples_y big x n_features
log_kernel = (
(np.log(X_c) / 2.0) + (np.log(Y_c) / 2.0) + np.log(2.0) - np.log(X_c + Y_c)
)
# reduce to n_samples_x x n_samples_y by summing over features in log-space
kernel = np.exp(log_kernel.sum(axis=2))
# approximate kernel mapping
transform = SkewedChi2Sampler(skewedness=c, n_components=1000, random_state=42)
X_trans = transform.fit_transform(X)
Y_trans = transform.transform(Y)
kernel_approx = np.dot(X_trans, Y_trans.T)
assert_array_almost_equal(kernel, kernel_approx, 1)
assert np.isfinite(kernel).all(), "NaNs found in the Gram matrix"
assert np.isfinite(kernel_approx).all(), "NaNs found in the approximate Gram matrix"
# test error is raised on when inputs contains values smaller than -c
Y_neg = Y.copy()
Y_neg[0, 0] = -c * 2.0
msg = "X may not contain entries smaller than -skewedness"
with pytest.raises(ValueError, match=msg):
transform.transform(Y_neg)
def test_additive_chi2_sampler_exceptions():
"""Ensures correct error message"""
transformer = AdditiveChi2Sampler()
X_neg = X.copy()
X_neg[0, 0] = -1
with pytest.raises(ValueError, match="X in AdditiveChi2Sampler.fit"):
transformer.fit(X_neg)
with pytest.raises(ValueError, match="X in AdditiveChi2Sampler.transform"):
transformer.fit(X)
transformer.transform(X_neg)
def test_rbf_sampler():
# test that RBFSampler approximates kernel on random data
# compute exact kernel
gamma = 10.0
kernel = rbf_kernel(X, Y, gamma=gamma)
# approximate kernel mapping
rbf_transform = RBFSampler(gamma=gamma, n_components=1000, random_state=42)
X_trans = rbf_transform.fit_transform(X)
Y_trans = rbf_transform.transform(Y)
kernel_approx = np.dot(X_trans, Y_trans.T)
error = kernel - kernel_approx
assert np.abs(np.mean(error)) <= 0.01 # close to unbiased
np.abs(error, out=error)
assert np.max(error) <= 0.1 # nothing too far off
assert np.mean(error) <= 0.05 # mean is fairly close
def test_input_validation():
# Regression test: kernel approx. transformers should work on lists
# No assertions; the old versions would simply crash
X = [[1, 2], [3, 4], [5, 6]]
AdditiveChi2Sampler().fit(X).transform(X)
SkewedChi2Sampler().fit(X).transform(X)
RBFSampler().fit(X).transform(X)
X = csr_matrix(X)
RBFSampler().fit(X).transform(X)
def test_nystroem_approximation():
# some basic tests
rnd = np.random.RandomState(0)
X = rnd.uniform(size=(10, 4))
# With n_components = n_samples this is exact
X_transformed = Nystroem(n_components=X.shape[0]).fit_transform(X)
K = rbf_kernel(X)
assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)
trans = Nystroem(n_components=2, random_state=rnd)
X_transformed = trans.fit(X).transform(X)
assert X_transformed.shape == (X.shape[0], 2)
# test callable kernel
trans = Nystroem(n_components=2, kernel=_linear_kernel, random_state=rnd)
X_transformed = trans.fit(X).transform(X)
assert X_transformed.shape == (X.shape[0], 2)
# test that available kernels fit and transform
kernels_available = kernel_metrics()
for kern in kernels_available:
trans = Nystroem(n_components=2, kernel=kern, random_state=rnd)
X_transformed = trans.fit(X).transform(X)
assert X_transformed.shape == (X.shape[0], 2)
def test_nystroem_default_parameters():
rnd = np.random.RandomState(42)
X = rnd.uniform(size=(10, 4))
# rbf kernel should behave as gamma=None by default
# aka gamma = 1 / n_features
nystroem = Nystroem(n_components=10)
X_transformed = nystroem.fit_transform(X)
K = rbf_kernel(X, gamma=None)
K2 = np.dot(X_transformed, X_transformed.T)
assert_array_almost_equal(K, K2)
# chi2 kernel should behave as gamma=1 by default
nystroem = Nystroem(kernel="chi2", n_components=10)
X_transformed = nystroem.fit_transform(X)
K = chi2_kernel(X, gamma=1)
K2 = np.dot(X_transformed, X_transformed.T)
assert_array_almost_equal(K, K2)
def test_nystroem_singular_kernel():
# test that nystroem works with singular kernel matrix
rng = np.random.RandomState(0)
X = rng.rand(10, 20)
X = np.vstack([X] * 2) # duplicate samples
gamma = 100
N = Nystroem(gamma=gamma, n_components=X.shape[0]).fit(X)
X_transformed = N.transform(X)
K = rbf_kernel(X, gamma=gamma)
assert_array_almost_equal(K, np.dot(X_transformed, X_transformed.T))
assert np.all(np.isfinite(Y))
def test_nystroem_poly_kernel_params():
# Non-regression: Nystroem should pass other parameters beside gamma.
rnd = np.random.RandomState(37)
X = rnd.uniform(size=(10, 4))
K = polynomial_kernel(X, degree=3.1, coef0=0.1)
nystroem = Nystroem(
kernel="polynomial", n_components=X.shape[0], degree=3.1, coef0=0.1
)
X_transformed = nystroem.fit_transform(X)
assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)
def test_nystroem_callable():
# Test Nystroem on a callable.
rnd = np.random.RandomState(42)
n_samples = 10
X = rnd.uniform(size=(n_samples, 4))
def logging_histogram_kernel(x, y, log):
"""Histogram kernel that writes to a log."""
log.append(1)
return np.minimum(x, y).sum()
kernel_log = []
X = list(X) # test input validation
Nystroem(
kernel=logging_histogram_kernel,
n_components=(n_samples - 1),
kernel_params={"log": kernel_log},
).fit(X)
assert len(kernel_log) == n_samples * (n_samples - 1) / 2
# if degree, gamma or coef0 is passed, we raise a ValueError
msg = "Don't pass gamma, coef0 or degree to Nystroem"
params = ({"gamma": 1}, {"coef0": 1}, {"degree": 2})
for param in params:
ny = Nystroem(kernel=_linear_kernel, n_components=(n_samples - 1), **param)
with pytest.raises(ValueError, match=msg):
ny.fit(X)
def test_nystroem_precomputed_kernel():
# Non-regression: test Nystroem on precomputed kernel.
# PR - 14706
rnd = np.random.RandomState(12)
X = rnd.uniform(size=(10, 4))
K = polynomial_kernel(X, degree=2, coef0=0.1)
nystroem = Nystroem(kernel="precomputed", n_components=X.shape[0])
X_transformed = nystroem.fit_transform(K)
assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)
# if degree, gamma or coef0 is passed, we raise a ValueError
msg = "Don't pass gamma, coef0 or degree to Nystroem"
params = ({"gamma": 1}, {"coef0": 1}, {"degree": 2})
for param in params:
ny = Nystroem(kernel="precomputed", n_components=X.shape[0], **param)
with pytest.raises(ValueError, match=msg):
ny.fit(K)
def test_nystroem_component_indices():
"""Check that `component_indices_` corresponds to the subset of
training points used to construct the feature map.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/20474
"""
X, _ = make_classification(n_samples=100, n_features=20)
feature_map_nystroem = Nystroem(
n_components=10,
random_state=0,
)
feature_map_nystroem.fit(X)
assert feature_map_nystroem.component_indices_.shape == (10,)
@pytest.mark.parametrize(
"Estimator", [PolynomialCountSketch, RBFSampler, SkewedChi2Sampler, Nystroem]
)
def test_get_feature_names_out(Estimator):
"""Check get_feature_names_out"""
est = Estimator().fit(X)
X_trans = est.transform(X)
names_out = est.get_feature_names_out()
class_name = Estimator.__name__.lower()
expected_names = [f"{class_name}{i}" for i in range(X_trans.shape[1])]
assert_array_equal(names_out, expected_names)
def test_additivechi2sampler_get_feature_names_out():
"""Check get_feature_names_out for AdditiveChi2Sampler."""
rng = np.random.RandomState(0)
X = rng.random_sample(size=(300, 3))
chi2_sampler = AdditiveChi2Sampler(sample_steps=3).fit(X)
input_names = ["f0", "f1", "f2"]
suffixes = [
"f0_sqrt",
"f1_sqrt",
"f2_sqrt",
"f0_cos1",
"f1_cos1",
"f2_cos1",
"f0_sin1",
"f1_sin1",
"f2_sin1",
"f0_cos2",
"f1_cos2",
"f2_cos2",
"f0_sin2",
"f1_sin2",
"f2_sin2",
]
names_out = chi2_sampler.get_feature_names_out(input_features=input_names)
expected_names = [f"additivechi2sampler_{suffix}" for suffix in suffixes]
assert_array_equal(names_out, expected_names)

View File

@@ -0,0 +1,92 @@
import numpy as np
import scipy.sparse as sp
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.utils._testing import ignore_warnings
from sklearn.utils._testing import assert_array_almost_equal
X, y = make_regression(n_features=10, random_state=0)
Xcsr = sp.csr_matrix(X)
Xcsc = sp.csc_matrix(X)
Y = np.array([y, y]).T
def test_kernel_ridge():
pred = Ridge(alpha=1, fit_intercept=False).fit(X, y).predict(X)
pred2 = KernelRidge(kernel="linear", alpha=1).fit(X, y).predict(X)
assert_array_almost_equal(pred, pred2)
def test_kernel_ridge_csr():
pred = (
Ridge(alpha=1, fit_intercept=False, solver="cholesky")
.fit(Xcsr, y)
.predict(Xcsr)
)
pred2 = KernelRidge(kernel="linear", alpha=1).fit(Xcsr, y).predict(Xcsr)
assert_array_almost_equal(pred, pred2)
def test_kernel_ridge_csc():
pred = (
Ridge(alpha=1, fit_intercept=False, solver="cholesky")
.fit(Xcsc, y)
.predict(Xcsc)
)
pred2 = KernelRidge(kernel="linear", alpha=1).fit(Xcsc, y).predict(Xcsc)
assert_array_almost_equal(pred, pred2)
def test_kernel_ridge_singular_kernel():
# alpha=0 causes a LinAlgError in computing the dual coefficients,
# which causes a fallback to a lstsq solver. This is tested here.
pred = Ridge(alpha=0, fit_intercept=False).fit(X, y).predict(X)
kr = KernelRidge(kernel="linear", alpha=0)
ignore_warnings(kr.fit)(X, y)
pred2 = kr.predict(X)
assert_array_almost_equal(pred, pred2)
def test_kernel_ridge_precomputed():
for kernel in ["linear", "rbf", "poly", "cosine"]:
K = pairwise_kernels(X, X, metric=kernel)
pred = KernelRidge(kernel=kernel).fit(X, y).predict(X)
pred2 = KernelRidge(kernel="precomputed").fit(K, y).predict(K)
assert_array_almost_equal(pred, pred2)
def test_kernel_ridge_precomputed_kernel_unchanged():
K = np.dot(X, X.T)
K2 = K.copy()
KernelRidge(kernel="precomputed").fit(K, y)
assert_array_almost_equal(K, K2)
def test_kernel_ridge_sample_weights():
K = np.dot(X, X.T) # precomputed kernel
sw = np.random.RandomState(0).rand(X.shape[0])
pred = Ridge(alpha=1, fit_intercept=False).fit(X, y, sample_weight=sw).predict(X)
pred2 = KernelRidge(kernel="linear", alpha=1).fit(X, y, sample_weight=sw).predict(X)
pred3 = (
KernelRidge(kernel="precomputed", alpha=1)
.fit(K, y, sample_weight=sw)
.predict(K)
)
assert_array_almost_equal(pred, pred2)
assert_array_almost_equal(pred, pred3)
def test_kernel_ridge_multi_output():
pred = Ridge(alpha=1, fit_intercept=False).fit(X, Y).predict(X)
pred2 = KernelRidge(kernel="linear", alpha=1).fit(X, Y).predict(X)
assert_array_almost_equal(pred, pred2)
pred3 = KernelRidge(kernel="linear", alpha=1).fit(X, y).predict(X)
pred3 = np.array([pred3, pred3]).T
assert_array_almost_equal(pred2, pred3)

View File

@@ -0,0 +1,301 @@
"""Common tests for metaestimators"""
import functools
from inspect import signature
import numpy as np
import pytest
from sklearn.base import BaseEstimator
from sklearn.base import is_regressor
from sklearn.datasets import make_classification
from sklearn.utils import all_estimators
from sklearn.utils.estimator_checks import _enforce_estimator_tags_x
from sklearn.utils.estimator_checks import _enforce_estimator_tags_y
from sklearn.utils.validation import check_is_fitted
from sklearn.utils._testing import set_random_state
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import BaggingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
class DelegatorData:
def __init__(
self, name, construct, skip_methods=(), fit_args=make_classification()
):
self.name = name
self.construct = construct
self.fit_args = fit_args
self.skip_methods = skip_methods
DELEGATING_METAESTIMATORS = [
DelegatorData("Pipeline", lambda est: Pipeline([("est", est)])),
DelegatorData(
"GridSearchCV",
lambda est: GridSearchCV(est, param_grid={"param": [5]}, cv=2),
skip_methods=["score"],
),
DelegatorData(
"RandomizedSearchCV",
lambda est: RandomizedSearchCV(
est, param_distributions={"param": [5]}, cv=2, n_iter=1
),
skip_methods=["score"],
),
DelegatorData("RFE", RFE, skip_methods=["transform", "inverse_transform"]),
DelegatorData("RFECV", RFECV, skip_methods=["transform", "inverse_transform"]),
DelegatorData(
"BaggingClassifier",
BaggingClassifier,
skip_methods=[
"transform",
"inverse_transform",
"score",
"predict_proba",
"predict_log_proba",
"predict",
],
),
DelegatorData(
"SelfTrainingClassifier",
lambda est: SelfTrainingClassifier(est),
skip_methods=["transform", "inverse_transform", "predict_proba"],
),
]
def test_metaestimator_delegation():
# Ensures specified metaestimators have methods iff subestimator does
def hides(method):
@property
def wrapper(obj):
if obj.hidden_method == method.__name__:
raise AttributeError("%r is hidden" % obj.hidden_method)
return functools.partial(method, obj)
return wrapper
class SubEstimator(BaseEstimator):
def __init__(self, param=1, hidden_method=None):
self.param = param
self.hidden_method = hidden_method
def fit(self, X, y=None, *args, **kwargs):
self.coef_ = np.arange(X.shape[1])
self.classes_ = []
return True
def _check_fit(self):
check_is_fitted(self)
@hides
def inverse_transform(self, X, *args, **kwargs):
self._check_fit()
return X
@hides
def transform(self, X, *args, **kwargs):
self._check_fit()
return X
@hides
def predict(self, X, *args, **kwargs):
self._check_fit()
return np.ones(X.shape[0])
@hides
def predict_proba(self, X, *args, **kwargs):
self._check_fit()
return np.ones(X.shape[0])
@hides
def predict_log_proba(self, X, *args, **kwargs):
self._check_fit()
return np.ones(X.shape[0])
@hides
def decision_function(self, X, *args, **kwargs):
self._check_fit()
return np.ones(X.shape[0])
@hides
def score(self, X, y, *args, **kwargs):
self._check_fit()
return 1.0
methods = [
k
for k in SubEstimator.__dict__.keys()
if not k.startswith("_") and not k.startswith("fit")
]
methods.sort()
for delegator_data in DELEGATING_METAESTIMATORS:
delegate = SubEstimator()
delegator = delegator_data.construct(delegate)
for method in methods:
if method in delegator_data.skip_methods:
continue
assert hasattr(delegate, method)
assert hasattr(
delegator, method
), "%s does not have method %r when its delegate does" % (
delegator_data.name,
method,
)
# delegation before fit raises a NotFittedError
if method == "score":
with pytest.raises(NotFittedError):
getattr(delegator, method)(
delegator_data.fit_args[0], delegator_data.fit_args[1]
)
else:
with pytest.raises(NotFittedError):
getattr(delegator, method)(delegator_data.fit_args[0])
delegator.fit(*delegator_data.fit_args)
for method in methods:
if method in delegator_data.skip_methods:
continue
# smoke test delegation
if method == "score":
getattr(delegator, method)(
delegator_data.fit_args[0], delegator_data.fit_args[1]
)
else:
getattr(delegator, method)(delegator_data.fit_args[0])
for method in methods:
if method in delegator_data.skip_methods:
continue
delegate = SubEstimator(hidden_method=method)
delegator = delegator_data.construct(delegate)
assert not hasattr(delegate, method)
assert not hasattr(
delegator, method
), "%s has method %r when its delegate does not" % (
delegator_data.name,
method,
)
def _generate_meta_estimator_instances_with_pipeline():
"""Generate instances of meta-estimators fed with a pipeline
Are considered meta-estimators all estimators accepting one of "estimator",
"base_estimator" or "estimators".
"""
for _, Estimator in sorted(all_estimators()):
sig = set(signature(Estimator).parameters)
if "estimator" in sig or "base_estimator" in sig or "regressor" in sig:
if is_regressor(Estimator):
estimator = make_pipeline(TfidfVectorizer(), Ridge())
param_grid = {"ridge__alpha": [0.1, 1.0]}
else:
estimator = make_pipeline(TfidfVectorizer(), LogisticRegression())
param_grid = {"logisticregression__C": [0.1, 1.0]}
if "param_grid" in sig or "param_distributions" in sig:
# SearchCV estimators
extra_params = {"n_iter": 2} if "n_iter" in sig else {}
yield Estimator(estimator, param_grid, **extra_params)
else:
yield Estimator(estimator)
elif "transformer_list" in sig:
# FeatureUnion
transformer_list = [
("trans1", make_pipeline(TfidfVectorizer(), MaxAbsScaler())),
(
"trans2",
make_pipeline(TfidfVectorizer(), StandardScaler(with_mean=False)),
),
]
yield Estimator(transformer_list)
elif "estimators" in sig:
# stacking, voting
if is_regressor(Estimator):
estimator = [
("est1", make_pipeline(TfidfVectorizer(), Ridge(alpha=0.1))),
("est2", make_pipeline(TfidfVectorizer(), Ridge(alpha=1))),
]
else:
estimator = [
(
"est1",
make_pipeline(TfidfVectorizer(), LogisticRegression(C=0.1)),
),
("est2", make_pipeline(TfidfVectorizer(), LogisticRegression(C=1))),
]
yield Estimator(estimator)
else:
continue
# TODO: remove data validation for the following estimators
# They should be able to work on any data and delegate data validation to
# their inner estimator(s).
DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE = [
"AdaBoostClassifier",
"AdaBoostRegressor",
"BaggingClassifier",
"BaggingRegressor",
"ClassifierChain", # data validation is necessary
"IterativeImputer",
"OneVsOneClassifier", # input validation can't be avoided
"RANSACRegressor",
"RFE",
"RFECV",
"RegressorChain", # data validation is necessary
"SelfTrainingClassifier",
"SequentialFeatureSelector", # not applicable (2D data mandatory)
]
DATA_VALIDATION_META_ESTIMATORS = [
est
for est in _generate_meta_estimator_instances_with_pipeline()
if est.__class__.__name__ not in DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE
]
def _get_meta_estimator_id(estimator):
return estimator.__class__.__name__
@pytest.mark.parametrize(
"estimator", DATA_VALIDATION_META_ESTIMATORS, ids=_get_meta_estimator_id
)
def test_meta_estimators_delegate_data_validation(estimator):
# Check that meta-estimators delegate data validation to the inner
# estimator(s).
rng = np.random.RandomState(0)
set_random_state(estimator)
n_samples = 30
X = rng.choice(np.array(["aa", "bb", "cc"], dtype=object), size=n_samples)
if is_regressor(estimator):
y = rng.normal(size=n_samples)
else:
y = rng.randint(3, size=n_samples)
# We convert to lists to make sure it works on array-like
X = _enforce_estimator_tags_x(estimator, X).tolist()
y = _enforce_estimator_tags_y(estimator, y).tolist()
# Calling fit should not raise any data validation exception since X is a
# valid input datastructure for the first step of the pipeline passed as
# base estimator to the meta estimator.
estimator.fit(X, y)
# n_features_in_ should not be defined since data is not tabular data.
assert not hasattr(estimator, "n_features_in_")

View File

@@ -0,0 +1,52 @@
"""Tests for the minimum dependencies in the README.rst file."""
import os
import re
import platform
from pathlib import Path
import pytest
import sklearn
from sklearn._min_dependencies import dependent_packages
from sklearn.utils.fixes import parse_version
def test_min_dependencies_readme():
# Test that the minimum dependencies in the README.rst file are
# consistent with the minimum dependencies defined at the file:
# sklearn/_min_dependencies.py
if platform.python_implementation() == "PyPy":
pytest.skip("PyPy does not always share the same minimum deps")
pattern = re.compile(
r"(\.\. \|)"
+ r"(([A-Za-z]+\-?)+)"
+ r"(MinVersion\| replace::)"
+ r"( [0-9]+\.[0-9]+(\.[0-9]+)?)"
)
readme_path = Path(sklearn.__path__[0]).parents[0]
readme_file = readme_path / "README.rst"
if not os.path.exists(readme_file):
# Skip the test if the README.rst file is not available.
# For instance, when installing scikit-learn from wheels
pytest.skip("The README.rst file is not available.")
with readme_file.open("r") as f:
for line in f:
matched = pattern.match(line)
if not matched:
continue
package, version = matched.group(2), matched.group(5)
package = package.lower()
if package in dependent_packages:
version = parse_version(version)
min_version = parse_version(dependent_packages[package][0])
assert version == min_version, f"{package} has a mismatched version"

View File

@@ -0,0 +1,926 @@
import numpy as np
import scipy.sparse as sp
import pytest
from numpy.testing import assert_allclose
from re import escape
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._mocking import CheckingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.utils.multiclass import check_classification_targets, type_of_target
from sklearn.utils import (
check_array,
shuffle,
)
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import (
LinearRegression,
Lasso,
ElasticNet,
Ridge,
Perceptron,
LogisticRegression,
SGDClassifier,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import svm
from sklearn.exceptions import NotFittedError
from sklearn import datasets
from sklearn.datasets import load_breast_cancer
iris = datasets.load_iris()
rng = np.random.RandomState(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
n_classes = 3
def test_ovr_exceptions():
ovr = OneVsRestClassifier(LinearSVC(random_state=0))
# test predicting without fitting
with pytest.raises(NotFittedError):
ovr.predict([])
# Fail on multioutput data
msg = "Multioutput target data is not supported with label binarization"
with pytest.raises(ValueError, match=msg):
X = np.array([[1, 0], [0, 1]])
y = np.array([[1, 2], [3, 1]])
OneVsRestClassifier(MultinomialNB()).fit(X, y)
with pytest.raises(ValueError, match=msg):
X = np.array([[1, 0], [0, 1]])
y = np.array([[1.5, 2.4], [3.1, 0.8]])
OneVsRestClassifier(MultinomialNB()).fit(X, y)
def test_check_classification_targets():
# Test that check_classification_target return correct type. #5782
y = np.array([0.0, 1.1, 2.0, 3.0])
msg = type_of_target(y)
with pytest.raises(ValueError, match=msg):
check_classification_targets(y)
def test_ovr_fit_predict():
# A classifier which implements decision_function.
ovr = OneVsRestClassifier(LinearSVC(random_state=0))
pred = ovr.fit(iris.data, iris.target).predict(iris.data)
assert len(ovr.estimators_) == n_classes
clf = LinearSVC(random_state=0)
pred2 = clf.fit(iris.data, iris.target).predict(iris.data)
assert np.mean(iris.target == pred) == np.mean(iris.target == pred2)
# A classifier which implements predict_proba.
ovr = OneVsRestClassifier(MultinomialNB())
pred = ovr.fit(iris.data, iris.target).predict(iris.data)
assert np.mean(iris.target == pred) > 0.65
def test_ovr_partial_fit():
# Test if partial_fit is working as intended
X, y = shuffle(iris.data, iris.target, random_state=0)
ovr = OneVsRestClassifier(MultinomialNB())
ovr.partial_fit(X[:100], y[:100], np.unique(y))
ovr.partial_fit(X[100:], y[100:])
pred = ovr.predict(X)
ovr2 = OneVsRestClassifier(MultinomialNB())
pred2 = ovr2.fit(X, y).predict(X)
assert_almost_equal(pred, pred2)
assert len(ovr.estimators_) == len(np.unique(y))
assert np.mean(y == pred) > 0.65
# Test when mini batches doesn't have all classes
# with SGDClassifier
X = np.abs(np.random.randn(14, 2))
y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
ovr = OneVsRestClassifier(
SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0)
)
ovr.partial_fit(X[:7], y[:7], np.unique(y))
ovr.partial_fit(X[7:], y[7:])
pred = ovr.predict(X)
ovr1 = OneVsRestClassifier(
SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0)
)
pred1 = ovr1.fit(X, y).predict(X)
assert np.mean(pred == y) == np.mean(pred1 == y)
# test partial_fit only exists if estimator has it:
ovr = OneVsRestClassifier(SVC())
assert not hasattr(ovr, "partial_fit")
def test_ovr_partial_fit_exceptions():
ovr = OneVsRestClassifier(MultinomialNB())
X = np.abs(np.random.randn(14, 2))
y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
ovr.partial_fit(X[:7], y[:7], np.unique(y))
# If a new class that was not in the first call of partial fit is seen
# it should raise ValueError
y1 = [5] + y[7:-1]
msg = r"Mini-batch contains \[.+\] while classes must be subset of \[.+\]"
with pytest.raises(ValueError, match=msg):
ovr.partial_fit(X=X[7:], y=y1)
def test_ovr_ovo_regressor():
# test that ovr and ovo work on regressors which don't have a decision_
# function
ovr = OneVsRestClassifier(DecisionTreeRegressor())
pred = ovr.fit(iris.data, iris.target).predict(iris.data)
assert len(ovr.estimators_) == n_classes
assert_array_equal(np.unique(pred), [0, 1, 2])
# we are doing something sensible
assert np.mean(pred == iris.target) > 0.9
ovr = OneVsOneClassifier(DecisionTreeRegressor())
pred = ovr.fit(iris.data, iris.target).predict(iris.data)
assert len(ovr.estimators_) == n_classes * (n_classes - 1) / 2
assert_array_equal(np.unique(pred), [0, 1, 2])
# we are doing something sensible
assert np.mean(pred == iris.target) > 0.9
def test_ovr_fit_predict_sparse():
for sparse in [
sp.csr_matrix,
sp.csc_matrix,
sp.coo_matrix,
sp.dok_matrix,
sp.lil_matrix,
]:
base_clf = MultinomialNB(alpha=1)
X, Y = datasets.make_multilabel_classification(
n_samples=100,
n_features=20,
n_classes=5,
n_labels=3,
length=50,
allow_unlabeled=True,
random_state=0,
)
X_train, Y_train = X[:80], Y[:80]
X_test = X[80:]
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
Y_pred_sprs = clf_sprs.predict(X_test)
assert clf.multilabel_
assert sp.issparse(Y_pred_sprs)
assert_array_equal(Y_pred_sprs.toarray(), Y_pred)
# Test predict_proba
Y_proba = clf_sprs.predict_proba(X_test)
# predict assigns a label if the probability that the
# sample has the label is greater than 0.5.
pred = Y_proba > 0.5
assert_array_equal(pred, Y_pred_sprs.toarray())
# Test decision_function
clf = svm.SVC()
clf_sprs = OneVsRestClassifier(clf).fit(X_train, sparse(Y_train))
dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
def test_ovr_always_present():
# Test that ovr works with classes that are always present or absent.
# Note: tests is the case where _ConstantPredictor is utilised
X = np.ones((10, 2))
X[:5, :] = 0
# Build an indicator matrix where two features are always on.
# As list of lists, it would be: [[int(i >= 5), 2, 3] for i in range(10)]
y = np.zeros((10, 3))
y[5:, 0] = 1
y[:, 1] = 1
y[:, 2] = 1
ovr = OneVsRestClassifier(LogisticRegression())
msg = r"Label .+ is present in all training examples"
with pytest.warns(UserWarning, match=msg):
ovr.fit(X, y)
y_pred = ovr.predict(X)
assert_array_equal(np.array(y_pred), np.array(y))
y_pred = ovr.decision_function(X)
assert np.unique(y_pred[:, -2:]) == 1
y_pred = ovr.predict_proba(X)
assert_array_equal(y_pred[:, -1], np.ones(X.shape[0]))
# y has a constantly absent label
y = np.zeros((10, 2))
y[5:, 0] = 1 # variable label
ovr = OneVsRestClassifier(LogisticRegression())
msg = r"Label not 1 is present in all training examples"
with pytest.warns(UserWarning, match=msg):
ovr.fit(X, y)
y_pred = ovr.predict_proba(X)
assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
def test_ovr_multiclass():
# Toy dataset where features correspond directly to labels.
X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
y = ["eggs", "spam", "ham", "eggs", "ham"]
Y = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1], [1, 0, 0]])
classes = set("ham eggs spam".split())
for base_clf in (
MultinomialNB(),
LinearSVC(random_state=0),
LinearRegression(),
Ridge(),
ElasticNet(),
):
clf = OneVsRestClassifier(base_clf).fit(X, y)
assert set(clf.classes_) == classes
y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
assert_array_equal(y_pred, ["eggs"])
# test input as label indicator matrix
clf = OneVsRestClassifier(base_clf).fit(X, Y)
y_pred = clf.predict([[0, 0, 4]])[0]
assert_array_equal(y_pred, [0, 0, 1])
def test_ovr_binary():
# Toy dataset where features correspond directly to labels.
X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
y = ["eggs", "spam", "spam", "eggs", "spam"]
Y = np.array([[0, 1, 1, 0, 1]]).T
classes = set("eggs spam".split())
def conduct_test(base_clf, test_predict_proba=False):
clf = OneVsRestClassifier(base_clf).fit(X, y)
assert set(clf.classes_) == classes
y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
assert_array_equal(y_pred, ["eggs"])
if hasattr(base_clf, "decision_function"):
dec = clf.decision_function(X)
assert dec.shape == (5,)
if test_predict_proba:
X_test = np.array([[0, 0, 4]])
probabilities = clf.predict_proba(X_test)
assert 2 == len(probabilities[0])
assert clf.classes_[np.argmax(probabilities, axis=1)] == clf.predict(X_test)
# test input as label indicator matrix
clf = OneVsRestClassifier(base_clf).fit(X, Y)
y_pred = clf.predict([[3, 0, 0]])[0]
assert y_pred == 1
for base_clf in (
LinearSVC(random_state=0),
LinearRegression(),
Ridge(),
ElasticNet(),
):
conduct_test(base_clf)
for base_clf in (MultinomialNB(), SVC(probability=True), LogisticRegression()):
conduct_test(base_clf, test_predict_proba=True)
def test_ovr_multilabel():
# Toy dataset where features correspond directly to labels.
X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1], [1, 0, 0]])
for base_clf in (
MultinomialNB(),
LinearSVC(random_state=0),
LinearRegression(),
Ridge(),
ElasticNet(),
Lasso(alpha=0.5),
):
clf = OneVsRestClassifier(base_clf).fit(X, y)
y_pred = clf.predict([[0, 4, 4]])[0]
assert_array_equal(y_pred, [0, 1, 1])
assert clf.multilabel_
def test_ovr_fit_predict_svc():
ovr = OneVsRestClassifier(svm.SVC())
ovr.fit(iris.data, iris.target)
assert len(ovr.estimators_) == 3
assert ovr.score(iris.data, iris.target) > 0.9
def test_ovr_multilabel_dataset():
base_clf = MultinomialNB(alpha=1)
for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
X, Y = datasets.make_multilabel_classification(
n_samples=100,
n_features=20,
n_classes=5,
n_labels=2,
length=50,
allow_unlabeled=au,
random_state=0,
)
X_train, Y_train = X[:80], Y[:80]
X_test, Y_test = X[80:], Y[80:]
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
assert clf.multilabel_
assert_almost_equal(
precision_score(Y_test, Y_pred, average="micro"), prec, decimal=2
)
assert_almost_equal(
recall_score(Y_test, Y_pred, average="micro"), recall, decimal=2
)
def test_ovr_multilabel_predict_proba():
base_clf = MultinomialNB(alpha=1)
for au in (False, True):
X, Y = datasets.make_multilabel_classification(
n_samples=100,
n_features=20,
n_classes=5,
n_labels=3,
length=50,
allow_unlabeled=au,
random_state=0,
)
X_train, Y_train = X[:80], Y[:80]
X_test = X[80:]
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
# Decision function only estimator.
decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
assert not hasattr(decision_only, "predict_proba")
# Estimator with predict_proba disabled, depending on parameters.
decision_only = OneVsRestClassifier(svm.SVC(probability=False))
assert not hasattr(decision_only, "predict_proba")
decision_only.fit(X_train, Y_train)
assert not hasattr(decision_only, "predict_proba")
assert hasattr(decision_only, "decision_function")
# Estimator which can get predict_proba enabled after fitting
gs = GridSearchCV(
svm.SVC(probability=False), param_grid={"probability": [True]}
)
proba_after_fit = OneVsRestClassifier(gs)
assert not hasattr(proba_after_fit, "predict_proba")
proba_after_fit.fit(X_train, Y_train)
assert hasattr(proba_after_fit, "predict_proba")
Y_pred = clf.predict(X_test)
Y_proba = clf.predict_proba(X_test)
# predict assigns a label if the probability that the
# sample has the label is greater than 0.5.
pred = Y_proba > 0.5
assert_array_equal(pred, Y_pred)
def test_ovr_single_label_predict_proba():
base_clf = MultinomialNB(alpha=1)
X, Y = iris.data, iris.target
X_train, Y_train = X[:80], Y[:80]
X_test = X[80:]
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
# Decision function only estimator.
decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
assert not hasattr(decision_only, "predict_proba")
Y_pred = clf.predict(X_test)
Y_proba = clf.predict_proba(X_test)
assert_almost_equal(Y_proba.sum(axis=1), 1.0)
# predict assigns a label if the probability that the
# sample has the label with the greatest predictive probability.
pred = Y_proba.argmax(axis=1)
assert not (pred - Y_pred).any()
def test_ovr_multilabel_decision_function():
X, Y = datasets.make_multilabel_classification(
n_samples=100,
n_features=20,
n_classes=5,
n_labels=3,
length=50,
allow_unlabeled=True,
random_state=0,
)
X_train, Y_train = X[:80], Y[:80]
X_test = X[80:]
clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
assert_array_equal(
(clf.decision_function(X_test) > 0).astype(int), clf.predict(X_test)
)
def test_ovr_single_label_decision_function():
X, Y = datasets.make_classification(n_samples=100, n_features=20, random_state=0)
X_train, Y_train = X[:80], Y[:80]
X_test = X[80:]
clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
assert_array_equal(clf.decision_function(X_test).ravel() > 0, clf.predict(X_test))
def test_ovr_gridsearch():
ovr = OneVsRestClassifier(LinearSVC(random_state=0))
Cs = [0.1, 0.5, 0.8]
cv = GridSearchCV(ovr, {"estimator__C": Cs})
cv.fit(iris.data, iris.target)
best_C = cv.best_estimator_.estimators_[0].C
assert best_C in Cs
def test_ovr_pipeline():
# Test with pipeline of length one
# This test is needed because the multiclass estimators may fail to detect
# the presence of predict_proba or decision_function.
clf = Pipeline([("tree", DecisionTreeClassifier())])
ovr_pipe = OneVsRestClassifier(clf)
ovr_pipe.fit(iris.data, iris.target)
ovr = OneVsRestClassifier(DecisionTreeClassifier())
ovr.fit(iris.data, iris.target)
assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data))
def test_ovo_exceptions():
ovo = OneVsOneClassifier(LinearSVC(random_state=0))
with pytest.raises(NotFittedError):
ovo.predict([])
def test_ovo_fit_on_list():
# Test that OneVsOne fitting works with a list of targets and yields the
# same output as predict from an array
ovo = OneVsOneClassifier(LinearSVC(random_state=0))
prediction_from_array = ovo.fit(iris.data, iris.target).predict(iris.data)
iris_data_list = [list(a) for a in iris.data]
prediction_from_list = ovo.fit(iris_data_list, list(iris.target)).predict(
iris_data_list
)
assert_array_equal(prediction_from_array, prediction_from_list)
def test_ovo_fit_predict():
# A classifier which implements decision_function.
ovo = OneVsOneClassifier(LinearSVC(random_state=0))
ovo.fit(iris.data, iris.target).predict(iris.data)
assert len(ovo.estimators_) == n_classes * (n_classes - 1) / 2
# A classifier which implements predict_proba.
ovo = OneVsOneClassifier(MultinomialNB())
ovo.fit(iris.data, iris.target).predict(iris.data)
assert len(ovo.estimators_) == n_classes * (n_classes - 1) / 2
def test_ovo_partial_fit_predict():
temp = datasets.load_iris()
X, y = temp.data, temp.target
ovo1 = OneVsOneClassifier(MultinomialNB())
ovo1.partial_fit(X[:100], y[:100], np.unique(y))
ovo1.partial_fit(X[100:], y[100:])
pred1 = ovo1.predict(X)
ovo2 = OneVsOneClassifier(MultinomialNB())
ovo2.fit(X, y)
pred2 = ovo2.predict(X)
assert len(ovo1.estimators_) == n_classes * (n_classes - 1) / 2
assert np.mean(y == pred1) > 0.65
assert_almost_equal(pred1, pred2)
# Test when mini-batches have binary target classes
ovo1 = OneVsOneClassifier(MultinomialNB())
ovo1.partial_fit(X[:60], y[:60], np.unique(y))
ovo1.partial_fit(X[60:], y[60:])
pred1 = ovo1.predict(X)
ovo2 = OneVsOneClassifier(MultinomialNB())
pred2 = ovo2.fit(X, y).predict(X)
assert_almost_equal(pred1, pred2)
assert len(ovo1.estimators_) == len(np.unique(y))
assert np.mean(y == pred1) > 0.65
ovo = OneVsOneClassifier(MultinomialNB())
X = np.random.rand(14, 2)
y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2]
ovo.partial_fit(X[:7], y[:7], [0, 1, 2, 3, 4])
ovo.partial_fit(X[7:], y[7:])
pred = ovo.predict(X)
ovo2 = OneVsOneClassifier(MultinomialNB())
pred2 = ovo2.fit(X, y).predict(X)
assert_almost_equal(pred, pred2)
# raises error when mini-batch does not have classes from all_classes
ovo = OneVsOneClassifier(MultinomialNB())
error_y = [0, 1, 2, 3, 4, 5, 2]
message_re = escape(
"Mini-batch contains {0} while it must be subset of {1}".format(
np.unique(error_y), np.unique(y)
)
)
with pytest.raises(ValueError, match=message_re):
ovo.partial_fit(X[:7], error_y, np.unique(y))
# test partial_fit only exists if estimator has it:
ovr = OneVsOneClassifier(SVC())
assert not hasattr(ovr, "partial_fit")
def test_ovo_decision_function():
n_samples = iris.data.shape[0]
ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))
# first binary
ovo_clf.fit(iris.data, iris.target == 0)
decisions = ovo_clf.decision_function(iris.data)
assert decisions.shape == (n_samples,)
# then multi-class
ovo_clf.fit(iris.data, iris.target)
decisions = ovo_clf.decision_function(iris.data)
assert decisions.shape == (n_samples, n_classes)
assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))
# Compute the votes
votes = np.zeros((n_samples, n_classes))
k = 0
for i in range(n_classes):
for j in range(i + 1, n_classes):
pred = ovo_clf.estimators_[k].predict(iris.data)
votes[pred == 0, i] += 1
votes[pred == 1, j] += 1
k += 1
# Extract votes and verify
assert_array_equal(votes, np.round(decisions))
for class_idx in range(n_classes):
# For each sample and each class, there only 3 possible vote levels
# because they are only 3 distinct class pairs thus 3 distinct
# binary classifiers.
# Therefore, sorting predictions based on votes would yield
# mostly tied predictions:
assert set(votes[:, class_idx]).issubset(set([0.0, 1.0, 2.0]))
# The OVO decision function on the other hand is able to resolve
# most of the ties on this data as it combines both the vote counts
# and the aggregated confidence levels of the binary classifiers
# to compute the aggregate decision function. The iris dataset
# has 150 samples with a couple of duplicates. The OvO decisions
# can resolve most of the ties:
assert len(np.unique(decisions[:, class_idx])) > 146
def test_ovo_gridsearch():
ovo = OneVsOneClassifier(LinearSVC(random_state=0))
Cs = [0.1, 0.5, 0.8]
cv = GridSearchCV(ovo, {"estimator__C": Cs})
cv.fit(iris.data, iris.target)
best_C = cv.best_estimator_.estimators_[0].C
assert best_C in Cs
def test_ovo_ties():
# Test that ties are broken using the decision function,
# not defaulting to the smallest label
X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
y = np.array([2, 0, 1, 2])
multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4, tol=None))
ovo_prediction = multi_clf.fit(X, y).predict(X)
ovo_decision = multi_clf.decision_function(X)
# Classifiers are in order 0-1, 0-2, 1-2
# Use decision_function to compute the votes and the normalized
# sum_of_confidences, which is used to disambiguate when there is a tie in
# votes.
votes = np.round(ovo_decision)
normalized_confidences = ovo_decision - votes
# For the first point, there is one vote per class
assert_array_equal(votes[0, :], 1)
# For the rest, there is no tie and the prediction is the argmax
assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
# For the tie, the prediction is the class with the highest score
assert ovo_prediction[0] == normalized_confidences[0].argmax()
def test_ovo_ties2():
# test that ties can not only be won by the first two labels
X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
y_ref = np.array([2, 0, 1, 2])
# cycle through labels so that each label wins once
for i in range(3):
y = (y_ref + i) % 3
multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4, tol=None))
ovo_prediction = multi_clf.fit(X, y).predict(X)
assert ovo_prediction[0] == i % 3
def test_ovo_string_y():
# Test that the OvO doesn't mess up the encoding of string labels
X = np.eye(4)
y = np.array(["a", "b", "c", "d"])
ovo = OneVsOneClassifier(LinearSVC())
ovo.fit(X, y)
assert_array_equal(y, ovo.predict(X))
def test_ovo_one_class():
# Test error for OvO with one class
X = np.eye(4)
y = np.array(["a"] * 4)
ovo = OneVsOneClassifier(LinearSVC())
msg = "when only one class"
with pytest.raises(ValueError, match=msg):
ovo.fit(X, y)
def test_ovo_float_y():
# Test that the OvO errors on float targets
X = iris.data
y = iris.data[:, 0]
ovo = OneVsOneClassifier(LinearSVC())
msg = "Unknown label type"
with pytest.raises(ValueError, match=msg):
ovo.fit(X, y)
def test_ecoc_exceptions():
ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
with pytest.raises(NotFittedError):
ecoc.predict([])
def test_ecoc_fit_predict():
# A classifier which implements decision_function.
ecoc = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
ecoc.fit(iris.data, iris.target).predict(iris.data)
assert len(ecoc.estimators_) == n_classes * 2
# A classifier which implements predict_proba.
ecoc = OutputCodeClassifier(MultinomialNB(), code_size=2, random_state=0)
ecoc.fit(iris.data, iris.target).predict(iris.data)
assert len(ecoc.estimators_) == n_classes * 2
def test_ecoc_gridsearch():
ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0)
Cs = [0.1, 0.5, 0.8]
cv = GridSearchCV(ecoc, {"estimator__C": Cs})
cv.fit(iris.data, iris.target)
best_C = cv.best_estimator_.estimators_[0].C
assert best_C in Cs
def test_ecoc_float_y():
# Test that the OCC errors on float targets
X = iris.data
y = iris.data[:, 0]
ovo = OutputCodeClassifier(LinearSVC())
msg = "Unknown label type"
with pytest.raises(ValueError, match=msg):
ovo.fit(X, y)
ovo = OutputCodeClassifier(LinearSVC(), code_size=-1)
msg = "code_size should be greater than 0, got -1"
with pytest.raises(ValueError, match=msg):
ovo.fit(X, y)
def test_ecoc_delegate_sparse_base_estimator():
# Non-regression test for
# https://github.com/scikit-learn/scikit-learn/issues/17218
X, y = iris.data, iris.target
X_sp = sp.csc_matrix(X)
# create an estimator that does not support sparse input
base_estimator = CheckingClassifier(
check_X=check_array,
check_X_params={"ensure_2d": True, "accept_sparse": False},
)
ecoc = OutputCodeClassifier(base_estimator, random_state=0)
with pytest.raises(TypeError, match="A sparse matrix was passed"):
ecoc.fit(X_sp, y)
ecoc.fit(X, y)
with pytest.raises(TypeError, match="A sparse matrix was passed"):
ecoc.predict(X_sp)
# smoke test to check when sparse input should be supported
ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
ecoc.fit(X_sp, y).predict(X_sp)
assert len(ecoc.estimators_) == 4
def test_pairwise_indices():
clf_precomputed = svm.SVC(kernel="precomputed")
X, y = iris.data, iris.target
ovr_false = OneVsOneClassifier(clf_precomputed)
linear_kernel = np.dot(X, X.T)
ovr_false.fit(linear_kernel, y)
n_estimators = len(ovr_false.estimators_)
precomputed_indices = ovr_false.pairwise_indices_
for idx in precomputed_indices:
assert (
idx.shape[0] * n_estimators / (n_estimators - 1) == linear_kernel.shape[0]
)
def test_pairwise_n_features_in():
"""Check the n_features_in_ attributes of the meta and base estimators
When the training data is a regular design matrix, everything is intuitive.
However, when the training data is a precomputed kernel matrix, the
multiclass strategy can resample the kernel matrix of the underlying base
estimator both row-wise and column-wise and this has a non-trivial impact
on the expected value for the n_features_in_ of both the meta and the base
estimators.
"""
X, y = iris.data, iris.target
# Remove the last sample to make the classes not exactly balanced and make
# the test more interesting.
assert y[-1] == 0
X = X[:-1]
y = y[:-1]
# Fitting directly on the design matrix:
assert X.shape == (149, 4)
clf_notprecomputed = svm.SVC(kernel="linear").fit(X, y)
assert clf_notprecomputed.n_features_in_ == 4
ovr_notprecomputed = OneVsRestClassifier(clf_notprecomputed).fit(X, y)
assert ovr_notprecomputed.n_features_in_ == 4
for est in ovr_notprecomputed.estimators_:
assert est.n_features_in_ == 4
ovo_notprecomputed = OneVsOneClassifier(clf_notprecomputed).fit(X, y)
assert ovo_notprecomputed.n_features_in_ == 4
assert ovo_notprecomputed.n_classes_ == 3
assert len(ovo_notprecomputed.estimators_) == 3
for est in ovo_notprecomputed.estimators_:
assert est.n_features_in_ == 4
# When working with precomputed kernels we have one "feature" per training
# sample:
K = X @ X.T
assert K.shape == (149, 149)
clf_precomputed = svm.SVC(kernel="precomputed").fit(K, y)
assert clf_precomputed.n_features_in_ == 149
ovr_precomputed = OneVsRestClassifier(clf_precomputed).fit(K, y)
assert ovr_precomputed.n_features_in_ == 149
assert ovr_precomputed.n_classes_ == 3
assert len(ovr_precomputed.estimators_) == 3
for est in ovr_precomputed.estimators_:
assert est.n_features_in_ == 149
# This becomes really interesting with OvO and precomputed kernel together:
# internally, OvO will drop the samples of the classes not part of the pair
# of classes under consideration for a given binary classifier. Since we
# use a precomputed kernel, it will also drop the matching columns of the
# kernel matrix, and therefore we have fewer "features" as result.
#
# Since class 0 has 49 samples, and class 1 and 2 have 50 samples each, a
# single OvO binary classifier works with a sub-kernel matrix of shape
# either (99, 99) or (100, 100).
ovo_precomputed = OneVsOneClassifier(clf_precomputed).fit(K, y)
assert ovo_precomputed.n_features_in_ == 149
assert ovr_precomputed.n_classes_ == 3
assert len(ovr_precomputed.estimators_) == 3
assert ovo_precomputed.estimators_[0].n_features_in_ == 99 # class 0 vs class 1
assert ovo_precomputed.estimators_[1].n_features_in_ == 99 # class 0 vs class 2
assert ovo_precomputed.estimators_[2].n_features_in_ == 100 # class 1 vs class 2
@pytest.mark.parametrize(
"MultiClassClassifier", [OneVsRestClassifier, OneVsOneClassifier]
)
def test_pairwise_tag(MultiClassClassifier):
clf_precomputed = svm.SVC(kernel="precomputed")
clf_notprecomputed = svm.SVC()
ovr_false = MultiClassClassifier(clf_notprecomputed)
assert not ovr_false._get_tags()["pairwise"]
ovr_true = MultiClassClassifier(clf_precomputed)
assert ovr_true._get_tags()["pairwise"]
@pytest.mark.parametrize(
"MultiClassClassifier", [OneVsRestClassifier, OneVsOneClassifier]
)
def test_pairwise_cross_val_score(MultiClassClassifier):
clf_precomputed = svm.SVC(kernel="precomputed")
clf_notprecomputed = svm.SVC(kernel="linear")
X, y = iris.data, iris.target
multiclass_clf_notprecomputed = MultiClassClassifier(clf_notprecomputed)
multiclass_clf_precomputed = MultiClassClassifier(clf_precomputed)
linear_kernel = np.dot(X, X.T)
score_not_precomputed = cross_val_score(
multiclass_clf_notprecomputed, X, y, error_score="raise"
)
score_precomputed = cross_val_score(
multiclass_clf_precomputed, linear_kernel, y, error_score="raise"
)
assert_array_equal(score_precomputed, score_not_precomputed)
@pytest.mark.parametrize(
"MultiClassClassifier", [OneVsRestClassifier, OneVsOneClassifier]
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiClassClassifier):
# smoke test to check that pipeline OvR and OvO classifiers are letting
# the validation of missing values to
# the underlying pipeline or classifiers
rng = np.random.RandomState(42)
X, y = iris.data, iris.target
X = np.copy(X) # Copy to avoid that the original data is modified
mask = rng.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
X[mask] = np.nan
lr = make_pipeline(SimpleImputer(), LogisticRegression(random_state=rng))
MultiClassClassifier(lr).fit(X, y).score(X, y)
@pytest.mark.parametrize("make_y", [np.ones, np.zeros])
def test_constant_int_target(make_y):
"""Check that constant y target does not raise.
Non-regression test for #21869
"""
X = np.ones((10, 2))
y = make_y((10, 1), dtype=np.int32)
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X, y)
y_pred = ovr.predict_proba(X)
expected = np.zeros((X.shape[0], 2))
expected[:, 0] = 1
assert_allclose(y_pred, expected)
def test_ovo_consistent_binary_classification():
"""Check that ovo is consistent with binary classifier.
Non-regression test for #13617.
"""
X, y = load_breast_cancer(return_X_y=True)
clf = KNeighborsClassifier(n_neighbors=8, weights="distance")
ovo = OneVsOneClassifier(clf)
clf.fit(X, y)
ovo.fit(X, y)
assert_array_equal(clf.predict(X), ovo.predict(X))

View File

@@ -0,0 +1,718 @@
import pytest
import numpy as np
import scipy.sparse as sp
from joblib import cpu_count
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn import datasets
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.datasets import load_linnerud
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import jaccard_score, mean_squared_error
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain, RegressorChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVC
from sklearn.base import ClassifierMixin
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor
def test_multi_target_regression():
X, y = datasets.make_regression(n_targets=3, random_state=0)
X_train, y_train = X[:50], y[:50]
X_test, y_test = X[50:], y[50:]
references = np.zeros_like(y_test)
for n in range(3):
rgr = GradientBoostingRegressor(random_state=0)
rgr.fit(X_train, y_train[:, n])
references[:, n] = rgr.predict(X_test)
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
rgr.fit(X_train, y_train)
y_pred = rgr.predict(X_test)
assert_almost_equal(references, y_pred)
def test_multi_target_regression_partial_fit():
X, y = datasets.make_regression(n_targets=3, random_state=0)
X_train, y_train = X[:50], y[:50]
X_test, y_test = X[50:], y[50:]
references = np.zeros_like(y_test)
half_index = 25
for n in range(3):
sgr = SGDRegressor(random_state=0, max_iter=5)
sgr.partial_fit(X_train[:half_index], y_train[:half_index, n])
sgr.partial_fit(X_train[half_index:], y_train[half_index:, n])
references[:, n] = sgr.predict(X_test)
sgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
sgr.partial_fit(X_train[:half_index], y_train[:half_index])
sgr.partial_fit(X_train[half_index:], y_train[half_index:])
y_pred = sgr.predict(X_test)
assert_almost_equal(references, y_pred)
assert not hasattr(MultiOutputRegressor(Lasso), "partial_fit")
def test_multi_target_regression_one_target():
# Test multi target regression raises
X, y = datasets.make_regression(n_targets=1, random_state=0)
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
msg = "at least two dimensions"
with pytest.raises(ValueError, match=msg):
rgr.fit(X, y)
def test_multi_target_sparse_regression():
X, y = datasets.make_regression(n_targets=3, random_state=0)
X_train, y_train = X[:50], y[:50]
X_test = X[50:]
for sparse in [
sp.csr_matrix,
sp.csc_matrix,
sp.coo_matrix,
sp.dok_matrix,
sp.lil_matrix,
]:
rgr = MultiOutputRegressor(Lasso(random_state=0))
rgr_sparse = MultiOutputRegressor(Lasso(random_state=0))
rgr.fit(X_train, y_train)
rgr_sparse.fit(sparse(X_train), y_train)
assert_almost_equal(rgr.predict(X_test), rgr_sparse.predict(sparse(X_test)))
def test_multi_target_sample_weights_api():
X = [[1, 2, 3], [4, 5, 6]]
y = [[3.141, 2.718], [2.718, 3.141]]
w = [0.8, 0.6]
rgr = MultiOutputRegressor(OrthogonalMatchingPursuit())
msg = "does not support sample weights"
with pytest.raises(ValueError, match=msg):
rgr.fit(X, y, w)
# no exception should be raised if the base estimator supports weights
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
rgr.fit(X, y, w)
def test_multi_target_sample_weight_partial_fit():
# weighted regressor
X = [[1, 2, 3], [4, 5, 6]]
y = [[3.141, 2.718], [2.718, 3.141]]
w = [2.0, 1.0]
rgr_w = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
rgr_w.partial_fit(X, y, w)
# weighted with different weights
w = [2.0, 2.0]
rgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
rgr.partial_fit(X, y, w)
assert rgr.predict(X)[0][0] != rgr_w.predict(X)[0][0]
def test_multi_target_sample_weights():
# weighted regressor
Xw = [[1, 2, 3], [4, 5, 6]]
yw = [[3.141, 2.718], [2.718, 3.141]]
w = [2.0, 1.0]
rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
rgr_w.fit(Xw, yw, w)
# unweighted, but with repeated samples
X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
rgr.fit(X, y)
X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
# Import the data
iris = datasets.load_iris()
# create a multiple targets by randomized shuffling and concatenating y.
X = iris.data
y1 = iris.target
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
y = np.column_stack((y1, y2, y3))
n_samples, n_features = X.shape
n_outputs = y.shape[1]
n_classes = len(np.unique(y1))
classes = list(map(np.unique, (y1, y2, y3)))
def test_multi_output_classification_partial_fit_parallelism():
sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
mor.partial_fit(X, y, classes)
est1 = mor.estimators_[0]
mor.partial_fit(X, y)
est2 = mor.estimators_[0]
if cpu_count() > 1:
# parallelism requires this to be the case for a sane implementation
assert est1 is not est2
# check multioutput has predict_proba
def test_hasattr_multi_output_predict_proba():
# default SGDClassifier has loss='hinge'
# which does not expose a predict_proba method
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
multi_target_linear.fit(X, y)
assert not hasattr(multi_target_linear, "predict_proba")
# case where predict_proba attribute exists
sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
multi_target_linear.fit(X, y)
assert hasattr(multi_target_linear, "predict_proba")
# check predict_proba passes
def test_multi_output_predict_proba():
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, loss="log_loss")
param = {"loss": ("hinge", "log", "modified_huber")}
# inner function for custom scoring
def custom_scorer(estimator, X, y):
if hasattr(estimator, "predict_proba"):
return 1.0
else:
return 0.0
grid_clf = GridSearchCV(
sgd_linear_clf, param_grid=param, scoring=custom_scorer, cv=3
)
multi_target_linear = MultiOutputClassifier(grid_clf)
multi_target_linear.fit(X, y)
multi_target_linear.predict_proba(X)
# SGDClassifier defaults to loss='hinge' which is not a probabilistic
# loss function; therefore it does not expose a predict_proba method
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
multi_target_linear.fit(X, y)
err_msg = "probability estimates are not available for loss='hinge'"
with pytest.raises(AttributeError, match=err_msg):
multi_target_linear.predict_proba(X)
def test_multi_output_classification_partial_fit():
# test if multi_target initializes correctly with base estimator and fit
# assert predictions work as expected for predict
sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
# train the multi_target_linear and also get the predictions.
half_index = X.shape[0] // 2
multi_target_linear.partial_fit(X[:half_index], y[:half_index], classes=classes)
first_predictions = multi_target_linear.predict(X)
assert (n_samples, n_outputs) == first_predictions.shape
multi_target_linear.partial_fit(X[half_index:], y[half_index:])
second_predictions = multi_target_linear.predict(X)
assert (n_samples, n_outputs) == second_predictions.shape
# train the linear classification with each column and assert that
# predictions are equal after first partial_fit and second partial_fit
for i in range(3):
# create a clone with the same state
sgd_linear_clf = clone(sgd_linear_clf)
sgd_linear_clf.partial_fit(
X[:half_index], y[:half_index, i], classes=classes[i]
)
assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
def test_multi_output_classification_partial_fit_no_first_classes_exception():
sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
msg = "classes must be passed on the first call to partial_fit."
with pytest.raises(ValueError, match=msg):
multi_target_linear.partial_fit(X, y)
def test_multi_output_classification():
# test if multi_target initializes correctly with base estimator and fit
# assert predictions work as expected for predict, prodict_proba and score
forest = RandomForestClassifier(n_estimators=10, random_state=1)
multi_target_forest = MultiOutputClassifier(forest)
# train the multi_target_forest and also get the predictions.
multi_target_forest.fit(X, y)
predictions = multi_target_forest.predict(X)
assert (n_samples, n_outputs) == predictions.shape
predict_proba = multi_target_forest.predict_proba(X)
assert len(predict_proba) == n_outputs
for class_probabilities in predict_proba:
assert (n_samples, n_classes) == class_probabilities.shape
assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1), predictions)
# train the forest with each column and assert that predictions are equal
for i in range(3):
forest_ = clone(forest) # create a clone with the same state
forest_.fit(X, y[:, i])
assert list(forest_.predict(X)) == list(predictions[:, i])
assert_array_equal(list(forest_.predict_proba(X)), list(predict_proba[i]))
def test_multiclass_multioutput_estimator():
# test to check meta of meta estimators
svc = LinearSVC(random_state=0)
multi_class_svc = OneVsRestClassifier(svc)
multi_target_svc = MultiOutputClassifier(multi_class_svc)
multi_target_svc.fit(X, y)
predictions = multi_target_svc.predict(X)
assert (n_samples, n_outputs) == predictions.shape
# train the forest with each column and assert that predictions are equal
for i in range(3):
multi_class_svc_ = clone(multi_class_svc) # create a clone
multi_class_svc_.fit(X, y[:, i])
assert list(multi_class_svc_.predict(X)) == list(predictions[:, i])
def test_multiclass_multioutput_estimator_predict_proba():
seed = 542
# make test deterministic
rng = np.random.RandomState(seed)
# random features
X = rng.normal(size=(5, 5))
# random labels
y1 = np.array(["b", "a", "a", "b", "a"]).reshape(5, 1) # 2 classes
y2 = np.array(["d", "e", "f", "e", "d"]).reshape(5, 1) # 3 classes
Y = np.concatenate([y1, y2], axis=1)
clf = MultiOutputClassifier(
LogisticRegression(solver="liblinear", random_state=seed)
)
clf.fit(X, Y)
y_result = clf.predict_proba(X)
y_actual = [
np.array(
[
[0.23481764, 0.76518236],
[0.67196072, 0.32803928],
[0.54681448, 0.45318552],
[0.34883923, 0.65116077],
[0.73687069, 0.26312931],
]
),
np.array(
[
[0.5171785, 0.23878628, 0.24403522],
[0.22141451, 0.64102704, 0.13755846],
[0.16751315, 0.18256843, 0.64991843],
[0.27357372, 0.55201592, 0.17441036],
[0.65745193, 0.26062899, 0.08191907],
]
),
]
for i in range(len(y_actual)):
assert_almost_equal(y_result[i], y_actual[i])
def test_multi_output_classification_sample_weights():
# weighted classifier
Xw = [[1, 2, 3], [4, 5, 6]]
yw = [[3, 2], [2, 3]]
w = np.asarray([2.0, 1.0])
forest = RandomForestClassifier(n_estimators=10, random_state=1)
clf_w = MultiOutputClassifier(forest)
clf_w.fit(Xw, yw, w)
# unweighted, but with repeated samples
X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
y = [[3, 2], [3, 2], [2, 3]]
forest = RandomForestClassifier(n_estimators=10, random_state=1)
clf = MultiOutputClassifier(forest)
clf.fit(X, y)
X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
def test_multi_output_classification_partial_fit_sample_weights():
# weighted classifier
Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
yw = [[3, 2], [2, 3], [3, 2]]
w = np.asarray([2.0, 1.0, 1.0])
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
clf_w = MultiOutputClassifier(sgd_linear_clf)
clf_w.fit(Xw, yw, w)
# unweighted, but with repeated samples
X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
y = [[3, 2], [3, 2], [2, 3], [3, 2]]
sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
clf = MultiOutputClassifier(sgd_linear_clf)
clf.fit(X, y)
X_test = [[1.5, 2.5, 3.5]]
assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
def test_multi_output_exceptions():
# NotFittedError when fit is not done but score, predict and
# and predict_proba are called
moc = MultiOutputClassifier(LinearSVC(random_state=0))
with pytest.raises(NotFittedError):
moc.score(X, y)
# ValueError when number of outputs is different
# for fit and score
y_new = np.column_stack((y1, y2))
moc.fit(X, y)
with pytest.raises(ValueError):
moc.score(X, y_new)
# ValueError when y is continuous
msg = "Unknown label type"
with pytest.raises(ValueError, match=msg):
moc.fit(X, X[:, 1])
@pytest.mark.parametrize("response_method", ["predict_proba", "predict"])
def test_multi_output_not_fitted_error(response_method):
"""Check that we raise the proper error when the estimator is not fitted"""
moc = MultiOutputClassifier(LogisticRegression())
with pytest.raises(NotFittedError):
getattr(moc, response_method)(X)
def test_multi_output_delegate_predict_proba():
"""Check the behavior for the delegation of predict_proba to the underlying
estimator"""
# A base estimator with `predict_proba`should expose the method even before fit
moc = MultiOutputClassifier(LogisticRegression())
assert hasattr(moc, "predict_proba")
moc.fit(X, y)
assert hasattr(moc, "predict_proba")
# A base estimator without `predict_proba` should raise an AttributeError
moc = MultiOutputClassifier(LinearSVC())
assert not hasattr(moc, "predict_proba")
msg = "'LinearSVC' object has no attribute 'predict_proba'"
with pytest.raises(AttributeError, match=msg):
moc.predict_proba(X)
moc.fit(X, y)
assert not hasattr(moc, "predict_proba")
with pytest.raises(AttributeError, match=msg):
moc.predict_proba(X)
def generate_multilabel_dataset_with_correlations():
# Generate a multilabel data set from a multiclass dataset as a way of
# by representing the integer number of the original class using a binary
# encoding.
X, y = make_classification(
n_samples=1000, n_features=100, n_classes=16, n_informative=10, random_state=0
)
Y_multi = np.array([[int(yyy) for yyy in format(yy, "#06b")[2:]] for yy in y])
return X, Y_multi
def test_classifier_chain_fit_and_predict_with_linear_svc():
# Fit classifier chain and verify predict performance using LinearSVC
X, Y = generate_multilabel_dataset_with_correlations()
classifier_chain = ClassifierChain(LinearSVC())
classifier_chain.fit(X, Y)
Y_pred = classifier_chain.predict(X)
assert Y_pred.shape == Y.shape
Y_decision = classifier_chain.decision_function(X)
Y_binary = Y_decision >= 0
assert_array_equal(Y_binary, Y_pred)
assert not hasattr(classifier_chain, "predict_proba")
def test_classifier_chain_fit_and_predict_with_sparse_data():
# Fit classifier chain with sparse data
X, Y = generate_multilabel_dataset_with_correlations()
X_sparse = sp.csr_matrix(X)
classifier_chain = ClassifierChain(LogisticRegression())
classifier_chain.fit(X_sparse, Y)
Y_pred_sparse = classifier_chain.predict(X_sparse)
classifier_chain = ClassifierChain(LogisticRegression())
classifier_chain.fit(X, Y)
Y_pred_dense = classifier_chain.predict(X)
assert_array_equal(Y_pred_sparse, Y_pred_dense)
def test_classifier_chain_vs_independent_models():
# Verify that an ensemble of classifier chains (each of length
# N) can achieve a higher Jaccard similarity score than N independent
# models
X, Y = generate_multilabel_dataset_with_correlations()
X_train = X[:600, :]
X_test = X[600:, :]
Y_train = Y[:600, :]
Y_test = Y[600:, :]
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
chain = ClassifierChain(LogisticRegression())
chain.fit(X_train, Y_train)
Y_pred_chain = chain.predict(X_test)
assert jaccard_score(Y_test, Y_pred_chain, average="samples") > jaccard_score(
Y_test, Y_pred_ovr, average="samples"
)
def test_base_chain_fit_and_predict():
# Fit base chain and verify predict performance
X, Y = generate_multilabel_dataset_with_correlations()
chains = [RegressorChain(Ridge()), ClassifierChain(LogisticRegression())]
for chain in chains:
chain.fit(X, Y)
Y_pred = chain.predict(X)
assert Y_pred.shape == Y.shape
assert [c.coef_.size for c in chain.estimators_] == list(
range(X.shape[1], X.shape[1] + Y.shape[1])
)
Y_prob = chains[1].predict_proba(X)
Y_binary = Y_prob >= 0.5
assert_array_equal(Y_binary, Y_pred)
assert isinstance(chains[1], ClassifierMixin)
def test_base_chain_fit_and_predict_with_sparse_data_and_cv():
# Fit base chain with sparse data cross_val_predict
X, Y = generate_multilabel_dataset_with_correlations()
X_sparse = sp.csr_matrix(X)
base_chains = [
ClassifierChain(LogisticRegression(), cv=3),
RegressorChain(Ridge(), cv=3),
]
for chain in base_chains:
chain.fit(X_sparse, Y)
Y_pred = chain.predict(X_sparse)
assert Y_pred.shape == Y.shape
def test_base_chain_random_order():
# Fit base chain with random order
X, Y = generate_multilabel_dataset_with_correlations()
for chain in [ClassifierChain(LogisticRegression()), RegressorChain(Ridge())]:
chain_random = clone(chain).set_params(order="random", random_state=42)
chain_random.fit(X, Y)
chain_fixed = clone(chain).set_params(order=chain_random.order_)
chain_fixed.fit(X, Y)
assert_array_equal(chain_fixed.order_, chain_random.order_)
assert list(chain_random.order) != list(range(4))
assert len(chain_random.order_) == 4
assert len(set(chain_random.order_)) == 4
# Randomly ordered chain should behave identically to a fixed order
# chain with the same order.
for est1, est2 in zip(chain_random.estimators_, chain_fixed.estimators_):
assert_array_almost_equal(est1.coef_, est2.coef_)
def test_base_chain_crossval_fit_and_predict():
# Fit chain with cross_val_predict and verify predict
# performance
X, Y = generate_multilabel_dataset_with_correlations()
for chain in [ClassifierChain(LogisticRegression()), RegressorChain(Ridge())]:
chain.fit(X, Y)
chain_cv = clone(chain).set_params(cv=3)
chain_cv.fit(X, Y)
Y_pred_cv = chain_cv.predict(X)
Y_pred = chain.predict(X)
assert Y_pred_cv.shape == Y_pred.shape
assert not np.all(Y_pred == Y_pred_cv)
if isinstance(chain, ClassifierChain):
assert jaccard_score(Y, Y_pred_cv, average="samples") > 0.4
else:
assert mean_squared_error(Y, Y_pred_cv) < 0.25
@pytest.mark.parametrize(
"estimator",
[
RandomForestClassifier(n_estimators=2),
MultiOutputClassifier(RandomForestClassifier(n_estimators=2)),
ClassifierChain(RandomForestClassifier(n_estimators=2)),
],
)
def test_multi_output_classes_(estimator):
# Tests classes_ attribute of multioutput classifiers
# RandomForestClassifier supports multioutput out-of-the-box
estimator.fit(X, y)
assert isinstance(estimator.classes_, list)
assert len(estimator.classes_) == n_outputs
for estimator_classes, expected_classes in zip(classes, estimator.classes_):
assert_array_equal(estimator_classes, expected_classes)
class DummyRegressorWithFitParams(DummyRegressor):
def fit(self, X, y, sample_weight=None, **fit_params):
self._fit_params = fit_params
return super().fit(X, y, sample_weight)
class DummyClassifierWithFitParams(DummyClassifier):
def fit(self, X, y, sample_weight=None, **fit_params):
self._fit_params = fit_params
return super().fit(X, y, sample_weight)
@pytest.mark.filterwarnings("ignore:`n_features_in_` is deprecated")
@pytest.mark.parametrize(
"estimator, dataset",
[
(
MultiOutputClassifier(DummyClassifierWithFitParams(strategy="prior")),
datasets.make_multilabel_classification(),
),
(
MultiOutputRegressor(DummyRegressorWithFitParams()),
datasets.make_regression(n_targets=3, random_state=0),
),
],
)
def test_multioutput_estimator_with_fit_params(estimator, dataset):
X, y = dataset
some_param = np.zeros_like(X)
estimator.fit(X, y, some_param=some_param)
for dummy_estimator in estimator.estimators_:
assert "some_param" in dummy_estimator._fit_params
def test_regressor_chain_w_fit_params():
# Make sure fit_params are properly propagated to the sub-estimators
rng = np.random.RandomState(0)
X, y = datasets.make_regression(n_targets=3, random_state=0)
weight = rng.rand(y.shape[0])
class MySGD(SGDRegressor):
def fit(self, X, y, **fit_params):
self.sample_weight_ = fit_params["sample_weight"]
super().fit(X, y, **fit_params)
model = RegressorChain(MySGD())
# Fitting with params
fit_param = {"sample_weight": weight}
model.fit(X, y, **fit_param)
for est in model.estimators_:
assert est.sample_weight_ is weight
@pytest.mark.parametrize(
"MultiOutputEstimator, Estimator",
[(MultiOutputClassifier, LogisticRegression), (MultiOutputRegressor, Ridge)],
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiOutputEstimator, Estimator):
# smoke test to check that pipeline MultioutputEstimators are letting
# the validation of missing values to
# the underlying pipeline, regressor or classifier
rng = np.random.RandomState(42)
X, y = rng.randn(50, 2), rng.binomial(1, 0.5, (50, 3))
mask = rng.choice([1, 0], X.shape, p=[0.01, 0.99]).astype(bool)
X[mask] = np.nan
pipe = make_pipeline(SimpleImputer(), Estimator())
MultiOutputEstimator(pipe).fit(X, y).score(X, y)
@pytest.mark.parametrize("order_type", [list, np.array, tuple])
def test_classifier_chain_tuple_order(order_type):
X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
y = [[3, 2], [2, 3], [3, 2]]
order = order_type([1, 0])
chain = ClassifierChain(RandomForestClassifier(), order=order)
chain.fit(X, y)
X_test = [[1.5, 2.5, 3.5]]
y_test = [[3, 2]]
assert_array_almost_equal(chain.predict(X_test), y_test)
def test_classifier_chain_tuple_invalid_order():
X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
y = [[3, 2], [2, 3], [3, 2]]
order = tuple([1, 2])
chain = ClassifierChain(RandomForestClassifier(), order=order)
with pytest.raises(ValueError, match="invalid order"):
chain.fit(X, y)
def test_multioutputregressor_ducktypes_fitted_estimator():
"""Test that MultiOutputRegressor checks the fitted estimator for
predict. Non-regression test for #16549."""
X, y = load_linnerud(return_X_y=True)
stacker = StackingRegressor(
estimators=[("sgd", SGDRegressor(random_state=1))],
final_estimator=Ridge(),
cv=2,
)
reg = MultiOutputRegressor(estimator=stacker).fit(X, y)
# Does not raise
reg.predict(X)

View File

@@ -0,0 +1,947 @@
import re
import numpy as np
import scipy.sparse
import pytest
import warnings
from sklearn.datasets import load_digits, load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.naive_bayes import CategoricalNB
DISCRETE_NAIVE_BAYES_CLASSES = [BernoulliNB, CategoricalNB, ComplementNB, MultinomialNB]
ALL_NAIVE_BAYES_CLASSES = DISCRETE_NAIVE_BAYES_CLASSES + [GaussianNB]
# Data is just 6 separable points in the plane
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
y = np.array([1, 1, 1, 2, 2, 2])
# A bit more random tests
rng = np.random.RandomState(0)
X1 = rng.normal(size=(10, 3))
y1 = (rng.normal(size=(10)) > 0).astype(int)
# Data is 6 random integer points in a 100 dimensional space classified to
# three classes.
X2 = rng.randint(5, size=(6, 100))
y2 = np.array([1, 1, 2, 2, 3, 3])
def test_gnb():
# Gaussian Naive Bayes classification.
# This checks that GaussianNB implements fit and predict and returns
# correct values for a simple toy dataset.
clf = GaussianNB()
y_pred = clf.fit(X, y).predict(X)
assert_array_equal(y_pred, y)
y_pred_proba = clf.predict_proba(X)
y_pred_log_proba = clf.predict_log_proba(X)
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
# Test whether label mismatch between target y and classes raises
# an Error
# FIXME Remove this test once the more general partial_fit tests are merged
with pytest.raises(
ValueError, match="The target label.* in y do not exist in the initial classes"
):
GaussianNB().partial_fit(X, y, classes=[0, 1])
# TODO remove in 1.2 once sigma_ attribute is removed (GH #18842)
def test_gnb_var():
clf = GaussianNB()
clf.fit(X, y)
with pytest.warns(FutureWarning, match="Attribute `sigma_` was deprecated"):
assert_array_equal(clf.sigma_, clf.var_)
def test_gnb_prior():
# Test whether class priors are properly set.
clf = GaussianNB().fit(X, y)
assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8)
clf = GaussianNB().fit(X1, y1)
# Check that the class priors sum to 1
assert_array_almost_equal(clf.class_prior_.sum(), 1)
def test_gnb_sample_weight():
"""Test whether sample weights are properly used in GNB."""
# Sample weights all being 1 should not change results
sw = np.ones(6)
clf = GaussianNB().fit(X, y)
clf_sw = GaussianNB().fit(X, y, sw)
assert_array_almost_equal(clf.theta_, clf_sw.theta_)
assert_array_almost_equal(clf.var_, clf_sw.var_)
# Fitting twice with half sample-weights should result
# in same result as fitting once with full weights
sw = rng.rand(y.shape[0])
clf1 = GaussianNB().fit(X, y, sample_weight=sw)
clf2 = GaussianNB().partial_fit(X, y, classes=[1, 2], sample_weight=sw / 2)
clf2.partial_fit(X, y, sample_weight=sw / 2)
assert_array_almost_equal(clf1.theta_, clf2.theta_)
assert_array_almost_equal(clf1.var_, clf2.var_)
# Check that duplicate entries and correspondingly increased sample
# weights yield the same result
ind = rng.randint(0, X.shape[0], 20)
sample_weight = np.bincount(ind, minlength=X.shape[0])
clf_dupl = GaussianNB().fit(X[ind], y[ind])
clf_sw = GaussianNB().fit(X, y, sample_weight)
assert_array_almost_equal(clf_dupl.theta_, clf_sw.theta_)
assert_array_almost_equal(clf_dupl.var_, clf_sw.var_)
def test_gnb_neg_priors():
"""Test whether an error is raised in case of negative priors"""
clf = GaussianNB(priors=np.array([-1.0, 2.0]))
msg = "Priors must be non-negative"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)
def test_gnb_priors():
"""Test whether the class prior override is properly used"""
clf = GaussianNB(priors=np.array([0.3, 0.7])).fit(X, y)
assert_array_almost_equal(
clf.predict_proba([[-0.1, -0.1]]),
np.array([[0.825303662161683, 0.174696337838317]]),
8,
)
assert_array_almost_equal(clf.class_prior_, np.array([0.3, 0.7]))
def test_gnb_priors_sum_isclose():
# test whether the class prior sum is properly tested"""
X = np.array(
[
[-1, -1],
[-2, -1],
[-3, -2],
[-4, -5],
[-5, -4],
[1, 1],
[2, 1],
[3, 2],
[4, 4],
[5, 5],
]
)
priors = np.array([0.08, 0.14, 0.03, 0.16, 0.11, 0.16, 0.07, 0.14, 0.11, 0.0])
Y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
clf = GaussianNB(priors=priors)
# smoke test for issue #9633
clf.fit(X, Y)
def test_gnb_wrong_nb_priors():
"""Test whether an error is raised if the number of prior is different
from the number of class"""
clf = GaussianNB(priors=np.array([0.25, 0.25, 0.25, 0.25]))
msg = "Number of priors must match number of classes"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)
def test_gnb_prior_greater_one():
"""Test if an error is raised if the sum of prior greater than one"""
clf = GaussianNB(priors=np.array([2.0, 1.0]))
msg = "The sum of the priors should be 1"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)
def test_gnb_prior_large_bias():
"""Test if good prediction when class prior favor largely one class"""
clf = GaussianNB(priors=np.array([0.01, 0.99]))
clf.fit(X, y)
assert clf.predict([[-0.1, -0.1]]) == np.array([2])
def test_gnb_check_update_with_no_data():
"""Test when the partial fit is called without any data"""
# Create an empty array
prev_points = 100
mean = 0.0
var = 1.0
x_empty = np.empty((0, X.shape[1]))
tmean, tvar = GaussianNB._update_mean_variance(prev_points, mean, var, x_empty)
assert tmean == mean
assert tvar == var
def test_gnb_partial_fit():
clf = GaussianNB().fit(X, y)
clf_pf = GaussianNB().partial_fit(X, y, np.unique(y))
assert_array_almost_equal(clf.theta_, clf_pf.theta_)
assert_array_almost_equal(clf.var_, clf_pf.var_)
assert_array_almost_equal(clf.class_prior_, clf_pf.class_prior_)
clf_pf2 = GaussianNB().partial_fit(X[0::2, :], y[0::2], np.unique(y))
clf_pf2.partial_fit(X[1::2], y[1::2])
assert_array_almost_equal(clf.theta_, clf_pf2.theta_)
assert_array_almost_equal(clf.var_, clf_pf2.var_)
assert_array_almost_equal(clf.class_prior_, clf_pf2.class_prior_)
def test_gnb_naive_bayes_scale_invariance():
# Scaling the data should not change the prediction results
iris = load_iris()
X, y = iris.data, iris.target
labels = [GaussianNB().fit(f * X, y).predict(f * X) for f in [1e-10, 1, 1e10]]
assert_array_equal(labels[0], labels[1])
assert_array_equal(labels[1], labels[2])
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES)
def test_discretenb_prior(DiscreteNaiveBayes):
# Test whether class priors are properly set.
clf = DiscreteNaiveBayes().fit(X2, y2)
assert_array_almost_equal(
np.log(np.array([2, 2, 2]) / 6.0), clf.class_log_prior_, 8
)
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES)
def test_discretenb_partial_fit(DiscreteNaiveBayes):
clf1 = DiscreteNaiveBayes()
clf1.fit([[0, 1], [1, 0], [1, 1]], [0, 1, 1])
clf2 = DiscreteNaiveBayes()
clf2.partial_fit([[0, 1], [1, 0], [1, 1]], [0, 1, 1], classes=[0, 1])
assert_array_equal(clf1.class_count_, clf2.class_count_)
if DiscreteNaiveBayes is CategoricalNB:
for i in range(len(clf1.category_count_)):
assert_array_equal(clf1.category_count_[i], clf2.category_count_[i])
else:
assert_array_equal(clf1.feature_count_, clf2.feature_count_)
clf3 = DiscreteNaiveBayes()
# all categories have to appear in the first partial fit
clf3.partial_fit([[0, 1]], [0], classes=[0, 1])
clf3.partial_fit([[1, 0]], [1])
clf3.partial_fit([[1, 1]], [1])
assert_array_equal(clf1.class_count_, clf3.class_count_)
if DiscreteNaiveBayes is CategoricalNB:
# the categories for each feature of CategoricalNB are mapped to an
# index chronologically with each call of partial fit and therefore
# the category_count matrices cannot be compared for equality
for i in range(len(clf1.category_count_)):
assert_array_equal(
clf1.category_count_[i].shape, clf3.category_count_[i].shape
)
assert_array_equal(
np.sum(clf1.category_count_[i], axis=1),
np.sum(clf3.category_count_[i], axis=1),
)
# assert category 0 occurs 1x in the first class and 0x in the 2nd
# class
assert_array_equal(clf1.category_count_[0][0], np.array([1, 0]))
# assert category 1 occurs 0x in the first class and 2x in the 2nd
# class
assert_array_equal(clf1.category_count_[0][1], np.array([0, 2]))
# assert category 0 occurs 0x in the first class and 1x in the 2nd
# class
assert_array_equal(clf1.category_count_[1][0], np.array([0, 1]))
# assert category 1 occurs 1x in the first class and 1x in the 2nd
# class
assert_array_equal(clf1.category_count_[1][1], np.array([1, 1]))
else:
assert_array_equal(clf1.feature_count_, clf3.feature_count_)
@pytest.mark.parametrize("NaiveBayes", ALL_NAIVE_BAYES_CLASSES)
def test_NB_partial_fit_no_first_classes(NaiveBayes):
# classes is required for first call to partial fit
with pytest.raises(
ValueError, match="classes must be passed on the first call to partial_fit."
):
NaiveBayes().partial_fit(X2, y2)
# check consistency of consecutive classes values
clf = NaiveBayes()
clf.partial_fit(X2, y2, classes=np.unique(y2))
with pytest.raises(
ValueError, match="is not the same as on last call to partial_fit"
):
clf.partial_fit(X2, y2, classes=np.arange(42))
def test_discretenb_predict_proba():
# Test discrete NB classes' probability scores
# The 100s below distinguish Bernoulli from multinomial.
# FIXME: write a test to show this.
X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]]
X_multinomial = [[0, 1], [1, 3], [4, 0]]
# test binary case (1-d output)
y = [0, 0, 2] # 2 is regression test for binary case, 02e673
for DiscreteNaiveBayes, X in zip(
[BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]
):
clf = DiscreteNaiveBayes().fit(X, y)
assert clf.predict(X[-1:]) == 2
assert clf.predict_proba([X[0]]).shape == (1, 2)
assert_array_almost_equal(
clf.predict_proba(X[:2]).sum(axis=1), np.array([1.0, 1.0]), 6
)
# test multiclass case (2-d output, must sum to one)
y = [0, 1, 2]
for DiscreteNaiveBayes, X in zip(
[BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]
):
clf = DiscreteNaiveBayes().fit(X, y)
assert clf.predict_proba(X[0:1]).shape == (1, 3)
assert clf.predict_proba(X[:2]).shape == (2, 3)
assert_almost_equal(np.sum(clf.predict_proba([X[1]])), 1)
assert_almost_equal(np.sum(clf.predict_proba([X[-1]])), 1)
assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1)
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES)
def test_discretenb_uniform_prior(DiscreteNaiveBayes):
# Test whether discrete NB classes fit a uniform prior
# when fit_prior=False and class_prior=None
clf = DiscreteNaiveBayes()
clf.set_params(fit_prior=False)
clf.fit([[0], [0], [1]], [0, 0, 1])
prior = np.exp(clf.class_log_prior_)
assert_array_almost_equal(prior, np.array([0.5, 0.5]))
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES)
def test_discretenb_provide_prior(DiscreteNaiveBayes):
# Test whether discrete NB classes use provided prior
clf = DiscreteNaiveBayes(class_prior=[0.5, 0.5])
clf.fit([[0], [0], [1]], [0, 0, 1])
prior = np.exp(clf.class_log_prior_)
assert_array_almost_equal(prior, np.array([0.5, 0.5]))
# Inconsistent number of classes with prior
msg = "Number of priors must match number of classes"
with pytest.raises(ValueError, match=msg):
clf.fit([[0], [1], [2]], [0, 1, 2])
msg = "is not the same as on last call to partial_fit"
with pytest.raises(ValueError, match=msg):
clf.partial_fit([[0], [1]], [0, 1], classes=[0, 1, 1])
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES)
def test_discretenb_provide_prior_with_partial_fit(DiscreteNaiveBayes):
# Test whether discrete NB classes use provided prior
# when using partial_fit
iris = load_iris()
iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split(
iris.data, iris.target, test_size=0.4, random_state=415
)
for prior in [None, [0.3, 0.3, 0.4]]:
clf_full = DiscreteNaiveBayes(class_prior=prior)
clf_full.fit(iris.data, iris.target)
clf_partial = DiscreteNaiveBayes(class_prior=prior)
clf_partial.partial_fit(iris_data1, iris_target1, classes=[0, 1, 2])
clf_partial.partial_fit(iris_data2, iris_target2)
assert_array_almost_equal(
clf_full.class_log_prior_, clf_partial.class_log_prior_
)
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES)
def test_discretenb_sample_weight_multiclass(DiscreteNaiveBayes):
# check shape consistency for number of samples at fit time
X = [
[0, 0, 1],
[0, 1, 1],
[0, 1, 1],
[1, 0, 0],
]
y = [0, 0, 1, 2]
sample_weight = np.array([1, 1, 2, 2], dtype=np.float64)
sample_weight /= sample_weight.sum()
clf = DiscreteNaiveBayes().fit(X, y, sample_weight=sample_weight)
assert_array_equal(clf.predict(X), [0, 1, 1, 2])
# Check sample weight using the partial_fit method
clf = DiscreteNaiveBayes()
clf.partial_fit(X[:2], y[:2], classes=[0, 1, 2], sample_weight=sample_weight[:2])
clf.partial_fit(X[2:3], y[2:3], sample_weight=sample_weight[2:3])
clf.partial_fit(X[3:], y[3:], sample_weight=sample_weight[3:])
assert_array_equal(clf.predict(X), [0, 1, 1, 2])
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES)
@pytest.mark.parametrize("use_partial_fit", [False, True])
@pytest.mark.parametrize("train_on_single_class_y", [False, True])
def test_discretenb_degenerate_one_class_case(
DiscreteNaiveBayes,
use_partial_fit,
train_on_single_class_y,
):
# Most array attributes of a discrete naive Bayes classifier should have a
# first-axis length equal to the number of classes. Exceptions include:
# ComplementNB.feature_all_, CategoricalNB.n_categories_.
# Confirm that this is the case for binary problems and the degenerate
# case of a single class in the training set, when fitting with `fit` or
# `partial_fit`.
# Non-regression test for handling degenerate one-class case:
# https://github.com/scikit-learn/scikit-learn/issues/18974
X = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
y = [1, 1, 2]
if train_on_single_class_y:
X = X[:-1]
y = y[:-1]
classes = sorted(list(set(y)))
num_classes = len(classes)
clf = DiscreteNaiveBayes()
if use_partial_fit:
clf.partial_fit(X, y, classes=classes)
else:
clf.fit(X, y)
assert clf.predict(X[:1]) == y[0]
# Check that attributes have expected first-axis lengths
attribute_names = [
"classes_",
"class_count_",
"class_log_prior_",
"feature_count_",
"feature_log_prob_",
]
for attribute_name in attribute_names:
attribute = getattr(clf, attribute_name, None)
if attribute is None:
# CategoricalNB has no feature_count_ attribute
continue
if isinstance(attribute, np.ndarray):
assert attribute.shape[0] == num_classes
else:
# CategoricalNB.feature_log_prob_ is a list of arrays
for element in attribute:
assert element.shape[0] == num_classes
@pytest.mark.parametrize("kind", ("dense", "sparse"))
def test_mnnb(kind):
# Test Multinomial Naive Bayes classification.
# This checks that MultinomialNB implements fit and predict and returns
# correct values for a simple toy dataset.
if kind == "dense":
X = X2
elif kind == "sparse":
X = scipy.sparse.csr_matrix(X2)
# Check the ability to predict the learning set.
clf = MultinomialNB()
msg = "Negative values in data passed to"
with pytest.raises(ValueError, match=msg):
clf.fit(-X, y2)
y_pred = clf.fit(X, y2).predict(X)
assert_array_equal(y_pred, y2)
# Verify that np.log(clf.predict_proba(X)) gives the same results as
# clf.predict_log_proba(X)
y_pred_proba = clf.predict_proba(X)
y_pred_log_proba = clf.predict_log_proba(X)
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
# Check that incremental fitting yields the same results
clf2 = MultinomialNB()
clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2))
clf2.partial_fit(X[2:5], y2[2:5])
clf2.partial_fit(X[5:], y2[5:])
y_pred2 = clf2.predict(X)
assert_array_equal(y_pred2, y2)
y_pred_proba2 = clf2.predict_proba(X)
y_pred_log_proba2 = clf2.predict_log_proba(X)
assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8)
assert_array_almost_equal(y_pred_proba2, y_pred_proba)
assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba)
# Partial fit on the whole data at once should be the same as fit too
clf3 = MultinomialNB()
clf3.partial_fit(X, y2, classes=np.unique(y2))
y_pred3 = clf3.predict(X)
assert_array_equal(y_pred3, y2)
y_pred_proba3 = clf3.predict_proba(X)
y_pred_log_proba3 = clf3.predict_log_proba(X)
assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8)
assert_array_almost_equal(y_pred_proba3, y_pred_proba)
assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba)
def test_mnb_prior_unobserved_targets():
# test smoothing of prior for yet unobserved targets
# Create toy training data
X = np.array([[0, 1], [1, 0]])
y = np.array([0, 1])
clf = MultinomialNB()
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
clf.partial_fit(X, y, classes=[0, 1, 2])
assert clf.predict([[0, 1]]) == 0
assert clf.predict([[1, 0]]) == 1
assert clf.predict([[1, 1]]) == 0
# add a training example with previously unobserved class
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
clf.partial_fit([[1, 1]], [2])
assert clf.predict([[0, 1]]) == 0
assert clf.predict([[1, 0]]) == 1
assert clf.predict([[1, 1]]) == 2
def test_bnb():
# Tests that BernoulliNB when alpha=1.0 gives the same values as
# those given for the toy example in Manning, Raghavan, and
# Schuetze's "Introduction to Information Retrieval" book:
# https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
# Training data points are:
# Chinese Beijing Chinese (class: China)
# Chinese Chinese Shanghai (class: China)
# Chinese Macao (class: China)
# Tokyo Japan Chinese (class: Japan)
# Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo
X = np.array(
[[1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0], [0, 1, 0, 1, 0, 0], [0, 1, 1, 0, 0, 1]]
)
# Classes are China (0), Japan (1)
Y = np.array([0, 0, 0, 1])
# Fit BernoulliBN w/ alpha = 1.0
clf = BernoulliNB(alpha=1.0)
clf.fit(X, Y)
# Check the class prior is correct
class_prior = np.array([0.75, 0.25])
assert_array_almost_equal(np.exp(clf.class_log_prior_), class_prior)
# Check the feature probabilities are correct
feature_prob = np.array(
[
[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
[1 / 3.0, 2 / 3.0, 2 / 3.0, 1 / 3.0, 1 / 3.0, 2 / 3.0],
]
)
assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob)
# Testing data point is:
# Chinese Chinese Chinese Tokyo Japan
X_test = np.array([[0, 1, 1, 0, 0, 1]])
# Check the predictive probabilities are correct
unnorm_predict_proba = np.array([[0.005183999999999999, 0.02194787379972565]])
predict_proba = unnorm_predict_proba / np.sum(unnorm_predict_proba)
assert_array_almost_equal(clf.predict_proba(X_test), predict_proba)
def test_bnb_feature_log_prob():
# Test for issue #4268.
# Tests that the feature log prob value computed by BernoulliNB when
# alpha=1.0 is equal to the expression given in Manning, Raghavan,
# and Schuetze's "Introduction to Information Retrieval" book:
# http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
X = np.array([[0, 0, 0], [1, 1, 0], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
Y = np.array([0, 0, 1, 2, 2])
# Fit Bernoulli NB w/ alpha = 1.0
clf = BernoulliNB(alpha=1.0)
clf.fit(X, Y)
# Manually form the (log) numerator and denominator that
# constitute P(feature presence | class)
num = np.log(clf.feature_count_ + 1.0)
denom = np.tile(np.log(clf.class_count_ + 2.0), (X.shape[1], 1)).T
# Check manual estimate matches
assert_array_almost_equal(clf.feature_log_prob_, (num - denom))
def test_cnb():
# Tests ComplementNB when alpha=1.0 for the toy example in Manning,
# Raghavan, and Schuetze's "Introduction to Information Retrieval" book:
# https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
# Training data points are:
# Chinese Beijing Chinese (class: China)
# Chinese Chinese Shanghai (class: China)
# Chinese Macao (class: China)
# Tokyo Japan Chinese (class: Japan)
# Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo.
X = np.array(
[[1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0], [0, 1, 0, 1, 0, 0], [0, 1, 1, 0, 0, 1]]
)
# Classes are China (0), Japan (1).
Y = np.array([0, 0, 0, 1])
# Check that weights are correct. See steps 4-6 in Table 4 of
# Rennie et al. (2003).
theta = np.array(
[
[
(0 + 1) / (3 + 6),
(1 + 1) / (3 + 6),
(1 + 1) / (3 + 6),
(0 + 1) / (3 + 6),
(0 + 1) / (3 + 6),
(1 + 1) / (3 + 6),
],
[
(1 + 1) / (6 + 6),
(3 + 1) / (6 + 6),
(0 + 1) / (6 + 6),
(1 + 1) / (6 + 6),
(1 + 1) / (6 + 6),
(0 + 1) / (6 + 6),
],
]
)
weights = np.zeros(theta.shape)
normed_weights = np.zeros(theta.shape)
for i in range(2):
weights[i] = -np.log(theta[i])
normed_weights[i] = weights[i] / weights[i].sum()
# Verify inputs are nonnegative.
clf = ComplementNB(alpha=1.0)
msg = re.escape("Negative values in data passed to ComplementNB (input X)")
with pytest.raises(ValueError, match=msg):
clf.fit(-X, Y)
clf.fit(X, Y)
# Check that counts/weights are correct.
feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]])
assert_array_equal(clf.feature_count_, feature_count)
class_count = np.array([3, 1])
assert_array_equal(clf.class_count_, class_count)
feature_all = np.array([1, 4, 1, 1, 1, 1])
assert_array_equal(clf.feature_all_, feature_all)
assert_array_almost_equal(clf.feature_log_prob_, weights)
clf = ComplementNB(alpha=1.0, norm=True)
clf.fit(X, Y)
assert_array_almost_equal(clf.feature_log_prob_, normed_weights)
def test_categoricalnb():
# Check the ability to predict the training set.
clf = CategoricalNB()
y_pred = clf.fit(X2, y2).predict(X2)
assert_array_equal(y_pred, y2)
X3 = np.array([[1, 4], [2, 5]])
y3 = np.array([1, 2])
clf = CategoricalNB(alpha=1, fit_prior=False)
clf.fit(X3, y3)
assert_array_equal(clf.n_categories_, np.array([3, 6]))
# Check error is raised for X with negative entries
X = np.array([[0, -1]])
y = np.array([1])
error_msg = re.escape("Negative values in data passed to CategoricalNB (input X)")
with pytest.raises(ValueError, match=error_msg):
clf.predict(X)
with pytest.raises(ValueError, match=error_msg):
clf.fit(X, y)
# Test alpha
X3_test = np.array([[2, 5]])
# alpha=1 increases the count of all categories by one so the final
# probability for each category is not 50/50 but 1/3 to 2/3
bayes_numerator = np.array([[1 / 3 * 1 / 3, 2 / 3 * 2 / 3]])
bayes_denominator = bayes_numerator.sum()
assert_array_almost_equal(
clf.predict_proba(X3_test), bayes_numerator / bayes_denominator
)
# Assert category_count has counted all features
assert len(clf.category_count_) == X3.shape[1]
# Check sample_weight
X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
y = np.array([1, 1, 2, 2])
clf = CategoricalNB(alpha=1, fit_prior=False)
clf.fit(X, y)
assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([1]))
assert_array_equal(clf.n_categories_, np.array([2, 2]))
for factor in [1.0, 0.3, 5, 0.0001]:
X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
y = np.array([1, 1, 2, 2])
sample_weight = np.array([1, 1, 10, 0.1]) * factor
clf = CategoricalNB(alpha=1, fit_prior=False)
clf.fit(X, y, sample_weight=sample_weight)
assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([2]))
assert_array_equal(clf.n_categories_, np.array([2, 2]))
@pytest.mark.parametrize(
"min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_",
[
# check min_categories with int > observed categories
(
3,
np.array([[2, 0, 0], [1, 1, 0]]),
np.array([[1, 1, 0], [1, 1, 0]]),
np.array([[0, 2]]),
np.array([3, 3]),
),
# check with list input
(
[3, 4],
np.array([[2, 0, 0], [1, 1, 0]]),
np.array([[1, 1, 0, 0], [1, 1, 0, 0]]),
np.array([[0, 3]]),
np.array([3, 4]),
),
# check min_categories with min less than actual
(
[
1,
np.array([[2, 0], [1, 1]]),
np.array([[1, 1], [1, 1]]),
np.array([[0, 1]]),
np.array([2, 2]),
]
),
],
)
def test_categoricalnb_with_min_categories(
min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_
):
X_n_categories = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
y_n_categories = np.array([1, 1, 2, 2])
expected_prediction = np.array([1])
clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories)
clf.fit(X_n_categories, y_n_categories)
X1_count, X2_count = clf.category_count_
assert_array_equal(X1_count, exp_X1_count)
assert_array_equal(X2_count, exp_X2_count)
predictions = clf.predict(new_X)
assert_array_equal(predictions, expected_prediction)
assert_array_equal(clf.n_categories_, exp_n_categories_)
@pytest.mark.parametrize(
"min_categories, error_msg",
[
("bad_arg", "'min_categories' should have integral"),
([[3, 2], [2, 4]], "'min_categories' should have shape"),
(1.0, "'min_categories' should have integral"),
],
)
def test_categoricalnb_min_categories_errors(min_categories, error_msg):
X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
y = np.array([1, 1, 2, 2])
clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories)
with pytest.raises(ValueError, match=error_msg):
clf.fit(X, y)
def test_alpha():
# Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case
X = np.array([[1, 0], [1, 1]])
y = np.array([0, 1])
nb = BernoulliNB(alpha=0.0)
msg = "alpha too small will result in numeric errors, setting alpha = 1.0e-10"
with pytest.warns(UserWarning, match=msg):
nb.partial_fit(X, y, classes=[0, 1])
with pytest.warns(UserWarning, match=msg):
nb.fit(X, y)
prob = np.array([[1, 0], [0, 1]])
assert_array_almost_equal(nb.predict_proba(X), prob)
nb = MultinomialNB(alpha=0.0)
with pytest.warns(UserWarning, match=msg):
nb.partial_fit(X, y, classes=[0, 1])
with pytest.warns(UserWarning, match=msg):
nb.fit(X, y)
prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]])
assert_array_almost_equal(nb.predict_proba(X), prob)
nb = CategoricalNB(alpha=0.0)
with pytest.warns(UserWarning, match=msg):
nb.fit(X, y)
prob = np.array([[1.0, 0.0], [0.0, 1.0]])
assert_array_almost_equal(nb.predict_proba(X), prob)
# Test sparse X
X = scipy.sparse.csr_matrix(X)
nb = BernoulliNB(alpha=0.0)
with pytest.warns(UserWarning, match=msg):
nb.fit(X, y)
prob = np.array([[1, 0], [0, 1]])
assert_array_almost_equal(nb.predict_proba(X), prob)
nb = MultinomialNB(alpha=0.0)
with pytest.warns(UserWarning, match=msg):
nb.fit(X, y)
prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]])
assert_array_almost_equal(nb.predict_proba(X), prob)
# Test for alpha < 0
X = np.array([[1, 0], [1, 1]])
y = np.array([0, 1])
expected_msg = re.escape(
"Smoothing parameter alpha = -1.0e-01. alpha should be > 0."
)
b_nb = BernoulliNB(alpha=-0.1)
m_nb = MultinomialNB(alpha=-0.1)
c_nb = CategoricalNB(alpha=-0.1)
with pytest.raises(ValueError, match=expected_msg):
b_nb.fit(X, y)
with pytest.raises(ValueError, match=expected_msg):
m_nb.fit(X, y)
with pytest.raises(ValueError, match=expected_msg):
c_nb.fit(X, y)
b_nb = BernoulliNB(alpha=-0.1)
m_nb = MultinomialNB(alpha=-0.1)
with pytest.raises(ValueError, match=expected_msg):
b_nb.partial_fit(X, y, classes=[0, 1])
with pytest.raises(ValueError, match=expected_msg):
m_nb.partial_fit(X, y, classes=[0, 1])
def test_alpha_vector():
X = np.array([[1, 0], [1, 1]])
y = np.array([0, 1])
# Setting alpha=np.array with same length
# as number of features should be fine
alpha = np.array([1, 2])
nb = MultinomialNB(alpha=alpha)
nb.partial_fit(X, y, classes=[0, 1])
# Test feature probabilities uses pseudo-counts (alpha)
feature_prob = np.array([[1 / 2, 1 / 2], [2 / 5, 3 / 5]])
assert_array_almost_equal(nb.feature_log_prob_, np.log(feature_prob))
# Test predictions
prob = np.array([[5 / 9, 4 / 9], [25 / 49, 24 / 49]])
assert_array_almost_equal(nb.predict_proba(X), prob)
# Test alpha non-negative
alpha = np.array([1.0, -0.1])
m_nb = MultinomialNB(alpha=alpha)
expected_msg = "Smoothing parameter alpha = -1.0e-01. alpha should be > 0."
with pytest.raises(ValueError, match=expected_msg):
m_nb.fit(X, y)
# Test that too small pseudo-counts are replaced
ALPHA_MIN = 1e-10
alpha = np.array([ALPHA_MIN / 2, 0.5])
m_nb = MultinomialNB(alpha=alpha)
m_nb.partial_fit(X, y, classes=[0, 1])
assert_array_almost_equal(m_nb._check_alpha(), [ALPHA_MIN, 0.5], decimal=12)
# Test correct dimensions
alpha = np.array([1.0, 2.0, 3.0])
m_nb = MultinomialNB(alpha=alpha)
expected_msg = re.escape(
"alpha should be a scalar or a numpy array with shape [n_features]"
)
with pytest.raises(ValueError, match=expected_msg):
m_nb.fit(X, y)
def test_check_accuracy_on_digits():
# Non regression test to make sure that any further refactoring / optim
# of the NB models do not harm the performance on a slightly non-linearly
# separable dataset
X, y = load_digits(return_X_y=True)
binary_3v8 = np.logical_or(y == 3, y == 8)
X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]
# Multinomial NB
scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
assert scores.mean() > 0.86
scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
assert scores.mean() > 0.94
# Bernoulli NB
scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
assert scores.mean() > 0.83
scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
assert scores.mean() > 0.92
# Gaussian NB
scores = cross_val_score(GaussianNB(), X, y, cv=10)
assert scores.mean() > 0.77
scores = cross_val_score(GaussianNB(var_smoothing=0.1), X, y, cv=10)
assert scores.mean() > 0.89
scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
assert scores.mean() > 0.86
# FIXME: remove in 1.2
@pytest.mark.parametrize("Estimator", DISCRETE_NAIVE_BAYES_CLASSES)
def test_n_features_deprecation(Estimator):
# Check that we raise the proper deprecation warning if accessing
# `n_features_`.
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 0])
est = Estimator().fit(X, y)
with pytest.warns(FutureWarning, match="`n_features_` was deprecated"):
est.n_features_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,469 @@
import functools
from typing import List, Any
import warnings
import numpy as np
import scipy.sparse as sp
import pytest
from sklearn.metrics import euclidean_distances
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import _gaussian_random_matrix
from sklearn.random_projection import _sparse_random_matrix
from sklearn.random_projection import SparseRandomProjection
from sklearn.random_projection import GaussianRandomProjection
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.exceptions import DataDimensionalityWarning
all_sparse_random_matrix: List[Any] = [_sparse_random_matrix]
all_dense_random_matrix: List[Any] = [_gaussian_random_matrix]
all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix
all_SparseRandomProjection: List[Any] = [SparseRandomProjection]
all_DenseRandomProjection: List[Any] = [GaussianRandomProjection]
all_RandomProjection = all_SparseRandomProjection + all_DenseRandomProjection
# Make some random data with uniformly located non zero entries with
# Gaussian distributed values
def make_sparse_random_data(n_samples, n_features, n_nonzeros, random_state=0):
rng = np.random.RandomState(random_state)
data_coo = sp.coo_matrix(
(
rng.randn(n_nonzeros),
(
rng.randint(n_samples, size=n_nonzeros),
rng.randint(n_features, size=n_nonzeros),
),
),
shape=(n_samples, n_features),
)
return data_coo.toarray(), data_coo.tocsr()
def densify(matrix):
if not sp.issparse(matrix):
return matrix
else:
return matrix.toarray()
n_samples, n_features = (10, 1000)
n_nonzeros = int(n_samples * n_features / 100.0)
data, data_csr = make_sparse_random_data(n_samples, n_features, n_nonzeros)
###############################################################################
# test on JL lemma
###############################################################################
@pytest.mark.parametrize(
"n_samples, eps", [(100, 1.1), (100, 0.0), (100, -0.1), (0, 0.5)]
)
def test_invalid_jl_domain(n_samples, eps):
with pytest.raises(ValueError):
johnson_lindenstrauss_min_dim(n_samples, eps=eps)
def test_input_size_jl_min_dim():
with pytest.raises(ValueError):
johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9])
johnson_lindenstrauss_min_dim(
np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)
)
###############################################################################
# tests random matrix generation
###############################################################################
def check_input_size_random_matrix(random_matrix):
inputs = [(0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0)]
for n_components, n_features in inputs:
with pytest.raises(ValueError):
random_matrix(n_components, n_features)
def check_size_generated(random_matrix):
inputs = [(1, 5), (5, 1), (5, 5), (1, 1)]
for n_components, n_features in inputs:
assert random_matrix(n_components, n_features).shape == (
n_components,
n_features,
)
def check_zero_mean_and_unit_norm(random_matrix):
# All random matrix should produce a transformation matrix
# with zero mean and unit norm for each columns
A = densify(random_matrix(10000, 1, random_state=0))
assert_array_almost_equal(0, np.mean(A), 3)
assert_array_almost_equal(1.0, np.linalg.norm(A), 1)
def check_input_with_sparse_random_matrix(random_matrix):
n_components, n_features = 5, 10
for density in [-1.0, 0.0, 1.1]:
with pytest.raises(ValueError):
random_matrix(n_components, n_features, density=density)
@pytest.mark.parametrize("random_matrix", all_random_matrix)
def test_basic_property_of_random_matrix(random_matrix):
# Check basic properties of random matrix generation
check_input_size_random_matrix(random_matrix)
check_size_generated(random_matrix)
check_zero_mean_and_unit_norm(random_matrix)
@pytest.mark.parametrize("random_matrix", all_sparse_random_matrix)
def test_basic_property_of_sparse_random_matrix(random_matrix):
check_input_with_sparse_random_matrix(random_matrix)
random_matrix_dense = functools.partial(random_matrix, density=1.0)
check_zero_mean_and_unit_norm(random_matrix_dense)
def test_gaussian_random_matrix():
# Check some statical properties of Gaussian random matrix
# Check that the random matrix follow the proper distribution.
# Let's say that each element of a_{ij} of A is taken from
# a_ij ~ N(0.0, 1 / n_components).
#
n_components = 100
n_features = 1000
A = _gaussian_random_matrix(n_components, n_features, random_state=0)
assert_array_almost_equal(0.0, np.mean(A), 2)
assert_array_almost_equal(np.var(A, ddof=1), 1 / n_components, 1)
def test_sparse_random_matrix():
# Check some statical properties of sparse random matrix
n_components = 100
n_features = 500
for density in [0.3, 1.0]:
s = 1 / density
A = _sparse_random_matrix(
n_components, n_features, density=density, random_state=0
)
A = densify(A)
# Check possible values
values = np.unique(A)
assert np.sqrt(s) / np.sqrt(n_components) in values
assert -np.sqrt(s) / np.sqrt(n_components) in values
if density == 1.0:
assert np.size(values) == 2
else:
assert 0.0 in values
assert np.size(values) == 3
# Check that the random matrix follow the proper distribution.
# Let's say that each element of a_{ij} of A is taken from
#
# - -sqrt(s) / sqrt(n_components) with probability 1 / 2s
# - 0 with probability 1 - 1 / s
# - +sqrt(s) / sqrt(n_components) with probability 1 / 2s
#
assert_almost_equal(np.mean(A == 0.0), 1 - 1 / s, decimal=2)
assert_almost_equal(
np.mean(A == np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2
)
assert_almost_equal(
np.mean(A == -np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2
)
assert_almost_equal(np.var(A == 0.0, ddof=1), (1 - 1 / s) * 1 / s, decimal=2)
assert_almost_equal(
np.var(A == np.sqrt(s) / np.sqrt(n_components), ddof=1),
(1 - 1 / (2 * s)) * 1 / (2 * s),
decimal=2,
)
assert_almost_equal(
np.var(A == -np.sqrt(s) / np.sqrt(n_components), ddof=1),
(1 - 1 / (2 * s)) * 1 / (2 * s),
decimal=2,
)
###############################################################################
# tests on random projection transformer
###############################################################################
@pytest.mark.parametrize("density", [1.1, 0, -0.1])
def test_sparse_random_projection_transformer_invalid_density(density):
for RandomProjection in all_SparseRandomProjection:
with pytest.raises(ValueError):
RandomProjection(density=density).fit(data)
@pytest.mark.parametrize("n_components, fit_data", [("auto", [[0, 1, 2]]), (-10, data)])
def test_random_projection_transformer_invalid_input(n_components, fit_data):
for RandomProjection in all_RandomProjection:
with pytest.raises(ValueError):
RandomProjection(n_components=n_components).fit(fit_data)
def test_try_to_transform_before_fit():
for RandomProjection in all_RandomProjection:
with pytest.raises(ValueError):
RandomProjection(n_components="auto").transform(data)
def test_too_many_samples_to_find_a_safe_embedding():
data, _ = make_sparse_random_data(1000, 100, 1000)
for RandomProjection in all_RandomProjection:
rp = RandomProjection(n_components="auto", eps=0.1)
expected_msg = (
"eps=0.100000 and n_samples=1000 lead to a target dimension"
" of 5920 which is larger than the original space with"
" n_features=100"
)
with pytest.raises(ValueError, match=expected_msg):
rp.fit(data)
def test_random_projection_embedding_quality():
data, _ = make_sparse_random_data(8, 5000, 15000)
eps = 0.2
original_distances = euclidean_distances(data, squared=True)
original_distances = original_distances.ravel()
non_identical = original_distances != 0.0
# remove 0 distances to avoid division by 0
original_distances = original_distances[non_identical]
for RandomProjection in all_RandomProjection:
rp = RandomProjection(n_components="auto", eps=eps, random_state=0)
projected = rp.fit_transform(data)
projected_distances = euclidean_distances(projected, squared=True)
projected_distances = projected_distances.ravel()
# remove 0 distances to avoid division by 0
projected_distances = projected_distances[non_identical]
distances_ratio = projected_distances / original_distances
# check that the automatically tuned values for the density respect the
# contract for eps: pairwise distances are preserved according to the
# Johnson-Lindenstrauss lemma
assert distances_ratio.max() < 1 + eps
assert 1 - eps < distances_ratio.min()
def test_SparseRandomProj_output_representation():
for SparseRandomProj in all_SparseRandomProjection:
# when using sparse input, the projected data can be forced to be a
# dense numpy array
rp = SparseRandomProj(n_components=10, dense_output=True, random_state=0)
rp.fit(data)
assert isinstance(rp.transform(data), np.ndarray)
sparse_data = sp.csr_matrix(data)
assert isinstance(rp.transform(sparse_data), np.ndarray)
# the output can be left to a sparse matrix instead
rp = SparseRandomProj(n_components=10, dense_output=False, random_state=0)
rp = rp.fit(data)
# output for dense input will stay dense:
assert isinstance(rp.transform(data), np.ndarray)
# output for sparse output will be sparse:
assert sp.issparse(rp.transform(sparse_data))
def test_correct_RandomProjection_dimensions_embedding():
for RandomProjection in all_RandomProjection:
rp = RandomProjection(n_components="auto", random_state=0, eps=0.5).fit(data)
# the number of components is adjusted from the shape of the training
# set
assert rp.n_components == "auto"
assert rp.n_components_ == 110
if RandomProjection in all_SparseRandomProjection:
assert rp.density == "auto"
assert_almost_equal(rp.density_, 0.03, 2)
assert rp.components_.shape == (110, n_features)
projected_1 = rp.transform(data)
assert projected_1.shape == (n_samples, 110)
# once the RP is 'fitted' the projection is always the same
projected_2 = rp.transform(data)
assert_array_equal(projected_1, projected_2)
# fit transform with same random seed will lead to the same results
rp2 = RandomProjection(random_state=0, eps=0.5)
projected_3 = rp2.fit_transform(data)
assert_array_equal(projected_1, projected_3)
# Try to transform with an input X of size different from fitted.
with pytest.raises(ValueError):
rp.transform(data[:, 1:5])
# it is also possible to fix the number of components and the density
# level
if RandomProjection in all_SparseRandomProjection:
rp = RandomProjection(n_components=100, density=0.001, random_state=0)
projected = rp.fit_transform(data)
assert projected.shape == (n_samples, 100)
assert rp.components_.shape == (100, n_features)
assert rp.components_.nnz < 115 # close to 1% density
assert 85 < rp.components_.nnz # close to 1% density
def test_warning_n_components_greater_than_n_features():
n_features = 20
data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))
for RandomProjection in all_RandomProjection:
with pytest.warns(DataDimensionalityWarning):
RandomProjection(n_components=n_features + 1).fit(data)
def test_works_with_sparse_data():
n_features = 20
data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))
for RandomProjection in all_RandomProjection:
rp_dense = RandomProjection(n_components=3, random_state=1).fit(data)
rp_sparse = RandomProjection(n_components=3, random_state=1).fit(
sp.csr_matrix(data)
)
assert_array_almost_equal(
densify(rp_dense.components_), densify(rp_sparse.components_)
)
def test_johnson_lindenstrauss_min_dim():
"""Test Johnson-Lindenstrauss for small eps.
Regression test for #17111: before #19374, 32-bit systems would fail.
"""
assert johnson_lindenstrauss_min_dim(100, eps=1e-5) == 368416070986
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
def test_random_projection_feature_names_out(random_projection_cls):
random_projection = random_projection_cls(n_components=2)
random_projection.fit(data)
names_out = random_projection.get_feature_names_out()
class_name_lower = random_projection_cls.__name__.lower()
expected_names_out = np.array(
[f"{class_name_lower}{i}" for i in range(random_projection.n_components_)],
dtype=object,
)
assert_array_equal(names_out, expected_names_out)
@pytest.mark.parametrize("n_samples", (2, 9, 10, 11, 1000))
@pytest.mark.parametrize("n_features", (2, 9, 10, 11, 1000))
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
@pytest.mark.parametrize("compute_inverse_components", [True, False])
def test_inverse_transform(
n_samples,
n_features,
random_projection_cls,
compute_inverse_components,
global_random_seed,
):
n_components = 10
random_projection = random_projection_cls(
n_components=n_components,
compute_inverse_components=compute_inverse_components,
random_state=global_random_seed,
)
X_dense, X_csr = make_sparse_random_data(
n_samples,
n_features,
n_samples * n_features // 100 + 1,
random_state=global_random_seed,
)
for X in [X_dense, X_csr]:
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message=(
"The number of components is higher than the number of features"
),
category=DataDimensionalityWarning,
)
projected = random_projection.fit_transform(X)
if compute_inverse_components:
assert hasattr(random_projection, "inverse_components_")
inv_components = random_projection.inverse_components_
assert inv_components.shape == (n_features, n_components)
projected_back = random_projection.inverse_transform(projected)
assert projected_back.shape == X.shape
projected_again = random_projection.transform(projected_back)
if hasattr(projected, "toarray"):
projected = projected.toarray()
assert_allclose(projected, projected_again, rtol=1e-7, atol=1e-10)
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
@pytest.mark.parametrize(
"input_dtype, expected_dtype",
(
(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64),
),
)
def test_random_projection_dtype_match(
random_projection_cls, input_dtype, expected_dtype
):
# Verify output matrix dtype
rng = np.random.RandomState(42)
X = rng.rand(25, 3000)
rp = random_projection_cls(random_state=0)
transformed = rp.fit_transform(X.astype(input_dtype))
assert rp.components_.dtype == expected_dtype
assert transformed.dtype == expected_dtype
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
def test_random_projection_numerical_consistency(random_projection_cls):
# Verify numerical consistency among np.float32 and np.float64
atol = 1e-5
rng = np.random.RandomState(42)
X = rng.rand(25, 3000)
rp_32 = random_projection_cls(random_state=0)
rp_64 = random_projection_cls(random_state=0)
projection_32 = rp_32.fit_transform(X.astype(np.float32))
projection_64 = rp_64.fit_transform(X.astype(np.float64))
assert_allclose(projection_64, projection_32, atol=atol)
assert_allclose_dense_sparse(rp_32.components_, rp_64.components_)